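# Nightly job that builds the LLM-based target-determination index for PyTorch tests:
# it pulls the CUDA CI image, downloads the CodeLlama-7b-Python checkpoint, runs the
# indexer inside the container on a GPU runner, and publishes the resulting index to S3.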
name: Index PyTorch Tests for Target Determination

on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *'

permissions:
  id-token: write
  contents: read

jobs:
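  # Resolve the runner label prefix for this repo/actor via the shared runner determinator.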
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

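  # Build the index on a single-GPU A10G runner and upload it to the
  # target-determinator-assets S3 bucket.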
  index:
    needs: get-label-type
    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" # 1 GPU A10G 24GB each
    environment: target-determinator-env
    steps:
      - name: Clone PyTorch
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          path: pytorch

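      # Prepare the runner and authenticate to ECR so the private CI image can be pulled.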
      - name: Setup Linux
        uses: ./pytorch/.github/actions/setup-linux

      - name: Login to ECR
        uses: ./pytorch/.github/actions/ecr-login

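      # Resolve the full ECR URI of the CUDA CI image, print the equivalent GHCR mirror
      # pull command for public use, then pull the image and set up the NVIDIA runtime.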
      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
          docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
          working-directory: pytorch

      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
          tag=${ECR_DOCKER_IMAGE##*:}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        id: install-nvidia-driver
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main

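      # Check out the pinned CodeLlama fork and the llm-target-determinator sources used
      # by the indexing script.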
      - name: Clone CodeLlama
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: osalpekar/codellama
          ref: 1ec50e0cfc0fadc3b6ceb146617e2119ab26eb34
          path: codellama

      - name: Clone Target Determination Code
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: osalpekar/llm-target-determinator
          ref: v0.0.2
          path: llm-target-determinator

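      # Assume the S3 read/write role via OIDC (id-token: write above) so the model
      # checkpoint can be downloaded and the generated index uploaded.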
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_target_determinator_s3_read_write
          aws-region: us-east-1

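      # Fetch the CodeLlama-7b-Python checkpoint from S3 onto the host before the
      # container is started.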
      - name: Download checkpoint
        shell: bash
        env:
          AWS_DEFAULT_REGION: us-east-1
        run: |
          # Do this outside of docker so we don't have to pass env vars into the container
          pip3 install awscli==1.29.40
          cd codellama
          mkdir "CodeLlama-7b-Python"
          aws s3 cp \
            "s3://target-determinator-assets/CodeLlama-7b-Python" \
            "CodeLlama-7b-Python" \
            --recursive

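      # Start the CI container with GPU access and the workspace mounted, then run the
      # indexing script inside it.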
      - name: Run indexer
        shell: bash -l {0}
        env:
          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          AWS_DEFAULT_REGION: us-east-1
        run: |
          # detached container should get cleaned up by teardown_ec2_linux
          # Disable shellcheck warning for GPU_FLAG
          # shellcheck disable=SC2086
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e AWS_DEFAULT_REGION \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --detach \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          chmod +x pytorch/.github/scripts/td_llm_indexer.sh
          docker exec -t "${container_name}" sh -c 'pytorch/.github/scripts/td_llm_indexer.sh'

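      # Zip the generated indices, archive the previous latest/ index, and publish the
      # new one to S3.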
      - name: Upload to s3
        shell: bash -l {0}
        env:
          AWS_DEFAULT_REGION: us-east-1
        run: |
          cd llm-target-determinator/assets

          TIMESTAMP=$(date -Iseconds)
          ZIP_NAME="indexer-files-${TIMESTAMP}.zip"

          # Create a zipfile with all the generated indices
          zip -r "${ZIP_NAME}" indexer-files

          # Note that because the below 2 operations are not atomic, there will
          # be a period of a few seconds between them where there is no index
          # present in the latest/ folder. To account for this, the retriever
          # should have some retry logic with backoff to ensure fetching the
          # index doesn't fail.

          # Move the old index into the archived/ folder
          aws s3 mv \
            "s3://target-determinator-assets/indexes/latest" \
            "s3://target-determinator-assets/indexes/archived" \
            --recursive

          # Copy the new index into the latest/ folder
          aws s3 cp \
            "${ZIP_NAME}" \
            "s3://target-determinator-assets/indexes/latest/${ZIP_NAME}"

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always()

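# Cancel any in-progress run in the same group; manually dispatched runs are grouped
# separately from scheduled ones.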
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true