E2E (NVIDIA L40S x4) #316

Workflow file for this run

.github/workflows/e2e-nvidia-l40s-x4.yml at ccac4fd

	# SPDX-License-Identifier: Apache-2.0

	name: E2E (NVIDIA L40S x4)

	on:
	schedule:
	- cron: '0 16 * * *' # Runs at 4PM UTC every day
	workflow_dispatch:
	inputs:
	pr_or_branch:
	description: 'pull request number or branch name'
	required: true
	default: 'main'

	env:
	TMPDIR: /home/tmp

	jobs:
	start-large-ec2-runner:
	runs-on: ubuntu-latest
	outputs:
	label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
	ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
	ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
	steps:
	- name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
	uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
	with:
	repository: instructlab/ci-actions
	# clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
	path: ci-actions
	ref: release-v0.1
	sparse-checkout: \|
	actions/launch-ec2-runner-with-fallback

	- name: Launch EC2 Runner with Fallback
	id: launch-ec2-instance-with-fallback
	uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
	env:
	TMPDIR: "/tmp"
	with:
	aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
	regions_config: >
	[
	{
	"region": "us-east-2",
	"subnets": {
	"us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
	"us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
	"us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
	},
	"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
	"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
	},
	{
	"region": "us-east-1",
	"subnets": {
	"us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
	"us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
	"us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
	"us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
	"us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
	"us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
	},
	"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
	"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
	}
	]
	try_spot_instance_first: false
	ec2_instance_type: g6e.12xlarge
	aws_resource_tags: >
	[
	{"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
	{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
	{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
	{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
	]

	e2e-large-test:
	needs:
	- start-large-ec2-runner
	runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

	permissions:
	pull-requests: write

	steps:
	- name: "Harden Runner"
	# v2.10.1
	uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0
	with:
	egress-policy: audit
	- name: Install Packages
	run: \|
	cat /etc/os-release
	mkdir -p "${TMPDIR}"
	sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

	- name: Checkout instructlab/instructlab
	uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
	with:
	repository: "instructlab/instructlab"
	path: "instructlab"
	# https://github.com/actions/checkout/issues/249
	fetch-depth: 0

	- name: Checkout instructlab/training
	uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
	with:
	repository: "instructlab/training"
	path: "training"
	# https://github.com/actions/checkout/issues/249
	fetch-depth: 0

	- name: Determine if pr_or_branch is a PR number
	id: check_pr
	run: \|
	PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch \|\| 'main' }} # Default to 'main' if not set
	if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
	echo "is_pr=true" >> "$GITHUB_OUTPUT"
	else
	echo "is_pr=false" >> "$GITHUB_OUTPUT"
	fi
	echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"

	- name: Check if gh cli is installed
	id: gh_cli
	run: \|
	if command -v gh &> /dev/null ; then
	echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
	else
	echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
	fi

	- name: Install gh CLI
	if: steps.gh_cli.outputs.gh_cli_installed == 'false'
	run: \|
	sudo dnf install 'dnf-command(config-manager)' -y
	sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
	sudo dnf install gh --repo gh-cli -y

	- name: test gh CLI
	run: \|
	gh --version

	- name: set default repo
	working-directory: ./training
	run: \|
	gh repo set-default ${{ github.server_url }}/${{ github.repository }}
	env:
	GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

	- name: Add comment to PR
	if: steps.check_pr.outputs.is_pr == 'true'
	working-directory: ./training
	run: \|
	gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
	env:
	GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

	- name: Fetch and checkout PR
	if: steps.check_pr.outputs.is_pr == 'true'
	working-directory: ./training
	run: \|
	gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
	env:
	GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

	- name: Checkout branch
	if: steps.check_pr.outputs.is_pr == 'false'
	working-directory: ./training
	run: \|
	git checkout ${{ steps.check_pr.outputs.pr_or_branch }}

	- name: Install ilab
	working-directory: ./instructlab
	run: \|
	PYTHON=python3.11 ./scripts/install-ilab-with-cuda.sh

	- name: Update instructlab-training library
	working-directory: ./training
	run: \|
	. ../instructlab/venv/bin/activate
	# Patch out our own pin from the ilab repo constraints file
	ilab_constraints=../instructlab/constraints-dev.txt
	sed -i '/instructlab-training==/d' $ilab_constraints
	# Since we reuse the virtual environment prepared using ilab
	# constraints, we should stick to the same constraints when
	# installing latest training.
	#
	# FIX: this is not ideal; a proper fix would require decoupling the
	# two repos in CI: either by removing the job completely and relying
	# on "sdk" (no ilab) test runs; or by preparing a separate
	# constraints file that would consider both the requirements files
	# for the training library AND for the ilab - so that they are
	# consistent.
	pip_install="pip install -c $ilab_constraints"
	$pip_install .
	$pip_install .[cuda]

	- name: Check disk before tests
	if: always()
	run: \|
	df -h

	- name: Run e2e test
	working-directory: ./instructlab
	env:
	HF_TOKEN: ${{ secrets.HF_TOKEN }}
	OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
	run: \|
	. venv/bin/activate

	# set preserve to true so we can retain the logs
	./scripts/e2e-ci.sh -lp

	# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
	# and we know that it will be written into a directory created by `mktemp -d`.
	# Given this information, we can use the following command to find the file:
	log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
	phase_num=1;
	for log_file in $log_files; do
	mv "${log_file}" phase-${phase_num}-training-log.jsonl
	((phase_num++))
	done

	- name: Check disk after tests
	if: always()
	run: \|
	df -h

	- name: Upload training logs Phase 1
	uses: actions/upload-artifact@v4
	with:
	name: phase-1-training-log.jsonl
	path: ./instructlab/phase-1-training-log.jsonl
	retention-days: 1
	overwrite: true

	- name: Upload training logs Phase 2
	uses: actions/upload-artifact@v4
	with:
	name: phase-2-training-log.jsonl
	path: ./instructlab/phase-2-training-log.jsonl
	retention-days: 1
	overwrite: true

	- name: Add comment to PR if the workflow failed
	if: failure() && steps.check_pr.outputs.is_pr == 'true'
	working-directory: ./training
	run: \|
	gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
	env:
	GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

	- name: Add comment to PR if the workflow succeeded
	if: success() && steps.check_pr.outputs.is_pr == 'true'
	working-directory: ./training
	run: \|
	gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
	env:
	GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

	- name: Send Discord notification for failure
	if: failure() && steps.check_pr.outputs.is_pr == 'false'
	uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
	with:
	webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }}
	status: ${{ job.status }}
	title: "e2e-nvidia-l40s-x4"
	description: \|
	Job in ${{ github.repository }} running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed with failures ❌
	Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
	color: 0xCB2431 # Red color for failure

	- name: Send Discord notification for success
	if: success() && steps.check_pr.outputs.is_pr == 'false'
	uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
	with:
	webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }}
	status: ${{ job.status }}
	title: "e2e-nvidia-l40s-x4"
	description: \|
	Job in ${{ github.repository }} running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed successfully ✅
	Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
	color: 0x28A745 # Green color for success

	stop-large-ec2-runner:
	needs:
	- start-large-ec2-runner
	- e2e-large-test
	runs-on: ubuntu-latest
	if: ${{ always() }}
	steps:
	- name: Configure AWS credentials
	uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
	with:
	aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}

	- name: Stop EC2 runner
	uses: machulav/ec2-github-runner@8b37f736c69ba6af391e437447d3c07548478d78 # v2.4.0
	with:
	mode: stop
	github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
	label: ${{ needs.start-large-ec2-runner.outputs.label }}
	ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

	loss-graphs:
	needs:
	- stop-large-ec2-runner
	runs-on: ubuntu-latest
	if: ${{ always() }}
	steps:
	- name: "Harden Runner"
	# v2.10.1
	uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0
	with:
	egress-policy: audit

	- name: Configure AWS credentials
	uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
	with:
	aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	aws-region: ${{ vars.AWS_REGION }}

	- name: Download loss data Phase 1
	id: phase-1-download-logs
	uses: actions/download-artifact@v4
	with:
	name: phase-1-training-log.jsonl
	path: downloaded-data

	- name: Download loss data Phase 2
	id: phase-2-download-logs
	uses: actions/download-artifact@v4
	with:
	name: phase-2-training-log.jsonl
	path: downloaded-data

	- name: Checkout instructlab/training
	uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
	with:
	repository: "instructlab/training"
	path: "training"
	fetch-depth: 0

	- name: Install dependencies
	working-directory: ./training
	run: \|
	python -m pip install --upgrade pip
	pip install -r requirements-dev.txt -c constraints-dev.txt

	- name: Try to upload Phase 1 to s3
	id: phase-1-upload-s3
	continue-on-error: true
	run: \|
	python training/scripts/create-loss-graph.py \
	--log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
	--output-file "./phase-1-test.md" \
	--phase "1" \
	--aws-region "${{ vars.AWS_REGION }}" \
	--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
	--base-branch "${GITHUB_REF##*/}" \
	--head-sha "${{ github.sha }}" \
	--pr-number "${{ github.event.number }}" \
	--origin-repository "${{ github.repository }}"

	- name: Try to upload Phase 2 to s3
	id: phase-2-upload-s3
	continue-on-error: true
	run: \|
	python training/scripts/create-loss-graph.py \
	--log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
	--output-file "./phase-2-test.md" \
	--phase "2" \
	--aws-region "${{ vars.AWS_REGION }}" \
	--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
	--base-branch "${GITHUB_REF##*/}" \
	--head-sha "${{ github.sha }}" \
	--pr-number "${{ github.event.number }}" \
	--origin-repository "${{ github.repository }}"

	- name: Check Phase 1 S3 upload status for success
	if: steps.phase-1-upload-s3.outcome == 'success'
	run: \|
	echo "Uploaded Phase 1 loss graph to S3."
	cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

	- name: Check Phase 2 S3 upload status for success
	if: steps.phase-2-upload-s3.outcome == 'success'
	run: \|
	echo "Uploaded Phase 2 loss graph to S3."
	cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

	- name: Check Phase 1 S3 upload status for failure
	if: steps.phase-1-upload-s3.outcome == 'failure'
	run: \|
	echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
	echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

	- name: Check Phase 2 S3 upload status for failure
	if: steps.phase-2-upload-s3.outcome == 'failure'
	run: \|
	echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
	echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

E2E (NVIDIA L40S x4) #316

Workflow file

E2E (NVIDIA L40S x4) #316

Uh oh!

Jobs

Run details

Workflow file for this run