From 81176527dd351c93ef21ea60a149ff2d18897eb0 Mon Sep 17 00:00:00 2001
From: Thomas Hardy
Date: Thu, 4 Dec 2025 16:42:51 -0800
Subject: [PATCH 1/3] upload nightly tps metrics to S3

---
NOTE(review): this series was restored from a copy whose line breaks and
indentation had been collapsed. Indentation, blank context lines and
whitespace-only lines are reconstructed; confirm them against the target
workflow file before applying.

 .../workflows/nightly-throughput-stress.yml   | 40 +++++++++++++++++--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/nightly-throughput-stress.yml b/.github/workflows/nightly-throughput-stress.yml
index c63f27491..efc377ad9 100644
--- a/.github/workflows/nightly-throughput-stress.yml
+++ b/.github/workflows/nightly-throughput-stress.yml
@@ -4,9 +4,6 @@ on:
   schedule:
     # Run at 3 AM PST (11:00 UTC) - offset from existing nightly
     - cron: '00 11 * * *'
-  push:
-    branches:
-      - nightly_tps
   workflow_dispatch:
     inputs:
       duration:
@@ -28,12 +25,16 @@ on:
 permissions:
   contents: read
   actions: write
+  id-token: write
 
 env:
   # Workflow configuration
   TEST_DURATION: ${{ inputs.duration || vars.NIGHTLY_TEST_DURATION || '5h' }}
   TEST_TIMEOUT: ${{ inputs.timeout || vars.NIGHTLY_TEST_TIMEOUT || '5h30m' }}
 
+  # AWS S3 metrics upload ARN
+  AWS_S3_METRICS_UPLOAD_ROLE_ARN: ${{ vars.AWS_S3_METRICS_UPLOAD_ROLE_ARN }}
+
   # Logging and artifacts
   WORKER_LOG_DIR: /tmp/throughput-stress-logs
 
@@ -105,6 +106,14 @@ jobs:
       - name: Install Temporal CLI
         uses: temporalio/setup-temporal@v0
 
+      - name: Install Prometheus
+        run: |
+          PROM_VERSION="3.8.0"
+          wget -q https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz
+          tar xzf prometheus-${PROM_VERSION}.linux-amd64.tar.gz
+          sudo mv prometheus-${PROM_VERSION}.linux-amd64/prometheus /usr/local/bin/
+          prometheus --version
+
       - name: Setup log directory
         run: mkdir -p $WORKER_LOG_DIR
 
@@ -137,6 +146,11 @@ jobs:
             --duration $TEST_DURATION \
             --timeout $TEST_TIMEOUT \
             --max-concurrent 10 \
+            --prom-listen-address 127.0.0.1:9091 \
+            --worker-prom-listen-address 127.0.0.1:9092 \
+            --prom-instance-addr 127.0.0.1:9090 \
+            --prom-instance-config \
+            --prom-export-worker-metrics $RUN_ID.parquet \
             --option internal-iterations=10 \
             --option continue-as-new-after-iterations=3 \
             --option sleep-time=1s \
@@ -144,6 +158,26 @@ jobs:
             --option min-throughput-per-hour=1000 \
             2>&1 | tee $WORKER_LOG_DIR/scenario.log
 
+      - name: Configure AWS credentials
+        if: always()
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ env.AWS_S3_METRICS_UPLOAD_ROLE_ARN }}
+          aws-region: us-west-2
+      
+      - name: Upload metrics to S3
+        if: always()
+        run: |
+          DATE=$(date +%Y-%m-%d)
+          # Use test/ prefix on non-main branches
+          PREFIX="language=typescript/date=$DATE"
+          if [[ "${{ github.ref }}" != "refs/heads/main" ]]; then
+            PREFIX="test/$PREFIX"
+          fi
+          aws s3 cp omes/$RUN_ID.parquet \
+            "s3://cloud-data-ingest-prod/github/sdk_load_test/$PREFIX/$RUN_ID.parquet"
+
+
       - name: Upload logs on failure
         if: failure() || cancelled()
         uses: actions/upload-artifact@v4
From 13bbeb223e6b6d952de6249d4ec2d59029494013 Mon Sep 17 00:00:00 2001
From: Thomas Hardy
Date: Fri, 5 Dec 2025 15:13:44 -0800
Subject: [PATCH 2/3] use env vars instead of var interpolation, add is_experiment input

---
Review note: the upload script in this patch tests "$GH_REF" and
"$IS_EXPERIMENT_INPUT", but nothing in the series defined them. Both expanded
to empty strings, so the branch test was always true and every run, including
the scheduled run on main, was uploaded under is_experiment=true; the new
is_experiment input was never read. This revision adds a step-level env block
that sets GH_REF from github.ref and IS_EXPERIMENT_INPUT from
inputs.is_experiment. The last hunk header and the diffstat below account for
the added lines. The index blob ids in this patch and in 3/3 were not
regenerated and no longer match the amended content.

 .../workflows/nightly-throughput-stress.yml   | 31 +++++++++++++-----
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/nightly-throughput-stress.yml b/.github/workflows/nightly-throughput-stress.yml
index efc377ad9..0a30d2013 100644
--- a/.github/workflows/nightly-throughput-stress.yml
+++ b/.github/workflows/nightly-throughput-stress.yml
@@ -21,10 +21,14 @@ on:
         required: false
         default: 360
         type: number
+      is_experiment:
+        description: 'Mark this run as an experiment (excluded from nightly dashboards)'
+        required: false
+        default: false
+        type: boolean
 
 permissions:
   contents: read
-  actions: write
   id-token: write
 
 env:
@@ -43,6 +47,12 @@ env:
   OMES_REF: main
   RUN_ID: ${{ github.run_id }}-throughput-stress
 
+  # Prometheus version
+  PROM_VERSION: "3.8.0"
+
+  # Language
+  SDK_LANG: "typescript"
+
 jobs:
   throughput-stress:
     runs-on: ubuntu-latest-4-cores
@@ -108,7 +118,6 @@ jobs:
 
       - name: Install Prometheus
         run: |
-          PROM_VERSION="3.8.0"
           wget -q https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz
           tar xzf prometheus-${PROM_VERSION}.linux-amd64.tar.gz
           sudo mv prometheus-${PROM_VERSION}.linux-amd64/prometheus /usr/local/bin/
@@ -140,7 +149,7 @@ jobs:
           # to give CI a bit more time for visibility consistency
           go run ./cmd run-scenario-with-worker \
             --scenario throughput_stress \
-            --language typescript \
+            --language $SDK_LANG \
             --version $(pwd)/.. \
             --run-id $RUN_ID \
             --duration $TEST_DURATION \
@@ -169,15 +178,19 @@ jobs:
         if: always()
         run: |
           DATE=$(date +%Y-%m-%d)
-          # Use test/ prefix on non-main branches
-          PREFIX="language=typescript/date=$DATE"
-          if [[ "${{ github.ref }}" != "refs/heads/main" ]]; then
-            PREFIX="test/$PREFIX"
+          IS_EXPERIMENT="false"
+          # Set as an experiment if we are not on the main branch or input as an experiment
+          if [[ "$GH_REF" != "refs/heads/main" || "$IS_EXPERIMENT_INPUT" == "true" ]]; then
+            IS_EXPERIMENT="true"
           fi
+          echo "Uploading metrics: is_experiment=$IS_EXPERIMENT, language=$SDK_LANG, date=$DATE"
           aws s3 cp omes/$RUN_ID.parquet \
-            "s3://cloud-data-ingest-prod/github/sdk_load_test/$PREFIX/$RUN_ID.parquet"
-
+            "s3://cloud-data-ingest-prod/github/sdk_load_test/is_experiment=$IS_EXPERIMENT/language=$SDK_LANG/date=$DATE/$RUN_ID.parquet"
+        env:
+          # Values the upload script reads as $GH_REF and $IS_EXPERIMENT_INPUT
+          GH_REF: ${{ github.ref }}
+          IS_EXPERIMENT_INPUT: ${{ inputs.is_experiment }}
 
       - name: Upload logs on failure
         if: failure() || cancelled()
         uses: actions/upload-artifact@v4
From 76deb9e2d55a081f6bb7a6bc6c3a8e207a4ea5ea Mon Sep 17 00:00:00 2001
From: Thomas Hardy
Date: Fri, 5 Dec 2025 15:54:38 -0800
Subject: [PATCH 3/3] formatting

---
 .github/workflows/nightly-throughput-stress.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/nightly-throughput-stress.yml b/.github/workflows/nightly-throughput-stress.yml
index 0a30d2013..f9cb4c765 100644
--- a/.github/workflows/nightly-throughput-stress.yml
+++ b/.github/workflows/nightly-throughput-stress.yml
@@ -48,10 +48,10 @@ env:
   RUN_ID: ${{ github.run_id }}-throughput-stress
 
   # Prometheus version
-  PROM_VERSION: "3.8.0"
+  PROM_VERSION: '3.8.0'
 
   # Language
-  SDK_LANG: "typescript"
+  SDK_LANG: 'typescript'
 
 jobs:
   throughput-stress:
@@ -173,7 +173,7 @@ jobs:
         with:
           role-to-assume: ${{ env.AWS_S3_METRICS_UPLOAD_ROLE_ARN }}
           aws-region: us-west-2
-      
+
       - name: Upload metrics to S3
         if: always()
         run: |