From d1809cd4076ce3d1a9e038e9a05bdb36c50f0d21 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 10 Dec 2025 17:02:26 +0100 Subject: [PATCH 1/5] add new docs for jobs and add index --- docs/jobs/_toctree.yml | 41 +++++++++++ docs/jobs/configuration.md | 0 docs/jobs/data1.md | 0 docs/jobs/docker.md | 0 docs/jobs/frameworks.md | 0 docs/jobs/index.md | 11 +++ docs/jobs/inference1.md | 0 docs/jobs/manage.md | 0 docs/jobs/pricing.md | 0 docs/jobs/quickstart.md | 144 +++++++++++++++++++++++++++++++++++++ docs/jobs/schedule.md | 0 docs/jobs/training1.md | 0 docs/jobs/webhooks.md | 0 13 files changed, 196 insertions(+) create mode 100644 docs/jobs/_toctree.yml create mode 100644 docs/jobs/configuration.md create mode 100644 docs/jobs/data1.md create mode 100644 docs/jobs/docker.md create mode 100644 docs/jobs/frameworks.md create mode 100644 docs/jobs/index.md create mode 100644 docs/jobs/inference1.md create mode 100644 docs/jobs/manage.md create mode 100644 docs/jobs/pricing.md create mode 100644 docs/jobs/quickstart.md create mode 100644 docs/jobs/schedule.md create mode 100644 docs/jobs/training1.md create mode 100644 docs/jobs/webhooks.md diff --git a/docs/jobs/_toctree.yml b/docs/jobs/_toctree.yml new file mode 100644 index 000000000..4fe0875ec --- /dev/null +++ b/docs/jobs/_toctree.yml @@ -0,0 +1,41 @@ +- local: index + title: Hugging Face Jobs + +- title: Overview + sections: + - local: index + title: Hugging Face Jobs + - local: quickstart + title: Quickstart + - local: docker + title: Docker + - local: schedule + title: Schedule Jobs + - local: webhooks + title: Webhook Automation + - local: pricing + title: Pricing and Billing + +- title: Tutorials + sections: + - title: Training + sections: + - local: training1 + title: Training Tuto 1 + - title: Inference + sections: + - local: inference1 + title: Inference Tuto 1 + - title: Data + sections: + - local: data1 + title: Data Tuto 1 + +- title: Guides + sections: + - local: manage + title: Manage Jobs + - 
local: configuration + title: Configuration + - local: frameworks + title: Frameworks Setups diff --git a/docs/jobs/configuration.md b/docs/jobs/configuration.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/jobs/data1.md b/docs/jobs/data1.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/jobs/docker.md b/docs/jobs/docker.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/jobs/frameworks.md b/docs/jobs/frameworks.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/jobs/index.md b/docs/jobs/index.md new file mode 100644 index 000000000..4f1541d45 --- /dev/null +++ b/docs/jobs/index.md @@ -0,0 +1,11 @@ +# Hugging Face Jobs + +Run compute jobs on Hugging Face infrastructure with a familiar UV & Docker-like interface! + +

UV & Docker-like CLI

uv,run,ps,logs,inspect

Any Hardware

CPUs to A100s & TPUs

Run Anything

UV, Docker, HF Spaces & more

Pay-as-you-go

Pay only for seconds used

+ +The Hugging Face Hub provides compute for AI and data workflows via Jobs. + +Jobs runs on Hugging Face infrastructure and aims at providing AI builders, Data engineers, developers and AI agents easy access to cloud infrastructure to run their workloads. They are ideal to fine-tune AI models and run inference with GPUs, but also for data ingestion and processing as well. + +A job is defined with a command to run (e.g. a UV or python command), a hardware flavor (CPU, GPU, TPU), and optionally a Docker Image from Hugging Face Spaces or Docker Hub. Many jobs can run in parallel, which is useful e.g. for parameters tuning or parallel inference and data processing. diff --git a/docs/jobs/inference1.md b/docs/jobs/inference1.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/jobs/manage.md b/docs/jobs/manage.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/jobs/pricing.md b/docs/jobs/pricing.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/jobs/quickstart.md b/docs/jobs/quickstart.md new file mode 100644 index 000000000..21291442c --- /dev/null +++ b/docs/jobs/quickstart.md @@ -0,0 +1,144 @@ +# Quickstart + +In this guide you will run a Job to fine-tune an open source model on Hugging Face infrastructure in only a few minutes. Make sure you are logged in to Hugging Face and have access to your [Jobs page](https://huggingface.co/settings/jobs). +
+ + +
+ +## Getting started + +First install the Hugging Face CLI: + +1. Install the CLI + +```bash +curl -LsSf https://hf.co/cli/install.sh | bash +``` + +Install the CLI (using Homebrew) + +```bash +brew install huggingface-cli +``` + +Install the CLI (using uv) + +```bash +uv tool install hf +``` + +2. Login to your Hugging Face account: + +Login + +```bash +hf auth login +``` + +3. Create your first jobs using the `hf jobs` command: + +Run a UV command or script + +```bash +hf jobs uv run python -c 'print("Hello from the cloud!")' +``` + +```bash +hf jobs uv run path/to/script.py +``` + +Run a Docker command + +```bash +hf jobs run python:3.12 python -c 'print("Hello from the cloud!")' +``` + +4. Monitor your job + +The job logs appear in your terminal, but you can also see the job in your jobs page. Open the job page to see the job information, status and logs: + +
+ + +
+ +
+ + +
+ + +## The training script + +Here is a simple training script to fine-tune a base model to a conversational model using Supervised Fine-Tuning (SFT). It uses the [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) model and the [trl-lib/Capybara](https://huggingface.co/datasets/trl-lib/Capybara) dataset, and the [TRL](https://huggingface.co/docs/trl/en/index) library, and saves the resulting model to your Hugging Face account under the name `"Qwen2.5-0.5B-SFT"`: + +```python +from datasets import load_dataset +from trl import SFTTrainer + +dataset = load_dataset("trl-lib/Capybara", split="train") +trainer = SFTTrainer( + model="Qwen/Qwen2.5-0.5B", + train_dataset=dataset, +) +trainer.train() +trainer.push_to_hub("Qwen2.5-0.5B-SFT") +``` + +Save this script as `train.py` + +## Run the training job + +`hf jobs` takes several arguments: select the hardware with `--flavor`, and pass environment variable with `--env` and `--secrets`. Here we use the A100 Large GPU flavor with `--flavor a100-large` and pass your Hugging Face token as a secret with `--secrets HF_TOKEN` in order to be able to push the resulting model to your account. + +Moreover, UV accepts the `--with` argument to define python dependencies, so we use `--with trl` to have the `trl` library available. + +You can now run the final command which looks like this: + +```bash +hf jobs uv run \ + --flavor a100-large \ + --with trl \ + --secrets HF_TOKEN \ + train.py +``` + +The logs appear in your terminal, and you can safely Ctrl+C to stop streaming the logs, the job will keep running. + +``` +... 
+Downloaded nvidia-cudnn-cu12 +Downloaded torch +Installed 66 packages in 233ms +Generating train split: 100%|██████████| 15806/15806 [00:00<00:00, 76686.50 examples/s] +Generating test split: 100%|██████████| 200/200 [00:00<00:00, 43880.36 examples/s] +Tokenizing train dataset: 100%|██████████| 15806/15806 [00:41<00:00, 384.97 examples/s] +Truncating train dataset: 100%|██████████| 15806/15806 [00:00<00:00, 212272.92 examples/s] +The model is already on multiple devices. Skipping the move to device specified in `args`. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}. +{'loss': 1.7357, 'grad_norm': 4.8733229637146, 'learning_rate': 1.9969635627530365e-05, 'entropy': 1.7238958358764649, 'num_tokens': 59528.0, 'mean_token_accuracy': 0.6124177813529968, 'epoch': 0.01} +{'loss': 1.6239, 'grad_norm': 6.200186729431152, 'learning_rate': 1.9935897435897437e-05, 'entropy': 1.644005584716797, 'num_tokens': 115219.0, 'mean_token_accuracy': 0.6259662985801697, 'epoch': 0.01} +{'loss': 1.4449, 'grad_norm': 6.167325496673584, 'learning_rate': 1.990215924426451e-05, 'entropy': 1.5156117916107177, 'num_tokens': 171787.0, 'mean_token_accuracy': 0.6586395859718323, 'epoch': 0.02} +{'loss': 1.6023, 'grad_norm': 5.133708953857422, 'learning_rate': 1.986842105263158e-05, 'entropy': 1.6885507702827454, 'num_tokens': 226067.0, 'mean_token_accuracy': 0.6271904468536377, 'epoch': 0.02} +``` + +Follow the Job advancements on the job page on Hugging Face: + + +
+ + +
+ +Once the job is done, find your model on your account: + +
+ + +
+ +Congrats! You just ran your first Job to fine-tune an open source model 🔥 + +Feel free to try out your model locally and evaluate it using e.g. [transformers](https://huggingface.co/docs/transformers) by clicking on "Use this model", or deploy it to [Inference Endpoints](https://huggingface.co/docs/inference-endpoints) in one click using the "Deploy" button. diff --git a/docs/jobs/schedule.md b/docs/jobs/schedule.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/jobs/training1.md b/docs/jobs/training1.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/jobs/webhooks.md b/docs/jobs/webhooks.md new file mode 100644 index 000000000..e69de29bb From 7b947ee8862535d7270c72d967ef2933149f3039 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 11 Dec 2025 19:18:50 +0100 Subject: [PATCH 2/5] quickstart and pricing --- docs/jobs/_toctree.yml | 10 ++++------ docs/jobs/docker.md | 0 docs/jobs/index.md | 28 ++++++++++++++++++++++++++++ docs/jobs/pricing.md | 29 +++++++++++++++++++++++++++++ docs/jobs/quickstart.md | 39 ++++++++++++++++++++------------------- 5 files changed, 81 insertions(+), 25 deletions(-) delete mode 100644 docs/jobs/docker.md diff --git a/docs/jobs/_toctree.yml b/docs/jobs/_toctree.yml index 4fe0875ec..dc7e6414f 100644 --- a/docs/jobs/_toctree.yml +++ b/docs/jobs/_toctree.yml @@ -7,12 +7,6 @@ title: Hugging Face Jobs - local: quickstart title: Quickstart - - local: docker - title: Docker - - local: schedule - title: Schedule Jobs - - local: webhooks - title: Webhook Automation - local: pricing title: Pricing and Billing @@ -39,3 +33,7 @@ title: Configuration - local: frameworks title: Frameworks Setups + - local: schedule + title: Schedule Jobs + - local: webhooks + title: Webhook Automation diff --git a/docs/jobs/docker.md b/docs/jobs/docker.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/jobs/index.md b/docs/jobs/index.md index 4f1541d45..ea3fa577b 100644 --- a/docs/jobs/index.md +++ 
b/docs/jobs/index.md @@ -9,3 +9,31 @@ The Hugging Face Hub provides compute for AI and data workflows via Jobs. Jobs runs on Hugging Face infrastructure and aims at providing AI builders, Data engineers, developers and AI agents easy access to cloud infrastructure to run their workloads. They are ideal to fine-tune AI models and run inference with GPUs, but also for data ingestion and processing as well. A job is defined with a command to run (e.g. a UV or python command), a hardware flavor (CPU, GPU, TPU), and optionally a Docker Image from Hugging Face Spaces or Docker Hub. Many jobs can run in parallel, which is useful e.g. for parameters tuning or parallel inference and data processing. + +## Run Jobs from anywhere + +There are multiple tools you can use to run jobs: + +* the `hf` Command Line Interface (see the [CLI installation steps](https://huggingface.co/docs/huggingface_hub/main/en/guides/cli) and the [Jobs CLI documentation](https://huggingface.co/docs/huggingface_hub/guides/cli#hf-jobs) for more information) +* the `huggingface_hub` Python client (see the [`huggingface_hub` Jobs documentation](https://huggingface.co/docs/huggingface_hub/guides/jobs) for more information) +* the Jobs HTTP API (see the [Jobs HTTP API documentation](./http) for more information) + +## Run any workload + +The `hf` Jobs CLI and the `huggingface_hub` Python client offer a UV-like interface to run Python workloads. UV installs the required Python dependencies and runs the Python script in a single command. Python dependencies may also be defined in a self-contained UV script, and in this case there is no need to specify anything but the UV script to run the Job. + +```diff +- uv run ++ hf jobs uv run +``` + +More generally, Hugging Face Jobs supports any workload based on Docker and a command. Jobs offers a Docker-like interface to run Jobs, where you can specify a Docker image from Hugging Face Spaces or Docker Hub, as well as the command to run. 
Docker provides the ability to package ready-to-use environments as Docker images that are shared by the community or custom made. Therefore you may choose or define your Docker image based on what your workloads need (e.g. python, torch, vllm) and run any command. This is more advanced than using UV but provides more flexibility. + +```diff +- docker run ++ hf jobs run +``` + +## Automate Jobs + +Trigger Jobs automatically with a schedule or using webhooks. With a schedule, you can run Jobs every X minutes, hours, days, weeks or months. Scheduling Jobs uses the `cron` syntax like `"*/5 * * * *"` for "every 5 minutes", or aliases like `"@hourly"`, `"@daily"`, `"@weekly"` or `"@monthly"`. With webhooks, Jobs can run whenever there is an update on Hugging Face. For example you can configure webhooks to trigger for every model update under a given account, and retrieve the updated model from the webhook payload in the Job. diff --git a/docs/jobs/pricing.md b/docs/jobs/pricing.md index e69de29bb..9d3519080 100644 --- a/docs/jobs/pricing.md +++ b/docs/jobs/pricing.md @@ -0,0 +1,29 @@ +# Pricing and Billing + +Billing on Jobs is based on hardware usage and is computed by the minute: you get charged for every minute the Job runs on the requested hardware. + +During a Job’s lifecycle, it is only billed when the Job is Starting or Running. This means that there is no cost during build. + +If a running Job starts to fail, it will be automatically suspended and the billing will stop. + +Jobs have a timeout of 30 minutes by default. You can change this behavior by setting a custom `timeout` when creating the Job. For example in the CLI: + +```bash +hf jobs run --timeout 3h ... +``` + +You can look at your current billing information for Jobs in your [Billing](https://huggingface.co/settings/billing) page, under the "Compute Usage" section: + +
+ + +
+ +To interrupt the billing on a Job, you can cancel it: + +
+ + +
+ +Additional information about billing can be found in the dedicated Hub-wide section. diff --git a/docs/jobs/quickstart.md b/docs/jobs/quickstart.md index 21291442c..1102effdb 100644 --- a/docs/jobs/quickstart.md +++ b/docs/jobs/quickstart.md @@ -3,8 +3,8 @@ In this guide you will run a Job to fine-tune an open source model on Hugging Face infrastastructure in only a few minutes. Make sure you are logged in to Hugging Face and have access to your [Jobs page](https://huggingface.co/settings/jobs).
- - + +
## Getting started @@ -14,19 +14,19 @@ First install the Hugging Face CLI: 1. Install the CLI ```bash -curl -LsSf https://hf.co/cli/install.sh | bash +>>> curl -LsSf https://hf.co/cli/install.sh | bash ``` Install the CLI (using Homebrew) ```bash -brew install huggingface-cli +>>> brew install huggingface-cli ``` Install the CLI (using uv) ```bash -uv tool install hf +>>> uv tool install hf ``` 2. Login to your Hugging Face account: @@ -34,7 +34,7 @@ uv tool install hf Login ```bash -hf auth login +>>> hf auth login ``` 3. Create your first jobs using the `hf jobs` command: @@ -42,27 +42,28 @@ hf auth login Run a UV command or script ```bash -hf jobs uv run python -c 'print("Hello from the cloud!")' +>>> hf jobs uv run python -c 'print("Hello from the cloud!")' +Job started with ID: 693aef401a39f67af5a41c0e +View at: https://huggingface.co/jobs/lhoestq/693aef401a39f67af5a41c0e +Hello from the cloud! ``` ```bash -hf jobs uv run path/to/script.py +>>> hf jobs uv run path/to/script.py ``` Run a Docker command ```bash -hf jobs run python:3.12 python -c 'print("Hello from the cloud!")' +>>> hf jobs run ubuntu echo 'Hello from the cloud!' +Job started with ID: 693aee76c67c9f186cfe233e +View at: https://huggingface.co/jobs/lhoestq/693aee76c67c9f186cfe233e +Hello from the cloud! ``` -4. Monitor your job +4. Check your first jobs -The job logs appear in your terminal, but you can also see the job in your jobs page. Open the job page to see the job information, status and logs: - -
- - -
+The job logs appear in your terminal, but you can also see them in your jobs page. Open the job page to see the job information, status and logs:
@@ -87,7 +88,7 @@ trainer.train() trainer.push_to_hub("Qwen2.5-0.5B-SFT") ``` -Save this script as `train.py` +Save this script as `train.py`, and we can now run it with UV on Hugging Face Jobs. ## Run the training job @@ -135,8 +136,8 @@ Follow the Job advancements on the job page on Hugging Face: Once the job is done, find your model on your account:
- - + +
Congrats ! You just run your first Job to fine-tune an open source model 🔥 From 45b6ad6eb81ff606319d67b3cac9d83594be0452 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 15 Dec 2025 20:29:04 +0100 Subject: [PATCH 3/5] add manage page --- docs/jobs/manage.md | 173 +++++++++++++++++++++++++++++++++++++++++++ docs/jobs/pricing.md | 10 ++- 2 files changed, 182 insertions(+), 1 deletion(-) diff --git a/docs/jobs/manage.md b/docs/jobs/manage.md index e69de29bb..9be23a346 100644 --- a/docs/jobs/manage.md +++ b/docs/jobs/manage.md @@ -0,0 +1,173 @@ +## Manage Jobs + +### List Jobs + +Find your list of Jobs in the Jobs page or your organization Jobs page (user/organization page > settings > Jobs): + + +
+ + +
+ +It is also available in the Hugging Face CLI. Show the list of running Jobs with `hf jobs ps` and use `-a` to show all the Jobs: + +```bash +>>> hf jobs ps +JOB ID IMAGE/SPACE COMMAND CREATED STATUS +------------ ---------------- ----------- ------------------- ------- +69402ea6c... ghcr.io/astra... uv run p... 2025-12-15 15:52:06 RUNNING +>>> hf jobs ps -a +JOB ID IMAGE/SPACE COMMAND CREATED STATUS +------------ ---------- --------------- ------------------- --------- +69402ea6c... ghcr.io... uv run pytho... 2025-12-15 15:52:06 RUNNING +693b06b8c... ghcr.io... uv run pytho... 2025-12-11 18:00:24 CANCELED +693b069fc... ghcr.io... uv run pytho... 2025-12-11 17:59:59 ERROR +693aef401... ghcr.io... uv run pytho... 2025-12-11 16:20:16 COMPLETED +693aee76c... ubuntu echo Hello f... 2025-12-11 16:16:54 COMPLETED +693ae8e3c... python:... python -c pr... 2025-12-11 15:53:07 COMPLETED +``` + +You can filter Jobs based on conditions provided, using the format key=value: + +```bash +>>> hf jobs ps --filter status=error -a +JOB ID IMAGE/SPACE COMMAND CREATED STATUS +------------ ---------- ------------------ ------------------- ------ +693b069fc... ghcr.io... uv run python -... 2025-12-11 17:59:59 ERROR +693996dec... ghcr.io... bash -c python ... 2025-12-10 15:50:54 ERROR +69399695c... ghcr.io... uv run --with t... 2025-12-10 15:49:41 ERROR +693994bdc... ghcr.io... uv run --with t... 2025-12-10 15:41:49 ERROR +68d3c1af3... ghcr.io... uv run bash -c ... 2025-09-24 10:02:23 ERROR +>>> hf jobs ps --filter "command=*train.py" --filter status=error -a +JOB ID IMAGE/SPACE COMMAND CREATED STATUS +------------ ------------ ---------------- ------------------- ------ +69399695c... ghcr.io/a... uv run --with... 2025-12-10 15:49:41 ERROR +693994bdc... ghcr.io/a... uv run --with... 
2025-12-10 15:41:49 ERROR +``` + +Specify your organization `namespace` to list Jobs under your organization: + +```bash +>>> hf jobs ps --namespace +``` + + +### Inspect a Job + +You can see the status logs of a Job in the Job page: + +
+ + +
+ +Alternatively using the CLI + +```bash +>>> hf jobs inspect 693994e21a39f67af5a41ad0 +[ + { + "id": "693994e21a39f67af5a41ad0", + "created_at": "2025-12-10 15:42:26.835000+00:00", + "docker_image": "ghcr.io/astral-sh/uv:python3.12-bookworm", + "space_id": null, + "command": ["bash", "-c", "python -c \"import urllib.request; import os; from pathlib import Path; o = urllib.request.build_opener(); o.addheaders = [(\\\"Authorization\\\", \\\"Bearer \\\" + os.environ[\\\"UV_SCRIPT_HF_TOKEN\\\"])]; Path(\\\"/tmp/script.py\\\").write_bytes(o.open(os.environ[\\\"UV_SCRIPT_URL\\\"]).read())\" && uv run --with trl /tmp/script.py"], + "arguments": [], + "environment": {"UV_SCRIPT_URL": "https://huggingface.co/datasets/lhoestq/hf-cli-jobs-uv-run-scripts/resolve/728cc5682eb402d7ffe66a2f6f97645b34cb08dd/train.py"}, + "secrets": ["HF_TOKEN", "UV_SCRIPT_HF_TOKEN"], + "flavor": "a100-large", + "status": {"stage": "COMPLETED", "message": null}, + "owner": {"id": "5e9ecfc04957053f60648a3e", "name": "lhoestq", "type": "user"}, + "endpoint": "https://huggingface.co", + "url": "https://huggingface.co/jobs/lhoestq/693994e21a39f67af5a41ad0" + } +] +``` + +and for the logs + +```bash +>>> hf jobs logs 693994e21a39f67af5a41ad0 +Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB) +Downloading numpy (15.8MiB) +Downloading nvidia-cuda-cupti-cu12 (9.8MiB) +Downloading tokenizers (3.1MiB) +Downloading nvidia-cusolver-cu12 (255.1MiB) +Downloading nvidia-cufft-cu12 (184.2MiB) +Downloading transformers (11.4MiB) +Downloading setuptools (1.1MiB) +... +``` + +Specify your organization `namespace` to inspect a Job under your organization: + +```bash +hf jobs inspect --namespace +hf jobs logs --namespace +``` + +### Debug a Job + +If a Job has an error, you can see it in on the Job page + +
+ + +
+ +Look at the status message and the logs on the Job page to see what went wrong. + +You may also look at the last lines of logs to see what happened before a Job failed. You can see that in the Job page, or using the CLI: + +```bash +>>> hf jobs logs 69405cf51a39f67af5a41f29 | tail -n 10 + Downloaded nvidia-cudnn-cu12 + Downloaded torch +Installed 66 packages in 226ms +Generating train split: 100%|██████████| 15806/15806 [00:00<00:00, 73330.17 examples/s] +Generating test split: 100%|██████████| 200/200 [00:00<00:00, 45427.32 examples/s] +Traceback (most recent call last): + File "/tmp/script.py", line 7, in + train_dataset=train_dataset, + ^^^^^^^^^^^^^ +NameError: name 'train_dataset' is not defined. Did you mean: 'load_dataset'? +``` + +Debug a Job locally using your local UV or Docker setup: + +* `hf jobs uv run ...` -> `uv run ...` +* `hf jobs run ...` -> `docker run ...` + +The status message can say "Job timeout": it means the Job didn't finish in time before the timeout (the default is 30min) and therefore it was stopped. +In this case you need to specify a higher timeout, using `--timeout` in the CLI, e.g. + +```bash +hf jobs uv run --timeout 3h ... +``` + +### Cancel Jobs + +Use the "Cancel" button on the Job page to cancel a Job: + +
+ + +
+ +or in the CLI: + +```bash +hf jobs cancel 693b06b8c67c9f186cfe239e +``` + +
+ + +
+ +Specify your organization `namespace` to cancel a Job under your organization: + +```bash +hf jobs cancel --namespace +``` diff --git a/docs/jobs/pricing.md b/docs/jobs/pricing.md index 9d3519080..3f05477e0 100644 --- a/docs/jobs/pricing.md +++ b/docs/jobs/pricing.md @@ -6,7 +6,15 @@ During a Job’s lifecycle, it is only billed when the Job is Starting or Runnin If a running Job starts to fail, it will be automatically suspended and the billing will stop. -Jobs have a timeout of 30 minutes by default. You can change this behavior by setting a custom `timeout` when creating the Job. For example in the CLI: +Billing is done to the user's namespace by default, but you can bill to your organization instead by specifying the right `namespace`: + +```bash +hf jobs run --namespace my-org-name ... +``` + +In this case the Job runs under the organization account, and you can see it in your organization Jobs page (organization page > settings > Jobs). + +Moreover, Jobs have a timeout of 30 minutes by default. You can change this behavior by setting a custom `timeout` when creating the Job. For example in the CLI: ```bash hf jobs run --timeout 3h 
From 11e0686a9d0783e768e58f01cb2d075931683d1d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 17 Dec 2025 16:16:57 +0100 Subject: [PATCH 4/5] add api, cli, config, frameworks, python, schedule, webhooks --- docs/jobs/_toctree.yml | 9 ++ docs/jobs/api.md | 66 +++++++++++++++ docs/jobs/cli.md | 9 ++ docs/jobs/configuration.md | 167 +++++++++++++++++++++++++++++++++++++ docs/jobs/data1.md | 1 + docs/jobs/frameworks.md | 28 +++++++ docs/jobs/index.md | 2 +- docs/jobs/inference1.md | 1 + docs/jobs/python.md | 8 ++ docs/jobs/schedule.md | 43 ++++++++++ docs/jobs/training1.md | 1 + docs/jobs/webhooks.md | 32 +++++++ 12 files changed, 366 insertions(+), 1 deletion(-) create mode 100644 docs/jobs/api.md create mode 100644 docs/jobs/cli.md create mode 100644 docs/jobs/python.md diff --git a/docs/jobs/_toctree.yml b/docs/jobs/_toctree.yml index dc7e6414f..e4d333ff0 100644 --- a/docs/jobs/_toctree.yml +++ b/docs/jobs/_toctree.yml @@ -37,3 +37,12 @@ title: Schedule Jobs - local: webhooks title: Webhook Automation + +- title: Reference + sections: + - local: cli + title: Command Line Interface (CLI) + - local: python + title: Python client + - local: api + title: Jobs API Endpoints diff --git a/docs/jobs/api.md b/docs/jobs/api.md new file mode 100644 index 000000000..bef377acf --- /dev/null +++ b/docs/jobs/api.md @@ -0,0 +1,66 @@ +# Jobs API Endpoints + +The Jobs HTTP API Endpoints are available under `https://huggingface.co/api/jobs`. + +Authenticate using a Hugging face token with the permission to start and manage Jobs under your namespace (your account or organization). +Pass the token as a Bearer token with the header: `"Authorization: Bearer {token}"`. + +Here is a list of available endpoints and arguments: + +## Jobs + +* POST `https://huggingface.co/api/jobs/{namespace}` + Run a Job. 
+ Arguments: + * image: string + * command: string + * env, *optional*: object key -> value + * secrets, *optional*: object key -> value + * flavor, *optional*: string + * timeout, *optional*: number +* GET `https://huggingface.co/api/jobs/{namespace}` + List Jobs. +* GET `https://huggingface.co/api/jobs/{namespace}/{job_id}` + Inspect a Job. +* GET `https://huggingface.co/api/jobs/{namespace}/{job_id}/logs` + Fetch the logs of a Job. +* GET `https://huggingface.co/api/jobs/{namespace}/{job_id}/cancel` + Cancel a Job. + +## Scheduled Jobs + +* POST `https://huggingface.co/api/scheduled-jobs/{namespace}` + Create a scheduled Job. + Arguments: + * jobSpec: + * image: string + * command: string + * env: object key -> value + * secrets: object key -> value + * flavor: string + * timeout: number + * schedule: string + * concurrency, *optional*: bool + * suspend, *optional*: bool +* GET `https://huggingface.co/api/scheduled-jobs/{namespace}` + List scheduled Jobs. +* GET `https://huggingface.co/api/scheduled-jobs/{namespace}/{job_id}` + Inspect a scheduled Job. +* DELETE `https://huggingface.co/api/scheduled-jobs/{namespace}/{job_id}` + Delete a scheduled Job. +* GET `https://huggingface.co/api/scheduled-jobs/{namespace}/{job_id}/suspend` + Suspend a scheduled Job. +* GET `https://huggingface.co/api/scheduled-jobs/{namespace}/{job_id}/resume` + Resume a scheduled Job. + +## Webhooks + +* POST `https://huggingface.co/api/settings/webhooks` + Create a webhook that triggers this Job. + Arguments: + * watched: list of objects + * type: one of "dataset", "model", "org", "space", "user" + * name: string + * jobSourceId: string + * domains, *optional*: list of "repo", "discussion" + * secret, *optional*: string diff --git a/docs/jobs/cli.md b/docs/jobs/cli.md new file mode 100644 index 000000000..65b6eab70 --- /dev/null +++ b/docs/jobs/cli.md @@ -0,0 +1,9 @@ +# Jobs Command Line Interface (CLI) + +The `huggingface_hub` Python package comes with a built-in CLI called `hf`. 
This tool allows you to interact with the Hugging Face Hub directly from a terminal. For example, you can log in to your account, create a repository, upload and download files, etc. It also comes with handy features to configure your machine or manage your cache, and start and manage Jobs. + +Find the `hf jobs` installation steps, guides and reference in the `huggingface_hub` documentation here: + +* [Installation](https://huggingface.co/docs/huggingface_hub/en/guides/cli#getting-started) +* [Run and manage Jobs](https://huggingface.co/docs/huggingface_hub/en/guides/cli#hf-jobs) +* [CLI reference for Jobs](https://huggingface.co/docs/huggingface_hub/en/package_reference/cli#hf-jobs) diff --git a/docs/jobs/configuration.md b/docs/jobs/configuration.md index e69de29bb..ad60aae65 100644 --- a/docs/jobs/configuration.md +++ b/docs/jobs/configuration.md @@ -0,0 +1,167 @@ +# Configuration + +## Authentication + +You need to be authenticated with `hf auth login` to run Jobs, and use a token with the permission to start and manage Jobs. + +Alternatively, pass a Hugging Face token manually with `--token` in the CLI, the `token` argument in Python or a Bearer token for the HTTP API. + +## UV Jobs + +Specify the UV script or python command to run as you would with UV: + +```bash +>>> hf jobs uv run train.py +``` + +```bash +>>> hf jobs uv run python -c 'print("Hello from the cloud!")' +``` + +The `hf jobs uv run` command accepts an UV argument like `--with` and `--python`. The `--with` argument lets you specify python dependencies, and `--python` lets you choose the python version to use: + + +```bash +>>> hf jobs uv run --with trl train.py +>>> hf jobs uv run --python 3.12 train.py +``` + +Arguments following the command (or script) are not interpreted as arguments to uv. All options to uv must be provided before the command, e.g., uv run --verbose foo. A `--` can be used to separate the command from jobs/uv options for clarity, e.g. 
+ +```bash +>>> hf jobs uv run --with trl-jobs -- trl-jobs sft --model_name Qwen/Qwen3-0.6B --dataset_name trl-lib/Capybara +``` + +Find the list of all arguments in the [CLI documentation](https://huggingface.co/docs/huggingface_hub/package_reference/cli#hf-jobs-uv-run) and the [UV Commands documentation](https://docs.astral.sh/uv/reference/cli/#uv-run). + +By default, UV Jobs run with the `ghcr.io/astral-sh/uv:python3.12-bookworm` Docker image, but you can use another image as long as it has UV installed, using `--image `. + +## Docker Jobs + +Specify the Docker image and the command to run as you would with docker: + +```bash +>>> hf jobs run ubuntu echo "Hello from the cloud!" +``` + +All options to Jobs must be provided before the command. A `--` can be used to separate the command from jobs/uv options for clarity, e.g. + +```bash +>>> hf jobs run --token hf_xxx ubuntu -- echo "Hello from the cloud!" +``` + +Find the list of all arguments in the [CLI documentation](https://huggingface.co/docs/huggingface_hub/package_reference/cli#hf-jobs-run). + +## Environment variables and Secrets + +You can pass environment variables to your job using + +```bash +# Pass environment variables +>>> hf jobs uv run -e FOO=foo -e BAR=bar python -c 'import os; print(os.environ["FOO"], os.environ["BAR"])' +``` + +```bash +# Pass an environment from a local .env file +>>> hf jobs uv run --env-file .env python -c 'import os; print(os.environ["FOO"], os.environ["BAR"])' +``` + +```bash +# Pass secrets - they will be encrypted server side +>>> hf jobs uv run -s MY_SECRET=psswrd python -c 'import os; print(os.environ["MY_SECRET"])' +``` + +```bash +# Pass secrets from a local .env.secrets file - they will be encrypted server side +>>> hf jobs uv run --secrets-file .env.secrets python -c 'import os; print(os.environ["MY_SECRET"])' +``` + +> [!TIP] +> Use `--secrets HF_TOKEN` to pass your local Hugging Face token implicitly. 
+> With this syntax, the secret is retrieved from the environment variable. +> For `HF_TOKEN`, it may read the token file located in the Hugging Face home folder if the environment variable is unset. + +## Hardware flavor + +Run jobs on GPUs or TPUs with the `flavor` argument. For example, to run a PyTorch job on an A10G GPU: + +```bash +>>> hf jobs uv run --with torch --flavor a10g-small python -c "import torch; print(f'This code ran with the following GPU: {torch.cuda.get_device_name()}')" +``` + +Running this will show the following output! + +``` +This code ran with the following GPU: NVIDIA A10G +``` + +Here is another example to run a fine-tuning script like [trl/scripts/sft.py](https://github.com/huggingface/trl/blob/main/trl/scripts/sft.py): + +```bash +>>> hf jobs uv run --with trl --flavor a10g-small -s HF_TOKEN -- sft.py --model_name_or_path Qwen/Qwen2-0.5B ... +``` + +> [!TIP] +> For comprehensive guidance on running model training jobs with TRL on Hugging Face infrastructure, check out the [TRL Jobs Training documentation](https://huggingface.co/docs/trl/main/en/jobs_training). It covers fine-tuning recipes, hardware selection, and best practices for training models efficiently. + +Available `--flavor` options: + +- CPU: `cpu-basic`, `cpu-upgrade` +- GPU: `t4-small`, `t4-medium`, `l4x1`, `l4x4`, `a10g-small`, `a10g-large`, `a10g-largex2`, `a10g-largex4`,`a100-large` +- TPU: `v5e-1x1`, `v5e-2x2`, `v5e-2x4` + +(updated in 12/2025 from Hugging Face [suggested_hardware docs](https://huggingface.co/docs/hub/en/spaces-config-reference)) + +## Timeout + +Jobs have a default timeout (30 minutes), after which they will automatically stop. This is important to know when running long-running tasks like model training. + +You can specify a custom timeout value using the `--timeout` parameter when running a job. The timeout can be specified in two ways: + +1. 
**As a number** (interpreted as seconds):
+
+Use `--timeout` and pass the number in seconds (here 2 hours = 7200 seconds):
+
+```bash
+>>> hf jobs uv run --timeout 7200 --with torch --flavor a10g-large train.py
+```
+
+2. **As a string with time units**:
+
+Or use `--timeout` and use different time units:
+
+```bash
+>>> hf jobs uv run --timeout 2h --with torch --flavor a10g-large train.py
+```
+
+Other examples:
+
+```bash
+--timeout 30m # 30 minutes
+--timeout 1.5h # 1.5 hours
+--timeout 1d # 1 day
+--timeout 3600s # 3600 seconds
+```
+
+Supported time units:
+- `s` - seconds
+- `m` - minutes
+- `h` - hours
+- `d` - days
+
+> [!WARNING]
+> If you don't specify a timeout, a default timeout will be applied to your job. For long-running tasks like model training that may take hours, make sure to set an appropriate timeout to avoid unexpected job terminations.
+
+## Namespace
+
+Run Jobs under your organization account using the `--namespace` argument. Make sure you are logged in with a token that has the permission to start and manage Jobs under your organization account.
+
+```bash
+>>> hf jobs uv run --namespace my-org-name python -c "print('Running in an org account')"
+```
+
+Note that you can pass a token with the right permission manually:
+
+```bash
+>>> hf jobs uv run --namespace my-org-name --token hf_xxx python -c "print('Running in an org account')"
+```
diff --git a/docs/jobs/data1.md b/docs/jobs/data1.md
index e69de29bb..db30895eb 100644
--- a/docs/jobs/data1.md
+++ b/docs/jobs/data1.md
@@ -0,0 +1 @@
+🚧 this section is under construction 🚧
diff --git a/docs/jobs/frameworks.md b/docs/jobs/frameworks.md
index e69de29bb..2032a774d 100644
--- a/docs/jobs/frameworks.md
+++ b/docs/jobs/frameworks.md
@@ -0,0 +1,28 @@
+# Frameworks Setups
+
+Here is the list of frameworks that provide ready-to-use Docker images with UV that you can use in Jobs. 
+
+These Docker images already have uv installed, but if you want to use an image that doesn't have uv preinstalled, you’ll need to make sure uv is installed first. This will work well in many cases, but for LLM inference libraries, which can have quite specific requirements, it can be useful to use a specific image that has the library installed.
+
+## vLLM
+
+vLLM is a very well known and heavily used inference engine. It is known for its ability to scale inference for LLMs.
+They provide the `vllm/vllm-openai` Docker image with vLLM and UV ready. This image is ideal to run batch inference.
+
+Use the `--image` argument to use this Docker image:
+
+```bash
+>>> hf jobs uv run --image vllm/vllm-openai --flavor l4x4 generate-responses.py
+```
+
+You can find more information on vLLM batch inference on Jobs in [Daniel Van Strien's blog post](https://danielvanstrien.xyz/posts/2025/hf-jobs/vllm-batch-inference.html).
+
+## TRL
+
+TRL is a library designed for post-training models using techniques like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), and Direct Preference Optimization (DPO). An up-to-date Docker image with UV and all TRL dependencies is available at `huggingface/trl` and can be used directly with Hugging Face Jobs. 
+ +Use the `--image` argument to use this Docker image: + +```bash +>>> hf jobs uv run --image huggingface/trl --flavor a100-large -s HF_TOKEN train.py +``` diff --git a/docs/jobs/index.md b/docs/jobs/index.md index ea3fa577b..0c9d7d0df 100644 --- a/docs/jobs/index.md +++ b/docs/jobs/index.md @@ -16,7 +16,7 @@ There are multiple tools you can use to run jobs: * the `hf` Command Line Interface (see the [CLI installation steps](https://huggingface.co/docs/huggingface_hub/main/en/guides/cli) and the [Jobs CLI documentation](https://huggingface.co/docs/huggingface_hub/guides/cli#hf-jobs) for more information) * the `huggingface_hub` Python client (see the [`huggingface_hub` Jobs documentation](https://huggingface.co/docs/huggingface_hub/guides/jobs) for more information) -* the Jobs HTTP API (see the [Jobs HTTP API documentation](./http) for more information) +* the Jobs HTTP API (see the [Jobs HTTP API documentation](./http-api) for more information) ## Run any workload diff --git a/docs/jobs/inference1.md b/docs/jobs/inference1.md index e69de29bb..db30895eb 100644 --- a/docs/jobs/inference1.md +++ b/docs/jobs/inference1.md @@ -0,0 +1 @@ +🚧 this section is under construction 🚧 diff --git a/docs/jobs/python.md b/docs/jobs/python.md new file mode 100644 index 000000000..019be89a9 --- /dev/null +++ b/docs/jobs/python.md @@ -0,0 +1,8 @@ +# Jobs Python Client + +The `huggingface_hub` Python package comes with a client called `HfApi`. This client allows you to interact with the Hugging Face Hub directly in Python. For example, you can log in to your account, create a repository, upload and download files, etc. It also comes with handy features to configure your machine or manage your cache, and start and manage Jobs. 
+
+Find the installation steps and guides in the `huggingface_hub` documentation:
+
+* [Installation](https://huggingface.co/docs/huggingface_hub/en/installation)
+* [Run and manage Jobs](https://huggingface.co/docs/huggingface_hub/en/guides/jobs)
diff --git a/docs/jobs/schedule.md b/docs/jobs/schedule.md
index e69de29bb..f48d64ffd 100644
--- a/docs/jobs/schedule.md
+++ b/docs/jobs/schedule.md
@@ -0,0 +1,43 @@
+# Schedule Jobs
+
+Schedule and manage jobs that will run on HF infrastructure.
+
+Use `hf jobs scheduled uv run` or `hf jobs scheduled run` with a schedule of `@annually`, `@yearly`, `@monthly`, `@weekly`, `@daily`, `@hourly`, or a CRON schedule expression (e.g., `"0 9 * * 1"` for 9 AM every Monday):
+
+```bash
+# Schedule a job that runs every hour
+>>> hf jobs scheduled uv run @hourly python -c "print('This runs every hour!')"
+
+# Use the CRON syntax
+>>> hf jobs scheduled uv run "*/5 * * * *" python -c "print('This runs every five minutes!')"
+
+# Schedule with GPU
+>>> hf jobs scheduled uv run --flavor a10g-small --with torch @hourly python -c 'import torch; print(f"This code ran with the following GPU: {torch.cuda.get_device_name()}")'
+
+# Schedule with a Docker image
+>>> hf jobs scheduled run @hourly python:3.12 python -c "print('This runs every hour!')"
+```
+
+Use the same parameters as `hf jobs uv run` and `hf jobs run` to pass environment variables, secrets, timeout, etc. 
+
+Manage scheduled jobs using `hf jobs scheduled ps`, `hf jobs scheduled inspect`, `hf jobs scheduled suspend`, `hf jobs scheduled resume`, and `hf jobs scheduled delete`:
+
+```bash
+# List your active scheduled jobs
+>>> hf jobs scheduled ps
+
+# List all your scheduled jobs (including suspended jobs)
+>>> hf jobs scheduled ps -a
+
+# Inspect the status of a job
+>>> hf jobs scheduled inspect <scheduled job id>
+
+# Suspend (pause) a scheduled job
+>>> hf jobs scheduled suspend <scheduled job id>
+
+# Resume a scheduled job
+>>> hf jobs scheduled resume <scheduled job id>
+
+# Delete a scheduled job
+>>> hf jobs scheduled delete <scheduled job id>
+```
diff --git a/docs/jobs/training1.md b/docs/jobs/training1.md
index e69de29bb..db30895eb 100644
--- a/docs/jobs/training1.md
+++ b/docs/jobs/training1.md
@@ -0,0 +1 @@
+🚧 this section is under construction 🚧
diff --git a/docs/jobs/webhooks.md b/docs/jobs/webhooks.md
index e69de29bb..5ec2cc3bf 100644
--- a/docs/jobs/webhooks.md
+++ b/docs/jobs/webhooks.md
@@ -0,0 +1,32 @@
+# Webhooks Automation
+
+Webhooks allow you to listen for new changes on specific repositories or to all repositories belonging to a particular set of users/organizations (not just your repos, but any repo) on Hugging Face.
+
+Use `create_webhook` in the `huggingface_hub` Python client to create a webhook that triggers a Job when a change happens in a Hugging Face repository:
+
+```python
+from huggingface_hub import create_webhook
+
+# Example: Creating a webhook that triggers a Job
+webhook = create_webhook(
+    job_id=job_id,
+    watched=[{"type": "user", "name": "your-username"}, {"type": "org", "name": "your-org-name"}],
+    domains=["repo", "discussion"],
+    secret="your-secret"
+)
+```
+
+The webhook triggers the Job with the webhook payload in the environment variable `WEBHOOK_PAYLOAD`. 
+
+The webhook payload contains multiple fields, here are a few useful ones:
+
+* event:
+  * action: one of "create", "delete", "move", "update"
+  * scope: string
+* repo:
+  * owner: string
+  * headSha: string
+  * name: string
+  * type: one of "dataset", "model", "space"
+
+You can find more information on webhooks in the [`huggingface_hub` Webhooks documentation](https://huggingface.co/docs/huggingface_hub/en/guides/webhooks).

From 6d22331186c0a1f5678b48b5d8c94d48a37f9b53 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest 
Date: Wed, 17 Dec 2025 16:19:45 +0100
Subject: [PATCH 5/5] add ci

---
 .../workflows/jobs_build_documentation.yml | 20 ++++++++++++++++++
 .../workflows/jobs_build_pr_documentation.yml | 21 +++++++++++++++++++
 .../jobs_upload_pr_documentation.yml | 16 ++++++++++++++
 3 files changed, 57 insertions(+)
 create mode 100644 .github/workflows/jobs_build_documentation.yml
 create mode 100644 .github/workflows/jobs_build_pr_documentation.yml
 create mode 100644 .github/workflows/jobs_upload_pr_documentation.yml

diff --git a/.github/workflows/jobs_build_documentation.yml b/.github/workflows/jobs_build_documentation.yml
new file mode 100644
index 000000000..8904c21a0
--- /dev/null
+++ b/.github/workflows/jobs_build_documentation.yml
@@ -0,0 +1,20 @@
+name: Build Jobs documentation
+
+on:
+  push:
+    paths:
+      - "docs/jobs/**"
+    branches:
+      - main
+
+jobs:
+  build:
+    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
+    with:
+      commit_sha: ${{ github.sha }}
+      package: hub-docs
+      package_name: jobs
+      path_to_docs: hub-docs/docs/jobs/
+      additional_args: --not_python_module
+    secrets:
+      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
diff --git a/.github/workflows/jobs_build_pr_documentation.yml b/.github/workflows/jobs_build_pr_documentation.yml
new file mode 100644
index 000000000..7e3703a57
--- /dev/null
+++ b/.github/workflows/jobs_build_pr_documentation.yml
@@ -0,0 +1,21 @@
+name: Build Jobs PR Documentation
+
+on:
+  pull_request:
+    paths:
+      - 
"docs/jobs/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main + with: + commit_sha: ${{ github.event.pull_request.head.sha }} + pr_number: ${{ github.event.number }} + package: hub-docs + package_name: jobs + path_to_docs: hub-docs/docs/jobs/ + additional_args: --not_python_module diff --git a/.github/workflows/jobs_upload_pr_documentation.yml b/.github/workflows/jobs_upload_pr_documentation.yml new file mode 100644 index 000000000..fd5f60908 --- /dev/null +++ b/.github/workflows/jobs_upload_pr_documentation.yml @@ -0,0 +1,16 @@ +name: Upload Jobs PR Documentation + +on: + workflow_run: + workflows: ["Build Jobs PR Documentation"] + types: + - completed + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main + with: + package_name: jobs + secrets: + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}