# Baked SageMaker training image for the Composer-replication RL stack (F3 §3.2).
#
# The repeatable path: bake trl + vllm + the framework into an image so jobs
# don't pip-install at startup (saves ~5-10 min/job and removes a flaky failure
# surface). The one-shot smoke can instead use the stock DLC + source_dir
# (run_sagemaker_launch.py --image dlc), which needs no local build.
#
# Base: AWS PyTorch DLC, tag RESOLVED LIVE against the us-west-2 registry.
# MUST be torch-2.7: trl 1.5 → transformers>=4.56.2 → torch.float8_e8m0fnu
# (torch>=2.7). The torch-2.6 DLC fails AutoModel.from_pretrained on that dtype.
# cu128, -v1.26 build suffix required (no bare floating tag exists).
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.7.1-gpu-py312-cu128-ubuntu22.04-sagemaker-v1.26

# Build-time-only path of the pip constraints file that freezes torch.
ARG TORCH_CONSTRAINT=/opt/torch-constraint.txt

# Record the torch release that ships in the base image as a pip constraint.
# The local-version suffix (e.g. "+cu128") is stripped so the already-installed
# wheel satisfies the pin. `test -s` fails the build if nothing was written.
RUN python -c "from importlib.metadata import version; print('torch==' + version('torch').split('+')[0])" \
        > "${TORCH_CONSTRAINT}" \
    && test -s "${TORCH_CONSTRAINT}" \
    && cat "${TORCH_CONSTRAINT}"

# RL stack baked in. torch 2.7 + CUDA 12.8 already in the DLC — do NOT reinstall
# torch. vllm>=0.9 is the torch-2.7 line (0.8.x hard-pins torch 2.6 and would
# fight this base). The spec has no upper bound, so the constraints file is what
# actually enforces the match: pip must select a vllm whose torch pin equals the
# baked torch, and the build fails loudly instead of silently swapping the
# DLC's torch/CUDA wheels if no such release exists.
RUN pip install --no-cache-dir -c "${TORCH_CONSTRAINT}" \
        "trl>=1.5,<2" \
        "peft>=0.13" \
        "accelerate>=1.0" \
        "datasets>=3.0" \
        "vllm>=0.9" \
        "fsspec>=2024.6" \
        "s3fs>=2024.6" \
        "hf_transfer>=0.1.6"

# The framework itself (train + serverless extras → trainer, loss, executors,
# replica_entrypoint, s3fs all present). Copied after the heavy dependency layer
# so source edits do not invalidate it. Installed under the same torch
# constraint so the extras cannot replace the base torch either.
COPY . /opt/composer_replication
RUN pip install --no-cache-dir -c "${TORCH_CONSTRAINT}" \
        -e "/opt/composer_replication[train,serverless]"

# Hugging Face runtime config: cache location under the SageMaker /opt/ml tree,
# and hub downloads routed through the hf_transfer package installed above.
ENV HF_HOME=/opt/ml/input/hf_cache \
    HF_HUB_ENABLE_HF_TRANSFER=1