# composer-replication-framework/docker/Dockerfile.sagemaker
# Baladithya Balamurugan
# Wave 20: fix SageMaker smoke — torch-2.7 DLC + drop vllm pin (the real conflict)
# a578ad9
# Baked SageMaker training image for the Composer-replication RL stack (F3 §3.2).
#
# The repeatable path: bake trl+vllm+the framework into an image so jobs don't
# pip-install at startup (saves ~5-10 min/job and removes a flaky failure
# surface). The one-shot smoke can instead use the stock DLC + source_dir
# (run_sagemaker_launch.py --image dlc), which needs no local build.
#
# Base: AWS PyTorch DLC, tag RESOLVED LIVE against the us-west-2 registry.
# MUST be torch-2.7: trl 1.5 → transformers>=4.56.2 → torch.float8_e8m0fnu
# (torch>=2.7). The torch-2.6 DLC fails AutoModel.from_pretrained on that dtype.
# cu128, -v1.26 build suffix required (no bare floating tag exists).
# BASE_IMAGE defaults to the exact DLC reference this image was validated
# against, so a plain `docker build` is unchanged. Override it with
# `--build-arg BASE_IMAGE=...` to pull the same DLC from another region's
# registry, from a mirror, or by digest.
ARG BASE_IMAGE=763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.7.1-gpu-py312-cu128-ubuntu22.04-sagemaker-v1.26
FROM ${BASE_IMAGE}
# RL stack baked in. torch 2.7 + CUDA 12.8 already ship in the DLC and must
# survive this step — do NOT reinstall torch.
#
# vllm releases hard-pin their torch version (0.8.x pins torch 2.6, hence the
# 0.9 floor), so an open-ended "vllm>=0.9" on its own lets pip select a newer
# vllm and replace the base image's torch. To prevent that, the torch version
# already installed is written to a throwaway constraints file and passed via
# --constraint: the resolver then has to pick a vllm that matches that torch,
# or fail the build loudly instead of silently swapping torch out.
# NOTE(review): assumes `python` and `pip` on PATH belong to the same
# environment in the base image — confirm.
RUN python -c "from importlib.metadata import version; print('torch==' + version('torch'))" \
        > /tmp/torch-constraint.txt \
    && pip install --no-cache-dir --constraint /tmp/torch-constraint.txt \
        "accelerate>=1.0" \
        "datasets>=3.0" \
        "fsspec>=2024.6" \
        "hf_transfer>=0.1.6" \
        "peft>=0.13" \
        "s3fs>=2024.6" \
        "trl>=1.5,<2" \
        "vllm>=0.9" \
    && rm -f /tmp/torch-constraint.txt
# Ship the framework source inside the image and install it in editable mode
# from that location. The train + serverless extras are requested so that the
# trainer, loss, executors, replica_entrypoint and s3fs are all present.
COPY . /opt/composer_replication
RUN pip install --no-cache-dir --editable "/opt/composer_replication[train,serverless]"
# Hugging Face runtime defaults baked into the image: HF_HOME points at a
# directory under /opt/ml/input, and HF_HUB_ENABLE_HF_TRANSFER is set to 1
# (the hf_transfer package is installed earlier in this file).
ENV HF_HOME="/opt/ml/input/hf_cache" \
    HF_HUB_ENABLE_HF_TRANSFER="1"