prepare_dataset / Dockerfile
mr-kush's picture
add Dockerfile for dataset preparation environment setup
1412806
raw
history blame
992 Bytes
# Base image: slim variant of Python 3.12
FROM python:3.12-slim
# Add an unprivileged account named "user" (UID 1000, with a home directory)
# and run every following instruction as that account
RUN useradd --create-home --uid 1000 user
USER user
# HOME points at the account's home directory; its .local/bin is placed
# first on PATH, ahead of the PATH inherited from the base image
ENV HOME=/home/user \
    PATH="/home/user/.local/bin:${PATH}"
# Working directory for the remaining build steps and for the running container
WORKDIR $HOME/app
# Copy only the requirements file first so the dependency layer below stays
# cached until requirements.txt itself changes
COPY --chown=user requirements.txt /home/user/app/requirements.txt
# Install Python dependencies (no OS-level packages are installed in this step).
# --no-cache-dir is passed to BOTH pip invocations so that pip's download cache
# is not written into the image layer.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt
# Point the Hugging Face cache locations at a single directory inside the
# container (all three variables deliberately share the same path)
ENV HF_HOME=/home/user/app/hf_cache \
    HF_DATASETS_CACHE=/home/user/app/hf_cache \
    HF_METRICS_CACHE=/home/user/app/hf_cache
# Create the cache directory. This RUN executes as the non-root "user" account,
# so the directory is owned by, and writable for, the same account the
# container runs as; a world-writable (777) mode is not needed.
# NOTE(review): if the image must also run under an arbitrary UID
# (e.g. `docker run --user ...`), grant group write access instead of 777.
RUN mkdir -p /home/user/app/hf_cache
# Bring in the rest of the build context (scripts, modules, etc.), owned by
# the non-root user; the destination resolves to WORKDIR (/home/user/app)
COPY --chown=user . ./
# Container default: launch the dataset preparation pipeline script
CMD ["python", "prepare_dataset_pipeline.py"]