- Multi-modal AI infrastructure for RunPod RTX 4090
- Automatic model orchestration (text, image, music)
  - Text: vLLM + Qwen 2.5 7B Instruct
  - Image: Flux.1 Schnell via OpenEDAI
  - Music: MusicGen Medium via AudioCraft
- Cost-optimized sequential loading on a single GPU (see the sketch below)
- Template preparation scripts for rapid deployment
- Comprehensive documentation (README, DEPLOYMENT, TEMPLATE)
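Sequential loading is what makes a single RTX 4090 viable here: at FP16, a 7B LLM alone needs roughly 7B × 2 bytes ≈ 14 GB of VRAM before the KV cache, so the text, image, and music models cannot all stay resident in 24 GB at once. Below is a minimal sketch of what such an orchestrator can look like; the class and loader names are illustrative assumptions, not the repo's actual code.

```python
# Hypothetical sketch of sequential single-GPU orchestration: at most one
# model occupies VRAM at a time, swapped on demand.
import gc

import torch


class SequentialOrchestrator:
    """Keeps at most one model resident on the GPU, swapping on demand."""

    def __init__(self, loaders):
        # loaders: dict mapping a modality name to a zero-arg load function,
        # e.g. {"text": load_qwen, "image": load_flux, "music": load_musicgen}
        self.loaders = loaders
        self.active_name = None
        self.active_model = None

    def get(self, name):
        if name != self.active_name:
            # Drop the previous model and reclaim VRAM before loading the next.
            self.active_model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            self.active_model = self.loaders[name]()
            self.active_name = name
        return self.active_model
```

The trade-off is a reload delay on each modality switch; the persistent `/workspace` cache keeps that to load time rather than download time.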
Dockerfile · 35 lines · 694 B
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04

WORKDIR /app

# Install Python 3.11 and system dependencies.
# Ubuntu 22.04's default python3 is 3.10, so register 3.11 as python3
# (with distutils, which pip needs) so the python3/pip3 calls below
# actually use the interpreter installed here.
RUN apt-get update && apt-get install -y \
    python3.11 \
    python3.11-distutils \
    python3-pip \
    git \
    && rm -rf /var/lib/apt/lists/* \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1

# Upgrade pip
RUN pip3 install --no-cache-dir --upgrade pip

# Install vLLM and dependencies
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy application code
COPY server.py .

# Create directory for model cache
RUN mkdir -p /workspace/huggingface_cache

# Environment variables
ENV HF_HOME=/workspace/huggingface_cache
ENV VLLM_HOST=0.0.0.0
ENV VLLM_PORT=8000

# Expose port
EXPOSE 8000

# Run the server
CMD ["python3", "server.py"]
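
server.py itself is not shown on this page, but the ENV block and CMD above pin down its contract: bind vLLM's OpenAI-compatible API to VLLM_HOST:VLLM_PORT, with model downloads cached under HF_HOME. Here is a minimal sketch of an entrypoint that would satisfy that contract; the vLLM module path is the library's standard OpenAI-compatible server, while the MODEL_ID default is taken from the project description and is an assumption about this repo.

```python
# Hypothetical minimal server.py consistent with the Dockerfile's ENV block;
# the real file is not shown here. Launches vLLM's OpenAI-compatible API.
import os
import subprocess

HOST = os.environ.get("VLLM_HOST", "0.0.0.0")
PORT = os.environ.get("VLLM_PORT", "8000")
# Default model from the project description; adjust to whatever the repo pins.
MODEL = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")

if __name__ == "__main__":
    # vLLM ships an OpenAI-compatible server as a module entrypoint.
    # HF_HOME (set in the Dockerfile) keeps downloads on the /workspace volume.
    subprocess.run(
        [
            "python3", "-m", "vllm.entrypoints.openai.api_server",
            "--host", HOST,
            "--port", PORT,
            "--model", MODEL,
        ],
        check=True,
    )
```

Run with the GPU exposed (e.g. `docker run --gpus all -p 8000:8000 ...`), the container then answers standard OpenAI-style requests such as `POST /v1/chat/completions` on port 8000. On RunPod, `/workspace` is the pod's persistent volume, which is why HF_HOME points there: model weights survive restarts instead of being re-downloaded.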