#!/bin/bash
#
# RunPod Template Preparation Script
# Prepares a RunPod instance for template creation
#
# This script:
# 1. Installs Docker & Docker Compose
# 2. Installs Tailscale
# 3. Builds all Docker images
# 4. Pre-downloads all models
# 5. Validates everything works
# 6. Cleans up for template creation
#
# Usage: ./prepare-template.sh
# Run this on the RunPod instance you want to save as a template
#

# Exit on error, on unset variables, and if any pipeline stage fails.
set -euo pipefail

# Colors for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

# Logging helpers — colored severity tag plus message. Errors go to stderr.
log_info() {
  echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
  echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_warn() {
  echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
  echo -e "${RED}[ERROR]${NC} $1" >&2
}

#######################################
# Poll a container's logs until a pattern appears, or fail on timeout.
# The original loops here were unbounded and would spin forever if the
# container crashed before printing its ready message.
# Arguments:
#   $1 - container name
#   $2 - grep pattern (may contain \| alternatives)
#   $3 - timeout in seconds (default 3600)
# Returns: 0 when the pattern is seen, 1 on timeout.
#######################################
wait_for_container_log() {
  local container=$1
  local pattern=$2
  local timeout=${3:-3600}
  local elapsed=0

  while ! docker logs "$container" 2>&1 | grep -q "$pattern"; do
    if (( elapsed >= timeout )); then
      echo ""
      log_error "Timed out after ${timeout}s waiting for ${container}"
      return 1
    fi
    echo -n "."
    sleep 10
    elapsed=$((elapsed + 10))
  done
  echo ""
}

# Check if running on RunPod: require an NVIDIA GPU and a /workspace dir.
check_environment() {
  log_info "Checking environment..."

  if ! nvidia-smi &> /dev/null; then
    log_error "NVIDIA GPU not detected. Are you running on a GPU instance?"
    exit 1
  fi

  if [ ! -d "/workspace" ]; then
    log_warn "/workspace directory not found. Creating it..."
    mkdir -p /workspace
  fi

  log_success "Environment check passed"
}

# Install Docker and start the daemon with RunPod-compatible flags.
install_docker() {
  if command -v docker &> /dev/null; then
    log_info "Docker already installed: $(docker --version)"
    return
  fi

  log_info "Installing Docker..."
  curl -fsSL https://get.docker.com -o get-docker.sh
  sh get-docker.sh
  rm get-docker.sh

  # Start Docker daemon (RunPod requires --iptables=false --bridge=none)
  log_info "Starting Docker daemon..."
  pkill dockerd 2>/dev/null || true
  sleep 2
  dockerd --iptables=false --bridge=none > /var/log/dockerd.log 2>&1 &
  sleep 10

  # Verify Docker is running
  if docker ps &> /dev/null; then
    log_success "Docker installed and running: $(docker --version)"
  else
    log_error "Docker failed to start. Check /var/log/dockerd.log"
    exit 1
  fi
}

# Install Docker Compose (usually bundled with Docker; fall back to a
# standalone binary if the `docker compose` plugin is absent).
install_docker_compose() {
  if docker compose version &> /dev/null; then
    log_info "Docker Compose already installed: $(docker compose version)"
    return
  fi

  log_info "Installing Docker Compose..."

  # Docker Compose is usually bundled with Docker now
  # If not, install it separately
  if ! docker compose version &> /dev/null; then
    local DOCKER_COMPOSE_VERSION="v2.23.0"
    curl -L "https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
    chmod +x /usr/local/bin/docker-compose
  fi

  log_success "Docker Compose installed: $(docker compose version)"
}

# Install Tailscale via the official install script.
install_tailscale() {
  if command -v tailscale &> /dev/null; then
    log_info "Tailscale already installed: $(tailscale version)"
    return
  fi

  log_info "Installing Tailscale..."
  curl -fsSL https://tailscale.com/install.sh | sh
  log_success "Tailscale installed: $(tailscale version)"
}

# Build all service images defined in compose.yaml, plus pull Flux.
build_docker_images() {
  log_info "Building Docker images..."
  cd /workspace/ai

  # Use legacy builder (buildkit has permission issues in RunPod)
  export DOCKER_BUILDKIT=0

  # Build orchestrator
  log_info "Building orchestrator..."
  docker compose -f compose.yaml build orchestrator

  # Build vLLM
  log_info "Building vLLM..."
  docker compose -f compose.yaml build vllm-qwen

  # Build MusicGen
  log_info "Building MusicGen..."
  docker compose -f compose.yaml build musicgen

  # Pull Flux image (pre-built)
  log_info "Pulling Flux.1 image..."
  docker pull ghcr.io/matatonic/openedai-images-flux:latest

  log_success "All Docker images built"
}

# Pre-download models by starting each service once so it populates its
# cache, then stopping it.
download_models() {
  log_info "Pre-downloading AI models (this will take 30-45 minutes)..."
  cd /workspace/ai

  # Create model cache directories
  mkdir -p /workspace/huggingface_cache
  mkdir -p /workspace/flux/models
  mkdir -p /workspace/musicgen/models

  # NOTE(review): container names below use Compose v1 underscore style
  # (ai_vllm-qwen_1). Compose v2 defaults to hyphens (ai-vllm-qwen-1) —
  # confirm against the actual generated names on this host.

  # Download Qwen 2.5 7B
  log_info "Downloading Qwen 2.5 7B (14GB)..."
  docker compose --profile text up -d vllm-qwen

  # Wait for model to download
  log_info "Waiting for Qwen model to download..."
  wait_for_container_log ai_vllm-qwen_1 \
    "Model loaded successfully\|AsyncLLMEngine initialized" || exit 1
  log_success "Qwen 2.5 7B downloaded"
  docker compose stop vllm-qwen

  # Download Flux.1 Schnell
  log_info "Downloading Flux.1 Schnell (12GB)..."
  docker compose --profile image up -d flux
  log_info "Waiting for Flux model to download..."
  sleep 180  # Flux takes about 3 minutes to download and initialize
  log_success "Flux.1 Schnell downloaded"
  docker compose stop flux

  # Download MusicGen Medium
  log_info "Downloading MusicGen Medium (11GB)..."
  docker compose --profile audio up -d musicgen
  log_info "Waiting for MusicGen model to download..."
  wait_for_container_log ai_musicgen_1 \
    "Model loaded successfully\|initialized successfully" || exit 1
  log_success "MusicGen Medium downloaded"
  docker compose stop musicgen

  log_success "All models downloaded and cached"
}

# Validate installation: orchestrator health endpoint + populated HF cache.
validate_installation() {
  log_info "Validating installation..."
  cd /workspace/ai

  # Start orchestrator
  log_info "Starting orchestrator for validation..."
  docker compose -f compose.yaml up -d orchestrator
  sleep 10

  # Check orchestrator health
  if curl -s http://localhost:9000/health | grep -q "healthy\|ok"; then
    log_success "Orchestrator is healthy"
  else
    log_error "Orchestrator health check failed"
    docker logs ai_orchestrator
    exit 1
  fi

  # Check models are cached
  if [ -d "/workspace/huggingface_cache" ] && [ "$(ls -A /workspace/huggingface_cache)" ]; then
    log_success "Hugging Face cache populated"
  else
    log_warn "Hugging Face cache may be empty"
  fi

  # Stop orchestrator
  docker compose -f compose.yaml down

  log_success "Validation passed"
}

# Clean up for template creation: strip secrets, logs, Docker cache, and
# write a version marker describing what the template contains.
cleanup_for_template() {
  log_info "Cleaning up for template creation..."

  # Remove sensitive data
  log_info "Removing sensitive files..."
  rm -f /workspace/ai/.env
  rm -f /root/.ssh/known_hosts
  rm -f /root/.bash_history
  rm -f /root/.python_history

  # Clear logs
  log_info "Clearing logs..."
  find /var/log -type f -name "*.log" -delete 2>/dev/null || true
  journalctl --vacuum-time=1s 2>/dev/null || true

  # Logout from Tailscale
  log_info "Logging out from Tailscale..."
  tailscale logout 2>/dev/null || true

  # Clean Docker (but keep images)
  log_info "Cleaning Docker cache..."
  docker system prune -af --volumes || true

  # Create template marker.
  # NOTE(review): the original here-doc was corrupted in the source
  # (missing `<<EOF` opener); reconstructed from the surviving fragment
  # `…</dev/null || echo "installed") - Orchestrator …` — verify contents.
  log_info "Creating template version marker..."
  cat > /workspace/TEMPLATE_VERSION <<EOF
Template: multi-modal-ai-v1.0
Created: $(date -u +"%Y-%m-%dT%H:%M:%SZ")
Docker: $(docker --version 2>/dev/null || echo "installed")

Services:
- Orchestrator (ai_orchestrator)
- Text Generation (vLLM + Qwen 2.5 7B)
- Image Generation (Flux.1 Schnell)
- Music Generation (MusicGen Medium)

Models Cached: ~37GB
EOF

  log_success "Cleanup complete"
}

# Main execution: run every preparation phase in order, then print
# next-step instructions for saving the pod as a template.
main() {
  log_info "======================================"
  log_info "RunPod Template Preparation Script"
  log_info "======================================"
  log_info ""

  check_environment
  install_docker
  install_docker_compose
  install_tailscale
  build_docker_images
  download_models
  validate_installation
  cleanup_for_template

  log_info ""
  log_success "======================================"
  log_success "Template Preparation Complete!"
  log_success "======================================"
  log_info ""
  log_info "Next steps:"
  log_info "1. Review /workspace/TEMPLATE_VERSION"
  log_info "2. Go to RunPod Dashboard → My Pods"
  log_info "3. Select this pod → ⋮ → Save as Template"
  log_info "4. Name: multi-modal-ai-v1.0"
  log_info "5. Test deployment from template"
  log_info ""
  log_info "Template will enable 2-3 minute deployments instead of 60-90 minutes!"
  log_info ""
}

# Run main function
main "$@"