refactor: consolidate model management into Ansible playbook
Remove flux/musicgen standalone implementations in favor of ComfyUI:

- Delete models/flux/ and models/musicgen/ directories
- Remove redundant scripts (install.sh, download-models.sh, prepare-template.sh)
- Update README.md to reference Ansible playbook commands
- Update playbook.yml to remove flux/musicgen service definitions
- Add COMFYUI_MODELS.md with comprehensive model installation guide
- Update stop-all.sh to only manage orchestrator and vLLM services

All model downloads and dependency management now handled via Ansible playbook tags (base, python, vllm, comfyui, comfyui-essential).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
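For reference, a minimal sketch of the consolidated workflow, assuming the same invocation style as the wrapper scripts deleted below (they call `ansible-playbook playbook.yml` from the repository root); the tag combinations shown are illustrative, not prescriptive:

```sh
# Full installation (replaces install.sh)
ansible-playbook playbook.yml

# Install or update individual components via tags
ansible-playbook playbook.yml --tags base,python
ansible-playbook playbook.yml --tags vllm
ansible-playbook playbook.yml --tags comfyui,comfyui-essential

# The deleted wrappers exported HF_TOKEN from .env before invoking Ansible:
# set -a; source .env; set +a
```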
download-models.sh (deleted)
@@ -1,36 +0,0 @@
#!/bin/bash
#
# Download AI Models
# Wrapper for Ansible models tag
#

set -e

cd "$(dirname "$0")/.."

echo "========================================="
echo " Downloading AI Models (~37GB)"
echo "========================================="
echo ""

# Source .env if it exists
if [ -f .env ]; then
    set -a
    source .env
    set +a
fi

# Check HF_TOKEN
if [ -z "$HF_TOKEN" ]; then
    echo "Error: HF_TOKEN not set"
    echo "Add HF_TOKEN to .env file"
    exit 1
fi

# Run Ansible with models tag
ansible-playbook playbook.yml --tags models

echo ""
echo "========================================="
echo " Model download complete!"
echo "========================================="
install.sh (deleted)
@@ -1,50 +0,0 @@
#!/bin/bash
#
# Install AI Infrastructure
# Wrapper script for Ansible playbook
#
# Usage:
#   ./install.sh              # Full installation
#   ./install.sh --tags base  # Install specific components
#

set -e

cd "$(dirname "$0")/.."

echo "========================================="
echo " RunPod AI Infrastructure Installation"
echo "========================================="
echo ""

# Check if Ansible is installed
if ! command -v ansible-playbook &> /dev/null; then
    echo "Ansible not found. Installing..."
    sudo apt update
    sudo apt install -y ansible
fi

# Check for .env file
if [ ! -f .env ]; then
    echo "Warning: .env file not found"
    echo "Copy .env.example to .env and add your HF_TOKEN"
    echo ""
fi

# Source .env if it exists
if [ -f .env ]; then
    set -a
    source .env
    set +a
fi

# Run Ansible playbook
echo "Running Ansible playbook..."
echo ""

ansible-playbook playbook.yml "$@"

echo ""
echo "========================================="
echo " Installation complete!"
echo "========================================="
prepare-template.sh (deleted)
@@ -1,314 +0,0 @@
#!/bin/bash
#
# RunPod Template Preparation Script
# Prepares a RunPod instance for template creation
#
# This script:
# 1. Installs Docker & Docker Compose
# 2. Installs Tailscale
# 3. Builds all Docker images
# 4. Pre-downloads all models
# 5. Validates everything works
# 6. Cleans up for template creation
#
# Usage: ./prepare-template.sh
#   Run this on the RunPod instance you want to save as a template
#

set -e  # Exit on error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Logging functions
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Check if running on RunPod
check_environment() {
    log_info "Checking environment..."

    if ! nvidia-smi &> /dev/null; then
        log_error "NVIDIA GPU not detected. Are you running on a GPU instance?"
        exit 1
    fi

    if [ ! -d "/workspace" ]; then
        log_warn "/workspace directory not found. Creating it..."
        mkdir -p /workspace
    fi

    log_success "Environment check passed"
}

# Install Docker
install_docker() {
    if command -v docker &> /dev/null; then
        log_info "Docker already installed: $(docker --version)"
        return
    fi

    log_info "Installing Docker..."
    curl -fsSL https://get.docker.com -o get-docker.sh
    sh get-docker.sh
    rm get-docker.sh

    # Start Docker daemon (RunPod requires --iptables=false --bridge=none)
    log_info "Starting Docker daemon..."
    pkill dockerd 2>/dev/null || true
    sleep 2
    dockerd --iptables=false --bridge=none > /var/log/dockerd.log 2>&1 &
    sleep 10

    # Verify Docker is running
    if docker ps &> /dev/null; then
        log_success "Docker installed and running: $(docker --version)"
    else
        log_error "Docker failed to start. Check /var/log/dockerd.log"
        exit 1
    fi
}

# Install Docker Compose
install_docker_compose() {
    if docker compose version &> /dev/null; then
        log_info "Docker Compose already installed: $(docker compose version)"
        return
    fi

    log_info "Installing Docker Compose..."

    # Docker Compose is usually bundled with Docker now
    # If not, install it separately
    if ! docker compose version &> /dev/null; then
        DOCKER_COMPOSE_VERSION="v2.23.0"
        curl -L "https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
        chmod +x /usr/local/bin/docker-compose
    fi

    log_success "Docker Compose installed: $(docker compose version)"
}

# Install Tailscale
install_tailscale() {
    if command -v tailscale &> /dev/null; then
        log_info "Tailscale already installed: $(tailscale version)"
        return
    fi

    log_info "Installing Tailscale..."
    curl -fsSL https://tailscale.com/install.sh | sh

    log_success "Tailscale installed: $(tailscale version)"
}

# Build Docker images
build_docker_images() {
    log_info "Building Docker images..."

    cd /workspace/ai

    # Use legacy builder (buildkit has permission issues in RunPod)
    export DOCKER_BUILDKIT=0

    # Build orchestrator
    log_info "Building orchestrator..."
    docker compose -f compose.yaml build orchestrator

    # Build vLLM
    log_info "Building vLLM..."
    docker compose -f compose.yaml build vllm-qwen

    # Build MusicGen
    log_info "Building MusicGen..."
    docker compose -f compose.yaml build musicgen

    # Pull Flux image (pre-built)
    log_info "Pulling Flux.1 image..."
    docker pull ghcr.io/matatonic/openedai-images-flux:latest

    log_success "All Docker images built"
}

# Pre-download models
download_models() {
    log_info "Pre-downloading AI models (this will take 30-45 minutes)..."

    cd /workspace/ai

    # Create model cache directories
    mkdir -p /workspace/huggingface_cache
    mkdir -p /workspace/flux/models
    mkdir -p /workspace/musicgen/models

    # Download Qwen 2.5 7B
    log_info "Downloading Qwen 2.5 7B (14GB)..."
    docker compose --profile text up -d vllm-qwen

    # Wait for model to download
    log_info "Waiting for Qwen model to download..."
    while ! docker logs ai_vllm-qwen_1 2>&1 | grep -q "Model loaded successfully\|AsyncLLMEngine initialized"; do
        echo -n "."
        sleep 10
    done
    echo ""
    log_success "Qwen 2.5 7B downloaded"

    docker compose stop vllm-qwen

    # Download Flux.1 Schnell
    log_info "Downloading Flux.1 Schnell (12GB)..."
    docker compose --profile image up -d flux

    log_info "Waiting for Flux model to download..."
    sleep 180  # Flux takes about 3 minutes to download and initialize
    log_success "Flux.1 Schnell downloaded"

    docker compose stop flux

    # Download MusicGen Medium
    log_info "Downloading MusicGen Medium (11GB)..."
    docker compose --profile audio up -d musicgen

    log_info "Waiting for MusicGen model to download..."
    while ! docker logs ai_musicgen_1 2>&1 | grep -q "Model loaded successfully\|initialized successfully"; do
        echo -n "."
        sleep 10
    done
    echo ""
    log_success "MusicGen Medium downloaded"

    docker compose stop musicgen

    log_success "All models downloaded and cached"
}

# Validate installation
validate_installation() {
    log_info "Validating installation..."

    cd /workspace/ai

    # Start orchestrator
    log_info "Starting orchestrator for validation..."
    docker compose -f compose.yaml up -d orchestrator

    sleep 10

    # Check orchestrator health
    if curl -s http://localhost:9000/health | grep -q "healthy\|ok"; then
        log_success "Orchestrator is healthy"
    else
        log_error "Orchestrator health check failed"
        docker logs ai_orchestrator
        exit 1
    fi

    # Check models are cached
    if [ -d "/workspace/huggingface_cache" ] && [ "$(ls -A /workspace/huggingface_cache)" ]; then
        log_success "Hugging Face cache populated"
    else
        log_warn "Hugging Face cache may be empty"
    fi

    # Stop orchestrator
    docker compose -f compose.yaml down

    log_success "Validation passed"
}

# Clean up for template creation
cleanup_for_template() {
    log_info "Cleaning up for template creation..."

    # Remove sensitive data
    log_info "Removing sensitive files..."
    rm -f /workspace/ai/.env
    rm -f /root/.ssh/known_hosts
    rm -f /root/.bash_history
    rm -f /root/.python_history

    # Clear logs
    log_info "Clearing logs..."
    find /var/log -type f -name "*.log" -delete 2>/dev/null || true
    journalctl --vacuum-time=1s 2>/dev/null || true

    # Logout from Tailscale
    log_info "Logging out from Tailscale..."
    tailscale logout 2>/dev/null || true

    # Clean Docker (but keep images)
    log_info "Cleaning Docker cache..."
    docker system prune -af --volumes || true

    # Create template marker
    log_info "Creating template version marker..."
    cat > /workspace/TEMPLATE_VERSION <<EOF
RunPod Multi-Modal AI Template
Version: 1.0
Created: $(date)
Components:
- Docker $(docker --version | cut -d' ' -f3)
- Docker Compose $(docker compose version --short)
- Tailscale $(tailscale version --short 2>/dev/null || echo "installed")
- Orchestrator (ai_orchestrator)
- Text Generation (vLLM + Qwen 2.5 7B)
- Image Generation (Flux.1 Schnell)
- Music Generation (MusicGen Medium)
Models Cached: ~37GB
EOF

    log_success "Cleanup complete"
}

# Main execution
main() {
    log_info "======================================"
    log_info "RunPod Template Preparation Script"
    log_info "======================================"
    log_info ""

    check_environment
    install_docker
    install_docker_compose
    install_tailscale
    build_docker_images
    download_models
    validate_installation
    cleanup_for_template

    log_info ""
    log_success "======================================"
    log_success "Template Preparation Complete!"
    log_success "======================================"
    log_info ""
    log_info "Next steps:"
    log_info "1. Review /workspace/TEMPLATE_VERSION"
    log_info "2. Go to RunPod Dashboard → My Pods"
    log_info "3. Select this pod → ⋮ → Save as Template"
    log_info "4. Name: multi-modal-ai-v1.0"
    log_info "5. Test deployment from template"
    log_info ""
    log_info "Template will enable 2-3 minute deployments instead of 60-90 minutes!"
    log_info ""
}

# Run main function
main "$@"
stop-all.sh (modified)
@@ -17,8 +17,6 @@ pkill -f "orchestrator_subprocess.py" || echo "Orchestrator not running"

echo "Stopping model services..."
pkill -f "models/vllm/server.py" || echo "vLLM not running"
pkill -f "models/flux/server.py" || echo "Flux not running"
pkill -f "models/musicgen/server.py" || echo "MusicGen not running"

echo ""
echo "All services stopped"
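Of the lines shown in the hunk above, the two `pkill` lines for Flux and MusicGen are the ones removed (old count 8, new count 6); a sketch of that section of stop-all.sh after this commit, limited to the lines the hunk shows:

```sh
echo "Stopping model services..."
pkill -f "models/vllm/server.py" || echo "vLLM not running"

echo ""
echo "All services stopped"
```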