---
#
# RunPod AI Infrastructure Ansible Playbook
#
# This playbook provisions a RunPod GPU instance with multi-modal AI services.
# It replaces all bash scripts with reproducible Ansible tasks.
#
# Usage:
#   ansible-playbook playbook.yml                   # Full deployment
#   ansible-playbook playbook.yml --tags base       # Install system packages
#   ansible-playbook playbook.yml --tags python     # Setup Python environment
#   ansible-playbook playbook.yml --tags models     # Download models only
#   ansible-playbook playbook.yml --tags validate   # Validate installation
#
# Tags:
#   base         - System packages and dependencies
#   python       - Python environment setup
#   dependencies - Install Python packages
#   models       - Download AI models
#   tailscale    - Install and configure Tailscale
#   systemd      - Configure systemd services
#   validate     - Health checks and validation
#   cleanup      - Prepare pod for template creation
#

- name: Provision RunPod GPU Instance for AI Services
  hosts: localhost
  connection: local
  become: false

  vars:
    # Paths
    workspace_dir: /workspace
    ai_dir: "{{ workspace_dir }}/ai"
    cache_dir: "{{ workspace_dir }}/huggingface_cache"
    models_dir: "{{ workspace_dir }}/models"

    # Python configuration
    python_version: "3.10"
    pip_version: "23.3.1"

    # Model configuration
    models:
      vllm:
        name: "Qwen/Qwen2.5-7B-Instruct"
        size_gb: 14
      flux:
        name: "black-forest-labs/FLUX.1-schnell"
        size_gb: 12
      musicgen:
        name: "facebook/musicgen-medium"
        size_gb: 11

    # Service configuration
    services:
      - name: orchestrator
        port: 9000
        script: model-orchestrator/orchestrator_subprocess.py
      - name: vllm
        port: 8001
        script: models/vllm/server.py
      - name: flux
        port: 8002
        script: models/flux/server.py
      - name: musicgen
        port: 8003
        script: models/musicgen/server.py

  tasks:
    #
    # Base System Setup
    #
    - name: Base system packages
      tags: [base, always]
      block:
        - name: Check GPU availability
          shell: nvidia-smi
          register: nvidia_check
          changed_when: false
          failed_when: nvidia_check.rc != 0

        - name: Display GPU information
          debug:
            msg: "{{ nvidia_check.stdout_lines }}"

        - name: Ensure workspace directory exists
          file:
            path: "{{ workspace_dir }}"
            state: directory
            mode: '0755'

        - name: Update apt cache
          apt:
            update_cache: yes
            cache_valid_time: 3600
          become: true

        - name: Install base system packages
          apt:
            name:
              - build-essential
              - python3-dev
              - python3-pip
              - python3-venv
              - git
              - curl
              - wget
              - vim
              - htop
              - tmux
              - net-tools
            state: present
          become: true

    #
    # Python Environment Setup
    #
    - name: Python environment setup
      tags: [python]
      block:
        - name: Upgrade pip
          pip:
            name: pip
            version: "{{ pip_version }}"
            executable: pip3
            extra_args: --upgrade
          become: true

        - name: Install core Python packages
          pip:
            requirements: "{{ ai_dir }}/core/requirements.txt"
            executable: pip3
          become: true

    #
    # Install Model Dependencies
    #
    - name: Install model dependencies
      tags: [dependencies]
      block:
        - name: Install vLLM dependencies
          pip:
            requirements: "{{ ai_dir }}/models/vllm/requirements.txt"
            executable: pip3
          become: true

        - name: Install Flux dependencies
          pip:
            requirements: "{{ ai_dir }}/models/flux/requirements.txt"
            executable: pip3
          become: true

        - name: Install MusicGen dependencies
          pip:
            requirements: "{{ ai_dir }}/models/musicgen/requirements.txt"
            executable: pip3
          become: true
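    # The four requirements files referenced above are expected to exist in the
    # repo checkout under {{ ai_dir }}; their contents are not defined by this
    # playbook. A minimal, non-authoritative sketch based on the packages this
    # playbook imports or greps for elsewhere (exact contents and pins are
    # assumptions, verify against the repo and your CUDA build):
    #
    #   core/requirements.txt              fastapi, uvicorn
    #   models/vllm/requirements.txt       vllm, transformers
    #   models/flux/requirements.txt       diffusers, transformers, torch
    #   models/musicgen/requirements.txt   audiocraft, torch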
    #
    # Download AI Models
    #
    - name: Download AI models
      tags: [models]
      block:
        - name: Create model cache directories
          file:
            path: "{{ item }}"
            state: directory
            mode: '0755'
          loop:
            - "{{ cache_dir }}"
            - "{{ models_dir }}/flux"
            - "{{ models_dir }}/musicgen"

        # results[] below follows the key order of the models dict: vllm, flux, musicgen
        - name: Check if models are already cached
          stat:
            path: "{{ cache_dir }}/models--{{ item.value.name | regex_replace('/', '--') }}"
          register: model_cache_check
          loop: "{{ models | dict2items }}"
          loop_control:
            label: "{{ item.key }}"

        - name: Download Qwen 2.5 7B model (14GB, ~15 minutes)
          shell: |
            python3 -c "
            from transformers import AutoTokenizer, AutoModelForCausalLM
            import os
            os.environ['HF_HOME'] = '{{ cache_dir }}'
            print('Downloading Qwen 2.5 7B Instruct...')
            AutoTokenizer.from_pretrained('{{ models.vllm.name }}')
            AutoModelForCausalLM.from_pretrained('{{ models.vllm.name }}', torch_dtype='auto')
            print('Model and tokenizer downloaded successfully')
            "
          environment:
            HF_TOKEN: "{{ lookup('env', 'HF_TOKEN') }}"
            HF_HOME: "{{ cache_dir }}"
          when: not (model_cache_check.results[0].stat.exists | default(false))
          register: vllm_download
          async: 1800  # 30 minutes timeout
          poll: 30

        - name: Download Flux.1 Schnell model (12GB, ~12 minutes)
          shell: |
            python3 -c "
            from diffusers import FluxPipeline
            import os
            os.environ['HF_HOME'] = '{{ cache_dir }}'
            print('Downloading Flux.1 Schnell...')
            FluxPipeline.from_pretrained(
                '{{ models.flux.name }}',
                cache_dir='{{ cache_dir }}'
            )
            print('Flux.1 downloaded successfully')
            "
          environment:
            HF_TOKEN: "{{ lookup('env', 'HF_TOKEN') }}"
            HF_HOME: "{{ cache_dir }}"
          when: not (model_cache_check.results[1].stat.exists | default(false))
          register: flux_download
          async: 1200  # 20 minutes timeout
          poll: 30

        - name: Download MusicGen Medium model (11GB, ~10 minutes)
          shell: |
            python3 -c "
            from audiocraft.models import MusicGen
            import os
            os.environ['HF_HOME'] = '{{ cache_dir }}'
            print('Downloading MusicGen Medium...')
            MusicGen.get_pretrained('{{ models.musicgen.name }}')
            print('MusicGen downloaded successfully')
            "
          environment:
            HF_TOKEN: "{{ lookup('env', 'HF_TOKEN') }}"
            HF_HOME: "{{ cache_dir }}"
          when: not (model_cache_check.results[2].stat.exists | default(false))
          register: musicgen_download
          async: 900  # 15 minutes timeout
          poll: 30

        - name: Display model download summary
          debug:
            msg: |
              Model downloads completed:
              - Qwen 2.5 7B: {{ 'Downloaded' if vllm_download.changed | default(false) else 'Already cached' }}
              - Flux.1 Schnell: {{ 'Downloaded' if flux_download.changed | default(false) else 'Already cached' }}
              - MusicGen Medium: {{ 'Downloaded' if musicgen_download.changed | default(false) else 'Already cached' }}

              Total cache size: ~37GB

    #
    # Tailscale VPN
    #
    - name: Install and configure Tailscale
      tags: [tailscale]
      block:
        - name: Check if Tailscale is installed
          command: which tailscale
          register: tailscale_check
          changed_when: false
          failed_when: false

        - name: Install Tailscale
          shell: curl -fsSL https://tailscale.com/install.sh | sh
          become: true
          when: tailscale_check.rc != 0

        - name: Display Tailscale setup instructions
          debug:
            msg: |
              Tailscale installed. To connect:
              1. Start tailscaled: tailscaled --tun=userspace-networking --socks5-server=localhost:1055 &
              2. Authenticate: tailscale up --advertise-tags=tag:gpu
              3. Get IP: tailscale ip -4

              Note: Authentication requires manual intervention via provided URL

    #
    # Systemd Services (Optional)
    #
    - name: Configure systemd services
      tags: [systemd, never]  # never = skip by default, run explicitly
      block:
        - name: Create systemd service for orchestrator
          template:
            src: "{{ ai_dir }}/systemd/ai-orchestrator.service.j2"
            dest: /etc/systemd/system/ai-orchestrator.service
            mode: '0644'
          become: true

        - name: Reload systemd daemon
          systemd:
            daemon_reload: yes
          become: true

        - name: Enable orchestrator service
          systemd:
            name: ai-orchestrator
            enabled: yes
          become: true

        - name: Display systemd instructions
          debug:
            msg: |
              Systemd service configured. To manage:
              - Start:  sudo systemctl start ai-orchestrator
              - Stop:   sudo systemctl stop ai-orchestrator
              - Status: sudo systemctl status ai-orchestrator
              - Logs:   sudo journalctl -u ai-orchestrator -f
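    # The template task above expects {{ ai_dir }}/systemd/ai-orchestrator.service.j2
    # to exist in the repo; its content is not defined in this playbook. A minimal
    # sketch of what such a unit might look like (paths, environment, and restart
    # policy are assumptions, not the repo's actual template):
    #
    #   [Unit]
    #   Description=AI model orchestrator
    #   After=network.target
    #
    #   [Service]
    #   WorkingDirectory={{ ai_dir }}
    #   Environment=HF_HOME={{ cache_dir }}
    #   ExecStart=/usr/bin/python3 {{ ai_dir }}/model-orchestrator/orchestrator_subprocess.py
    #   Restart=on-failure
    #
    #   [Install]
    #   WantedBy=multi-user.target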
    #
    # Validation
    #
    - name: Validate installation
      tags: [validate, never]  # never = skip by default, run explicitly
      block:
        - name: Check Python packages
          shell: pip3 list | grep -E "(fastapi|uvicorn|torch|vllm|diffusers|audiocraft)"
          register: pip_check
          changed_when: false

        - name: Display installed packages
          debug:
            msg: "{{ pip_check.stdout_lines }}"

        - name: Check GPU memory
          shell: nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits
          register: gpu_memory
          changed_when: false

        - name: Display GPU memory
          debug:
            msg: "Free GPU memory: {{ gpu_memory.stdout }} MB"

        - name: Check cached models
          shell: du -sh {{ cache_dir }}
          register: cache_size
          changed_when: false

        - name: Display cache information
          debug:
            msg: "Model cache size: {{ cache_size.stdout }}"

        - name: Ensure service scripts are executable
          file:
            path: "{{ ai_dir }}/{{ item.script }}"
            mode: '0755'
          loop: "{{ services }}"

        - name: Display validation summary
          debug:
            msg: |
              ✓ Installation validated successfully!

              Next steps:
              1. Start orchestrator: python3 {{ ai_dir }}/model-orchestrator/orchestrator_subprocess.py
              2. Test endpoint: curl http://localhost:9000/health
              3. Configure LiteLLM on VPS to connect via Tailscale

              Services:
              {% for service in services %}
              - {{ service.name }}: http://localhost:{{ service.port }}
              {% endfor %}

    #
    # Cleanup for Template Creation
    #
    - name: Cleanup for template creation
      tags: [cleanup, never]  # never = skip by default, run explicitly
      block:
        - name: Remove sensitive files
          file:
            path: "{{ item }}"
            state: absent
          loop:
            - "{{ ai_dir }}/.env"
            - /root/.ssh/known_hosts
            - /root/.bash_history
            - /root/.python_history

        - name: Clear system logs
          shell: find /var/log -type f -name "*.log" -delete
          become: true
          ignore_errors: yes

        - name: Create template version marker
          copy:
            dest: "{{ workspace_dir }}/TEMPLATE_VERSION"
            content: |
              RunPod Multi-Modal AI Template (Process-Based Architecture)
              Version: 2.0
              Created: {{ ansible_date_time.iso8601 }}

              Components:
              - Python {{ python_version }}
              - Orchestrator (process-based)
              - Text Generation (vLLM + Qwen 2.5 7B)
              - Image Generation (Flux.1 Schnell)
              - Music Generation (MusicGen Medium)

              Models Cached: ~37GB
              Architecture: No Docker, direct Python execution

              Deployment:
              1. Create .env file with HF_TOKEN
              2. Run: python3 {{ ai_dir }}/model-orchestrator/orchestrator_subprocess.py
              3. Access: http://localhost:9000/health

        - name: Display template creation instructions
          debug:
            msg: |
              Template prepared successfully!

              Next steps in RunPod dashboard:
              1. Stop all running services
              2. Go to My Pods → Select this pod → ⋮ → Save as Template
              3. Name: multi-modal-ai-process-v2.0
              4. Description: Process-based multi-modal AI (text/image/music)
              5. Save and test deployment from template

              Template enables 2-3 minute deployments instead of 60+ minutes!
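#
# The systemd, validate, and cleanup blocks are tagged "never", so they run only
# when requested explicitly, e.g.:
#
#   ansible-playbook playbook.yml --tags validate
#   ansible-playbook playbook.yml --tags systemd
#   ansible-playbook playbook.yml --tags cleanup
#
# The cleanup block deletes {{ ai_dir }}/.env before a template snapshot, and the
# deployment notes say to recreate it with HF_TOKEN at deploy time. A minimal
# sketch of that file (the token value is a placeholder, not a real credential):
#
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
#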