Initial implementation of AudioCraft Studio

Complete web interface for Meta's AudioCraft AI audio generation:

- Gradio UI with tabs for all 5 model families (MusicGen, AudioGen,
  MAGNeT, MusicGen Style, JASCO)
- REST API with FastAPI, OpenAPI docs, and API key auth
- VRAM management with ComfyUI coexistence support
- SQLite database for project/generation history
- Batch processing queue for async generation
- Docker deployment optimized for RunPod with RTX 4090

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-11-25 19:34:27 +01:00
commit ffbf02b12c
67 changed files with 12032 additions and 0 deletions

151
config/models.yaml Normal file
View File

@@ -0,0 +1,151 @@
# AudioCraft Model Registry Configuration
# This file defines all available models and their configurations
# Model registry: one entry per model family, each exposing one or more variants.
#
# Per-family keys:
#   enabled          - boolean switch for the family
#   display_name     - human-readable family name
#   description      - short summary of what the family does
#   default_variant  - must name one of the keys under `variants`
#   variants         - mapping of variant name -> variant settings
#
# Per-variant keys:
#   hf_id            - model identifier (presumably a Hugging Face repo id - confirm)
#   vram_mb          - presumably the estimated VRAM footprint in MB - confirm with loader
#   max_duration     - presumably the maximum clip length in seconds - confirm with loader
#   channels         - optional; set to 2 on the stereo variants only
#   conditioning     - optional list of extra conditioning inputs the variant accepts
#   description      - short summary of the variant
models:
  musicgen:
    enabled: true
    display_name: "MusicGen"
    description: "Text-to-music generation with optional melody conditioning"
    default_variant: medium
    variants:
      small:
        hf_id: facebook/musicgen-small
        vram_mb: 1500
        max_duration: 30
        description: "Fast, lightweight model (300M params)"
      medium:
        hf_id: facebook/musicgen-medium
        vram_mb: 5000
        max_duration: 30
        description: "Balanced quality and speed (1.5B params)"
      large:
        hf_id: facebook/musicgen-large
        vram_mb: 10000
        max_duration: 30
        description: "Highest quality, slower (3.3B params)"
      melody:
        hf_id: facebook/musicgen-melody
        vram_mb: 5000
        max_duration: 30
        conditioning:
          - melody
        description: "Melody-conditioned generation (1.5B params)"
      stereo-small:
        hf_id: facebook/musicgen-stereo-small
        vram_mb: 1800
        max_duration: 30
        channels: 2
        description: "Stereo output, fast (300M params)"
      stereo-medium:
        hf_id: facebook/musicgen-stereo-medium
        vram_mb: 6000
        max_duration: 30
        channels: 2
        description: "Stereo output, balanced (1.5B params)"
      stereo-large:
        hf_id: facebook/musicgen-stereo-large
        vram_mb: 12000
        max_duration: 30
        channels: 2
        description: "Stereo output, highest quality (3.3B params)"
      stereo-melody:
        hf_id: facebook/musicgen-stereo-melody
        vram_mb: 6000
        max_duration: 30
        channels: 2
        conditioning:
          - melody
        description: "Stereo melody-conditioned (1.5B params)"

  audiogen:
    enabled: true
    display_name: "AudioGen"
    description: "Text-to-sound effects generation"
    default_variant: medium
    variants:
      medium:
        hf_id: facebook/audiogen-medium
        vram_mb: 5000
        max_duration: 10
        description: "Sound effects generator (1.5B params)"

  magnet:
    enabled: true
    display_name: "MAGNeT"
    description: "Fast non-autoregressive music generation"
    default_variant: medium-10secs
    variants:
      small-10secs:
        hf_id: facebook/magnet-small-10secs
        vram_mb: 1500
        max_duration: 10
        description: "Fast 10-second clips (300M params)"
      medium-10secs:
        hf_id: facebook/magnet-medium-10secs
        vram_mb: 5000
        max_duration: 10
        description: "Quality 10-second clips (1.5B params)"
      small-30secs:
        hf_id: facebook/magnet-small-30secs
        vram_mb: 1800
        max_duration: 30
        description: "Fast 30-second clips (300M params)"
      medium-30secs:
        hf_id: facebook/magnet-medium-30secs
        vram_mb: 6000
        max_duration: 30
        description: "Quality 30-second clips (1.5B params)"

  musicgen-style:
    enabled: true
    display_name: "MusicGen Style"
    description: "Style-conditioned music generation from reference audio"
    default_variant: medium
    variants:
      medium:
        hf_id: facebook/musicgen-style
        vram_mb: 5000
        max_duration: 30
        conditioning:
          - style
        description: "Style transfer from reference audio (1.5B params)"

  jasco:
    enabled: true
    display_name: "JASCO"
    description: "Chord and drum-conditioned music generation"
    default_variant: chords-drums-400M
    variants:
      chords-drums-400M:
        hf_id: facebook/jasco-chords-drums-400M
        vram_mb: 2000
        max_duration: 10
        conditioning:
          - chords
          - drums
        description: "Chord/drum control, fast (400M params)"
      chords-drums-1B:
        hf_id: facebook/jasco-chords-drums-1B
        vram_mb: 4000
        max_duration: 10
        conditioning:
          - chords
          - drums
        description: "Chord/drum control, higher quality (1B params)"
# Default generation parameters, grouped under `defaults.generation`.
# NOTE(review): `duration` is presumably in seconds - confirm against the consumer.
defaults:
  generation:
    duration: 10
    temperature: 1.0
    top_k: 250
    # NOTE(review): AudioCraft documents top_p == 0 as "use top_k sampling
    # instead"; confirm this value is passed through to the model unchanged.
    top_p: 0.0
    cfg_coef: 3.0
# VRAM thresholds for warnings, expressed as utilization fractions (0.85 = 85%).
# NOTE(review): this key is kept at its existing column; if the loader reads it
# from under another section, re-indent the whole stanza accordingly - confirm.
vram:
  warning_threshold: 0.85  # 85% utilization warning
  critical_threshold: 0.95  # 95% utilization critical
# Directory that holds preset definitions; presets are loaded from
# data/presets/*.yaml. The path is relative - what it is resolved against is
# decided by the consumer (not visible here).
presets_dir: "./data/presets"