diff --git a/arty.yml b/arty.yml
index 51ecd6f..7c289d9 100644
--- a/arty.yml
+++ b/arty.yml
@@ -63,6 +63,11 @@ references:
     description: "MusicGen and Stable Audio integration"
     essential: false
 
+  - url: https://github.com/billwuhao/ComfyUI_DiffRhythm.git
+    into: $COMFYUI_ROOT/custom_nodes/ComfyUI_DiffRhythm
+    description: "DiffRhythm - Full-length song generation (up to 4m45s) with text/audio conditioning"
+    essential: false
+
   - url: https://github.com/ssitu/ComfyUI_UltimateSDUpscale.git
     into: $COMFYUI_ROOT/custom_nodes/ComfyUI_UltimateSDUpscale
     description: "Ultimate SD Upscale for high-quality image upscaling"
@@ -279,43 +284,67 @@ scripts:
     echo "========================================="
     echo ""
 
+    # Install system dependencies
+    echo "Installing system dependencies..."
+    sudo apt-get update -qq
+    sudo apt-get install -y -qq espeak-ng
+    echo "✓ System dependencies installed (espeak-ng)"
+    echo ""
+
     cd $COMFYUI_ROOT/custom_nodes
 
     # ComfyUI Manager
-    echo "[1/5] Installing ComfyUI-Manager..."
+    echo "[1/6] Installing ComfyUI-Manager..."
     if [ ! -d "ComfyUI-Manager" ]; then
       git clone https://github.com/ltdrdata/ComfyUI-Manager.git
     fi
     [ -f "ComfyUI-Manager/requirements.txt" ] && sudo pip3 install -r ComfyUI-Manager/requirements.txt
 
     # VideoHelperSuite
-    echo "[2/5] Installing ComfyUI-VideoHelperSuite..."
+    echo "[2/6] Installing ComfyUI-VideoHelperSuite..."
     if [ ! -d "ComfyUI-VideoHelperSuite" ]; then
       git clone https://github.com/Kosinkadink/ComfyUI-VideoHelperSuite.git
     fi
     [ -f "ComfyUI-VideoHelperSuite/requirements.txt" ] && sudo pip3 install -r ComfyUI-VideoHelperSuite/requirements.txt
 
     # AnimateDiff-Evolved
-    echo "[3/5] Installing ComfyUI-AnimateDiff-Evolved..."
+    echo "[3/6] Installing ComfyUI-AnimateDiff-Evolved..."
     if [ ! -d "ComfyUI-AnimateDiff-Evolved" ]; then
       git clone https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved.git
     fi
     [ -f "ComfyUI-AnimateDiff-Evolved/requirements.txt" ] && sudo pip3 install -r ComfyUI-AnimateDiff-Evolved/requirements.txt
 
     # IPAdapter Plus
-    echo "[4/5] Installing ComfyUI_IPAdapter_plus..."
+    echo "[4/6] Installing ComfyUI_IPAdapter_plus..."
     if [ ! -d "ComfyUI_IPAdapter_plus" ]; then
       git clone https://github.com/cubiq/ComfyUI_IPAdapter_plus.git
     fi
     [ -f "ComfyUI_IPAdapter_plus/requirements.txt" ] && sudo pip3 install -r ComfyUI_IPAdapter_plus/requirements.txt
 
     # Impact-Pack
-    echo "[5/5] Installing ComfyUI-Impact-Pack..."
+    echo "[5/6] Installing ComfyUI-Impact-Pack..."
     if [ ! -d "ComfyUI-Impact-Pack" ]; then
       git clone https://github.com/ltdrdata/ComfyUI-Impact-Pack.git
     fi
     [ -f "ComfyUI-Impact-Pack/requirements.txt" ] && sudo pip3 install -r ComfyUI-Impact-Pack/requirements.txt
 
+    # DiffRhythm
+    echo "[6/6] Installing ComfyUI_DiffRhythm..."
+    if [ ! -d "ComfyUI_DiffRhythm" ]; then
+      git clone https://github.com/billwuhao/ComfyUI_DiffRhythm.git
+    fi
+    if [ -f "ComfyUI_DiffRhythm/requirements.txt" ]; then
+      cd $COMFYUI_ROOT
+      source venv/bin/activate
+      pip install -r custom_nodes/ComfyUI_DiffRhythm/requirements.txt
+      deactivate
+      cd custom_nodes
+    fi
+
+    # Create DiffRhythm model directories
+    echo "Creating DiffRhythm model directories..."
+    mkdir -p $COMFYUI_ROOT/models/TTS/DiffRhythm/{MuQ-large-msd-iter,MuQ-MuLan-large,xlm-roberta-base}
+
     # Fix numpy version for vLLM compatibility
     echo "Fixing numpy version..."
     sudo pip3 install 'numpy<2.0.0' --force-reinstall
@@ -327,6 +356,7 @@ scripts:
     echo " - AnimateDiff-Evolved: Video generation"
     echo " - IPAdapter_plus: Style transfer"
     echo " - Impact-Pack: Face enhancement"
+    echo " - DiffRhythm: Full-length song generation"
 
   setup/comfyui-extensions-deps: |
     echo "========================================="
diff --git a/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json b/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json
new file mode 100644
index 0000000..59e57c7
--- /dev/null
+++ b/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json
@@ -0,0 +1,115 @@
+{
+  "last_node_id": 3,
+  "last_link_id": 2,
+  "nodes": [
+    {
+      "id": 1,
+      "type": "DiffRhythmTextToMusic",
+      "pos": [100, 100],
+      "size": [400, 300],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [1]
+        }
+      ],
+      "properties": {},
+      "widgets_values": [
+        "Cinematic orchestral piece with soaring strings, powerful brass, and emotional piano melodies building to an epic crescendo",
+        285.0,
+        3.5,
+        123,
+        "cfm_full_model",
+        "auto"
+      ],
+      "title": "DiffRhythm Full-Length Text-to-Music (4m45s)"
+    },
+    {
+      "id": 2,
+      "type": "PreviewAudio",
+      "pos": [600, 100],
+      "size": [300, 100],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 1
+        }
+      ],
+      "properties": {},
+      "title": "Preview Audio"
+    },
+    {
+      "id": 3,
+      "type": "SaveAudio",
+      "pos": [600, 250],
+      "size": [300, 100],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 2
+        }
+      ],
+      "properties": {},
+      "widgets_values": [
+        "diffrhythm_full_output"
+      ],
+      "title": "Save Audio"
+    }
+  ],
+  "links": [
+    [1, 1, 0, 2, 0, "AUDIO"],
+    [2, 1, 0, 3, 0, "AUDIO"]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "workflow_info": {
+      "name": "DiffRhythm Full-Length Text-to-Music v1",
+      "description": "Full-length music generation using DiffRhythm Full (4 minutes 45 seconds)",
+      "version": "1.0.0",
+      "author": "valknar@pivoine.art",
+      "category": "text-to-music",
+      "tags": ["diffrhythm", "music-generation", "text-to-music", "full-length", "4m45s"],
+      "requirements": {
+        "custom_nodes": ["ComfyUI_DiffRhythm"],
+        "models": ["ASLP-lab/DiffRhythm-full", "ASLP-lab/DiffRhythm-vae", "OpenMuQ/MuQ-MuLan-large", "OpenMuQ/MuQ-large-msd-iter", "FacebookAI/xlm-roberta-base"],
+        "vram_min": "16GB",
+        "vram_recommended": "20GB",
+        "system_deps": ["espeak-ng"]
+      },
+      "usage": {
+        "prompt": "Detailed text description of the desired full-length music composition",
+        "duration": "Fixed at 285 seconds (4m45s) for DiffRhythm Full model",
+        "guidance_scale": "Controls how closely the output follows the prompt (1.0-10.0, default: 3.5)",
+        "seed": "Random seed for reproducibility (default: 123)",
+        "model": "cfm_full_model (DiffRhythm Full - 4m45s generation)",
+        "device": "auto (automatic GPU selection)"
+      },
+      "performance": {
+        "generation_time": "~60-90 seconds on RTX 4090",
+        "vram_usage": "~16GB during generation",
+        "note": "Significantly faster than real-time music generation"
+      },
+      "notes": [
+        "This workflow uses DiffRhythm Full for 4 minute 45 second music generation",
+        "Best for complete song compositions with intro, development, and outro",
+        "All parameters are optional - can generate music randomly",
+        "Supports complex, multi-part compositions",
+        "PLACEHOLDER: Actual node names and parameters need to be updated after ComfyUI_DiffRhythm installation"
+      ]
+    }
+  },
+  "version": 0.4
+}
diff --git a/comfyui/workflows/text-to-music/diffrhythm-random-generation-v1.json b/comfyui/workflows/text-to-music/diffrhythm-random-generation-v1.json
new file mode 100644
index 0000000..a6924b9
--- /dev/null
+++ b/comfyui/workflows/text-to-music/diffrhythm-random-generation-v1.json
@@ -0,0 +1,123 @@
+{
+  "last_node_id": 3,
+  "last_link_id": 2,
+  "nodes": [
+    {
+      "id": 1,
+      "type": "DiffRhythmRandomGeneration",
+      "pos": [100, 100],
+      "size": [400, 250],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [1]
+        }
+      ],
+      "properties": {},
+      "widgets_values": [
+        95.0,
+        -1,
+        "cfm_model_v1_2",
+        "auto"
+      ],
+      "title": "DiffRhythm Random Generation (No Prompt)"
+    },
+    {
+      "id": 2,
+      "type": "PreviewAudio",
+      "pos": [600, 100],
+      "size": [300, 100],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 1
+        }
+      ],
+      "properties": {},
+      "title": "Preview Audio"
+    },
+    {
+      "id": 3,
+      "type": "SaveAudio",
+      "pos": [600, 250],
+      "size": [300, 100],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 2
+        }
+      ],
+      "properties": {},
+      "widgets_values": [
+        "diffrhythm_random_output"
+      ],
+      "title": "Save Audio"
+    }
+  ],
+  "links": [
+    [1, 1, 0, 2, 0, "AUDIO"],
+    [2, 1, 0, 3, 0, "AUDIO"]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "workflow_info": {
+      "name": "DiffRhythm Random Generation v1",
+      "description": "Random music generation without any prompt or guidance - pure AI creativity",
+      "version": "1.0.0",
+      "author": "valknar@pivoine.art",
+      "category": "text-to-music",
+      "tags": ["diffrhythm", "music-generation", "random", "no-prompt", "discovery"],
+      "requirements": {
+        "custom_nodes": ["ComfyUI_DiffRhythm"],
+        "models": ["ASLP-lab/DiffRhythm-1_2", "ASLP-lab/DiffRhythm-vae", "OpenMuQ/MuQ-MuLan-large", "OpenMuQ/MuQ-large-msd-iter", "FacebookAI/xlm-roberta-base"],
+        "vram_min": "12GB",
+        "vram_recommended": "16GB",
+        "system_deps": ["espeak-ng"]
+      },
+      "usage": {
+        "duration": "Fixed at 95 seconds for DiffRhythm 1.2 model",
+        "seed": "-1 (random seed each generation) or specific number for reproducibility",
+        "model": "cfm_model_v1_2 (DiffRhythm 1.2)",
+        "device": "auto (automatic GPU selection)",
+        "note": "NO prompt, NO guidance, NO reference audio - pure random generation"
+      },
+      "use_cases": [
+        "Discovery: Explore what the model can create without constraints",
+        "Inspiration: Generate unexpected musical ideas and styles",
+        "Testing: Quick way to verify model is working correctly",
+        "Ambient music: Create random background music for various uses",
+        "Sample generation: Generate large batches of diverse music samples"
+      ],
+      "workflow_tips": [
+        "Run multiple times to discover different musical styles",
+        "Use seed=-1 for completely random output each time",
+        "Use fixed seed to reproduce interesting random results",
+        "Batch process: Run 10-20 times to find interesting compositions",
+        "Save any interesting results with their seed numbers"
+      ],
+      "notes": [
+        "This workflow demonstrates DiffRhythm's ability to generate music without any input",
+        "All DiffRhythm parameters are optional - this is the ultimate proof",
+        "Results can range from ambient to energetic, classical to electronic",
+        "Each generation is unique (with seed=-1)",
+        "Generation time: ~30-60 seconds on RTX 4090",
+        "Perfect for discovering unexpected musical combinations",
+        "PLACEHOLDER: Actual node names and parameters need to be updated after ComfyUI_DiffRhythm installation"
+      ]
+    }
+  },
+  "version": 0.4
+}
diff --git a/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json b/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json
new file mode 100644
index 0000000..f3b97fb
--- /dev/null
+++ b/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json
@@ -0,0 +1,149 @@
+{
+  "last_node_id": 4,
+  "last_link_id": 3,
+  "nodes": [
+    {
+      "id": 1,
+      "type": "LoadAudio",
+      "pos": [100, 100],
+      "size": [300, 100],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [1]
+        }
+      ],
+      "properties": {},
+      "widgets_values": [
+        "reference_audio.wav"
+      ],
+      "title": "Load Reference Audio"
+    },
+    {
+      "id": 2,
+      "type": "DiffRhythmReferenceBasedGeneration",
+      "pos": [500, 100],
+      "size": [400, 350],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "reference_audio",
+          "type": "AUDIO",
+          "link": 1
+        }
+      ],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [2]
+        }
+      ],
+      "properties": {},
+      "widgets_values": [
+        "Energetic rock music with driving guitar riffs and powerful drums",
+        95.0,
+        5.0,
+        0.7,
+        456,
+        "cfm_model_v1_2",
+        "auto"
+      ],
+      "title": "DiffRhythm Reference-Based Generation"
+    },
+    {
+      "id": 3,
+      "type": "PreviewAudio",
+      "pos": [1000, 100],
+      "size": [300, 100],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 2
+        }
+      ],
+      "properties": {},
+      "title": "Preview Generated Audio"
+    },
+    {
+      "id": 4,
+      "type": "SaveAudio",
+      "pos": [1000, 250],
+      "size": [300, 100],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 3
+        }
+      ],
+      "properties": {},
+      "widgets_values": [
+        "diffrhythm_reference_output"
+      ],
+      "title": "Save Audio"
+    }
+  ],
+  "links": [
+    [1, 1, 0, 2, 0, "AUDIO"],
+    [2, 2, 0, 3, 0, "AUDIO"],
+    [3, 2, 0, 4, 0, "AUDIO"]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "workflow_info": {
+      "name": "DiffRhythm Reference-Based Generation v1",
+      "description": "Generate new music based on a reference audio file while following text prompt guidance",
+      "version": "1.0.0",
+      "author": "valknar@pivoine.art",
+      "category": "text-to-music",
+      "tags": ["diffrhythm", "music-generation", "reference-based", "style-transfer"],
+      "requirements": {
+        "custom_nodes": ["ComfyUI_DiffRhythm"],
+        "models": ["ASLP-lab/DiffRhythm-1_2", "ASLP-lab/DiffRhythm-vae", "OpenMuQ/MuQ-MuLan-large", "OpenMuQ/MuQ-large-msd-iter", "FacebookAI/xlm-roberta-base"],
+        "vram_min": "14GB",
+        "vram_recommended": "18GB",
+        "system_deps": ["espeak-ng"]
+      },
+      "usage": {
+        "reference_audio": "Path to reference audio file (WAV, MP3, or other supported formats)",
+        "prompt": "Text description guiding the style and characteristics of generated music",
+        "duration": "Fixed at 95 seconds for DiffRhythm 1.2 model",
+        "guidance_scale": "Controls how closely output follows the prompt (1.0-10.0, default: 5.0)",
+        "reference_strength": "How much to follow the reference audio (0.0-1.0, default: 0.7)",
+        "seed": "Random seed for reproducibility (default: 456)",
+        "model": "cfm_model_v1_2 (DiffRhythm 1.2)",
+        "device": "auto (automatic GPU selection)"
+      },
+      "use_cases": [
+        "Style transfer: Apply the style of reference music to new prompt",
+        "Variations: Create variations of existing compositions",
+        "Genre transformation: Transform music to different genre while keeping structure",
+        "Mood adaptation: Change the mood/emotion while maintaining musical elements"
+      ],
+      "notes": [
+        "This workflow combines reference audio with text prompt guidance",
+        "Higher reference_strength (0.8-1.0) = closer to reference audio",
+        "Lower reference_strength (0.3-0.5) = more creative interpretation",
+        "Reference audio should ideally be similar duration to target (95s)",
+        "Can use any format supported by ComfyUI's audio loader",
+        "PLACEHOLDER: Actual node names and parameters need to be updated after ComfyUI_DiffRhythm installation"
+      ]
+    }
+  },
+  "version": 0.4
+}
diff --git a/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json b/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json
new file mode 100644
index 0000000..3e68775
--- /dev/null
+++ b/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json
@@ -0,0 +1,110 @@
+{
+  "last_node_id": 3,
+  "last_link_id": 2,
+  "nodes": [
+    {
+      "id": 1,
+      "type": "DiffRhythmTextToMusic",
+      "pos": [100, 100],
+      "size": [400, 300],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [1]
+        }
+      ],
+      "properties": {},
+      "widgets_values": [
+        "Upbeat electronic dance music with energetic beats and synthesizer melodies",
+        95.0,
+        4.0,
+        42,
+        "cfm_model_v1_2",
+        "auto"
+      ],
+      "title": "DiffRhythm Text-to-Music (95s)"
+    },
+    {
+      "id": 2,
+      "type": "PreviewAudio",
+      "pos": [600, 100],
+      "size": [300, 100],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 1
+        }
+      ],
+      "properties": {},
+      "title": "Preview Audio"
+    },
+    {
+      "id": 3,
+      "type": "SaveAudio",
+      "pos": [600, 250],
+      "size": [300, 100],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 2
+        }
+      ],
+      "properties": {},
+      "widgets_values": [
+        "diffrhythm_output"
+      ],
+      "title": "Save Audio"
+    }
+  ],
+  "links": [
+    [1, 1, 0, 2, 0, "AUDIO"],
+    [2, 1, 0, 3, 0, "AUDIO"]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "workflow_info": {
+      "name": "DiffRhythm Simple Text-to-Music v1",
+      "description": "Basic text-to-music generation using DiffRhythm 1.2 (95 seconds)",
+      "version": "1.0.0",
+      "author": "valknar@pivoine.art",
+      "category": "text-to-music",
+      "tags": ["diffrhythm", "music-generation", "text-to-music", "95s"],
+      "requirements": {
+        "custom_nodes": ["ComfyUI_DiffRhythm"],
+        "models": ["ASLP-lab/DiffRhythm-1_2", "ASLP-lab/DiffRhythm-vae", "OpenMuQ/MuQ-MuLan-large", "OpenMuQ/MuQ-large-msd-iter", "FacebookAI/xlm-roberta-base"],
+        "vram_min": "12GB",
+        "vram_recommended": "16GB",
+        "system_deps": ["espeak-ng"]
+      },
+      "usage": {
+        "prompt": "Text description of the desired music style, mood, and instruments",
+        "duration": "Fixed at 95 seconds for DiffRhythm 1.2 model",
+        "guidance_scale": "Controls how closely the output follows the prompt (1.0-10.0, default: 4.0)",
+        "seed": "Random seed for reproducibility (default: 42)",
+        "model": "cfm_model_v1_2 (DiffRhythm 1.2 - 95s generation)",
+        "device": "auto (automatic GPU selection)"
+      },
+      "notes": [
+        "This workflow uses DiffRhythm 1.2 for 95-second music generation",
+        "All parameters are optional - can generate music randomly without inputs",
+        "Supports English and Chinese text prompts",
+        "Generation time: ~30-60 seconds on RTX 4090",
+        "PLACEHOLDER: Actual node names and parameters need to be updated after ComfyUI_DiffRhythm installation"
+      ]
+    }
+  },
+  "version": 0.4
+}
diff --git a/models_huggingface.yaml b/models_huggingface.yaml
index d5ea5f3..5a051f0 100644
--- a/models_huggingface.yaml
+++ b/models_huggingface.yaml
@@ -485,6 +485,116 @@ model_categories:
         - source: "sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors"
           dest: "ip-adapter-plus-face_sdxl_vit-h.safetensors"
 
+  # ==========================================================================
+  # DIFFRHYTHM MODELS (Full-length song generation)
+  # ==========================================================================
+  diffrhythm_models:
+    - repo_id: ASLP-lab/DiffRhythm-1_2
+      description: DiffRhythm 1.2 - 95 second generation model
+      size_gb: 2
+      essential: true
+      category: diffrhythm
+      type: TTS/DiffRhythm
+      format: pt
+      vram_gb: 12
+      duration_seconds: 95
+      notes: Latest 95-second generation model
+      files:
+        - source: "cfm_model.pt"
+          dest: "cfm_model_v1_2.pt"
+
+    - repo_id: ASLP-lab/DiffRhythm-full
+      description: DiffRhythm Full - 4m45s full-length generation
+      size_gb: 2
+      essential: false
+      category: diffrhythm
+      type: TTS/DiffRhythm
+      format: pt
+      vram_gb: 16
+      duration_seconds: 285
+      notes: Full-length 4 minute 45 second music generation
+      files:
+        - source: "cfm_model.pt"
+          dest: "cfm_full_model.pt"
+
+    - repo_id: ASLP-lab/DiffRhythm-base
+      description: DiffRhythm Base - 95 second base model
+      size_gb: 2
+      essential: false
+      category: diffrhythm
+      type: TTS/DiffRhythm
+      format: pt
+      vram_gb: 12
+      duration_seconds: 95
+      notes: Base 95-second model
+      files:
+        - source: "cfm_model.pt"
+          dest: "cfm_model.pt"
+
+    - repo_id: ASLP-lab/DiffRhythm-vae
+      description: DiffRhythm VAE - Variational autoencoder
+      size_gb: 1
+      essential: true
+      category: diffrhythm
+      type: TTS/DiffRhythm
+      format: pt
+      vram_gb: 2
+      notes: VAE component fine-tuned from Stable Audio Open (Stability AI Community License)
+      files:
+        - source: "vae_model.pt"
+          dest: "vae_model.pt"
+
+    - repo_id: OpenMuQ/MuQ-MuLan-large
+      description: MuQ-MuLan-large - Music-text joint embedding (~700M parameters)
+      size_gb: 3
+      essential: true
+      category: diffrhythm
+      type: TTS/DiffRhythm/MuQ-MuLan-large
+      format: bin
+      vram_gb: 4
+      notes: Music-text joint embedding for semantic understanding (English/Chinese)
+      files:
+        - source: "config.json"
+          dest: "config.json"
+        - source: "pytorch_model.bin"
+          dest: "pytorch_model.bin"
+
+    - repo_id: OpenMuQ/MuQ-large-msd-iter
+      description: MuQ-large-msd-iter - Music representation learning (~300M parameters)
+      size_gb: 1.2
+      essential: true
+      category: diffrhythm
+      type: TTS/DiffRhythm/MuQ-large-msd-iter
+      format: safetensors
+      vram_gb: 2
+      notes: Music representation model trained on Million Song Dataset
+      files:
+        - source: "config.json"
+          dest: "config.json"
+        - source: "model.safetensors"
+          dest: "model.safetensors"
+
+    - repo_id: FacebookAI/xlm-roberta-base
+      description: XLM-RoBERTa Base - Multilingual text encoder (100 languages, 0.3B params)
+      size_gb: 1.1
+      essential: true
+      category: diffrhythm
+      type: TTS/DiffRhythm/xlm-roberta-base
+      format: safetensors
+      vram_gb: 1
+      notes: Multilingual text encoding for 100 languages
+      files:
+        - source: "config.json"
+          dest: "config.json"
+        - source: "model.safetensors"
+          dest: "model.safetensors"
+        - source: "sentencepiece.bpe.model"
+          dest: "sentencepiece.bpe.model"
+        - source: "tokenizer.json"
+          dest: "tokenizer.json"
+        - source: "tokenizer_config.json"
+          dest: "tokenizer_config.json"
+
 # ============================================================================
 # STORAGE & VRAM SUMMARIES
 # ============================================================================
@@ -495,14 +605,16 @@ storage_requirements:
     video: 28    # CogVideoX + SVD
     audio: 11    # MusicGen Medium
     support: 11  # All 3 CLIP models
-    total: 80    # Total essential storage
+    diffrhythm: 10  # DiffRhythm essential models
+    total: 90    # Total essential storage
 
   all_models:
     image: 54    # All image models
    video: 36    # All video models
     audio: 36    # All audio models
     support: 11  # All support models
-    total: 137   # Total with optional models
+    diffrhythm: 12  # All DiffRhythm models
+    total: 149   # Total with optional models
 
 vram_requirements:
   # For 24GB GPU (RTX 4090)
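Reviewer note (not part of the patch): the manifest entries above declare repo IDs and source/dest filenames, but the download step itself is outside this diff. Below is a minimal, illustrative Python sketch of how the essential DiffRhythm files could be pulled into the models/TTS/DiffRhythm tree that the setup/comfyui-extensions script creates. It assumes the huggingface_hub package is installed, that COMFYUI_ROOT is exported, and that each entry's type: field maps to a subdirectory under models/ — all assumptions of this sketch, not something the patch defines.

#!/usr/bin/env python3
# Sketch: fetch the essential DiffRhythm model files listed in models_huggingface.yaml
# into $COMFYUI_ROOT/models/TTS/DiffRhythm/. Assumes `pip install huggingface_hub`;
# repo IDs and source/dest names mirror the manifest above, directory layout is inferred.
import os
import shutil

from huggingface_hub import hf_hub_download

COMFYUI_ROOT = os.environ.get("COMFYUI_ROOT", "/opt/ComfyUI")
BASE = os.path.join(COMFYUI_ROOT, "models", "TTS", "DiffRhythm")

# (repo_id, source filename in repo, destination relative to BASE) — essential entries only
ESSENTIAL_FILES = [
    ("ASLP-lab/DiffRhythm-1_2", "cfm_model.pt", "cfm_model_v1_2.pt"),
    ("ASLP-lab/DiffRhythm-vae", "vae_model.pt", "vae_model.pt"),
    ("OpenMuQ/MuQ-MuLan-large", "config.json", "MuQ-MuLan-large/config.json"),
    ("OpenMuQ/MuQ-MuLan-large", "pytorch_model.bin", "MuQ-MuLan-large/pytorch_model.bin"),
    ("OpenMuQ/MuQ-large-msd-iter", "config.json", "MuQ-large-msd-iter/config.json"),
    ("OpenMuQ/MuQ-large-msd-iter", "model.safetensors", "MuQ-large-msd-iter/model.safetensors"),
    ("FacebookAI/xlm-roberta-base", "config.json", "xlm-roberta-base/config.json"),
    ("FacebookAI/xlm-roberta-base", "model.safetensors", "xlm-roberta-base/model.safetensors"),
    ("FacebookAI/xlm-roberta-base", "sentencepiece.bpe.model", "xlm-roberta-base/sentencepiece.bpe.model"),
    ("FacebookAI/xlm-roberta-base", "tokenizer.json", "xlm-roberta-base/tokenizer.json"),
    ("FacebookAI/xlm-roberta-base", "tokenizer_config.json", "xlm-roberta-base/tokenizer_config.json"),
]

def main() -> None:
    for repo_id, source, dest in ESSENTIAL_FILES:
        dest_path = os.path.join(BASE, dest)
        if os.path.exists(dest_path):
            print(f"skip  {dest_path} (already present)")
            continue
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        # hf_hub_download returns the local path of the cached file; copy it to the
        # destination name expected by the manifest (e.g. cfm_model.pt -> cfm_model_v1_2.pt).
        cached = hf_hub_download(repo_id=repo_id, filename=source)
        shutil.copy(cached, dest_path)
        print(f"fetch {repo_id}/{source} -> {dest_path}")

if __name__ == "__main__":
    main()

Run once after the extension install step; files already present are skipped, so re-running is safe.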