Files
runpod/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json
Sebastian Krüger d74a7cb7cb
All checks were successful
Build and Push RunPod Docker Image / build-and-push (push) Successful in 14s
fix: replace custom Pivoine node with direct DiffRhythm patch
- Remove custom PivoineDiffRhythmRun wrapper node
- Add git patch file for ComfyUI_DiffRhythm __init__.py
- Patch adds LlamaConfig fix at import time
- Add arty script 'fix/diffrhythm-patch' to apply patch
- Revert all workflows to use original DiffRhythmRun
- Remove startup_patch.py and revert start.sh

This approach is cleaner and more maintainable than wrapping the node.
The patch directly fixes the tensor dimension mismatch (32 vs 64) in
DiffRhythm's rotary position embeddings by ensuring num_attention_heads
and num_key_value_heads are properly set based on hidden_size.

References:
- https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
- https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
2025-11-24 19:27:18 +01:00

126 lines
3.5 KiB
JSON

{
"last_node_id": 3,
"last_link_id": 2,
"nodes": [
{
"id": 1,
"type": "DiffRhythmRun",
"pos": [100, 100],
"size": [400, 400],
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "AUDIO",
"type": "AUDIO",
"links": [1, 2]
}
],
"properties": {
"Node name for S&R": "DiffRhythmRun"
},
"widgets_values": [
"cfm_model_v1_2.pt",
"Upbeat electronic dance music with energetic beats and synthesizer melodies",
true,
"euler",
30,
4,
"speed",
42,
"randomize",
false,
"[-1, 20], [60, -1]"
],
"title": "DiffRhythm Text-to-Music (95s)"
},
{
"id": 2,
"type": "PreviewAudio",
"pos": [600, 100],
"size": [300, 100],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"name": "audio",
"type": "AUDIO",
"link": 1
}
],
"properties": {
"Node name for S&R": "PreviewAudio"
},
"title": "Preview Audio"
},
{
"id": 3,
"type": "SaveAudio",
"pos": [600, 250],
"size": [300, 100],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "audio",
"type": "AUDIO",
"link": 2
}
],
"properties": {
"Node name for S&R": "SaveAudio"
},
"widgets_values": [
"diffrhythm_output"
],
"title": "Save Audio"
}
],
"links": [
[1, 1, 0, 2, 0, "AUDIO"],
[2, 1, 0, 3, 0, "AUDIO"]
],
"groups": [],
"config": {},
"extra": {
"workflow_info": {
"name": "DiffRhythm Simple Text-to-Music v1",
"description": "Basic text-to-music generation using DiffRhythm 1.2 (95 seconds)",
"version": "1.0.0",
"author": "valknar@pivoine.art",
"category": "text-to-music",
"tags": ["diffrhythm", "music-generation", "text-to-music", "95s"],
"requirements": {
"custom_nodes": ["ComfyUI_DiffRhythm"],
"models": ["ASLP-lab/DiffRhythm-1_2", "ASLP-lab/DiffRhythm-vae", "OpenMuQ/MuQ-MuLan-large", "OpenMuQ/MuQ-large-msd-iter", "FacebookAI/xlm-roberta-base"],
"vram_min": "12GB",
"vram_recommended": "16GB",
"system_deps": ["espeak-ng"]
},
"usage": {
"model": "cfm_model_v1_2.pt (DiffRhythm 1.2 - 95s generation)",
"style_prompt": "Text description of the desired music style, mood, and instruments",
"unload_model": "Boolean to unload model after generation (default: true)",
"odeint_method": "ODE solver: euler, midpoint, rk4, implicit_adams (default: euler)",
"steps": "Number of diffusion steps: 1-100 (default: 30)",
"cfg": "Classifier-free guidance scale: 1-10 (default: 4)",
"quality_or_speed": "Generation mode: quality or speed (default: speed)",
"seed": "Random seed (default: 42); the seed control widget in this workflow is saved as randomize, so set it to fixed for reproducible output",
"edit": "Enable segment editing mode (default: false)",
"edit_segments": "Segments to edit when edit=true (default: [-1, 20], [60, -1])"
},
"notes": [
"This workflow uses DiffRhythm 1.2 for 95-second music generation",
"All parameters except model and style_prompt are optional",
"Supports English and Chinese text prompts",
"Generation time: ~30-60 seconds on RTX 4090",
"Can optionally connect MultiLineLyricsDR node for lyrics input"
]
}
},
"version": 0.4
}