diff --git a/arty.yml b/arty.yml
index 2156ee0..aa97a8f 100644
--- a/arty.yml
+++ b/arty.yml
@@ -396,37 +396,56 @@ scripts:
     echo ""
     echo "Category: 🌸Pivoine/Audio"
 
-  fix/diffrhythm-transformers: |
+  fix/diffrhythm-patch: |
     echo "========================================="
-    echo " Fix DiffRhythm Transformers Version"
+    echo " Apply DiffRhythm LlamaConfig Patch"
     echo "========================================="
     echo ""
     echo "Issue: Tensor dimension mismatch (32 vs 64) in rotary embeddings"
-    echo "Solution: Downgrade transformers to 4.49.0"
+    echo "Solution: Patch DiffRhythm __init__.py to fix LlamaConfig"
     echo ""
     echo "References:"
     echo "  - https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44"
     echo "  - https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48"
     echo ""
-    cd /workspace/ComfyUI
-    source venv/bin/activate
+    DIFF_RHYTHM_DIR="/workspace/ComfyUI/custom_nodes/ComfyUI_DiffRhythm"
+    PATCH_FILE="/workspace/ai/comfyui/patches/diffrhythm-llamaconfig-fix.patch"
 
-    echo "Current transformers version:"
-    pip show transformers | grep Version
-    echo ""
+    if [ ! -d "$DIFF_RHYTHM_DIR" ]; then
+      echo "✗ Error: DiffRhythm not found at $DIFF_RHYTHM_DIR"
+      exit 1
+    fi
 
-    echo "Installing transformers==4.49.0..."
-    pip install transformers==4.49.0
+    if [ ! -f "$PATCH_FILE" ]; then
+      echo "✗ Error: Patch file not found at $PATCH_FILE"
+      exit 1
+    fi
 
-    echo ""
-    echo "✓ Transformers downgraded to 4.49.0"
-    echo ""
-    echo "Next steps:"
-    echo "  1. Restart ComfyUI: arty services/comfyui/restart"
-    echo "  2. Test DiffRhythm workflows"
+    cd "$DIFF_RHYTHM_DIR"
 
-    deactivate
+    echo "Checking if patch already applied..."
+    if grep -q "PatchedLlamaConfig" __init__.py; then
+      echo "✓ Patch already applied!"
+      exit 0
+    fi
+
+    echo "Applying patch..."
+    patch -p1 < "$PATCH_FILE"
+
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "✓ Patch applied successfully!"
+      echo ""
+      echo "Next steps:"
+      echo "  1. Restart ComfyUI: arty services/comfyui/restart"
+      echo "  2. Test DiffRhythm workflows"
+    else
+      echo ""
+      echo "✗ Failed to apply patch"
+      echo "You may need to manually apply the patch or check for conflicts"
+      exit 1
+    fi
 
   setup/comfyui-extensions-deps: |
     echo "========================================="
diff --git a/comfyui/nodes/__init__.py b/comfyui/nodes/__init__.py
deleted file mode 100644
index f28d2d5..0000000
--- a/comfyui/nodes/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-Pivoine Custom ComfyUI Nodes
-Custom node extensions and wrappers for RunPod deployment
-
-Author: valknar@pivoine.art
-"""
-
-from .pivoine_diffrhythm import NODE_CLASS_MAPPINGS as DIFFRHYTHM_MAPPINGS
-from .pivoine_diffrhythm import NODE_DISPLAY_NAME_MAPPINGS as DIFFRHYTHM_DISPLAY
-
-# Combine all node mappings
-NODE_CLASS_MAPPINGS = {
-    **DIFFRHYTHM_MAPPINGS,
-}
-
-NODE_DISPLAY_NAME_MAPPINGS = {
-    **DIFFRHYTHM_DISPLAY,
-}
-
-__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
diff --git a/comfyui/nodes/pivoine_diffrhythm.py b/comfyui/nodes/pivoine_diffrhythm.py
deleted file mode 100644
index 7f159fe..0000000
--- a/comfyui/nodes/pivoine_diffrhythm.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""
-Pivoine DiffRhythm Node
-Custom wrapper for DiffRhythm that fixes LlamaConfig initialization issues
-with transformers 4.49.0+ to prevent tensor dimension mismatches.
-
-Known Issue: DiffRhythm's DIT model doesn't specify num_attention_heads and
-num_key_value_heads in LlamaConfig, causing "The size of tensor a (32) must
-match the size of tensor b (64)" error in rotary position embeddings.
-
-This patch globally intercepts LlamaConfig at import time.
-
-Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
-Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
-
-Author: valknar@pivoine.art
-"""
-
-import sys
-sys.path.append('/workspace/ComfyUI/custom_nodes/ComfyUI_DiffRhythm')
-
-# CRITICAL: Patch LlamaConfig BEFORE any DiffRhythm imports
-# This must happen at module import time, not at runtime
-from transformers.models.llama import LlamaConfig as _OriginalLlamaConfig
-
-class PatchedLlamaConfig(_OriginalLlamaConfig):
-    """
-    Patched LlamaConfig that automatically adds missing attention head parameters.
-
-    Fixes the tensor dimension mismatch (32 vs 64) in DiffRhythm's rotary
-    position embeddings by ensuring num_attention_heads and num_key_value_heads
-    are properly set based on hidden_size.
-    """
-    def __init__(self, *args, **kwargs):
-        # If hidden_size is provided but num_attention_heads is not, calculate it
-        if 'hidden_size' in kwargs and 'num_attention_heads' not in kwargs:
-            hidden_size = kwargs['hidden_size']
-            # Standard Llama architecture: head_dim = 64, so num_heads = hidden_size // 64
-            kwargs['num_attention_heads'] = hidden_size // 64
-
-        # If num_key_value_heads is not provided, use GQA configuration
-        if 'num_attention_heads' in kwargs and 'num_key_value_heads' not in kwargs:
-            # For GQA (Grouped Query Attention), typically num_kv_heads = num_heads // 4
-            kwargs['num_key_value_heads'] = max(1, kwargs['num_attention_heads'] // 4)
-
-        # Call original __init__ with patched parameters
-        super().__init__(*args, **kwargs)
-
-# Replace LlamaConfig in transformers module BEFORE DiffRhythm imports it
-import transformers.models.llama
-transformers.models.llama.LlamaConfig = PatchedLlamaConfig
-
-# Also replace in modeling_llama module if it's already imported
-import transformers.models.llama.modeling_llama
-transformers.models.llama.modeling_llama.LlamaConfig = PatchedLlamaConfig
-
-# Now import DiffRhythm modules - they will use our patched LlamaConfig
-import infer_utils
-
-# Monkey-patch decode_audio to force chunked=False
-_original_decode_audio = infer_utils.decode_audio
-
-def patched_decode_audio(latent, vae_model, chunked=True):
-    """Patched version that always uses chunked=False"""
-    return _original_decode_audio(latent, vae_model, chunked=False)
-
-infer_utils.decode_audio = patched_decode_audio
-
-# Import DiffRhythm node
-from DiffRhythmNode import DiffRhythmRun
-
-class PivoineDiffRhythmRun(DiffRhythmRun):
-    """
-    Pivoine version of DiffRhythmRun with enhanced compatibility and error handling.
-
-    Changes from original:
-    - Globally patches LlamaConfig to add missing num_attention_heads and num_key_value_heads
-    - Monkey-patches decode_audio to always use chunked=False for stability
-    - Fixes tensor dimension mismatch in rotary position embeddings (32 vs 64)
-    - Compatible with transformers 4.49.0+
-    - Requires ~12-16GB VRAM, works reliably on RTX 4090
-
-    Technical details:
-    - Sets num_attention_heads = hidden_size // 64 (standard Llama architecture)
-    - Sets num_key_value_heads = num_attention_heads // 4 (GQA configuration)
-    - This ensures head_dim = hidden_size // num_attention_heads = 64 (not 32)
-    - Patch is applied globally at import time, affecting all LlamaConfig instances
-    """
-
-    CATEGORY = "🌸Pivoine/Audio"
-
-    @classmethod
-    def INPUT_TYPES(cls):
-        return super().INPUT_TYPES()
-
-NODE_CLASS_MAPPINGS = {
-    "PivoineDiffRhythmRun": PivoineDiffRhythmRun,
-}
-
-NODE_DISPLAY_NAME_MAPPINGS = {
-    "PivoineDiffRhythmRun": "Pivoine DiffRhythm Run",
-}
diff --git a/comfyui/patches/diffrhythm-llamaconfig-fix.patch b/comfyui/patches/diffrhythm-llamaconfig-fix.patch
new file mode 100644
index 0000000..231c6c0
--- /dev/null
+++ b/comfyui/patches/diffrhythm-llamaconfig-fix.patch
@@ -0,0 +1,56 @@
+diff --git a/__init__.py b/__init__.py
+index 1234567..abcdefg 100644
+--- a/__init__.py
++++ b/__init__.py
+@@ -1,3 +1,51 @@
++"""
++DiffRhythm ComfyUI Node with LlamaConfig Patch
++
++PATCH: Fixes "The size of tensor a (32) must match the size of tensor b (64)" error
++in DiffRhythm's rotary position embeddings by patching LlamaConfig initialization.
++
++Issue: DiffRhythm's DIT model doesn't specify num_attention_heads and
++num_key_value_heads when creating LlamaConfig, causing transformers 4.49.0+
++to incorrectly infer head_dim = 32 instead of 64.
++
++Solution: Patch LlamaConfig globally before importing DiffRhythmNode.
++
++Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
++Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
++
++Patch author: valknar@pivoine.art
++"""
++
++# CRITICAL: Patch LlamaConfig BEFORE importing DiffRhythmNode
++from transformers.models.llama import LlamaConfig as _OriginalLlamaConfig
++
++class PatchedLlamaConfig(_OriginalLlamaConfig):
++    """
++    Patched LlamaConfig that automatically adds missing attention head parameters.
++
++    Standard Llama architecture assumptions:
++    - head_dim = 64 (fixed)
++    - num_attention_heads = hidden_size // head_dim
++    - num_key_value_heads = num_attention_heads // 4 (for GQA)
++    """
++    def __init__(self, *args, **kwargs):
++        # If hidden_size is provided but num_attention_heads is not, calculate it
++        if 'hidden_size' in kwargs and 'num_attention_heads' not in kwargs:
++            hidden_size = kwargs['hidden_size']
++            kwargs['num_attention_heads'] = hidden_size // 64
++
++        # If num_key_value_heads is not provided, use GQA configuration
++        if 'num_attention_heads' in kwargs and 'num_key_value_heads' not in kwargs:
++            kwargs['num_key_value_heads'] = max(1, kwargs['num_attention_heads'] // 4)
++
++        super().__init__(*args, **kwargs)
++
++# Replace LlamaConfig in transformers module BEFORE DiffRhythm imports it
++import transformers.models.llama
++transformers.models.llama.LlamaConfig = PatchedLlamaConfig
++import transformers.models.llama.modeling_llama
++transformers.models.llama.modeling_llama.LlamaConfig = PatchedLlamaConfig
++
+ from .DiffRhythmNode import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
+ 
+ __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
diff --git a/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json b/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json
index 0f47f2b..85db7d4 100644
--- a/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json
+++ b/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json
@@ -4,7 +4,7 @@
   "nodes": [
     {
       "id": 1,
-      "type": "PivoineDiffRhythmRun",
+      "type": "DiffRhythmRun",
       "pos": [100, 100],
       "size": [400, 400],
       "flags": {},
diff --git a/comfyui/workflows/text-to-music/diffrhythm-random-generation-v1.json b/comfyui/workflows/text-to-music/diffrhythm-random-generation-v1.json
index 5ebaa9c..55ba62f 100644
--- a/comfyui/workflows/text-to-music/diffrhythm-random-generation-v1.json
+++ b/comfyui/workflows/text-to-music/diffrhythm-random-generation-v1.json
@@ -4,7 +4,7 @@
   "nodes": [
     {
       "id": 1,
-      "type": "PivoineDiffRhythmRun",
+      "type": "DiffRhythmRun",
       "pos": [100, 100],
       "size": [400, 400],
       "flags": {},
diff --git a/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json b/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json
index b4f924a..efe8b93 100644
--- a/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json
+++ b/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json
@@ -27,7 +27,7 @@
     },
     {
       "id": 2,
-      "type": "PivoineDiffRhythmRun",
+      "type": "DiffRhythmRun",
       "pos": [500, 100],
       "size": [400, 450],
       "flags": {},
diff --git a/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json b/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json
index a1b069d..3f422cb 100644
--- a/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json
+++ b/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json
@@ -4,7 +4,7 @@
   "nodes": [
     {
       "id": 1,
-      "type": "PivoineDiffRhythmRun",
+      "type": "DiffRhythmRun",
       "pos": [100, 100],
       "size": [400, 400],
       "flags": {},
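
For quick verification, a minimal self-contained sketch of the head-count inference the patch above performs; it is not part of the change set. It re-declares PatchedLlamaConfig with the same logic as the patch so it can run outside ComfyUI, and assumes a transformers release that exposes transformers.models.llama.LlamaConfig; the hidden_size=1024 value is a hypothetical illustration, not necessarily DiffRhythm's actual DIT config.

from transformers.models.llama import LlamaConfig as _OriginalLlamaConfig

class PatchedLlamaConfig(_OriginalLlamaConfig):
    """Mirror of the patched config: derives head counts from hidden_size."""
    def __init__(self, *args, **kwargs):
        # Fixed head_dim of 64, per the standard Llama architecture assumption
        if 'hidden_size' in kwargs and 'num_attention_heads' not in kwargs:
            kwargs['num_attention_heads'] = kwargs['hidden_size'] // 64
        # 4:1 GQA ratio when num_key_value_heads is omitted
        if 'num_attention_heads' in kwargs and 'num_key_value_heads' not in kwargs:
            kwargs['num_key_value_heads'] = max(1, kwargs['num_attention_heads'] // 4)
        super().__init__(*args, **kwargs)

# Unpatched: LlamaConfig's default of 32 attention heads survives,
# so head_dim collapses to 1024 // 32 = 32 (the "tensor a (32)" side).
unpatched = _OriginalLlamaConfig(hidden_size=1024)
assert unpatched.hidden_size // unpatched.num_attention_heads == 32

# Patched: heads are derived from hidden_size, restoring head_dim = 64.
patched = PatchedLlamaConfig(hidden_size=1024)
assert patched.num_attention_heads == 16   # 1024 // 64
assert patched.num_key_value_heads == 4    # 16 // 4 (GQA)
assert patched.hidden_size // patched.num_attention_heads == 64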