fix: replace custom Pivoine node with direct DiffRhythm patch

- Remove custom PivoineDiffRhythmRun wrapper node
- Add git patch file for ComfyUI_DiffRhythm __init__.py
- Patch adds LlamaConfig fix at import time
- Add arty script 'fix/diffrhythm-patch' to apply patch
- Revert all workflows to use original DiffRhythmRun
- Remove startup_patch.py and revert start.sh

This approach is cleaner and more maintainable than wrapping the node.
The patch directly fixes the tensor dimension mismatch (32 vs 64) in
DiffRhythm's rotary position embeddings by deriving num_attention_heads
and num_key_value_heads from hidden_size, so that head_dim =
hidden_size // num_attention_heads comes out to 64 as the rotary tables expect.
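
As a rough illustration of that arithmetic (a minimal sketch; the
hidden_size value is hypothetical, not taken from DiffRhythm's config):

    hidden_size = 1024                                      # hypothetical example value
    head_dim = 64                                           # standard Llama head size the patch assumes
    num_attention_heads = hidden_size // head_dim           # 16
    num_key_value_heads = max(1, num_attention_heads // 4)  # 4 (GQA)
    assert hidden_size // num_attention_heads == 64         # rotary tables line up at 64, not 32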

References:
- https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
- https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
commit d74a7cb7cb (parent f74457b049)
Date: 2025-11-24 19:27:18 +01:00
8 changed files with 96 additions and 142 deletions

View File

@@ -396,37 +396,56 @@ scripts:
echo ""
echo "Category: 🌸Pivoine/Audio"
fix/diffrhythm-transformers: |
fix/diffrhythm-patch: |
echo "========================================="
echo " Fix DiffRhythm Transformers Version"
echo " Apply DiffRhythm LlamaConfig Patch"
echo "========================================="
echo ""
echo "Issue: Tensor dimension mismatch (32 vs 64) in rotary embeddings"
echo "Solution: Downgrade transformers to 4.49.0"
echo "Solution: Patch DiffRhythm __init__.py to fix LlamaConfig"
echo ""
echo "References:"
echo " - https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44"
echo " - https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48"
echo ""
cd /workspace/ComfyUI
source venv/bin/activate
DIFF_RHYTHM_DIR="/workspace/ComfyUI/custom_nodes/ComfyUI_DiffRhythm"
PATCH_FILE="/workspace/ai/comfyui/patches/diffrhythm-llamaconfig-fix.patch"
echo "Current transformers version:"
pip show transformers | grep Version
if [ ! -d "$DIFF_RHYTHM_DIR" ]; then
echo "✗ Error: DiffRhythm not found at $DIFF_RHYTHM_DIR"
exit 1
fi
if [ ! -f "$PATCH_FILE" ]; then
echo "✗ Error: Patch file not found at $PATCH_FILE"
exit 1
fi
cd "$DIFF_RHYTHM_DIR"
echo "Checking if patch already applied..."
if grep -q "PatchedLlamaConfig" __init__.py; then
echo "✓ Patch already applied!"
exit 0
fi
echo "Applying patch..."
patch -p1 < "$PATCH_FILE"
if [ $? -eq 0 ]; then
echo ""
echo "Installing transformers==4.49.0..."
pip install transformers==4.49.0
echo ""
echo "✓ Transformers downgraded to 4.49.0"
echo "✓ Patch applied successfully!"
echo ""
echo "Next steps:"
echo " 1. Restart ComfyUI: arty services/comfyui/restart"
echo " 2. Test DiffRhythm workflows"
deactivate
else
echo ""
echo "✗ Failed to apply patch"
echo "You may need to manually apply the patch or check for conflicts"
exit 1
fi
setup/comfyui-extensions-deps: |
echo "========================================="

View File

@@ -1,20 +0,0 @@
"""
Pivoine Custom ComfyUI Nodes
Custom node extensions and wrappers for RunPod deployment
Author: valknar@pivoine.art
"""
from .pivoine_diffrhythm import NODE_CLASS_MAPPINGS as DIFFRHYTHM_MAPPINGS
from .pivoine_diffrhythm import NODE_DISPLAY_NAME_MAPPINGS as DIFFRHYTHM_DISPLAY
# Combine all node mappings
NODE_CLASS_MAPPINGS = {
    **DIFFRHYTHM_MAPPINGS,
}
NODE_DISPLAY_NAME_MAPPINGS = {
    **DIFFRHYTHM_DISPLAY,
}
__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']

View File

@@ -1,101 +0,0 @@
"""
Pivoine DiffRhythm Node
Custom wrapper for DiffRhythm that fixes LlamaConfig initialization issues
with transformers 4.49.0+ to prevent tensor dimension mismatches.
Known Issue: DiffRhythm's DIT model doesn't specify num_attention_heads and
num_key_value_heads in LlamaConfig, causing "The size of tensor a (32) must
match the size of tensor b (64)" error in rotary position embeddings.
This patch globally intercepts LlamaConfig at import time.
Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
Author: valknar@pivoine.art
"""
import sys
sys.path.append('/workspace/ComfyUI/custom_nodes/ComfyUI_DiffRhythm')
# CRITICAL: Patch LlamaConfig BEFORE any DiffRhythm imports
# This must happen at module import time, not at runtime
from transformers.models.llama import LlamaConfig as _OriginalLlamaConfig
class PatchedLlamaConfig(_OriginalLlamaConfig):
    """
    Patched LlamaConfig that automatically adds missing attention head parameters.

    Fixes the tensor dimension mismatch (32 vs 64) in DiffRhythm's rotary
    position embeddings by ensuring num_attention_heads and num_key_value_heads
    are properly set based on hidden_size.
    """
    def __init__(self, *args, **kwargs):
        # If hidden_size is provided but num_attention_heads is not, calculate it
        if 'hidden_size' in kwargs and 'num_attention_heads' not in kwargs:
            hidden_size = kwargs['hidden_size']
            # Standard Llama architecture: head_dim = 64, so num_heads = hidden_size // 64
            kwargs['num_attention_heads'] = hidden_size // 64

        # If num_key_value_heads is not provided, use GQA configuration
        if 'num_attention_heads' in kwargs and 'num_key_value_heads' not in kwargs:
            # For GQA (Grouped Query Attention), typically num_kv_heads = num_heads // 4
            kwargs['num_key_value_heads'] = max(1, kwargs['num_attention_heads'] // 4)

        # Call original __init__ with patched parameters
        super().__init__(*args, **kwargs)
# Replace LlamaConfig in transformers module BEFORE DiffRhythm imports it
import transformers.models.llama
transformers.models.llama.LlamaConfig = PatchedLlamaConfig
# Also replace in modeling_llama module if it's already imported
import transformers.models.llama.modeling_llama
transformers.models.llama.modeling_llama.LlamaConfig = PatchedLlamaConfig
# Now import DiffRhythm modules - they will use our patched LlamaConfig
import infer_utils
# Monkey-patch decode_audio to force chunked=False
_original_decode_audio = infer_utils.decode_audio
def patched_decode_audio(latent, vae_model, chunked=True):
    """Patched version that always uses chunked=False"""
    return _original_decode_audio(latent, vae_model, chunked=False)
infer_utils.decode_audio = patched_decode_audio
# Import DiffRhythm node
from DiffRhythmNode import DiffRhythmRun
class PivoineDiffRhythmRun(DiffRhythmRun):
    """
    Pivoine version of DiffRhythmRun with enhanced compatibility and error handling.

    Changes from original:
    - Globally patches LlamaConfig to add missing num_attention_heads and num_key_value_heads
    - Monkey-patches decode_audio to always use chunked=False for stability
    - Fixes tensor dimension mismatch in rotary position embeddings (32 vs 64)
    - Compatible with transformers 4.49.0+
    - Requires ~12-16GB VRAM, works reliably on RTX 4090

    Technical details:
    - Sets num_attention_heads = hidden_size // 64 (standard Llama architecture)
    - Sets num_key_value_heads = num_attention_heads // 4 (GQA configuration)
    - This ensures head_dim = hidden_size // num_attention_heads = 64 (not 32)
    - Patch is applied globally at import time, affecting all LlamaConfig instances
    """
    CATEGORY = "🌸Pivoine/Audio"

    @classmethod
    def INPUT_TYPES(cls):
        return super().INPUT_TYPES()

NODE_CLASS_MAPPINGS = {
    "PivoineDiffRhythmRun": PivoineDiffRhythmRun,
}
NODE_DISPLAY_NAME_MAPPINGS = {
    "PivoineDiffRhythmRun": "Pivoine DiffRhythm Run",
}

View File

@@ -0,0 +1,56 @@
diff --git a/__init__.py b/__init__.py
index 1234567..abcdefg 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,3 +1,51 @@
+"""
+DiffRhythm ComfyUI Node with LlamaConfig Patch
+
+PATCH: Fixes "The size of tensor a (32) must match the size of tensor b (64)" error
+in DiffRhythm's rotary position embeddings by patching LlamaConfig initialization.
+
+Issue: DiffRhythm's DIT model doesn't specify num_attention_heads and
+num_key_value_heads when creating LlamaConfig, causing transformers 4.49.0+
+to incorrectly infer head_dim = 32 instead of 64.
+
+Solution: Patch LlamaConfig globally before importing DiffRhythmNode.
+
+Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
+Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
+
+Patch author: valknar@pivoine.art
+"""
+
+# CRITICAL: Patch LlamaConfig BEFORE importing DiffRhythmNode
+from transformers.models.llama import LlamaConfig as _OriginalLlamaConfig
+
+class PatchedLlamaConfig(_OriginalLlamaConfig):
+    """
+    Patched LlamaConfig that automatically adds missing attention head parameters.
+
+    Standard Llama architecture assumptions:
+    - head_dim = 64 (fixed)
+    - num_attention_heads = hidden_size // head_dim
+    - num_key_value_heads = num_attention_heads // 4 (for GQA)
+    """
+    def __init__(self, *args, **kwargs):
+        # If hidden_size is provided but num_attention_heads is not, calculate it
+        if 'hidden_size' in kwargs and 'num_attention_heads' not in kwargs:
+            hidden_size = kwargs['hidden_size']
+            kwargs['num_attention_heads'] = hidden_size // 64
+
+        # If num_key_value_heads is not provided, use GQA configuration
+        if 'num_attention_heads' in kwargs and 'num_key_value_heads' not in kwargs:
+            kwargs['num_key_value_heads'] = max(1, kwargs['num_attention_heads'] // 4)
+
+        super().__init__(*args, **kwargs)
+
+# Replace LlamaConfig in transformers module BEFORE DiffRhythm imports it
+import transformers.models.llama
+transformers.models.llama.LlamaConfig = PatchedLlamaConfig
+import transformers.models.llama.modeling_llama
+transformers.models.llama.modeling_llama.LlamaConfig = PatchedLlamaConfig
+
from .DiffRhythmNode import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
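
A minimal smoke test of the patched behaviour, assuming the patched
__init__.py above has already been imported (for example by ComfyUI
loading the custom node); the hidden_size value is illustrative only,
not DiffRhythm's actual setting:

    import transformers.models.llama as llama

    cfg = llama.LlamaConfig(hidden_size=1024)           # heads omitted, as DiffRhythm does
    print(cfg.num_attention_heads)                      # 16 with the patch (1024 // 64); stock default is 32
    print(cfg.num_key_value_heads)                      # 4 with the patch (16 // 4)
    print(cfg.hidden_size // cfg.num_attention_heads)   # 64 -> matches the expected rotary head_dim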

View File

@@ -4,7 +4,7 @@
"nodes": [
{
"id": 1,
"type": "PivoineDiffRhythmRun",
"type": "DiffRhythmRun",
"pos": [100, 100],
"size": [400, 400],
"flags": {},

View File

@@ -4,7 +4,7 @@
"nodes": [
{
"id": 1,
"type": "PivoineDiffRhythmRun",
"type": "DiffRhythmRun",
"pos": [100, 100],
"size": [400, 400],
"flags": {},

View File

@@ -27,7 +27,7 @@
},
{
"id": 2,
"type": "PivoineDiffRhythmRun",
"type": "DiffRhythmRun",
"pos": [500, 100],
"size": [400, 450],
"flags": {},

View File

@@ -4,7 +4,7 @@
"nodes": [
{
"id": 1,
"type": "PivoineDiffRhythmRun",
"type": "DiffRhythmRun",
"pos": [100, 100],
"size": [400, 400],
"flags": {},