fix: replace custom Pivoine node with direct DiffRhythm patch

- Remove custom PivoineDiffRhythmRun wrapper node
- Add git patch file for ComfyUI_DiffRhythm __init__.py
- Patch adds LlamaConfig fix at import time
- Add arty script 'fix/diffrhythm-patch' to apply patch
- Revert all workflows to use original DiffRhythmRun
- Remove startup_patch.py and revert start.sh

This approach is cleaner and more maintainable than wrapping the node.
The patch directly fixes the tensor dimension mismatch (32 vs 64) in
DiffRhythm's rotary position embeddings by deriving num_attention_heads
and num_key_value_heads from hidden_size, so that head_dim =
hidden_size // num_attention_heads comes out to 64 as the rotary tables expect.
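
As a rough illustration of that arithmetic (a minimal sketch; the
hidden_size value is hypothetical, not taken from DiffRhythm's config):

    hidden_size = 1024                                      # hypothetical example value
    head_dim = 64                                           # standard Llama head size the patch assumes
    num_attention_heads = hidden_size // head_dim           # 16
    num_key_value_heads = max(1, num_attention_heads // 4)  # 4 (GQA)
    assert hidden_size // num_attention_heads == 64         # rotary tables line up at 64, not 32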

References:
- https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
- https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
commit d74a7cb7cb (parent f74457b049)
Date: 2025-11-24 19:27:18 +01:00
8 changed files with 96 additions and 142 deletions

View File

@@ -396,37 +396,56 @@ scripts:
echo ""
echo "Category: 🌸Pivoine/Audio"
fix/diffrhythm-transformers: |
fix/diffrhythm-patch: |
echo "========================================="
echo " Fix DiffRhythm Transformers Version"
echo " Apply DiffRhythm LlamaConfig Patch"
echo "========================================="
echo ""
echo "Issue: Tensor dimension mismatch (32 vs 64) in rotary embeddings"
echo "Solution: Downgrade transformers to 4.49.0"
echo "Solution: Patch DiffRhythm __init__.py to fix LlamaConfig"
echo ""
echo "References:"
echo " - https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44"
echo " - https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48"
echo ""
cd /workspace/ComfyUI
source venv/bin/activate
DIFF_RHYTHM_DIR="/workspace/ComfyUI/custom_nodes/ComfyUI_DiffRhythm"
PATCH_FILE="/workspace/ai/comfyui/patches/diffrhythm-llamaconfig-fix.patch"
echo "Current transformers version:"
pip show transformers | grep Version
if [ ! -d "$DIFF_RHYTHM_DIR" ]; then
echo "✗ Error: DiffRhythm not found at $DIFF_RHYTHM_DIR"
exit 1
fi
if [ ! -f "$PATCH_FILE" ]; then
echo "✗ Error: Patch file not found at $PATCH_FILE"
exit 1
fi
cd "$DIFF_RHYTHM_DIR"
echo "Checking if patch already applied..."
if grep -q "PatchedLlamaConfig" __init__.py; then
echo "✓ Patch already applied!"
exit 0
fi
echo "Applying patch..."
patch -p1 < "$PATCH_FILE"
if [ $? -eq 0 ]; then
echo ""
echo "Installing transformers==4.49.0..."
pip install transformers==4.49.0
echo ""
echo "✓ Transformers downgraded to 4.49.0"
echo "✓ Patch applied successfully!"
echo ""
echo "Next steps:"
echo " 1. Restart ComfyUI: arty services/comfyui/restart"
echo " 2. Test DiffRhythm workflows"
deactivate
else
echo ""
echo "✗ Failed to apply patch"
echo "You may need to manually apply the patch or check for conflicts"
exit 1
fi
setup/comfyui-extensions-deps: |
echo "========================================="

View File

@@ -1,20 +0,0 @@
"""
Pivoine Custom ComfyUI Nodes
Custom node extensions and wrappers for RunPod deployment
Author: valknar@pivoine.art
"""
from .pivoine_diffrhythm import NODE_CLASS_MAPPINGS as DIFFRHYTHM_MAPPINGS
from .pivoine_diffrhythm import NODE_DISPLAY_NAME_MAPPINGS as DIFFRHYTHM_DISPLAY
# Combine all node mappings
NODE_CLASS_MAPPINGS = {
    **DIFFRHYTHM_MAPPINGS,
}
NODE_DISPLAY_NAME_MAPPINGS = {
    **DIFFRHYTHM_DISPLAY,
}
__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']

View File

@@ -1,101 +0,0 @@
"""
Pivoine DiffRhythm Node
Custom wrapper for DiffRhythm that fixes LlamaConfig initialization issues
with transformers 4.49.0+ to prevent tensor dimension mismatches.
Known Issue: DiffRhythm's DIT model doesn't specify num_attention_heads and
num_key_value_heads in LlamaConfig, causing "The size of tensor a (32) must
match the size of tensor b (64)" error in rotary position embeddings.
This patch globally intercepts LlamaConfig at import time.
Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
Author: valknar@pivoine.art
"""
import sys
sys.path.append('/workspace/ComfyUI/custom_nodes/ComfyUI_DiffRhythm')
# CRITICAL: Patch LlamaConfig BEFORE any DiffRhythm imports
# This must happen at module import time, not at runtime
from transformers.models.llama import LlamaConfig as _OriginalLlamaConfig
class PatchedLlamaConfig(_OriginalLlamaConfig):
    """
    Patched LlamaConfig that automatically adds missing attention head parameters.

    Fixes the tensor dimension mismatch (32 vs 64) in DiffRhythm's rotary
    position embeddings by ensuring num_attention_heads and num_key_value_heads
    are properly set based on hidden_size.
    """
    def __init__(self, *args, **kwargs):
        # If hidden_size is provided but num_attention_heads is not, calculate it
        if 'hidden_size' in kwargs and 'num_attention_heads' not in kwargs:
            hidden_size = kwargs['hidden_size']
            # Standard Llama architecture: head_dim = 64, so num_heads = hidden_size // 64
            kwargs['num_attention_heads'] = hidden_size // 64

        # If num_key_value_heads is not provided, use GQA configuration
        if 'num_attention_heads' in kwargs and 'num_key_value_heads' not in kwargs:
            # For GQA (Grouped Query Attention), typically num_kv_heads = num_heads // 4
            kwargs['num_key_value_heads'] = max(1, kwargs['num_attention_heads'] // 4)

        # Call original __init__ with patched parameters
        super().__init__(*args, **kwargs)
# Replace LlamaConfig in transformers module BEFORE DiffRhythm imports it
import transformers.models.llama
transformers.models.llama.LlamaConfig = PatchedLlamaConfig
# Also replace in modeling_llama module if it's already imported
import transformers.models.llama.modeling_llama
transformers.models.llama.modeling_llama.LlamaConfig = PatchedLlamaConfig
# Now import DiffRhythm modules - they will use our patched LlamaConfig
import infer_utils
# Monkey-patch decode_audio to force chunked=False
_original_decode_audio = infer_utils.decode_audio
def patched_decode_audio(latent, vae_model, chunked=True):
    """Patched version that always uses chunked=False"""
    return _original_decode_audio(latent, vae_model, chunked=False)
infer_utils.decode_audio = patched_decode_audio
# Import DiffRhythm node
from DiffRhythmNode import DiffRhythmRun
class PivoineDiffRhythmRun(DiffRhythmRun):
    """
    Pivoine version of DiffRhythmRun with enhanced compatibility and error handling.

    Changes from original:
    - Globally patches LlamaConfig to add missing num_attention_heads and num_key_value_heads
    - Monkey-patches decode_audio to always use chunked=False for stability
    - Fixes tensor dimension mismatch in rotary position embeddings (32 vs 64)
    - Compatible with transformers 4.49.0+
    - Requires ~12-16GB VRAM, works reliably on RTX 4090

    Technical details:
    - Sets num_attention_heads = hidden_size // 64 (standard Llama architecture)
    - Sets num_key_value_heads = num_attention_heads // 4 (GQA configuration)
    - This ensures head_dim = hidden_size // num_attention_heads = 64 (not 32)
    - Patch is applied globally at import time, affecting all LlamaConfig instances
    """
    CATEGORY = "🌸Pivoine/Audio"

    @classmethod
    def INPUT_TYPES(cls):
        return super().INPUT_TYPES()

NODE_CLASS_MAPPINGS = {
    "PivoineDiffRhythmRun": PivoineDiffRhythmRun,
}
NODE_DISPLAY_NAME_MAPPINGS = {
    "PivoineDiffRhythmRun": "Pivoine DiffRhythm Run",
}

View File

@@ -0,0 +1,56 @@
diff --git a/__init__.py b/__init__.py
index 1234567..abcdefg 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,3 +1,51 @@
+"""
+DiffRhythm ComfyUI Node with LlamaConfig Patch
+
+PATCH: Fixes "The size of tensor a (32) must match the size of tensor b (64)" error
+in DiffRhythm's rotary position embeddings by patching LlamaConfig initialization.
+
+Issue: DiffRhythm's DIT model doesn't specify num_attention_heads and
+num_key_value_heads when creating LlamaConfig, causing transformers 4.49.0+
+to incorrectly infer head_dim = 32 instead of 64.
+
+Solution: Patch LlamaConfig globally before importing DiffRhythmNode.
+
+Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
+Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
+
+Patch author: valknar@pivoine.art
+"""
+
+# CRITICAL: Patch LlamaConfig BEFORE importing DiffRhythmNode
+from transformers.models.llama import LlamaConfig as _OriginalLlamaConfig
+
+class PatchedLlamaConfig(_OriginalLlamaConfig):
+    """
+    Patched LlamaConfig that automatically adds missing attention head parameters.
+
+    Standard Llama architecture assumptions:
+    - head_dim = 64 (fixed)
+    - num_attention_heads = hidden_size // head_dim
+    - num_key_value_heads = num_attention_heads // 4 (for GQA)
+    """
+    def __init__(self, *args, **kwargs):
+        # If hidden_size is provided but num_attention_heads is not, calculate it
+        if 'hidden_size' in kwargs and 'num_attention_heads' not in kwargs:
+            hidden_size = kwargs['hidden_size']
+            kwargs['num_attention_heads'] = hidden_size // 64
+
+        # If num_key_value_heads is not provided, use GQA configuration
+        if 'num_attention_heads' in kwargs and 'num_key_value_heads' not in kwargs:
+            kwargs['num_key_value_heads'] = max(1, kwargs['num_attention_heads'] // 4)
+
+        super().__init__(*args, **kwargs)
+
+# Replace LlamaConfig in transformers module BEFORE DiffRhythm imports it
+import transformers.models.llama
+transformers.models.llama.LlamaConfig = PatchedLlamaConfig
+import transformers.models.llama.modeling_llama
+transformers.models.llama.modeling_llama.LlamaConfig = PatchedLlamaConfig
+
from .DiffRhythmNode import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
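
A minimal smoke test of the patched behaviour, assuming the patched
__init__.py above has already been imported (for example by ComfyUI
loading the custom node); the hidden_size value is illustrative only,
not DiffRhythm's actual setting:

    import transformers.models.llama as llama

    cfg = llama.LlamaConfig(hidden_size=1024)           # heads omitted, as DiffRhythm does
    print(cfg.num_attention_heads)                      # 16 with the patch (1024 // 64); stock default is 32
    print(cfg.num_key_value_heads)                      # 4 with the patch (16 // 4)
    print(cfg.hidden_size // cfg.num_attention_heads)   # 64 -> matches the expected rotary head_dim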

View File

@@ -4,7 +4,7 @@
"nodes": [
{
"id": 1,
"type": "PivoineDiffRhythmRun",
"type": "DiffRhythmRun",
"pos": [100, 100],
"size": [400, 400],
"flags": {},

View File

@@ -4,7 +4,7 @@
"nodes": [
{
"id": 1,
"type": "PivoineDiffRhythmRun",
"type": "DiffRhythmRun",
"pos": [100, 100],
"size": [400, 400],
"flags": {},

View File

@@ -27,7 +27,7 @@
},
{
"id": 2,
"type": "PivoineDiffRhythmRun",
"type": "DiffRhythmRun",
"pos": [500, 100],
"size": [400, 450],
"flags": {},

View File

@@ -4,7 +4,7 @@
"nodes": [
{
"id": 1,
"type": "PivoineDiffRhythmRun",
"type": "DiffRhythmRun",
"pos": [100, 100],
"size": [400, 400],
"flags": {},