From 91f6e9bd59f7b8f7145adc1a7869786c42b5bab4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?=
Date: Mon, 24 Nov 2025 18:53:18 +0100
Subject: [PATCH] fix: patch DiffRhythm DiT to add missing LlamaConfig attention head parameters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a monkey-patch for DiT.__init__() to properly configure LlamaConfig
with the num_attention_heads and num_key_value_heads parameters, which are
missing in the upstream DiffRhythm code.

Root cause: transformers 4.49.0+ requires these parameters, but DiffRhythm's
dit.py only specifies hidden_size, causing the library to incorrectly infer
head_dim as 32 instead of 64, which leads to tensor dimension mismatches.

Solution:
- Sets num_attention_heads = hidden_size // 64 (standard Llama architecture)
- Sets num_key_value_heads = num_attention_heads // 4 (GQA configuration)
- Ensures head_dim = 64, fixing the "tensor a (32) vs tensor b (64)" error

This is a proper fix rather than just downgrading the transformers version.

References:
- https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
- https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 comfyui/nodes/pivoine_diffrhythm.py | 72 ++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 11 deletions(-)

diff --git a/comfyui/nodes/pivoine_diffrhythm.py b/comfyui/nodes/pivoine_diffrhythm.py
index 37064f0..2c842a7 100644
--- a/comfyui/nodes/pivoine_diffrhythm.py
+++ b/comfyui/nodes/pivoine_diffrhythm.py
@@ -1,11 +1,13 @@
 """
 Pivoine DiffRhythm Node
 
-Custom wrapper for DiffRhythm that ensures correct transformer library version
-compatibility and provides fallback fixes for tensor dimension issues.
+Custom wrapper for DiffRhythm that fixes LlamaConfig initialization issues
+with transformers 4.49.0+ to prevent tensor dimension mismatches.
 
-Known Issue: DiffRhythm requires transformers==4.49.0. Newer versions (4.50+)
-cause "The size of tensor a (32) must match the size of tensor b (64)" error
-in rotary position embeddings due to transformer block initialization changes.
+Known Issue: DiffRhythm's DiT model doesn't specify num_attention_heads and
+num_key_value_heads in LlamaConfig, causing "The size of tensor a (32) must
+match the size of tensor b (64)" error in rotary position embeddings.
+
+This patch adds the missing parameters to LlamaConfig initialization.
 Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
 Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
@@ -24,9 +26,54 @@ def patched_decode_audio(latent, vae_model, chunked=True):
     """Patched version that always uses chunked=False"""
     return _original_decode_audio(latent, vae_model, chunked=False)
 
-# Apply the monkey patch
+# Apply the decode_audio monkey patch
 infer_utils.decode_audio = patched_decode_audio
 
+# Monkey-patch DiT __init__ to fix LlamaConfig initialization
+from diffrhythm.model import dit
+from transformers.models.llama import LlamaConfig
+import torch.nn as nn
+
+_original_dit_init = dit.DiT.__init__
+
+def patched_dit_init(self, *args, **kwargs):
+    """
+    Patched DiT.__init__ that adds missing num_attention_heads and
+    num_key_value_heads to LlamaConfig initialization.
+
+    This fixes the tensor dimension mismatch (32 vs 64) error in
+    rotary position embeddings with transformers 4.49.0+.
+    """
+    # Call original __init__ but intercept the LlamaConfig creation
+    _original_llama_config = LlamaConfig
+
+    def patched_llama_config(*config_args, **config_kwargs):
+        """Add missing attention head parameters to LlamaConfig"""
+        hidden_size = config_kwargs.get('hidden_size', config_args[0] if config_args else 1024)
+
+        # Standard Llama architecture: head_dim = 64, so num_heads = hidden_size // 64
+        # For GQA (Grouped Query Attention), num_key_value_heads is usually num_heads // 4
+        num_attention_heads = hidden_size // 64
+        num_key_value_heads = max(1, num_attention_heads // 4)
+
+        config_kwargs['num_attention_heads'] = config_kwargs.get('num_attention_heads', num_attention_heads)
+        config_kwargs['num_key_value_heads'] = config_kwargs.get('num_key_value_heads', num_key_value_heads)
+
+        return _original_llama_config(*config_args, **config_kwargs)
+
+    # Temporarily replace LlamaConfig in the dit module
+    dit.LlamaConfig = patched_llama_config
+
+    try:
+        # Call the original __init__
+        _original_dit_init(self, *args, **kwargs)
+    finally:
+        # Restore original LlamaConfig
+        dit.LlamaConfig = _original_llama_config
+
+# Apply the DiT init monkey patch
+dit.DiT.__init__ = patched_dit_init
+
 from DiffRhythmNode import DiffRhythmRun
 
 class PivoineDiffRhythmRun(DiffRhythmRun):
@@ -34,13 +81,16 @@ class PivoineDiffRhythmRun(DiffRhythmRun):
     Pivoine version of DiffRhythmRun with enhanced compatibility and error handling.
 
     Changes from original:
+    - Patches DiT.__init__ to add missing num_attention_heads and num_key_value_heads to LlamaConfig
     - Monkey-patches decode_audio to always use chunked=False for stability
-    - Ensures transformers library version compatibility (requires 4.49.0)
-    - Prevents tensor dimension mismatch in VAE decoding
-    - Requires more VRAM (~12-16GB) but works reliably on RTX 4090
+    - Fixes tensor dimension mismatch in rotary position embeddings (32 vs 64)
+    - Compatible with transformers 4.49.0+
+    - Requires ~12-16GB VRAM, works reliably on RTX 4090
 
-    Note: If you encounter "tensor a (32) must match tensor b (64)" errors,
-    ensure transformers==4.49.0 is installed in your ComfyUI venv.
+    Technical details:
+    - Sets num_attention_heads = hidden_size // 64 (standard Llama architecture)
+    - Sets num_key_value_heads = num_attention_heads // 4 (GQA configuration)
+    - This ensures head_dim = hidden_size // num_attention_heads = 64 (not 32)
     """
 
     CATEGORY = "🌸Pivoine/Audio"
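A quick way to sanity-check the config behaviour this patch relies on, outside ComfyUI. This is a sketch, not part of the patch; hidden_size=1024 is an assumption chosen to reproduce the 32-vs-64 symptom, and DiffRhythm's real hidden_size may differ:

    # Sketch: compare the head_dim LlamaConfig ends up with when only
    # hidden_size is given (upstream DiffRhythm style) versus when the
    # attention head parameters are set explicitly as in this patch.
    from transformers.models.llama import LlamaConfig

    hidden_size = 1024  # assumed value, chosen so the mismatch is visible

    bare = LlamaConfig(hidden_size=hidden_size)      # heads left at the default of 32
    print(hidden_size // bare.num_attention_heads)   # 32

    fixed = LlamaConfig(
        hidden_size=hidden_size,
        num_attention_heads=hidden_size // 64,                  # 16 heads
        num_key_value_heads=max(1, (hidden_size // 64) // 4),   # 4 KV heads (GQA)
    )
    print(hidden_size // fixed.num_attention_heads)  # 64

Since the rotary position embeddings are sized from head_dim, the first configuration is what produces the "tensor a (32) vs tensor b (64)" error that this patch removes.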