From 80a81aa12f61c36c718f6fe85065574063fc3e24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?=
Date: Sun, 23 Nov 2025 09:36:15 +0100
Subject: [PATCH] feat: add CogVideoX-I2V and text encoders to model configuration

- Add THUDM/CogVideoX-5b-I2V model entry (20GB, I2V-specific model)
- Add T5-XXL FP16 text encoder for CogVideoX (9GB)
- Add CLIP-L text encoder for CogVideoX and SD3 (1GB)
- Add CLIP-G text encoder for SD3 (3GB)

Note: CogVideoX models are auto-downloaded by the DownloadAndLoadCogVideoModel node.
Text encoders are already linked manually to /workspace/ComfyUI/models/text_encoders/
---
 comfyui_models.yaml | 56 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/comfyui_models.yaml b/comfyui_models.yaml
index 1003065..78ab3b2 100644
--- a/comfyui_models.yaml
+++ b/comfyui_models.yaml
@@ -119,11 +119,26 @@ model_categories:
       vram_gb: 20
       frames: 49
       resolution: 720p
-      notes: State-of-the-art text-to-video generation
+      notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel node
       files:
         - source: "transformer/diffusion_pytorch_model.safetensors"
           dest: "cogvideox-5b-transformer.safetensors"
 
+    - repo_id: THUDM/CogVideoX-5b-I2V
+      description: CogVideoX-5B-I2V - Image-to-video generation
+      size_gb: 20
+      essential: true
+      category: video
+      type: diffusion_models
+      format: fp16
+      vram_gb: 20
+      frames: 49
+      resolution: 720p
+      notes: Image-to-video model, auto-downloaded by DownloadAndLoadCogVideoModel node
+      files:
+        - source: "transformer/diffusion_pytorch_model.safetensors"
+          dest: "cogvideox-5b-i2v-transformer.safetensors"
+
     - repo_id: stabilityai/stable-video-diffusion-img2vid
       description: SVD - 14 frame image-to-video
       size_gb: 8
@@ -247,6 +262,45 @@ model_categories:
         - source: "model.safetensors"
           dest: "siglip-so400m-patch14-384.safetensors"
 
+    - repo_id: stabilityai/stable-diffusion-3.5-large
+      description: T5-XXL FP16 - For CogVideoX text encoding
+      size_gb: 9
+      essential: true
+      category: support
+      type: text_encoders
+      format: fp16
+      vram_gb: 4
+      notes: T5 text encoder required for CogVideoX models
+      files:
+        - source: "text_encoders/t5xxl_fp16.safetensors"
+          dest: "t5xxl_fp16.safetensors"
+
+    - repo_id: stabilityai/stable-diffusion-3.5-large
+      description: CLIP-L - For CogVideoX and SD3
+      size_gb: 1
+      essential: true
+      category: support
+      type: text_encoders
+      format: fp32
+      vram_gb: 1
+      notes: CLIP-L text encoder for CogVideoX and SD3 models
+      files:
+        - source: "text_encoders/clip_l.safetensors"
+          dest: "clip_l.safetensors"
+
+    - repo_id: stabilityai/stable-diffusion-3.5-large
+      description: CLIP-G - For SD3 models
+      size_gb: 3
+      essential: false
+      category: support
+      type: text_encoders
+      format: fp32
+      vram_gb: 2
+      notes: CLIP-G text encoder for SD3 models
+      files:
+        - source: "text_encoders/clip_g.safetensors"
+          dest: "clip_g.safetensors"
+
   # ==========================================================================
   # ANIMATEDIFF MODELS
   # ==========================================================================
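
For reference, a minimal sketch of how an entry with the repo_id/type/files fields added above could be fetched by hand and placed under the /workspace/ComfyUI/models/<type>/ layout mentioned in the commit message. The fetch_entry helper and the models root path are illustrative assumptions, not this project's actual download tooling; the CogVideoX transformers themselves are pulled automatically by the DownloadAndLoadCogVideoModel node, so this mainly applies to the text encoders.

import shutil
from pathlib import Path

from huggingface_hub import hf_hub_download

# Assumed ComfyUI model root, taken from the commit message note.
MODELS_ROOT = Path("/workspace/ComfyUI/models")


def fetch_entry(entry: dict) -> None:
    # Hypothetical helper: download each file listed in a comfyui_models.yaml
    # entry and copy it to models/<type>/<dest>.
    target_dir = MODELS_ROOT / entry["type"]
    target_dir.mkdir(parents=True, exist_ok=True)
    for f in entry["files"]:
        # Gated repos such as stabilityai/stable-diffusion-3.5-large may
        # require an authenticated Hugging Face token.
        cached = hf_hub_download(repo_id=entry["repo_id"], filename=f["source"])
        shutil.copy2(cached, target_dir / f["dest"])


if __name__ == "__main__":
    # Example: the T5-XXL text encoder entry added in this patch.
    fetch_entry({
        "repo_id": "stabilityai/stable-diffusion-3.5-large",
        "type": "text_encoders",
        "files": [{"source": "text_encoders/t5xxl_fp16.safetensors",
                   "dest": "t5xxl_fp16.safetensors"}],
    })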