diff --git a/comfyui/workflows/image-to-video/cogvideox-i2v-production-v1.json b/comfyui/workflows/image-to-video/cogvideox-i2v-production-v1.json index 81372d4..771f1f7 100644 --- a/comfyui/workflows/image-to-video/cogvideox-i2v-production-v1.json +++ b/comfyui/workflows/image-to-video/cogvideox-i2v-production-v1.json @@ -1,207 +1,441 @@ { "last_node_id": 10, - "last_link_id": 12, + "last_link_id": 14, "nodes": [ { "id": 1, "type": "LoadImage", - "pos": [ - 50, - 100 - ], - "size": [ - 315, - 314 - ], - "widgets_values": [ - "input_frame.png", - "image" - ], + "pos": [50, 100], + "size": [315, 314], + "widgets_values": ["input_frame.png", "image"], "title": "API Input Image", "flags": {}, "order": 0, "mode": 0, "properties": { "Node name for S&R": "LoadImage" - } + }, + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [1], + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ] }, { "id": 2, "type": "DownloadAndLoadCogVideoModel", - "pos": [ - 50, - 500 - ], - "widgets_values": [ - "THUDM/CogVideoX-5b-I2V" - ], - "title": "CogVideoX-5b Loader", + "pos": [50, 500], + "size": [350, 100], + "widgets_values": ["THUDM/CogVideoX-5b-I2V"], + "title": "CogVideoX-5b-I2V Loader", "flags": {}, "order": 1, "mode": 0, "properties": { "Node name for S&R": "DownloadAndLoadCogVideoModel" }, - "size": { - "0": 350, - "1": 100 - } + "outputs": [ + { + "name": "model", + "type": "COGVIDEOMODEL", + "links": [2], + "slot_index": 0 + }, + { + "name": "vae", + "type": "VAE", + "links": [3], + "slot_index": 1 + } + ] }, { - "id": 3, - "type": "CogVideoTextEncode", - "pos": [ - 450, - 100 - ], - "widgets_values": [ - "Camera movement description, action, scene details" - ], - "title": "API Video Prompt", + "id": 7, + "type": "DualCLIPLoader", + "pos": [50, 650], + "size": [350, 100], + "widgets_values": ["t5xxl_fp16.safetensors", "clip_l.safetensors", "flux"], + "title": "CLIP Loader", "flags": {}, "order": 2, "mode": 0, "properties": { - "Node name for S&R": "CogVideoTextEncode" + "Node name for S&R": "DualCLIPLoader" }, - "size": { - "0": 400, - "1": 200 - } + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [4, 5], + "slot_index": 0 + } + ] }, { - "id": 4, - "type": "CogVideoSampler", - "pos": [ - 800, - 100 - ], - "widgets_values": [ - 42, - "CogVideoXDDIM", - 49, - 50, - 6.0 - ], - "title": "CogVideoX Sampler (6s @ 8fps)", + "id": 3, + "type": "CogVideoTextEncode", + "pos": [450, 100], + "size": [400, 200], + "widgets_values": ["Camera movement description, action, scene details"], + "title": "API Video Prompt (Positive)", "flags": {}, "order": 3, "mode": 0, "properties": { - "Node name for S&R": "CogVideoSampler" + "Node name for S&R": "CogVideoTextEncode" }, - "size": { - "0": 315, - "1": 474 - } + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 4 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [6], + "slot_index": 0 + }, + { + "name": "clip", + "type": "CLIP", + "links": null + } + ] }, { - "id": 5, - "type": "CogVideoDecode", - "pos": [ - 1150, - 100 - ], - "title": "VAE Decode Video", + "id": 8, + "type": "CogVideoTextEncode", + "pos": [450, 350], + "size": [400, 200], + "widgets_values": ["low quality, blurry, distorted"], + "title": "API Video Prompt (Negative)", "flags": {}, "order": 4, "mode": 0, "properties": { - "Node name for S&R": "CogVideoDecode" + "Node name for S&R": "CogVideoTextEncode" }, - "widgets_values": [ - true, - 240, - 360, - 0.25, - 0.25 + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 5 + } ], - "size": { - "0": 315, - "1": 100 - } + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [7], + "slot_index": 0 + }, + { + "name": "clip", + "type": "CLIP", + "links": null + } + ] }, { - "id": 6, - "type": "VHS_VideoCombine", - "pos": [ - 1450, - 100 - ], - "widgets_values": [ - 8, - 0, - "cogvideox_output", - "video/h264-mp4" - ], - "title": "Combine Video Frames", + "id": 9, + "type": "CogVideoImageEncode", + "pos": [450, 600], + "size": [315, 100], + "widgets_values": [], + "title": "Encode Input Image", "flags": {}, "order": 5, "mode": 0, + "properties": { + "Node name for S&R": "CogVideoImageEncode" + }, + "inputs": [ + { + "name": "vae", + "type": "VAE", + "link": 3 + }, + { + "name": "image", + "type": "IMAGE", + "link": 1 + } + ], + "outputs": [ + { + "name": "samples", + "type": "LATENT", + "links": [8], + "slot_index": 0 + } + ] + }, + { + "id": 4, + "type": "CogVideoSampler", + "pos": [900, 100], + "size": [315, 474], + "widgets_values": [42, "CogVideoXDDIM", 49, 50, 6.0], + "title": "CogVideoX Sampler (6s @ 8fps)", + "flags": {}, + "order": 6, + "mode": 0, + "properties": { + "Node name for S&R": "CogVideoSampler" + }, + "inputs": [ + { + "name": "model", + "type": "COGVIDEOMODEL", + "link": 2 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 6 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 7 + }, + { + "name": "image_cond_latents", + "type": "LATENT", + "link": 8 + } + ], + "outputs": [ + { + "name": "samples", + "type": "LATENT", + "links": [9], + "slot_index": 0 + } + ] + }, + { + "id": 5, + "type": "CogVideoDecode", + "pos": [1250, 100], + "size": [315, 200], + "widgets_values": [true, 240, 360, 0.25, 0.25], + "title": "VAE Decode Video", + "flags": {}, + "order": 7, + "mode": 0, + "properties": { + "Node name for S&R": "CogVideoDecode" + }, + "inputs": [ + { + "name": "vae", + "type": "VAE", + "link": 10 + }, + { + "name": "samples", + "type": "LATENT", + "link": 9 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [11], + "slot_index": 0 + } + ] + }, + { + "id": 10, + "type": "CogVideoXVAELoader", + "pos": [900, 600], + "size": [315, 100], + "widgets_values": ["THUDM/CogVideoX-5b-I2V"], + "title": "VAE Loader", + "flags": {}, + "order": 8, + "mode": 0, + "properties": { + "Node name for S&R": "CogVideoXVAELoader" + }, + "outputs": [ + { + "name": "vae", + "type": "VAE", + "links": [10], + "slot_index": 0 + } + ] + }, + { + "id": 6, + "type": "VHS_VideoCombine", + "pos": [1600, 100], + "size": [315, 200], + "widgets_values": [8, 0, "cogvideox_output", "video/h264-mp4"], + "title": "Combine Video Frames", + "flags": {}, + "order": 9, + "mode": 0, "properties": { "Node name for S&R": "VHS_VideoCombine" }, - "size": { - "0": 315, - "1": 100 - } + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 11 + } + ], + "outputs": [ + { + "name": "Filenames", + "type": "VHS_FILENAMES", + "links": null + } + ] } ], - "links": [], + "links": [ + [1, 1, 0, 9, 1, "IMAGE"], + [2, 2, 0, 4, 0, "COGVIDEOMODEL"], + [3, 2, 1, 9, 0, "VAE"], + [4, 7, 0, 3, 0, "CLIP"], + [5, 7, 0, 8, 0, "CLIP"], + [6, 3, 0, 4, 1, "CONDITIONING"], + [7, 8, 0, 4, 2, "CONDITIONING"], + [8, 9, 0, 4, 3, "LATENT"], + [9, 4, 0, 5, 1, "LATENT"], + [10, 10, 0, 5, 0, "VAE"], + [11, 5, 0, 6, 0, "IMAGE"] + ], + "groups": [], + "config": {}, "extra": { "workflow_info": { "name": "CogVideoX Image-to-Video Production", - "version": "1.0.0", - "description": "AI-driven image-to-video using CogVideoX-5b. Generate 6-second videos (48 frames @ 8fps) from input images with camera movement and action.", + "version": "1.1.0", + "description": "AI-driven image-to-video using CogVideoX-5b-I2V. Generate 6-second videos (49 frames @ 8fps) from input images with camera movement and action.", "category": "image-to-video", - "tags": [ - "cogvideox", - "i2v", - "video-generation", - "production" - ], + "tags": ["cogvideox", "i2v", "video-generation", "production"], "requirements": { - "models": [ - "CogVideoX-5b" - ], - "custom_nodes": [ - "ComfyUI-VideoHelperSuite", - "ComfyUI-CogVideoXWrapper" - ], - "vram_min": "20GB" + "models": ["CogVideoX-5b-I2V", "t5xxl", "clip_l"], + "custom_nodes": ["ComfyUI-VideoHelperSuite", "ComfyUI-CogVideoXWrapper"], + "vram_min": "20GB", + "vram_recommended": "24GB" }, "parameters": { "input_image": { "node_id": 1, + "widget_index": 0, "type": "image", "required": true, - "description": "Starting frame for video" + "description": "Starting frame for video generation" }, - "video_prompt": { + "positive_prompt": { "node_id": 3, + "widget_index": 0, "type": "string", "required": true, - "description": "Describe camera movement and action" + "default": "Camera movement description, action, scene details", + "description": "Describe desired camera movement, actions, and scene" + }, + "negative_prompt": { + "node_id": 8, + "widget_index": 0, + "type": "string", + "required": false, + "default": "low quality, blurry, distorted", + "description": "Undesired elements to avoid" + }, + "seed": { + "node_id": 4, + "widget_index": 0, + "type": "integer", + "required": false, + "default": 42, + "description": "Random seed for reproducibility" + }, + "num_frames": { + "node_id": 4, + "widget_index": 2, + "type": "integer", + "required": false, + "default": 49, + "min": 1, + "max": 1024, + "description": "Number of frames to generate (49 = ~6s @ 8fps)" }, "steps": { "node_id": 4, + "widget_index": 3, "type": "integer", + "required": false, "default": 50, - "description": "Sampling steps (50 recommended)" + "min": 20, + "max": 100, + "description": "Sampling steps (50 recommended for quality)" + }, + "cfg": { + "node_id": 4, + "widget_index": 4, + "type": "float", + "required": false, + "default": 6.0, + "min": 1.0, + "max": 15.0, + "description": "Classifier-free guidance scale" }, "fps": { "node_id": 6, + "widget_index": 0, "type": "integer", + "required": false, "default": 8, - "description": "Output framerate" + "description": "Output video framerate" + } + }, + "outputs": { + "video": { + "node_id": 6, + "type": "video", + "format": "MP4 (H.264)", + "resolution": "Based on input image", + "duration": "~6 seconds @ 8fps (49 frames)" } }, "performance": { "avg_generation_time": "120-180 seconds", "vram_usage": "~20-22GB", - "output": "6 seconds @ 8fps (48 frames)" - } + "gpu_utilization": "95-100%" + }, + "use_cases": [ + "Animate static images with camera motion", + "Create video loops from single frames", + "Add dynamic movement to product shots", + "Generate cinematic camera movements" + ], + "notes": [ + "CogVideoX-5b-I2V is specifically trained for image-to-video generation", + "Model will download automatically on first use (~10GB)", + "Enable VAE tiling to reduce VRAM usage", + "Higher steps (50-100) improve quality but increase generation time" + ] } }, "version": 0.4 -} \ No newline at end of file +}