diff --git a/README.md b/README.md
index 48e3b64f..e5ca8bb6 100644
--- a/README.md
+++ b/README.md
@@ -199,6 +199,11 @@ models we currently offer, along with their foundational information.
Min(W, H) = 768 768 ≤ Max(W, H) ≤ 1360 Max(W, H) % 16 = 0 |
720 * 480 |
+
+ Number of Frames |
+ Must be 16N + 1, where N <= 10 (default 81) |
+ Must be 8N + 1, where N <= 6 (default 49) |
+
Inference Precision |
BF16 (Recommended), FP16, FP32, FP8*, INT8, Not supported: INT4 |
diff --git a/README_ja.md b/README_ja.md
index 074cc93f..3927d17a 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -191,6 +191,11 @@ CogVideoXは、[清影](https://chatglm.cn/video?fr=osm_cogvideox) と同源の
Min(W, H) = 768 768 ≤ Max(W, H) ≤ 1360 Max(W, H) % 16 = 0 |
720 * 480 |
+
+ フレーム数 |
+ 16N + 1 (N <= 10) である必要があります (デフォルト 81) |
+ 8N + 1 (N <= 6) である必要があります (デフォルト 49) |
+
推論精度 |
BF16(推奨), FP16, FP32,FP8*,INT8,INT4非対応 |
diff --git a/README_zh.md b/README_zh.md
index c6c81f87..1444a927 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -180,7 +180,12 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源
1360 * 768 |
Min(W, H) = 768 768 ≤ Max(W, H) ≤ 1360 Max(W, H) % 16 = 0 |
720 * 480 |
-
+
+
+ 帧数 |
+ 必须为 16N + 1 其中 N <= 10 (默认 81) |
+ 必须为 8N + 1 其中 N <= 6 (默认 49) |
+
推理精度 |
BF16(推荐), FP16, FP32,FP8*,INT8,不支持INT4 |
diff --git a/inference/cli_demo.py b/inference/cli_demo.py
index 41f4267c..37dfcfc7 100644
--- a/inference/cli_demo.py
+++ b/inference/cli_demo.py
@@ -100,7 +100,7 @@ def generate_video(
if width is None or height is None:
height, width = desired_resolution
logging.info(f"\033[1mUsing default resolution {desired_resolution} for {model_name}\033[0m")
- elif (width, height) != desired_resolution:
+ elif (height, width) != desired_resolution:
if generate_type == "i2v":
# For i2v models, use user-defined width and height
logging.warning(
@@ -111,7 +111,7 @@ def generate_video(
logging.warning(
f"\033[1;31m{model_name} is not supported for custom resolution. Setting back to default resolution {desired_resolution}.\033[0m"
)
- width, height = desired_resolution
+ height, width = desired_resolution
if generate_type == "i2v":
pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype)