mirror of
https://github.com/THUDM/CogVideo.git
synced 2025-04-05 03:04:56 +08:00
fix #472
This commit is contained in:
parent
2360393b99
commit
0c6dc7b5d5
47
README.md
47
README.md
@ -171,49 +171,49 @@ models we currently offer, along with their foundational information.
|
||||
<table style="border-collapse: collapse; width: 100%;">
|
||||
<tr>
|
||||
<th style="text-align: center;">Model Name</th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B (Latest)</th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B-I2V (Latest)</th>
|
||||
<th style="text-align: center;">CogVideoX-2B</th>
|
||||
<th style="text-align: center;">CogVideoX-5B</th>
|
||||
<th style="text-align: center;">CogVideoX-5B-I2V</th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B</th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B-I2V</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Release Date</td>
|
||||
<th style="text-align: center;">November 8, 2024</th>
|
||||
<th style="text-align: center;">November 8, 2024</th>
|
||||
<th style="text-align: center;">August 6, 2024</th>
|
||||
<th style="text-align: center;">August 27, 2024</th>
|
||||
<th style="text-align: center;">September 19, 2024</th>
|
||||
<th style="text-align: center;">November 8, 2024</th>
|
||||
<th style="text-align: center;">November 8, 2024</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Video Resolution</td>
|
||||
<td colspan="3" style="text-align: center;">720 * 480</td>
|
||||
<td colspan="1" style="text-align: center;">1360 * 768</td>
|
||||
<td colspan="1" style="text-align: center;">256 <= W <=1360<br>256 <= H <=768<br> W,H % 16 == 0</td>
|
||||
<td colspan="1" style="text-align: center;">256 <= W <=1360<br> 256 <= H <=768<br> W,H % 16 == 0</td>
|
||||
<td colspan="3" style="text-align: center;">720 * 480</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Inference Precision</td>
|
||||
<td style="text-align: center;"><b>FP16*(recommended)</b>, BF16, FP32, FP8*, INT8, not supported: INT4</td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16(recommended)</b>, FP16, FP32, FP8*, INT8, not supported: INT4</td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16</b></td>
|
||||
<td style="text-align: center;"><b>FP16*(Recommended)</b>, BF16, FP32, FP8*, INT8, Not supported: INT4</td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16 (Recommended)</b>, FP16, FP32, FP8*, INT8, Not supported: INT4</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Single GPU Memory Usage</td>
|
||||
<td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB<br><b>diffusers FP16: from 4GB*</b><br><b>diffusers INT8(torchao): from 3.6GB*</b></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB<br><b>diffusers BF16 : from 5GB*</b><br><b>diffusers INT8(torchao): from 4.4GB*</b></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB<br></td>
|
||||
<td style="text-align: center;">Single GPU Memory Usage<br></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB <br></td>
|
||||
<td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB <br><b>diffusers FP16: 4GB minimum* </b><br><b>diffusers INT8 (torchao): 3.6GB minimum*</b></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB <br><b>diffusers BF16 : 5GB minimum* </b><br><b>diffusers INT8 (torchao): 4.4GB minimum* </b></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Multi-GPU Memory Usage</td>
|
||||
<td colspan="2" style="text-align: center;"><b>Not Supported</b><br></td>
|
||||
<td style="text-align: center;"><b>FP16: 10GB* using diffusers</b><br></td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16: 15GB* using diffusers</b><br></td>
|
||||
<td colspan="2" style="text-align: center;"><b>Not supported</b><br></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Inference Speed<br>(Step = 50, FP/BF16)</td>
|
||||
<td colspan="2" style="text-align: center;">Single A100: ~1000 seconds (5-second video)<br>Single H100: ~550 seconds (5-second video)</td>
|
||||
<td style="text-align: center;">Single A100: ~90 seconds<br>Single H100: ~45 seconds</td>
|
||||
<td colspan="2" style="text-align: center;">Single A100: ~180 seconds<br>Single H100: ~90 seconds</td>
|
||||
<td colspan="2" style="text-align: center;">Single A100: ~1000 seconds (5-second video)<br>Single H100: ~550 seconds (5-second video)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Prompt Language</td>
|
||||
@ -221,38 +221,37 @@ models we currently offer, along with their foundational information.
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Prompt Token Limit</td>
|
||||
<td colspan="3" style="text-align: center;">226 Tokens</td>
|
||||
<td colspan="2" style="text-align: center;">224 Tokens</td>
|
||||
<td colspan="3" style="text-align: center;">226 Tokens</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Video Length</td>
|
||||
<td colspan="2" style="text-align: center;">5 seconds or 10 seconds</td>
|
||||
<td colspan="3" style="text-align: center;">6 seconds</td>
|
||||
<td colspan="2" style="text-align: center;">5 or 10 seconds</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Frame Rate</td>
|
||||
<td colspan="3" style="text-align: center;">8 frames / second</td>
|
||||
<td colspan="2" style="text-align: center;">16 frames / second</td>
|
||||
<td colspan="2" style="text-align: center;">16 frames / second </td>
|
||||
<td colspan="3" style="text-align: center;">8 frames / second </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Positional Encoding</td>
|
||||
<td style="text-align: center;">3d_sincos_pos_embed</td>
|
||||
<td style="text-align: center;">Position Encoding</td>
|
||||
<td colspan="2" style="text-align: center;">3d_rope_pos_embed</td>
|
||||
<td style="text-align: center;">3d_sincos_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Download Link (Diffusers)</td>
|
||||
<td colspan="2" style="text-align: center;"> Coming Soon </td>
|
||||
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td>
|
||||
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td>
|
||||
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td>
|
||||
<td colspan="2" style="text-align: center;"> Coming Soon </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">Download Link (SAT)</td>
|
||||
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX1.5-5b-SAT">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🟣 WiseModel</a></td>
|
||||
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
63
README_ja.md
63
README_ja.md
@ -163,88 +163,87 @@ CogVideoXは、[清影](https://chatglm.cn/video?fr=osm_cogvideox) と同源の
|
||||
<table style="border-collapse: collapse; width: 100%;">
|
||||
<tr>
|
||||
<th style="text-align: center;">モデル名</th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B (最新)</th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B-I2V (最新)</th>
|
||||
<th style="text-align: center;">CogVideoX-2B</th>
|
||||
<th style="text-align: center;">CogVideoX-5B</th>
|
||||
<th style="text-align: center;">CogVideoX-5B-I2V</th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B</th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B-I2V</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">リリース日</td>
|
||||
<td style="text-align: center;">公開日</td>
|
||||
<th style="text-align: center;">2024年11月8日</th>
|
||||
<th style="text-align: center;">2024年11月8日</th>
|
||||
<th style="text-align: center;">2024年8月6日</th>
|
||||
<th style="text-align: center;">2024年8月27日</th>
|
||||
<th style="text-align: center;">2024年9月19日</th>
|
||||
<th style="text-align: center;">2024年11月8日</th>
|
||||
<th style="text-align: center;">2024年11月8日</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">ビデオ解像度</td>
|
||||
<td colspan="3" style="text-align: center;">720 * 480</td>
|
||||
<td colspan="1" style="text-align: center;">1360 * 768</td>
|
||||
<td colspan="1" style="text-align: center;">256 <= W <=1360<br>256 <= H <=768<br> W,H % 16 == 0</td>
|
||||
<td colspan="1" style="text-align: center;">256 <= W <=1360<br> 256 <= H <=768<br> W,H % 16 == 0</td>
|
||||
<td colspan="3" style="text-align: center;">720 * 480</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">推論精度</td>
|
||||
<td style="text-align: center;"><b>FP16*(推奨)</b>, BF16, FP32, FP8*, INT8, INT4は非対応</td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16(推奨)</b>, FP16, FP32, FP8*, INT8, INT4は非対応</td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16</b></td>
|
||||
<td style="text-align: center;"><b>FP16*(推奨)</b>, BF16, FP32,FP8*,INT8,INT4非対応</td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16(推奨)</b>, FP16, FP32,FP8*,INT8,INT4非対応</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">シングルGPUメモリ消費</td>
|
||||
<td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB<br><b>diffusers FP16: 4GBから*</b><br><b>diffusers INT8(torchao): 3.6GBから*</b></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB<br><b>diffusers BF16: 5GBから*</b><br><b>diffusers INT8(torchao): 4.4GBから*</b></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB<br></td>
|
||||
<td style="text-align: center;">単一GPUメモリ消費量<br></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB <br></td>
|
||||
<td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB <br><b>diffusers FP16: 4GB以上* </b><br><b>diffusers INT8(torchao): 3.6GB以上*</b></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB <br><b>diffusers BF16 : 5GB以上* </b><br><b>diffusers INT8(torchao): 4.4GB以上* </b></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">マルチGPUメモリ消費</td>
|
||||
<td style="text-align: center;"><b>FP16: 10GB* using diffusers</b><br></td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16: 15GB* using diffusers</b><br></td>
|
||||
<td colspan="2" style="text-align: center;"><b>サポートなし</b><br></td>
|
||||
<td style="text-align: center;">複数GPU推論メモリ消費量</td>
|
||||
<td colspan="2" style="text-align: center;"><b>非対応</b><br></td>
|
||||
<td style="text-align: center;"><b>FP16: 10GB* diffusers使用</b><br></td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16: 15GB* diffusers使用</b><br></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">推論速度<br>(ステップ数 = 50, FP/BF16)</td>
|
||||
<td style="text-align: center;">単一A100: 約90秒<br>単一H100: 約45秒</td>
|
||||
<td colspan="2" style="text-align: center;">単一A100: 約180秒<br>単一H100: 約90秒</td>
|
||||
<td colspan="2" style="text-align: center;">単一A100: 約1000秒(5秒動画)<br>単一H100: 約550秒(5秒動画)</td>
|
||||
<td style="text-align: center;">推論速度<br>(Step = 50, FP/BF16)</td>
|
||||
<td colspan="2" style="text-align: center;">シングルA100: ~1000秒(5秒ビデオ)<br>シングルH100: ~550秒(5秒ビデオ)</td>
|
||||
<td style="text-align: center;">シングルA100: ~90秒<br>シングルH100: ~45秒</td>
|
||||
<td colspan="2" style="text-align: center;">シングルA100: ~180秒<br>シングルH100: ~90秒</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">プロンプト言語</td>
|
||||
<td colspan="5" style="text-align: center;">英語*</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">プロンプトトークン制限</td>
|
||||
<td colspan="3" style="text-align: center;">226トークン</td>
|
||||
<td style="text-align: center;">プロンプト長さの上限</td>
|
||||
<td colspan="2" style="text-align: center;">224トークン</td>
|
||||
<td colspan="3" style="text-align: center;">226トークン</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">ビデオの長さ</td>
|
||||
<td colspan="3" style="text-align: center;">6秒</td>
|
||||
<td style="text-align: center;">ビデオ長さ</td>
|
||||
<td colspan="2" style="text-align: center;">5秒または10秒</td>
|
||||
<td colspan="3" style="text-align: center;">6秒</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">フレームレート</td>
|
||||
<td colspan="3" style="text-align: center;">8 フレーム / 秒</td>
|
||||
<td colspan="2" style="text-align: center;">16 フレーム / 秒</td>
|
||||
<td colspan="2" style="text-align: center;">16フレーム/秒</td>
|
||||
<td colspan="3" style="text-align: center;">8フレーム/秒</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">位置エンコーディング</td>
|
||||
<td style="text-align: center;">3d_sincos_pos_embed</td>
|
||||
<td colspan="2" style="text-align: center;">3d_rope_pos_embed</td>
|
||||
<td style="text-align: center;">3d_sincos_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">ダウンロードリンク (Diffusers)</td>
|
||||
<td colspan="2" style="text-align: center;"> 近日公開 </td>
|
||||
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td>
|
||||
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td>
|
||||
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td>
|
||||
<td colspan="2" style="text-align: center;">近日公開</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">ダウンロードリンク (SAT)</td>
|
||||
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX1.5-5b-SAT">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🟣 WiseModel</a></td>
|
||||
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
36
README_zh.md
36
README_zh.md
@ -154,49 +154,49 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源
|
||||
<table style="border-collapse: collapse; width: 100%;">
|
||||
<tr>
|
||||
<th style="text-align: center;">模型名</th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B (最新)</th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B-I2V (最新)</th>
|
||||
<th style="text-align: center;">CogVideoX-2B</th>
|
||||
<th style="text-align: center;">CogVideoX-5B</th>
|
||||
<th style="text-align: center;">CogVideoX-5B-I2V </th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B</th>
|
||||
<th style="text-align: center;">CogVideoX1.5-5B-I2V</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">发布时间</td>
|
||||
<th style="text-align: center;">2024年11月8日</th>
|
||||
<th style="text-align: center;">2024年11月8日</th>
|
||||
<th style="text-align: center;">2024年8月6日</th>
|
||||
<th style="text-align: center;">2024年8月27日</th>
|
||||
<th style="text-align: center;">2024年9月19日</th>
|
||||
<th style="text-align: center;">2024年11月8日</th>
|
||||
<th style="text-align: center;">2024年11月8日</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">视频分辨率</td>
|
||||
<td colspan="3" style="text-align: center;">720 * 480</td>
|
||||
<td colspan="1" style="text-align: center;">1360 * 768</td>
|
||||
<td colspan="1" style="text-align: center;">256 <= W <=1360<br> 256 <= H <=768<br> W,H % 16 == 0</td>
|
||||
</tr>
|
||||
<td colspan="3" style="text-align: center;">720 * 480</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">推理精度</td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16</b></td>
|
||||
<td style="text-align: center;"><b>FP16*(推荐)</b>, BF16, FP32,FP8*,INT8,不支持INT4</td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16(推荐)</b>, FP16, FP32,FP8*,INT8,不支持INT4</td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16</b></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">单GPU显存消耗<br></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB <br></td>
|
||||
<td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB <br><b>diffusers FP16: 4GB起* </b><br><b>diffusers INT8(torchao): 3.6G起*</b></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB <br><b>diffusers BF16 : 5GB起* </b><br><b>diffusers INT8(torchao): 4.4G起* </b></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB <br></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">多GPU推理显存消耗</td>
|
||||
<td colspan="2" style="text-align: center;"><b>不支持</b><br></td>
|
||||
<td style="text-align: center;"><b>FP16: 10GB* using diffusers</b><br></td>
|
||||
<td colspan="2" style="text-align: center;"><b>BF16: 15GB* using diffusers</b><br></td>
|
||||
<td colspan="2" style="text-align: center;"><b>Not support</b><br></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">推理速度<br>(Step = 50, FP/BF16)</td>
|
||||
<td colspan="2" style="text-align: center;">单卡A100: ~1000秒(5秒视频)<br>单卡H100: ~550秒(5秒视频)</td>
|
||||
<td style="text-align: center;">单卡A100: ~90秒<br>单卡H100: ~45秒</td>
|
||||
<td colspan="2" style="text-align: center;">单卡A100: ~180秒<br>单卡H100: ~90秒</td>
|
||||
<td colspan="2" style="text-align: center;">单卡A100: ~1000秒(5秒视频)<br>单卡H100: ~550秒(5秒视频)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">提示词语言</td>
|
||||
@ -204,39 +204,37 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">提示词长度上限</td>
|
||||
<td colspan="3" style="text-align: center;">226 Tokens</td>
|
||||
<td colspan="2" style="text-align: center;">224 Tokens</td>
|
||||
<td colspan="3" style="text-align: center;">226 Tokens</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">视频长度</td>
|
||||
<td colspan="3" style="text-align: center;">6 秒</td>
|
||||
<td colspan="2" style="text-align: center;">5 秒 或 10 秒</td>
|
||||
<td colspan="3" style="text-align: center;">6 秒</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">帧率</td>
|
||||
<td colspan="3" style="text-align: center;">8 帧 / 秒 </td>
|
||||
<td colspan="2" style="text-align: center;">16 帧 / 秒 </td>
|
||||
<td colspan="3" style="text-align: center;">8 帧 / 秒 </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">位置编码</td>
|
||||
<td style="text-align: center;">3d_sincos_pos_embed</td>
|
||||
<td colspan="2" style="text-align: center;">3d_rope_pos_embed</td>
|
||||
<td style="text-align: center;">3d_sincos_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed</td>
|
||||
<td style="text-align: center;">3d_rope_pos_embed</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">下载链接 (Diffusers)</td>
|
||||
<td colspan="2" style="text-align: center;"> 即将推出 </td>
|
||||
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td>
|
||||
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td>
|
||||
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td>
|
||||
<td colspan="2" style="text-align: center;"> 即将推出 </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="text-align: center;">下载链接 (SAT)</td>
|
||||
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
|
||||
<td colspan="2" style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX1.5-5b-SAT">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🟣 WiseModel</a></td>
|
||||
|
||||
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
@ -7,7 +7,6 @@ import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from sat.model.base_model import BaseModel, non_conflict
|
||||
from sat.model.mixins import BaseMixin
|
||||
from sat.transformer_defaults import HOOKS_DEFAULT, attention_fn_default
|
||||
|
@ -4,7 +4,7 @@ echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
|
||||
|
||||
environs="WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 LOCAL_WORLD_SIZE=1"
|
||||
|
||||
run_cmd="$environs python sample_video.py --base configs/cogvideox1.5_5b.yaml configs/test_inference.yaml --seed $RANDOM"
|
||||
run_cmd="$environs python sample_video.py --base configs/test_cogvideox_5b.yaml configs/test_inference.yaml --seed $RANDOM"
|
||||
|
||||
echo ${run_cmd}
|
||||
eval ${run_cmd}
|
||||
|
@ -1,17 +1,13 @@
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
import random
|
||||
from abc import abstractmethod
|
||||
from contextlib import contextmanager
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import pytorch_lightning as pl
|
||||
import torch
|
||||
import torch.distributed
|
||||
import torch.nn as nn
|
||||
from einops import rearrange
|
||||
from packaging import version
|
||||
|
||||
from vae_modules.ema import LitEma
|
||||
@ -56,17 +52,6 @@ class AbstractAutoencoder(pl.LightningModule):
|
||||
if version.parse(torch.__version__) >= version.parse("2.0.0"):
|
||||
self.automatic_optimization = False
|
||||
|
||||
# def apply_ckpt(self, ckpt: Union[None, str, dict]):
|
||||
# if ckpt is None:
|
||||
# return
|
||||
# if isinstance(ckpt, str):
|
||||
# ckpt = {
|
||||
# "target": "sgm.modules.checkpoint.CheckpointEngine",
|
||||
# "params": {"ckpt_path": ckpt},
|
||||
# }
|
||||
# engine = instantiate_from_config(ckpt)
|
||||
# engine(self)
|
||||
|
||||
def apply_ckpt(self, ckpt: Union[None, str, dict]):
|
||||
if ckpt is None:
|
||||
return
|
||||
@ -85,6 +70,18 @@ class AbstractAutoencoder(pl.LightningModule):
|
||||
print("Unexpected keys: ", unexpected_keys)
|
||||
print(f"Restored from {path}")
|
||||
|
||||
def apply_ckpt(self, ckpt: Union[None, str, dict]):
|
||||
if ckpt is None:
|
||||
return
|
||||
if isinstance(ckpt, str):
|
||||
ckpt = {
|
||||
"target": "sgm.modules.checkpoint.CheckpointEngine",
|
||||
"params": {"ckpt_path": ckpt},
|
||||
}
|
||||
engine = instantiate_from_config(ckpt)
|
||||
engine(self)
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def get_input(self, batch) -> Any:
|
||||
raise NotImplementedError()
|
||||
@ -216,12 +213,13 @@ class AutoencodingEngine(AbstractAutoencoder):
|
||||
return self.decoder.get_last_layer()
|
||||
|
||||
def encode(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
return_reg_log: bool = False,
|
||||
unregularized: bool = False,
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
return_reg_log: bool = False,
|
||||
unregularized: bool = False,
|
||||
**kwargs,
|
||||
) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]:
|
||||
z = self.encoder(x)
|
||||
z = self.encoder(x, **kwargs)
|
||||
if unregularized:
|
||||
return z, dict()
|
||||
z, reg_log = self.regularization(z)
|
||||
|
Loading…
x
Reference in New Issue
Block a user