This commit is contained in:
zR 2024-11-08 21:37:43 +08:00
parent 2360393b99
commit 0c6dc7b5d5
6 changed files with 90 additions and 97 deletions

View File

@ -171,49 +171,49 @@ models we currently offer, along with their foundational information.
<table style="border-collapse: collapse; width: 100%;"> <table style="border-collapse: collapse; width: 100%;">
<tr> <tr>
<th style="text-align: center;">Model Name</th> <th style="text-align: center;">Model Name</th>
<th style="text-align: center;">CogVideoX1.5-5B (Latest)</th>
<th style="text-align: center;">CogVideoX1.5-5B-I2V (Latest)</th>
<th style="text-align: center;">CogVideoX-2B</th> <th style="text-align: center;">CogVideoX-2B</th>
<th style="text-align: center;">CogVideoX-5B</th> <th style="text-align: center;">CogVideoX-5B</th>
<th style="text-align: center;">CogVideoX-5B-I2V</th> <th style="text-align: center;">CogVideoX-5B-I2V</th>
<th style="text-align: center;">CogVideoX1.5-5B</th>
<th style="text-align: center;">CogVideoX1.5-5B-I2V</th>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Release Date</td> <td style="text-align: center;">Release Date</td>
<th style="text-align: center;">November 8, 2024</th>
<th style="text-align: center;">November 8, 2024</th>
<th style="text-align: center;">August 6, 2024</th> <th style="text-align: center;">August 6, 2024</th>
<th style="text-align: center;">August 27, 2024</th> <th style="text-align: center;">August 27, 2024</th>
<th style="text-align: center;">September 19, 2024</th> <th style="text-align: center;">September 19, 2024</th>
<th style="text-align: center;">November 8, 2024</th>
<th style="text-align: center;">November 8, 2024</th>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Video Resolution</td> <td style="text-align: center;">Video Resolution</td>
<td colspan="3" style="text-align: center;">720 * 480</td>
<td colspan="1" style="text-align: center;">1360 * 768</td> <td colspan="1" style="text-align: center;">1360 * 768</td>
<td colspan="1" style="text-align: center;">256 <= W <=1360<br>256 <= H <=768<br> W,H % 16 == 0</td> <td colspan="1" style="text-align: center;">256 <= W <=1360<br> 256 <= H <=768<br> W,H % 16 == 0</td>
<td colspan="3" style="text-align: center;">720 * 480</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Inference Precision</td> <td style="text-align: center;">Inference Precision</td>
<td style="text-align: center;"><b>FP16*(recommended)</b>, BF16, FP32, FP8*, INT8, not supported: INT4</td>
<td colspan="2" style="text-align: center;"><b>BF16(recommended)</b>, FP16, FP32, FP8*, INT8, not supported: INT4</td>
<td colspan="2" style="text-align: center;"><b>BF16</b></td> <td colspan="2" style="text-align: center;"><b>BF16</b></td>
<td style="text-align: center;"><b>FP16*(Recommended)</b>, BF16, FP32, FP8*, INT8, Not supported: INT4</td>
<td colspan="2" style="text-align: center;"><b>BF16 (Recommended)</b>, FP16, FP32, FP8*, INT8, Not supported: INT4</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Single GPU Memory Usage</td> <td style="text-align: center;">Single GPU Memory Usage<br></td>
<td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB<br><b>diffusers FP16: from 4GB*</b><br><b>diffusers INT8(torchao): from 3.6GB*</b></td> <td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB <br></td>
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB<br><b>diffusers BF16 : from 5GB*</b><br><b>diffusers INT8(torchao): from 4.4GB*</b></td> <td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB <br><b>diffusers FP16: 4GB minimum* </b><br><b>diffusers INT8 (torchao): 3.6GB minimum*</b></td>
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB<br></td> <td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB <br><b>diffusers BF16 : 5GB minimum* </b><br><b>diffusers INT8 (torchao): 4.4GB minimum* </b></td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Multi-GPU Memory Usage</td> <td style="text-align: center;">Multi-GPU Memory Usage</td>
<td colspan="2" style="text-align: center;"><b>Not Supported</b><br></td>
<td style="text-align: center;"><b>FP16: 10GB* using diffusers</b><br></td> <td style="text-align: center;"><b>FP16: 10GB* using diffusers</b><br></td>
<td colspan="2" style="text-align: center;"><b>BF16: 15GB* using diffusers</b><br></td> <td colspan="2" style="text-align: center;"><b>BF16: 15GB* using diffusers</b><br></td>
<td colspan="2" style="text-align: center;"><b>Not supported</b><br></td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Inference Speed<br>(Step = 50, FP/BF16)</td> <td style="text-align: center;">Inference Speed<br>(Step = 50, FP/BF16)</td>
<td colspan="2" style="text-align: center;">Single A100: ~1000 seconds (5-second video)<br>Single H100: ~550 seconds (5-second video)</td>
<td style="text-align: center;">Single A100: ~90 seconds<br>Single H100: ~45 seconds</td> <td style="text-align: center;">Single A100: ~90 seconds<br>Single H100: ~45 seconds</td>
<td colspan="2" style="text-align: center;">Single A100: ~180 seconds<br>Single H100: ~90 seconds</td> <td colspan="2" style="text-align: center;">Single A100: ~180 seconds<br>Single H100: ~90 seconds</td>
<td colspan="2" style="text-align: center;">Single A100: ~1000 seconds (5-second video)<br>Single H100: ~550 seconds (5-second video)</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Prompt Language</td> <td style="text-align: center;">Prompt Language</td>
@ -221,38 +221,37 @@ models we currently offer, along with their foundational information.
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Prompt Token Limit</td> <td style="text-align: center;">Prompt Token Limit</td>
<td colspan="3" style="text-align: center;">226 Tokens</td>
<td colspan="2" style="text-align: center;">224 Tokens</td> <td colspan="2" style="text-align: center;">224 Tokens</td>
<td colspan="3" style="text-align: center;">226 Tokens</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Video Length</td> <td style="text-align: center;">Video Length</td>
<td colspan="2" style="text-align: center;">5 seconds or 10 seconds</td>
<td colspan="3" style="text-align: center;">6 seconds</td> <td colspan="3" style="text-align: center;">6 seconds</td>
<td colspan="2" style="text-align: center;">5 or 10 seconds</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Frame Rate</td> <td style="text-align: center;">Frame Rate</td>
<td colspan="3" style="text-align: center;">8 frames / second</td> <td colspan="2" style="text-align: center;">16 frames / second </td>
<td colspan="2" style="text-align: center;">16 frames / second</td> <td colspan="3" style="text-align: center;">8 frames / second </td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Positional Encoding</td> <td style="text-align: center;">Position Encoding</td>
<td colspan="2" style="text-align: center;">3d_rope_pos_embed</td>
<td style="text-align: center;">3d_sincos_pos_embed</td> <td style="text-align: center;">3d_sincos_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed</td> <td style="text-align: center;">3d_rope_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td> <td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Download Link (Diffusers)</td> <td style="text-align: center;">Download Link (Diffusers)</td>
<td colspan="2" style="text-align: center;"> Coming Soon </td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td> <td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td> <td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td> <td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td>
<td colspan="2" style="text-align: center;"> Coming Soon </td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">Download Link (SAT)</td> <td style="text-align: center;">Download Link (SAT)</td>
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
<td colspan="2" style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX1.5-5b-SAT">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🟣 WiseModel</a></td> <td colspan="2" style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX1.5-5b-SAT">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🟣 WiseModel</a></td>
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
</tr> </tr>
</table> </table>

View File

@ -163,88 +163,87 @@ CogVideoXは、[清影](https://chatglm.cn/video?fr=osm_cogvideox) と同源の
<table style="border-collapse: collapse; width: 100%;"> <table style="border-collapse: collapse; width: 100%;">
<tr> <tr>
<th style="text-align: center;">モデル名</th> <th style="text-align: center;">モデル名</th>
<th style="text-align: center;">CogVideoX1.5-5B (最新)</th>
<th style="text-align: center;">CogVideoX1.5-5B-I2V (最新)</th>
<th style="text-align: center;">CogVideoX-2B</th> <th style="text-align: center;">CogVideoX-2B</th>
<th style="text-align: center;">CogVideoX-5B</th> <th style="text-align: center;">CogVideoX-5B</th>
<th style="text-align: center;">CogVideoX-5B-I2V</th> <th style="text-align: center;">CogVideoX-5B-I2V</th>
<th style="text-align: center;">CogVideoX1.5-5B</th>
<th style="text-align: center;">CogVideoX1.5-5B-I2V</th>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">リリース日</td> <td style="text-align: center;">公開日</td>
<th style="text-align: center;">2024年11月8日</th>
<th style="text-align: center;">2024年11月8日</th>
<th style="text-align: center;">2024年8月6日</th> <th style="text-align: center;">2024年8月6日</th>
<th style="text-align: center;">2024年8月27日</th> <th style="text-align: center;">2024年8月27日</th>
<th style="text-align: center;">2024年9月19日</th> <th style="text-align: center;">2024年9月19日</th>
<th style="text-align: center;">2024年11月8日</th>
<th style="text-align: center;">2024年11月8日</th>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">ビデオ解像度</td> <td style="text-align: center;">ビデオ解像度</td>
<td colspan="3" style="text-align: center;">720 * 480</td>
<td colspan="1" style="text-align: center;">1360 * 768</td> <td colspan="1" style="text-align: center;">1360 * 768</td>
<td colspan="1" style="text-align: center;">256 <= W <=1360<br>256 <= H <=768<br> W,H % 16 == 0</td> <td colspan="1" style="text-align: center;">256 <= W <=1360<br> 256 <= H <=768<br> W,H % 16 == 0</td>
<td colspan="3" style="text-align: center;">720 * 480</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">推論精度</td> <td style="text-align: center;">推論精度</td>
<td style="text-align: center;"><b>FP16*(推奨)</b>, BF16, FP32, FP8*, INT8, INT4は非対応</td>
<td colspan="2" style="text-align: center;"><b>BF16(推奨)</b>, FP16, FP32, FP8*, INT8, INT4は非対応</td>
<td colspan="2" style="text-align: center;"><b>BF16</b></td> <td colspan="2" style="text-align: center;"><b>BF16</b></td>
<td style="text-align: center;"><b>FP16*(推奨)</b>, BF16, FP32, FP8*, INT8, INT4非対応</td>
<td colspan="2" style="text-align: center;"><b>BF16(推奨)</b>, FP16, FP32, FP8*, INT8, INT4非対応</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">シングルGPUメモリ消費</td> <td style="text-align: center;">単一GPUメモリ消費量<br></td>
<td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB<br><b>diffusers FP16: 4GBから*</b><br><b>diffusers INT8(torchao): 3.6GBから*</b></td> <td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB <br></td>
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB<br><b>diffusers BF16: 5GBから*</b><br><b>diffusers INT8(torchao): 4.4GBから*</b></td> <td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB <br><b>diffusers FP16: 4GB以上* </b><br><b>diffusers INT8(torchao): 3.6GB以上*</b></td>
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB<br></td> <td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB <br><b>diffusers BF16 : 5GB以上* </b><br><b>diffusers INT8(torchao): 4.4GB以上* </b></td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">マルチGPUメモリ消費</td> <td style="text-align: center;">複数GPU推論メモリ消費量</td>
<td style="text-align: center;"><b>FP16: 10GB* using diffusers</b><br></td> <td colspan="2" style="text-align: center;"><b>非対応</b><br></td>
<td colspan="2" style="text-align: center;"><b>BF16: 15GB* using diffusers</b><br></td> <td style="text-align: center;"><b>FP16: 10GB* diffusers使用</b><br></td>
<td colspan="2" style="text-align: center;"><b>サポートなし</b><br></td> <td colspan="2" style="text-align: center;"><b>BF16: 15GB* diffusers使用</b><br></td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">推論速度<br>(ステップ数 = 50, FP/BF16)</td> <td style="text-align: center;">推論速度<br>(Step = 50, FP/BF16)</td>
<td style="text-align: center;">単一A100: 約90秒<br>単一H100: 約45秒</td> <td colspan="2" style="text-align: center;">シングルA100: ~1000秒(5秒ビデオ)<br>シングルH100: ~550秒(5秒ビデオ)</td>
<td colspan="2" style="text-align: center;">単一A100: 約180秒<br>単一H100: 約90秒</td> <td style="text-align: center;">シングルA100: ~90秒<br>シングルH100: ~45秒</td>
<td colspan="2" style="text-align: center;">単一A100: 約1000秒(5秒動画)<br>単一H100: 約550秒(5秒動画)</td> <td colspan="2" style="text-align: center;">シングルA100: ~180秒<br>シングルH100: ~90秒</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">プロンプト言語</td> <td style="text-align: center;">プロンプト言語</td>
<td colspan="5" style="text-align: center;">英語*</td> <td colspan="5" style="text-align: center;">英語*</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">プロンプトトークン制限</td> <td style="text-align: center;">プロンプト長さの上限</td>
<td colspan="3" style="text-align: center;">226トークン</td>
<td colspan="2" style="text-align: center;">224トークン</td> <td colspan="2" style="text-align: center;">224トークン</td>
<td colspan="3" style="text-align: center;">226トークン</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">ビデオの長さ</td> <td style="text-align: center;">ビデオ長さ</td>
<td colspan="3" style="text-align: center;">6秒</td>
<td colspan="2" style="text-align: center;">5秒または10秒</td> <td colspan="2" style="text-align: center;">5秒または10秒</td>
<td colspan="3" style="text-align: center;">6秒</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">フレームレート</td> <td style="text-align: center;">フレームレート</td>
<td colspan="3" style="text-align: center;">8 フレーム / 秒</td> <td colspan="2" style="text-align: center;">16フレーム/秒</td>
<td colspan="2" style="text-align: center;">16 フレーム / 秒</td> <td colspan="3" style="text-align: center;">8フレーム/秒</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">位置エンコーディング</td> <td style="text-align: center;">位置エンコーディング</td>
<td colspan="2" style="text-align: center;">3d_rope_pos_embed</td>
<td style="text-align: center;">3d_sincos_pos_embed</td> <td style="text-align: center;">3d_sincos_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed</td> <td style="text-align: center;">3d_rope_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td> <td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">ダウンロードリンク (Diffusers)</td> <td style="text-align: center;">ダウンロードリンク (Diffusers)</td>
<td colspan="2" style="text-align: center;"> 近日公開 </td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td> <td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td> <td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td> <td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td>
<td colspan="2" style="text-align: center;">近日公開</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">ダウンロードリンク (SAT)</td> <td style="text-align: center;">ダウンロードリンク (SAT)</td>
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
<td colspan="2" style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX1.5-5b-SAT">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🟣 WiseModel</a></td> <td colspan="2" style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX1.5-5b-SAT">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🟣 WiseModel</a></td>
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
</tr> </tr>
</table> </table>

View File

@ -154,49 +154,49 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源
<table style="border-collapse: collapse; width: 100%;"> <table style="border-collapse: collapse; width: 100%;">
<tr> <tr>
<th style="text-align: center;">模型名</th> <th style="text-align: center;">模型名</th>
<th style="text-align: center;">CogVideoX1.5-5B (最新)</th>
<th style="text-align: center;">CogVideoX1.5-5B-I2V (最新)</th>
<th style="text-align: center;">CogVideoX-2B</th> <th style="text-align: center;">CogVideoX-2B</th>
<th style="text-align: center;">CogVideoX-5B</th> <th style="text-align: center;">CogVideoX-5B</th>
<th style="text-align: center;">CogVideoX-5B-I2V </th> <th style="text-align: center;">CogVideoX-5B-I2V </th>
<th style="text-align: center;">CogVideoX1.5-5B</th>
<th style="text-align: center;">CogVideoX1.5-5B-I2V</th>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">发布时间</td> <td style="text-align: center;">发布时间</td>
<th style="text-align: center;">2024年11月8日</th>
<th style="text-align: center;">2024年11月8日</th>
<th style="text-align: center;">2024年8月6日</th> <th style="text-align: center;">2024年8月6日</th>
<th style="text-align: center;">2024年8月27日</th> <th style="text-align: center;">2024年8月27日</th>
<th style="text-align: center;">2024年9月19日</th> <th style="text-align: center;">2024年9月19日</th>
<th style="text-align: center;">2024年11月8日</th>
<th style="text-align: center;">2024年11月8日</th>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">视频分辨率</td> <td style="text-align: center;">视频分辨率</td>
<td colspan="3" style="text-align: center;">720 * 480</td>
<td colspan="1" style="text-align: center;">1360 * 768</td> <td colspan="1" style="text-align: center;">1360 * 768</td>
<td colspan="1" style="text-align: center;">256 <= W <=1360<br> 256 <= H <=768<br> W,H % 16 == 0</td> <td colspan="1" style="text-align: center;">256 <= W <=1360<br> 256 <= H <=768<br> W,H % 16 == 0</td>
</tr> <td colspan="3" style="text-align: center;">720 * 480</td>
</tr>
<tr> <tr>
<td style="text-align: center;">推理精度</td> <td style="text-align: center;">推理精度</td>
<td colspan="2" style="text-align: center;"><b>BF16</b></td>
<td style="text-align: center;"><b>FP16*(推荐)</b>, BF16, FP32, FP8*, INT8, 不支持INT4</td> <td style="text-align: center;"><b>FP16*(推荐)</b>, BF16, FP32, FP8*, INT8, 不支持INT4</td>
<td colspan="2" style="text-align: center;"><b>BF16(推荐)</b>, FP16, FP32, FP8*, INT8, 不支持INT4</td> <td colspan="2" style="text-align: center;"><b>BF16(推荐)</b>, FP16, FP32, FP8*, INT8, 不支持INT4</td>
<td colspan="2" style="text-align: center;"><b>BF16</b></td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">单GPU显存消耗<br></td> <td style="text-align: center;">单GPU显存消耗<br></td>
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB <br></td>
<td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB <br><b>diffusers FP16: 4GB起* </b><br><b>diffusers INT8(torchao): 3.6G起*</b></td> <td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB <br><b>diffusers FP16: 4GB起* </b><br><b>diffusers INT8(torchao): 3.6G起*</b></td>
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB <br><b>diffusers BF16 : 5GB起* </b><br><b>diffusers INT8(torchao): 4.4G起* </b></td> <td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB <br><b>diffusers BF16 : 5GB起* </b><br><b>diffusers INT8(torchao): 4.4G起* </b></td>
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 66GB <br></td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">多GPU推理显存消耗</td> <td style="text-align: center;">多GPU推理显存消耗</td>
<td colspan="2" style="text-align: center;"><b>不支持</b><br></td>
<td style="text-align: center;"><b>FP16: 10GB* using diffusers</b><br></td> <td style="text-align: center;"><b>FP16: 10GB* using diffusers</b><br></td>
<td colspan="2" style="text-align: center;"><b>BF16: 15GB* using diffusers</b><br></td> <td colspan="2" style="text-align: center;"><b>BF16: 15GB* using diffusers</b><br></td>
<td colspan="2" style="text-align: center;"><b>不支持</b><br></td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">推理速度<br>(Step = 50, FP/BF16)</td> <td style="text-align: center;">推理速度<br>(Step = 50, FP/BF16)</td>
<td colspan="2" style="text-align: center;">单卡A100: ~1000秒(5秒视频)<br>单卡H100: ~550秒(5秒视频)</td>
<td style="text-align: center;">单卡A100: ~90秒<br>单卡H100: ~45秒</td> <td style="text-align: center;">单卡A100: ~90秒<br>单卡H100: ~45秒</td>
<td colspan="2" style="text-align: center;">单卡A100: ~180秒<br>单卡H100: ~90秒</td> <td colspan="2" style="text-align: center;">单卡A100: ~180秒<br>单卡H100: ~90秒</td>
<td colspan="2" style="text-align: center;">单卡A100: ~1000秒(5秒视频)<br>单卡H100: ~550秒(5秒视频)</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">提示词语言</td> <td style="text-align: center;">提示词语言</td>
@ -204,39 +204,37 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">提示词长度上限</td> <td style="text-align: center;">提示词长度上限</td>
<td colspan="3" style="text-align: center;">226 Tokens</td>
<td colspan="2" style="text-align: center;">224 Tokens</td> <td colspan="2" style="text-align: center;">224 Tokens</td>
<td colspan="3" style="text-align: center;">226 Tokens</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">视频长度</td> <td style="text-align: center;">视频长度</td>
<td colspan="3" style="text-align: center;">6 秒</td>
<td colspan="2" style="text-align: center;">5 秒 或 10 秒</td> <td colspan="2" style="text-align: center;">5 秒 或 10 秒</td>
<td colspan="3" style="text-align: center;">6 秒</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">帧率</td> <td style="text-align: center;">帧率</td>
<td colspan="3" style="text-align: center;">8 帧 / 秒 </td>
<td colspan="2" style="text-align: center;">16 帧 / 秒 </td> <td colspan="2" style="text-align: center;">16 帧 / 秒 </td>
<td colspan="3" style="text-align: center;">8 帧 / 秒 </td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">位置编码</td> <td style="text-align: center;">位置编码</td>
<td colspan="2" style="text-align: center;">3d_rope_pos_embed</td>
<td style="text-align: center;">3d_sincos_pos_embed</td> <td style="text-align: center;">3d_sincos_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed</td> <td style="text-align: center;">3d_rope_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td> <td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed</td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">下载链接 (Diffusers)</td> <td style="text-align: center;">下载链接 (Diffusers)</td>
<td colspan="2" style="text-align: center;"> 即将推出 </td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td> <td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td> <td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td> <td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td>
<td colspan="2" style="text-align: center;"> 即将推出 </td>
</tr> </tr>
<tr> <tr>
<td style="text-align: center;">下载链接 (SAT)</td> <td style="text-align: center;">下载链接 (SAT)</td>
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
<td colspan="2" style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX1.5-5b-SAT">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🟣 WiseModel</a></td> <td colspan="2" style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX1.5-5b-SAT">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX1.5-5b-SAT">🟣 WiseModel</a></td>
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
</tr> </tr>
</table> </table>

View File

@ -7,7 +7,6 @@ import numpy as np
import torch import torch
from torch import nn from torch import nn
import torch.nn.functional as F import torch.nn.functional as F
from sat.model.base_model import BaseModel, non_conflict from sat.model.base_model import BaseModel, non_conflict
from sat.model.mixins import BaseMixin from sat.model.mixins import BaseMixin
from sat.transformer_defaults import HOOKS_DEFAULT, attention_fn_default from sat.transformer_defaults import HOOKS_DEFAULT, attention_fn_default

View File

@ -4,7 +4,7 @@ echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
environs="WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 LOCAL_WORLD_SIZE=1" environs="WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 LOCAL_WORLD_SIZE=1"
run_cmd="$environs python sample_video.py --base configs/cogvideox1.5_5b.yaml configs/test_inference.yaml --seed $RANDOM" run_cmd="$environs python sample_video.py --base configs/test_cogvideox_5b.yaml configs/test_inference.yaml --seed $RANDOM"
echo ${run_cmd} echo ${run_cmd}
eval ${run_cmd} eval ${run_cmd}

View File

@ -1,17 +1,13 @@
import logging import logging
import math import math
import re import re
import random
from abc import abstractmethod from abc import abstractmethod
from contextlib import contextmanager from contextlib import contextmanager
from typing import Any, Dict, List, Optional, Tuple, Union from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import pytorch_lightning as pl import pytorch_lightning as pl
import torch import torch
import torch.distributed import torch.distributed
import torch.nn as nn
from einops import rearrange
from packaging import version from packaging import version
from vae_modules.ema import LitEma from vae_modules.ema import LitEma
@ -56,17 +52,6 @@ class AbstractAutoencoder(pl.LightningModule):
if version.parse(torch.__version__) >= version.parse("2.0.0"): if version.parse(torch.__version__) >= version.parse("2.0.0"):
self.automatic_optimization = False self.automatic_optimization = False
# def apply_ckpt(self, ckpt: Union[None, str, dict]):
# if ckpt is None:
# return
# if isinstance(ckpt, str):
# ckpt = {
# "target": "sgm.modules.checkpoint.CheckpointEngine",
# "params": {"ckpt_path": ckpt},
# }
# engine = instantiate_from_config(ckpt)
# engine(self)
def apply_ckpt(self, ckpt: Union[None, str, dict]): def apply_ckpt(self, ckpt: Union[None, str, dict]):
if ckpt is None: if ckpt is None:
return return
@ -85,6 +70,18 @@ class AbstractAutoencoder(pl.LightningModule):
print("Unexpected keys: ", unexpected_keys) print("Unexpected keys: ", unexpected_keys)
print(f"Restored from {path}") print(f"Restored from {path}")
def apply_ckpt(self, ckpt: Union[None, str, dict]):
    """Apply a checkpoint specification to this module.

    Args:
        ckpt: ``None`` to do nothing; a string, which is wrapped into a
            config dict targeting ``sgm.modules.checkpoint.CheckpointEngine``
            with that string as ``ckpt_path``; or a config dict that is
            passed to ``instantiate_from_config`` unchanged.

    Returns:
        None. The object built from the config is called with ``self``.

    NOTE(review): another ``apply_ckpt`` definition appears earlier in this
    class in the visible diff; if both are present in the final file, this
    later one takes precedence -- confirm which implementation is intended.
    """
    if ckpt is not None:
        # A bare string is shorthand for the default checkpoint engine config.
        engine_config = (
            {
                "target": "sgm.modules.checkpoint.CheckpointEngine",
                "params": {"ckpt_path": ckpt},
            }
            if isinstance(ckpt, str)
            else ckpt
        )
        # Build the engine and invoke it on this module
        # (presumably this loads the weights -- verify against the engine).
        instantiate_from_config(engine_config)(self)
@abstractmethod @abstractmethod
def get_input(self, batch) -> Any: def get_input(self, batch) -> Any:
raise NotImplementedError() raise NotImplementedError()
@ -216,12 +213,13 @@ class AutoencodingEngine(AbstractAutoencoder):
return self.decoder.get_last_layer() return self.decoder.get_last_layer()
def encode( def encode(
self, self,
x: torch.Tensor, x: torch.Tensor,
return_reg_log: bool = False, return_reg_log: bool = False,
unregularized: bool = False, unregularized: bool = False,
**kwargs,
) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]: ) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]:
z = self.encoder(x) z = self.encoder(x, **kwargs)
if unregularized: if unregularized:
return z, dict() return z, dict()
z, reg_log = self.regularization(z) z, reg_log = self.regularization(z)