Spaces: fffiloni/EchoMimic · Running on Zero

Commit: gradio tweaks
README.md CHANGED

@@ -7,6 +7,284 @@ sdk: gradio

This commit adds `suggested_hardware: a10g-large` to the YAML front matter and fills in the full project README below it:
sdk_version: 4.38.1
app_file: webgui.py
pinned: false
suggested_hardware: a10g-large
---

<h1 align='center'>EchoMimic: Lifelike Audio-Driven Portrait Animations through Editable Landmark Conditioning</h1>

<div align='center'>
    <a href='https://github.com/yuange250' target='_blank'>Zhiyuan Chen</a><sup>*</sup>
    <a href='https://github.com/JoeFannie' target='_blank'>Jiajiong Cao</a><sup>*</sup>
    <a href='https://github.com/octavianChen' target='_blank'>Zhiquan Chen</a>
    <a href='https://github.com/lymhust' target='_blank'>Yuming Li</a>
    <a href='https://github.com/' target='_blank'>Chenguang Ma</a>
</div>
<div align='center'>
    *Equal Contribution.
</div>

<div align='center'>
    Terminal Technology Department, Alipay, Ant Group.
</div>
<br>
<div align='center'>
    <a href='https://badtobest.github.io/echomimic.html'><img src='https://img.shields.io/badge/Project-Page-blue'></a>
    <a href='https://huggingface.co/BadToBest/EchoMimic'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a>
    <a href='https://www.modelscope.cn/models/BadToBest/EchoMimic'><img src='https://img.shields.io/badge/ModelScope-Model-purple'></a>
    <a href='https://arxiv.org/abs/2407.08136'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
    <a href='assets/echomimic.png'><img src='https://badges.aleen42.com/src/wechat.svg'></a>
</div>

## 📣 📣 Updates
* [2024.07.17] 🔥🔥🔥 Accelerated models and pipeline are released. Inference speed improves by about **10x** (from ~7 min/240 frames to ~50 s/240 frames on a V100 GPU).
* [2024.07.14] 🔥 [ComfyUI](https://github.com/smthemex/ComfyUI_EchoMimic) is now available. Thanks @smthemex for the contribution.
* [2024.07.13] 🔥 Thanks [NewGenAI](https://www.youtube.com/@StableAIHub) for the [video installation tutorial](https://www.youtube.com/watch?v=8R0lTIY7tfI).
* [2024.07.13] 🔥 We release our pose- and audio-driven code and models.
* [2024.07.12] 🔥 WebUI and GradioUI versions are released. We thank @greengerong @Robin021 and @O-O1024 for their contributions.
* [2024.07.12] 🔥 Our [paper](https://arxiv.org/abs/2407.08136) is publicly available on arXiv.
* [2024.07.09] 🔥 We release our audio-driven code and models.

## Gallery

### Audio Driven (Sing)

<table class="center">
<tr>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/d014d921-9f94-4640-97ad-035b00effbfe" muted="false"></video>
    </td>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/877603a5-a4f9-4486-a19f-8888422daf78" muted="false"></video>
    </td>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/e0cb5afb-40a6-4365-84f8-cb2834c4cfe7" muted="false"></video>
    </td>
</tr>
</table>

### Audio Driven (English)

<table class="center">
<tr>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/386982cd-3ff8-470d-a6d9-b621e112f8a5" muted="false"></video>
    </td>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/5c60bb91-1776-434e-a720-8857a00b1501" muted="false"></video>
    </td>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/1f15adc5-0f33-4afa-b96a-2011886a4a06" muted="false"></video>
    </td>
</tr>
</table>

### Audio Driven (Chinese)

<table class="center">
<tr>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/a8092f9a-a5dc-4cd6-95be-1831afaccf00" muted="false"></video>
    </td>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/c8b5c59f-0483-42ef-b3ee-4cffae6c7a52" muted="false"></video>
    </td>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/532a3e60-2bac-4039-a06c-ff6bf06cb4a4" muted="false"></video>
    </td>
</tr>
</table>

### Landmark Driven

<table class="center">
<tr>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/1da6c46f-4532-4375-a0dc-0a4d6fd30a39" muted="false"></video>
    </td>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/d4f4d5c1-e228-463a-b383-27fb90ed6172" muted="false"></video>
    </td>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/18bd2c93-319e-4d1c-8255-3f02ba717475" muted="false"></video>
    </td>
</tr>
</table>

### Audio + Selected Landmark Driven

<table class="center">
<tr>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/4a29d735-ec1b-474d-b843-3ff0bdf85f55" muted="false"></video>
    </td>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/b994c8f5-8dae-4dd8-870f-962b50dc091f" muted="false"></video>
    </td>
    <td width=30% style="border: none">
        <video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/955c1d51-07b2-494d-ab93-895b9c43b896" muted="false"></video>
    </td>
</tr>
</table>

**(Some demo images above are sourced from image websites. If there is any infringement, we will immediately remove them and apologize.)**

## Installation

### Download the Code

```bash
git clone https://github.com/BadToBest/EchoMimic
cd EchoMimic
```

### Python Environment Setup

- Tested system environments: CentOS 7.2 / Ubuntu 22.04, CUDA >= 11.7
- Tested GPUs: A100 (80G) / RTX 4090D (24G) / V100 (16G)
- Tested Python versions: 3.8 / 3.10 / 3.11

Create a conda environment (recommended):

```bash
conda create -n echomimic python=3.8
conda activate echomimic
```

Install packages with `pip`:

```bash
pip install -r requirements.txt
```

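Optionally, you can confirm that PyTorch sees your GPU before continuing. This is a minimal sketch, assuming `requirements.txt` installed a CUDA-enabled torch build; the check itself is not part of the repository:

```python
import torch

# EchoMimic was tested with CUDA >= 11.7 on A100 / RTX 4090D / V100 GPUs.
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
```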
### Download ffmpeg-static

Download and decompress [ffmpeg-static](https://www.johnvansickle.com/ffmpeg/old-releases/ffmpeg-4.4-amd64-static.tar.xz), then point `FFMPEG_PATH` at it:

```bash
export FFMPEG_PATH=/path/to/ffmpeg-4.4-amd64-static
```
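To verify the variable is visible to Python before running inference, a quick check along these lines may help (a hedged sketch; only the `FFMPEG_PATH` name comes from the step above):

```python
import os
import subprocess

# FFMPEG_PATH should point at the decompressed ffmpeg-4.4-amd64-static directory.
ffmpeg_dir = os.environ.get("FFMPEG_PATH")
if not ffmpeg_dir:
    raise RuntimeError("FFMPEG_PATH is not set; export it as shown above.")

# Run the bundled binary once to confirm it executes on this machine.
subprocess.run([os.path.join(ffmpeg_dir, "ffmpeg"), "-version"], check=True)
```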

### Download pretrained weights

```shell
git lfs install
git clone https://huggingface.co/BadToBest/EchoMimic pretrained_weights
```

The **pretrained_weights** directory is organized as follows:

```
./pretrained_weights/
├── denoising_unet.pth
├── reference_unet.pth
├── motion_module.pth
├── face_locator.pth
├── sd-vae-ft-mse
│   └── ...
├── sd-image-variations-diffusers
│   └── ...
└── audio_processor
    └── whisper_tiny.pt
```

Here **denoising_unet.pth**, **reference_unet.pth**, **motion_module.pth** and **face_locator.pth** are the main EchoMimic checkpoints. The other models in this hub can also be downloaded from their original hubs; thanks to the authors for their brilliant work:
- [sd-vae-ft-mse](https://huggingface.co/stabilityai/sd-vae-ft-mse)
- [sd-image-variations-diffusers](https://huggingface.co/lambdalabs/sd-image-variations-diffusers)
- [audio_processor (whisper)](https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt)
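As a quick sanity check after cloning, you might confirm the main checkpoints are in place. A minimal sketch, with file names taken from the tree above (the helper itself is not part of the repo):

```python
from pathlib import Path

# Main EchoMimic checkpoints, per the directory tree above.
required = [
    "denoising_unet.pth",
    "reference_unet.pth",
    "motion_module.pth",
    "face_locator.pth",
    "audio_processor/whisper_tiny.pt",
]

weights = Path("./pretrained_weights")
missing = [name for name in required if not (weights / name).is_file()]
if missing:
    raise FileNotFoundError(f"Missing checkpoints: {missing}")
print("All main checkpoints found.")
```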

### Audio-Driven Algorithm Inference

Run the Python inference scripts:

```bash
python -u infer_audio2vid.py
python -u infer_audio2vid_pose.py
```

### Audio-Driven Algorithm Inference on Your Own Cases

Edit the inference config file **./configs/prompts/animation.yaml** and add your own case:

```yaml
test_cases:
  "path/to/your/image":
    - "path/to/your/audio"
```

Then run the Python inference script:

```bash
python -u infer_audio2vid.py
```
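For reference, `test_cases` maps each reference image to a list of driving audio files. A sketch of how such a config could be iterated with PyYAML, purely for illustration (the actual scripts may parse it differently, e.g. via OmegaConf):

```python
import yaml

# animation.yaml maps each reference image to one or more driving audio files.
with open("./configs/prompts/animation.yaml") as f:
    config = yaml.safe_load(f)

for image_path, audio_paths in config["test_cases"].items():
    for audio_path in audio_paths:
        print(f"case: image={image_path} audio={audio_path}")
```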

### Motion Alignment between Reference Image and Driving Video

(First, download the checkpoints with the `_pose.pth` suffix from Hugging Face.)

Edit `driver_video` and `ref_image` in demo_motion_sync.py to your own paths (see the sketch below), then run:

```bash
python -u demo_motion_sync.py
```
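The two paths to edit look roughly like this; the variable names follow the README's wording, so check the script for the exact ones:

```python
# Inside demo_motion_sync.py: point these at your own files before running.
driver_video = "path/to/driving_video.mp4"   # video whose head motion is transferred
ref_image = "path/to/reference_image.png"    # portrait to animate
```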

### Audio & Pose-Driven Algorithm Inference

Edit ./configs/prompts/animation_pose.yaml, then run:

```bash
python -u infer_audio2vid_pose.py
```

### Pose-Driven Algorithm Inference

Set `draw_mouse=True` in line 135 of infer_audio2vid_pose.py, edit ./configs/prompts/animation_pose.yaml, then run:

```bash
python -u infer_audio2vid_pose.py
```

### Run the Gradio UI

Thanks to @Robin021 for the contribution:

```bash
python -u webgui.py --server_port=3000
```

## Release Plans

| Status | Milestone | ETA |
|:--------:|:-------------------------------------------------------------------------|:--:|
| ✅ | Inference source code of the audio-driven algorithm released on GitHub | 9th July, 2024 |
| ✅ | Pretrained models trained on English and Mandarin Chinese released | 9th July, 2024 |
| ✅ | Inference source code of the pose-driven algorithm released on GitHub | 13th July, 2024 |
| ✅ | Pretrained models with better pose control released | 13th July, 2024 |
| ✅ | Accelerated models released | 17th July, 2024 |
| 🚀 | Pretrained models with better singing performance | TBD |
| 🚀 | Large-scale and high-resolution Chinese-based talking head dataset | TBD |

## Acknowledgements

We would like to thank the contributors to the [AnimateDiff](https://github.com/guoyww/AnimateDiff), [Moore-AnimateAnyone](https://github.com/MooreThreads/Moore-AnimateAnyone) and [MuseTalk](https://github.com/TMElyralab/MuseTalk) repositories for their open research and exploration.

We are also grateful to [V-Express](https://github.com/tencent-ailab/V-Express) and [hallo](https://github.com/fudan-generative-vision/hallo) for their outstanding work in the area of diffusion-based talking heads.

If we have missed any open-source projects or related articles, we will promptly add them to this acknowledgement.

## Citation

If you find our work useful for your research, please consider citing the paper:

```bibtex
@misc{chen2024echomimic,
      title={EchoMimic: Lifelike Audio-Driven Portrait Animations through Editable Landmark Conditioning},
      author={Zhiyuan Chen and Jiajiong Cao and Zhiquan Chen and Yuming Li and Chenguang Ma},
      year={2024},
      eprint={2407.08136},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```

## Star History

[Star History Chart](https://star-history.com/#BadToBest/EchoMimic&Date)
webgui.py CHANGED

```diff
@@ -211,30 +211,67 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, fac
 
 with gr.Blocks() as demo:
     gr.Markdown('# EchoMimic')
-    gr.Markdown('
+    gr.Markdown('## Lifelike Audio-Driven Portrait Animations through Editable Landmark Conditioning')
+    gr.Markdown('Inference time: from ~7mins/240frames to ~50s/240frames on V100 GPU')
     with gr.Row():
         with gr.Column():
             uploaded_img = gr.Image(type="filepath", label="Reference Image")
             uploaded_audio = gr.Audio(type="filepath", label="Input Audio")
+            with gr.Accordion("Advanced Configuration", open=False):
+                with gr.Row():
+                    width = gr.Slider(label="Width", minimum=128, maximum=1024, value=default_values["width"])
+                    height = gr.Slider(label="Height", minimum=128, maximum=1024, value=default_values["height"])
+                with gr.Row():
+                    length = gr.Slider(label="Length", minimum=100, maximum=5000, value=default_values["length"])
+                    seed = gr.Slider(label="Seed", minimum=0, maximum=10000, value=default_values["seed"])
+                with gr.Row():
+                    facemask_dilation_ratio = gr.Slider(label="Facemask Dilation Ratio", minimum=0.0, maximum=1.0, step=0.01, value=default_values["facemask_dilation_ratio"])
+                    facecrop_dilation_ratio = gr.Slider(label="Facecrop Dilation Ratio", minimum=0.0, maximum=1.0, step=0.01, value=default_values["facecrop_dilation_ratio"])
+                with gr.Row():
+                    context_frames = gr.Slider(label="Context Frames", minimum=0, maximum=50, step=1, value=default_values["context_frames"])
+                    context_overlap = gr.Slider(label="Context Overlap", minimum=0, maximum=10, step=1, value=default_values["context_overlap"])
+                with gr.Row():
+                    cfg = gr.Slider(label="CFG", minimum=0.0, maximum=10.0, step=0.1, value=default_values["cfg"])
+                    steps = gr.Slider(label="Steps", minimum=1, maximum=100, step=1, value=default_values["steps"])
+                with gr.Row():
+                    sample_rate = gr.Slider(label="Sample Rate", minimum=8000, maximum=48000, step=1000, value=default_values["sample_rate"])
+                    fps = gr.Slider(label="FPS", minimum=1, maximum=60, step=1, value=default_values["fps"])
+                    device = gr.Radio(label="Device", choices=["cuda", "cpu"], value=default_values["device"])
+            generate_button = gr.Button("Generate Video")
         with gr.Column():
             output_video = gr.Video()
-    # 17 removed lines (old lines 221-237); their content is not shown in this view
+            gr.Examples(
+                label = "Portrait examples",
+                examples = [
+                    ['assets/test_imgs/a.png'],
+                    ['assets/test_imgs/b.png'],
+                    ['assets/test_imgs/c.png'],
+                    ['assets/test_imgs/d.png'],
+                    ['assets/test_imgs/e.png']
+                ],
+                inputs = [uploaded_img]
+            )
+            gr.Examples(
+                label = "Audio examples",
+                examples = [
+                    ['assets/test_audios/chunnuanhuakai.wav'],
+                    ['assets/test_audios/chunwang.wav'],
+                    ['assets/test_audios/echomimic_en_girl.wav'],
+                    ['assets/test_audios/echomimic_en.wav'],
+                    ['assets/test_audios/echomimic_girl.wav'],
+                    ['assets/test_audios/echomimic.wav'],
+                    ['assets/test_audios/jane.wav'],
+                    ['assets/test_audios/mei.wav'],
+                    ['assets/test_audios/walden.wav'],
+                    ['assets/test_audios/yun.wav'],
+                ],
+                inputs = [uploaded_audio]
+            )
+            gr.HTML("""
+            <a href="https://huggingface.co/spaces/fffiloni/EchoMimic?duplicate=true">
+                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-xl.svg" alt="Duplicate this Space">
+            </a>
+            """)
 
 def generate_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
 
@@ -263,7 +300,8 @@ with gr.Blocks() as demo:
             fps,
             device
         ],
-        outputs=output_video
+        outputs=output_video,
+        show_api=False
     )
 parser = argparse.ArgumentParser(description='EchoMimic')
 parser.add_argument('--server_name', type=str, default='0.0.0.0', help='Server name')
@@ -273,5 +311,5 @@ args = parser.parse_args()
 # demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
 
 if __name__ == '__main__':
-    demo.launch()
+    demo.queue(max_size=3).launch(show_api=False, show_error=True)
     #demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
```