Commit 9aaf024 · Parent: 8f28809
Upload folder using huggingface_hub

Note: this view is limited to 50 files because the commit contains too many changes.
- .gitattributes +23 -0
- .gitignore +174 -0
- .ipynb_checkpoints/Untitled-checkpoint.ipynb +6 -0
- LICENSE +21 -0
- README.md +268 -13
- Untitled.ipynb +533 -0
- app_sadtalker.py +111 -0
- checkpoints/SadTalker_V0.0.2_256.safetensors +3 -0
- checkpoints/SadTalker_V0.0.2_512.safetensors +3 -0
- checkpoints/mapping_00109-model.pth.tar +3 -0
- checkpoints/mapping_00229-model.pth.tar +3 -0
- cog.yaml +35 -0
- docs/FAQ.md +46 -0
- docs/best_practice.md +94 -0
- docs/changlelog.md +29 -0
- docs/example_crop.gif +3 -0
- docs/example_crop_still.gif +3 -0
- docs/example_full.gif +3 -0
- docs/example_full_crop.gif +0 -0
- docs/example_full_enhanced.gif +3 -0
- docs/face3d.md +48 -0
- docs/free_view_result.gif +3 -0
- docs/install.md +47 -0
- docs/resize_good.gif +3 -0
- docs/resize_no.gif +3 -0
- docs/sadtalker_logo.png +0 -0
- docs/using_ref_video.gif +3 -0
- docs/webui_extension.md +50 -0
- examples/driven_audio/RD_Radio31_000.wav +0 -0
- examples/driven_audio/RD_Radio34_002.wav +0 -0
- examples/driven_audio/RD_Radio36_000.wav +0 -0
- examples/driven_audio/RD_Radio40_000.wav +0 -0
- examples/driven_audio/bus_chinese.wav +0 -0
- examples/driven_audio/chinese_news.wav +3 -0
- examples/driven_audio/chinese_poem1.wav +0 -0
- examples/driven_audio/chinese_poem2.wav +0 -0
- examples/driven_audio/deyu.wav +3 -0
- examples/driven_audio/eluosi.wav +3 -0
- examples/driven_audio/fayu.wav +3 -0
- examples/driven_audio/imagine.wav +3 -0
- examples/driven_audio/itosinger1.wav +0 -0
- examples/driven_audio/japanese.wav +3 -0
- examples/ref_video/WDA_AlexandriaOcasioCortez_000.mp4 +3 -0
- examples/ref_video/WDA_KatieHill_000.mp4 +3 -0
- examples/source_image/art_0.png +0 -0
- examples/source_image/art_1.png +0 -0
- examples/source_image/art_10.png +0 -0
- examples/source_image/art_11.png +0 -0
- examples/source_image/art_12.png +0 -0
- examples/source_image/art_13.png +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,26 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+docs/example_crop.gif filter=lfs diff=lfs merge=lfs -text
+docs/example_crop_still.gif filter=lfs diff=lfs merge=lfs -text
+docs/example_full.gif filter=lfs diff=lfs merge=lfs -text
+docs/example_full_enhanced.gif filter=lfs diff=lfs merge=lfs -text
+docs/free_view_result.gif filter=lfs diff=lfs merge=lfs -text
+docs/resize_good.gif filter=lfs diff=lfs merge=lfs -text
+docs/resize_no.gif filter=lfs diff=lfs merge=lfs -text
+docs/using_ref_video.gif filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/chinese_news.wav filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/deyu.wav filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/eluosi.wav filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/fayu.wav filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/imagine.wav filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/japanese.wav filter=lfs diff=lfs merge=lfs -text
+examples/ref_video/WDA_AlexandriaOcasioCortez_000.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/ref_video/WDA_KatieHill_000.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_16.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_17.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_3.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_4.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_5.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_8.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_9.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,174 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

examples/results/*
gfpgan/*
checkpoints/*
assets/*
results/*
Dockerfile
start_docker.sh
start.sh

checkpoints

# Mac
.DS_Store
.ipynb_checkpoints/Untitled-checkpoint.ipynb
ADDED
@@ -0,0 +1,6 @@
{
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Tencent AI Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,13 +1,268 @@
<div align="center">

<img src='https://user-images.githubusercontent.com/4397546/229094115-862c747e-7397-4b54-ba4a-bd368bfe2e0f.png' width='500px'/>

<!--<h2> 😭 SadTalker: <span style="font-size:12px">Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation </span> </h2> -->

<a href='https://arxiv.org/abs/2211.12194'><img src='https://img.shields.io/badge/ArXiv-PDF-red'></a> <a href='https://sadtalker.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a> [Colab Demo](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) [Hugging Face Space](https://huggingface.co/spaces/vinthony/SadTalker) [SD-WebUI Colab](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) [Replicate](https://replicate.com/cjwbw/sadtalker)

<div>
<a target='_blank'>Wenxuan Zhang <sup>*,1,2</sup></a>
<a href='https://vinthony.github.io/' target='_blank'>Xiaodong Cun <sup>*,2</sup></a>
<a href='https://xuanwangvc.github.io/' target='_blank'>Xuan Wang <sup>3</sup></a>
<a href='https://yzhang2016.github.io/' target='_blank'>Yong Zhang <sup>2</sup></a>
<a href='https://xishen0220.github.io/' target='_blank'>Xi Shen <sup>2</sup></a> </br>
<a href='https://yuguo-xjtu.github.io/' target='_blank'>Yu Guo <sup>1</sup></a>
<a href='https://scholar.google.com/citations?hl=zh-CN&user=4oXBp9UAAAAJ' target='_blank'>Ying Shan <sup>2</sup></a>
<a target='_blank'>Fei Wang <sup>1</sup></a>
</div>
<br>
<div>
<sup>1</sup> Xi'an Jiaotong University &emsp; <sup>2</sup> Tencent AI Lab &emsp; <sup>3</sup> Ant Group
</div>
<br>
<i><strong><a href='https://arxiv.org/abs/2211.12194' target='_blank'>CVPR 2023</a></strong></i>
<br>
<br>

<b>TL;DR: single portrait image 🙎‍♂️ + audio 🎤 = talking head video 🎞.</b>

<br>

</div>

## 🔥 Highlight

- 🔥 The extension for [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) is online. Check out more details [here](docs/webui_extension.md).

https://user-images.githubusercontent.com/4397546/231495639-5d4bb925-ea64-4a36-a519-6389917dac29.mp4

- 🔥 `full image mode` is online! Check out [here](https://github.com/Winfredy/SadTalker#full-bodyimage-generation) for more details.

| still + enhancer in v0.0.1 | still + enhancer in v0.0.2 | [input image @bagbag1815](https://twitter.com/bagbag1815/status/1642754319094108161) |
|:--------------------:|:--------------------:|:----:|
| <video src="https://user-images.githubusercontent.com/48216707/229484996-5d7be64f-2553-4c9e-a452-c5cf0b8ebafe.mp4" type="video/mp4"> </video> | <video src="https://user-images.githubusercontent.com/4397546/230717873-355b7bf3-d3de-49f9-a439-9220e623fce7.mp4" type="video/mp4"> </video> | <img src='./examples/source_image/full_body_2.png' width='380'> |

- 🔥 Several new modes, e.g., `still mode`, `reference mode`, and `resize mode`, are online for better and more customized applications.

- 🔥 Happy to see more community demos on [bilibili](https://search.bilibili.com/all?keyword=sadtalker&from_source=webtop_search&spm_id_from=333.1007&search_source=3), [YouTube](https://www.youtube.com/results?search_query=sadtalker&sp=CAM%253D) and [Twitter #sadtalker](https://twitter.com/search?q=%23sadtalker&src=typed_query).

## 📋 Changelog (the previous changelog can be found [here](docs/changlelog.md))

- __[2023.06.12]__: Added more new features to the WebUI extension; see the discussion [here](https://github.com/OpenTalker/SadTalker/discussions/386).

- __[2023.06.05]__: Released a new 512 beta face model. Fixed some bugs and improved performance.

- __[2023.04.15]__: Added an automatic1111 Colab by @camenduru; thanks for this awesome Colab: [SD-WebUI Colab](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb).

- __[2023.04.12]__: Added a more detailed sd-webui installation document; fixed a reinstallation problem.

- __[2023.04.12]__: Fixed sd-webui safety issues caused by third-party packages; optimized the output path in `sd-webui-extension`.

- __[2023.04.08]__: ❗️❗️❗️ In v0.0.2, we add a logo watermark to the generated video to prevent abuse, since the results are very realistic.

- __[2023.04.08]__: v0.0.2: full image animation, added a Baidu drive link for downloading checkpoints, and optimized the enhancer logic.

## 🚧 TODO: see the discussion at https://github.com/OpenTalker/SadTalker/issues/280

## If you have any problem, please view our [FAQ](docs/FAQ.md) before opening an issue.

## ⚙️ 1. Installation.

Tutorials from communities: [Chinese Windows tutorial (中文windows教程)](https://www.bilibili.com/video/BV1Dc411W7V6/) | [Japanese course (日本語コース)](https://br-d.fanbox.cc/posts/5685086?utm_campaign=manage_post_page&utm_medium=share&utm_source=twitter)

### Linux:

1. Install [Anaconda](https://www.anaconda.com/), Python, and git.

2. Create the env and install the requirements.
```bash
git clone https://github.com/Winfredy/SadTalker.git

cd SadTalker

conda create -n sadtalker python=3.8

conda activate sadtalker

pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113

conda install ffmpeg

pip install -r requirements.txt

### tts is optional for the gradio demo.
### pip install TTS

```
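A quick way to sanity-check the environment before moving on — a minimal sketch, assuming the `sadtalker` env above is active:

```python
# environment sanity check: verify the pinned CUDA build of PyTorch installed correctly
import torch

print(torch.__version__)          # expected: 1.12.1+cu113
print(torch.cuda.is_available())  # True if the CUDA 11.3 runtime sees a GPU
```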
### Windows ([Chinese Windows tutorial / 中文windows教程](https://www.bilibili.com/video/BV1Dc411W7V6/)):

1. Install [Python 3.10.6](https://www.python.org/downloads/windows/), checking "Add Python to PATH".
2. Install [git](https://git-scm.com/download/win) manually (or `scoop install git` via [scoop](https://scoop.sh/)).
3. Install `ffmpeg`, following [these instructions](https://www.wikihow.com/Install-FFmpeg-on-Windows) (or use `scoop install ffmpeg` via [scoop](https://scoop.sh/)).
4. Download the SadTalker repository, for example by running `git clone https://github.com/Winfredy/SadTalker.git`.
5. Download the `checkpoint` and `gfpgan` models [below↓](https://github.com/Winfredy/SadTalker#-2-download-trained-models).
6. Run `start.bat` from Windows Explorer as a normal, non-administrator user; a gradio WebUI demo will be started.

### macOS:

More tips about installation on macOS, as well as the Dockerfile, can be found [here](docs/install.md).

## 📥 2. Download Trained Models.

You can run the following script to put all the models in the right place.

```bash
bash scripts/download_models.sh
```

Other alternatives:
> We also provide an offline patch (`gfpgan/`), so no model will be downloaded when generating.

**Google Drive**: download our pre-trained model from [this link (main checkpoints)](https://drive.google.com/file/d/1gwWh45pF7aelNP_P78uDJL8Sycep-K7j/view?usp=sharing) and [gfpgan (offline patch)](https://drive.google.com/file/d/19AIBsmfcHW6BRJmeqSFlG5fL445Xmsyi?usp=sharing).

**GitHub Release Page**: download all the files from the [latest GitHub release page](https://github.com/Winfredy/SadTalker/releases), then put them in ./checkpoints.

**Baidu Netdisk (百度云盘)**: we provide the models in [checkpoints, extraction code: sadt](https://pan.baidu.com/s/1P4fRgk9gaSutZnn8YW034Q?pwd=sadt) and [gfpgan, extraction code: sadt](https://pan.baidu.com/s/1kb1BCPaLOWX1JJb9Czbn6w?pwd=sadt).

<details><summary>Model Details</summary>

Model explanations:

##### New version
| Model | Description |
| :--- | :---------- |
| checkpoints/mapping_00229-model.pth.tar | Pre-trained MappingNet in SadTalker. |
| checkpoints/mapping_00109-model.pth.tar | Pre-trained MappingNet in SadTalker. |
| checkpoints/SadTalker_V0.0.2_256.safetensors | Packaged SadTalker checkpoints of the old version (256 face render). |
| checkpoints/SadTalker_V0.0.2_512.safetensors | Packaged SadTalker checkpoints of the old version (512 face render). |
| gfpgan/weights | Face detection and enhancement models used in `facexlib` and `gfpgan`. |

##### Old version
| Model | Description |
| :--- | :---------- |
| checkpoints/auido2exp_00300-model.pth | Pre-trained ExpNet in SadTalker. |
| checkpoints/auido2pose_00140-model.pth | Pre-trained PoseVAE in SadTalker. |
| checkpoints/mapping_00229-model.pth.tar | Pre-trained MappingNet in SadTalker. |
| checkpoints/mapping_00109-model.pth.tar | Pre-trained MappingNet in SadTalker. |
| checkpoints/facevid2vid_00189-model.pth.tar | Pre-trained face-vid2vid model from [the reappearance of face-vid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis). |
| checkpoints/epoch_20.pth | Pre-trained 3DMM extractor in [Deep3DFaceReconstruction](https://github.com/microsoft/Deep3DFaceReconstruction). |
| checkpoints/wav2lip.pth | Highly accurate lip-sync model from [Wav2Lip](https://github.com/Rudrabha/Wav2Lip). |
| checkpoints/shape_predictor_68_face_landmarks.dat | Face landmark model used in [dlib](http://dlib.net/). |
| checkpoints/BFM | 3DMM library files. |
| checkpoints/hub | Face detection models used in [face-alignment](https://github.com/1adrianb/face-alignment). |
| gfpgan/weights | Face detection and enhancement models used in `facexlib` and `gfpgan`. |

The final folder layout is shown in the screenshot below:

<img width="331" alt="image" src="https://user-images.githubusercontent.com/4397546/232511411-4ca75cbf-a434-48c5-9ae0-9009e8316484.png">

</details>
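Since this commit uploads the packaged checkpoints to the `kevinwang676/SadTalker` Space itself, they can also be fetched with the Hub client — a minimal sketch, assuming `huggingface_hub` is installed:

```python
# fetch one packaged checkpoint from this Space via the Hub API
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="kevinwang676/SadTalker",
    repo_type="space",  # the files live in a Space, not a model repo
    filename="checkpoints/SadTalker_V0.0.2_256.safetensors",
)
print("downloaded to", path)
```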
## 🔮 3. Quick Start ([Best Practice](docs/best_practice.md)).

### WebUI Demos:

**Online**: [Huggingface](https://huggingface.co/spaces/vinthony/SadTalker) | [SDWebUI-Colab](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) | [Colab](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb)

**Local Automatic1111 stable-diffusion webui extension**: please refer to the [Automatic1111 stable-diffusion webui docs](docs/webui_extension.md).

**Local gradio demo (highly recommended!)**: a demo similar to our [Hugging Face demo](https://huggingface.co/spaces/vinthony/SadTalker) can be run by:

```bash
## you need to manually install TTS (https://github.com/coqui-ai/TTS) via `pip install tts` in advance.
python app.py
```
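To drive the demo from text rather than a recorded clip, a short Coqui TTS sketch — the model name here is only an example, any installed TTS voice works:

```python
# synthesize a driving audio clip with Coqui TTS, then point the demo (or --driven_audio) at it
from TTS.api import TTS

tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")  # example voice
tts.tts_to_file(text="Hello, this clip will drive the talking head.",
                file_path="driven.wav")
```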
**Local gradio demo (highly recommended!)**:

- Windows: just double-click `webui.bat`; the requirements will be installed automatically.
- Linux/macOS: run `bash webui.sh` to start the webui.

### Manual usage:

##### Animating a portrait image with the default config:
```bash
python inference.py --driven_audio <audio.wav> \
                    --source_image <video.mp4 or picture.png> \
                    --enhancer gfpgan
```
The results will be saved in `results/$SOME_TIMESTAMP/*.mp4`.

##### Full body/image generation:

Use `--still` to generate a natural full-body video. You can add `--enhancer` to improve the quality of the generated video.

```bash
python inference.py --driven_audio <audio.wav> \
                    --source_image <video.mp4 or picture.png> \
                    --result_dir <a folder to store results> \
                    --still \
                    --preprocess full \
                    --enhancer gfpgan
```

More examples, configuration options, and tips can be found in the [>>> best practice documents <<<](docs/best_practice.md).
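To run the CLI above over many portraits in one go, a small batch-driver sketch using the example assets shipped in this repository:

```python
# drive inference.py over every example portrait with one example audio clip
import subprocess
from pathlib import Path

AUDIO = "examples/driven_audio/bus_chinese.wav"

for img in sorted(Path("examples/source_image").glob("*.png")):
    subprocess.run(
        ["python", "inference.py",
         "--driven_audio", AUDIO,
         "--source_image", str(img),
         "--result_dir", "results",
         "--still",
         "--preprocess", "full",
         "--enhancer", "gfpgan"],
        check=True,  # stop on the first failing portrait
    )
```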
## 🛎 Citation

If you find our work useful in your research, please consider citing:

```bibtex
@article{zhang2022sadtalker,
  title={SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation},
  author={Zhang, Wenxuan and Cun, Xiaodong and Wang, Xuan and Zhang, Yong and Shen, Xi and Guo, Yu and Shan, Ying and Wang, Fei},
  journal={arXiv preprint arXiv:2211.12194},
  year={2022}
}
```

## 💗 Acknowledgements

The Facerender code borrows heavily from [zhanglonghao's reproduction of face-vid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis) and [PIRender](https://github.com/RenYurui/PIRender). We thank the authors for sharing their wonderful code. In the training process, we also use models from [Deep3DFaceReconstruction](https://github.com/microsoft/Deep3DFaceReconstruction) and [Wav2Lip](https://github.com/Rudrabha/Wav2Lip); we thank them for their wonderful work.

See also these wonderful third-party libraries we use:

- **Face Utils**: https://github.com/xinntao/facexlib
- **Face Enhancement**: https://github.com/TencentARC/GFPGAN
- **Image/Video Enhancement**: https://github.com/xinntao/Real-ESRGAN

## 🥂 Extensions:

- [SadTalker-Video-Lip-Sync](https://github.com/Zz-ww/SadTalker-Video-Lip-Sync) from [@Zz-ww](https://github.com/Zz-ww): SadTalker for video lip editing

## 🥂 Related Works
- [StyleHEAT: One-Shot High-Resolution Editable Talking Face Generation via Pre-trained StyleGAN (ECCV 2022)](https://github.com/FeiiYin/StyleHEAT)
- [CodeTalker: Speech-Driven 3D Facial Animation with Discrete Motion Prior (CVPR 2023)](https://github.com/Doubiiu/CodeTalker)
- [VideoReTalking: Audio-based Lip Synchronization for Talking Head Video Editing In the Wild (SIGGRAPH Asia 2022)](https://github.com/vinthony/video-retalking)
- [DPE: Disentanglement of Pose and Expression for General Video Portrait Editing (CVPR 2023)](https://github.com/Carlyx/DPE)
- [3D GAN Inversion with Facial Symmetry Prior (CVPR 2023)](https://github.com/FeiiYin/SPI/)
- [T2M-GPT: Generating Human Motion from Textual Descriptions with Discrete Representations (CVPR 2023)](https://github.com/Mael-zys/T2M-GPT)

## 📢 Disclaimer

This is not an official product of Tencent. This repository can only be used for personal/research/non-commercial purposes.

LOGO: color and font suggestion: [ChatGPT](ai.com); logo font: [Montserrat Alternates](https://fonts.google.com/specimen/Montserrat+Alternates?preview.text=SadTalker&preview.text_type=custom&query=mont).

All copyright of the demo images and audio belongs to community users or comes from Stable Diffusion generation. Feel free to contact us if you feel uncomfortable.
Untitled.ipynb
ADDED
@@ -0,0 +1,533 @@
A Jupyter notebook (Python 3.10.11, ipykernel) with three code cells that push this working copy to the Hub:

Cell 1:
    from huggingface_hub import HfApi
    api = HfApi()

Cell 2 (its output is the Hugging Face login widget):
    from huggingface_hub import login
    login()

Cell 3:
    api.upload_folder(
        folder_path="/workspace/SadTalker",
        repo_id="kevinwang676/SadTalker",
        repo_type="space",
    )

[The remaining output cells are Jupyter progress-bar widgets for "Upload 31 LFS files", covering SadTalker_V0.0.2_256.safetensors and SadTalker_V0.0.2_512.safetensors (725 MB each), mapping_00109-model.pth.tar and mapping_00229-model.pth.tar (156 MB each), GFPGANv1.4.pth (349 MB), alignment_WFLW_4HG.pth (194 MB), detection_Resnet50_Final.pth (109 MB), parsing_parsenet.pth (85.3 MB), the docs GIFs (example_crop, example_crop_still, example_full, example_full_enhanced, free_view_result, resize_good, resize_no, using_ref_video), the driven-audio WAVs (chinese_news, deyu, eluosi, fayu, imagine, japanese), the two reference videos (WDA_AlexandriaOcasioCortez_000.mp4, WDA_KatieHill_000.mp4), and the source-image PNGs (art_3, art_4, art_5, art_8, art_9, art_16, art_17).]
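A small follow-up sketch to confirm the upload landed, using the same client as the notebook:

```python
# verification sketch: list what actually ended up in the Space
from huggingface_hub import HfApi

api = HfApi()
files = api.list_repo_files("kevinwang676/SadTalker", repo_type="space")
print(f"{len(files)} files in the Space")
print([f for f in files if f.startswith("checkpoints/")])
```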
app_sadtalker.py
ADDED
@@ -0,0 +1,111 @@
import os, sys
import gradio as gr
from src.gradio_demo import SadTalker


try:
    import webui  # in webui
    in_webui = True
except:
    in_webui = False


def toggle_audio_file(choice):
    if choice == False:
        return gr.update(visible=True), gr.update(visible=False)
    else:
        return gr.update(visible=False), gr.update(visible=True)

def ref_video_fn(path_of_ref_video):
    if path_of_ref_video is not None:
        return gr.update(value=True)
    else:
        return gr.update(value=False)

def sadtalker_demo(checkpoint_path='checkpoints', config_path='src/config', warpfn=None):

    sad_talker = SadTalker(checkpoint_path, config_path, lazy_load=True)

    with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
        gr.Markdown("<div align='center'> <h2> 😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023) </span> </h2> \
                    <a style='font-size:18px;color: #efefef' href='https://arxiv.org/abs/2211.12194'>Arxiv</a> \
                    <a style='font-size:18px;color: #efefef' href='https://sadtalker.github.io'>Homepage</a> \
                    <a style='font-size:18px;color: #efefef' href='https://github.com/Winfredy/SadTalker'> Github </div>")

        with gr.Row().style(equal_height=False):
            with gr.Column(variant='panel'):
                with gr.Tabs(elem_id="sadtalker_source_image"):
                    with gr.TabItem('Upload image'):
                        with gr.Row():
                            source_image = gr.Image(label="Source image", source="upload", type="filepath", elem_id="img2img_image").style(width=512)

                with gr.Tabs(elem_id="sadtalker_driven_audio"):
                    with gr.TabItem('Upload OR TTS'):
                        with gr.Column(variant='panel'):
                            driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath")

                        if sys.platform != 'win32' and not in_webui:
                            from src.utils.text2speech import TTSTalker
                            tts_talker = TTSTalker()
                            with gr.Column(variant='panel'):
                                input_text = gr.Textbox(label="Generating audio from text", lines=5, placeholder="please enter some text here, we generate the audio from text using @Coqui.ai TTS.")
                                tts = gr.Button('Generate audio', elem_id="sadtalker_audio_generate", variant='primary')
                                tts.click(fn=tts_talker.test, inputs=[input_text], outputs=[driven_audio])

            with gr.Column(variant='panel'):
                with gr.Tabs(elem_id="sadtalker_checkbox"):
                    with gr.TabItem('Settings'):
                        gr.Markdown("need help? please visit our [best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md) for more details")
                        with gr.Column(variant='panel'):
                            # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512)  # img2img_width
                            # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512)  # img2img_width
                            pose_style = gr.Slider(minimum=0, maximum=46, step=1, label="Pose style", value=0)
                            size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?")
                            preprocess_type = gr.Radio(['crop', 'resize', 'full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
                            is_still_mode = gr.Checkbox(label="Still Mode (fewer hand motion, works with preprocess `full`)")
                            batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=2)
                            enhancer = gr.Checkbox(label="GFPGAN as Face enhancer")
                            submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')

                with gr.Tabs(elem_id="sadtalker_genearted"):
                    gen_video = gr.Video(label="Generated video", format="mp4").style(width=256)

        if warpfn:
            submit.click(
                fn=warpfn(sad_talker.test),
                inputs=[source_image,
                        driven_audio,
                        preprocess_type,
                        is_still_mode,
                        enhancer,
                        batch_size,
                        size_of_image,
                        pose_style
                        ],
                outputs=[gen_video]
                )
        else:
            submit.click(
                fn=sad_talker.test,
                inputs=[source_image,
                        driven_audio,
                        preprocess_type,
                        is_still_mode,
                        enhancer,
                        batch_size,
                        size_of_image,
                        pose_style
                        ],
                outputs=[gen_video]
                )

    return sadtalker_interface


if __name__ == "__main__":

    demo = sadtalker_demo()
    demo.queue()
    demo.launch()
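For a non-default launch (fixed port, all interfaces), a variant sketch of the `__main__` block above — the gradio kwargs shown are standard, but double-check them against the version pinned by this Space:

```python
# alternative launch sketch for app_sadtalker.py
from app_sadtalker import sadtalker_demo

demo = sadtalker_demo(checkpoint_path="checkpoints", config_path="src/config")
demo.queue()                        # serialize GPU jobs across users
demo.launch(server_name="0.0.0.0",  # listen on all interfaces
            server_port=7860)
```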
checkpoints/SadTalker_V0.0.2_256.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c211f5d6de003516bf1bbda9f47049a4c9c99133b1ab565c6961e5af16477bff
size 725066984

checkpoints/SadTalker_V0.0.2_512.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e063f7ff5258240bdb0f7690783a7b1374e6a4a81ce8fa33456f4cd49694340
size 725066984

checkpoints/mapping_00109-model.pth.tar
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:84a8642468a3fcfdd9ab6be955267043116c2bec2284686a5262f1eaf017f64c
size 155779231

checkpoints/mapping_00229-model.pth.tar
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:62a1e06006cc963220f6477438518ed86e9788226c62ae382ddc42fbcefb83f1
size 155521183
cog.yaml
ADDED
@@ -0,0 +1,35 @@
build:
  gpu: true
  cuda: "11.3"
  python_version: "3.8"
  system_packages:
    - "ffmpeg"
    - "libgl1-mesa-glx"
    - "libglib2.0-0"
  python_packages:
    - "torch==1.12.1"
    - "torchvision==0.13.1"
    - "torchaudio==0.12.1"
    - "joblib==1.1.0"
    - "scikit-image==0.19.3"
    - "basicsr==1.4.2"
    - "facexlib==0.3.0"
    - "resampy==0.3.1"
    - "pydub==0.25.1"
    - "scipy==1.10.1"
    - "kornia==0.6.8"
    - "face_alignment==1.3.5"
    - "imageio==2.19.3"
    - "imageio-ffmpeg==0.4.7"
    - "librosa==0.9.2"
    - "tqdm==4.65.0"
    - "yacs==0.1.8"
    - "gfpgan==1.3.8"
    - "dlib-bin==19.24.1"
    - "av==10.0.0"
    - "trimesh==3.9.20"
  run:
    - mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/s3fd-619a316812.pth" "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
    - mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/2DFAN4-cd938726ad.zip" "https://www.adrianbulat.com/downloads/python-fan/2DFAN4-cd938726ad.zip"

predict: "predict.py:Predictor"
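With [Cog](https://github.com/replicate/cog) installed, this configuration can be used to build the container and run a local prediction. A sketch only: the input names below are assumptions for illustration; the real ones are defined in `predict.py`:

```bash
# first run builds the image from cog.yaml; the -i input names are illustrative
cog predict -i driven_audio=@examples/driven_audio/bus_chinese.wav \
            -i source_image=@examples/source_image/art_0.png
```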
docs/FAQ.md
ADDED
@@ -0,0 +1,46 @@

## Frequently Asked Questions

**Q: `ffmpeg` is not recognized as an internal or external command**

On Linux, you can install ffmpeg via `conda install ffmpeg`. On macOS, try `brew install ffmpeg`. On Windows, make sure `ffmpeg` is in the `%PATH%` as suggested in [#54](https://github.com/Winfredy/SadTalker/issues/54), then follow [this guide](https://www.geeksforgeeks.org/how-to-install-ffmpeg-on-windows/) to install `ffmpeg`.
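A quick sketch of the commands above, plus a check that the binary is actually visible:

```bash
conda install ffmpeg   # Linux (conda)
brew install ffmpeg    # macOS (Homebrew)
ffmpeg -version        # verify ffmpeg is on the PATH
```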

**Q: Running requirements**

Please refer to the discussion here: https://github.com/Winfredy/SadTalker/issues/124#issuecomment-1508113989

**Q: ModuleNotFoundError: No module named 'ai'**

Please check the size of the `epoch_20.pth` checkpoint. (https://github.com/Winfredy/SadTalker/issues/167, https://github.com/Winfredy/SadTalker/issues/113)
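A truncated download is the usual culprit; a quick sketch to verify (the path assumes the default `checkpoints/` layout):

```bash
ls -lh checkpoints/epoch_20.pth   # an unexpectedly small file means the download was cut short
```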

**Q: Illegal Hardware Error: Mac M1**

Please reinstall `dlib` individually via `pip install dlib`. (https://github.com/Winfredy/SadTalker/issues/129, https://github.com/Winfredy/SadTalker/issues/109)

**Q: FileNotFoundError: [Errno 2] No such file or directory: checkpoints\BFM_Fitting\similarity_Lm3D_all.mat**

Make sure you have downloaded the checkpoints and the gfpgan models as described [here](https://github.com/Winfredy/SadTalker#-2-download-trained-models) and placed them in the correct locations.

**Q: RuntimeError: unexpected EOF, expected 237192 more bytes. The file might be corrupted.**

These files are not downloaded automatically. Please update the code and download the gfpgan folders as described [here](https://github.com/Winfredy/SadTalker#-2-download-trained-models).

**Q: CUDA out of memory error**

Please refer to https://stackoverflow.com/questions/73747731/runtimeerror-cuda-out-of-memory-how-setting-max-split-size-mb

```
# windows
set PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
python inference.py ...

# linux
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
python inference.py ...
```

**Q: Error while decoding stream #0:0: Invalid data found when processing input [mp3float @ 0000015037628c00] Header missing**

Our method only supports `wav` or `mp3` files as input; please make sure the provided audio is in one of these formats.
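If your audio is in another format, you can convert it first with `ffmpeg`, e.g. (file names here are placeholders):

```bash
ffmpeg -i input.m4a output.wav
```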
docs/best_practice.md
ADDED
@@ -0,0 +1,94 @@
# Best Practices and Tips for Configuration

> Our model only works on photos of REAL people, or on portrait images close to a REAL person. The anime talking-head generation method will be released in the future.

Advanced configurations for `inference.py`:

| Name | Configuration | Default | Explanation |
|:------------- |:------------- |:----- | :------------- |
| Enhance Mode | `--enhancer` | None | Use `gfpgan` or `RestoreFormer` to enhance the generated face via a face restoration network.
| Background Enhancer | `--background_enhancer` | None | Use `realesrgan` to enhance the full video.
| Still Mode | `--still` | False | Use the same pose parameters as the original image; less head motion.
| Expressive Mode | `--expression_scale` | 1.0 | A larger value makes the expression motion stronger.
| Save path | `--result_dir` | `./results` | The results will be saved in this location.
| Preprocess | `--preprocess` | `crop` | Run on the cropped input image and produce results there. Other choices: `resize`, where the image is resized to a specific resolution, and `full`, which animates the full image; use `full` together with `--still` for better results.
| Ref Mode (eye) | `--ref_eyeblink` | None | A video path; we borrow the eye blinks from this reference video to provide more natural eyebrow movement.
| Ref Mode (pose) | `--ref_pose` | None | A video path; we borrow the head pose from this reference video.
| 3D Mode | `--face3dvis` | False | Needs additional installation. More details on generating the 3D face can be found [here](face3d.md).
| Free-view Mode | `--input_yaw`,<br> `--input_pitch`,<br> `--input_roll` | None | Generate novel-view or free-view 4D talking heads from a single image. More details can be found [here](https://github.com/Winfredy/SadTalker#generating-4d-free-view-talking-examples-from-audio-and-a-single-image).
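For example, a typical full-image run combining several of the options above (a sketch using the bundled example assets):

```bash
python inference.py --driven_audio examples/driven_audio/bus_chinese.wav \
                    --source_image examples/source_image/full_body_2.png \
                    --preprocess full --still --enhancer gfpgan \
                    --result_dir ./results
```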

### About `--preprocess`

Our method automatically handles the input image via the `crop`, `resize`, and `full` modes.

In `crop` mode, we generate only the cropped region, located via the facial keypoints, and animate that facial avatar. The animation of both expression and head pose is realistic.

> Still mode will stop the eye blinks and head pose movement.

| [input image @bagbag1815](https://twitter.com/bagbag1815/status/1642754319094108161) | crop | crop w/still |
|:--------------------: |:--------------------: | :----: |
| <img src='../examples/source_image/full_body_2.png' width='380'> |  |  |

In `resize` mode, we resize the whole image to generate the full talking-head video. It is therefore suited to images similar to ID photos. ⚠️ It will produce bad results for full-body images.

| <img src='../examples/source_image/full_body_2.png' width='380'> | <img src='../examples/source_image/full4.jpeg' width='380'> |
|:--------------------: |:--------------------: |
| ❌ not suitable for resize mode | ✅ good for resize mode |
| <img src='resize_no.gif'> | <img src='resize_good.gif' width='380'> |

In `full` mode, our model automatically processes the cropped region and pastes it back into the original image. Remember to use `--still` to keep the original head pose.

| input | `--still` | `--still` & `enhancer` |
|:--------------------: |:--------------------: | :--:|
| <img src='../examples/source_image/full_body_2.png' width='380'> | <img src='./example_full.gif' width='380'> | <img src='./example_full_enhanced.gif' width='380'>

### About `--enhancer`

For better facial quality, we integrate [gfpgan](https://github.com/TencentARC/GFPGAN) and [real-esrgan](https://github.com/xinntao/Real-ESRGAN) for different purposes. Just add `--enhancer <gfpgan or RestoreFormer>` or `--background_enhancer <realesrgan>` to enhance the face or the full image.

```bash
# make sure the above packages are available:
pip install gfpgan
pip install realesrgan
```
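A sketch of a run with both enhancers enabled (pick whichever face enhancer you prefer):

```bash
python inference.py --driven_audio <audio.wav> \
                    --source_image <picture.png> \
                    --enhancer gfpgan \
                    --background_enhancer realesrgan
```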

### About `--face3dvis`

This flag generates the 3D-rendered face and its 3D facial landmarks. More details can be found [here](face3d.md).

| Input | Animated 3d face |
|:-------------: | :-------------: |
| <img src='../examples/source_image/art_0.png' width='200px'> | <video src="https://user-images.githubusercontent.com/4397546/226856847-5a6a0a4d-a5ec-49e2-9b05-3206db65e8e3.mp4"></video> |

> Kindly enable the audio; GitHub plays the embedded video without sound by default.

#### Reference eye-blink mode

| Input, w/ reference video, reference video |
|:-------------: |
|  |
| If the reference video is shorter than the input audio, we will loop the reference video. |
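A sketch of borrowing eye blinks from one of the bundled reference clips:

```bash
python inference.py --driven_audio <audio.wav> \
                    --source_image <picture.png> \
                    --ref_eyeblink examples/ref_video/WDA_KatieHill_000.mp4
```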

#### Generating 4D free-view talking examples from audio and a single image

We use `input_yaw`, `input_pitch`, and `input_roll` to control the head pose. For example, `--input_yaw -20 30 10` means the head yaw changes from -20 to 30, and then from 30 to 10 degrees.

```bash
python inference.py --driven_audio <audio.wav> \
                    --source_image <video.mp4 or picture.png> \
                    --result_dir <a folder to store results> \
                    --input_yaw -20 30 10
```

| Results, Free-view results, Novel view results |
|:-------------: |
|  |
docs/changlelog.md
ADDED
@@ -0,0 +1,29 @@
## Changelog

- __[2023.04.06]__: The stable-diffusion webui extension is released.

- __[2023.04.03]__: Enabled TTS in the Hugging Face and local Gradio demos.

- __[2023.03.30]__: Launched the beta version of the full-body mode.

- __[2023.03.30]__: Launched a new feature: by using reference videos, our algorithm can generate videos with more natural eye blinking and some eyebrow movement.

- __[2023.03.29]__: `resize mode` is online via `python inference.py --preprocess resize`! We can now produce a larger crop of the image, as discussed in https://github.com/Winfredy/SadTalker/issues/35.

- __[2023.03.29]__: The local Gradio demo is online! Run `python app.py` to start it. A new `requirements.txt` is used to avoid the bugs in `librosa`.

- __[2023.03.28]__: The online demo is launched in [](https://huggingface.co/spaces/vinthony/SadTalker), thanks AK!

- __[2023.03.22]__: Launched a new feature: generating 3D face animation from a single image. New applications based on it will follow.

- __[2023.03.22]__: Launched a new feature: `still mode`, where only a small head pose will be produced via `python inference.py --still`.

- __[2023.03.18]__: Added support for `expression intensity`; you can now change the intensity of the generated motion: `python inference.py --expression_scale 1.3` (some value > 1).

- __[2023.03.18]__: Reorganized the data folders; you can now download the checkpoints automatically using `bash scripts/download_models.sh`.
- __[2023.03.18]__: We have officially integrated [GFPGAN](https://github.com/TencentARC/GFPGAN) for face enhancement; use `python inference.py --enhancer gfpgan` for better visual quality.
- __[2023.03.14]__: Pinned the version of the `joblib` package to remove errors when using `librosa`; [](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) is online!
- __[2023.03.06]__: Fixed some bugs in the code and errors in installation.
- __[2023.03.03]__: Released the test code for audio-driven single-image animation!
- __[2023.02.28]__: SadTalker has been accepted by CVPR 2023!
docs/example_crop.gif
ADDED
Git LFS Details

docs/example_crop_still.gif
ADDED
Git LFS Details

docs/example_full.gif
ADDED
Git LFS Details

docs/example_full_crop.gif
ADDED

docs/example_full_enhanced.gif
ADDED
Git LFS Details
docs/face3d.md
ADDED
@@ -0,0 +1,48 @@
## 3D Face Visualization

We use pytorch3d to visualize the 3D face produced from a single image.

Since it is not easy to install, we provide a new installation guide here:

```bash
git clone https://github.com/Winfredy/SadTalker.git
cd SadTalker
conda create -n sadtalker3d python=3.8
source activate sadtalker3d

conda install ffmpeg
conda install -c fvcore -c iopath -c conda-forge fvcore iopath
conda install libgcc gmp

pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113

# install pytorch3d
pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py38_cu113_pyt1110/download.html

pip install -r requirements3d.txt

### install gfpgan for the enhancer
pip install git+https://github.com/TencentARC/GFPGAN

### if a gcc version problem occurs when importing `_C` from pytorch3d, add the anaconda lib path to LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/$YOUR_ANACONDA_PATH/lib/
```

Then, generate the result via:

```bash
python inference.py --driven_audio <audio.wav> \
                    --source_image <video.mp4 or picture.png> \
                    --result_dir <a folder to store results> \
                    --face3dvis
```

The results will be saved in the result folder with the file name `face3d.mp4`.

More applications around the 3D face will be released.
docs/free_view_result.gif
ADDED
Git LFS Details
docs/install.md
ADDED
@@ -0,0 +1,47 @@

### macOS (tested on M1, macOS 13.3)

```
git clone https://github.com/Winfredy/SadTalker.git

cd SadTalker

conda create -n sadtalker python=3.8

conda activate sadtalker

# install pytorch 2.0
pip install torch torchvision torchaudio

conda install ffmpeg

pip install -r requirements.txt

pip install dlib  # macOS needs to install the original dlib.
```
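After installation, a quick smoke test with the bundled examples (a sketch; see `best_practice.md` for the full flag reference):

```bash
python inference.py --driven_audio examples/driven_audio/bus_chinese.wav \
                    --source_image examples/source_image/art_0.png \
                    --result_dir ./results
```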

### Windows Native

- Make sure you have `ffmpeg` in the `%PATH%` as suggested in [#54](https://github.com/Winfredy/SadTalker/issues/54); follow [this guide](https://www.geeksforgeeks.org/how-to-install-ffmpeg-on-windows/) to install `ffmpeg`.

### Windows WSL

- Make sure the environment variable is set: `export LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH`

### Docker installation

A Dockerfile is also provided by [@thegenerativegeneration](https://github.com/thegenerativegeneration) on [Docker Hub](https://hub.docker.com/repository/docker/wawa9000/sadtalker); it can be used directly:

```bash
docker run --gpus "all" --rm -v $(pwd):/host_dir wawa9000/sadtalker \
    --driven_audio /host_dir/deyu.wav \
    --source_image /host_dir/image.jpg \
    --expression_scale 1.0 \
    --still \
    --result_dir /host_dir
```
docs/resize_good.gif
ADDED
Git LFS Details

docs/resize_no.gif
ADDED
Git LFS Details

docs/sadtalker_logo.png
ADDED

docs/using_ref_video.gif
ADDED
Git LFS Details
docs/webui_extension.md
ADDED
@@ -0,0 +1,50 @@

## Run SadTalker as a Stable Diffusion WebUI Extension

1. Install the latest version of [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) and install SadTalker via the `Extensions` tab.
<img width="726" alt="image" src="https://user-images.githubusercontent.com/4397546/230698519-267d1d1f-6e99-4dd4-81e1-7b889259efbd.png">

2. Download the checkpoints manually. For Linux and Mac:

```bash
cd SOMEWHERE_YOU_LIKE

bash <(wget -qO- https://raw.githubusercontent.com/Winfredy/SadTalker/main/scripts/download_models.sh)
```

For Windows, you can download all the checkpoints from [Google Drive](https://drive.google.com/drive/folders/1Wd88VDoLhVzYsQ30_qDVluQr_Xm46yHT?usp=sharing) or [Baidu Cloud](https://pan.baidu.com/s/1nXuVNd0exUl37ISwWqbFGA?pwd=sadt) (extraction code: sadt).

3.1. Option 1: put the checkpoints in `stable-diffusion-webui/models/SadTalker` or `stable-diffusion-webui/extensions/SadTalker/checkpoints/`; they will be detected automatically.

3.2. Option 2: set the path of `SADTALKER_CHECKPOINTS` in `webui_user.sh` (Linux) or `webui_user.bat` (Windows):

> This only works if you start the webui directly from `webui_user.sh` or `webui_user.bat`.

```bash
# windows (webui_user.bat)
set SADTALKER_CHECKPOINTS=D:\SadTalker\checkpoints

# linux (webui_user.sh)
export SADTALKER_CHECKPOINTS=/path/to/SadTalker/checkpoints
```

4. Then start the webui via `webui.sh`/`webui_user.sh` (Linux) or `webui_user.bat` (Windows) or any other method; SadTalker can then be used in stable-diffusion-webui directly.

<img width="726" alt="image" src="https://user-images.githubusercontent.com/4397546/230698614-58015182-2916-4240-b324-e69022ef75b3.png">

## Questions

1. If you are running on CPU, you need to specify `--disable-safe-unpickle` in `webui_user.sh` or `webui_user.bat`.

```bash
# windows (webui_user.bat)
set COMMANDLINE_ARGS="--disable-safe-unpickle"

# linux (webui_user.sh)
export COMMANDLINE_ARGS="--disable-safe-unpickle"
```

(See some [important discussion](https://github.com/Winfredy/SadTalker/issues/78) if you are unable to use `full` mode.)
examples/driven_audio/RD_Radio31_000.wav
ADDED
Binary file (512 kB)

examples/driven_audio/RD_Radio34_002.wav
ADDED
Binary file (512 kB)

examples/driven_audio/RD_Radio36_000.wav
ADDED
Binary file (512 kB)

examples/driven_audio/RD_Radio40_000.wav
ADDED
Binary file (512 kB)

examples/driven_audio/bus_chinese.wav
ADDED
Binary file (652 kB)

examples/driven_audio/chinese_news.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7b0f4d313a1ca671bc4831d60bcf0c12225efbffe6c0e93e54fbfe9bcd4021cb
size 1536078

examples/driven_audio/chinese_poem1.wav
ADDED
Binary file (263 kB)

examples/driven_audio/chinese_poem2.wav
ADDED
Binary file (461 kB)

examples/driven_audio/deyu.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba1839c57770a2ab0b593ce814344bfd4d750da02acc9be9e8cf5b9113a0f88a
size 2694784

examples/driven_audio/eluosi.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d4a3593815dc7b68c256672baa61934c9479efa770af2065fb0886f02713606e
size 1786672

examples/driven_audio/fayu.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:16ebd13626ae4171030b4ea05cceef06078483c352e4b68d469fc2a52bfffceb
size 1940428

examples/driven_audio/imagine.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2db410217e074d91ae6011e1c5dc0b94f02d05d381c50af8e54253eeacad17d2
size 1618510

examples/driven_audio/itosinger1.wav
ADDED
Binary file (500 kB)

examples/driven_audio/japanese.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3db5426d0b158799e2be4f609b11f75bfbd4affffe18e9a1c8e6f241fcdedcfc
size 2622712

examples/ref_video/WDA_AlexandriaOcasioCortez_000.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a85242c3fc4d50e2202cea393b9e7ee59019759b68e78e26a254d528c22615a7
size 2257667

examples/ref_video/WDA_KatieHill_000.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1fbb4cfd64eedc49b170c441714a9c4fd5e2c2f8a11592070ad89fbd257f2817
size 3548230

examples/source_image/art_0.png
ADDED

examples/source_image/art_1.png
ADDED

examples/source_image/art_10.png
ADDED

examples/source_image/art_11.png
ADDED

examples/source_image/art_12.png
ADDED

examples/source_image/art_13.png
ADDED