John6666 committed on
Commit
488e66f
1 Parent(s): d08d468

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +13 -14
  2. app.py +205 -215
  3. packages.txt +1 -2
  4. pre-requirements.txt +1 -0
  5. requirements.txt +20 -19
README.md CHANGED
@@ -1,14 +1,13 @@
1
- ---
2
- title: SadTalker (Gradio 4.x)
3
- emoji: 😭
4
- colorFrom: purple
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 4.40.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: SadTalker (Gradio 4.x, latest PyTorch)
3
+ emoji: 😭
4
+ colorFrom: purple
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 4.42.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py CHANGED
@@ -1,216 +1,206 @@
1
- import spaces
2
- import os, sys
3
- import tempfile
4
- import gradio as gr
5
- from src.gradio_demo import SadTalker
6
- # from src.utils.text2speech import TTSTalker
7
- from huggingface_hub import snapshot_download
8
-
9
- def get_source_image(image):
10
- return image
11
-
12
- try:
13
- import webui # in webui
14
- in_webui = True
15
- except:
16
- in_webui = False
17
-
18
-
19
- def toggle_audio_file(choice):
20
- if choice == False:
21
- return gr.update(visible=True), gr.update(visible=False)
22
- else:
23
- return gr.update(visible=False), gr.update(visible=True)
24
-
25
- def ref_video_fn(path_of_ref_video):
26
- if path_of_ref_video is not None:
27
- return gr.update(value=True)
28
- else:
29
- return gr.update(value=False)
30
-
31
- def download_model():
32
- REPO_ID = 'vinthony/SadTalker-V002rc'
33
- snapshot_download(repo_id=REPO_ID, local_dir='./checkpoints', local_dir_use_symlinks=True)
34
-
35
- def sadtalker_demo():
36
-
37
- download_model()
38
-
39
- sad_talker = SadTalker(lazy_load=True)
40
- # tts_talker = TTSTalker()
41
-
42
- with gr.Blocks(analytics_enabled=False, theme="Hev832/Applio") as sadtalker_interface:
43
- gr.Markdown("<div align='center'> <h2> 😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023) </span> </h2> \
44
- <a style='font-size:18px;color: #efefef' href='https://arxiv.org/abs/2211.12194'>Arxiv</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
45
- <a style='font-size:18px;color: #efefef' href='https://sadtalker.github.io'>Homepage</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
46
- <a style='font-size:18px;color: #efefef' href='https://github.com/Winfredy/SadTalker'> Github </div>")
47
-
48
-
49
- gr.Markdown("""
50
- <b>You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue. <a style='display:inline-block' href="https://huggingface.co/spaces/vinthony/SadTalker?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a></b> \
51
- <br/><b>Alternatively, try our GitHub <a href=https://github.com/Winfredy/SadTalker> code </a> on your own GPU. </b> <a style='display:inline-block' href="https://github.com/Winfredy/SadTalker"><img src="https://img.shields.io/github/stars/Winfredy/SadTalker?style=social"/></a> \
52
- """)
53
-
54
- with gr.Row():
55
- with gr.Column(variant='panel'):
56
- with gr.Tabs(elem_id="sadtalker_source_image"):
57
- with gr.TabItem('Source image'):
58
- with gr.Row():
59
- source_image = gr.Image(label="Source image", type="filepath", elem_id="img2img_image")
60
-
61
-
62
- with gr.TabItem('Driving Methods',elem_id="sadtalker_driven_audio"):
63
- gr.Markdown("Possible driving combinations: <br> 1. Audio only 2. Audio/IDLE Mode + Ref Video(pose, blink, pose+blink) 3. IDLE Mode only 4. Ref Video only (all) ")
64
- with gr.Row():
65
- driven_audio = gr.Audio(label="Input audio", type="filepath")
66
- driven_audio_no = gr.Audio(label="Use IDLE mode, no audio is required", type="filepath", visible=False)
67
- with gr.Column():
68
- use_idle_mode = gr.Checkbox(label="Use Idle Animation")
69
- length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.")
70
- use_idle_mode.change(toggle_audio_file, inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no]) # todo
71
- with gr.Row():
72
- ref_video = gr.Video(label="Reference Video", elem_id="vidref")
73
-
74
- with gr.Column():
75
- use_ref_video = gr.Checkbox(label="Use Reference Video")
76
- ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video',info="How to borrow from reference Video?((fully transfer, aka, video driving mode))")
77
-
78
-
79
- ref_video.change(ref_video_fn, inputs=ref_video, outputs=[use_ref_video]) # todo
80
-
81
-
82
-
83
- with gr.TabItem('Settings',elem_id="sadtalker_checkbox"):
84
- gr.Markdown("need help? please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more detials")
85
- with gr.Column(variant='panel'):
86
- with gr.Row():
87
- pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) #
88
- exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) #
89
- blink_every = gr.Checkbox(label="use eye blink", value=True)
90
- with gr.Row():
91
- size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?") #
92
- preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
93
- with gr.Row():
94
- is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
95
- facerender = gr.Radio(['facevid2vid','pirender'], value='facevid2vid', label='facerender', info="which face render?")
96
- with gr.Row():
97
- batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1)
98
- enhancer = gr.Checkbox(label="GFPGAN as Face enhancer")
99
-
100
- submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
101
-
102
- with gr.Tabs(elem_id="sadtalker_genearted"):
103
- gen_video = gr.Video(label="Generated video", format="mp4")
104
-
105
-
106
-
107
- submit.click(
108
- fn=sad_talker.test,
109
- inputs=[source_image,
110
- driven_audio,
111
- preprocess_type,
112
- is_still_mode,
113
- enhancer,
114
- batch_size,
115
- size_of_image,
116
- pose_style,
117
- facerender,
118
- exp_weight,
119
- use_ref_video,
120
- ref_video,
121
- ref_info,
122
- use_idle_mode,
123
- length_of_audio,
124
- blink_every
125
- ],
126
- outputs=[gen_video],
127
- )
128
-
129
- with gr.Row():
130
- examples = [
131
- [
132
- 'examples/source_image/full_body_1.png',
133
- 'examples/driven_audio/bus_chinese.wav',
134
- 'crop',
135
- True,
136
- False
137
- ],
138
- [
139
- 'examples/source_image/full_body_2.png',
140
- 'examples/driven_audio/japanese.wav',
141
- 'crop',
142
- False,
143
- False
144
- ],
145
- [
146
- 'examples/source_image/full3.png',
147
- 'examples/driven_audio/deyu.wav',
148
- 'crop',
149
- False,
150
- True
151
- ],
152
- [
153
- 'examples/source_image/full4.jpeg',
154
- 'examples/driven_audio/eluosi.wav',
155
- 'full',
156
- False,
157
- True
158
- ],
159
- [
160
- 'examples/source_image/full4.jpeg',
161
- 'examples/driven_audio/imagine.wav',
162
- 'full',
163
- True,
164
- True
165
- ],
166
- [
167
- 'examples/source_image/full_body_1.png',
168
- 'examples/driven_audio/bus_chinese.wav',
169
- 'full',
170
- True,
171
- False
172
- ],
173
- [
174
- 'examples/source_image/art_13.png',
175
- 'examples/driven_audio/fayu.wav',
176
- 'resize',
177
- True,
178
- False
179
- ],
180
- [
181
- 'examples/source_image/art_5.png',
182
- 'examples/driven_audio/chinese_news.wav',
183
- 'resize',
184
- False,
185
- False
186
- ],
187
- [
188
- 'examples/source_image/art_5.png',
189
- 'examples/driven_audio/RD_Radio31_000.wav',
190
- 'resize',
191
- True,
192
- True
193
- ],
194
- ]
195
- gr.Examples(examples=examples,
196
- inputs=[
197
- source_image,
198
- driven_audio,
199
- preprocess_type,
200
- is_still_mode,
201
- enhancer],
202
- outputs=[gen_video],
203
- fn=sad_talker.test,
204
- cache_examples=False
205
- #cache_examples=os.getenv('SYSTEM') == 'spaces'
206
- ) #
207
-
208
- return sadtalker_interface
209
-
210
-
211
- if __name__ == "__main__":
212
- demo = sadtalker_demo()
213
- demo.queue()
214
- demo.launch()
215
- #demo.queue(max_size=10, api_open=True)
216
  #demo.launch(debug=True)
 
1
+ import spaces
2
+ import os, sys
3
+ import tempfile
4
+ import gradio as gr
5
+ from src.gradio_demo import SadTalker
6
+ # from src.utils.text2speech import TTSTalker
7
+ from huggingface_hub import snapshot_download
8
+
9
+ def get_source_image(image):
10
+ return image
11
+
12
+ try:
13
+ import webui # in webui
14
+ in_webui = True
15
+ except:
16
+ in_webui = False
17
+
18
+
19
+ def toggle_audio_file(choice):
20
+ if choice == False:
21
+ return gr.update(visible=True), gr.update(visible=False)
22
+ else:
23
+ return gr.update(visible=False), gr.update(visible=True)
24
+
25
+ def ref_video_fn(path_of_ref_video):
26
+ if path_of_ref_video is not None:
27
+ return gr.update(value=True)
28
+ else:
29
+ return gr.update(value=False)
30
+
31
+ def download_model():
32
+ REPO_ID = 'vinthony/SadTalker-V002rc'
33
+ snapshot_download(repo_id=REPO_ID, local_dir='./checkpoints', local_dir_use_symlinks=True)
34
+
35
+ def sadtalker_demo():
36
+
37
+ download_model()
38
+
39
+ sad_talker = SadTalker(lazy_load=True)
40
+ # tts_talker = TTSTalker()
41
+
42
+ with gr.Blocks(analytics_enabled=False, theme="Hev832/Applio") as sadtalker_interface:
43
+ gr.Markdown("<div align='center'> <h2> 😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023) </span> </h2> \
44
+ <a style='font-size:18px;color: #efefef' href='https://arxiv.org/abs/2211.12194'>Arxiv</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
45
+ <a style='font-size:18px;color: #efefef' href='https://sadtalker.github.io'>Homepage</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
46
+ <a style='font-size:18px;color: #efefef' href='https://github.com/Winfredy/SadTalker'> Github </div>")
47
+
48
+ gr.Markdown("""
49
+ <b>You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue. <a style='display:inline-block' href="https://huggingface.co/spaces/vinthony/SadTalker?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a></b> \
50
+ <br/><b>Alternatively, try our GitHub <a href=https://github.com/Winfredy/SadTalker> code </a> on your own GPU. </b> <a style='display:inline-block' href="https://github.com/Winfredy/SadTalker"><img src="https://img.shields.io/github/stars/Winfredy/SadTalker?style=social"/></a> \
51
+ """)
52
+
53
+ with gr.Row():
54
+ with gr.Column(variant='panel'):
55
+ with gr.Tabs(elem_id="sadtalker_source_image"):
56
+ with gr.TabItem('Source image'):
57
+ with gr.Row():
58
+ source_image = gr.Image(label="Source image", type="filepath", elem_id="img2img_image")
59
+
60
+ with gr.TabItem('Driving Methods',elem_id="sadtalker_driven_audio"):
61
+ gr.Markdown("Possible driving combinations: <br> 1. Audio only 2. Audio/IDLE Mode + Ref Video(pose, blink, pose+blink) 3. IDLE Mode only 4. Ref Video only (all) ")
62
+ with gr.Row():
63
+ driven_audio = gr.Audio(label="Input audio", type="filepath")
64
+ driven_audio_no = gr.Audio(label="Use IDLE mode, no audio is required", type="filepath", visible=False)
65
+ with gr.Column():
66
+ use_idle_mode = gr.Checkbox(label="Use Idle Animation")
67
+ length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.")
68
+ use_idle_mode.change(toggle_audio_file, inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no]) # todo
69
+ with gr.Row():
70
+ ref_video = gr.Video(label="Reference Video", elem_id="vidref")
71
+ with gr.Column():
72
+ use_ref_video = gr.Checkbox(label="Use Reference Video")
73
+ ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video',info="How to borrow from reference Video?((fully transfer, aka, video driving mode))")
74
+ ref_video.change(ref_video_fn, inputs=ref_video, outputs=[use_ref_video]) # todo
75
+
76
+ with gr.TabItem('Settings',elem_id="sadtalker_checkbox"):
77
+ gr.Markdown("need help? please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more detials")
78
+ with gr.Column(variant='panel'):
79
+ with gr.Row():
80
+ pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) #
81
+ exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) #
82
+ blink_every = gr.Checkbox(label="use eye blink", value=True)
83
+ with gr.Row():
84
+ size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?") #
85
+ preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
86
+ with gr.Row():
87
+ is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
88
+ facerender = gr.Radio(['facevid2vid','pirender'], value='facevid2vid', label='facerender', info="which face render?")
89
+ with gr.Row():
90
+ batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1)
91
+ enhancer = gr.Checkbox(label="GFPGAN as Face enhancer")
92
+
93
+ submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
94
+
95
+ with gr.Tabs(elem_id="sadtalker_genearted"):
96
+ gen_video = gr.Video(label="Generated video", format="mp4")
97
+
98
+ submit.click(
99
+ fn=sad_talker.test,
100
+ inputs=[source_image,
101
+ driven_audio,
102
+ preprocess_type,
103
+ is_still_mode,
104
+ enhancer,
105
+ batch_size,
106
+ size_of_image,
107
+ pose_style,
108
+ facerender,
109
+ exp_weight,
110
+ use_ref_video,
111
+ ref_video,
112
+ ref_info,
113
+ use_idle_mode,
114
+ length_of_audio,
115
+ blink_every
116
+ ],
117
+ outputs=[gen_video],
118
+ )
119
+
120
+ with gr.Row():
121
+ examples = [
122
+ [
123
+ 'examples/source_image/full_body_1.png',
124
+ 'examples/driven_audio/bus_chinese.wav',
125
+ 'crop',
126
+ True,
127
+ False
128
+ ],
129
+ [
130
+ 'examples/source_image/full_body_2.png',
131
+ 'examples/driven_audio/japanese.wav',
132
+ 'crop',
133
+ False,
134
+ False
135
+ ],
136
+ [
137
+ 'examples/source_image/full3.png',
138
+ 'examples/driven_audio/deyu.wav',
139
+ 'crop',
140
+ False,
141
+ True
142
+ ],
143
+ [
144
+ 'examples/source_image/full4.jpeg',
145
+ 'examples/driven_audio/eluosi.wav',
146
+ 'full',
147
+ False,
148
+ True
149
+ ],
150
+ [
151
+ 'examples/source_image/full4.jpeg',
152
+ 'examples/driven_audio/imagine.wav',
153
+ 'full',
154
+ True,
155
+ True
156
+ ],
157
+ [
158
+ 'examples/source_image/full_body_1.png',
159
+ 'examples/driven_audio/bus_chinese.wav',
160
+ 'full',
161
+ True,
162
+ False
163
+ ],
164
+ [
165
+ 'examples/source_image/art_13.png',
166
+ 'examples/driven_audio/fayu.wav',
167
+ 'resize',
168
+ True,
169
+ False
170
+ ],
171
+ [
172
+ 'examples/source_image/art_5.png',
173
+ 'examples/driven_audio/chinese_news.wav',
174
+ 'resize',
175
+ False,
176
+ False
177
+ ],
178
+ [
179
+ 'examples/source_image/art_5.png',
180
+ 'examples/driven_audio/RD_Radio31_000.wav',
181
+ 'resize',
182
+ True,
183
+ True
184
+ ],
185
+ ]
186
+ gr.Examples(examples=examples,
187
+ inputs=[
188
+ source_image,
189
+ driven_audio,
190
+ preprocess_type,
191
+ is_still_mode,
192
+ enhancer],
193
+ outputs=[gen_video],
194
+ fn=sad_talker.test,
195
+ cache_examples=False
196
+ #cache_examples=os.getenv('SYSTEM') == 'spaces'
197
+ ) #
198
+
199
+ return sadtalker_interface
200
+
201
+ if __name__ == "__main__":
202
+ demo = sadtalker_demo()
203
+ demo.queue()
204
+ demo.launch()
205
+ #demo.queue(max_size=10, api_open=True)
 
 
 
 
 
 
 
 
 
 
206
  #demo.launch(debug=True)
packages.txt CHANGED
@@ -1,2 +1 @@
1
- ffmpeg
2
- libsndfile1
 
1
+ ffmpeg libsndfile1
 
pre-requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ pip>=23.0.0
requirements.txt CHANGED
@@ -1,24 +1,25 @@
1
- torch==1.13.1
2
- torchvision==0.14.1
3
- torchaudio==0.13.1
4
- numpy==1.23.5
5
- face_alignment==1.3.0
6
- imageio==2.19.3
7
- imageio-ffmpeg==0.4.7
8
- librosa==0.8.0
9
- numba==0.56.4
10
- resampy==0.3.1
11
- pydub==0.25.1
12
  scipy
13
- kornia==0.6.8
14
  tqdm
15
- yacs==0.1.8
16
  pyyaml
17
- joblib==1.1.0
18
- scikit-image==0.19.3
19
- basicsr==1.4.2
20
- facexlib==0.3.0
 
21
  dlib-bin
22
- gfpgan
23
  av
24
- safetensors
 
 
1
+ torch
2
+ torchvision
3
+ torchaudio
4
+ numpy
5
+ face_alignment
6
+ imageio
7
+ imageio-ffmpeg
8
+ librosa
9
+ numba
10
+ resampy
11
+ pydub
12
  scipy
13
+ kornia
14
  tqdm
15
+ yacs
16
  pyyaml
17
+ joblib
18
+ scikit-image
19
+ git+https://github.com/XPixelGroup/BasicSR
20
+ git+https://github.com/TencentARC/GFPGAN
21
+ facexlib
22
  dlib-bin
 
23
  av
24
+ safetensors
25
+ TTS