Commit da8e881 · 0 parent(s)
initial commit (`f74eed7` in https://bitbucket.org/maum-system/cvpr22-demo-gradio)
Files changed:
- .dockerignore +2 -0
- .gitattributes +31 -0
- .gitignore +13 -0
- README.md +46 -0
- app.py +214 -0
- background_image/black.png +3 -0
- background_image/cvpr.png +3 -0
- background_image/river.mp4 +3 -0
- background_image/sky.mp4 +3 -0
- client_rest.py +73 -0
- docs/article.md +6 -0
- docs/description.txt +4 -0
- docs/title.txt +1 -0
- lang.yaml +20 -0
- output_file/.gitkeep +0 -0
- requirements.txt +6 -0
- sample_text.yaml +8 -0
- translator/__init__.py +1 -0
- translator/v3.py +58 -0
.dockerignore
ADDED
@@ -0,0 +1,2 @@
+output_file/*
+!output_file/.gitkeep
.gitattributes
ADDED
@@ -0,0 +1,31 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+output_file/* filter=lfs diff=lfs merge=lfs -text
+background_image/* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,13 @@
+.DS_Store
+flagged/
+__pycache__/
+.vscode/
+output_file/*
+
+!output_file/.gitkeep
+
+*.mp4
+*.png
+!background_image/*
+*.mkv
+gradio_queue.db*
README.md
ADDED
@@ -0,0 +1,46 @@
+---
+title: Talking Face Generation with Multilingual TTS
+emoji: ๐
+colorFrom: blue
+colorTo: blue
+sdk: gradio
+app_file: app.py
+pinned: false
+license: cc-by-nc-sa-4.0
+---
+
+# Configuration
+
+`title`: _string_
+Display title for the Space
+
+`emoji`: _string_
+Space emoji (emoji-only character allowed)
+
+`colorFrom`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+`colorTo`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+`sdk`: _string_
+Can be either `gradio`, `streamlit`, or `static`
+
+`sdk_version`: _string_
+Only applicable for `streamlit` SDK.
+See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+
+`app_file`: _string_
+Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
+Path is relative to the root of the repository.
+
+`models`: _List[string]_
+HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
+Will be parsed automatically from your code if not specified here.
+
+`datasets`: _List[string]_
+HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
+Will be parsed automatically from your code if not specified here.
+
+`pinned`: _boolean_
+Whether the Space stays on top of your list.
app.py
ADDED
@@ -0,0 +1,214 @@
+import os
+import subprocess
+
+REST_IP = os.environ['REST_IP']
+SERVICE_PORT = int(os.environ['SERVICE_PORT'])
+TRANSLATION_APIKEY_URL = os.environ['TRANSLATION_APIKEY_URL']
+GOOGLE_APPLICATION_CREDENTIALS = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
+subprocess.call(f"wget --no-check-certificate -O {GOOGLE_APPLICATION_CREDENTIALS} {TRANSLATION_APIKEY_URL}", shell=True)
+
+import gradio as gr
+from client_rest import RestAPIApplication
+from pathlib import Path
+import argparse
+import threading
+from translator import GoogleAuthTranslation
+import yaml
+
+TITLE = Path("docs/title.txt").read_text()
+DESCRIPTION = Path("docs/description.txt").read_text()
+
+class Translator:
+    def __init__(self, yaml_path='lang.yaml'):
+        self.google_translation = GoogleAuthTranslation(project_id="cvpr-2022-demonstration")
+        with open(yaml_path) as f:
+            self.supporting_languages = yaml.load(f, Loader=yaml.FullLoader)
+
+    def _get_text_with_lang(self, text, lang):
+        lang_detected = self.google_translation.detect(text)
+        print(lang_detected, lang)
+        if lang is None:
+            lang = lang_detected
+
+        if lang != lang_detected:
+            target_text = self.google_translation.translate(text, lang=lang)
+        else:
+            target_text = text
+
+        return target_text, lang
+
+    def _convert_lang_from_index(self, lang):
+        lang_finder = [name for name in self.supporting_languages
+                       if self.supporting_languages[name]['language'] == lang]
+        if len(lang_finder) == 1:
+            lang = lang_finder[0]
+        else:
+            raise AssertionError(f"Given language index can't be understood! | lang: {lang}")
+
+        return lang
+
+    def get_translation(self, text, lang, use_translation=True):
+        lang_ = self._convert_lang_from_index(lang)
+
+        if use_translation:
+            target_text, _ = self._get_text_with_lang(text, lang_)
+        else:
+            target_text = text
+
+        return target_text, lang
+
+
+class GradioApplication:
+    def __init__(self, rest_ip, rest_port, max_seed):
+        self.lang_list = {
+            'Korean': 'ko_KR',
+            'English': 'en_US',
+            'Japanese': 'ja_JP',
+            'Chinese': 'zh_CN'
+        }
+        self.background_list = [None,
+                                "background_image/cvpr.png",
+                                "background_image/black.png",
+                                "background_image/river.mp4",
+                                "background_image/sky.mp4"]
+
+        self.translator = Translator()
+        self.rest_application = RestAPIApplication(rest_ip, rest_port)
+        self.output_dir = Path("output_file")
+
+        inputs = prepare_input()
+        outputs = prepare_output()
+
+        self.iface = gr.Interface(fn=self.infer,
+                                  title=TITLE,
+                                  description=DESCRIPTION,
+                                  inputs=inputs,
+                                  outputs=outputs,
+                                  allow_flagging='never',
+                                  article=Path("docs/article.md").read_text())
+
+        self.max_seed = max_seed
+        self._file_seed = 0
+        self.lock = threading.Lock()
+
+
+    def _get_file_seed(self):
+        return f"{self._file_seed % self.max_seed:02d}"
+
+    def _reset_file_seed(self):
+        self._file_seed = 0
+
+    def _counter_file_seed(self):
+        with self.lock:
+            self._file_seed += 1
+
+    def get_lang_code(self, lang):
+        return self.lang_list[lang]
+
+    def get_background_data(self, background_index):
+        # get background filename and its extension
+        data_path = self.background_list[background_index]
+
+        if data_path is not None:
+            with open(data_path, 'rb') as rf:
+                background_data = rf.read()
+            is_video_background = str(data_path).endswith(".mp4")
+        else:
+            background_data = None
+            is_video_background = False
+
+        return background_data, is_video_background
+
+    def infer(self, text, lang, duration_rate, pad_begin, pad_end, action, background_index):
+        self._counter_file_seed()
+        print(f"File Seed: {self._file_seed}")
+        target_text, lang_dest = self.translator.get_translation(text, lang)
+        lang_rpc_code = self.get_lang_code(lang_dest)
+
+        background_data, is_video_background = self.get_background_data(background_index)
+
+        video_data = self.rest_application.get_video(target_text, lang_rpc_code, duration_rate, pad_begin, pad_end, action.lower(),
+                                                     background_data, is_video_background)
+        print(len(video_data))
+
+        video_filename = self.output_dir / f"{self._file_seed:02d}.mp4"
+        with open(video_filename, "wb") as video_file:
+            video_file.write(video_data)
+
+        return f"Language: {lang_dest}\nText: \n{target_text}", str(video_filename)
+
+    def run(self, server_port=7860, share=False):
+        try:
+            self.iface.launch(height=900,
+                              share=share, server_port=server_port,
+                              enable_queue=True)
+
+        except KeyboardInterrupt:
+            gr.close_all()
+
+
+def prepare_input():
+    text_input = gr.inputs.Textbox(lines=2,
+                                   placeholder="Type your text with English, Chinese, Korean, and Japanese.",
+                                   default="Hello, this is demonstration for talking face generation "
+                                           "with multilingual text-to-speech.",
+                                   label="Text")
+    lang_input = gr.inputs.Radio(['Korean', 'English', 'Japanese', 'Chinese'],
+                                 type='value',
+                                 default=None,
+                                 label="Language")
+    duration_rate_input = gr.inputs.Slider(minimum=0.8,
+                                           maximum=1.2,
+                                           step=0.01,
+                                           default=1.0,
+                                           label="Duration (The bigger the value, the slower it pronounces)")
+    start_padding_input = gr.inputs.Slider(minimum=0.0,
+                                           maximum=2.0,
+                                           step=0.1,
+                                           default=0.0,
+                                           label="Start padding (s)")
+    end_padding_input = gr.inputs.Slider(minimum=0.0,
+                                         maximum=2.0,
+                                         step=0.1,
+                                         default=0.0,
+                                         label="End padding (s)")
+    action_input = gr.inputs.Radio(['Default', 'Hand', 'BothHand', 'HandDown', 'Sorry'],
+                                   type='value',
+                                   default='Default',
+                                   label="Select an action...")
+    background_input = gr.inputs.Radio(['None', 'CVPR', 'Black', 'River', 'Sky'],
+                                       type='index',
+                                       default='None',
+                                       label="Background image")
+
+    return [text_input, lang_input,
+            duration_rate_input, start_padding_input, end_padding_input,
+            action_input, background_input]
+
+
+def prepare_output():
+    translation_result_output = gr.outputs.Textbox(type="str",
+                                                   label="Translation Result")
+
+    video_output = gr.outputs.Video()
+    return [translation_result_output, video_output]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='GRADIO DEMO for talking face generation submitted to CVPR2022')
+    parser.add_argument('-p', '--port', dest='gradio_port', type=int, default=7860, help="Port for gradio")
+    parser.add_argument('--rest_ip', type=str, default=REST_IP, help="IP for REST API")
+    parser.add_argument('--rest_port', type=int, default=SERVICE_PORT, help="Port for REST API")
+    parser.add_argument('--max_seed', type=int, default=20, help="Max seed for saving video")
+    parser.add_argument('--share', action='store_true', help='get publicly sharable link')
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    gradio_application = GradioApplication(args.rest_ip, args.rest_port, args.max_seed)
+    gradio_application.run(server_port=args.gradio_port, share=args.share)
+
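For reference, a minimal sketch (not part of the commit) of driving the app programmatically instead of through the CLI entry point. It assumes the four environment variables read at the top of app.py are exported before import (the module reads them and fetches credentials at import time) and that a REST backend is reachable at the hypothetical address 127.0.0.1:8080:

# Hypothetical usage sketch; REST_IP, SERVICE_PORT, TRANSLATION_APIKEY_URL and
# GOOGLE_APPLICATION_CREDENTIALS must already be set in the environment.
from app import GradioApplication

demo = GradioApplication(rest_ip="127.0.0.1", rest_port=8080, max_seed=20)
demo.run(server_port=7860, share=False)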
background_image/black.png
ADDED
(binary image, tracked with Git LFS)
background_image/cvpr.png
ADDED
(binary image, tracked with Git LFS)
background_image/river.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8fedd95028adbabf17eb7fcc67bc37d22dc996cb45878a8a52cc95dcfb21cf3
+size 4523353
background_image/sky.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f12109f770ba152228370f0386d2fdc9892ff79875d3c3296efde200822a3bbc
+size 2222081
client_rest.py
ADDED
@@ -0,0 +1,73 @@
+import requests
+import json
+import base64
+import argparse
+
+VIDEO_WIDTH = 1080
+VIDEO_HEIGHT = 1920
+
+class RestAPIApplication:
+    def __init__(self, ip, port):
+        # a negative port means the endpoint URL carries no explicit port
+        if port < 0:
+            self.post_request_addr = f"http://{ip}/register/"
+            self.post_headers = {"Content-Type": "application/json"}
+            self.generate_addr = (lambda id_: f'http://{ip}/generate/{id_}')
+        else:
+            self.post_request_addr = f"http://{ip}:{port}/register/"
+            self.post_headers = {"Content-Type": "application/json"}
+            self.generate_addr = (lambda id_: f'http://{ip}:{port}/generate/{id_}')
+
+    @staticmethod
+    def _get_json_request(text, lang, duration_rate, pad_begin, pad_end, action, background_data=None, is_video_background=False):
+        request_form = dict()
+
+        request_form['text'] = text
+        request_form['speaker'] = 0
+        request_form['width'] = VIDEO_WIDTH
+        request_form['height'] = VIDEO_HEIGHT
+
+        request_form['action'] = action
+
+        if background_data is not None:
+            background_base64 = base64.b64encode(background_data).decode("UTF-8")
+        else:
+            background_base64 = ""
+
+        request_form['background'] = background_base64
+        request_form['durationRate'] = duration_rate
+        request_form['isVideoBackground'] = is_video_background
+        request_form['lang'] = lang
+
+        request_as_json = json.dumps(request_form)
+        return request_as_json
+
+    @staticmethod
+    def _get_video_id(results):
+        return json.loads(bytes.decode(results.content))['id']
+
+    def get_video(self, text, lang, duration_rate, pad_begin, pad_end, action, background_data=None, is_video_background=False):
+        request_json = self._get_json_request(text, lang, duration_rate, pad_begin, pad_end, action, background_data, is_video_background)
+
+        # POST request with jsonified request
+        results = requests.post(self.post_request_addr, headers=self.post_headers, data=request_json)
+
+        # GET video with the given id
+        video_id = self._get_video_id(results)
+        video_results = requests.get(self.generate_addr(video_id))
+
+        return video_results.content
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='REST API interface for talking face generation submitted to CVPR2022')
+    parser.add_argument('-i', '--ip', dest='rest_ip', type=str, default="127.0.0.1", help="IP for REST API")
+    parser.add_argument('-p', '--port', dest='rest_port', type=int, default=8080, help="Port for REST API")
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    rest_api_application = RestAPIApplication(args.rest_ip, args.rest_port)
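A minimal sketch (not part of the commit) of requesting a single video through this client, assuming a compatible backend at the hypothetical address 127.0.0.1:8080:

# Hypothetical usage sketch for RestAPIApplication.
from client_rest import RestAPIApplication

api = RestAPIApplication("127.0.0.1", 8080)
video_bytes = api.get_video("Hello.", "en_US",
                            duration_rate=1.0, pad_begin=0.0, pad_end=0.0,
                            action="default")  # app.py lower-cases the action name
with open("sample.mp4", "wb") as f:
    f.write(video_bytes)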
docs/article.md
ADDED
@@ -0,0 +1,6 @@
+[Youtube](https://youtu.be/F6h0s0M4vBI)
+
+
+
+
+Recent studies in talking face generation have focused on building a train-once-use-everywhere model, i.e. a model that generalizes from any source speech to any target identity. A number of works have already claimed this functionality and have added that their models will also generalize to any language. However, we show, using languages from different language families, that these models do not translate well when the training language and the testing language are sufficiently different. We reduce the scope of the problem to building a language-robust talking face generation system on seen identities, i.e. the target identity is the same as the training identity. In this work, we introduce a talking face generation system that generalizes to different languages. We evaluate the efficacy of our system using a multilingual text-to-speech system. We also discuss the use of the joint text-to-speech and talking face generation system as a neural dubbing system.
docs/description.txt
ADDED
@@ -0,0 +1,4 @@
+CVPR 2022 Demo Track (Round 1) #8
+This system generates a talking face video from the given text.
+You can input text in one of four languages: Chinese, English, Japanese, or Korean.
+If your text language and the target language differ, the sentence is translated into the target language with the Google Translation API.
docs/title.txt
ADDED
@@ -0,0 +1 @@
+Talking Face Generation with Multilingual TTS
lang.yaml
ADDED
@@ -0,0 +1,20 @@
+ko:
+  index: 1
+  language: "Korean"
+  locale: "ko_KR"
+  google_dest: "ko"
+en:
+  index: 2
+  language: "English"
+  locale: "en_US"
+  google_dest: "en"
+ja:
+  index: 3
+  language: "Japanese"
+  locale: "ja_JP"
+  google_dest: "ja"
+zh:
+  index: 4
+  language: "Chinese"
+  locale: "zh_CN"
+  google_dest: "zh-CN"
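For reference, a short sketch (not part of the commit) of how app.py's Translator resolves a UI label such as "Japanese" back to a lang.yaml key and its Google Translate code:

import yaml

# Mirrors Translator._convert_lang_from_index in app.py.
with open("lang.yaml") as f:
    supporting_languages = yaml.load(f, Loader=yaml.FullLoader)

key = [name for name in supporting_languages
       if supporting_languages[name]['language'] == 'Japanese'][0]
print(key, supporting_languages[key]['google_dest'])  # -> ja ja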
output_file/.gitkeep
ADDED
File without changes
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+gradio
+jinja2
+googletrans==4.0.0-rc1
+PyYAML
+opencv-python
+google-cloud-translate
sample_text.yaml
ADDED
@@ -0,0 +1,8 @@
+ko:
+  - "안녕하세요? 한국어로 말하고 있습니다."
+en:
+  - "Hello. Now I'm speaking in English."
+zh:
+  - "你好? 我在说普通话。"
+ja:
+  - "こんにちは。 今、日本語で話しています。"
translator/__init__.py
ADDED
@@ -0,0 +1 @@
+from .v3 import GoogleAuthTranslation
translator/v3.py
ADDED
@@ -0,0 +1,58 @@
+from google.cloud import translate
+import yaml
+
+
+class GoogleAuthTranslation:
+    def __init__(self, project_id, yaml_path='lang.yaml'):
+        self.translator = translate.TranslationServiceClient()
+        self.location = "global"
+        self.parent = f"projects/{project_id}/locations/{self.location}"
+
+        with open(yaml_path) as f:
+            self.supporting_languages = yaml.load(f, Loader=yaml.FullLoader)
+
+    def _detect(self, query):
+        response = self.translator.detect_language(
+            request={
+                "parent": self.parent,
+                "content": query,
+                "mime_type": "text/plain",  # mime types: text/plain, text/html
+            }
+        )
+
+        for language in response.languages:
+            # First language is the most confident one
+            return language.language_code
+
+    def _get_dest_from_lang(self, lang):
+        try:
+            return self.supporting_languages[lang]['google_dest']
+
+        except KeyError as e:
+            raise e
+
+    def _get_lang_from_dest(self, dest):
+        for key in self.supporting_languages:
+            if self.supporting_languages[key]['google_dest'] == dest:
+                return key
+
+        raise RuntimeError(f"Detected language {dest} is not supported for TTS.")
+
+    def translate(self, query, lang):
+
+        dest = self._get_dest_from_lang(lang)
+
+        response = self.translator.translate_text(
+            request={
+                "parent": self.parent,
+                "contents": [query],
+                "mime_type": "text/plain",  # mime types: text/plain, text/html
+                "target_language_code": dest,
+            }
+        )
+
+        return " ".join([translation.translated_text for translation in response.translations])
+
+    def detect(self, query):
+        dest = self._detect(query)
+        return self._get_lang_from_dest(dest)
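A minimal sketch (not part of the commit) of using the translation wrapper on its own. It assumes valid Google Cloud credentials in GOOGLE_APPLICATION_CREDENTIALS, a hypothetical project id, and lang.yaml in the working directory:

# Hypothetical usage sketch for GoogleAuthTranslation.
from translator import GoogleAuthTranslation

translation = GoogleAuthTranslation(project_id="my-gcp-project")  # hypothetical id
print(translation.detect("안녕하세요"))                      # -> "ko" (a lang.yaml key)
print(translation.translate("Nice to meet you", lang="ja"))  # -> Japanese text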