Spaces:
Running
Running
Yurii Paniv
committed on
Commit
·
efc4c8b
1
Parent(s):
f5aefe9
Add data logging
Browse files- README.md +1 -0
- app.py +28 -0
- data_logger.py +41 -0
- requirements-dev.txt +1 -0
README.md
CHANGED
@@ -18,6 +18,7 @@ Text-to-Speech for Crimean Tatar language
|
|
18 |
Source code: https://github.com/robinhad/qirimtatar-tts
|
19 |
Online demo: https://huggingface.co/spaces/robinhad/qirimtatar-tts
|
20 |
You're welcome to join UA Speech Recognition and Synthesis community: Telegram https://t.me/speech_recognition_uk
|
|
|
21 |
|
22 |
## Examples
|
23 |
Test sentence:
|
|
|
18 |
Source code: https://github.com/robinhad/qirimtatar-tts
|
19 |
Online demo: https://huggingface.co/spaces/robinhad/qirimtatar-tts
|
20 |
You're welcome to join UA Speech Recognition and Synthesis community: Telegram https://t.me/speech_recognition_uk
|
21 |
+
Note: demo input is saved to improve the Text-to-Speech engine and the demo experience. By using this demo, you consent to this.
|
22 |
|
23 |
## Examples
|
24 |
Test sentence:
|
app.py
CHANGED
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
from crh_transliterator.transliterator import transliterate
|
3 |
from crh_preprocessor.preprocessor import preprocess
|
@@ -18,6 +23,27 @@ class VoiceOption(Enum):
|
|
18 |
# Abibulla = "Абібулла (чоловічий) 👨"
|
19 |
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
print(f"CUDA available? {is_available()}")
|
22 |
|
23 |
|
@@ -42,6 +68,8 @@ def tts(text: str, voice: str):
|
|
42 |
}
|
43 |
|
44 |
speaker_name = voice_mapping[voice]
|
|
|
|
|
45 |
text_limit = 7200
|
46 |
text = (
|
47 |
text if len(text) < text_limit else text[0:text_limit]
|
|
|
1 |
+
from os import getenv
|
2 |
+
from queue import Queue
|
3 |
+
from threading import Thread
|
4 |
+
from time import sleep
|
5 |
+
from data_logger import log_data
|
6 |
import gradio as gr
|
7 |
from crh_transliterator.transliterator import transliterate
|
8 |
from crh_preprocessor.preprocessor import preprocess
|
|
|
23 |
# Abibulla = "Абібулла (чоловічий) 👨"
|
24 |
|
25 |
|
26 |
+
def check_thread(logging_queue: Queue):
    """Periodically flush queued log records to the Hugging Face dataset.

    Builds the logging callback once via ``log_data`` and then loops forever:
    sleep 60 seconds, drain everything currently in ``logging_queue`` into a
    batch, and hand the batch to the callback. If the callback raises, the
    batch is put back on the queue so it is retried on the next cycle.

    Args:
        logging_queue: Queue of records produced by the request handler.

    This function never returns; it is meant to run on a background thread.
    """
    logging_callback = log_data(hf_token=getenv("HF_API_TOKEN"), dataset_name="crh-tts-output", private=False)
    while True:
        sleep(60)
        batch = []
        while not logging_queue.empty():
            batch.append(logging_queue.get())

        if len(batch) > 0:
            try:
                logging_callback(batch)
            except Exception as error:
                # Catch Exception rather than using a bare ``except`` so that
                # KeyboardInterrupt / SystemExit are not swallowed, and report
                # the cause so that failures can be diagnosed.
                # NOTE(review): if the callback fails after it has already
                # persisted part of the batch, re-queuing may produce
                # duplicate rows -- confirm against log_data.
                print("Error happened while pushing data to HF. Putting items back in queue...")
                print(f"Cause: {error!r}")
                for item in batch:
                    logging_queue.put(item)
|
41 |
+
|
42 |
+
# Data logging is opt-in: the queue and its background uploader only exist
# when an HF_API_TOKEN is configured in the environment.
if getenv("HF_API_TOKEN") is not None:
    log_queue = Queue()
    # check_thread loops forever, so run it as a daemon thread; a non-daemon
    # thread would keep the interpreter alive after the main thread finishes.
    t = Thread(target=check_thread, args=(log_queue,), daemon=True)
    t.start()
|
46 |
+
|
47 |
print(f"CUDA available? {is_available()}")
|
48 |
|
49 |
|
|
|
68 |
}
|
69 |
|
70 |
speaker_name = voice_mapping[voice]
|
71 |
+
if getenv("HF_API_TOKEN") is not None:
|
72 |
+
log_queue.put([text, speaker_name, str(datetime.utcnow())])
|
73 |
text_limit = 7200
|
74 |
text = (
|
75 |
text if len(text) < text_limit else text[0:text_limit]
|
data_logger.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from gradio import utils
|
2 |
+
import os
|
3 |
+
import csv
|
4 |
+
import huggingface_hub
|
5 |
+
|
6 |
+
def log_data(hf_token: str, dataset_name: str, private=True):
    """Prepare a Hugging Face dataset repo and return a batch-logging callback.

    Creates the dataset repository (reusing it if it already exists), clones
    it into ``flagged/<dataset_name>``, pulls the latest state, and returns a
    function that appends rows to ``data.csv`` in that clone and pushes the
    result to the Hub.

    Args:
        hf_token: Token passed to ``huggingface_hub`` for repo creation and
            for cloning.
        dataset_name: Name of the dataset repository; also used as the local
            sub-directory name.
        private: Forwarded to ``create_repo`` as the repo's ``private`` flag.

    Returns:
        A callable ``log_function(data)`` (see its docstring).
    """
    # NOTE(review): the ``name=`` keyword to create_repo may depend on the
    # installed huggingface_hub version -- confirm against the pinned release.
    repo_url = huggingface_hub.create_repo(
        name=dataset_name,
        token=hf_token,
        private=private,
        repo_type="dataset",
        exist_ok=True,
    )
    local_clone = os.path.join("flagged", dataset_name)
    repo = huggingface_hub.Repository(
        local_dir=local_clone,
        clone_from=repo_url,
        use_auth_token=hf_token,
    )
    repo.git_pull(lfs=True)
    csv_path = os.path.join(local_clone, "data.csv")

    def log_function(data):
        """Append ``data`` rows to the CSV, push to the Hub, return the count.

        Args:
            data: Iterable of rows; each row is sanitized with
                ``utils.sanitize_list_for_csv`` before being written.

        Returns:
            The number of CSV records in the file after appending, minus one.
        """
        # Sync with the remote before touching the file.
        repo.git_pull(lfs=True)

        with open(csv_path, "a", newline="", encoding="utf-8") as sink:
            out = csv.writer(sink)
            out.writerows(utils.sanitize_list_for_csv(row) for row in data)

        # NOTE(review): the "- 1" looks like a header-row adjustment, but no
        # header is written in this function -- confirm the intended count.
        with open(csv_path, "r", encoding="utf-8") as source:
            line_count = sum(1 for _ in csv.reader(source)) - 1

        message = "Flagged sample #{}".format(line_count)
        repo.push_to_hub(commit_message=message)

        return line_count

    return log_function
|
41 |
+
|
requirements-dev.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
-r requirements.txt
|
2 |
-r requirements-test.txt
|
|
|
3 |
black
|
|
|
1 |
-r requirements.txt
|
2 |
-r requirements-test.txt
|
3 |
+
huggingface_hub
|
4 |
black
|