Create app.py
app.py
ADDED
@@ -0,0 +1,167 @@
import re

import gradio as gr
from transformers import pipeline
from transformers import T5ForConditionalGeneration, T5Tokenizer
from yt_dlp import YoutubeDL

# import whisper  # only needed when loading a local Whisper model:
# whisper_model = whisper.load_model('small')
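# Assumed runtime dependencies for this Space (not pinned in this commit):
# gradio, transformers, torch, yt-dlp, sentencepiece, and ffmpeg on PATH
# for audio decoding.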
path = "Hyeonsieun/NTtoGT_1epoch"
tokenizer = T5Tokenizer.from_pretrained(path)
model = T5ForConditionalGeneration.from_pretrained(path)


MODEL_NAME = "openai/whisper-large-v2"
BATCH_SIZE = 8
# FILE_LIMIT_MB = 1000
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
)

def transcribe(inputs):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
    return text
+
def remove_spaces_within_dollar(text):
|
42 |
+
# ๋ฌ๋ฌ ๊ธฐํธ๋ก ๋๋ฌ์ธ์ธ ๋ถ๋ถ์์ ์คํ์ด์ค ์ ๊ฑฐ
|
43 |
+
# ์ ๊ท ํํ์: \$.*?\$ ๋ '$'๋ก ์์ํด์ '$'๋ก ๋๋๋ ์ต์ํ์ ๋ฌธ์์ด์ ์ฐพ์ (non-greedy)
|
44 |
+
# re.sub์ repl ํ๋ผ๋ฏธํฐ์ ํจ์๋ฅผ ์ฌ์ฉํ์ฌ ๋งค์น๋ ๋ถ๋ถ์์๋ง ๋ณ๊ฒฝ์ ์ ์ฉ
|
45 |
+
result = re.sub(r'\$(.*?)\$', lambda match: match.group(0).replace(' ', ''), text)
|
46 |
+
return result
|
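# For example:
#   remove_spaces_within_dollar("so $ x ^ 2 + y ^ 2 = 1 $ holds")
#   -> "so $x^2+y^2=1$ holds"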

def audio_correction(file):
    ASR_result = transcribe(file)
    text_list = split_text_complex_rules_with_warning(ASR_result)
    whole_text = ''
    for text in text_list:
        input_text = f"translate the text pronouncing the formula to a LaTeX equation: {text}"
        inputs = tokenizer.encode(
            input_text,
            return_tensors='pt',
            max_length=325,
            padding='max_length',
            truncation=True
        )
        # Generate the corrected sentence ids with beam search
        # (num_beams=1 would mean greedy decoding, not temperature sampling).
        corrected_ids = model.generate(
            inputs,
            max_length=325,
            num_beams=5,
            early_stopping=True
        )
        # Decode, dropping special tokens (<pad>, </s>) so they do not
        # leak into the concatenated output.
        corrected_sentence = tokenizer.decode(
            corrected_ids[0],
            skip_special_tokens=True
        )
        whole_text += corrected_sentence + ' '  # space-join the per-sentence outputs

    return remove_spaces_within_dollar(whole_text).strip()
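# Illustrative example of the correction step (hypothetical output; the
# actual text produced by the Hyeonsieun/NTtoGT_1epoch checkpoint may differ):
#   spoken form: "x squared plus y squared equals one"
#   LaTeX:       "$x^2+y^2=1$"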

def youtubeASR(link):
    # Temporary filename for the audio-only download from YouTube.
    out_fn = 'temp1.mp3'

    ydl_opts = {
        'format': 'bestaudio/best',  # download the audio track only
        'outtmpl': out_fn,           # save under the filename given above
    }

    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([link])

    # Transcribe the downloaded audio file (out_fn).
    result = pipe(out_fn, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)
    script = result['text']  # keep only the transcribed text
    return script
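# Note (assumption): 'bestaudio/best' may fetch a non-mp3 container even
# though the file is named temp1.mp3. If decoding fails, a yt-dlp
# postprocessor can force conversion (requires ffmpeg), e.g.:
#   ydl_opts['postprocessors'] = [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3'}]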

def split_text_complex_rules_with_warning(text):
    # Split into sentences on sentence-ending punctuation (commas excluded).
    parts = re.split(r'(?<=[.?!])\s+', text)

    result = []
    warnings = []  # warnings are collected here but not surfaced to the caller
    for part in parts:
        # If a part exceeds 256 characters, split it further on commas.
        if len(part) > 256:
            subparts = re.split(r',\s*', part)
            for subpart in subparts:
                # Drop empty strings; keep only fragments of at most 256 characters.
                trimmed_subpart = subpart.strip()
                if trimmed_subpart and len(trimmed_subpart) <= 256:
                    result.append(trimmed_subpart)
                else:
                    # Record a warning for fragments still over 256 characters.
                    warnings.append(f"Sentence exceeds 256 characters: {trimmed_subpart[:50]}... (length: {len(trimmed_subpart)})")
        else:
            # At most 256 characters: append directly.
            result.append(part.strip())

    return result
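# For example, "First sentence. Second one? Third!" splits into
# ["First sentence.", "Second one?", "Third!"].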

def youtube_correction(link):
    ASR_result = youtubeASR(link)
    text_list = split_text_complex_rules_with_warning(ASR_result)
    whole_text = ''
    for text in text_list:
        input_text = f"translate the text pronouncing the formula to a LaTeX equation: {text}"
        inputs = tokenizer.encode(
            input_text,
            return_tensors='pt',
            max_length=325,
            padding='max_length',
            truncation=True
        )
        # Beam search, as in audio_correction above.
        corrected_ids = model.generate(
            inputs,
            max_length=325,
            num_beams=5,
            early_stopping=True
        )
        # Decode, dropping special tokens so they do not leak into the output.
        corrected_sentence = tokenizer.decode(
            corrected_ids[0],
            skip_special_tokens=True
        )
        whole_text += corrected_sentence + ' '

    return remove_spaces_within_dollar(whole_text).strip()

demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=audio_correction,
    inputs=gr.components.Audio(sources="upload", type="filepath"),
    outputs="text"
)

yt_transcribe = gr.Interface(
    fn=youtube_correction,
    inputs="text",
    outputs="text"
)

with demo:
    gr.TabbedInterface([file_transcribe, yt_transcribe], ["Audio file", "YouTube"])

demo.launch()
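# Run with `python app.py`; Gradio serves on http://127.0.0.1:7860 by default,
# and Hugging Face Spaces picks up demo.launch() automatically.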