upd: log
app.py CHANGED
@@ -78,25 +78,22 @@ def transcribe(image, audio):
 
     sr, y = audio
 
-    # Convert stereo to mono if necessary
     if y.ndim > 1:
         y = y.mean(axis=1)
 
-    # Convert the numpy array to a PyTorch tensor for torchaudio processing
     y_tensor = torch.tensor(y, dtype=torch.float32)
+    print(y.shape)
 
     if sr != 16000:
         resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
         y_tensor = resampler(y_tensor)
         sr = 16000
 
-    # Normalize the audio
     y_tensor /= torch.max(torch.abs(y_tensor))
 
-    # Convert back to a numpy array for compatibility with the feature extractor
     y = y_tensor.numpy()
+    print(y.shape)
 
-    # Create input features for the Whisper model
     input_features = transcriber.feature_extractor(
         y, sampling_rate=sr, return_tensors="pt"
     ).input_features
@@ -120,7 +117,6 @@ qa_interface = gr.Interface(
     live=False,
 )
 
-# Interface for real-time speech recognition
 speech_interface = gr.Interface(
     fn=transcribe,
     inputs=[
@@ -130,18 +126,10 @@ speech_interface = gr.Interface(
     outputs=gr.Textbox(label="Распознанный текст"),
     live=True,
 )
-
-# Combine the interfaces in a Gradio Tabbed layout
 interface = gr.TabbedInterface(
     [qa_interface, speech_interface],
     ["Текстовый вопрос", "Голосовой вопрос"],
     title="Демо визуального ответчика на вопросы (на русском)",
-    # description=(
-    #     "Gradio демо для модели doc-qa с переводом вопросов и ответов"
-    #     "на русский язык. Загрузите изображение и задайте вопрос, чтобы"
-    #     "получить ответ. Вы также можете использовать голосовой ввод!"
-    # ),
-    # live=True,
 )
 
 interface.launch(debug=True, share=True)
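For context: the arrays logged by the two new print(y.shape) calls are exactly what gets handed to the feature extractor a few lines below them. A minimal sketch of the surrounding decode path, assuming transcriber is a transformers automatic-speech-recognition pipeline around a Whisper checkpoint — the pipeline construction is not part of this diff, so the model name and the .model/.tokenizer accesses below are assumptions:

import numpy as np
from transformers import pipeline

# Assumption: app.py builds something like this elsewhere; the checkpoint is illustrative.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Stand-in for the preprocessed audio: mono float32 at 16 kHz, as transcribe() produces.
sr = 16000
y = np.random.randn(sr * 2).astype(np.float32)

# Same call as in the diff: pads/truncates to 30 s of log-mel frames,
# typically shape (1, 80, 3000) for small/medium Whisper checkpoints.
input_features = transcriber.feature_extractor(
    y, sampling_rate=sr, return_tensors="pt"
).input_features

predicted_ids = transcriber.model.generate(input_features)
text = transcriber.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(text)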