upd: log
app.py CHANGED
@@ -78,25 +78,22 @@ def transcribe(image, audio):
 
     sr, y = audio
 
-    # Convert stereo to mono if necessary
     if y.ndim > 1:
         y = y.mean(axis=1)
 
-    # Convert the numpy array to a PyTorch tensor for torchaudio processing
     y_tensor = torch.tensor(y, dtype=torch.float32)
+    print(y.shape)
 
     if sr != 16000:
         resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
         y_tensor = resampler(y_tensor)
         sr = 16000
 
-    # Normalize the audio
     y_tensor /= torch.max(torch.abs(y_tensor))
 
-    # Convert back to a numpy array for compatibility with the feature extractor
     y = y_tensor.numpy()
+    print(y.shape)
 
-    # Create input features for the Whisper model
     input_features = transcriber.feature_extractor(
         y, sampling_rate=sr, return_tensors="pt"
     ).input_features
@@ -120,7 +117,6 @@ qa_interface = gr.Interface(
     live=False,
 )
 
-# Interface for real-time speech recognition
 speech_interface = gr.Interface(
     fn=transcribe,
     inputs=[
@@ -130,18 +126,10 @@ speech_interface = gr.Interface(
     outputs=gr.Textbox(label="Распознанный текст"),
     live=True,
 )
-
-# Combine the interfaces in a Gradio Tabbed layout
 interface = gr.TabbedInterface(
     [qa_interface, speech_interface],
     ["Текстовый вопрос", "Голосовой вопрос"],
     title="Демо визуального ответчика на вопросы (на русском)",
-    # description=(
-    #     "Gradio демо для модели doc-qa с переводом вопросов и ответов"
-    #     "на русский язык. Загрузите изображение и задайте вопрос, чтобы"
-    #     "получить ответ. Вы также можете использовать голосовой ввод!"
-    # ),
-    # live=True,
 )
 
 interface.launch(debug=True, share=True)
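For context: the arrays logged by the two new print(y.shape) calls are exactly what gets handed to the feature extractor a few lines below them. A minimal sketch of the surrounding decode path, assuming transcriber is a transformers automatic-speech-recognition pipeline around a Whisper checkpoint — the pipeline construction is not part of this diff, so the model name and the .model/.tokenizer accesses below are assumptions:

import numpy as np
from transformers import pipeline

# Assumption: app.py builds something like this elsewhere; the checkpoint is illustrative.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Stand-in for the preprocessed audio: mono float32 at 16 kHz, as transcribe() produces.
sr = 16000
y = np.random.randn(sr * 2).astype(np.float32)

# Same call as in the diff: pads/truncates to 30 s of log-mel frames,
# typically shape (1, 80, 3000) for small/medium Whisper checkpoints.
input_features = transcriber.feature_extractor(
    y, sampling_rate=sr, return_tensors="pt"
).input_features

predicted_ids = transcriber.model.generate(input_features)
text = transcriber.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(text)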