andgrt committed
Commit 2891eba · 1 parent: 1e19f64

upd: audio output

Files changed (1)

app.py +29 -2
app.py CHANGED

@@ -5,8 +5,14 @@ from transformers import (
     AutoProcessor,
     AutoModelForDocumentQuestionAnswering,
     pipeline,
+    VitsModel,
 )
 import torch
+import numpy as np
+
+
+mms_tts_model = VitsModel.from_pretrained("facebook/mms-tts-rus")
+mms_tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-rus")
 
 processor = AutoProcessor.from_pretrained(
     "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
 
@@ -71,6 +77,17 @@ def generate_answer(image, question):
     return answer_ru
 
 
+def text_to_speech(text):
+
+    inputs = mms_tts_tokenizer(text, return_tensors="pt")
+
+    with torch.no_grad():
+        output = mms_tts_model(**inputs).waveform
+
+    audio = output.numpy()
+    return text, (16000, audio.squeeze())
+
+
 def transcribe(image, audio):
     if not image or not audio:
         return
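text_to_speech returns the input text together with a (sample_rate, np.ndarray) tuple, the format a gr.Audio output component accepts, so the pair can feed [gr.Textbox, gr.Audio] outputs directly. A quick usage sketch (the example string is illustrative):

text, (sr, wav) = text_to_speech("Документ содержит годовой отчёт.")
print(sr, wav.shape)  # 16000, (num_samples,)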
 
@@ -79,6 +96,10 @@ def transcribe(image, audio):
 
     if y.ndim > 1:
         y = y.mean(axis=1)
+
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+
     transcription_text = transcriber({"sampling_rate": sr, "raw": y})["text"]
 
     return generate_answer(image, transcription_text)
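The two added lines convert Gradio's integer microphone samples to float32 and peak-normalize them to [-1, 1], which is what the ASR pipeline expects for raw input. A standalone version of the same step, with a guard against silent (all-zero) input that the diff omits:

import numpy as np

def normalize_audio(y: np.ndarray) -> np.ndarray:
    if y.ndim > 1:
        y = y.mean(axis=1)  # downmix stereo to mono
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid division by zero on silence
        y /= peak
    return y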
 
@@ -90,7 +111,10 @@ qa_interface = gr.Interface(
         gr.Image(type="pil"),
         gr.Textbox(label="Вопрос (на русском)", placeholder="Ваш вопрос"),
     ],
-    outputs=gr.Textbox(label="Ответ (на русском)"),
+    outputs=[
+        gr.Textbox(label="Ответ (на русском)"),
+        gr.Audio(label="Сгенерированное аудио"),
+    ],
     examples=[["doc.png", "О чем данный документ?"]],
     live=False,
 )
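With outputs now a two-component list, whatever function drives qa_interface must return a matching (text, audio) pair. The fn= line sits outside this diff; a plausible wiring consistent with the new outputs (answer_and_speak is a hypothetical name):

def answer_and_speak(image, question):
    # Hypothetical glue: reuse the repo's generate_answer, then voice the result.
    answer_ru = generate_answer(image, question)
    _, (sr, wav) = text_to_speech(answer_ru)
    return answer_ru, (sr, wav)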
 
@@ -101,7 +125,10 @@ speech_interface = gr.Interface(
         gr.Image(type="pil"),
         gr.Audio(sources="microphone", label="Голосовой ввод"),
     ],
-    outputs=gr.Textbox(label="Распознанный текст"),
+    outputs=[
+        gr.Textbox(label="Ответ (на русском)"),
+        gr.Audio(label="Сгенерированное аудио"),
+    ],
     live=True,
 )
 interface = gr.TabbedInterface(
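The hunk ends mid-call; for completeness, a typical way the two tabs are combined and launched in Gradio (tab titles here are illustrative, not from the diff):

interface = gr.TabbedInterface(
    [qa_interface, speech_interface],
    tab_names=["Текстовый вопрос", "Голосовой ввод"],  # illustrative
)

if __name__ == "__main__":
    interface.launch()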