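# Visual document question answering (DocVQA) demo in Russian.
#
# Pipeline: a Russian question about a document image (typed or spoken) is
# translated to English, answered extractively by a LayoutLMv2 model
# fine-tuned on DocVQA, translated back to Russian, and synthesized to
# speech with MMS-TTS. Served as a two-tab Gradio app.
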
import gradio as gr
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoProcessor,
    AutoModelForDocumentQuestionAnswering,
    pipeline,
    VitsModel,
)
import torch
import numpy as np
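
# Text-to-speech: VITS model from Facebook's MMS project, Russian voice.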
mms_tts_model = VitsModel.from_pretrained("facebook/mms-tts-rus")
mms_tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-rus")
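
# Document question answering: LayoutLMv2 fine-tuned on DocVQA.
# The model is English-only, hence the translation steps around it.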
processor = AutoProcessor.from_pretrained(
    "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
)
model = AutoModelForDocumentQuestionAnswering.from_pretrained(
    "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
)
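
# Machine translation: OPUS-MT models for Russian -> English and English -> Russian.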
tokenizer_ru2en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
model_ru2en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
tokenizer_en2ru = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
model_en2ru = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
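
# Speech recognition: Whisper medium fine-tuned on Russian.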
transcriber = pipeline(
    "automatic-speech-recognition", model="lorenzoncina/whisper-medium-ru"
)
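
# Translate Russian text to English with the OPUS-MT ru->en model.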
def translate_ru2en(text):
    inputs = tokenizer_ru2en(text, return_tensors="pt")
    outputs = model_ru2en.generate(**inputs)
    translated_text = tokenizer_ru2en.decode(outputs[0], skip_special_tokens=True)
    return translated_text
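
# Translate English text to Russian with the OPUS-MT en->ru model.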
def translate_en2ru(text):
    inputs = tokenizer_en2ru(text, return_tensors="pt")
    outputs = model_en2ru.generate(**inputs)
    translated_text = tokenizer_en2ru.decode(outputs[0], skip_special_tokens=True)
    return translated_text
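
# Extractive QA over the document image: the model predicts start/end token
# positions, and the answer is decoded from that token span. Despite the
# "_git" suffix, this calls the LayoutLMv2 model loaded above.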
def generate_answer_git(image, question):
    with torch.no_grad():
        encoding = processor(
            images=image,
            text=question,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        outputs = model(**encoding)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        predicted_start_idx = start_logits.argmax(-1).item()
        predicted_end_idx = end_logits.argmax(-1).item()
    return processor.tokenizer.decode(
        encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
    )
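
# Full QA round trip: Russian question -> English -> model answer -> Russian.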
def generate_answer(image, question):
    question_en = translate_ru2en(question)
    print(f"Question in English: {question_en}")
    answer_en = generate_answer_git(image, question_en)
    print(f"Answer in English: {answer_en}")
    answer_ru = translate_en2ru(answer_en)
    return answer_ru
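
# Synthesize speech for the answer text; returns the text plus a
# (sample_rate, waveform) tuple in the format gr.Audio expects.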
def text_to_speech(text):
    inputs = mms_tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = mms_tts_model(**inputs).waveform
    audio = output.numpy()
    # Use the model's own sampling rate (16 kHz for MMS) instead of hardcoding it.
    return text, (mms_tts_model.config.sampling_rate, audio.squeeze())
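
# Voice tab: transcribe the recorded question, then run the QA + TTS pipeline.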
def transcribe_pipeline(image, audio):
    if image is None or audio is None:
        return None, None
    sr, y = audio
    # Downmix stereo to mono and peak-normalize for Whisper.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid division by zero on silent input
        y /= peak
    transcription_text = transcriber({"sampling_rate": sr, "raw": y})["text"]
    return text_to_speech(generate_answer(image, transcription_text))
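
# Text tab: run the QA + TTS pipeline directly on the typed question.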
def text_pipeline(image, question):
    if image is None or not question:
        return None, None
    return text_to_speech(generate_answer(image, question))
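
# Tab 1: typed Russian question about an uploaded document image.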
qa_interface = gr.Interface(
    fn=text_pipeline,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Question (in Russian)", placeholder="Your question"),
    ],
    outputs=[
        gr.Textbox(label="Answer (in Russian)"),
        gr.Audio(label="Generated audio"),
    ],
    # The example question stays in Russian because the pipeline expects
    # Russian input; it reads "What is this document about?".
    examples=[["doc.png", "О чем данный документ?"]],
    live=False,
)
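
# Tab 2: spoken Russian question, transcribed with Whisper first.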
speech_interface = gr.Interface(
    fn=transcribe_pipeline,
    inputs=[
        gr.Image(type="pil"),
        gr.Audio(sources="microphone", label="Voice input"),
    ],
    outputs=[
        gr.Textbox(label="Answer (in Russian)"),
        gr.Audio(label="Generated audio"),
    ],
    live=True,
)
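
# Combine both tabs into a single app.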
interface = gr.TabbedInterface(
    [qa_interface, speech_interface],
    ["Text question", "Voice question"],
    title="Visual question answering demo (in Russian)",
)
interface.launch(debug=True, share=True)