import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoModelForDocumentQuestionAnswering,
    AutoModelForSeq2SeqLM,
    AutoProcessor,
    AutoTokenizer,
    VitsModel,
    pipeline,
)
# Russian text-to-speech (MMS-TTS, a VITS model) for voicing the final answer.
mms_tts_model = VitsModel.from_pretrained("facebook/mms-tts-rus")
mms_tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-rus")

# LayoutLMv2 fine-tuned for extractive document question answering (DocVQA).
processor = AutoProcessor.from_pretrained(
    "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
)
model = AutoModelForDocumentQuestionAnswering.from_pretrained(
    "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
)

# Marian translation models: Russian -> English for the question,
# English -> Russian for the answer.
tokenizer_ru2en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
model_ru2en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
tokenizer_en2ru = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
model_en2ru = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")

# Whisper fine-tuned for Russian speech recognition (voice-input tab).
transcriber = pipeline(
    "automatic-speech-recognition", model="lorenzoncina/whisper-medium-ru"
)
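
# All five checkpoints above are loaded eagerly at import time, so a cold
# start of the Space is slow. An optional GPU placement sketch (assumption:
# a single CUDA device with enough memory for all models; the input tensors
# built in the functions below would then also need a matching .to(device)):
#
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   for m in (mms_tts_model, model, model_ru2en, model_en2ru):
#       m.to(device)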

def translate_ru2en(text):
    """Translate Russian text to English with the Marian ru-en model."""
    inputs = tokenizer_ru2en(text, return_tensors="pt")
    outputs = model_ru2en.generate(**inputs)
    return tokenizer_ru2en.decode(outputs[0], skip_special_tokens=True)


def translate_en2ru(text):
    """Translate English text to Russian with the Marian en-ru model."""
    inputs = tokenizer_en2ru(text, return_tensors="pt")
    outputs = model_en2ru.generate(**inputs)
    return tokenizer_en2ru.decode(outputs[0], skip_special_tokens=True)
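
# Usage sketch for the two helpers above, reusing the demo's own example
# question (indicative only; exact English wording depends on the checkpoint):
#
#   en = translate_ru2en("О чем данный документ?")  # "What is this document about?"
#   ru = translate_en2ru(en)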

def generate_answer_git(image, question):
    """Answer a question about a document image (extractive QA).

    The name notwithstanding, this runs the LayoutLMv2 DocVQA model loaded
    above: it predicts start/end token positions and decodes that span of
    the encoded input as the answer.
    """
    with torch.no_grad():
        encoding = processor(
            images=image,
            text=question,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        outputs = model(**encoding)
    # Pick the most likely answer span from the start/end logits.
    predicted_start_idx = outputs.start_logits.argmax(-1).item()
    predicted_end_idx = outputs.end_logits.argmax(-1).item()
    return processor.tokenizer.decode(
        encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
    )

def generate_answer(image, question):
    """Full pipeline: Russian question -> English QA -> Russian answer + audio."""
    question_en = translate_ru2en(question)
    print(f"Question in English: {question_en}")
    answer_en = generate_answer_git(image, question_en)
    print(f"Answer in English: {answer_en}")
    answer_ru = translate_en2ru(answer_en)
    # Voice the answer as well, so the return value matches the two Gradio
    # outputs (Textbox, Audio) declared below.
    return text_to_speech(answer_ru)

def text_to_speech(text):
    """Synthesize Russian speech for `text`; returns (text, (rate, waveform))."""
    inputs = mms_tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = mms_tts_model(**inputs).waveform
    audio = output.numpy()
    # MMS-TTS generates at the rate stored in the model config (16 kHz here).
    return text, (mms_tts_model.config.sampling_rate, audio.squeeze())

def transcribe(image, audio):
    """Voice input path: transcribe the recording, then answer as in the text tab."""
    if image is None or audio is None:
        # The live interface fires before both inputs are present;
        # return empty outputs until they are.
        return None, None
    sr, y = audio
    # Downmix stereo to mono and peak-normalize for the ASR pipeline.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    transcription_text = transcriber({"sampling_rate": sr, "raw": y})["text"]
    return generate_answer(image, transcription_text)

# Tab 1: typed question about an uploaded document image.
qa_interface = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Вопрос (на русском)", placeholder="Ваш вопрос"),  # "Question (in Russian)" / "Your question"
    ],
    outputs=[
        gr.Textbox(label="Ответ (на русском)"),  # "Answer (in Russian)"
        gr.Audio(label="Сгенерированное аудио"),  # "Generated audio"
    ],
    examples=[["doc.png", "О чем данный документ?"]],  # "What is this document about?"
    live=False,
)

# Tab 2: spoken question recorded from the microphone.
speech_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Image(type="pil"),
        gr.Audio(sources="microphone", label="Голосовой ввод"),  # "Voice input"
    ],
    outputs=[
        gr.Textbox(label="Ответ (на русском)"),  # "Answer (in Russian)"
        gr.Audio(label="Сгенерированное аудио"),  # "Generated audio"
    ],
    live=True,
)
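
# live=True makes Gradio call `transcribe` eagerly as the inputs change;
# the None guard at the top of `transcribe` keeps those early calls from
# crashing while the image or recording is still missing.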

interface = gr.TabbedInterface(
    [qa_interface, speech_interface],
    ["Текстовый вопрос", "Голосовой вопрос"],  # "Text question" / "Voice question"
    title="Демо визуального ответчика на вопросы (на русском)",  # "Visual question answering demo (in Russian)"
)

interface.launch(debug=True, share=True)
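
# Quick local check without the web UI (a sketch; assumes the "doc.png"
# referenced in the Interface examples above sits next to this script):
#
#   from PIL import Image
#   answer_ru, (rate, wav) = generate_answer(
#       Image.open("doc.png"), "О чем данный документ?"
#   )
#   print(answer_ru)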