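"""Gradio demo: visual question answering over document images, in Russian.

Pipeline: a Russian question is translated to English (OPUS-MT), answered by
LayoutLMv2 fine-tuned on DocVQA, and the answer is translated back to Russian.
A second tab accepts spoken questions transcribed with Whisper.
"""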
import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoProcessor,
    AutoModelForDocumentQuestionAnswering,
    pipeline,
)
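# Document QA model: LayoutLMv2 fine-tuned on DocVQA (extractive question
# answering over document images; the processor also runs OCR on the image)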
processor = AutoProcessor.from_pretrained(
    "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
)
model = AutoModelForDocumentQuestionAnswering.from_pretrained(
    "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
)
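# Machine translation: Russian -> English for questions, English -> Russian for answers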
tokenizer_ru2en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
model_ru2en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
tokenizer_en2ru = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
model_en2ru = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
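# Speech recognition: Whisper medium fine-tuned for Russian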
transcriber = pipeline(
    "automatic-speech-recognition", model="lorenzoncina/whisper-medium-ru"
)
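# Helper functions wrapping the two OPUS-MT translation models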
def translate_ru2en(text):
    inputs = tokenizer_ru2en(text, return_tensors="pt")
    outputs = model_ru2en.generate(**inputs)
    translated_text = tokenizer_ru2en.decode(outputs[0], skip_special_tokens=True)
    return translated_text
def translate_en2ru(text):
    inputs = tokenizer_en2ru(text, return_tensors="pt")
    outputs = model_en2ru.generate(**inputs)
    translated_text = tokenizer_en2ru.decode(outputs[0], skip_special_tokens=True)
    return translated_text
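# Extractive QA: the model predicts start/end logits over the OCR'd tokens,
# and the answer is the decoded token span between the two argmax positions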
def generate_answer_git(image, question):
    with torch.no_grad():
        encoding = processor(
            images=image,
            text=question,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        outputs = model(**encoding)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    predicted_start_idx = start_logits.argmax(-1).item()
    predicted_end_idx = end_logits.argmax(-1).item()
    return processor.tokenizer.decode(
        encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
    )
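# Full text pipeline: Russian question -> English -> model answer -> Russian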
def generate_answer(image, question):
    question_en = translate_ru2en(question)
    print(f"Question in English: {question_en}")
    answer_en = generate_answer_git(image, question_en)
    print(f"Answer in English: {answer_en}")
    answer_ru = translate_en2ru(answer_en)
    return answer_ru
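# Voice pipeline: transcribe Russian speech with Whisper, then answer as above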
def transcribe(image, audio):
    if image is None or audio is None:
        return ""
    sr, y = audio
    # Downmix stereo to mono
    if y.ndim > 1:
        y = y.mean(axis=1)
    # The ASR pipeline expects float audio in [-1, 1], not raw int16 samples
    y = y.astype(np.float32)
    peak = np.abs(y).max()
    if peak > 0:
        y /= peak
    transcription_text = transcriber({"sampling_rate": sr, "raw": y})["text"]
    return generate_answer(image, transcription_text)
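# Tab 1: typed question in Russian about an uploaded document image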
qa_interface = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Вопрос (на русском)", placeholder="Ваш вопрос"),  # "Question (in Russian)" / "Your question"
    ],
    outputs=gr.Textbox(label="Ответ (на русском)"),  # "Answer (in Russian)"
    examples=[["doc.png", "О чем данный документ?"]],  # "What is this document about?"
    live=False,
)
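# Tab 2: spoken question in Russian, recorded from the microphone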
speech_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Image(type="pil"),
        gr.Audio(sources=["microphone"], label="Голосовой ввод"),  # "Voice input"
    ],
    # transcribe() returns the final answer, not the raw transcription
    outputs=gr.Textbox(label="Ответ (на русском)"),  # "Answer (in Russian)"
    live=True,
)
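# Combine both tabs into a single demo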
interface = gr.TabbedInterface(
    [qa_interface, speech_interface],
    ["Текстовый вопрос", "Голосовой вопрос"],  # "Text question" / "Voice question"
    title="Демо визуального ответчика на вопросы (на русском)",  # "Visual question answering demo (in Russian)"
)
interface.launch(debug=True, share=True)