Spaces:
Runtime error
Runtime error
File size: 4,041 Bytes
abcdc69 fbf438c 55b76ac abcdc69 36e06f1 c8128c2 3c9287e 36e06f1 c61a50f 36e06f1 c61a50f 36e06f1 abcdc69 658a6fd 55b76ac abcdc69 8bddd88 36e06f1 abcdc69 8bddd88 abcdc69 7ce07b6 55b76ac c8128c2 3f5c9cb cfcd1f4 c8128c2 3f5c9cb c8128c2 cfcd1f4 55b76ac abcdc69 658a6fd abcdc69 22f2eb7 abcdc69 f743c94 55b76ac 658a6fd c602ea4 658a6fd abcdc69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import gradio as gr
from transformers import (
AutoTokenizer,
AutoModelForSeq2SeqLM,
AutoProcessor,
AutoModelForDocumentQuestionAnswering,
pipeline,
)
import torch
import torchaudio
# Checkpoint identifiers, named once so the processor/model and
# tokenizer/model pairs cannot drift apart.
_DOC_QA_CHECKPOINT = "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
_RU_EN_CHECKPOINT = "Helsinki-NLP/opus-mt-ru-en"
_EN_RU_CHECKPOINT = "Helsinki-NLP/opus-mt-en-ru"
_ASR_CHECKPOINT = "artyomboyko/whisper-base-fine_tuned-ru"

# Document question answering: processor and model come from the same checkpoint.
processor = AutoProcessor.from_pretrained(_DOC_QA_CHECKPOINT)
model = AutoModelForDocumentQuestionAnswering.from_pretrained(_DOC_QA_CHECKPOINT)

# Seq2seq translation pairs, used by translate_ru2en / translate_en2ru.
tokenizer_ru2en = AutoTokenizer.from_pretrained(_RU_EN_CHECKPOINT)
model_ru2en = AutoModelForSeq2SeqLM.from_pretrained(_RU_EN_CHECKPOINT)
tokenizer_en2ru = AutoTokenizer.from_pretrained(_EN_RU_CHECKPOINT)
model_en2ru = AutoModelForSeq2SeqLM.from_pretrained(_EN_RU_CHECKPOINT)

# Speech recognition pipeline; transcribe() uses its feature extractor,
# model and tokenizer directly.
transcriber = pipeline("automatic-speech-recognition", model=_ASR_CHECKPOINT)
def translate_ru2en(text):
    """Translate ``text`` with the ``tokenizer_ru2en`` / ``model_ru2en`` pair.

    Args:
        text: String to translate (Russian is expected by the ru-en model).

    Returns:
        The first generated sequence decoded with special tokens removed,
        or an empty string when ``text`` is ``None``, empty or
        whitespace-only.
    """
    # Nothing to translate: skip the model call rather than letting the
    # seq2seq model generate output from an empty prompt.
    if not text or not text.strip():
        return ""
    # truncation=True clips over-long input to the tokenizer's maximum
    # length instead of passing it to the model unbounded.
    inputs = tokenizer_ru2en(text, return_tensors="pt", truncation=True)
    outputs = model_ru2en.generate(**inputs)
    return tokenizer_ru2en.decode(outputs[0], skip_special_tokens=True)
def translate_en2ru(text):
    """Translate ``text`` with the ``tokenizer_en2ru`` / ``model_en2ru`` pair.

    Args:
        text: String to translate (English is expected by the en-ru model).

    Returns:
        The first generated sequence decoded with special tokens removed,
        or an empty string when ``text`` is ``None``, empty or
        whitespace-only.
    """
    # Nothing to translate: skip the model call rather than letting the
    # seq2seq model generate output from an empty prompt.
    if not text or not text.strip():
        return ""
    # truncation=True clips over-long input to the tokenizer's maximum
    # length instead of passing it to the model unbounded.
    inputs = tokenizer_en2ru(text, return_tensors="pt", truncation=True)
    outputs = model_en2ru.generate(**inputs)
    return tokenizer_en2ru.decode(outputs[0], skip_special_tokens=True)
def generate_answer_git(image, question):
    """Extract an answer span for ``question`` from a document ``image``.

    The module-level ``processor`` encodes the image/question pair, the
    module-level ``model`` scores every token as a possible answer start and
    end, and the highest-scoring start/end positions delimit the answer.

    Args:
        image: Document image passed to ``processor`` as ``images``.
        question: Question text passed to ``processor`` as ``text``.

    Returns:
        The decoded tokens between the predicted start and end positions
        (inclusive), with special tokens removed and surrounding whitespace
        stripped. An empty string when the predicted end precedes the
        predicted start.
    """
    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        encoding = processor(
            images=image,
            text=question,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        outputs = model(**encoding)

    # .item() requires a single prediction, i.e. a batch of exactly one.
    start_idx = outputs.start_logits.argmax(-1).item()
    end_idx = outputs.end_logits.argmax(-1).item()

    # Start and end are predicted independently, so the span can be inverted;
    # treat that as "no answer" explicitly.
    if end_idx < start_idx:
        return ""

    answer_ids = encoding.input_ids[0][start_idx : end_idx + 1]
    # skip_special_tokens keeps markers such as [CLS]/[SEP] out of the answer
    # so they are not handed on to the translation step.
    return processor.tokenizer.decode(answer_ids, skip_special_tokens=True).strip()
def generate_answer(image, question):
    """Answer a Russian question about a document image, in Russian.

    The question is translated with ``translate_ru2en``, answered by
    ``generate_answer_git``, and the answer is translated back with
    ``translate_en2ru``. The intermediate English question and answer are
    printed to stdout.

    Args:
        image: Document image forwarded to ``generate_answer_git``.
        question: Question text.

    Returns:
        The translated answer, or an empty string when ``image`` is ``None``
        or ``question`` is empty or whitespace-only.
    """
    # The UI can submit with no image or an empty textbox; return an empty
    # answer instead of failing inside the processor or translator.
    if image is None or not question or not question.strip():
        return ""

    question_en = translate_ru2en(question)
    print(f"Вопрос на английском: {question_en}")
    answer_en = generate_answer_git(image, question_en)
    print(f"Ответ на английском: {answer_en}")
    return translate_en2ru(answer_en)
def transcribe(image, audio):
    """Transcribe a spoken question and answer it against a document image.

    The recording is reduced to one channel, resampled to 16 kHz if needed,
    peak-normalised, transcribed with the components of the module-level
    ``transcriber`` pipeline, and the resulting text is passed to
    ``generate_answer`` together with ``image``.

    Args:
        image: Document image forwarded to ``generate_answer``.
        audio: ``(sample_rate, samples)`` pair. ``samples`` must expose
            ``ndim``, ``shape`` and ``mean(axis=1)``; presumably the NumPy
            array delivered by ``gr.Audio`` (confirm against the caller).

    Returns:
        The value returned by ``generate_answer``, or ``None`` when the image
        or audio is missing or the recording is empty or completely silent.
    """
    if not image or not audio:
        return

    sr, y = audio
    # Multi-channel input: average across axis 1 to get a single channel.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y_tensor = torch.tensor(y, dtype=torch.float32)
    print(y.shape)

    # An empty recording makes torch.max raise and an all-zero one would be
    # divided by zero during normalisation; neither holds anything to
    # transcribe, so stop here.
    if y_tensor.numel() == 0 or not torch.any(y_tensor):
        return

    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        y_tensor = resampler(y_tensor)
        sr = 16000

    # Peak-normalise to [-1, 1]; skip the division if the peak is zero.
    peak = torch.max(torch.abs(y_tensor))
    if peak > 0:
        y_tensor = y_tensor / peak
    y = y_tensor.numpy()
    print(y.shape)

    input_features = transcriber.feature_extractor(
        y, sampling_rate=sr, return_tensors="pt"
    ).input_features
    transcription = transcriber.model.generate(input_features)
    transcription_text = transcriber.tokenizer.decode(
        transcription[0], skip_special_tokens=True
    )
    return generate_answer(image, transcription_text)
# Tab 1: typed question about an uploaded image, handled by generate_answer.
qa_interface = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Вопрос (на русском)", placeholder="Ваш вопрос"),
    ],
    outputs=gr.Textbox(label="Ответ (на русском)"),
    examples=[["doc.png", "О чем данный документ?"]],
    live=False,
)

# Tab 2: spoken question recorded from the microphone, handled by transcribe.
speech_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Image(type="pil"),
        gr.Audio(sources="microphone", label="Голосовой ввод"),
    ],
    outputs=gr.Textbox(label="Распознанный текст"),
    live=True,
)

# Combine both tabs into one app and start serving it.
interface = gr.TabbedInterface(
    [qa_interface, speech_interface],
    ["Текстовый вопрос", "Голосовой вопрос"],
    title="Демо визуального ответчика на вопросы (на русском)",
)
interface.launch(debug=True, share=True)
|