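# Visual document question answering (DocVQA) demo in Russian.
#
# Pipeline: a Russian question about a document image (typed or spoken) is
# translated to English, answered extractively by a LayoutLMv2 model
# fine-tuned on DocVQA, translated back to Russian, and synthesized to
# speech with MMS-TTS. Served as a two-tab Gradio app.
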
import gradio as gr
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoProcessor,
    AutoModelForDocumentQuestionAnswering,
    pipeline,
    VitsModel,
)
import torch
import numpy as np
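
# Text-to-speech: VITS model from Facebook's MMS project, Russian voice.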
mms_tts_model = VitsModel.from_pretrained("facebook/mms-tts-rus")
mms_tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-rus")
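
# Document question answering: LayoutLMv2 fine-tuned on DocVQA.
# The model is English-only, hence the translation steps around it.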
processor = AutoProcessor.from_pretrained(
    "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
)
model = AutoModelForDocumentQuestionAnswering.from_pretrained(
    "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
)
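
# Machine translation: OPUS-MT models for Russian -> English and English -> Russian.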
tokenizer_ru2en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
model_ru2en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
tokenizer_en2ru = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
model_en2ru = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
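
# Speech recognition: Whisper medium fine-tuned on Russian.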
transcriber = pipeline(
    "automatic-speech-recognition", model="lorenzoncina/whisper-medium-ru"
)
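
# Translate Russian text to English with the OPUS-MT ru->en model.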
def translate_ru2en(text):
    inputs = tokenizer_ru2en(text, return_tensors="pt")
    outputs = model_ru2en.generate(**inputs)
    translated_text = tokenizer_ru2en.decode(outputs[0], skip_special_tokens=True)
    return translated_text
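
# Translate English text to Russian with the OPUS-MT en->ru model.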
def translate_en2ru(text):
    inputs = tokenizer_en2ru(text, return_tensors="pt")
    outputs = model_en2ru.generate(**inputs)
    translated_text = tokenizer_en2ru.decode(outputs[0], skip_special_tokens=True)
    return translated_text
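
# Extractive QA over the document image: the model predicts start/end token
# positions, and the answer is decoded from that token span. Despite the
# "_git" suffix, this calls the LayoutLMv2 model loaded above.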
def generate_answer_git(image, question):
    with torch.no_grad():
        encoding = processor(
            images=image,
            text=question,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        outputs = model(**encoding)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        predicted_start_idx = start_logits.argmax(-1).item()
        predicted_end_idx = end_logits.argmax(-1).item()
    return processor.tokenizer.decode(
        encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
    )
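
# Full QA round trip: Russian question -> English -> model answer -> Russian.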
def generate_answer(image, question):
    question_en = translate_ru2en(question)
    print(f"Question in English: {question_en}")
    answer_en = generate_answer_git(image, question_en)
    print(f"Answer in English: {answer_en}")
    answer_ru = translate_en2ru(answer_en)
    return answer_ru
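
# Synthesize speech for the answer text; returns the text plus a
# (sample_rate, waveform) tuple in the format gr.Audio expects.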
def text_to_speech(text):
    inputs = mms_tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = mms_tts_model(**inputs).waveform
    audio = output.numpy()
    # Use the model's own sampling rate (16 kHz for MMS) instead of hardcoding it.
    return text, (mms_tts_model.config.sampling_rate, audio.squeeze())
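
# Voice tab: transcribe the recorded question, then run the QA + TTS pipeline.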
def transcribe_pipeline(image, audio):
    if image is None or audio is None:
        return None, None
    sr, y = audio
    # Downmix stereo to mono and peak-normalize for Whisper.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid division by zero on silent input
        y /= peak
    transcription_text = transcriber({"sampling_rate": sr, "raw": y})["text"]
    return text_to_speech(generate_answer(image, transcription_text))
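
# Text tab: run the QA + TTS pipeline directly on the typed question.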
def text_pipeline(image, question):
    if image is None or not question:
        return None, None
    return text_to_speech(generate_answer(image, question))
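
# Tab 1: typed Russian question about an uploaded document image.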
qa_interface = gr.Interface(
    fn=text_pipeline,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Question (in Russian)", placeholder="Your question"),
    ],
    outputs=[
        gr.Textbox(label="Answer (in Russian)"),
        gr.Audio(label="Generated audio"),
    ],
    # The example question stays in Russian because the pipeline expects
    # Russian input; it reads "What is this document about?".
    examples=[["doc.png", "О чем данный документ?"]],
    live=False,
)
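
# Tab 2: spoken Russian question, transcribed with Whisper first.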
speech_interface = gr.Interface(
    fn=transcribe_pipeline,
    inputs=[
        gr.Image(type="pil"),
        gr.Audio(sources="microphone", label="Voice input"),
    ],
    outputs=[
        gr.Textbox(label="Answer (in Russian)"),
        gr.Audio(label="Generated audio"),
    ],
    live=True,
)
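
# Combine both tabs into a single app.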
interface = gr.TabbedInterface(
    [qa_interface, speech_interface],
    ["Text question", "Voice question"],
    title="Visual question answering demo (in Russian)",
)
interface.launch(debug=True, share=True)