import os
import warnings
import torch
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
import pdfplumber
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle

# Suppress warnings globally
warnings.filterwarnings("ignore")

# Setup models
device = "cuda:0" if torch.cuda.is_available() else "cpu"
whisper_model_id = "openai/whisper-medium"
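# Note: whisper-medium is ~769M parameters; a smaller checkpoint such as "openai/whisper-small" trades accuracy for speed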

# Load Whisper model and processor
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_id)
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)
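# The processor bundles Whisper's tokenizer (for decoding) and feature extractor (raw audio -> log-mel spectrograms)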

# Create Whisper pipeline (chunk_length_s enables chunked transcription of audio longer than 30 seconds)
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper_model,
    tokenizer=whisper_processor.tokenizer,
    feature_extractor=whisper_processor.feature_extractor,
    chunk_length_s=30,
    device=device
)

# Setup FLAN-T5 model and tokenizer
flan_t5_model_id = "google/flan-t5-large"
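# Note: flan-t5-large is ~780M parameters; "google/flan-t5-base" is a lighter alternative for constrained hardware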
flan_t5_tokenizer = T5Tokenizer.from_pretrained(flan_t5_model_id)
flan_t5_model = T5ForConditionalGeneration.from_pretrained(flan_t5_model_id).to(device)

# Function to transcribe audio files
def transcribe_audio(file):
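    # The ASR pipeline accepts a file path and returns a dict with the transcript under 'text'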
    result = whisper_pipe(file)
    return result['text']

# Function to extract text and questions from PDF
def extract_text_from_pdf(pdf_file):
    text = ""
    questions = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
                # Heuristic: treat any line that starts with a digit (e.g. "1. ...") as a numbered question
                lines = page_text.split("\n")
                for line in lines:
                    stripped = line.strip()
                    if stripped and stripped[0].isdigit():
                        questions.append(stripped)
    return text, questions

# Function to generate form data with FLAN-T5
def generate_form_data(text, questions):
    responses = []
    for question in questions:
        input_text = f"""The following text is a transcript from an audio recording. Read the text and answer the following question in a complete sentence.\n\nText: {text}\n\nQuestion: {question}\n\nAnswer:"""

        # Tokenize the input text
        inputs = flan_t5_tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True).to(device)
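        # Transcripts longer than 1024 tokens are truncated above, so answers late in a long recording may be missed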

        # Generate the answer using the model
        with torch.no_grad():
            outputs = flan_t5_model.generate(**inputs, max_length=100)

        # Decode the generated text
        generated_text = flan_t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Handle incomplete or missing answers
        if not generated_text.strip():
            generated_text = "The answer to this question is not present in the script."
        elif len(generated_text.strip()) < 10:  # Arbitrary threshold for short/incomplete answers
            input_text = f"""Based on the following transcript, provide a more detailed answer to the question.\n\nText: {text}\n\nQuestion: {question}\n\nAnswer:"""
            inputs = flan_t5_tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True)
            outputs = flan_t5_model.generate(**inputs, max_length=100)
            generated_text = flan_t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Append question and response
        responses.append(f"Question: {question}\nAnswer: {generated_text.strip()}")

    return "\n\n".join(responses)

# Function to save responses to PDF
def save_responses_to_pdf(responses, output_pdf_path):
    document = SimpleDocTemplate(output_pdf_path, pagesize=letter)
    styles = getSampleStyleSheet()

    # Custom style for responses
    response_style = ParagraphStyle(
        name='ResponseStyle',
        parent=styles['BodyText'],
        fontSize=10,
        spaceAfter=12
    )

    content = []
    for index, response in enumerate(responses, start=1):
        # Add the response number and content
        heading = Paragraph(f"<b>File {index}:</b>", styles['Heading2'])
        response_text = Paragraph(response.replace("\n", "<br/>"), response_style)

        content.append(heading)
        content.append(Spacer(1, 6))  # Space between heading and response
        content.append(response_text)
        content.append(Spacer(1, 18))  # Space between responses

    document.build(content)

# Gradio interface function
def process_files(audio_files, pdf_file):
    # Extract the text and questions from the PDF once; they apply to every audio file
    pdf_text, pdf_questions = extract_text_from_pdf(pdf_file.name)

    responses = []
    for audio_file in audio_files:
        # Transcribe audio
        transcribed_text = transcribe_audio(audio_file.name)
        # Generate form data from the transcript and the extracted questions
        form_data = generate_form_data(transcribed_text, pdf_questions)
        responses.append(form_data)
    
    # Save all responses to a PDF
    output_pdf_path = "output.pdf"
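    # Note: a fixed filename means concurrent requests overwrite the same file; a per-request temporary file would avoid this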
    save_responses_to_pdf(responses, output_pdf_path)
    
    # Return the PDF path and the generated responses
    return output_pdf_path, "\n\n".join(responses)

# Gradio interface definition
interface = gr.Interface(
    fn=process_files,
    inputs=[
        gr.Files(label="Upload Audio Dataset"),
        gr.File(label="Upload PDF File with Questions")
    ],
    outputs=[
        gr.File(label="Download Output PDF"),
        gr.Textbox(label="Generated Responses", lines=20, placeholder="The responses will be shown here...")
    ],
    title="FillUp by Umar Majeed",
    description="""This is a beta version of FillUp, an application designed to auto-fill predefined forms using call data. 
    Upload the audio files from which you want to extract text and a PDF form that contains the questions to be answered. 
    At the end, you will receive a PDF file with the responses.

    For reference, you can download a sample form from [https://drive.google.com/drive/folders/13LolIqxufzysqNoGMfuCAvpA9AkbRfL7?usp=drive_link]. Use this dummy data to understand how the model works."""
)

# Launch the interface
interface.launch()
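# When running in a notebook or behind a firewall, a shareable public link can be requested instead:
# interface.launch(share=True)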