import os
import warnings
import torch
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
import pdfplumber
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
# Suppress warnings globally
warnings.filterwarnings("ignore")
# Setup models
device = "cuda:0" if torch.cuda.is_available() else "cpu"
whisper_model_id = "openai/whisper-medium"
# Load Whisper model and processor
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_id)
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)
# Create Whisper pipeline
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper_model,
    tokenizer=whisper_processor.tokenizer,
    feature_extractor=whisper_processor.feature_extractor,
    device=device
)
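# For long recordings (over ~30 seconds), chunked transcription can be enabled by
# passing chunk_length_s to the call, e.g. whisper_pipe(file_path, chunk_length_s=30).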
# Setup FLAN-T5 model and tokenizer
flan_t5_model_id = "google/flan-t5-large"
flan_t5_tokenizer = T5Tokenizer.from_pretrained(flan_t5_model_id)
flan_t5_model = T5ForConditionalGeneration.from_pretrained(flan_t5_model_id)
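# Note: the FLAN-T5 model is left on its default (CPU) placement; to use the GPU, both the
# model (flan_t5_model.to(device)) and the tokenized inputs would need to be moved to the same device.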
# Function to transcribe audio files
def transcribe_audio(file_path):
    result = whisper_pipe(file_path)
    return result['text']
# Function to extract text and questions from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    questions = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
                # Extract questions based on numbering
                lines = page_text.split("\n")
                for line in lines:
                    if line.strip() and line.strip()[0].isdigit():
                        questions.append(line.strip())
    return text, questions
# Function to generate form data with FLAN-T5
def generate_form_data(text, questions):
    responses = []
    for question in questions:
        input_text = f"""The following text is a transcript from an audio recording. Read the text and answer the following question in a complete sentence.\n\nText: {text}\n\nQuestion: {question}\n\nAnswer:"""
        # Tokenize the input text
        inputs = flan_t5_tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True)
        # Generate the answer using the model
        with torch.no_grad():
            outputs = flan_t5_model.generate(**inputs, max_length=100)
        # Decode the generated text
        generated_text = flan_t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Handle incomplete or missing answers
        if not generated_text.strip():
            generated_text = "The answer to this question is not present in the script."
        elif len(generated_text.strip()) < 10:  # Arbitrary threshold for short/incomplete answers
            # Retry with a prompt that asks for a more detailed answer
            input_text = f"""Based on the following transcript, provide a more detailed answer to the question.\n\nText: {text}\n\nQuestion: {question}\n\nAnswer:"""
            inputs = flan_t5_tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True)
            with torch.no_grad():
                outputs = flan_t5_model.generate(**inputs, max_length=100)
            generated_text = flan_t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Append question and response
        responses.append(f"Question: {question}\nAnswer: {generated_text.strip()}")
    return "\n\n".join(responses)
# Function to save responses to PDF
def save_responses_to_pdf(responses, output_pdf_path):
    document = SimpleDocTemplate(output_pdf_path, pagesize=letter)
    styles = getSampleStyleSheet()
    # Custom style for responses
    response_style = ParagraphStyle(
        name='ResponseStyle',
        parent=styles['BodyText'],
        fontSize=10,
        spaceAfter=12
    )
    content = []
    # Add heading
    heading = Paragraph("<b>FillUp by Umar Majeed</b>", styles['Title'])
    content.append(heading)
    content.append(Spacer(1, 12))
    for index, response in enumerate(responses, start=1):
        # Add the response number and content
        file_heading = Paragraph(f"<b>File {index}:</b>", styles['Heading2'])
        response_text = Paragraph(response.replace("\n", "<br/>"), response_style)
        content.append(file_heading)
        content.append(Spacer(1, 6))  # Space between heading and response
        content.append(response_text)
        content.append(Spacer(1, 18))  # Space between responses
    document.build(content)
# Gradio interface function
def process_files(audio_files, pdf_file):
    responses = []
    for audio_file in audio_files:
        # Transcribe audio
        transcribed_text = transcribe_audio(audio_file.name)
        # Extract text and form fields from PDF
        pdf_text, pdf_questions = extract_text_from_pdf(pdf_file.name)
        # Generate form data
        form_data = generate_form_data(transcribed_text, pdf_questions)
        responses.append(form_data)
    # Save all responses to a PDF
    output_pdf_path = "output.pdf"
    save_responses_to_pdf(responses, output_pdf_path)
    return output_pdf_path
# Gradio interface definition
interface = gr.Interface(
    fn=process_files,
    inputs=[
        gr.Files(label="Upload Audio Files", type="file"),
        gr.File(label="Upload PDF File", type="file")
    ],
    outputs=gr.File(label="Download Output PDF"),
    title="FillUp by Umar Majeed",
    description="Upload audio files and a PDF file. The application will transcribe the audio, extract questions from the PDF, and generate a response PDF."
)
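# Note: type="file" (handlers receive objects exposing a .name attribute) matches the Gradio 3.x
# file components; on Gradio 4+, type="filepath" is the closest equivalent, and the handlers would
# then receive plain path strings instead of file objects.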
# Launch the interface
interface.launch()