"""Gradio app that generates question-answer pairs from an uploaded PDF.

The text of the PDF is extracted with PyMuPDF (``fitz``), split into
sentence-like fragments, and each fragment is passed to a seq2seq model
that produces a question followed by its answer.
"""

import os
import re

import fitz
import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

MODEL_NAME = "potsawee/t5-large-generation-squad-QuestionAnswer"

# Zero-width split point immediately after '.', '!' or '?'. The terminator
# stays attached to the preceding fragment.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])')

# Loaded once at import time so every request reuses the same weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


def extract_text_from_pdf(pdf_file_path):
    """Return the concatenated text of every page in the PDF.

    Args:
        pdf_file_path: Path of the PDF file to open with ``fitz.open``.

    Returns:
        A single string with the text of all pages, in page order.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_file_path) as doc:
        return "".join(page.get_text() for page in doc)


def generate_question_answer_pairs(pdf_file):
    """Generate formatted question-answer pairs from an uploaded PDF.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input. May be
            ``None`` (nothing uploaded), a filesystem path, or an object
            exposing the path through its ``name`` attribute.

    Returns:
        A prompt asking for an upload when ``pdf_file`` is ``None``;
        otherwise a string made of ``Question: ...\\nAnswer: ...`` entries,
        each followed by a blank line. The string is empty when no fragment
        yields an output containing a question mark.
    """
    if pdf_file is None:
        return "Please upload a PDF file"

    # Accept both a plain path and a file-like wrapper carrying ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name
    pdf_text = extract_text_from_pdf(pdf_path)

    entries = []
    for fragment in _SENTENCE_BOUNDARY.split(pdf_text):
        sentence = fragment.strip()
        if not sentence:
            # The split always leaves an empty or whitespace-only tail after
            # the final terminator; there is nothing to ask a question about.
            continue

        input_ids = tokenizer.encode(sentence, return_tensors="pt")
        outputs = model.generate(
            input_ids, max_length=100, num_return_sequences=1
        )
        question_answer = tokenizer.decode(
            outputs[0], skip_special_tokens=True
        )

        # The first '?' ends the question; everything after it is the answer.
        # partition() keeps any later '?' inside the answer instead of
        # dropping the text that follows it.
        question, mark, answer = question_answer.partition("?")
        if not mark:
            # No question mark: the output cannot be divided into a pair.
            continue
        entries.append(
            f"Question: {question}?\nAnswer: {answer.strip()}\n\n"
        )

    return "".join(entries)


title = "Question-Answer Pairs Generation"
input_file = gr.File(label="Upload a PDF file")
output_text = gr.Textbox()

interface = gr.Interface(
    fn=generate_question_answer_pairs,
    inputs=input_file,
    outputs=output_text,
    title=title,
)
interface.launch()