Spaces:
Sleeping
Sleeping
import gradio as gr | |
import easyocr | |
import cv2 | |
import numpy as np | |
from PIL import Image | |
import pdf2image | |
import tempfile | |
from langchain_groq import ChatGroq | |
import re | |
import os | |
# Shared EasyOCR reader, constructed once at import time for English text.
reader = easyocr.Reader(["en"])

# Shared Groq-hosted chat model used for summarization. The API key is read
# from the GROQ_API_KEY environment variable (None when it is not set).
llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    groq_api_key=os.environ.get("GROQ_API_KEY"),
    temperature=0,
)
# Utility to clean up unwanted characters.
#
# Decoration characters removed from OCR / LLM text: asterisk, tilde, bullet
# (U+2022), black circle (U+25CF), small black square (U+25AA), white bullet
# (U+25E6), rightwards arrow (U+27A1) and the emoji variation selector
# (U+FE0F). They are spelled as escapes so the character class cannot be
# corrupted by a re-encoding of this file, which previously turned it into a
# set of Greek letters and quotes that were stripped from legitimate text.
# NOTE(review): the class was reconstructed from mis-decoded source text;
# confirm the list matches the intended set of bullet glyphs.
_DECORATION_RE = re.compile(r"[*~\u2022\u25cf\u25aa\u25e6\u27a1\ufe0f]+")

# Two or more consecutive whitespace characters (including newlines).
_MULTISPACE_RE = re.compile(r"\s{2,}")


def clean_text(text):
    """Strip bullet/markdown decoration and collapse whitespace runs.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every decoration character removed, each run of two or
        more whitespace characters replaced by a single space, and leading
        and trailing whitespace trimmed.
    """
    text = _DECORATION_RE.sub("", text)  # remove bullet points and asterisks
    text = _MULTISPACE_RE.sub(" ", text)  # remove excessive spacing
    return text.strip()
def extract_text_and_summarize(file):
    """OCR an uploaded image or PDF and summarize the recognized text.

    Args:
        file: The value delivered by the file-upload input. Either an object
            whose ``.name`` attribute is the path of the uploaded file, the
            path itself as a string, or ``None`` when nothing was uploaded.

    Returns:
        A ``(extracted_text, summary)`` tuple of strings. When the input
        cannot be processed (no upload, unreadable image, no text found) the
        first element is a short explanatory message and the second is ``""``.
    """
    # Clicking the button without an upload delivers None; report it instead
    # of failing on the attribute access below.
    if file is None:
        return "No file uploaded.", ""

    # Accept both file-like objects exposing ``.name`` and plain path strings.
    file_path = getattr(file, "name", file)

    # If it's a PDF, rasterize it and OCR the first page only.
    if file_path.lower().endswith(".pdf"):
        pages = pdf2image.convert_from_path(file_path)
        if not pages:
            return "No readable text found.", ""
        image = np.array(pages[0])
    else:
        image = cv2.imread(file_path)
        # cv2.imread reports an unreadable file by returning None rather than
        # raising, so check before handing the result to the OCR reader.
        if image is None:
            return "Could not read the uploaded image.", ""

    # OCR: index 1 of each result is used as the recognized string.
    results = reader.readtext(image)
    extracted_text = clean_text(" ".join(entry[1] for entry in results))

    # clean_text already trims whitespace, so an empty string means no text.
    if not extracted_text:
        return "No readable text found.", ""

    # LLM summarization
    messages = [
        {"role": "system", "content": "Your job is to summarize the given research paper and list its key sub-domains and topics clearly."},
        {"role": "user", "content": extracted_text},
    ]
    result = llm.invoke(messages)
    summarized_text = clean_text(result.content)

    return extracted_text, summarized_text
# Build Gradio UI
# Introductory markdown shown at the top of the page.
_INTRO_MARKDOWN = """
# π§ Research Paper Summarizer
Upload an **image** or **PDF** of a research paper. This app will:
- Extract text using OCR
- Summarize the content
- List key subdomains and research topics
β‘ Powered by EasyOCR & LLaMA-3 via Groq
"""

# File extensions offered by the upload widget.
_ACCEPTED_FILE_TYPES = [".png", ".jpg", ".jpeg", ".pdf"]

with gr.Blocks(title="Research Paper Summarizer") as iface:
    gr.Markdown(_INTRO_MARKDOWN)

    # Row 1: the upload widget.
    with gr.Row():
        file_input = gr.File(
            label="π Upload Research Paper (Image or PDF)",
            file_types=_ACCEPTED_FILE_TYPES,
        )

    # Row 2: read-only output boxes for the OCR text and the summary.
    with gr.Row():
        extracted_box = gr.Textbox(
            label="π Extracted Text",
            lines=10,
            interactive=False,
        )
        summary_box = gr.Textbox(
            label="π Summarized Topics & Subdomains",
            lines=10,
            interactive=False,
        )

    # The button feeds the upload to the pipeline and fills both boxes.
    run_button = gr.Button("π Run Summarizer")
    run_button.click(
        fn=extract_text_and_summarize,
        inputs=file_input,
        outputs=[extracted_box, summary_box],
    )

iface.launch(share=True)