omvishesh's picture
Update app.py
e4ff482 verified
raw
history blame
2.48 kB
import gradio as gr
import easyocr
import cv2
import numpy as np
from PIL import Image
import pdf2image
import tempfile
from langchain_groq import ChatGroq
import re
import os
# OCR engine: built once at import time and shared by every request.
# Only the English recognition model is loaded.
reader = easyocr.Reader(["en"])

# Chat model used for summarization, served through Groq. The API key is
# read from the GROQ_API_KEY environment variable; temperature 0 is requested.
llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    groq_api_key=os.getenv("GROQ_API_KEY"),
    temperature=0,
)
# Decorative characters removed from OCR/LLM text: asterisk, bullet (U+2022),
# black circle (U+25CF), small black square (U+25AA), four-pointed star
# (U+2726), rightwards arrow (U+27A1), the emoji variation selector (U+FE0F)
# and tilde. They are written as escapes so the pattern cannot be corrupted
# when this file is re-encoded (a mis-decoded class would strip real text such
# as the Greek letter beta or an en dash).
_DECORATION_RE = re.compile(r"[*\u2022\u25CF\u25AA\uFE0F\u2726\u27A1~]+")
# Any run of two or more whitespace characters (spaces, tabs, newlines).
_WHITESPACE_RUN_RE = re.compile(r"\s{2,}")


def clean_text(text: str) -> str:
    """Strip bullet/markdown decoration and collapse excess whitespace.

    Args:
        text: Raw text, e.g. OCR output or an LLM response.

    Returns:
        The text with decoration characters removed, every run of two or more
        whitespace characters replaced by a single space, and leading and
        trailing whitespace trimmed. Empty or ``None`` input yields ``""``.
    """
    if not text:
        return ""
    without_decoration = _DECORATION_RE.sub("", text)
    collapsed = _WHITESPACE_RUN_RE.sub(" ", without_decoration)
    return collapsed.strip()
def _load_first_page_image(file_path):
    """Load the upload at *file_path* as an image array, or None if unreadable."""
    if file_path.lower().endswith(".pdf"):
        # Only the first page is summarized, so render just that page
        # instead of rasterizing the whole document.
        pages = pdf2image.convert_from_path(file_path, first_page=1, last_page=1)
        return np.array(pages[0]) if pages else None
    # cv2.imread reports an unreadable or unsupported file by returning None
    # rather than raising.
    return cv2.imread(file_path)


def extract_text_and_summarize(file):
    """OCR the uploaded image or PDF and summarize the recognized text.

    For a PDF only the first page is processed.

    Args:
        file: The upload handed over by the Gradio ``File`` component: either
            a path string or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        A ``(extracted_text, summary)`` tuple of strings. When there is no
        upload, the file cannot be read, or OCR finds no text, the first
        element is a short message and the second is an empty string.
    """
    if file is None:
        return "No file uploaded.", ""

    # Accept both a plain path string and a file wrapper with a `.name` path.
    file_path = file if isinstance(file, str) else file.name

    image = _load_first_page_image(file_path)
    if image is None:
        return "Could not read the uploaded file.", ""

    # OCR: the recognized string is taken from index 1 of each result entry.
    results = reader.readtext(image)
    extracted_text = clean_text(" ".join(entry[1] for entry in results))
    if not extracted_text:
        return "No readable text found.", ""

    # LLM summarization
    messages = [
        {"role": "system", "content": "Your job is to summarize the given research paper and list its key sub-domains and topics clearly."},
        {"role": "user", "content": extracted_text},
    ]
    result = llm.invoke(messages)
    return extracted_text, clean_text(result.content)
# Build Gradio UI
with gr.Blocks(title="Research Paper Summarizer") as iface:
    # The Markdown source is kept flush-left on purpose: indenting it would
    # make Markdown render the text as a code block.
    gr.Markdown(
        """
# 🧠 Research Paper Summarizer
Upload an **image** or **PDF** of a research paper. This app will:
- Extract text using OCR
- Summarize the content
- List key subdomains and research topics
⚡ Powered by EasyOCR & LLaMA-3 via Groq
"""
    )

    # Upload row: accepts common image formats and PDF.
    with gr.Row():
        file_input = gr.File(
            label="📄 Upload Research Paper (Image or PDF)",
            file_types=[".png", ".jpg", ".jpeg", ".pdf"],
        )

    # Output row: raw OCR text next to the LLM summary, both read-only.
    with gr.Row():
        extracted_box = gr.Textbox(label="🔍 Extracted Text", lines=10, interactive=False)
        summary_box = gr.Textbox(label="📌 Summarized Topics & Subdomains", lines=10, interactive=False)

    # NOTE(review): the original nesting was lost; the button is assumed to
    # sit below the output row rather than inside it - confirm.
    run_button = gr.Button("🔍 Run Summarizer")
    # The two return values of the handler fill the two output boxes in order.
    run_button.click(
        fn=extract_text_and_summarize,
        inputs=file_input,
        outputs=[extracted_box, summary_box],
    )

# Start the web server; share=True asks Gradio for a public share link.
iface.launch(share=True)