omvishesh commited on
Commit
7e76a37
·
verified ·
1 Parent(s): 9d61546

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +80 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
import tempfile

import cv2
import easyocr
import gradio as gr
import numpy as np
import pdf2image
from langchain_groq import ChatGroq
from PIL import Image
10
+
11
# Initialize the OCR reader once at import time (English only); the single
# instance is shared by every request handled by this module.
reader = easyocr.Reader(['en'])

# Read the Groq credential from the environment instead of embedding it in
# source control. Any key that was ever committed to the repository must be
# treated as leaked: revoke it and issue a new one.
_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it (or configure it as a secret) "
        "before starting the app."
    )

# Initialize LLM
llm = ChatGroq(
    temperature=0,
    groq_api_key=_groq_api_key,
    model_name="llama-3.3-70b-versatile",
)
20
+
21
# Decorative characters removed from OCR / LLM text: asterisk, bullet (U+2022),
# black circle (U+25CF), black small square (U+25AA), variation selector-16
# (U+FE0F), black four-pointed star (U+2726), black rightwards arrow (U+27A1)
# and tilde. Written as escapes so the pattern cannot be corrupted by an
# encoding round-trip of this file.
_DECORATION_RE = re.compile("[*\u2022\u25cf\u25aa\ufe0f\u2726\u27a1~]+")
# Two or more consecutive whitespace characters (single ones are left alone).
_WHITESPACE_RUN_RE = re.compile(r"\s{2,}")


def clean_text(text):
    """Strip bullet/markdown decoration and collapse repeated whitespace.

    Args:
        text: The raw string to tidy.

    Returns:
        ``text`` with every decorative character removed, each run of two or
        more whitespace characters replaced by a single space, and leading and
        trailing whitespace trimmed.
    """
    without_decoration = _DECORATION_RE.sub("", text)
    collapsed = _WHITESPACE_RUN_RE.sub(" ", without_decoration)
    return collapsed.strip()
26
+
27
def _load_page_images(file_path, max_pages):
    """Return the images to OCR for an image file or the leading PDF pages."""
    if file_path.lower().endswith(".pdf"):
        # Rasterize only the pages that will actually be read instead of
        # converting the whole document and discarding most of it.
        pages = pdf2image.convert_from_path(
            file_path, first_page=1, last_page=max_pages
        )
        return [np.array(page) for page in pages]

    image = cv2.imread(file_path)
    # cv2.imread reports an unreadable/unsupported file by returning None.
    return [] if image is None else [image]


def extract_text_and_summarize(file, max_pages=1):
    """OCR an uploaded image/PDF and ask the LLM to summarize the text.

    Args:
        file: The uploaded file. Either an object exposing the path through a
            ``name`` attribute or a plain path string; ``None`` when nothing
            was uploaded.
        max_pages: Number of leading PDF pages to OCR. Defaults to 1, which
            matches the previous first-page-only behaviour. Ignored for
            image files.

    Returns:
        A ``(extracted_text, summary)`` tuple. When the input is missing,
        unreadable, or contains no text, the first element is a short
        explanatory message and the second is an empty string.
    """
    if file is None:
        return "No file uploaded.", ""

    file_path = str(getattr(file, "name", file))
    images = _load_page_images(file_path, max(1, int(max_pages)))
    if not images:
        return "Could not read the uploaded file.", ""

    # OCR: each readtext() result entry carries the recognized string at
    # index 1; join all of them into one blob.
    fragments = []
    for image in images:
        fragments.extend(entry[1] for entry in reader.readtext(image))
    extracted_text = clean_text(' '.join(fragments))

    if not extracted_text.strip():
        return "No readable text found.", ""

    # LLM summarization
    messages = [
        {"role": "system", "content": "Your job is to summarize the given research paper and list its key sub-domains and topics clearly."},
        {"role": "user", "content": extracted_text},
    ]
    result = llm.invoke(messages)

    summarized_text = clean_text(result.content)
    return extracted_text, summarized_text
54
+
55
# Build Gradio UI: one upload widget, two read-only result boxes, and a button
# that feeds the upload to extract_text_and_summarize.
with gr.Blocks(title="Research Paper Summarizer") as iface:
    gr.Markdown(
        """
        # 🧠 Research Paper Summarizer
        Upload an **image** or **PDF** of a research paper. This app will:
        - Extract text using OCR
        - Summarize the content
        - List key subdomains and research topics

        ⚡ Powered by EasyOCR & LLaMA-3 via Groq
        """
    )

    with gr.Row():
        file_input = gr.File(
            label="📄 Upload Research Paper (Image or PDF)",
            file_types=[".png", ".jpg", ".jpeg", ".pdf"],
        )

    with gr.Row():
        extracted_box = gr.Textbox(label="🔍 Extracted Text", lines=10, interactive=False)
        summary_box = gr.Textbox(label="📌 Summarized Topics & Subdomains", lines=10, interactive=False)

    # NOTE(review): the emoji on the "Extracted Text" label and on this button
    # was unrecoverable from the mis-encoded source; a magnifying glass is
    # assumed for both — confirm.
    run_button = gr.Button("🔍 Run Summarizer")

    # The handler returns (extracted_text, summary), mapped onto the two boxes
    # in that order.
    run_button.click(fn=extract_text_and_summarize, inputs=file_input, outputs=[extracted_box, summary_box])

iface.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ easyocr
3
+ opencv-python-headless
4
+ numpy
5
+ Pillow
6
+ pdf2image
7
+ # tempfile is part of the Python standard library; no install needed
8
+ langchain_groq
9
+ # re is part of the Python standard library; no install needed