omvishesh commited on
Commit
9e7576b
·
verified ·
1 Parent(s): eca5dfd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -79
app.py CHANGED
@@ -1,80 +1,80 @@
1
- import gradio as gr
2
- import easyocr
3
- import cv2
4
- import numpy as np
5
- from PIL import Image
6
- import pdf2image
7
- import tempfile
8
- from langchain_groq import ChatGroq
9
- import re
10
-
11
- # Initialize OCR reader
12
- reader = easyocr.Reader(['en'])
13
-
14
- # Initialize LLM
15
- llm = ChatGroq(
16
- temperature=0,
17
- groq_api_key="<REDACTED>",
18
- model_name="llama-3.3-70b-versatile"
19
- )
20
-
21
- # Utility to clean up unwanted characters
22
- def clean_text(text):
23
- text = re.sub(r"[*‒●β–ͺ️✦➑️~]+", "", text) # remove bullet points and asterisks
24
- text = re.sub(r"\s{2,}", " ", text) # remove excessive spacing
25
- return text.strip()
26
-
27
- def extract_text_and_summarize(file):
28
- file_path = file.name
29
-
30
- # If it's a PDF, convert to image
31
- if file_path.lower().endswith(".pdf"):
32
- images = pdf2image.convert_from_path(file_path)
33
- image = np.array(images[0])
34
- else:
35
- image = cv2.imread(file_path)
36
-
37
- # OCR
38
- results = reader.readtext(image)
39
- extracted_text = ' '.join([text[1] for text in results])
40
- extracted_text = clean_text(extracted_text)
41
-
42
- if not extracted_text.strip():
43
- return "No readable text found.", ""
44
-
45
- # LLM summarization
46
- messages = [
47
- {"role": "system", "content": "Your job is to summarize the given research paper and list its key sub-domains and topics clearly."},
48
- {"role": "user", "content": extracted_text}
49
- ]
50
- result = llm.invoke(messages)
51
-
52
- summarized_text = clean_text(result.content)
53
- return extracted_text, summarized_text
54
-
55
- # Build Gradio UI
56
- with gr.Blocks(title="Research Paper Summarizer") as iface:
57
- gr.Markdown(
58
- """
59
- # 🧠 Research Paper Summarizer
60
- Upload an **image** or **PDF** of a research paper. This app will:
61
- - Extract text using OCR
62
- - Summarize the content
63
- - List key subdomains and research topics
64
-
65
- ⚑ Powered by EasyOCR & LLaMA-3 via Groq
66
- """
67
- )
68
-
69
- with gr.Row():
70
- file_input = gr.File(label="πŸ“„ Upload Research Paper (Image or PDF)", file_types=[".png", ".jpg", ".jpeg", ".pdf"])
71
-
72
- with gr.Row():
73
- extracted_box = gr.Textbox(label="πŸ” Extracted Text", lines=10, interactive=False)
74
- summary_box = gr.Textbox(label="πŸ“Œ Summarized Topics & Subdomains", lines=10, interactive=False)
75
-
76
- run_button = gr.Button("πŸ” Run Summarizer")
77
-
78
- run_button.click(fn=extract_text_and_summarize, inputs=file_input, outputs=[extracted_box, summary_box])
79
-
80
  iface.launch(share=True)
 
1
+ import gradio as gr
2
+ import easyocr
3
+ import cv2
4
+ import numpy as np
5
+ from PIL import Image
6
+ import pdf2image
7
+ import tempfile
8
+ from langchain_groq import ChatGroq
9
+ import re
10
+
11
import os

# Initialize OCR reader
# A single English-language reader is created at import time and shared by
# every request handled by the app.
reader = easyocr.Reader(['en'])

# Initialize LLM
# The API key must never be committed to the repository; it is read from the
# GROQ_API_KEY environment variable (e.g. a Space secret). Failing early with
# a clear message is preferable to an opaque error on the first request.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; define it as an environment variable "
        "(or Space secret) before starting the app."
    )

llm = ChatGroq(
    temperature=0,  # deterministic output for summarisation
    groq_api_key=groq_api_key,
    model_name="llama-3.3-70b-versatile",
)
20
+
21
# Utility to clean up unwanted characters
# Decorative glyphs to strip: '*', bullet (U+2022), black circle (U+25CF),
# small black square (U+25AA), emoji variation selector (U+FE0F),
# four-pointed star (U+2726), right arrow (U+27A1) and '~'.
# Written as code-point escapes so the pattern cannot be corrupted by a
# re-encoding of the source file.
_DECORATION_RE = re.compile("[*\u2022\u25cf\u25aa\ufe0f\u2726\u27a1~]+")

# Two or more consecutive whitespace characters (spaces, tabs, newlines).
_MULTISPACE_RE = re.compile(r"\s{2,}")


def clean_text(text):
    """Return *text* without decorative bullet glyphs and excess whitespace.

    Args:
        text: The string to clean (OCR output or an LLM response).

    Returns:
        The input with every decorative glyph removed, each run of two or
        more whitespace characters collapsed to a single space, and leading
        and trailing whitespace stripped.
    """
    # Glyphs are removed first so the gaps they leave behind are collapsed
    # by the whitespace pass that follows.
    text = _DECORATION_RE.sub("", text)
    text = _MULTISPACE_RE.sub(" ", text)
    return text.strip()
26
+
27
def extract_text_and_summarize(file):
    """OCR an uploaded image or PDF and summarise the recognised text.

    Args:
        file: The value delivered by the upload component: either a
            filesystem path string or an object exposing the path as
            ``.name``. ``None`` when nothing has been uploaded.

    Returns:
        A ``(extracted_text, summary)`` tuple of strings. When the input
        cannot be processed, the first element is a short explanatory
        message and the second is an empty string.
    """
    if file is None:
        return "Please upload an image or PDF file first.", ""

    # Accept both a plain path and a file-like wrapper carrying `.name`.
    file_path = getattr(file, "name", file)

    # If it's a PDF, convert to image
    if file_path.lower().endswith(".pdf"):
        # Only the first page is OCR'd, so rasterise just that page instead
        # of the whole document.
        pages = pdf2image.convert_from_path(file_path, first_page=1, last_page=1)
        if not pages:
            return "No readable text found.", ""
        image = np.array(pages[0])
    else:
        image = cv2.imread(file_path)
        # cv2.imread signals an unreadable/unsupported file by returning None
        # rather than raising.
        if image is None:
            return "Could not read the uploaded image.", ""

    # OCR: each result is (bounding_box, text, confidence); keep the text.
    results = reader.readtext(image)
    extracted_text = clean_text(" ".join(entry[1] for entry in results))

    if not extracted_text:
        return "No readable text found.", ""

    # LLM summarization
    messages = [
        {"role": "system", "content": "Your job is to summarize the given research paper and list its key sub-domains and topics clearly."},
        {"role": "user", "content": extracted_text},
    ]
    result = llm.invoke(messages)

    summarized_text = clean_text(result.content)
    return extracted_text, summarized_text
54
+
55
# Build Gradio UI
# Layout: an introductory Markdown panel, a file-upload row, a row with the
# two read-only result boxes, and a button that runs the pipeline.
with gr.Blocks(title="Research Paper Summarizer") as iface:
    gr.Markdown(
        """
        # 🧠 Research Paper Summarizer
        Upload an **image** or **PDF** of a research paper. This app will:
        - Extract text using OCR
        - Summarize the content
        - List key subdomains and research topics

        ⚡ Powered by EasyOCR & LLaMA-3 via Groq
        """
    )

    with gr.Row():
        file_input = gr.File(
            label="📄 Upload Research Paper (Image or PDF)",
            file_types=[".png", ".jpg", ".jpeg", ".pdf"],
        )

    with gr.Row():
        extracted_box = gr.Textbox(label="🔍 Extracted Text", lines=10, interactive=False)
        summary_box = gr.Textbox(label="📌 Summarized Topics & Subdomains", lines=10, interactive=False)

    run_button = gr.Button("🔍 Run Summarizer")

    # The handler returns (extracted_text, summary), mapped onto the two
    # output boxes in that order.
    run_button.click(
        fn=extract_text_and_summarize,
        inputs=file_input,
        outputs=[extracted_box, summary_box],
    )

iface.launch(share=True)