Shuja1401 committed on
Commit
e1e7abc
Β·
verified Β·
1 Parent(s): f40dac9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -0
app.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Paper_News_Gradio_App.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1_JHJRpT4KWmECR-ep25CGZ0eM55Bm7TK
8
+ """
9
+
10
# Dependencies: gradio, PyMuPDF, tiktoken, openai.
# Fix: the original kept a Colab shell-magic line (`!pip install ...`) here,
# which is a SyntaxError in a plain .py file — dependencies belong in
# requirements.txt for a Space, not in the module.

import os
import re
import time

import fitz  # PyMuPDF
import gradio as gr
import tiktoken
from openai import OpenAI

# SECURITY fix: the original hard-coded a live OpenAI API key into this file.
# A key committed to a public repo must be treated as compromised — revoke it
# and supply a fresh one via the OPENAI_API_KEY environment variable
# (e.g. a Hugging Face Space secret). The OpenAI client reads
# OPENAI_API_KEY from the environment automatically.
client = OpenAI()
23
+
24
# --- Step 3: Extract and clean PDF text ---
def extract_text_from_pdf(pdf_file_path):
    """Return the plain text of every page of the PDF at *pdf_file_path*, concatenated in order."""
    pages = []
    with fitz.open(pdf_file_path) as doc:
        for page in doc:
            pages.append(page.get_text())
    return "".join(pages)
31
+
32
+
33
def clean_text(text):
    """Normalize extracted PDF text.

    Collapses every whitespace run to a single space, strips all characters
    outside printable ASCII (0x20-0x7E), and trims leading/trailing blanks.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    printable = re.sub(r'[^\x20-\x7E]+', '', collapsed)
    return printable.strip()
37
+
38
def split_into_chunks(text, max_tokens=1000):
    """Greedily pack whitespace-separated words into chunks of at most *max_tokens* tokens.

    Token counts use the ``cl100k_base`` encoding (the GPT-3.5 tokenizer).
    Each chunk is returned as a single space-joined string.
    """
    enc = tiktoken.get_encoding("cl100k_base")
    chunks = []
    current, used = [], 0

    for word in text.split():
        cost = len(enc.encode(word))
        # Flush the running chunk once adding this word would exceed the budget.
        if used + cost > max_tokens:
            chunks.append(" ".join(current))
            current, used = [word], cost
        else:
            current.append(word)
            used += cost

    if current:
        chunks.append(" ".join(current))

    return chunks
56
+
57
def summarize_chunk(chunk):
    """Summarize *chunk* with GPT-3.5; on any API failure return an ``Error: ...`` string instead of raising."""
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that summarizes documents."},
        {"role": "user", "content": f"Summarize the following text:\n\n{chunk}"},
    ]
    try:
        reply = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=conversation,
            temperature=0.3,
        )
        return reply.choices[0].message.content
    except Exception as e:
        # Best-effort: surface the failure as text so the pipeline keeps going.
        return f"Error: {e}"
70
+
71
def generate_special_summaries(summary_text):
    """Ask GPT-3.5 for three alternate views of *summary_text* in one call.

    The model is prompted to produce an ELI5 explanation, a "Why It Matters"
    note, and a one-line TL;DR; the raw (stripped) reply is returned and the
    caller is responsible for splitting it into sections.
    """
    prompt = f"""
    From the text below, generate the following:
    1. ELI5 (Explain Like I’m 5)
    2. Why It Matters
    3. TL;DR (One-line summary)

    Text:
    \"\"\"
    {summary_text}
    \"\"\"
    """
    reply = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert summarizer."},
            {"role": "user", "content": prompt},
        ],
    )
    return reply.choices[0].message.content.strip()
95
+
96
def _parse_special_sections(special):
    """Split the combined special-summary reply into (eli5, why_matters, tldr).

    Relies on the model numbering its sections ("1. ELI5", "2. Why ...",
    "3. TL;DR") separated by blank lines; any section that cannot be matched
    comes back as an empty string.
    """
    eli5, why_matters, tldr = "", "", ""
    for section in special.split("\n\n"):
        lowered = section.lower()
        if lowered.startswith("1. eli5"):
            eli5 = section.replace("1. ELI5:", "").strip()
        elif lowered.startswith("2. why"):
            why_matters = section.replace("2. Why It Matters:", "").strip()
        elif lowered.startswith("3. tl;dr") or lowered.startswith("3. tldr"):
            tldr = section.replace("3. TL;DR:", "").replace("3. Tldr:", "").strip()
    return eli5, why_matters, tldr


def process_pdf(pdf_file):
    """End-to-end pipeline: PDF -> cleaned text -> per-chunk summaries -> special summaries.

    Returns a 4-tuple ``(full_summary, eli5, why_matters, tldr)``. On any
    failure the same error message is returned in all four slots so every
    output box in the UI displays it.
    """
    try:
        raw_text = extract_text_from_pdf(pdf_file)
        cleaned_text = clean_text(raw_text)
        chunks = split_into_chunks(cleaned_text)

        summaries = []
        for i, chunk in enumerate(chunks):
            # Crude rate limiting between API calls. Fix: the original also
            # slept 1.5 s *after* the final chunk, delaying the response for
            # no benefit — pause only between requests.
            if i:
                time.sleep(1.5)
            summaries.append(summarize_chunk(chunk))

        full_summary = "\n\n".join(summaries)
        special = generate_special_summaries(full_summary)
        eli5, why_matters, tldr = _parse_special_sections(special)
        return full_summary, eli5, why_matters, tldr

    except Exception as e:
        error_msg = f"❌ Error: {str(e)}"
        return error_msg, error_msg, error_msg, error_msg
126
+
127
# --- Gradio UI: upload a PDF, show the four summary views ---
with gr.Blocks() as demo:
    gr.Markdown("### 📚 Paper News Summarizer")
    gr.Markdown("Upload a research paper PDF and get a human-friendly summary, ELI5, and TL;DR. Powered by GPT-3.5.")

    with gr.Row():
        pdf_input = gr.File(label="📄 Upload Research Paper (PDF)", file_types=[".pdf"])
        submit_btn = gr.Button("Submit", variant="primary")
        clear_btn = gr.Button("Clear")

    # The four output boxes, in the same order process_pdf returns its tuple.
    result_boxes = [
        gr.Textbox(label="📘 Full Summary", lines=10),
        gr.Textbox(label="🧒 ELI5", lines=3),
        gr.Textbox(label="🎯 Why It Matters", lines=3),
        gr.Textbox(label="⚡ TL;DR", lines=2),
    ]

    submit_btn.click(fn=process_pdf, inputs=pdf_input, outputs=result_boxes)
    clear_btn.click(lambda: ("", "", "", ""), outputs=result_boxes)

demo.launch(debug=True)