Spaces:
Sleeping
Sleeping
Delete app.py
Browse files
app.py
DELETED
@@ -1,146 +0,0 @@
|
|
1 |
-
# -*- coding: utf-8 -*-
|
2 |
-
"""Paper_News_Gradio_App.ipynb
|
3 |
-
|
4 |
-
Automatically generated by Colab.
|
5 |
-
|
6 |
-
Original file is located at
|
7 |
-
https://colab.research.google.com/drive/1_JHJRpT4KWmECR-ep25CGZ0eM55Bm7TK
|
8 |
-
"""
|
9 |
-
|
10 |
-
# Notebook-only shell escapes (`!pip ...`) are not valid Python in a .py
# module. Install the dependencies outside this file instead, for example
# through a requirements.txt:
#   pip install -q gradio PyMuPDF tiktoken openai
|
11 |
-
|
12 |
-
import gradio as gr
|
13 |
-
import fitz # PyMuPDF
|
14 |
-
import re
|
15 |
-
import tiktoken
|
16 |
-
import time
|
17 |
-
from openai import OpenAI
|
18 |
-
import os
|
19 |
-
|
20 |
-
# The OpenAI client picks up the API key from the OPENAI_API_KEY environment
# variable. Never hard-code a key in source control: supply it through the
# deployment's secret/environment settings. Any key that was ever committed
# to this file must be treated as compromised and revoked.
client = OpenAI()
|
23 |
-
|
24 |
-
# --- Step 3: Extract and clean PDF text ---
|
25 |
-
def extract_text_from_pdf(pdf_file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file_path: Value handed to ``fitz.open`` to open the document.

    Returns:
        One string made of each page's ``get_text()`` output, with nothing
        inserted between pages.
    """
    with fitz.open(pdf_file_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)
|
31 |
-
|
32 |
-
|
33 |
-
def clean_text(text):
    """Normalize text to single-spaced printable ASCII.

    Steps, in order:
      1. Collapse every run of whitespace (including newlines and tabs)
         into one space, so line breaks become word separators rather than
         being deleted by the next step.
      2. Drop every character outside the printable ASCII range
         (0x20-0x7E).
      3. Collapse the double spaces that step 2 leaves behind when a
         removed character sat between two spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with leading and trailing spaces removed.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x20-\x7E]+', '', text)
    # Removing a standalone non-ASCII token ("a é b") would otherwise leave
    # two adjacent spaces in the result.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()
|
37 |
-
|
38 |
-
def split_into_chunks(text, max_tokens=1000):
    """Split text into space-joined chunks of roughly ``max_tokens`` tokens.

    The text is split on whitespace and words are packed greedily: a chunk
    is closed as soon as adding the next word would push its token count
    above ``max_tokens``. Token counts are measured per word with the
    ``cl100k_base`` encoding, so the count for a joined chunk is an
    approximation.

    A single word that alone exceeds ``max_tokens`` is kept whole and ends
    up in a chunk of its own that is larger than the limit.

    Args:
        text: The text to split.
        max_tokens: Target upper bound on tokens per chunk.

    Returns:
        A list of non-empty chunk strings; empty when ``text`` contains no
        words.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    chunks, current_chunk, current_tokens = [], [], 0

    for word in text.split():
        # Document text is untrusted: treat strings that look like special
        # tokens (e.g. "<|endoftext|>") as ordinary text instead of raising.
        tokens = len(encoding.encode(word, disallowed_special=()))
        # Only flush a chunk that actually has content; otherwise an
        # oversized first word would emit an empty "" chunk.
        if current_chunk and current_tokens + tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_tokens = [word], tokens
        else:
            current_chunk.append(word)
            current_tokens += tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
|
56 |
-
|
57 |
-
def summarize_chunk(chunk):
    """Ask the chat model for a summary of one text chunk.

    Args:
        chunk: The text to summarize.

    Returns:
        The content of the first choice in the model's response. If
        anything raises while building or sending the request, the
        exception is not propagated; the string ``"Error: <exception>"``
        is returned instead.
    """
    try:
        conversation = [
            {"role": "system", "content": "You are a helpful assistant that summarizes documents."},
            {"role": "user", "content": f"Summarize the following text:\n\n{chunk}"},
        ]
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=conversation,
            temperature=0.3,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as exc:
        return f"Error: {exc}"
|
70 |
-
|
71 |
-
def generate_special_summaries(summary_text):
    """Request ELI5, "Why It Matters" and TL;DR write-ups for a summary.

    A single chat completion is asked to produce all three parts. The
    prompt asks for each part to start with its number and name followed
    by a colon and for the parts to be separated by blank lines, so the
    reply can be split back into sections by the caller.

    Args:
        summary_text: The text the three write-ups are derived from.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string if the response carries no content.

    Raises:
        Whatever ``client.chat.completions.create`` raises; nothing is
        caught here.
    """
    prompt = (
        "\n"
        "From the text below, generate the following:\n"
        "1. ELI5 (Explain Like I'm 5)\n"
        "2. Why It Matters\n"
        "3. TL;DR (One-line summary)\n"
        "\n"
        "Start each part with its number and name followed by a colon "
        '(exactly "1. ELI5:", "2. Why It Matters:", "3. TL;DR:") and '
        "separate the parts with a blank line.\n"
        "\n"
        "Text:\n"
        '"""\n'
        f"{summary_text}\n"
        '"""\n'
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert summarizer."},
            {"role": "user", "content": prompt},
        ],
    )

    # The content field may be None; normalize to a string before stripping.
    return (response.choices[0].message.content or "").strip()
|
95 |
-
|
96 |
-
# Matches a section heading at the start of a line, tolerating an optional
# list number, Markdown emphasis, a parenthetical and a trailing colon/dash,
# e.g. "1. ELI5:", "**2. Why It Matters**:" or "3. TL;DR (One-line summary):".
_SPECIAL_HEADING = re.compile(
    r"^[ \t]*[*#_]*[ \t]*(?:\d+[.)][ \t]*)?[*_]*"
    r"(eli5|why it matters|tl;?dr)\b"
    r"(?:[ \t]*\([^)\n]*\))?[*_ \t]*[:\-]?[*_ \t]*",
    re.IGNORECASE | re.MULTILINE,
)


def _parse_special_sections(special):
    """Split the combined model reply into ``(eli5, why_matters, tldr)``.

    Each section's body runs from the end of its heading to the start of
    the next recognized heading (or the end of the reply). Sections that
    are not found are returned as empty strings; when a heading occurs
    more than once, the first non-empty body wins.
    """
    found = {"eli5": "", "why it matters": "", "tldr": ""}
    headings = list(_SPECIAL_HEADING.finditer(special))
    for position, heading in enumerate(headings):
        if position + 1 < len(headings):
            body_end = headings[position + 1].start()
        else:
            body_end = len(special)
        # "tl;dr" and "tldr" share one slot.
        key = heading.group(1).lower().replace(";", "")
        body = special[heading.end():body_end].strip()
        if body and not found[key]:
            found[key] = body
    return found["eli5"], found["why it matters"], found["tldr"]


def process_pdf(pdf_file):
    """Summarize an uploaded PDF and derive ELI5 / Why It Matters / TL;DR.

    Pipeline: extract the PDF text, clean it, split it into chunks,
    summarize each chunk (pausing 1.5 s after every request), join the
    chunk summaries with blank lines, then ask for the three special
    write-ups and split that reply into its sections.

    Args:
        pdf_file: The uploaded file, either a path string or an object
            exposing the path as ``.name``.

    Returns:
        A 4-tuple ``(full_summary, eli5, why_matters, tldr)``. Sections
        that cannot be located in the model's reply are empty strings. On
        any failure, including a missing upload or a PDF with no
        extractable text, all four items are the same error message.
    """
    try:
        if pdf_file is None:
            raise ValueError("no PDF file was uploaded")

        # NOTE(review): the upload component may hand over a plain path or
        # a temp-file wrapper depending on its version/config; accept both.
        pdf_path = getattr(pdf_file, "name", pdf_file)

        raw_text = extract_text_from_pdf(pdf_path)
        cleaned_text = clean_text(raw_text)
        chunks = split_into_chunks(cleaned_text)
        if not chunks:
            # Avoid asking the model to summarize nothing.
            raise ValueError("no extractable text was found in the PDF")

        summaries = []
        for chunk in chunks:
            summaries.append(summarize_chunk(chunk))
            # Fixed pause between API requests as crude pacing.
            time.sleep(1.5)

        full_summary = "\n\n".join(summaries)
        special = generate_special_summaries(full_summary)
        eli5, why_matters, tldr = _parse_special_sections(special)

        return full_summary, eli5, why_matters, tldr

    except Exception as e:
        # UI boundary: surface the failure in every output box.
        error_msg = f"❌ Error: {str(e)}"
        return error_msg, error_msg, error_msg, error_msg
|
126 |
-
|
127 |
-
# Gradio UI: one upload control with Submit/Clear buttons and four read-out
# boxes fed by process_pdf.
# NOTE(review): the emoji in these labels were mis-encoded in the file. The
# ELI5, Why It Matters and TL;DR icons were recovered from the surviving
# bytes; the title, upload and full-summary icons could not be recovered and
# are plausible substitutes — confirm against the intended design.
with gr.Blocks() as demo:
    gr.Markdown("### 📄 Paper News Summarizer")
    gr.Markdown("Upload a research paper PDF and get a human-friendly summary, ELI5, and TL;DR. Powered by GPT-3.5.")

    with gr.Row():
        pdf_input = gr.File(label="📎 Upload Research Paper (PDF)", file_types=[".pdf"])
        submit_btn = gr.Button("Submit", variant="primary")
        clear_btn = gr.Button("Clear")

    summary_output = gr.Textbox(label="📝 Full Summary", lines=10)
    eli5_output = gr.Textbox(label="🧠 ELI5", lines=3)
    why_output = gr.Textbox(label="🎯 Why It Matters", lines=3)
    tldr_output = gr.Textbox(label="⚡ TL;DR", lines=2)

    # Event handlers must be registered inside the Blocks context.
    submit_btn.click(fn=process_pdf, inputs=pdf_input,
                     outputs=[summary_output, eli5_output, why_output, tldr_output])

    # Clear resets the four output boxes only; the uploaded file is kept.
    clear_btn.click(lambda: ("", "", "", ""), outputs=[summary_output, eli5_output, why_output, tldr_output])

demo.launch(debug=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|