ikraamkb committed on
Commit
130c582
·
verified ·
1 Parent(s): 91bdad5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +174 -1
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import gradio as gr
2
  from transformers import pipeline
3
  import fitz # PyMuPDF
4
  import docx
@@ -91,4 +91,177 @@ app = gr.mount_gradio_app(app, demo, path="/")
91
  # Optional root redirect
92
  @app.get("/")
93
  def redirect_to_interface():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  return RedirectResponse(url="/")
 
 
 
1
+ """import gradio as gr
2
  from transformers import pipeline
3
  import fitz # PyMuPDF
4
  import docx
 
91
  # Optional root redirect
92
  @app.get("/")
93
  def redirect_to_interface():
94
+ return RedirectResponse(url="/")"""
95
+ import gradio as gr
96
+ from transformers import pipeline, AutoTokenizer
97
+ import fitz # PyMuPDF
98
+ import docx
99
+ import pptx
100
+ import openpyxl
101
+ import re
102
+ from nltk.tokenize import sent_tokenize
103
+ from fastapi import FastAPI
104
+ from fastapi.responses import RedirectResponse
105
+ from typing import Optional
106
+ import torch
107
+
108
+ # CPU-optimized model loading
109
# Checkpoint used for both the tokenizer and the summarization pipeline.
MODEL_NAME = "facebook/bart-large-cnn"  # Good balance of quality and size

# Tokenizer is loaded once at import time and shared with the pipeline below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Summarization pipeline pinned to the CPU with full-precision weights.
pipe = pipeline(
    task="summarization",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    torch_dtype=torch.float32,  # Use 32-bit floats on CPU
    device=-1,  # Force CPU usage
)
120
+
121
+ # Text processing utilities
122
def clean_text(text: str) -> str:
    """Normalize raw extracted text before it is chunked and summarized.

    Steps, in order:

    1. Drop list numbering ("1. ", "12. ") that appears at the *start of a
       line*.  This runs before whitespace is collapsed so that a number
       ending a sentence ("... in 2019. The ...") is never mistaken for a
       list marker.
    2. Drop bullet characters ("•") together with the whitespace after them.
    3. Drop bracketed and parenthesised asides, including ones that span
       several lines.
    4. Collapse every run of whitespace to a single space and trim the ends.

    Args:
        text: Raw text, possibly containing newlines and list markup.

    Returns:
        The cleaned, single-line text.
    """
    # Limit to 1-3 digits so a wrapped line starting with a year ("2019.")
    # is not treated as list numbering.
    text = re.sub(r'^\s*\d{1,3}\.\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'•\s*', '', text)  # Remove bullets
    # DOTALL lets an aside span a line break, as it could when whitespace
    # was collapsed first.
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text, flags=re.DOTALL)
    # Collapse last so the removals above cannot leave double spaces behind.
    return re.sub(r'\s+', ' ', text).strip()
128
+
129
def split_into_chunks(text: str, max_chunk_size: int = 768) -> list[str]:
    """Group the sentences of *text* into chunks of bounded word count.

    Sentences are produced by ``sent_tokenize`` and appended to the current
    chunk while the running total of whitespace-separated words stays at or
    below ``max_chunk_size``.  A sentence is never split: one that is longer
    than the limit on its own becomes a chunk by itself.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of whitespace-separated words per
            chunk.  This counts words, not model tokens.

    Returns:
        The chunks in document order.  No chunk is empty; the list is empty
        when ``sent_tokenize`` yields no sentences.
    """
    chunks: list[str] = []
    pending: list[str] = []  # sentences of the chunk being built
    pending_words = 0        # running word count, avoids re-splitting the chunk

    for sentence in sent_tokenize(text):
        sentence_words = len(sentence.split())
        # Flush only when there is something to flush; otherwise an oversized
        # first sentence would emit an empty chunk.
        if pending and pending_words + sentence_words > max_chunk_size:
            chunks.append(" ".join(pending).strip())
            pending = []
            pending_words = 0
        pending.append(sentence)
        pending_words += sentence_words

    if pending:
        chunks.append(" ".join(pending).strip())

    return chunks
146
+
147
+ # Memory-efficient text extraction
148
def extract_text(file) -> tuple[Optional[str], Optional[str]]:
    """Extract and clean the text of an uploaded PDF, DOCX, PPTX or XLSX file.

    Args:
        file: Either a filesystem path given as ``str`` (what the upload
            widget in this file delivers, since it is configured with
            ``type="filepath"``) or an object exposing the path as ``.name``.

    Returns:
        ``(text, None)`` on success, or ``(None, message)`` when the
        extension is not supported or reading the file raised.
    """
    # Accept both a plain path and a file-like wrapper; calling ``.name`` on
    # a str would raise AttributeError before any error handling runs.
    path = file if isinstance(file, str) else file.name
    ext = path.split(".")[-1].lower()

    try:
        if ext == "pdf":
            with fitz.open(path) as doc:
                pages = [page.get_text("text") for page in doc]
            return clean_text("\n".join(pages)), None

        if ext == "docx":
            doc = docx.Document(path)
            return clean_text("\n".join(p.text for p in doc.paragraphs)), None

        if ext == "pptx":
            prs = pptx.Presentation(path)
            shape_texts = [
                shape.text
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text")  # skip shapes without a text attribute
            ]
            return clean_text("\n".join(shape_texts)), None

        if ext == "xlsx":
            rows = []
            wb = openpyxl.load_workbook(path, read_only=True)
            try:
                for sheet in wb.sheetnames:
                    for row in wb[sheet].iter_rows(values_only=True):
                        # Compare against None so legitimate 0 / False cell
                        # values are kept.
                        rows.append(" ".join(str(cell) for cell in row if cell is not None))
            finally:
                # Release the file handle held by the read-only workbook.
                wb.close()
            return clean_text("\n".join(rows)), None

        return None, "Unsupported file format"

    except Exception as e:
        # Best-effort boundary: report the failure as a message for the UI.
        return None, f"Error reading {ext.upper()}: {str(e)}"
185
+
186
+ # CPU-optimized summarization
187
def summarize_document(file, summary_length: str = "medium"):
    """Summarize an uploaded document and return the summary as text.

    The document text is obtained with ``extract_text``, split with
    ``split_into_chunks``, each chunk is summarized with ``pipe``, and the
    partial summaries are joined into one string.

    Args:
        file: The uploaded file, passed through to ``extract_text``.
        summary_length: ``"short"``, ``"medium"`` or ``"long"``.  Any other
            value falls back to the ``"medium"`` settings.

    Returns:
        The summary, or a human-readable message when extraction failed, the
        document has fewer than 30 words, the summary is 25 characters or
        shorter, or summarization raised.
    """
    # CPU-friendly length parameters
    length_params = {
        "short": {"max_length": 80, "min_length": 30},
        "medium": {"max_length": 150, "min_length": 60},
        "long": {"max_length": 200, "min_length": 80}
    }
    # Fall back to the default instead of surfacing a KeyError message.
    limits = length_params.get(summary_length, length_params["medium"])

    text, error = extract_text(file)
    if error:
        return error

    if not text or len(text.split()) < 30:
        return "Document too short to summarize meaningfully"

    try:
        summaries = []
        for chunk in split_into_chunks(text):
            if not chunk:
                # Nothing to summarize in an empty chunk.
                continue
            summary = pipe(
                chunk,
                max_length=limits["max_length"],
                min_length=limits["min_length"],
                do_sample=False,
                truncation=True,
                no_repeat_ngram_size=2,  # Reduced from 3 for CPU
                num_beams=2,  # Reduced from 4 for CPU
                early_stopping=True
            )
            summaries.append(summary[0]['summary_text'])

        # Upper-case only the first character of each sentence.
        # str.capitalize() would also lower-case the rest and destroy
        # acronyms and proper nouns ("IBM" -> "ibm").
        pieces = (s.strip() for s in " ".join(summaries).split(". "))
        final_summary = ". ".join(s[0].upper() + s[1:] for s in pieces if s)

        return final_summary if len(final_summary) > 25 else "Summary too short - try a longer document"

    except Exception as e:
        # Best-effort boundary: the UI shows this string in the summary box.
        return f"Summarization error: {str(e)}"
229
+
230
+ # Lightweight Gradio interface
231
# Two-column layout: upload and options on the left, results on the right.
with gr.Blocks(title="CPU Document Summarizer", theme="soft") as demo:
    gr.Markdown("## 📄 CPU-Optimized Document Summarizer")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".pptx", ".xlsx"],
                type="filepath"
            )
            length_radio = gr.Radio(
                ["short", "medium", "long"],
                value="medium",
                label="Summary Length"
            )
            submit_btn = gr.Button("Summarize", variant="primary")

        with gr.Column():
            output = gr.Textbox(label="Summary", lines=8)
            status = gr.Textbox(label="Status", interactive=False)

    @submit_btn.click(inputs=[file_input, length_radio], outputs=[output, status])
    def process(file, length):
        """Handle a click on "Summarize".

        Returns a ``(summary, status)`` pair matching the ``output`` and
        ``status`` textboxes wired up in the decorator.
        """
        if not file:
            return "", "Error: No file uploaded"
        # The former local ``status = "Processing..."`` was removed: it was
        # never returned, so it was never shown, and it shadowed the
        # ``status`` textbox defined above.
        summary = summarize_document(file, length)
        return summary, "Done"
259
+
260
+ # FastAPI setup
261
app = FastAPI()

# The Gradio UI is served directly at "/".  The former GET "/" handler
# redirected to "/" - i.e. to itself - and, being registered before the
# mount, answered every request for the root path with an endless redirect.
# It has been dropped so that requests reach the mounted interface.
app = gr.mount_gradio_app(app, demo, path="/")