ikraamkb commited on
Commit
551e732
·
verified ·
1 Parent(s): 145f8e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -100
app.py CHANGED
@@ -93,143 +93,160 @@ app = gr.mount_gradio_app(app, demo, path="/")
93
  def redirect_to_interface():
94
  return RedirectResponse(url="/")"""
95
  import gradio as gr
96
- from transformers import pipeline, AutoTokenizer
97
  import fitz # PyMuPDF
98
  import docx
99
  import pptx
100
  import openpyxl
101
  import re
 
102
  from nltk.tokenize import sent_tokenize
 
103
  from fastapi import FastAPI
104
  from fastapi.responses import RedirectResponse
105
- from typing import Optional
106
- import torch
107
 
108
- # CPU-optimized model loading
109
- MODEL_NAME = "facebook/bart-large-cnn" # Good balance of quality and size
110
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
 
111
 
112
- # Use smaller batch sizes and disable GPU
113
- pipe = pipeline(
 
 
 
 
 
 
114
  "summarization",
115
- model=MODEL_NAME,
116
  tokenizer=tokenizer,
117
  device=-1, # Force CPU usage
118
- torch_dtype=torch.float32 # Use 32-bit floats on CPU
119
  )
120
 
121
- # Text processing utilities
122
  def clean_text(text: str) -> str:
123
- """Optimized text cleaning for CPU"""
124
- text = re.sub(r'\s+', ' ', text) # Combine whitespace
125
- text = re.sub(r'•\s*|\d\.\s+', '', text) # Remove bullets and numbers
126
  text = re.sub(r'\[.*?\]|\(.*?\)', '', text) # Remove brackets/parentheses
 
127
  return text.strip()
128
 
129
- def split_into_chunks(text: str, max_chunk_size: int = 768) -> list[str]:
130
- """CPU-efficient text chunking"""
131
- sentences = sent_tokenize(text)
132
- chunks = []
133
- current_chunk = ""
134
-
135
- for sentence in sentences:
136
- if len(current_chunk.split()) + len(sentence.split()) <= max_chunk_size:
137
- current_chunk += " " + sentence
138
- else:
139
- chunks.append(current_chunk.strip())
140
- current_chunk = sentence
141
-
142
- if current_chunk:
143
- chunks.append(current_chunk.strip())
144
-
145
- return chunks
146
-
147
- # Memory-efficient text extraction
148
- def extract_text(file) -> tuple[Optional[str], Optional[str]]:
149
- ext = file.name.split(".")[-1].lower()
150
- path = file.name
151
-
152
  try:
153
- if ext == "pdf":
154
- text = []
155
- with fitz.open(path) as doc:
156
- for page in doc:
157
- text.append(page.get_text("text"))
158
- return clean_text("\n".join(text)), None
159
 
160
- elif ext == "docx":
161
- doc = docx.Document(path)
162
- return clean_text("\n".join(p.text for p in doc.paragraphs)), None
163
 
164
- elif ext == "pptx":
 
165
  text = []
166
- prs = pptx.Presentation(path)
167
  for slide in prs.slides:
168
  for shape in slide.shapes:
169
  if hasattr(shape, "text"):
170
  text.append(shape.text)
171
- return clean_text("\n".join(text)), None
172
 
173
- elif ext == "xlsx":
 
174
  text = []
175
- wb = openpyxl.load_workbook(path, read_only=True)
176
  for sheet in wb.sheetnames:
177
  for row in wb[sheet].iter_rows(values_only=True):
178
  text.append(" ".join(str(cell) for cell in row if cell))
179
- return clean_text("\n".join(text)), None
180
 
181
- return None, "Unsupported file format"
182
-
183
  except Exception as e:
184
- return None, f"Error reading {ext.upper()}: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
- # CPU-optimized summarization
187
- def summarize_document(file, summary_length: str = "medium"):
188
- # CPU-friendly length parameters
189
  length_params = {
190
  "short": {"max_length": 80, "min_length": 30},
191
  "medium": {"max_length": 150, "min_length": 60},
192
  "long": {"max_length": 200, "min_length": 80}
193
  }
194
 
195
- text, error = extract_text(file)
196
- if error:
197
- return error
198
 
199
- if not text or len(text.split()) < 30:
200
- return "Document too short to summarize meaningfully"
201
-
202
- try:
203
- chunks = split_into_chunks(text)
204
- summaries = []
205
-
206
- for chunk in chunks:
207
- summary = pipe(
208
  chunk,
209
- max_length=length_params[summary_length]["max_length"],
210
- min_length=length_params[summary_length]["min_length"],
211
  do_sample=False,
212
  truncation=True,
213
- no_repeat_ngram_size=2, # Reduced from 3 for CPU
214
- num_beams=2, # Reduced from 4 for CPU
215
  early_stopping=True
216
  )
217
  summaries.append(summary[0]['summary_text'])
218
-
219
- # Efficient summary combination
220
- final_summary = " ".join(summaries)
221
- final_summary = ". ".join(s.strip().capitalize()
222
- for s in final_summary.split(". ")
223
- if s.strip())
224
-
225
- return final_summary if len(final_summary) > 25 else "Summary too short - try a longer document"
226
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  except Exception as e:
228
  return f"Summarization error: {str(e)}"
229
 
230
- # Lightweight Gradio interface
231
- with gr.Blocks(title="CPU Document Summarizer", theme="soft") as demo:
232
- gr.Markdown("## 📄 CPU-Optimized Document Summarizer")
 
233
 
234
  with gr.Row():
235
  with gr.Column():
@@ -243,24 +260,22 @@ with gr.Blocks(title="CPU Document Summarizer", theme="soft") as demo:
243
  value="medium",
244
  label="Summary Length"
245
  )
246
- submit_btn = gr.Button("Summarize", variant="primary")
247
-
248
  with gr.Column():
249
- output = gr.Textbox(label="Summary", lines=8)
250
  status = gr.Textbox(label="Status", interactive=False)
251
 
252
- @submit_btn.click(inputs=[file_input, length_radio], outputs=[output, status])
253
- def process(file, length):
254
- if not file:
255
- return "", "Error: No file uploaded"
256
- status = "Processing... (this may take a while on CPU)"
257
- summary = summarize_document(file, length)
258
- return summary, "Done"
259
 
260
- # FastAPI setup
261
- app = FastAPI()
262
  app = gr.mount_gradio_app(app, demo, path="/")
263
 
264
  @app.get("/")
265
- def redirect():
266
- return RedirectResponse(url="/")
 
93
  def redirect_to_interface():
94
  return RedirectResponse(url="/")"""
95
  import gradio as gr
96
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
97
  import fitz # PyMuPDF
98
  import docx
99
  import pptx
100
  import openpyxl
101
  import re
102
+ import nltk
103
  from nltk.tokenize import sent_tokenize
104
+ import torch
105
  from fastapi import FastAPI
106
  from fastapi.responses import RedirectResponse
107
+ import os
 
108
 
109
# Download required NLTK data.
# sent_tokenize() needs the Punkt sentence tokenizer. Older NLTK releases load
# it from the "punkt" resource, newer ones from "punkt_tab", so make sure both
# are available; otherwise tokenization raises LookupError at request time.
for _nltk_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_nltk_resource}")
    except LookupError:
        nltk.download(_nltk_resource)
114
 
115
# FastAPI application that the Gradio UI is mounted onto further down.
app = FastAPI()

# Summarization model: the tokenizer and weights are loaded once at import
# time and shared by every request through the `summarizer` pipeline.
MODEL_NAME = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
summarizer = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float32,
    device=-1,  # Force CPU usage
)
129
 
 
130
def clean_text(text: str) -> str:
    """Clean and normalize document text.

    Removes list numbering at the start of a line, bullet characters,
    bracketed/parenthesised spans and "Page N" markers, then collapses all
    runs of whitespace into single spaces.

    Args:
        text: Raw text extracted from a document.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # Strip list numbering only where it starts a line, and do it before the
    # newlines are collapsed. Matching "<digit>. " anywhere in the text would
    # also eat sentence boundaries such as "... in 2020. The ...".
    text = re.sub(r'^[ \t]*\d{1,3}\.[ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'•\s*', '', text)  # Remove bullets
    # DOTALL so a bracketed span that wraps across lines is still removed.
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text, flags=re.DOTALL)  # Remove brackets/parentheses
    text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)  # Remove page numbers
    # Collapse whitespace last so the gaps left by the removals above vanish.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
137
 
138
def extract_text(file_path: str, file_extension: str) -> tuple[str, str]:
    """Extract text from various document formats.

    Supports "pdf", "docx", "pptx" and "xlsx". The extracted text is passed
    through clean_text() before being returned.

    Args:
        file_path: Path of the document on disk.
        file_extension: Lower-case extension without the dot, e.g. "pdf".

    Returns:
        A ``(text, error)`` pair. ``error`` is an empty string on success;
        on failure ``text`` is empty and ``error`` describes the problem.
        Exceptions raised by the parsers are reported through ``error``
        rather than propagated.
    """
    try:
        if file_extension == "pdf":
            with fitz.open(file_path) as doc:
                return clean_text("\n".join(page.get_text("text") for page in doc)), ""

        elif file_extension == "docx":
            doc = docx.Document(file_path)
            return clean_text("\n".join(p.text for p in doc.paragraphs)), ""

        elif file_extension == "pptx":
            prs = pptx.Presentation(file_path)
            text = [
                shape.text
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text")
            ]
            return clean_text("\n".join(text)), ""

        elif file_extension == "xlsx":
            wb = openpyxl.load_workbook(file_path, read_only=True)
            try:
                text = [
                    # "is not None" keeps legitimate falsy values such as 0.
                    " ".join(str(cell) for cell in row if cell is not None)
                    for sheet in wb.sheetnames
                    for row in wb[sheet].iter_rows(values_only=True)
                ]
            finally:
                # A read-only workbook keeps the file open until closed.
                wb.close()
            return clean_text("\n".join(text)), ""

        return "", "Unsupported file format"
    except Exception as e:
        return "", f"Error reading {file_extension.upper()} file: {str(e)}"
169
+
170
def chunk_text(text: str, max_tokens: int = 768) -> list[str]:
    """Split text into manageable chunks for summarization.

    Sentences are packed greedily into chunks. Size is measured in
    whitespace-separated words, not model tokens. A single sentence longer
    than the limit is cut into windows of ``max_tokens`` words so no chunk
    exceeds the limit, and empty chunks are never produced.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk; must be at least 1.

    Returns:
        The chunks in document order; an empty list for empty input.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    try:
        sentences = sent_tokenize(text)
    except Exception:
        # Fallback if sentence tokenization fails (e.g. tokenizer data is
        # missing): treat every run of 20 words as one "sentence".
        words = text.split()
        sentences = [' '.join(words[i:i + 20]) for i in range(0, len(words), 20)]

    chunks = []
    current_words = []  # words of the chunk being built

    for sentence in sentences:
        sentence_words = sentence.split()
        if not sentence_words:
            continue

        # Close the running chunk if this sentence would overflow it.
        if current_words and len(current_words) + len(sentence_words) > max_tokens:
            chunks.append(" ".join(current_words))
            current_words = []

        # A sentence that is too long on its own is emitted in full windows;
        # whatever remains starts the next chunk.
        while len(sentence_words) > max_tokens:
            chunks.append(" ".join(sentence_words[:max_tokens]))
            sentence_words = sentence_words[max_tokens:]

        current_words.extend(sentence_words)

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks
193
 
194
def _capitalize_first(sentence: str) -> str:
    """Upper-case the first character and leave the rest of the text as is."""
    return sentence[:1].upper() + sentence[1:]


def generate_summary(text: str, length: str = "medium") -> str:
    """Generate summary with appropriate length parameters.

    The text is split with chunk_text(), each chunk is summarized on its
    own, and the partial summaries are joined into one string.

    Args:
        text: Cleaned document text.
        length: One of "short", "medium" or "long".

    Returns:
        The combined summary, or a fixed notice when the result is 25
        characters or shorter. A chunk that fails to summarize contributes a
        "[Chunk error: ...]" marker instead of aborting the whole run.

    Raises:
        ValueError: If ``length`` is not a known summary length.
    """
    length_params = {
        "short": {"max_length": 80, "min_length": 30},
        "medium": {"max_length": 150, "min_length": 60},
        "long": {"max_length": 200, "min_length": 80}
    }

    # Validate once up front; otherwise the bad key would be reported as a
    # separate error marker for every chunk.
    if length not in length_params:
        raise ValueError(
            f"Unknown summary length {length!r}; expected one of {sorted(length_params)}"
        )
    limits = length_params[length]

    summaries = []
    for chunk in chunk_text(text):
        if not chunk.strip():
            # Nothing to summarize; never send empty input to the model.
            continue
        try:
            summary = summarizer(
                chunk,
                max_length=limits["max_length"],
                min_length=limits["min_length"],
                do_sample=False,
                truncation=True,
                no_repeat_ngram_size=2,
                num_beams=2,
                early_stopping=True
            )
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            summaries.append(f"[Chunk error: {str(e)}]")

    # Combine and format the final summary. Only the first letter of each
    # sentence is upper-cased: str.capitalize() would also lower-case the
    # rest of the sentence and mangle acronyms and proper nouns.
    final_summary = " ".join(summaries)
    final_summary = ". ".join(
        _capitalize_first(s.strip()) for s in final_summary.split(". ") if s.strip()
    )
    return final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"
225
+
226
def summarize_document(file, summary_length: str):
    """Main processing function for Gradio interface.

    The click handler is registered with two output components (summary box
    and status box), so this function always returns two values.

    Args:
        file: The uploaded file, or None when nothing was uploaded.
        summary_length: Requested summary length, forwarded to
            generate_summary().

    Returns:
        A ``(message, status)`` tuple. ``message`` is the summary on success
        or a human-readable problem description otherwise; ``status`` is
        "Done" on success and "Error" otherwise.
    """
    if file is None:
        return "Please upload a document first", "Error"

    # NOTE(review): depending on how the File component is configured the
    # upload may arrive as a plain path string instead of an object with a
    # .name attribute; both are accepted here - confirm against the UI setup.
    file_path = file if isinstance(file, str) else file.name
    file_extension = file_path.split(".")[-1].lower()

    text, error = extract_text(file_path, file_extension)
    if error:
        return error, "Error"

    if not text or len(text.split()) < 30:
        return "Document is too short or contains too little text to summarize", "Error"

    try:
        summary = generate_summary(text, summary_length)
    except Exception as e:
        return f"Summarization error: {str(e)}", "Error"
    return summary, "Done"
245
 
246
+ # Gradio Interface
247
+ with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
248
+ gr.Markdown("# 📄 Document Summarizer")
249
+ gr.Markdown("Upload a document to generate a concise summary")
250
 
251
  with gr.Row():
252
  with gr.Column():
 
260
  value="medium",
261
  label="Summary Length"
262
  )
263
+ submit_btn = gr.Button("Generate Summary", variant="primary")
264
+
265
  with gr.Column():
266
+ output = gr.Textbox(label="Summary", lines=10)
267
  status = gr.Textbox(label="Status", interactive=False)
268
 
269
+ submit_btn.click(
270
+ fn=summarize_document,
271
+ inputs=[file_input, length_radio],
272
+ outputs=[output, status],
273
+ api_name="summarize"
274
+ )
 
275
 
276
+ # Mount Gradio app to FastAPI
 
277
  app = gr.mount_gradio_app(app, demo, path="/")
278
 
279
# Handles GET "/" by redirecting the client to "/".
# NOTE(review): the redirect target is the same path this route is registered
# on, and the Gradio app is mounted at "/" just above. Presumably the mount
# answers "/" first and this handler is never reached; if it were reached it
# would redirect to itself. Confirm whether the route is still needed.
@app.get("/")
def redirect_to_interface():
    return RedirectResponse(url="/")