ikraamkb committed on
Commit
9b32604
·
verified ·
1 Parent(s): 40fa0d9

add the readers

Browse files
Files changed (1) hide show
  1. app.py +68 -14
app.py CHANGED
@@ -103,7 +103,11 @@ import nltk
103
  from nltk.tokenize import sent_tokenize
104
  import torch
105
  from fastapi import FastAPI
106
- from fastapi.responses import RedirectResponse
 
 
 
 
107
 
108
  # Download required NLTK data
109
  nltk.download('punkt', quiet=True)
@@ -111,7 +115,7 @@ nltk.download('punkt', quiet=True)
111
  # Initialize components
112
  app = FastAPI()
113
 
114
- # Load summarization model (CPU optimized)
115
  MODEL_NAME = "facebook/bart-large-cnn"
116
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
117
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
@@ -123,6 +127,9 @@ summarizer = pipeline(
123
  torch_dtype=torch.float32
124
  )
125
 
 
 
 
126
  def clean_text(text: str) -> str:
127
  """Clean and normalize document text"""
128
  text = re.sub(r'\s+', ' ', text) # Normalize whitespace
@@ -136,7 +143,16 @@ def extract_text(file_path: str, file_extension: str) -> tuple[str, str]:
136
  try:
137
  if file_extension == "pdf":
138
  with fitz.open(file_path) as doc:
139
- return clean_text("\n".join(page.get_text("text") for page in doc)), ""
 
 
 
 
 
 
 
 
 
140
 
141
  elif file_extension == "docx":
142
  doc = docx.Document(file_path)
@@ -159,6 +175,10 @@ def extract_text(file_path: str, file_extension: str) -> tuple[str, str]:
159
  text.append(" ".join(str(cell) for cell in row if cell))
160
  return clean_text("\n".join(text)), ""
161
 
 
 
 
 
162
  return "", "Unsupported file format"
163
  except Exception as e:
164
  return "", f"Error reading {file_extension.upper()} file: {str(e)}"
@@ -219,37 +239,49 @@ def generate_summary(text: str, length: str = "medium") -> str:
219
  final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
220
  return final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"
221
 
222
- def summarize_document(file, summary_length: str):
 
 
 
 
 
 
 
 
 
 
 
223
  """Main processing function for Gradio interface"""
224
  if file is None:
225
- return "Please upload a document first", "Ready"
226
 
227
  file_path = file.name
228
  file_extension = file_path.split(".")[-1].lower()
229
 
230
  text, error = extract_text(file_path, file_extension)
231
  if error:
232
- return error, "Error"
233
 
234
  if not text or len(text.split()) < 30:
235
- return "Document is too short or contains too little text to summarize", "Ready"
236
 
237
  try:
238
  summary = generate_summary(text, summary_length)
239
- return summary, "Summary complete"
 
240
  except Exception as e:
241
- return f"Summarization error: {str(e)}", "Error"
242
 
243
  # Gradio Interface
244
  with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
245
- gr.Markdown("# 📄 Document Summarizer")
246
- gr.Markdown("Upload a document to generate a concise summary")
247
 
248
  with gr.Row():
249
  with gr.Column():
250
  file_input = gr.File(
251
  label="Upload Document",
252
- file_types=[".pdf", ".docx", ".pptx", ".xlsx"],
253
  type="filepath"
254
  )
255
  length_radio = gr.Radio(
@@ -257,19 +289,41 @@ with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
257
  value="medium",
258
  label="Summary Length"
259
  )
 
 
 
 
260
  submit_btn = gr.Button("Generate Summary", variant="primary")
261
 
262
  with gr.Column():
263
  output = gr.Textbox(label="Summary", lines=10)
264
  status = gr.Textbox(label="Status", interactive=False)
 
 
 
 
 
 
 
 
 
 
265
 
266
  submit_btn.click(
267
  fn=summarize_document,
268
- inputs=[file_input, length_radio],
269
- outputs=[output, status],
270
  api_name="summarize"
271
  )
272
 
 
 
 
 
 
 
 
 
273
  # Mount Gradio app to FastAPI
274
  app = gr.mount_gradio_app(app, demo, path="/")
275
 
 
103
  from nltk.tokenize import sent_tokenize
104
  import torch
105
  from fastapi import FastAPI
106
+ from fastapi.responses import RedirectResponse, FileResponse
107
+ from gtts import gTTS
108
+ import tempfile
109
+ import os
110
+ import easyocr
111
 
112
  # Download required NLTK data
113
  nltk.download('punkt', quiet=True)
 
115
  # Initialize components
116
  app = FastAPI()
117
 
118
+ # Load models (CPU optimized)
119
  MODEL_NAME = "facebook/bart-large-cnn"
120
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
121
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
 
127
  torch_dtype=torch.float32
128
  )
129
 
130
+ # Initialize EasyOCR reader
131
+ reader = easyocr.Reader(['en']) # English only for faster initialization
132
+
133
  def clean_text(text: str) -> str:
134
  """Clean and normalize document text"""
135
  text = re.sub(r'\s+', ' ', text) # Normalize whitespace
 
143
  try:
144
  if file_extension == "pdf":
145
  with fitz.open(file_path) as doc:
146
+ text = "\n".join(page.get_text("text") for page in doc)
147
+ # Fall back to OCR for scanned PDFs when fewer than 50 characters were extracted (only the first page is OCR'd)
148
+ if len(text.strip()) < 50:
149
+ images = [page.get_pixmap() for page in doc]
150
+ temp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
151
+ images[0].save(temp_img.name)
152
+ ocr_result = reader.readtext(temp_img.name, detail=0)
153
+ os.unlink(temp_img.name)
154
+ text = "\n".join(ocr_result) if ocr_result else text
155
+ return clean_text(text), ""
156
 
157
  elif file_extension == "docx":
158
  doc = docx.Document(file_path)
 
175
  text.append(" ".join(str(cell) for cell in row if cell))
176
  return clean_text("\n".join(text)), ""
177
 
178
+ elif file_extension in ["jpg", "jpeg", "png"]:
179
+ ocr_result = reader.readtext(file_path, detail=0)
180
+ return clean_text("\n".join(ocr_result)), ""
181
+
182
  return "", "Unsupported file format"
183
  except Exception as e:
184
  return "", f"Error reading {file_extension.upper()} file: {str(e)}"
 
239
  final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
240
  return final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"
241
 
242
def text_to_speech(text: str) -> str:
    """Convert text to speech and return the path of a temporary MP3 file.

    Args:
        text: The text to synthesize with gTTS.

    Returns:
        The path of the generated ``.mp3`` file, or an empty string when
        synthesis fails for any reason (the error is printed, not raised).
        The caller owns the returned file; it is not deleted automatically.
    """
    audio_path = ""
    try:
        tts = gTTS(text)
        # Only reserve a unique file name here. The handle is closed right
        # away so no descriptor is leaked and gTTS can reopen the path for
        # writing (an open NamedTemporaryFile cannot be reopened on Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            audio_path = temp_audio.name
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        print(f"Error in text-to-speech: {e}")
        # Do not leave an empty or partial audio file behind on failure.
        if audio_path and os.path.exists(audio_path):
            try:
                os.unlink(audio_path)
            except OSError:
                pass
        return ""
252
+
253
def summarize_document(file, summary_length: str, enable_tts: bool):
    """Main processing function for the Gradio interface.

    Args:
        file: The uploaded document, or None when nothing was uploaded.
            Either a path string or an object exposing the path as ``.name``.
        summary_length: Length setting forwarded to ``generate_summary``.
        enable_tts: When true, the summary is also converted to speech.

    Returns:
        A ``(message, status, audio_path)`` tuple. ``message`` is the summary
        or a human-readable error, ``status`` is a short status label, and
        ``audio_path`` is the generated audio file path or None when
        text-to-speech is disabled or did not produce a file.
    """
    if file is None:
        return "Please upload a document first", "Ready", None

    # NOTE(review): with gr.File(type="filepath") the value is expected to be
    # a path string; older Gradio versions pass a file wrapper with `.name`.
    # Accept both rather than assuming `.name` exists - confirm against the
    # Gradio version in use.
    file_path = file if isinstance(file, str) else file.name
    # splitext only looks at the final path component, so a dotted directory
    # name or a file without any extension is not mistaken for an extension.
    file_extension = os.path.splitext(file_path)[1].lstrip(".").lower()

    text, error = extract_text(file_path, file_extension)
    if error:
        return error, "Error", None

    if not text or len(text.split()) < 30:
        return "Document is too short or contains too little text to summarize", "Ready", None

    try:
        summary = generate_summary(text, summary_length)
        # text_to_speech signals failure with an empty string; hand the audio
        # component None instead of an empty path in that case.
        audio_path = (text_to_speech(summary) or None) if enable_tts else None
        return summary, "Summary complete", audio_path
    except Exception as e:
        return f"Summarization error: {str(e)}", "Error", None
274
 
275
  # Gradio Interface
276
  with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
277
+ gr.Markdown("# 📄 Advanced Document Summarizer")
278
+ gr.Markdown("Upload a document to generate a summary with optional audio reading")
279
 
280
  with gr.Row():
281
  with gr.Column():
282
  file_input = gr.File(
283
  label="Upload Document",
284
+ file_types=[".pdf", ".docx", ".pptx", ".xlsx", ".jpg", ".jpeg", ".png"],
285
  type="filepath"
286
  )
287
  length_radio = gr.Radio(
 
289
  value="medium",
290
  label="Summary Length"
291
  )
292
+ tts_checkbox = gr.Checkbox(
293
+ label="Enable Text-to-Speech",
294
+ value=False
295
+ )
296
  submit_btn = gr.Button("Generate Summary", variant="primary")
297
 
298
  with gr.Column():
299
  output = gr.Textbox(label="Summary", lines=10)
300
  status = gr.Textbox(label="Status", interactive=False)
301
+ audio_output = gr.Audio(label="Audio Summary", visible=False)
302
+
303
+ def toggle_audio_visibility(enable_tts):
304
+ return gr.Audio(visible=enable_tts)
305
+
306
+ tts_checkbox.change(
307
+ fn=toggle_audio_visibility,
308
+ inputs=tts_checkbox,
309
+ outputs=audio_output
310
+ )
311
 
312
  submit_btn.click(
313
  fn=summarize_document,
314
+ inputs=[file_input, length_radio, tts_checkbox],
315
+ outputs=[output, status, audio_output],
316
  api_name="summarize"
317
  )
318
 
319
+ # FastAPI endpoints for audio files
320
@app.get("/audio/{file_name}")
async def get_audio(file_name: str):
    """Serve a generated MP3 file from the system temporary directory.

    Args:
        file_name: Bare file name (no directory components) of an ``.mp3``
            file located in ``tempfile.gettempdir()``.

    Returns:
        A ``FileResponse`` with media type ``audio/mpeg`` when the file
        exists, otherwise a JSON ``{"error": "File not found"}`` body with
        HTTP status 404.
    """
    # Imported locally: the file's fastapi.responses import only brings in
    # RedirectResponse and FileResponse, so the 404 branch would otherwise
    # raise NameError instead of answering with a 404.
    from fastapi.responses import JSONResponse

    not_found = JSONResponse({"error": "File not found"}, status_code=404)

    # file_name comes straight from the URL. Reject anything that is not a
    # plain ".mp3" file name so the endpoint cannot be used to read other
    # files that happen to live in the temp directory (e.g. uploads).
    if os.path.basename(file_name) != file_name or not file_name.lower().endswith(".mp3"):
        return not_found

    file_path = os.path.join(tempfile.gettempdir(), file_name)
    if os.path.isfile(file_path):
        return FileResponse(file_path, media_type="audio/mpeg")
    return not_found
326
+
327
  # Mount Gradio app to FastAPI
328
  app = gr.mount_gradio_app(app, demo, path="/")
329