ikraamkb committed on
Commit
315a442
·
verified ·
1 Parent(s): 9b32604

add download pdf

Browse files
Files changed (1) hide show
  1. app.py +45 -105
app.py CHANGED
@@ -1,97 +1,3 @@
1
- """import gradio as gr
2
- from transformers import pipeline
3
- import fitz # PyMuPDF
4
- import docx
5
- import pptx
6
- import openpyxl
7
- import os
8
-
9
- from fastapi import FastAPI
10
- from fastapi.responses import RedirectResponse
11
-
12
- # Load your custom summarization model
13
- pipe = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
14
-
15
- # Document text extraction function
16
- def extract_text(file):
17
- ext = file.name.split(".")[-1].lower()
18
- path = file.name
19
-
20
- if ext == "pdf":
21
- try:
22
- with fitz.open(path) as doc:
23
- return "\n".join([page.get_text("text") for page in doc])
24
- except Exception as e:
25
- return f"Error reading PDF: {e}"
26
-
27
- elif ext == "docx":
28
- try:
29
- doc = docx.Document(path)
30
- return "\n".join([p.text for p in doc.paragraphs])
31
- except Exception as e:
32
- return f"Error reading DOCX: {e}"
33
-
34
- elif ext == "pptx":
35
- try:
36
- prs = pptx.Presentation(path)
37
- text = ""
38
- for slide in prs.slides:
39
- for shape in slide.shapes:
40
- if hasattr(shape, "text"):
41
- text += shape.text + "\n"
42
- return text
43
- except Exception as e:
44
- return f"Error reading PPTX: {e}"
45
-
46
- elif ext == "xlsx":
47
- try:
48
- wb = openpyxl.load_workbook(path)
49
- text = ""
50
- for sheet in wb.sheetnames:
51
- for row in wb[sheet].iter_rows(values_only=True):
52
- text += " ".join([str(cell) for cell in row if cell]) + "\n"
53
- return text
54
- except Exception as e:
55
- return f"Error reading XLSX: {e}"
56
- else:
57
- return "Unsupported file format"
58
-
59
- # Summarization logic
60
- def summarize_document(file):
61
- text = extract_text(file)
62
- if "Error" in text or "Unsupported" in text:
63
- return text
64
-
65
- word_count = len(text.split())
66
- max_summary_len = max(20, int(word_count * 0.2))
67
-
68
- try:
69
- summary = pipe(text, max_length=max_summary_len, min_length=int(max_summary_len * 0.6), do_sample=False)
70
- # Print the summary to debug its structure
71
- print(summary)
72
- return summary[0]['summary_text'] # Access the correct key for the output
73
- except Exception as e:
74
- return f"Error during summarization: {e}"
75
-
76
- # Gradio Interface
77
- demo = gr.Interface(
78
- fn=summarize_document,
79
- inputs=gr.File(label="Upload a document (PDF, DOCX, PPTX, XLSX)", file_types=[".pdf", ".docx", ".pptx", ".xlsx"]),
80
- outputs=gr.Textbox(label="20% Summary"),
81
- title="📄 Document Summarizer (20% Length)",
82
- description="Upload a document and get a concise summary generated by your custom Hugging Face model."
83
- )
84
-
85
- # FastAPI setup
86
- app = FastAPI()
87
-
88
- # Mount Gradio at "/"
89
- app = gr.mount_gradio_app(app, demo, path="/")
90
-
91
- # Optional root redirect
92
- @app.get("/")
93
- def redirect_to_interface():
94
- return RedirectResponse(url="/")"""
95
  import gradio as gr
96
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
97
  import fitz # PyMuPDF
@@ -108,6 +14,8 @@ from gtts import gTTS
108
  import tempfile
109
  import os
110
  import easyocr
 
 
111
 
112
  # Download required NLTK data
113
  nltk.download('punkt', quiet=True)
@@ -250,32 +158,63 @@ def text_to_speech(text: str) -> str:
250
  print(f"Error in text-to-speech: {e}")
251
  return ""
252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  def summarize_document(file, summary_length: str, enable_tts: bool):
254
  """Main processing function for Gradio interface"""
255
  if file is None:
256
- return "Please upload a document first", "Ready", None
257
 
258
  file_path = file.name
259
  file_extension = file_path.split(".")[-1].lower()
 
260
 
261
  text, error = extract_text(file_path, file_extension)
262
  if error:
263
- return error, "Error", None
264
 
265
  if not text or len(text.split()) < 30:
266
- return "Document is too short or contains too little text to summarize", "Ready", None
267
 
268
  try:
269
  summary = generate_summary(text, summary_length)
270
  audio_path = text_to_speech(summary) if enable_tts else None
271
- return summary, "Summary complete", audio_path
 
272
  except Exception as e:
273
- return f"Summarization error: {str(e)}", "Error", None
274
 
275
  # Gradio Interface
276
  with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
277
  gr.Markdown("# 📄 Advanced Document Summarizer")
278
- gr.Markdown("Upload a document to generate a summary with optional audio reading")
279
 
280
  with gr.Row():
281
  with gr.Column():
@@ -299,6 +238,7 @@ with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
299
  output = gr.Textbox(label="Summary", lines=10)
300
  status = gr.Textbox(label="Status", interactive=False)
301
  audio_output = gr.Audio(label="Audio Summary", visible=False)
 
302
 
303
  def toggle_audio_visibility(enable_tts):
304
  return gr.Audio(visible=enable_tts)
@@ -312,16 +252,16 @@ with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
312
  submit_btn.click(
313
  fn=summarize_document,
314
  inputs=[file_input, length_radio, tts_checkbox],
315
- outputs=[output, status, audio_output],
316
  api_name="summarize"
317
  )
318
 
319
- # FastAPI endpoints for audio files
320
- @app.get("/audio/{file_name}")
321
- async def get_audio(file_name: str):
322
  file_path = os.path.join(tempfile.gettempdir(), file_name)
323
  if os.path.exists(file_path):
324
- return FileResponse(file_path, media_type="audio/mpeg")
325
  return JSONResponse({"error": "File not found"}, status_code=404)
326
 
327
  # Mount Gradio app to FastAPI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
3
  import fitz # PyMuPDF
 
14
  import tempfile
15
  import os
16
  import easyocr
17
+ from fpdf import FPDF
18
+ import datetime
19
 
20
  # Download required NLTK data
21
  nltk.download('punkt', quiet=True)
 
158
  print(f"Error in text-to-speech: {e}")
159
  return ""
160
 
161
+ def create_pdf(summary: str, original_filename: str) -> str:
162
+ """Create a PDF file from the summary text"""
163
+ try:
164
+ # Create PDF object
165
+ pdf = FPDF()
166
+ pdf.add_page()
167
+ pdf.set_font("Arial", size=12)
168
+
169
+ # Add title
170
+ pdf.set_font("Arial", 'B', 16)
171
+ pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
172
+ pdf.set_font("Arial", size=12)
173
+
174
+ # Add metadata
175
+ pdf.cell(200, 10, txt=f"Original file: {original_filename}", ln=1)
176
+ pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
177
+ pdf.ln(10)
178
+
179
+ # Add summary content
180
+ pdf.multi_cell(0, 10, txt=summary)
181
+
182
+ # Save to temporary file
183
+ temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
184
+ pdf.output(temp_pdf.name)
185
+ return temp_pdf.name
186
+ except Exception as e:
187
+ print(f"Error creating PDF: {e}")
188
+ return ""
189
+
190
  def summarize_document(file, summary_length: str, enable_tts: bool):
191
  """Main processing function for Gradio interface"""
192
  if file is None:
193
+ return "Please upload a document first", "Ready", None, None
194
 
195
  file_path = file.name
196
  file_extension = file_path.split(".")[-1].lower()
197
+ original_filename = os.path.basename(file_path)
198
 
199
  text, error = extract_text(file_path, file_extension)
200
  if error:
201
+ return error, "Error", None, None
202
 
203
  if not text or len(text.split()) < 30:
204
+ return "Document is too short or contains too little text to summarize", "Ready", None, None
205
 
206
  try:
207
  summary = generate_summary(text, summary_length)
208
  audio_path = text_to_speech(summary) if enable_tts else None
209
+ pdf_path = create_pdf(summary, original_filename)
210
+ return summary, "Summary complete", audio_path, pdf_path
211
  except Exception as e:
212
+ return f"Summarization error: {str(e)}", "Error", None, None
213
 
214
  # Gradio Interface
215
  with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
216
  gr.Markdown("# 📄 Advanced Document Summarizer")
217
+ gr.Markdown("Upload a document to generate a summary with optional audio reading and PDF download")
218
 
219
  with gr.Row():
220
  with gr.Column():
 
238
  output = gr.Textbox(label="Summary", lines=10)
239
  status = gr.Textbox(label="Status", interactive=False)
240
  audio_output = gr.Audio(label="Audio Summary", visible=False)
241
+ pdf_download = gr.File(label="Download Summary as PDF", visible=False)
242
 
243
  def toggle_audio_visibility(enable_tts):
244
  return gr.Audio(visible=enable_tts)
 
252
  submit_btn.click(
253
  fn=summarize_document,
254
  inputs=[file_input, length_radio, tts_checkbox],
255
+ outputs=[output, status, audio_output, pdf_download],
256
  api_name="summarize"
257
  )
258
 
259
+ # FastAPI endpoints for files
260
+ @app.get("/files/{file_name}")
261
+ async def get_file(file_name: str):
262
  file_path = os.path.join(tempfile.gettempdir(), file_name)
263
  if os.path.exists(file_path):
264
+ return FileResponse(file_path)
265
  return JSONResponse({"error": "File not found"}, status_code=404)
266
 
267
  # Mount Gradio app to FastAPI