Shreyas094 committed on
Commit
09df7eb
·
verified ·
1 Parent(s): e15b1c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -53
app.py CHANGED
@@ -216,69 +216,118 @@ def save_text_to_pdf(text, output_path):
216
  # Define font size and line spacing
217
  font_size = 9
218
  line_spacing = 1 * font_size
219
- max_lines_per_page = int(text_height // line_spacing)
220
-
221
- # Load a built-in font
222
- font = "helv"
223
-
224
- # Split the text into lines
225
- lines = text.split("\n")
226
-
227
- current_line = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  for line in lines:
229
- if current_line >= max_lines_per_page:
230
- page = doc.new_page() # Add a new page
231
- current_line = 0
 
 
 
232
 
233
- rect = fitz.Rect(margin, margin + current_line * line_spacing, text_width, margin + (current_line + 1) * line_spacing)
234
- page.insert_textbox(rect, line, fontsize=font_size, fontname=font, align=fitz.TEXT_ALIGN_LEFT)
235
- current_line += 1
236
 
237
- doc.save(output_path)
238
- print(f"Text saved to PDF at {output_path}.")
 
239
 
240
- # Function to handle user queries
241
- def handle_query(query, is_read_pdf, instructions, pdf_file=None):
242
- print("Handling user query...")
243
- max_chars_per_chunk = 1000 # Adjust this value as needed to control chunk size
244
 
245
- if is_read_pdf and pdf_file:
246
- pdf_text = read_pdf(pdf_file)
247
- text_chunks = [pdf_text[i:i+max_chars_per_chunk] for i in range(0, len(pdf_text), max_chars_per_chunk)]
248
  else:
249
- search_results = google_search(query)
250
- text_chunks = []
251
- for result in search_results:
252
- if result["text"]:
253
- text_chunks.extend([result["text"][i:i+max_chars_per_chunk] for i in range(0, len(result["text"]), max_chars_per_chunk)])
 
 
 
254
 
255
- summaries = []
 
256
  for chunk in text_chunks:
257
- formatted_prompt = format_prompt_with_instructions(chunk, instructions)
258
- summary = generate_text(formatted_prompt)
259
- if summary:
260
- summaries.append(summary)
261
-
262
- combined_summary = " ".join(summaries)
263
- save_text_to_pdf(combined_summary, "output_summary.pdf")
264
- return combined_summary
265
-
266
- def run_app():
267
- with gr.Blocks() as demo:
268
- gr.Markdown("# Web and PDF Summarizer")
269
 
270
- query = gr.Textbox(label="Enter your query", placeholder="Enter query here")
271
- pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
272
- is_read_pdf = gr.Checkbox(label="Read PDF", value=False)
273
- instructions = gr.Textbox(label="Enter instructions", placeholder="Enter instructions here")
274
- output = gr.Textbox(label="Summary")
275
 
276
- clear_cache_btn = gr.Button("Clear Cache")
277
- clear_cache_btn.click(fn=clear_cache, outputs=output)
 
 
 
 
278
 
279
- generate_btn = gr.Button("Generate Summary")
280
- generate_btn.click(fn=handle_query, inputs=[query, is_read_pdf, instructions, pdf_file], outputs=output)
 
 
281
 
282
- demo.launch()
283
 
284
- run_app()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  # Define font size and line spacing
217
  font_size = 9
218
  line_spacing = 1 * font_size
219
+ fontname = "times-roman" # Use a supported font name
220
+
221
+ # Process the text
222
+
223
+ # into lines that fit within the text_width
224
+ lines = []
225
+ current_line = ""
226
+ current_line_width = 0
227
+ words = text.split(" ")
228
+ for word in words:
229
+ word_width = fitz.get_text_length(word, fontname, font_size)
230
+ if current_line_width + word_width <= text_width:
231
+ current_line += word + " "
232
+ current_line_width += word_width + fitz.get_text_length(" ", fontname, font_size)
233
+ else:
234
+ lines.append(current_line.strip())
235
+ current_line = word + " "
236
+ current_line_width = word_width + fitz.get_text_length(" ", fontname, font_size)
237
+ if current_line:
238
+ lines.append(current_line.strip())
239
+
240
+ # Add the lines to the page with margins
241
+ x = margin
242
+ y = margin
243
  for line in lines:
244
+ if y + line_spacing > text_height:
245
+ # Create a new page if text exceeds the page height
246
+ page = doc.new_page()
247
+ y = margin # Reset y-coordinate for the new page
248
+ page.insert_text((x, y), line, fontname=fontname, fontsize=font_size)
249
+ y += line_spacing
250
 
251
+ doc.save(output_path) # Save the PDF to the specified output path
252
+ print(f"Text saved to PDF at {output_path}")
 
253
 
254
+ # Function to process the PDF or search query and generate a summary
255
+ def process_input(query_or_file, is_pdf, instructions, api_key):
256
+ load_dotenv() # Load environment variables from .env file
257
 
258
+ HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 
 
 
259
 
260
+ if is_pdf:
261
+ print(f"Processing PDF: {query_or_file.name}")
262
+ input_text = read_pdf(query_or_file)
263
  else:
264
+ print(f"Processing search query: {query_or_file}")
265
+ search_results = google_search(query_or_file)
266
+ input_text = "\n\n".join(result["text"] for result in search_results if result["text"])
267
+
268
+ # Split the input text into smaller chunks to fit within the token limit
269
+ chunk_size = 1024 # Adjust as needed to stay within the token limit
270
+ text_chunks = [input_text[i:i + chunk_size] for i in range(0, len(input_text), chunk_size)]
271
+ print(f"Total number of chunks: {len(text_chunks)}")
272
 
273
+ # Generate summaries for each chunk and concatenate them
274
+ concatenated_summary = ""
275
  for chunk in text_chunks:
276
+ prompt = format_prompt_with_instructions(chunk, instructions)
277
+ chunk_summary = generate_text(prompt)
278
+ concatenated_summary += f"{chunk_summary}\n\n"
 
 
 
 
 
 
 
 
 
279
 
280
+ print("Final concatenated summary generated.")
281
+ return concatenated_summary
 
 
 
282
 
283
+ # Function to clear cache
284
+ def clear_cache():
285
+ try:
286
+ # Clear Gradio cache
287
+ cache_dir = tempfile.gettempdir()
288
+ shutil.rmtree(os.path.join(cache_dir, "gradio"), ignore_errors=True)
289
 
290
+ # Clear any custom cache you might have
291
+ # For example, if you're caching PDF files or search results:
292
+ if os.path.exists("output_summary.pdf"):
293
+ os.remove("output_summary.pdf")
294
 
295
+ # Add any other cache clearing operations here
296
 
297
+ print("Cache cleared successfully.")
298
+ return "Cache cleared successfully."
299
+ except Exception as e:
300
+ print(f"Error clearing cache: {e}")
301
+ return f"Error clearing cache: {e}"
302
+
303
+ def summarization_interface():
304
+ with gr.Blocks() as demo:
305
+ gr.Markdown("# PDF and Web Summarization Tool")
306
+
307
+ with gr.Tab("Summarize PDF"):
308
+ pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
309
+ pdf_instructions = gr.Textbox(label="Instructions for Summarization", placeholder="Enter instructions for summarization", lines=3)
310
+ pdf_summary_output = gr.Textbox(label="Summary Output")
311
+ pdf_api_key = gr.Textbox(label="Hugging Face API Key", type="password")
312
+ pdf_summarize_button = gr.Button("Generate Summary")
313
+ pdf_clear_cache_button = gr.Button("Clear Cache")
314
+
315
+ with gr.Tab("Summarize Web Search"):
316
+ search_query = gr.Textbox(label="Enter Search Query", placeholder="Enter search query")
317
+ search_instructions = gr.Textbox(label="Instructions for Summarization", placeholder="Enter instructions for summarization", lines=3)
318
+ search_summary_output = gr.Textbox(label="Summary Output")
319
+ search_api_key = gr.Textbox(label="Hugging Face API Key", type="password")
320
+ search_summarize_button = gr.Button("Generate Summary")
321
+ search_clear_cache_button = gr.Button("Clear Cache")
322
+
323
+ # Bind functions to button clicks
324
+ pdf_summarize_button.click(fn=lambda file, instructions, api_key: process_input(file, True, instructions, api_key), inputs=[pdf_file, pdf_instructions, pdf_api_key], outputs=pdf_summary_output)
325
+ search_summarize_button.click(fn=lambda query, instructions, api_key: process_input(query, False, instructions, api_key), inputs=[search_query, search_instructions, search_api_key], outputs=search_summary_output)
326
+ pdf_clear_cache_button.click(fn=clear_cache, inputs=None, outputs=pdf_summary_output)
327
+ search_clear_cache_button.click(fn=clear_cache, inputs=None, outputs=search_summary_output)
328
+
329
+ return demo
330
+
331
+ # Launch the Gradio interface
332
+ demo = summarization_interface()
333
+ demo.launch()