Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -216,69 +216,118 @@ def save_text_to_pdf(text, output_path):
|
|
216 |
# Define font size and line spacing
|
217 |
font_size = 9
|
218 |
line_spacing = 1 * font_size
|
219 |
-
|
220 |
-
|
221 |
-
#
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
for line in lines:
|
229 |
-
if
|
230 |
-
|
231 |
-
|
|
|
|
|
|
|
232 |
|
233 |
-
|
234 |
-
|
235 |
-
current_line += 1
|
236 |
|
237 |
-
|
238 |
-
|
|
|
239 |
|
240 |
-
|
241 |
-
def handle_query(query, is_read_pdf, instructions, pdf_file=None):
|
242 |
-
print("Handling user query...")
|
243 |
-
max_chars_per_chunk = 1000 # Adjust this value as needed to control chunk size
|
244 |
|
245 |
-
if
|
246 |
-
|
247 |
-
|
248 |
else:
|
249 |
-
|
250 |
-
|
251 |
-
for result in search_results
|
252 |
-
|
253 |
-
|
|
|
|
|
|
|
254 |
|
255 |
-
summaries
|
|
|
256 |
for chunk in text_chunks:
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
summaries.append(summary)
|
261 |
-
|
262 |
-
combined_summary = " ".join(summaries)
|
263 |
-
save_text_to_pdf(combined_summary, "output_summary.pdf")
|
264 |
-
return combined_summary
|
265 |
-
|
266 |
-
def run_app():
|
267 |
-
with gr.Blocks() as demo:
|
268 |
-
gr.Markdown("# Web and PDF Summarizer")
|
269 |
|
270 |
-
|
271 |
-
|
272 |
-
is_read_pdf = gr.Checkbox(label="Read PDF", value=False)
|
273 |
-
instructions = gr.Textbox(label="Enter instructions", placeholder="Enter instructions here")
|
274 |
-
output = gr.Textbox(label="Summary")
|
275 |
|
276 |
-
|
277 |
-
|
|
|
|
|
|
|
|
|
278 |
|
279 |
-
|
280 |
-
|
|
|
|
|
281 |
|
282 |
-
|
283 |
|
284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
# Define font size and line spacing
|
217 |
font_size = 9
|
218 |
line_spacing = 1 * font_size
|
219 |
+
fontname = "times-roman" # Use a supported font name
|
220 |
+
|
221 |
+
# Process the text
|
222 |
+
|
223 |
+
# Split the text into lines that fit within the text_width
|
224 |
+
lines = []
|
225 |
+
current_line = ""
|
226 |
+
current_line_width = 0
|
227 |
+
words = text.split(" ")
|
228 |
+
for word in words:
|
229 |
+
word_width = fitz.get_text_length(word, fontname, font_size)
|
230 |
+
if current_line_width + word_width <= text_width:
|
231 |
+
current_line += word + " "
|
232 |
+
current_line_width += word_width + fitz.get_text_length(" ", fontname, font_size)
|
233 |
+
else:
|
234 |
+
lines.append(current_line.strip())
|
235 |
+
current_line = word + " "
|
236 |
+
current_line_width = word_width + fitz.get_text_length(" ", fontname, font_size)
|
237 |
+
if current_line:
|
238 |
+
lines.append(current_line.strip())
|
239 |
+
|
240 |
+
# Add the lines to the page with margins
|
241 |
+
x = margin
|
242 |
+
y = margin
|
243 |
for line in lines:
|
244 |
+
if y + line_spacing > text_height:
|
245 |
+
# Create a new page if text exceeds the page height
|
246 |
+
page = doc.new_page()
|
247 |
+
y = margin # Reset y-coordinate for the new page
|
248 |
+
page.insert_text((x, y), line, fontname=fontname, fontsize=font_size)
|
249 |
+
y += line_spacing
|
250 |
|
251 |
+
doc.save(output_path) # Save the PDF to the specified output path
|
252 |
+
print(f"Text saved to PDF at {output_path}")
|
|
|
253 |
|
254 |
+
# Function to process the PDF or search query and generate a summary
|
255 |
+
def process_input(query_or_file, is_pdf, instructions, api_key):
    """Summarize an uploaded PDF or the results of a web search.

    The input text is cut into fixed-size character chunks, each chunk is
    turned into a prompt with ``format_prompt_with_instructions`` and
    summarized with ``generate_text``; the per-chunk summaries are then
    concatenated.

    Args:
        query_or_file: When ``is_pdf`` is true, the uploaded file handed to
            ``read_pdf``; otherwise the search query handed to
            ``google_search``.
        is_pdf: Selects the PDF branch (true) or the web-search branch (false).
        instructions: Summarization instructions forwarded with every chunk.
        api_key: Accepted so the UI callbacks keep working; this function
            does not read it.

    Returns:
        str: Every chunk summary followed by a blank line, or a short
        user-facing message when no input was supplied.
    """
    # Validate first: the UI can fire this callback with no upload or an
    # empty query, which previously failed on ``query_or_file.name`` or sent
    # a blank query to the search.
    if is_pdf:
        if query_or_file is None:
            return "Please upload a PDF file to summarize."
    elif not query_or_file or not str(query_or_file).strip():
        return "Please enter a search query to summarize."

    load_dotenv()  # Load environment variables from .env file

    if is_pdf:
        # The upload may be a file-like object exposing ``.name`` or a plain
        # path string; fall back to the value itself for the log line.
        print(f"Processing PDF: {getattr(query_or_file, 'name', query_or_file)}")
        input_text = read_pdf(query_or_file)
    else:
        print(f"Processing search query: {query_or_file}")
        search_results = google_search(query_or_file)
        # Skip results whose text is empty.
        input_text = "\n\n".join(
            result["text"] for result in search_results if result["text"]
        )

    # Split the input text into smaller chunks to fit within the token limit
    chunk_size = 1024  # Adjust as needed to stay within the token limit
    text_chunks = [
        input_text[i:i + chunk_size]
        for i in range(0, len(input_text), chunk_size)
    ]
    print(f"Total number of chunks: {len(text_chunks)}")

    # Generate a summary per chunk; collect the pieces and join once instead
    # of growing a string inside the loop.
    summaries = []
    for chunk in text_chunks:
        prompt = format_prompt_with_instructions(chunk, instructions)
        chunk_summary = generate_text(prompt)
        summaries.append(f"{chunk_summary}\n\n")

    print("Final concatenated summary generated.")
    return "".join(summaries)
|
|
|
|
|
|
|
282 |
|
283 |
+
# Function to clear cache
|
284 |
+
def clear_cache():
    """Delete cached Gradio files and the generated summary PDF.

    Removes the ``gradio`` directory under the system temporary directory
    and the ``output_summary.pdf`` file in the current working directory.
    Either one being absent is not an error.

    Returns:
        str: ``"Cache cleared successfully."`` on success, otherwise
        ``"Error clearing cache: <details>"``.
    """
    try:
        # Clear Gradio cache
        cache_dir = tempfile.gettempdir()
        shutil.rmtree(os.path.join(cache_dir, "gradio"), ignore_errors=True)

        # Clear any custom cache you might have.
        # Remove directly instead of checking existence first, so a file that
        # disappears between the check and the delete cannot cause a failure.
        try:
            os.remove("output_summary.pdf")
        except FileNotFoundError:
            pass

        # Add any other cache clearing operations here

        print("Cache cleared successfully.")
        return "Cache cleared successfully."
    except Exception as e:
        # UI callback boundary: report the failure to the user instead of
        # letting the exception escape.
        print(f"Error clearing cache: {e}")
        return f"Error clearing cache: {e}"
|
302 |
+
|
303 |
+
def summarization_interface():
    """Build the two-tab Gradio UI for PDF and web-search summarization.

    Each tab has its own input, instructions box, summary output, API-key
    field, "Generate Summary" button and "Clear Cache" button. The summary
    buttons call ``process_input``; the cache buttons call ``clear_cache``
    and show its status message in the tab's summary box.

    Returns:
        The assembled ``gr.Blocks`` object (not launched).
    """
    instructions_label = "Instructions for Summarization"
    instructions_hint = "Enter instructions for summarization"

    with gr.Blocks() as demo:
        gr.Markdown("# PDF and Web Summarization Tool")

        # --- Tab 1: summarize an uploaded PDF ---
        with gr.Tab("Summarize PDF"):
            upload = gr.File(label="Upload PDF", file_types=[".pdf"])
            upload_notes = gr.Textbox(
                label=instructions_label,
                placeholder=instructions_hint,
                lines=3,
            )
            upload_result = gr.Textbox(label="Summary Output")
            upload_key = gr.Textbox(label="Hugging Face API Key", type="password")
            upload_go = gr.Button("Generate Summary")
            upload_reset = gr.Button("Clear Cache")

        # --- Tab 2: summarize web search results ---
        with gr.Tab("Summarize Web Search"):
            web_query = gr.Textbox(
                label="Enter Search Query",
                placeholder="Enter search query",
            )
            web_notes = gr.Textbox(
                label=instructions_label,
                placeholder=instructions_hint,
                lines=3,
            )
            web_result = gr.Textbox(label="Summary Output")
            web_key = gr.Textbox(label="Hugging Face API Key", type="password")
            web_go = gr.Button("Generate Summary")
            web_reset = gr.Button("Clear Cache")

        # Wire the buttons; the lambdas pin the is_pdf flag for each tab.
        upload_go.click(
            fn=lambda file, instructions, api_key: process_input(file, True, instructions, api_key),
            inputs=[upload, upload_notes, upload_key],
            outputs=upload_result,
        )
        web_go.click(
            fn=lambda query, instructions, api_key: process_input(query, False, instructions, api_key),
            inputs=[web_query, web_notes, web_key],
            outputs=web_result,
        )
        upload_reset.click(fn=clear_cache, inputs=None, outputs=upload_result)
        web_reset.click(fn=clear_cache, inputs=None, outputs=web_result)

    return demo
|
330 |
+
|
331 |
+
# Launch the Gradio interface
|
332 |
+
# Build the interface at module level so `demo` remains importable.
demo = summarization_interface()

# Start the server only when this file is executed as a script, so importing
# the module does not launch a web server as a side effect.
if __name__ == "__main__":
    demo.launch()
|