Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -216,69 +216,118 @@ def save_text_to_pdf(text, output_path):
|
|
| 216 |
# Define font size and line spacing
|
| 217 |
font_size = 9
|
| 218 |
line_spacing = 1 * font_size
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
#
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
for line in lines:
|
| 229 |
-
if
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
current_line += 1
|
| 236 |
|
| 237 |
-
|
| 238 |
-
|
|
|
|
| 239 |
|
| 240 |
-
|
| 241 |
-
def handle_query(query, is_read_pdf, instructions, pdf_file=None):
|
| 242 |
-
print("Handling user query...")
|
| 243 |
-
max_chars_per_chunk = 1000 # Adjust this value as needed to control chunk size
|
| 244 |
|
| 245 |
-
if
|
| 246 |
-
|
| 247 |
-
|
| 248 |
else:
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
for result in search_results
|
| 252 |
-
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
-
summaries
|
|
|
|
| 256 |
for chunk in text_chunks:
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
summaries.append(summary)
|
| 261 |
-
|
| 262 |
-
combined_summary = " ".join(summaries)
|
| 263 |
-
save_text_to_pdf(combined_summary, "output_summary.pdf")
|
| 264 |
-
return combined_summary
|
| 265 |
-
|
| 266 |
-
def run_app():
|
| 267 |
-
with gr.Blocks() as demo:
|
| 268 |
-
gr.Markdown("# Web and PDF Summarizer")
|
| 269 |
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
is_read_pdf = gr.Checkbox(label="Read PDF", value=False)
|
| 273 |
-
instructions = gr.Textbox(label="Enter instructions", placeholder="Enter instructions here")
|
| 274 |
-
output = gr.Textbox(label="Summary")
|
| 275 |
|
| 276 |
-
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
-
|
| 280 |
-
|
|
|
|
|
|
|
| 281 |
|
| 282 |
-
|
| 283 |
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
# Define font size and line spacing
|
| 217 |
font_size = 9
|
| 218 |
line_spacing = 1 * font_size
|
| 219 |
+
fontname = "times-roman" # Use a supported font name
|
| 220 |
+
|
| 221 |
+
# Process the text
|
| 222 |
+
|
| 223 |
+
into lines that fit within the text_width
|
| 224 |
+
lines = []
|
| 225 |
+
current_line = ""
|
| 226 |
+
current_line_width = 0
|
| 227 |
+
words = text.split(" ")
|
| 228 |
+
for word in words:
|
| 229 |
+
word_width = fitz.get_text_length(word, fontname, font_size)
|
| 230 |
+
if current_line_width + word_width <= text_width:
|
| 231 |
+
current_line += word + " "
|
| 232 |
+
current_line_width += word_width + fitz.get_text_length(" ", fontname, font_size)
|
| 233 |
+
else:
|
| 234 |
+
lines.append(current_line.strip())
|
| 235 |
+
current_line = word + " "
|
| 236 |
+
current_line_width = word_width + fitz.get_text_length(" ", fontname, font_size)
|
| 237 |
+
if current_line:
|
| 238 |
+
lines.append(current_line.strip())
|
| 239 |
+
|
| 240 |
+
# Add the lines to the page with margins
|
| 241 |
+
x = margin
|
| 242 |
+
y = margin
|
| 243 |
for line in lines:
|
| 244 |
+
if y + line_spacing > text_height:
|
| 245 |
+
# Create a new page if text exceeds the page height
|
| 246 |
+
page = doc.new_page()
|
| 247 |
+
y = margin # Reset y-coordinate for the new page
|
| 248 |
+
page.insert_text((x, y), line, fontname=fontname, fontsize=font_size)
|
| 249 |
+
y += line_spacing
|
| 250 |
|
| 251 |
+
doc.save(output_path) # Save the PDF to the specified output path
|
| 252 |
+
print(f"Text saved to PDF at {output_path}")
|
|
|
|
| 253 |
|
| 254 |
+
# Function to process the PDF or search query and generate a summary
def process_input(query_or_file, is_pdf, instructions, api_key):
    """Summarize an uploaded PDF or the text returned by a web search.

    Args:
        query_or_file: The uploaded PDF when ``is_pdf`` is true, otherwise
            the search query string.
        is_pdf: Selects the PDF branch (``read_pdf``) over the search
            branch (``google_search``).
        instructions: Forwarded to ``format_prompt_with_instructions``
            together with each text chunk.
        api_key: Accepted so the UI callbacks keep their signature; this
            function does not use it.

    Returns:
        Each chunk summary followed by a blank line, concatenated in order,
        or a short prompt message when no input was supplied.
    """
    # Reject missing input up front so the UI shows a message instead of an
    # AttributeError (no file uploaded) or an empty search being issued.
    if is_pdf:
        if query_or_file is None:
            return "Please upload a PDF file."
    elif query_or_file is None or not str(query_or_file).strip():
        return "Please enter a search query."

    load_dotenv()  # Load environment variables from .env file

    if is_pdf:
        # The upload may be a file-like object with ``.name`` or a plain
        # path string; fall back to the value itself for the log line.
        pdf_label = getattr(query_or_file, "name", query_or_file)
        print(f"Processing PDF: {pdf_label}")
        input_text = read_pdf(query_or_file)
    else:
        print(f"Processing search query: {query_or_file}")
        search_results = google_search(query_or_file)
        input_text = "\n\n".join(
            result["text"] for result in search_results if result["text"]
        )

    # Split the input text into smaller chunks to fit within the token limit
    chunk_size = 1024  # Adjust as needed to stay within the token limit
    text_chunks = [
        input_text[i:i + chunk_size]
        for i in range(0, len(input_text), chunk_size)
    ]
    print(f"Total number of chunks: {len(text_chunks)}")

    # Summarize each chunk, then join once instead of growing a string.
    chunk_summaries = [
        generate_text(format_prompt_with_instructions(chunk, instructions))
        for chunk in text_chunks
    ]
    concatenated_summary = "".join(
        f"{chunk_summary}\n\n" for chunk_summary in chunk_summaries
    )

    print("Final concatenated summary generated.")
    return concatenated_summary
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
+
# Function to clear cache
def clear_cache():
    """Delete the Gradio temp directory and the generated summary PDF.

    Removes the ``gradio`` directory under the system temp directory
    (errors ignored) and ``output_summary.pdf`` in the current working
    directory if it exists.

    Returns:
        ``"Cache cleared successfully."`` on success, otherwise
        ``"Error clearing cache: <exception>"``. The same text is printed.
    """
    try:
        # Clear Gradio cache
        # NOTE(review): this may also remove files that in-flight requests
        # still rely on -- confirm that is acceptable.
        cache_dir = tempfile.gettempdir()
        shutil.rmtree(os.path.join(cache_dir, "gradio"), ignore_errors=True)

        # Clear any custom cache you might have.
        # Remove directly and tolerate absence: a separate exists() check
        # can race with another request deleting the file first.
        try:
            os.remove("output_summary.pdf")
        except FileNotFoundError:
            pass

        # Add any other cache clearing operations here

        print("Cache cleared successfully.")
        return "Cache cleared successfully."
    except Exception as e:
        # UI callback boundary: report the failure as text rather than
        # letting it propagate.
        print(f"Error clearing cache: {e}")
        return f"Error clearing cache: {e}"
|
| 302 |
+
|
| 303 |
+
def summarization_interface():
    """Assemble the two-tab summarization UI and return it unlaunched.

    One tab takes a PDF upload, the other a search query. Each tab has an
    instructions box, a summary output box, an API-key box, a
    "Generate Summary" button wired to ``process_input`` and a
    "Clear Cache" button wired to ``clear_cache``.

    Returns:
        The ``gr.Blocks`` instance holding the whole interface.
    """

    def _summarize_pdf(file, instructions, api_key):
        # PDF tab callback: fixes ``is_pdf`` to True.
        return process_input(file, True, instructions, api_key)

    def _summarize_search(query, instructions, api_key):
        # Search tab callback: fixes ``is_pdf`` to False.
        return process_input(query, False, instructions, api_key)

    with gr.Blocks() as demo:
        gr.Markdown("# PDF and Web Summarization Tool")

        with gr.Tab("Summarize PDF"):
            pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
            pdf_prompt = gr.Textbox(
                label="Instructions for Summarization",
                placeholder="Enter instructions for summarization",
                lines=3,
            )
            pdf_result = gr.Textbox(label="Summary Output")
            pdf_key = gr.Textbox(label="Hugging Face API Key", type="password")
            pdf_run = gr.Button("Generate Summary")
            pdf_clear = gr.Button("Clear Cache")

        with gr.Tab("Summarize Web Search"):
            web_query = gr.Textbox(
                label="Enter Search Query",
                placeholder="Enter search query",
            )
            web_prompt = gr.Textbox(
                label="Instructions for Summarization",
                placeholder="Enter instructions for summarization",
                lines=3,
            )
            web_result = gr.Textbox(label="Summary Output")
            web_key = gr.Textbox(label="Hugging Face API Key", type="password")
            web_run = gr.Button("Generate Summary")
            web_clear = gr.Button("Clear Cache")

        # Bind functions to button clicks
        pdf_run.click(
            fn=_summarize_pdf,
            inputs=[pdf_upload, pdf_prompt, pdf_key],
            outputs=pdf_result,
        )
        web_run.click(
            fn=_summarize_search,
            inputs=[web_query, web_prompt, web_key],
            outputs=web_result,
        )
        # The cache-clear status message is shown in the tab's summary box.
        pdf_clear.click(fn=clear_cache, inputs=None, outputs=pdf_result)
        web_clear.click(fn=clear_cache, inputs=None, outputs=web_result)

    return demo
|
| 330 |
+
|
| 331 |
+
# Build the interface once at module load and start serving it; the Blocks
# object stays bound to the module-level name `demo`.
demo = summarization_interface()
demo.launch()
|