awacke1 committed on
Commit
bf5b316
·
verified ·
1 Parent(s): 0df46b2

Update back.branched.PDFAddedRAG.03282025.app.py

Browse files
back.branched.PDFAddedRAG.03282025.app.py CHANGED
@@ -101,9 +101,10 @@ def generate_filename(prompt, file_type, original_name=None):
101
  safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
102
  if original_name and file_type == "md": # For images
103
  base_name = os.path.splitext(original_name)[0]
104
- safe_prompt = re.sub(r'[<>:"/\\|?*\n]', ' ', prompt).strip()[:100]
105
- return f"{safe_date_time}_{safe_prompt}_{base_name}.{file_type}"
106
- safe_prompt = re.sub(r'[<>:"/\\|?*\n]', ' ', prompt).strip()[:240]
 
107
  return f"{safe_date_time}_{safe_prompt}.{file_type}"
108
 
109
  def create_and_save_file(content, file_type="md", prompt=None, original_name=None, should_save=True):
@@ -308,7 +309,7 @@ def generate_questions(pdf_path):
308
  pdf = PdfReader(f)
309
  for page in pdf.pages:
310
  text += page.extract_text() or ""
311
- prompt = f"Can you generate a question that can only be answered from this document?:\n{text[:2000]}\n\n"
312
  response = client.chat.completions.create(
313
  model="gpt-4o-2024-05-13",
314
  messages=[{"role": "user", "content": prompt}]
@@ -320,7 +321,12 @@ def process_rag_query(query, vector_store_id):
320
  response = client.chat.completions.create(
321
  model="gpt-4o-2024-05-13",
322
  messages=[{"role": "user", "content": query}],
323
- tools=[{"type": "file_search", "file_search": {"vector_store_ids": [vector_store_id]}}],
 
 
 
 
 
324
  tool_choice="auto"
325
  )
326
  tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else []
@@ -331,26 +337,28 @@ def process_rag_query(query, vector_store_id):
331
 
332
  def evaluate_rag(vector_store_id, questions_dict):
333
  k = 5
334
- total_queries = len(questions_dict)
335
  correct_retrievals_at_k = 0
336
  reciprocal_ranks = []
337
  average_precisions = []
338
 
339
- for filename, query in questions_dict.items():
340
- expected_file = filename
341
- response, tool_calls = process_rag_query(query, vector_store_id)
342
- if not tool_calls:
343
- continue
344
- retrieved_files = [call.arguments.get("file_id", "") for call in tool_calls if "file_search" in call.type][:k]
345
- if expected_file in retrieved_files:
346
- rank = retrieved_files.index(expected_file) + 1
347
- correct_retrievals_at_k += 1
348
- reciprocal_ranks.append(1 / rank)
349
- precisions = [1 if f == expected_file else 0 for f in retrieved_files[:rank]]
350
- average_precisions.append(sum(precisions) / len(precisions))
351
- else:
352
- reciprocal_ranks.append(0)
353
- average_precisions.append(0)
 
 
354
 
355
  recall_at_k = correct_retrievals_at_k / total_queries if total_queries else 0
356
  mrr = sum(reciprocal_ranks) / total_queries if total_queries else 0
@@ -367,12 +375,26 @@ def rag_pdf_gallery():
367
  stats = upload_pdf_files_to_vector_store(vector_store_details["id"], pdf_paths)
368
  st.json(stats)
369
 
370
- with st.spinner("Generating evaluation questions..."):
 
 
 
 
 
 
 
 
 
 
 
371
  questions_dict = {os.path.basename(p): generate_questions(p) for p in pdf_paths}
372
- st.json(questions_dict)
 
 
 
373
 
374
- query = st.text_input("Ask a question about the PDFs:")
375
- if query:
376
  with st.spinner("Processing RAG query..."):
377
  response, tool_calls = process_rag_query(query, vector_store_details["id"])
378
  if response:
@@ -381,6 +403,7 @@ def rag_pdf_gallery():
381
  for call in tool_calls:
382
  if "file_search" in call.type:
383
  st.json(call.arguments)
 
384
 
385
  if st.button("Evaluate RAG Performance"):
386
  with st.spinner("Evaluating..."):
@@ -465,7 +488,7 @@ def main():
465
  option = st.selectbox("Select Input Type", ("Text", "Image", "Audio", "Video", "ArXiv Search", "RAG PDF Gallery"))
466
 
467
  if option == "Text":
468
- default_text = "emojis in markdown. Maybe a buckeyball feature rating comparing them against each other in markdown emoji outline or tables."
469
  col1, col2 = st.columns([1, 5])
470
  with col1:
471
  if st.button("๐Ÿ“ MD", key="md_button"):
@@ -475,9 +498,10 @@ def main():
475
  st.rerun()
476
  with col2:
477
  text_input = st.text_input("Enter your text:", value=st.session_state.get("text_input", ""), key="text_input_field")
478
- if text_input and text_input != st.session_state.get("text_input", ""): # Only process if changed
479
  with st.spinner("Processing..."):
480
  process_text(text_input)
 
481
 
482
  elif option == "Image":
483
  col1, col2 = st.columns(2)
@@ -489,7 +513,7 @@ def main():
489
  st.session_state["image_prompt"] = "Show electronic text of text in the image."
490
  text_input = st.text_input("Image Prompt:", value=st.session_state.get("image_prompt", "Describe this image and list ten facts in a markdown outline with emojis."))
491
  image_input = st.file_uploader("Upload an image (max 200MB)", type=["png", "jpg", "jpeg"], accept_multiple_files=False)
492
- if image_input and text_input:
493
  if image_input.size > 200 * 1024 * 1024:
494
  st.error("Image exceeds 200MB limit.")
495
  else:
@@ -503,13 +527,13 @@ def main():
503
  text_input = st.text_input("Audio Prompt:", value="Summarize this audio transcription in Markdown.")
504
  audio_input = st.file_uploader("Upload an audio file (max 200MB)", type=["mp3", "wav", "flac", "m4a"], accept_multiple_files=False)
505
  audio_bytes = audio_recorder()
506
- if audio_bytes:
507
  with open("recorded_audio.wav", "wb") as f:
508
  f.write(audio_bytes)
509
  with st.spinner("Processing..."):
510
  process_audio(audio_bytes, text_input)
511
  st.rerun()
512
- elif audio_input and text_input:
513
  with st.spinner("Processing..."):
514
  process_audio(audio_input, text_input)
515
  st.rerun()
@@ -517,7 +541,7 @@ def main():
517
  elif option == "Video":
518
  text_input = st.text_input("Video Prompt:", value="Summarize this video and its transcription in Markdown.")
519
  video_input = st.file_uploader("Upload a video file (max 200MB)", type=["mp4"], accept_multiple_files=False)
520
- if video_input and text_input:
521
  if video_input.size > 200 * 1024 * 1024:
522
  st.error("Video exceeds 200MB limit.")
523
  else:
@@ -527,7 +551,7 @@ def main():
527
 
528
  elif option == "ArXiv Search":
529
  query = st.text_input("AI Search ArXiv Scholarly Articles:")
530
- if query:
531
  with st.spinner("Searching ArXiv..."):
532
  result = search_arxiv(query)
533
  st.markdown(result)
 
101
  safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
102
  if original_name and file_type == "md": # For images
103
  base_name = os.path.splitext(original_name)[0]
104
+ safe_prompt = re.sub(r'[<>:"/\\|?*\n]', ' ', prompt).strip()[:50]
105
+ file_stem = f"{safe_date_time}_{safe_prompt}_{base_name}"[:100] # Cap at 100 chars
106
+ return f"{file_stem}.{file_type}"
107
+ safe_prompt = re.sub(r'[<>:"/\\|?*\n]', ' ', prompt).strip()[:100] # Cap at 100 chars
108
  return f"{safe_date_time}_{safe_prompt}.{file_type}"
109
 
110
  def create_and_save_file(content, file_type="md", prompt=None, original_name=None, should_save=True):
 
309
  pdf = PdfReader(f)
310
  for page in pdf.pages:
311
  text += page.extract_text() or ""
312
+ prompt = f"Generate a 10-question quiz with answers based only on this document. Format as markdown with numbered questions and answers:\n{text[:2000]}\n\n"
313
  response = client.chat.completions.create(
314
  model="gpt-4o-2024-05-13",
315
  messages=[{"role": "user", "content": prompt}]
 
321
  response = client.chat.completions.create(
322
  model="gpt-4o-2024-05-13",
323
  messages=[{"role": "user", "content": query}],
324
+ tools=[{
325
+ "type": "file_search",
326
+ "file_search": {
327
+ "vector_store_ids": [vector_store_id]
328
+ }
329
+ }],
330
  tool_choice="auto"
331
  )
332
  tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else []
 
337
 
338
  def evaluate_rag(vector_store_id, questions_dict):
339
  k = 5
340
+ total_queries = len(questions_dict) * 10 # 10 questions per PDF
341
  correct_retrievals_at_k = 0
342
  reciprocal_ranks = []
343
  average_precisions = []
344
 
345
+ for filename, quiz in questions_dict.items():
346
+ questions = re.findall(r"\d+\.\s(.*?)\n\s*Answer:\s(.*?)\n", quiz, re.DOTALL)
347
+ for question, _ in questions:
348
+ expected_file = filename
349
+ response, tool_calls = process_rag_query(question, vector_store_id)
350
+ if not tool_calls:
351
+ continue
352
+ retrieved_files = [call.arguments.get("file_id", "") for call in tool_calls if "file_search" in call.type][:k]
353
+ if expected_file in retrieved_files:
354
+ rank = retrieved_files.index(expected_file) + 1
355
+ correct_retrievals_at_k += 1
356
+ reciprocal_ranks.append(1 / rank)
357
+ precisions = [1 if f == expected_file else 0 for f in retrieved_files[:rank]]
358
+ average_precisions.append(sum(precisions) / len(precisions))
359
+ else:
360
+ reciprocal_ranks.append(0)
361
+ average_precisions.append(0)
362
 
363
  recall_at_k = correct_retrievals_at_k / total_queries if total_queries else 0
364
  mrr = sum(reciprocal_ranks) / total_queries if total_queries else 0
 
375
  stats = upload_pdf_files_to_vector_store(vector_store_details["id"], pdf_paths)
376
  st.json(stats)
377
 
378
+ col1, col2, col3 = st.columns(3)
379
+ with col1:
380
+ if st.button("๐Ÿ“ Quiz"):
381
+ st.session_state["rag_prompt"] = "Generate a 10-question quiz with answers based only on this document."
382
+ with col2:
383
+ if st.button("📑 Summary"):
384
+ st.session_state["rag_prompt"] = "Summarize this per page and output as markdown outline with emojis and numbered outline with multiple levels summarizing everything unique per page in method steps or fact steps."
385
+ with col3:
386
+ if st.button("๐Ÿ” Key Facts"):
387
+ st.session_state["rag_prompt"] = "Extract 10 key facts from this document in markdown with emojis."
388
+
389
+ with st.spinner("Generating questions..."):
390
  questions_dict = {os.path.basename(p): generate_questions(p) for p in pdf_paths}
391
+ st.markdown("### Generated Quiz")
392
+ for filename, quiz in questions_dict.items():
393
+ st.markdown(f"#### {filename}")
394
+ st.markdown(quiz)
395
 
396
+ query = st.text_input("Ask a question about the PDFs:", value=st.session_state.get("rag_prompt", ""))
397
+ if query and st.button("Submit RAG Query"):
398
  with st.spinner("Processing RAG query..."):
399
  response, tool_calls = process_rag_query(query, vector_store_details["id"])
400
  if response:
 
403
  for call in tool_calls:
404
  if "file_search" in call.type:
405
  st.json(call.arguments)
406
+ st.rerun()
407
 
408
  if st.button("Evaluate RAG Performance"):
409
  with st.spinner("Evaluating..."):
 
488
  option = st.selectbox("Select Input Type", ("Text", "Image", "Audio", "Video", "ArXiv Search", "RAG PDF Gallery"))
489
 
490
  if option == "Text":
491
+ default_text = "Create a summary of PDF py libraries and usage in py with emojis in markdown. Maybe a buckeyball feature rating comparing them against each other in markdown emoji outline or tables."
492
  col1, col2 = st.columns([1, 5])
493
  with col1:
494
  if st.button("๐Ÿ“ MD", key="md_button"):
 
498
  st.rerun()
499
  with col2:
500
  text_input = st.text_input("Enter your text:", value=st.session_state.get("text_input", ""), key="text_input_field")
501
+ if text_input and st.button("Submit Text"):
502
  with st.spinner("Processing..."):
503
  process_text(text_input)
504
+ st.rerun()
505
 
506
  elif option == "Image":
507
  col1, col2 = st.columns(2)
 
513
  st.session_state["image_prompt"] = "Show electronic text of text in the image."
514
  text_input = st.text_input("Image Prompt:", value=st.session_state.get("image_prompt", "Describe this image and list ten facts in a markdown outline with emojis."))
515
  image_input = st.file_uploader("Upload an image (max 200MB)", type=["png", "jpg", "jpeg"], accept_multiple_files=False)
516
+ if image_input and text_input and st.button("Submit Image"):
517
  if image_input.size > 200 * 1024 * 1024:
518
  st.error("Image exceeds 200MB limit.")
519
  else:
 
527
  text_input = st.text_input("Audio Prompt:", value="Summarize this audio transcription in Markdown.")
528
  audio_input = st.file_uploader("Upload an audio file (max 200MB)", type=["mp3", "wav", "flac", "m4a"], accept_multiple_files=False)
529
  audio_bytes = audio_recorder()
530
+ if audio_bytes and text_input and st.button("Submit Audio Recording"):
531
  with open("recorded_audio.wav", "wb") as f:
532
  f.write(audio_bytes)
533
  with st.spinner("Processing..."):
534
  process_audio(audio_bytes, text_input)
535
  st.rerun()
536
+ elif audio_input and text_input and st.button("Submit Audio File"):
537
  with st.spinner("Processing..."):
538
  process_audio(audio_input, text_input)
539
  st.rerun()
 
541
  elif option == "Video":
542
  text_input = st.text_input("Video Prompt:", value="Summarize this video and its transcription in Markdown.")
543
  video_input = st.file_uploader("Upload a video file (max 200MB)", type=["mp4"], accept_multiple_files=False)
544
+ if video_input and text_input and st.button("Submit Video"):
545
  if video_input.size > 200 * 1024 * 1024:
546
  st.error("Video exceeds 200MB limit.")
547
  else:
 
551
 
552
  elif option == "ArXiv Search":
553
  query = st.text_input("AI Search ArXiv Scholarly Articles:")
554
+ if query and st.button("Search ArXiv"):
555
  with st.spinner("Searching ArXiv..."):
556
  result = search_arxiv(query)
557
  st.markdown(result)