Update back.branched.PDFAddedRAG.03282025.app.py
Browse files
back.branched.PDFAddedRAG.03282025.app.py
CHANGED
@@ -101,9 +101,10 @@ def generate_filename(prompt, file_type, original_name=None):
|
|
101 |
safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
|
102 |
if original_name and file_type == "md": # For images
|
103 |
base_name = os.path.splitext(original_name)[0]
|
104 |
-
safe_prompt = re.sub(r'[<>:"/\\|?*\n]', ' ', prompt).strip()[:
|
105 |
-
|
106 |
-
|
|
|
107 |
return f"{safe_date_time}_{safe_prompt}.{file_type}"
|
108 |
|
109 |
def create_and_save_file(content, file_type="md", prompt=None, original_name=None, should_save=True):
|
@@ -308,7 +309,7 @@ def generate_questions(pdf_path):
|
|
308 |
pdf = PdfReader(f)
|
309 |
for page in pdf.pages:
|
310 |
text += page.extract_text() or ""
|
311 |
-
prompt = f"
|
312 |
response = client.chat.completions.create(
|
313 |
model="gpt-4o-2024-05-13",
|
314 |
messages=[{"role": "user", "content": prompt}]
|
@@ -320,7 +321,12 @@ def process_rag_query(query, vector_store_id):
|
|
320 |
response = client.chat.completions.create(
|
321 |
model="gpt-4o-2024-05-13",
|
322 |
messages=[{"role": "user", "content": query}],
|
323 |
-
tools=[{
|
|
|
|
|
|
|
|
|
|
|
324 |
tool_choice="auto"
|
325 |
)
|
326 |
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else []
|
@@ -331,26 +337,28 @@ def process_rag_query(query, vector_store_id):
|
|
331 |
|
332 |
def evaluate_rag(vector_store_id, questions_dict):
|
333 |
k = 5
|
334 |
-
total_queries = len(questions_dict)
|
335 |
correct_retrievals_at_k = 0
|
336 |
reciprocal_ranks = []
|
337 |
average_precisions = []
|
338 |
|
339 |
-
for filename,
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
|
|
|
|
354 |
|
355 |
recall_at_k = correct_retrievals_at_k / total_queries if total_queries else 0
|
356 |
mrr = sum(reciprocal_ranks) / total_queries if total_queries else 0
|
@@ -367,12 +375,26 @@ def rag_pdf_gallery():
|
|
367 |
stats = upload_pdf_files_to_vector_store(vector_store_details["id"], pdf_paths)
|
368 |
st.json(stats)
|
369 |
|
370 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
371 |
questions_dict = {os.path.basename(p): generate_questions(p) for p in pdf_paths}
|
372 |
-
st.
|
|
|
|
|
|
|
373 |
|
374 |
-
query = st.text_input("Ask a question about the PDFs:")
|
375 |
-
if query:
|
376 |
with st.spinner("Processing RAG query..."):
|
377 |
response, tool_calls = process_rag_query(query, vector_store_details["id"])
|
378 |
if response:
|
@@ -381,6 +403,7 @@ def rag_pdf_gallery():
|
|
381 |
for call in tool_calls:
|
382 |
if "file_search" in call.type:
|
383 |
st.json(call.arguments)
|
|
|
384 |
|
385 |
if st.button("Evaluate RAG Performance"):
|
386 |
with st.spinner("Evaluating..."):
|
@@ -465,7 +488,7 @@ def main():
|
|
465 |
option = st.selectbox("Select Input Type", ("Text", "Image", "Audio", "Video", "ArXiv Search", "RAG PDF Gallery"))
|
466 |
|
467 |
if option == "Text":
|
468 |
-
default_text = "emojis in markdown. Maybe a buckeyball feature rating comparing them against each other in markdown emoji outline or tables."
|
469 |
col1, col2 = st.columns([1, 5])
|
470 |
with col1:
|
471 |
if st.button("๐ MD", key="md_button"):
|
@@ -475,9 +498,10 @@ def main():
|
|
475 |
st.rerun()
|
476 |
with col2:
|
477 |
text_input = st.text_input("Enter your text:", value=st.session_state.get("text_input", ""), key="text_input_field")
|
478 |
-
if text_input and
|
479 |
with st.spinner("Processing..."):
|
480 |
process_text(text_input)
|
|
|
481 |
|
482 |
elif option == "Image":
|
483 |
col1, col2 = st.columns(2)
|
@@ -489,7 +513,7 @@ def main():
|
|
489 |
st.session_state["image_prompt"] = "Show electronic text of text in the image."
|
490 |
text_input = st.text_input("Image Prompt:", value=st.session_state.get("image_prompt", "Describe this image and list ten facts in a markdown outline with emojis."))
|
491 |
image_input = st.file_uploader("Upload an image (max 200MB)", type=["png", "jpg", "jpeg"], accept_multiple_files=False)
|
492 |
-
if image_input and text_input:
|
493 |
if image_input.size > 200 * 1024 * 1024:
|
494 |
st.error("Image exceeds 200MB limit.")
|
495 |
else:
|
@@ -503,13 +527,13 @@ def main():
|
|
503 |
text_input = st.text_input("Audio Prompt:", value="Summarize this audio transcription in Markdown.")
|
504 |
audio_input = st.file_uploader("Upload an audio file (max 200MB)", type=["mp3", "wav", "flac", "m4a"], accept_multiple_files=False)
|
505 |
audio_bytes = audio_recorder()
|
506 |
-
if audio_bytes:
|
507 |
with open("recorded_audio.wav", "wb") as f:
|
508 |
f.write(audio_bytes)
|
509 |
with st.spinner("Processing..."):
|
510 |
process_audio(audio_bytes, text_input)
|
511 |
st.rerun()
|
512 |
-
elif audio_input and text_input:
|
513 |
with st.spinner("Processing..."):
|
514 |
process_audio(audio_input, text_input)
|
515 |
st.rerun()
|
@@ -517,7 +541,7 @@ def main():
|
|
517 |
elif option == "Video":
|
518 |
text_input = st.text_input("Video Prompt:", value="Summarize this video and its transcription in Markdown.")
|
519 |
video_input = st.file_uploader("Upload a video file (max 200MB)", type=["mp4"], accept_multiple_files=False)
|
520 |
-
if video_input and text_input:
|
521 |
if video_input.size > 200 * 1024 * 1024:
|
522 |
st.error("Video exceeds 200MB limit.")
|
523 |
else:
|
@@ -527,7 +551,7 @@ def main():
|
|
527 |
|
528 |
elif option == "ArXiv Search":
|
529 |
query = st.text_input("AI Search ArXiv Scholarly Articles:")
|
530 |
-
if query:
|
531 |
with st.spinner("Searching ArXiv..."):
|
532 |
result = search_arxiv(query)
|
533 |
st.markdown(result)
|
|
|
101 |
safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
|
102 |
if original_name and file_type == "md": # For images
|
103 |
base_name = os.path.splitext(original_name)[0]
|
104 |
+
safe_prompt = re.sub(r'[<>:"/\\|?*\n]', ' ', prompt).strip()[:50]
|
105 |
+
file_stem = f"{safe_date_time}_{safe_prompt}_{base_name}"[:100] # Cap at 100 chars
|
106 |
+
return f"{file_stem}.{file_type}"
|
107 |
+
safe_prompt = re.sub(r'[<>:"/\\|?*\n]', ' ', prompt).strip()[:100] # Cap at 100 chars
|
108 |
return f"{safe_date_time}_{safe_prompt}.{file_type}"
|
109 |
|
110 |
def create_and_save_file(content, file_type="md", prompt=None, original_name=None, should_save=True):
|
|
|
309 |
pdf = PdfReader(f)
|
310 |
for page in pdf.pages:
|
311 |
text += page.extract_text() or ""
|
312 |
+
prompt = f"Generate a 10-question quiz with answers based only on this document. Format as markdown with numbered questions and answers:\n{text[:2000]}\n\n"
|
313 |
response = client.chat.completions.create(
|
314 |
model="gpt-4o-2024-05-13",
|
315 |
messages=[{"role": "user", "content": prompt}]
|
|
|
321 |
response = client.chat.completions.create(
|
322 |
model="gpt-4o-2024-05-13",
|
323 |
messages=[{"role": "user", "content": query}],
|
324 |
+
tools=[{
|
325 |
+
"type": "file_search",
|
326 |
+
"file_search": {
|
327 |
+
"vector_store_ids": [vector_store_id]
|
328 |
+
}
|
329 |
+
}],
|
330 |
tool_choice="auto"
|
331 |
)
|
332 |
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else []
|
|
|
337 |
|
338 |
def evaluate_rag(vector_store_id, questions_dict):
|
339 |
k = 5
|
340 |
+
total_queries = len(questions_dict) * 10 # 10 questions per PDF
|
341 |
correct_retrievals_at_k = 0
|
342 |
reciprocal_ranks = []
|
343 |
average_precisions = []
|
344 |
|
345 |
+
for filename, quiz in questions_dict.items():
|
346 |
+
questions = re.findall(r"\d+\.\s(.*?)\n\s*Answer:\s(.*?)\n", quiz, re.DOTALL)
|
347 |
+
for question, _ in questions:
|
348 |
+
expected_file = filename
|
349 |
+
response, tool_calls = process_rag_query(question, vector_store_id)
|
350 |
+
if not tool_calls:
|
351 |
+
continue
|
352 |
+
retrieved_files = [call.arguments.get("file_id", "") for call in tool_calls if "file_search" in call.type][:k]
|
353 |
+
if expected_file in retrieved_files:
|
354 |
+
rank = retrieved_files.index(expected_file) + 1
|
355 |
+
correct_retrievals_at_k += 1
|
356 |
+
reciprocal_ranks.append(1 / rank)
|
357 |
+
precisions = [1 if f == expected_file else 0 for f in retrieved_files[:rank]]
|
358 |
+
average_precisions.append(sum(precisions) / len(precisions))
|
359 |
+
else:
|
360 |
+
reciprocal_ranks.append(0)
|
361 |
+
average_precisions.append(0)
|
362 |
|
363 |
recall_at_k = correct_retrievals_at_k / total_queries if total_queries else 0
|
364 |
mrr = sum(reciprocal_ranks) / total_queries if total_queries else 0
|
|
|
375 |
stats = upload_pdf_files_to_vector_store(vector_store_details["id"], pdf_paths)
|
376 |
st.json(stats)
|
377 |
|
378 |
+
col1, col2, col3 = st.columns(3)
|
379 |
+
with col1:
|
380 |
+
if st.button("๐ Quiz"):
|
381 |
+
st.session_state["rag_prompt"] = "Generate a 10-question quiz with answers based only on this document."
|
382 |
+
with col2:
|
383 |
+
if st.button("๐ Summary"):
|
384 |
+
st.session_state["rag_prompt"] = "Summarize this per page and output as markdown outline with emojis and numbered outline with multiple levels summarizing everything unique per page in method steps or fact steps."
|
385 |
+
with col3:
|
386 |
+
if st.button("๐ Key Facts"):
|
387 |
+
st.session_state["rag_prompt"] = "Extract 10 key facts from this document in markdown with emojis."
|
388 |
+
|
389 |
+
with st.spinner("Generating questions..."):
|
390 |
questions_dict = {os.path.basename(p): generate_questions(p) for p in pdf_paths}
|
391 |
+
st.markdown("### Generated Quiz")
|
392 |
+
for filename, quiz in questions_dict.items():
|
393 |
+
st.markdown(f"#### {filename}")
|
394 |
+
st.markdown(quiz)
|
395 |
|
396 |
+
query = st.text_input("Ask a question about the PDFs:", value=st.session_state.get("rag_prompt", ""))
|
397 |
+
if query and st.button("Submit RAG Query"):
|
398 |
with st.spinner("Processing RAG query..."):
|
399 |
response, tool_calls = process_rag_query(query, vector_store_details["id"])
|
400 |
if response:
|
|
|
403 |
for call in tool_calls:
|
404 |
if "file_search" in call.type:
|
405 |
st.json(call.arguments)
|
406 |
+
st.rerun()
|
407 |
|
408 |
if st.button("Evaluate RAG Performance"):
|
409 |
with st.spinner("Evaluating..."):
|
|
|
488 |
option = st.selectbox("Select Input Type", ("Text", "Image", "Audio", "Video", "ArXiv Search", "RAG PDF Gallery"))
|
489 |
|
490 |
if option == "Text":
|
491 |
+
default_text = "Create a summary of PDF py libraries and usage in py with emojis in markdown. Maybe a buckeyball feature rating comparing them against each other in markdown emoji outline or tables."
|
492 |
col1, col2 = st.columns([1, 5])
|
493 |
with col1:
|
494 |
if st.button("๐ MD", key="md_button"):
|
|
|
498 |
st.rerun()
|
499 |
with col2:
|
500 |
text_input = st.text_input("Enter your text:", value=st.session_state.get("text_input", ""), key="text_input_field")
|
501 |
+
if text_input and st.button("Submit Text"):
|
502 |
with st.spinner("Processing..."):
|
503 |
process_text(text_input)
|
504 |
+
st.rerun()
|
505 |
|
506 |
elif option == "Image":
|
507 |
col1, col2 = st.columns(2)
|
|
|
513 |
st.session_state["image_prompt"] = "Show electronic text of text in the image."
|
514 |
text_input = st.text_input("Image Prompt:", value=st.session_state.get("image_prompt", "Describe this image and list ten facts in a markdown outline with emojis."))
|
515 |
image_input = st.file_uploader("Upload an image (max 200MB)", type=["png", "jpg", "jpeg"], accept_multiple_files=False)
|
516 |
+
if image_input and text_input and st.button("Submit Image"):
|
517 |
if image_input.size > 200 * 1024 * 1024:
|
518 |
st.error("Image exceeds 200MB limit.")
|
519 |
else:
|
|
|
527 |
text_input = st.text_input("Audio Prompt:", value="Summarize this audio transcription in Markdown.")
|
528 |
audio_input = st.file_uploader("Upload an audio file (max 200MB)", type=["mp3", "wav", "flac", "m4a"], accept_multiple_files=False)
|
529 |
audio_bytes = audio_recorder()
|
530 |
+
if audio_bytes and text_input and st.button("Submit Audio Recording"):
|
531 |
with open("recorded_audio.wav", "wb") as f:
|
532 |
f.write(audio_bytes)
|
533 |
with st.spinner("Processing..."):
|
534 |
process_audio(audio_bytes, text_input)
|
535 |
st.rerun()
|
536 |
+
elif audio_input and text_input and st.button("Submit Audio File"):
|
537 |
with st.spinner("Processing..."):
|
538 |
process_audio(audio_input, text_input)
|
539 |
st.rerun()
|
|
|
541 |
elif option == "Video":
|
542 |
text_input = st.text_input("Video Prompt:", value="Summarize this video and its transcription in Markdown.")
|
543 |
video_input = st.file_uploader("Upload a video file (max 200MB)", type=["mp4"], accept_multiple_files=False)
|
544 |
+
if video_input and text_input and st.button("Submit Video"):
|
545 |
if video_input.size > 200 * 1024 * 1024:
|
546 |
st.error("Video exceeds 200MB limit.")
|
547 |
else:
|
|
|
551 |
|
552 |
elif option == "ArXiv Search":
|
553 |
query = st.text_input("AI Search ArXiv Scholarly Articles:")
|
554 |
+
if query and st.button("Search ArXiv"):
|
555 |
with st.spinner("Searching ArXiv..."):
|
556 |
result = search_arxiv(query)
|
557 |
st.markdown(result)
|