Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import os
|
|
|
|
| 3 |
import requests
|
| 4 |
import pdfplumber
|
| 5 |
import chromadb
|
|
@@ -48,7 +49,7 @@ if "processed_chunks" not in st.session_state:
|
|
| 48 |
if "vector_store" not in st.session_state:
|
| 49 |
st.session_state.vector_store = None
|
| 50 |
|
| 51 |
-
# -----------------
|
| 52 |
def extract_metadata_llm(pdf_path):
|
| 53 |
"""Extracts metadata using LLM instead of regex."""
|
| 54 |
with pdfplumber.open(pdf_path) as pdf:
|
|
@@ -58,12 +59,13 @@ def extract_metadata_llm(pdf_path):
|
|
| 58 |
metadata_prompt = PromptTemplate(
|
| 59 |
input_variables=["text"],
|
| 60 |
template="""
|
| 61 |
-
Given the following first page of a research paper, extract:
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
| 67 |
Ensure accurate extraction.
|
| 68 |
|
| 69 |
First page content:
|
|
@@ -74,7 +76,19 @@ def extract_metadata_llm(pdf_path):
|
|
| 74 |
metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
|
| 75 |
metadata_response = metadata_chain.invoke({"text": first_page_text})
|
| 76 |
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
# ----------------- Step 1: Choose PDF Source -----------------
|
| 80 |
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import os
|
| 3 |
+
import json
|
| 4 |
import requests
|
| 5 |
import pdfplumber
|
| 6 |
import chromadb
|
|
|
|
| 49 |
if "vector_store" not in st.session_state:
|
| 50 |
st.session_state.vector_store = None
|
| 51 |
|
| 52 |
+
# ----------------- Metadata Extraction -----------------
|
| 53 |
def extract_metadata_llm(pdf_path):
|
| 54 |
"""Extracts metadata using LLM instead of regex."""
|
| 55 |
with pdfplumber.open(pdf_path) as pdf:
|
|
|
|
| 59 |
metadata_prompt = PromptTemplate(
|
| 60 |
input_variables=["text"],
|
| 61 |
template="""
|
| 62 |
+
Given the following first page of a research paper, extract metadata in JSON format with these fields:
|
| 63 |
+
{
|
| 64 |
+
"Title": "Paper Title",
|
| 65 |
+
"Author": "Author Name(s)",
|
| 66 |
+
"Emails": "List of Emails",
|
| 67 |
+
"Affiliations": "Author Affiliation(s)"
|
| 68 |
+
}
|
| 69 |
Ensure accurate extraction.
|
| 70 |
|
| 71 |
First page content:
|
|
|
|
| 76 |
metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
|
| 77 |
metadata_response = metadata_chain.invoke({"text": first_page_text})
|
| 78 |
|
| 79 |
+
try:
|
| 80 |
+
# Ensure response is a valid JSON string and convert it to a dictionary
|
| 81 |
+
metadata_dict = json.loads(metadata_response["metadata"])
|
| 82 |
+
except json.JSONDecodeError:
|
| 83 |
+
metadata_dict = {
|
| 84 |
+
"Title": "Unknown",
|
| 85 |
+
"Author": "Unknown",
|
| 86 |
+
"Emails": "No emails found",
|
| 87 |
+
"Affiliations": "No affiliations found"
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
return metadata_dict
|
| 91 |
+
|
| 92 |
|
| 93 |
# ----------------- Step 1: Choose PDF Source -----------------
|
| 94 |
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
|