Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import streamlit as st
|
2 |
import os
|
|
|
3 |
import requests
|
4 |
import pdfplumber
|
5 |
import chromadb
|
@@ -48,7 +49,7 @@ if "processed_chunks" not in st.session_state:
|
|
48 |
if "vector_store" not in st.session_state:
|
49 |
st.session_state.vector_store = None
|
50 |
|
51 |
-
# -----------------
|
52 |
def extract_metadata_llm(pdf_path):
|
53 |
"""Extracts metadata using LLM instead of regex."""
|
54 |
with pdfplumber.open(pdf_path) as pdf:
|
@@ -58,12 +59,13 @@ def extract_metadata_llm(pdf_path):
|
|
58 |
metadata_prompt = PromptTemplate(
|
59 |
input_variables=["text"],
|
60 |
template="""
|
61 |
-
Given the following first page of a research paper, extract:
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
67 |
Ensure accurate extraction.
|
68 |
|
69 |
First page content:
|
@@ -74,7 +76,19 @@ def extract_metadata_llm(pdf_path):
|
|
74 |
metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
|
75 |
metadata_response = metadata_chain.invoke({"text": first_page_text})
|
76 |
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
# ----------------- Step 1: Choose PDF Source -----------------
|
80 |
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
|
|
|
1 |
import streamlit as st
|
2 |
import os
|
3 |
+
import json
|
4 |
import requests
|
5 |
import pdfplumber
|
6 |
import chromadb
|
|
|
49 |
if "vector_store" not in st.session_state:
|
50 |
st.session_state.vector_store = None
|
51 |
|
52 |
+
# ----------------- Metadata Extraction -----------------
|
53 |
def extract_metadata_llm(pdf_path):
|
54 |
"""Extracts metadata using LLM instead of regex."""
|
55 |
with pdfplumber.open(pdf_path) as pdf:
|
|
|
59 |
metadata_prompt = PromptTemplate(
|
60 |
input_variables=["text"],
|
61 |
template="""
|
62 |
+
Given the following first page of a research paper, extract metadata in JSON format with these fields:
|
63 |
+
{
|
64 |
+
"Title": "Paper Title",
|
65 |
+
"Author": "Author Name(s)",
|
66 |
+
"Emails": "List of Emails",
|
67 |
+
"Affiliations": "Author Affiliation(s)"
|
68 |
+
}
|
69 |
Ensure accurate extraction.
|
70 |
|
71 |
First page content:
|
|
|
76 |
metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
|
77 |
metadata_response = metadata_chain.invoke({"text": first_page_text})
|
78 |
|
79 |
+
try:
|
80 |
+
# Parse the LLM output as JSON; if it is not valid JSON, fall back to placeholder metadata
|
81 |
+
metadata_dict = json.loads(metadata_response["metadata"])
|
82 |
+
except json.JSONDecodeError:
|
83 |
+
metadata_dict = {
|
84 |
+
"Title": "Unknown",
|
85 |
+
"Author": "Unknown",
|
86 |
+
"Emails": "No emails found",
|
87 |
+
"Affiliations": "No affiliations found"
|
88 |
+
}
|
89 |
+
|
90 |
+
return metadata_dict
|
91 |
+
|
92 |
|
93 |
# ----------------- Step 1: Choose PDF Source -----------------
|
94 |
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
|