Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -54,30 +54,16 @@ def extract_metadata_llm(pdf_path):
|
|
54 |
with pdfplumber.open(pdf_path) as pdf:
|
55 |
first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
|
56 |
|
57 |
-
# LLM
|
58 |
-
metadata_prompt = PromptTemplate(
|
59 |
-
input_variables=["text"],
|
60 |
-
template="""
|
61 |
-
Given the following first page of a research paper, extract metadata in JSON format with these fields:
|
62 |
-
{
|
63 |
-
"Title": "Paper Title",
|
64 |
-
"Author": "Author Name(s)",
|
65 |
-
"Emails": "List of Emails",
|
66 |
-
"Affiliations": "Author Affiliation(s)"
|
67 |
-
}
|
68 |
-
Ensure accurate extraction.
|
69 |
-
|
70 |
-
First page content:
|
71 |
-
{text}
|
72 |
-
"""
|
73 |
-
)
|
74 |
-
|
75 |
metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
|
76 |
metadata_response = metadata_chain.invoke({"text": first_page_text})
|
77 |
|
|
|
|
|
|
|
|
|
78 |
try:
|
79 |
-
|
80 |
-
metadata_dict = json.loads(metadata_response["metadata"])
|
81 |
except json.JSONDecodeError:
|
82 |
metadata_dict = {
|
83 |
"Title": "Unknown",
|
@@ -89,6 +75,7 @@ def extract_metadata_llm(pdf_path):
|
|
89 |
return metadata_dict
|
90 |
|
91 |
|
|
|
92 |
# ----------------- Step 1: Choose PDF Source -----------------
|
93 |
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
|
94 |
|
|
|
54 |
with pdfplumber.open(pdf_path) as pdf:
|
55 |
first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
|
56 |
|
57 |
+
# Run LLM Metadata Extraction
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
|
59 |
metadata_response = metadata_chain.invoke({"text": first_page_text})
|
60 |
|
61 |
+
# Ensure LLM Output is a Proper JSON String
|
62 |
+
json_match = re.search(r"```json\n(.*?)\n```", metadata_response["metadata"], re.DOTALL)
|
63 |
+
json_text = json_match.group(1) if json_match else metadata_response["metadata"]
|
64 |
+
|
65 |
try:
|
66 |
+
metadata_dict = json.loads(json_text)
|
|
|
67 |
except json.JSONDecodeError:
|
68 |
metadata_dict = {
|
69 |
"Title": "Unknown",
|
|
|
75 |
return metadata_dict
|
76 |
|
77 |
|
78 |
+
|
79 |
# ----------------- Step 1: Choose PDF Source -----------------
|
80 |
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
|
81 |
|