Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -49,43 +49,32 @@ if "vector_store" not in st.session_state:
|
|
49 |
st.session_state.vector_store = None
|
50 |
|
51 |
# ----------------- Improved Metadata Extraction -----------------
|
52 |
-
def
|
53 |
-
"""Extracts
|
54 |
with pdfplumber.open(pdf_path) as pdf:
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
title
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
author = author if author else "Unknown Author"
|
79 |
-
|
80 |
-
# Extract emails
|
81 |
-
emails = set(re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_page_text))
|
82 |
-
email_str = ", ".join(emails) if emails else "No emails found"
|
83 |
-
|
84 |
-
# Extract affiliations
|
85 |
-
affiliations = set(re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", first_page_text))
|
86 |
-
affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"
|
87 |
-
|
88 |
-
return title, author, email_str, affiliation_str
|
89 |
|
90 |
# ----------------- Step 1: Choose PDF Source -----------------
|
91 |
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
|
|
|
49 |
st.session_state.vector_store = None
|
50 |
|
51 |
# ----------------- Improved Metadata Extraction -----------------
|
52 |
+
def extract_metadata_llm(pdf_path):
    """Extract paper metadata from the first page of a PDF using the LLM.

    The first page's text is read with pdfplumber and inserted into a prompt
    that asks for the title, authors, email addresses and affiliations. The
    prompt is run through an ``LLMChain`` backed by the module-level
    ``llm_judge`` model.

    Args:
        pdf_path: Path (or file-like object) handed to ``pdfplumber.open``.

    Returns:
        The value stored under the chain's ``"metadata"`` output key --
        presumably the raw text answer of the model; confirm against the
        caller before parsing it.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Guard against an empty document before indexing the first page.
        page_text = pdf.pages[0].extract_text() if pdf.pages else None

    # extract_text() can come back with no text (None or an empty string),
    # e.g. for image-only/scanned pages; use the same placeholder as for a
    # PDF without pages so the prompt never receives "None" or a blank body.
    first_page_text = page_text or "No text found."

    # LLM prompt for extracting metadata
    metadata_prompt = PromptTemplate(
        input_variables=["text"],
        template="""
    Given the following first page of a research paper, extract:
    - The title of the paper
    - The authors' names
    - Any email addresses present
    - The affiliations of the authors

    Ensure accurate extraction.

    First page content:
    {text}
    """
    )

    # The PDF is already closed here; the LLM call only needs the text.
    metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
    metadata_response = metadata_chain.invoke({"text": first_page_text})

    return metadata_response["metadata"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
# ----------------- Step 1: Choose PDF Source -----------------
|
80 |
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
|