Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -48,34 +48,68 @@ if "vector_store" not in st.session_state:
|
|
48 |
st.session_state.vector_store = None
|
49 |
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
# ----------------- Metadata Extraction -----------------
|
52 |
def extract_metadata_llm(pdf_path):
|
53 |
-
"""Extracts metadata using LLM
|
54 |
|
55 |
with pdfplumber.open(pdf_path) as pdf:
|
56 |
first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."
|
57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
# Streamlit Debugging: Show extracted text
|
59 |
-
st.subheader("π Extracted First Page Text
|
60 |
-
st.text_area("
|
61 |
|
62 |
# Define metadata prompt
|
63 |
metadata_prompt = PromptTemplate(
|
64 |
-
input_variables=["text"],
|
65 |
template="""
|
66 |
-
Given the
|
67 |
-
-
|
68 |
-
-
|
69 |
-
|
|
|
|
|
|
|
70 |
Example output:
|
71 |
-
{
|
72 |
"Title": "Example Paper Title",
|
73 |
"Author": "John Doe, Jane Smith",
|
74 |
"Emails": "[email protected], [email protected]",
|
75 |
"Affiliations": "School of AI, University of Example"
|
76 |
-
}
|
77 |
-
|
78 |
-
Now, extract
|
79 |
{text}
|
80 |
"""
|
81 |
)
|
@@ -85,10 +119,10 @@ def extract_metadata_llm(pdf_path):
|
|
85 |
|
86 |
# Debugging: Log the LLM input
|
87 |
st.subheader("π LLM Input for Metadata Extraction")
|
88 |
-
st.json({"text":
|
89 |
|
90 |
try:
|
91 |
-
metadata_response = metadata_chain.invoke({"text":
|
92 |
|
93 |
# Debugging: Log raw LLM response
|
94 |
st.subheader("π Raw LLM Response")
|
@@ -103,7 +137,7 @@ def extract_metadata_llm(pdf_path):
|
|
103 |
metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))
|
104 |
except json.JSONDecodeError:
|
105 |
metadata_dict = {
|
106 |
-
"Title":
|
107 |
"Author": "Unknown",
|
108 |
"Emails": "No emails found",
|
109 |
"Affiliations": "No affiliations found"
|
@@ -112,7 +146,7 @@ def extract_metadata_llm(pdf_path):
|
|
112 |
except Exception as e:
|
113 |
st.error(f"β LLM Metadata Extraction Failed: {e}")
|
114 |
metadata_dict = {
|
115 |
-
"Title":
|
116 |
"Author": "Unknown",
|
117 |
"Emails": "No emails found",
|
118 |
"Affiliations": "No affiliations found"
|
@@ -128,6 +162,8 @@ def extract_metadata_llm(pdf_path):
|
|
128 |
st.json(metadata_dict)
|
129 |
|
130 |
return metadata_dict
|
|
|
|
|
131 |
|
132 |
|
133 |
# ----------------- Step 1: Choose PDF Source -----------------
|
|
|
48 |
st.session_state.vector_store = None
|
49 |
|
50 |
|
51 |
+
# ----------------- Text Cleaning Functions -----------------
|
52 |
+
def clean_extracted_text(text):
    """
    Clean text extracted from a PDF page while keeping its line structure.

    Steps, in order:
      1. Collapse every run of line breaks (plus any whitespace touching
         them) into a single newline.
      2. Re-join words that were hyphenated across a line break
         ("exam-" + newline + "ple" -> "example").
      3. Collapse runs of two or more horizontal whitespace characters
         into a single space.

    Args:
        text: Raw extracted text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text with leading/trailing whitespace removed, or an
        empty string when there is nothing to clean.
    """
    if not text:
        return ""

    # `[^\S\n]` means "whitespace other than newline". Trimming around the
    # line break first keeps one line per "\n", so callers that split on
    # newlines still see separate lines; a plain `\s{2,}` would swallow
    # the break and merge neighbouring lines.
    text = re.sub(r'[^\S\n]*\n\s*', '\n', text)

    # Must run after the trim above so "word- \nrest" is also repaired.
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)

    # Only horizontal whitespace is collapsed here; newlines are preserved.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)

    return text.strip()
|
60 |
+
|
61 |
+
def extract_title_manually(text):
    """
    Guess the paper title from the first few lines of the text.

    A line qualifies as the title when it:
      - is one of the first 5 lines,
      - contains more than 5 whitespace-separated words, and
      - does not contain a section-header keyword ("abstract",
        "introduction", "keywords", "contents", "table", "figure")
        as a whole word, in singular or simple plural form.

    Keywords are matched against whole words rather than substrings, so a
    title containing "Stable" or "Suitable" is not rejected merely because
    it contains the letters "table".

    Args:
        text: Newline-separated text. ``None`` or an empty string is accepted.

    Returns:
        The first qualifying line (stripped), or ``"Unknown"`` if none is found.
    """
    if not text:
        return "Unknown"

    ignore_keywords = frozenset(
        ("abstract", "introduction", "keywords", "contents", "table", "figure")
    )

    for line in text.split("\n")[:5]:  # Only the first 5 lines are considered
        candidate = line.strip()
        if len(candidate.split()) <= 5:
            continue

        # Replace punctuation with spaces so "Abstract:" or "Figure-1"
        # still yield the bare keyword as its own token.
        tokens = "".join(
            ch if ch.isalnum() else " " for ch in candidate.lower()
        ).split()

        is_header = any(
            tok in ignore_keywords
            or (tok.endswith("s") and tok[:-1] in ignore_keywords)
            for tok in tokens
        )
        if not is_header:
            return candidate  # First valid title wins

    return "Unknown"
|
75 |
+
|
76 |
# ----------------- Metadata Extraction -----------------
|
77 |
def extract_metadata_llm(pdf_path):
|
78 |
+
"""Extracts metadata using LLM with improved title detection and JSON handling."""
|
79 |
|
80 |
with pdfplumber.open(pdf_path) as pdf:
|
81 |
first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."
|
82 |
|
83 |
+
# Apply text cleaning
|
84 |
+
cleaned_text = clean_extracted_text(first_page_text)
|
85 |
+
|
86 |
+
# Attempt manual title extraction before LLM
|
87 |
+
pre_extracted_title = extract_title_manually(cleaned_text)
|
88 |
+
|
89 |
# Streamlit Debugging: Show extracted text
|
90 |
+
st.subheader("π Extracted First Page Text (Cleaned)")
|
91 |
+
st.text_area("Cleaned Text:", cleaned_text, height=200)
|
92 |
|
93 |
# Define metadata prompt
|
94 |
metadata_prompt = PromptTemplate(
|
95 |
+
input_variables=["text", "pre_title"],
|
96 |
template="""
|
97 |
+
Given the first page of a research paper, extract metadata **strictly in JSON format**.
|
98 |
+
- The title is typically in the first few lines and is often in a larger font or bold.
|
99 |
+
- If a phrase like "Short Paper:" appears, the actual title follows.
|
100 |
+
- If no clear title is found, use the pre-extracted title: "{pre_title}".
|
101 |
+
- If a field is missing, return `"Unknown"`.
|
102 |
+
- Ensure the JSON format is **valid**.
|
103 |
+
|
104 |
Example output:
|
105 |
+
{{
|
106 |
"Title": "Example Paper Title",
|
107 |
"Author": "John Doe, Jane Smith",
|
108 |
"Emails": "[email protected], [email protected]",
|
109 |
"Affiliations": "School of AI, University of Example"
|
110 |
+
}}
|
111 |
+
|
112 |
+
Now, extract metadata from this document:
|
113 |
{text}
|
114 |
"""
|
115 |
)
|
|
|
119 |
|
120 |
# Debugging: Log the LLM input
|
121 |
st.subheader("π LLM Input for Metadata Extraction")
|
122 |
+
st.json({"text": cleaned_text, "pre_title": pre_extracted_title})
|
123 |
|
124 |
try:
|
125 |
+
metadata_response = metadata_chain.invoke({"text": cleaned_text, "pre_title": pre_extracted_title})
|
126 |
|
127 |
# Debugging: Log raw LLM response
|
128 |
st.subheader("π Raw LLM Response")
|
|
|
137 |
metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))
|
138 |
except json.JSONDecodeError:
|
139 |
metadata_dict = {
|
140 |
+
"Title": pre_extracted_title, # Use pre-extracted title as fallback
|
141 |
"Author": "Unknown",
|
142 |
"Emails": "No emails found",
|
143 |
"Affiliations": "No affiliations found"
|
|
|
146 |
except Exception as e:
|
147 |
st.error(f"β LLM Metadata Extraction Failed: {e}")
|
148 |
metadata_dict = {
|
149 |
+
"Title": pre_extracted_title, # Use pre-extracted title
|
150 |
"Author": "Unknown",
|
151 |
"Emails": "No emails found",
|
152 |
"Affiliations": "No affiliations found"
|
|
|
162 |
st.json(metadata_dict)
|
163 |
|
164 |
return metadata_dict
|
165 |
+
|
166 |
+
|
167 |
|
168 |
|
169 |
# ----------------- Step 1: Choose PDF Source -----------------
|