DrishtiSharma committed on
Commit
22c44a9
·
verified ·
1 Parent(s): 456dd99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -86
app.py CHANGED
@@ -74,94 +74,53 @@ def extract_title_manually(text):
74
  return "Unknown"
75
 
76
  # ----------------- Metadata Extraction -----------------
77
- def extract_metadata_llm(pdf_path):
78
- """Extracts metadata using LLM with improved title detection and JSON handling."""
 
79
 
80
  with pdfplumber.open(pdf_path) as pdf:
81
- first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."
82
-
83
- # Apply text cleaning
84
- cleaned_text = clean_extracted_text(first_page_text)
85
-
86
- # Attempt manual title extraction before LLM
87
- pre_extracted_title = extract_title_manually(cleaned_text)
88
-
89
- # Streamlit Debugging: Show extracted text
90
- st.subheader("📄 Extracted First Page Text (Cleaned)")
91
- st.text_area("Cleaned Text:", cleaned_text, height=200)
92
-
93
- # Define metadata prompt
94
- metadata_prompt = PromptTemplate(
95
- input_variables=["text", "pre_title"],
96
- template="""
97
- Given the first page of a research paper, extract metadata **strictly in JSON format**.
98
- - The title is typically in the first few lines and is often in a larger font or bold.
99
- - If a phrase like "Short Paper:" appears, the actual title follows.
100
- - If no clear title is found, use the pre-extracted title: "{pre_title}".
101
- - If a field is missing, return `"Unknown"`.
102
- - Ensure the JSON format is **valid**.
103
-
104
- Example output:
105
- {{
106
- "Title": "Example Paper Title",
107
- "Author": "John Doe, Jane Smith",
108
109
- "Affiliations": "School of AI, University of Example"
110
- }}
111
-
112
- Now, extract metadata from this document:
113
- {text}
114
- """
115
- )
116
-
117
- # Run LLM Metadata Extraction
118
- metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
119
-
120
- # Debugging: Log the LLM input
121
- st.subheader("🔍 LLM Input for Metadata Extraction")
122
- st.json({"text": cleaned_text, "pre_title": pre_extracted_title})
123
-
124
- try:
125
- metadata_response = metadata_chain.invoke({"text": cleaned_text, "pre_title": pre_extracted_title})
126
-
127
- # Debugging: Log raw LLM response
128
- st.subheader("🔍 Raw LLM Response")
129
- st.json(metadata_response)
130
-
131
- # Handle JSON extraction from LLM response
132
- try:
133
- metadata_dict = json.loads(metadata_response["metadata"])
134
- except json.JSONDecodeError:
135
- try:
136
- # Attempt to clean up JSON if needed
137
- metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))
138
- except json.JSONDecodeError:
139
- metadata_dict = {
140
- "Title": pre_extracted_title, # Use pre-extracted title as fallback
141
- "Author": "Unknown",
142
- "Emails": "No emails found",
143
- "Affiliations": "No affiliations found"
144
- }
145
-
146
- except Exception as e:
147
- st.error(f"❌ LLM Metadata Extraction Failed: {e}")
148
- metadata_dict = {
149
- "Title": pre_extracted_title, # Use pre-extracted title
150
- "Author": "Unknown",
151
- "Emails": "No emails found",
152
- "Affiliations": "No affiliations found"
153
  }
154
 
155
- # Ensure all required fields exist
156
- required_fields = ["Title", "Author", "Emails", "Affiliations"]
157
- for field in required_fields:
158
- metadata_dict.setdefault(field, "Unknown")
159
-
160
- # Streamlit Debugging: Display Final Extracted Metadata
161
- st.subheader("✅ Extracted Metadata")
162
- st.json(metadata_dict)
163
-
164
- return metadata_dict
165
 
166
  # ----------------- Step 1: Choose PDF Source -----------------
167
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
@@ -204,7 +163,7 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
204
  st.json(docs[0].metadata)
205
 
206
  # Extract metadata
207
- metadata = extract_metadata_llm(st.session_state.pdf_path)
208
 
209
  # Display extracted-metadata
210
  if isinstance(metadata, dict):
@@ -214,7 +173,7 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
214
  st.write(f"**Emails:** {metadata.get('Emails', 'No emails found')}")
215
  st.write(f"**Affiliations:** {metadata.get('Affiliations', 'No affiliations found')}")
216
  else:
217
- st.error("Metadata extraction failed. Check the LLM response format.")
218
 
219
  # Embedding Model
220
  model_name = "nomic-ai/modernbert-embed-base"
 
74
  return "Unknown"
75
 
76
  # ----------------- Metadata Extraction -----------------
77
+ # ----------------- Metadata Extraction -----------------
78
+ def extract_metadata(pdf_path):
79
+ """Extracts metadata using simple heuristics without LLM."""
80
 
81
  with pdfplumber.open(pdf_path) as pdf:
82
+ if not pdf.pages:
83
+ return {
84
+ "Title": "Unknown",
85
+ "Author": "Unknown",
86
+ "Emails": "No emails found",
87
+ "Affiliations": "No affiliations found"
88
+ }
89
+
90
+ # Extract text from the first page
91
+ first_page_text = pdf.pages[0].extract_text() or "No text found."
92
+ cleaned_text = clean_extracted_text(first_page_text)
93
+
94
+ # Extract Title
95
+ pre_extracted_title = extract_title_manually(cleaned_text)
96
+
97
+ # Extract Authors (Names typically appear before affiliations)
98
+ author_pattern = re.compile(r"([\w\-\s]+,\s?)+[\w\-\s]+")
99
+ authors = "Unknown"
100
+ for line in cleaned_text.split("\n"):
101
+ match = author_pattern.search(line)
102
+ if match:
103
+ authors = match.group(0)
104
+ break
105
+
106
+ # Extract Emails
107
+ email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
108
+ emails = ", ".join(email_pattern.findall(cleaned_text)) or "No emails found"
109
+
110
+ # Extract Affiliations (usually below author names)
111
+ affiliations = "Unknown"
112
+ for i, line in enumerate(cleaned_text.split("\n")):
113
+ if "@" in line: # Email appears before affiliations
114
+ affiliations = cleaned_text.split("\n")[i + 1] if i + 1 < len(cleaned_text.split("\n")) else "Unknown"
115
+ break
116
+
117
+ return {
118
+ "Title": pre_extracted_title,
119
+ "Author": authors,
120
+ "Emails": emails,
121
+ "Affiliations": affiliations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  }
123
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  # ----------------- Step 1: Choose PDF Source -----------------
126
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
 
163
  st.json(docs[0].metadata)
164
 
165
  # Extract metadata
166
+ metadata = extract_metadata(st.session_state.pdf_path)
167
 
168
  # Display extracted-metadata
169
  if isinstance(metadata, dict):
 
173
  st.write(f"**Emails:** {metadata.get('Emails', 'No emails found')}")
174
  st.write(f"**Affiliations:** {metadata.get('Affiliations', 'No affiliations found')}")
175
  else:
176
+ st.error("Metadata extraction failed.")
177
 
178
  # Embedding Model
179
  model_name = "nomic-ai/modernbert-embed-base"