Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -55,27 +55,35 @@ def extract_metadata(pdf_path):
|
|
55 |
metadata = pdf.metadata or {}
|
56 |
first_page_text = pdf.pages[0].extract_text() if pdf.pages else ""
|
57 |
|
58 |
-
#
|
59 |
-
|
60 |
-
if not title and first_page_text:
|
61 |
-
title_match = re.search(r"(?i)title[:\-]?\s*(.*)", first_page_text)
|
62 |
-
title = title_match.group(1) if title_match else first_page_text.split("\n")[0]
|
63 |
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
# Extract author
|
67 |
author = metadata.get("Author", "").strip()
|
68 |
-
if not author
|
69 |
-
|
70 |
-
|
|
|
|
|
|
|
|
|
71 |
|
72 |
# Extract emails
|
73 |
-
emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_page_text)
|
74 |
-
email_str = ", ".join(
|
75 |
|
76 |
# Extract affiliations
|
77 |
-
affiliations = re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", first_page_text)
|
78 |
-
affiliation_str = ", ".join(
|
79 |
|
80 |
return title, author, email_str, affiliation_str
|
81 |
|
|
|
55 |
metadata = pdf.metadata or {}
|
56 |
first_page_text = pdf.pages[0].extract_text() if pdf.pages else ""
|
57 |
|
58 |
+
# Split first-page text into lines (empty list when no text was extracted)
|
59 |
+
lines = first_page_text.split("\n") if first_page_text else []
|
|
|
|
|
|
|
60 |
|
61 |
+
# Extract title (metadata first, then the first substantial line among the first five, else a default)
|
62 |
+
title = metadata.get("Title", "").strip()
|
63 |
+
if not title:
|
64 |
+
for line in lines[:5]: # Check the first few lines
|
65 |
+
if len(line.strip()) > 5 and not line.isdigit(): # Avoid numbers (page numbers)
|
66 |
+
title = line.strip()
|
67 |
+
break
|
68 |
+
title = title if title else "Untitled Document"
|
69 |
|
70 |
# Extract author
|
71 |
author = metadata.get("Author", "").strip()
|
72 |
+
if not author:
|
73 |
+
for line in lines:
|
74 |
+
author_match = re.search(r"(?i)(?:by|authors?)[:\-]?\s*([\w\s,]+)", line)
|
75 |
+
if author_match:
|
76 |
+
author = author_match.group(1).strip()
|
77 |
+
break
|
78 |
+
author = author if author else "Unknown Author"
|
79 |
|
80 |
# Extract emails
|
81 |
+
emails = set(re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_page_text))
|
82 |
+
email_str = ", ".join(emails) if emails else "No emails found"
|
83 |
|
84 |
# Extract affiliations
|
85 |
+
affiliations = set(re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", first_page_text))
|
86 |
+
affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"
|
87 |
|
88 |
return title, author, email_str, affiliation_str
|
89 |
|