DrishtiSharma committed on
Commit
dba1813
·
verified ·
1 Parent(s): 5b5e7ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -13
app.py CHANGED
@@ -55,27 +55,35 @@ def extract_metadata(pdf_path):
55
  metadata = pdf.metadata or {}
56
  first_page_text = pdf.pages[0].extract_text() if pdf.pages else ""
57
 
58
- # Extract title
59
- title = metadata.get("Title", "").strip()
60
- if not title and first_page_text:
61
- title_match = re.search(r"(?i)title[:\-]?\s*(.*)", first_page_text)
62
- title = title_match.group(1) if title_match else first_page_text.split("\n")[0]
63
 
64
- title = title.strip() if title else "Untitled Document"
 
 
 
 
 
 
 
65
 
66
  # Extract author
67
  author = metadata.get("Author", "").strip()
68
- if not author and first_page_text:
69
- author_match = re.search(r"(?i)(?:by|authors?)[:\-]?\s*([\w\s,]+)", first_page_text)
70
- author = author_match.group(1) if author_match else "Unknown Author"
 
 
 
 
71
 
72
  # Extract emails
73
- emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_page_text)
74
- email_str = ", ".join(set(emails)) if emails else "No emails found"
75
 
76
  # Extract affiliations
77
- affiliations = re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", first_page_text)
78
- affiliation_str = ", ".join(set(affiliations)) if affiliations else "No affiliations found"
79
 
80
  return title, author, email_str, affiliation_str
81
 
 
55
  metadata = pdf.metadata or {}
56
  first_page_text = pdf.pages[0].extract_text() if pdf.pages else ""
57
 
58
+ # Normalize and split first-page text into lines
59
+ lines = first_page_text.split("\n") if first_page_text else []
 
 
 
60
 
61
+ # Extract title (Try multiple strategies)
62
+ title = metadata.get("Title", "").strip()
63
+ if not title:
64
+ for line in lines[:5]: # Check the first few lines
65
+ if len(line.strip()) > 5 and not line.isdigit(): # Avoid numbers (page numbers)
66
+ title = line.strip()
67
+ break
68
+ title = title if title else "Untitled Document"
69
 
70
  # Extract author
71
  author = metadata.get("Author", "").strip()
72
+ if not author:
73
+ for line in lines:
74
+ author_match = re.search(r"(?i)(?:by|authors?)[:\-]?\s*([\w\s,]+)", line)
75
+ if author_match:
76
+ author = author_match.group(1).strip()
77
+ break
78
+ author = author if author else "Unknown Author"
79
 
80
  # Extract emails
81
+ emails = set(re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_page_text))
82
+ email_str = ", ".join(emails) if emails else "No emails found"
83
 
84
  # Extract affiliations
85
+ affiliations = set(re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", first_page_text))
86
+ affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"
87
 
88
  return title, author, email_str, affiliation_str
89