DrishtiSharma committed on
Commit
5b5e7ef
·
verified ·
1 Parent(s): ce2e765

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -12
app.py CHANGED
@@ -50,30 +50,32 @@ if "vector_store" not in st.session_state:
50
 
51
  # ----------------- Improved Metadata Extraction -----------------
52
  def extract_metadata(pdf_path):
53
- """Extracts title, author, emails, and affiliations from PDF."""
54
  with pdfplumber.open(pdf_path) as pdf:
55
  metadata = pdf.metadata or {}
 
56
 
57
  # Extract title
58
  title = metadata.get("Title", "").strip()
59
- if not title and pdf.pages:
60
- text = pdf.pages[0].extract_text()
61
- title_match = re.search(r"(?i)title[:\-]?\s*(.*)", text or "")
62
- title = title_match.group(1) if title_match else text.split("\n")[0] if text else "Untitled Document"
 
63
 
64
  # Extract author
65
  author = metadata.get("Author", "").strip()
66
- if not author and pdf.pages:
67
- author_match = re.search(r"(?i)by\s+([A-Za-z\s,]+)", pdf.pages[0].extract_text() or "")
68
- author = author_match.group(1).strip() if author_match else "Unknown Author"
69
 
70
  # Extract emails
71
- emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", pdf.pages[0].extract_text() or "")
72
- email_str = ", ".join(emails) if emails else "No emails found"
73
 
74
  # Extract affiliations
75
- affiliations = re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", pdf.pages[0].extract_text() or "")
76
- affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"
77
 
78
  return title, author, email_str, affiliation_str
79
 
 
50
 
51
  # ----------------- Improved Metadata Extraction -----------------
52
  def extract_metadata(pdf_path):
53
+ """Extracts title, author, emails, and affiliations from the first page if metadata is missing."""
54
  with pdfplumber.open(pdf_path) as pdf:
55
  metadata = pdf.metadata or {}
56
+ first_page_text = pdf.pages[0].extract_text() if pdf.pages else ""
57
 
58
  # Extract title
59
  title = metadata.get("Title", "").strip()
60
+ if not title and first_page_text:
61
+ title_match = re.search(r"(?i)title[:\-]?\s*(.*)", first_page_text)
62
+ title = title_match.group(1) if title_match else first_page_text.split("\n")[0]
63
+
64
+ title = title.strip() if title else "Untitled Document"
65
 
66
  # Extract author
67
  author = metadata.get("Author", "").strip()
68
+ if not author and first_page_text:
69
+ author_match = re.search(r"(?i)(?:by|authors?)[:\-]?\s*([\w\s,]+)", first_page_text)
70
+ author = author_match.group(1) if author_match else "Unknown Author"
71
 
72
  # Extract emails
73
+ emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_page_text)
74
+ email_str = ", ".join(set(emails)) if emails else "No emails found"
75
 
76
  # Extract affiliations
77
+ affiliations = re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", first_page_text)
78
+ affiliation_str = ", ".join(set(affiliations)) if affiliations else "No affiliations found"
79
 
80
  return title, author, email_str, affiliation_str
81