DrishtiSharma committed on
Commit
264abd1
·
verified ·
1 Parent(s): dc4c28d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -12
app.py CHANGED
@@ -75,7 +75,7 @@ def extract_title_manually(text):
75
 
76
  # ----------------- Metadata Extraction -----------------
77
  def extract_metadata(pdf_path):
78
- """Extracts metadata (Title, Authors, Emails, Affiliations) from the first page of a PDF."""
79
 
80
  with pdfplumber.open(pdf_path) as pdf:
81
  if not pdf.pages:
@@ -96,28 +96,28 @@ def extract_metadata(pdf_path):
96
  "Affiliations": "No affiliations found"
97
  }
98
 
99
- cleaned_text = clean_extracted_text(first_page_text)
100
  lines = cleaned_text.split("\n")
101
 
102
  # ---- Extract Title ----
103
  title = "Unknown"
104
- for line in lines[:5]: # Check only the first few lines
105
  clean_line = line.strip()
106
- if len(clean_line.split()) > 5 and not clean_line.lower().startswith(("abstract", "introduction", "keywords")):
107
  title = clean_line
108
  break
109
 
110
  # ---- Extract Authors ----
111
- author_pattern = re.compile(r"([A-Z][a-z]+(?:\s[A-Z][a-z]+)*),?\s?([A-Z][a-z]+)?") # Detects first-name last-name patterns
112
- authors = []
113
  for line in lines:
114
- if "@" in line: # Stop if we reach the emails section
115
  break
116
- match = author_pattern.findall(line)
117
- if match:
118
- authors.extend([" ".join(name).strip() for name in match])
119
 
120
- authors = ", ".join(authors) if authors else "Unknown"
121
 
122
  # ---- Extract Emails ----
123
  email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
@@ -126,7 +126,7 @@ def extract_metadata(pdf_path):
126
  # ---- Extract Affiliations ----
127
  affiliations = "Unknown"
128
  for i, line in enumerate(lines):
129
- if "@" in line: # Affiliations usually appear after emails
130
  if i + 1 < len(lines):
131
  affiliations = lines[i + 1].strip()
132
  break
 
75
 
76
  # ----------------- Metadata Extraction -----------------
77
  def extract_metadata(pdf_path):
78
+ """Extracts Title, Authors, Emails, and Affiliations from the first page of a PDF with improved accuracy."""
79
 
80
  with pdfplumber.open(pdf_path) as pdf:
81
  if not pdf.pages:
 
96
  "Affiliations": "No affiliations found"
97
  }
98
 
99
+ cleaned_text = first_page_text.strip()
100
  lines = cleaned_text.split("\n")
101
 
102
  # ---- Extract Title ----
103
  title = "Unknown"
104
+ for line in lines[:5]: # Only check the first few lines
105
  clean_line = line.strip()
106
+ if 6 < len(clean_line.split()) < 20 and not clean_line.lower().startswith(("abstract", "keywords", "introduction")):
107
  title = clean_line
108
  break
109
 
110
  # ---- Extract Authors ----
111
+ author_candidates = []
112
+ name_pattern = re.compile(r"\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)+\b") # Firstname Lastname format
113
  for line in lines:
114
+ if "@" in line: # Stop when reaching emails
115
  break
116
+ matches = name_pattern.findall(line)
117
+ if matches and len(matches) < 5: # Avoid false positives
118
+ author_candidates.extend(matches)
119
 
120
+ authors = ", ".join(author_candidates) if author_candidates else "Unknown"
121
 
122
  # ---- Extract Emails ----
123
  email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
 
126
  # ---- Extract Affiliations ----
127
  affiliations = "Unknown"
128
  for i, line in enumerate(lines):
129
+ if "@" in line: # Look for affiliations after email section
130
  if i + 1 < len(lines):
131
  affiliations = lines[i + 1].strip()
132
  break