DrishtiSharma committed on
Commit
c576780
·
verified ·
1 Parent(s): 22c44a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -14
app.py CHANGED
@@ -73,10 +73,9 @@ def extract_title_manually(text):
73
  return clean_line # Return first valid title
74
  return "Unknown"
75
 
76
- # ----------------- Metadata Extraction -----------------
77
  # ----------------- Metadata Extraction -----------------
78
  def extract_metadata(pdf_path):
79
- """Extracts metadata using simple heuristics without LLM."""
80
 
81
  with pdfplumber.open(pdf_path) as pdf:
82
  if not pdf.pages:
@@ -90,32 +89,41 @@ def extract_metadata(pdf_path):
90
  # Extract text from the first page
91
  first_page_text = pdf.pages[0].extract_text() or "No text found."
92
  cleaned_text = clean_extracted_text(first_page_text)
 
 
 
 
 
 
 
 
 
93
 
94
- # Extract Title
95
- pre_extracted_title = extract_title_manually(cleaned_text)
96
-
97
- # Extract Authors (Names typically appear before affiliations)
98
- author_pattern = re.compile(r"([\w\-\s]+,\s?)+[\w\-\s]+")
99
  authors = "Unknown"
100
- for line in cleaned_text.split("\n"):
 
 
101
  match = author_pattern.search(line)
102
  if match:
103
  authors = match.group(0)
104
  break
105
 
106
- # Extract Emails
107
  email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
108
  emails = ", ".join(email_pattern.findall(cleaned_text)) or "No emails found"
109
 
110
- # Extract Affiliations (usually below author names)
111
  affiliations = "Unknown"
112
- for i, line in enumerate(cleaned_text.split("\n")):
113
- if "@" in line: # Email appears before affiliations
114
- affiliations = cleaned_text.split("\n")[i + 1] if i + 1 < len(cleaned_text.split("\n")) else "Unknown"
 
115
  break
116
 
117
  return {
118
- "Title": pre_extracted_title,
119
  "Author": authors,
120
  "Emails": emails,
121
  "Affiliations": affiliations
 
73
  return clean_line # Return first valid title
74
  return "Unknown"
75
 
 
76
  # ----------------- Metadata Extraction -----------------
77
  def extract_metadata(pdf_path):
78
+ """Extracts metadata (title, authors, emails, affiliations) from the first page of a PDF."""
79
 
80
  with pdfplumber.open(pdf_path) as pdf:
81
  if not pdf.pages:
 
89
  # Extract text from the first page
90
  first_page_text = pdf.pages[0].extract_text() or "No text found."
91
  cleaned_text = clean_extracted_text(first_page_text)
92
+ lines = cleaned_text.split("\n")
93
+
94
+ # ---- Extract Title ----
95
+ title = "Unknown"
96
+ for line in lines[:5]: # First few lines usually contain the title
97
+ clean_line = line.strip()
98
+ if len(clean_line.split()) > 5 and not clean_line.lower().startswith(("abstract", "introduction", "keywords")):
99
+ title = clean_line
100
+ break
101
 
102
+ # ---- Extract Authors ----
103
+ author_pattern = re.compile(r"([\w\-\s]+,\s?)+[\w\-\s]+") # Names are comma-separated
 
 
 
104
  authors = "Unknown"
105
+ for line in lines:
106
+ if "@" in line: # Authors appear before emails
107
+ break
108
  match = author_pattern.search(line)
109
  if match:
110
  authors = match.group(0)
111
  break
112
 
113
+ # ---- Extract Emails ----
114
  email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
115
  emails = ", ".join(email_pattern.findall(cleaned_text)) or "No emails found"
116
 
117
+ # ---- Extract Affiliations ----
118
  affiliations = "Unknown"
119
+ for i, line in enumerate(lines):
120
+ if "@" in line: # Affiliations are usually after emails
121
+ if i + 1 < len(lines):
122
+ affiliations = lines[i + 1].strip()
123
  break
124
 
125
  return {
126
+ "Title": title,
127
  "Author": authors,
128
  "Emails": emails,
129
  "Affiliations": affiliations