DrishtiSharma committed on
Commit
dc4c28d
·
verified ·
1 Parent(s): 7f19084

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -12
app.py CHANGED
@@ -75,8 +75,8 @@ def extract_title_manually(text):
75
 
76
  # ----------------- Metadata Extraction -----------------
77
  def extract_metadata(pdf_path):
78
- """Extracts metadata (title, authors, emails, affiliations) from the first page of a PDF."""
79
-
80
  with pdfplumber.open(pdf_path) as pdf:
81
  if not pdf.pages:
82
  return {
@@ -87,28 +87,37 @@ def extract_metadata(pdf_path):
87
  }
88
 
89
  # Extract text from the first page
90
- first_page_text = pdf.pages[0].extract_text() or "No text found."
 
 
 
 
 
 
 
 
91
  cleaned_text = clean_extracted_text(first_page_text)
92
  lines = cleaned_text.split("\n")
93
 
94
  # ---- Extract Title ----
95
  title = "Unknown"
96
- for line in lines[:5]: # First few lines usually contain the title
97
  clean_line = line.strip()
98
  if len(clean_line.split()) > 5 and not clean_line.lower().startswith(("abstract", "introduction", "keywords")):
99
  title = clean_line
100
  break
101
 
102
  # ---- Extract Authors ----
103
- author_pattern = re.compile(r"([\w\-\s]+,\s?)+[\w\-\s]+") # Names are comma-separated
104
- authors = "Unknown"
105
  for line in lines:
106
- if "@" in line: # Authors appear before emails
107
  break
108
- match = author_pattern.search(line)
109
  if match:
110
- authors = match.group(0)
111
- break
 
112
 
113
  # ---- Extract Emails ----
114
  email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
@@ -117,7 +126,7 @@ def extract_metadata(pdf_path):
117
  # ---- Extract Affiliations ----
118
  affiliations = "Unknown"
119
  for i, line in enumerate(lines):
120
- if "@" in line: # Affiliations are usually after emails
121
  if i + 1 < len(lines):
122
  affiliations = lines[i + 1].strip()
123
  break
@@ -129,7 +138,6 @@ def extract_metadata(pdf_path):
129
  "Affiliations": affiliations
130
  }
131
 
132
-
133
  # ----------------- Step 1: Choose PDF Source -----------------
134
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
135
 
 
75
 
76
  # ----------------- Metadata Extraction -----------------
77
  def extract_metadata(pdf_path):
78
+ """Extracts metadata (Title, Authors, Emails, Affiliations) from the first page of a PDF."""
79
+
80
  with pdfplumber.open(pdf_path) as pdf:
81
  if not pdf.pages:
82
  return {
 
87
  }
88
 
89
  # Extract text from the first page
90
+ first_page_text = pdf.pages[0].extract_text()
91
+ if not first_page_text:
92
+ return {
93
+ "Title": "Unknown",
94
+ "Author": "Unknown",
95
+ "Emails": "No emails found",
96
+ "Affiliations": "No affiliations found"
97
+ }
98
+
99
  cleaned_text = clean_extracted_text(first_page_text)
100
  lines = cleaned_text.split("\n")
101
 
102
  # ---- Extract Title ----
103
  title = "Unknown"
104
+ for line in lines[:5]: # Check only the first few lines
105
  clean_line = line.strip()
106
  if len(clean_line.split()) > 5 and not clean_line.lower().startswith(("abstract", "introduction", "keywords")):
107
  title = clean_line
108
  break
109
 
110
  # ---- Extract Authors ----
111
+ author_pattern = re.compile(r"([A-Z][a-z]+(?:\s[A-Z][a-z]+)*),?\s?([A-Z][a-z]+)?") # Detects first-name last-name patterns
112
+ authors = []
113
  for line in lines:
114
+ if "@" in line: # Stop if we reach the emails section
115
  break
116
+ match = author_pattern.findall(line)
117
  if match:
118
+ authors.extend([" ".join(name).strip() for name in match])
119
+
120
+ authors = ", ".join(authors) if authors else "Unknown"
121
 
122
  # ---- Extract Emails ----
123
  email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
 
126
  # ---- Extract Affiliations ----
127
  affiliations = "Unknown"
128
  for i, line in enumerate(lines):
129
+ if "@" in line: # Affiliations usually appear after emails
130
  if i + 1 < len(lines):
131
  affiliations = lines[i + 1].strip()
132
  break
 
138
  "Affiliations": affiliations
139
  }
140
 
 
141
  # ----------------- Step 1: Choose PDF Source -----------------
142
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
143