Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Community

DrishtiSharma committed on Feb 14

Commit

264abd1

verified ·

1 Parent(s): dc4c28d

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -12

app.py CHANGED Viewed

@@ -75,7 +75,7 @@ def extract_title_manually(text):
 # ----------------- Metadata Extraction -----------------
 def extract_metadata(pdf_path):
-    """Extracts metadata (Title, Authors, Emails, Affiliations) from the first page of a PDF."""
     with pdfplumber.open(pdf_path) as pdf:
         if not pdf.pages:
@@ -96,28 +96,28 @@ def extract_metadata(pdf_path):
                 "Affiliations": "No affiliations found"
             }
-        cleaned_text = clean_extracted_text(first_page_text)
         lines = cleaned_text.split("\n")
         # ---- Extract Title ----
         title = "Unknown"
-        for line in lines[:5]:  # Check only the first few lines
             clean_line = line.strip()
-            if len(clean_line.split()) > 5 and not clean_line.lower().startswith(("abstract", "introduction", "keywords")):
                 title = clean_line
                 break
         # ---- Extract Authors ----
-        author_pattern = re.compile(r"([A-Z][a-z]+(?:\s[A-Z][a-z]+)*),?\s?([A-Z][a-z]+)?")  # Detects first-name last-name patterns
-        authors = []
         for line in lines:
-            if "@" in line:  # Stop if we reach the emails section
                 break
-            match = author_pattern.findall(line)
-            if match:
-                authors.extend([" ".join(name).strip() for name in match])
-        authors = ", ".join(authors) if authors else "Unknown"
         # ---- Extract Emails ----
         email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
@@ -126,7 +126,7 @@ def extract_metadata(pdf_path):
         # ---- Extract Affiliations ----
         affiliations = "Unknown"
         for i, line in enumerate(lines):
-            if "@" in line:  # Affiliations usually appear after emails
                 if i + 1 < len(lines):
                     affiliations = lines[i + 1].strip()
                 break

 # ----------------- Metadata Extraction -----------------
 def extract_metadata(pdf_path):
+    """Extracts Title, Authors, Emails, and Affiliations from the first page of a PDF with improved accuracy."""
     with pdfplumber.open(pdf_path) as pdf:
         if not pdf.pages:
                 "Affiliations": "No affiliations found"
             }
+        cleaned_text = first_page_text.strip()
         lines = cleaned_text.split("\n")
         # ---- Extract Title ----
         title = "Unknown"
+        for line in lines[:5]:  # Only check the first few lines
             clean_line = line.strip()
+            if 6 < len(clean_line.split()) < 20 and not clean_line.lower().startswith(("abstract", "keywords", "introduction")):
                 title = clean_line
                 break
         # ---- Extract Authors ----
+        author_candidates = []
+        name_pattern = re.compile(r"\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)+\b")  # Firstname Lastname format
         for line in lines:
+            if "@" in line:  # Stop when reaching emails
                 break
+            matches = name_pattern.findall(line)
+            if matches and len(matches) < 5:  # Avoid false positives
+                author_candidates.extend(matches)
+        authors = ", ".join(author_candidates) if author_candidates else "Unknown"
         # ---- Extract Emails ----
         email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
         # ---- Extract Affiliations ----
         affiliations = "Unknown"
         for i, line in enumerate(lines):
+            if "@" in line:  # Look for affiliations after email section
                 if i + 1 < len(lines):
                     affiliations = lines[i + 1].strip()
                 break