Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Community

DrishtiSharma committed on Feb 14

Commit

c576780

verified ·

1 Parent(s): 22c44a9

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -14

app.py CHANGED Viewed

@@ -73,10 +73,9 @@ def extract_title_manually(text):
             return clean_line  # Return first valid title
     return "Unknown"
-# ----------------- Metadata Extraction -----------------
 # ----------------- Metadata Extraction -----------------
 def extract_metadata(pdf_path):
-    """Extracts metadata using simple heuristics without LLM."""
     with pdfplumber.open(pdf_path) as pdf:
         if not pdf.pages:
@@ -90,32 +89,41 @@ def extract_metadata(pdf_path):
         # Extract text from the first page
         first_page_text = pdf.pages[0].extract_text() or "No text found."
         cleaned_text = clean_extracted_text(first_page_text)
-        # Extract Title
-        pre_extracted_title = extract_title_manually(cleaned_text)
-        # Extract Authors (Names typically appear before affiliations)
-        author_pattern = re.compile(r"([\w\-\s]+,\s?)+[\w\-\s]+")
         authors = "Unknown"
-        for line in cleaned_text.split("\n"):
             match = author_pattern.search(line)
             if match:
                 authors = match.group(0)
                 break
-        # Extract Emails
         email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
         emails = ", ".join(email_pattern.findall(cleaned_text)) or "No emails found"
-        # Extract Affiliations (usually below author names)
         affiliations = "Unknown"
-        for i, line in enumerate(cleaned_text.split("\n")):
-            if "@" in line:  # Email appears before affiliations
-                affiliations = cleaned_text.split("\n")[i + 1] if i + 1 < len(cleaned_text.split("\n")) else "Unknown"
                 break
         return {
-            "Title": pre_extracted_title,
             "Author": authors,
             "Emails": emails,
             "Affiliations": affiliations

             return clean_line  # Return first valid title
     return "Unknown"
 # ----------------- Metadata Extraction -----------------
 def extract_metadata(pdf_path):
+    """Extracts metadata (title, authors, emails, affiliations) from the first page of a PDF."""
     with pdfplumber.open(pdf_path) as pdf:
         if not pdf.pages:
         # Extract text from the first page
         first_page_text = pdf.pages[0].extract_text() or "No text found."
         cleaned_text = clean_extracted_text(first_page_text)
+        lines = cleaned_text.split("\n")
+        # ---- Extract Title ----
+        title = "Unknown"
+        for line in lines[:5]:  # First few lines usually contain the title
+            clean_line = line.strip()
+            if len(clean_line.split()) > 5 and not clean_line.lower().startswith(("abstract", "introduction", "keywords")):
+                title = clean_line
+                break
+        # ---- Extract Authors ----
+        author_pattern = re.compile(r"([\w\-\s]+,\s?)+[\w\-\s]+")  # Names are comma-separated
         authors = "Unknown"
+        for line in lines:
+            if "@" in line:  # Authors appear before emails
+                break
             match = author_pattern.search(line)
             if match:
                 authors = match.group(0)
                 break
+        # ---- Extract Emails ----
         email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
         emails = ", ".join(email_pattern.findall(cleaned_text)) or "No emails found"
+        # ---- Extract Affiliations ----
         affiliations = "Unknown"
+        for i, line in enumerate(lines):
+            if "@" in line:  # Affiliations are usually after emails
+                if i + 1 < len(lines):
+                    affiliations = lines[i + 1].strip()
                 break
         return {
+            "Title": title,
             "Author": authors,
             "Emails": emails,
             "Affiliations": affiliations