Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Community

DrishtiSharma committed on Feb 14

Commit

dc4c28d

verified ·

1 Parent(s): 7f19084

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -12

app.py CHANGED Viewed

@@ -75,8 +75,8 @@ def extract_title_manually(text):
 # ----------------- Metadata Extraction -----------------
 def extract_metadata(pdf_path):
-    """Extracts metadata (title, authors, emails, affiliations) from the first page of a PDF."""
     with pdfplumber.open(pdf_path) as pdf:
         if not pdf.pages:
             return {
@@ -87,28 +87,37 @@ def extract_metadata(pdf_path):
             }
         # Extract text from the first page
-        first_page_text = pdf.pages[0].extract_text() or "No text found."
         cleaned_text = clean_extracted_text(first_page_text)
         lines = cleaned_text.split("\n")
         # ---- Extract Title ----
         title = "Unknown"
-        for line in lines[:5]:  # First few lines usually contain the title
             clean_line = line.strip()
             if len(clean_line.split()) > 5 and not clean_line.lower().startswith(("abstract", "introduction", "keywords")):
                 title = clean_line
                 break
         # ---- Extract Authors ----
-        author_pattern = re.compile(r"([\w\-\s]+,\s?)+[\w\-\s]+")  # Names are comma-separated
-        authors = "Unknown"
         for line in lines:
-            if "@" in line:  # Authors appear before emails
                 break
-            match = author_pattern.search(line)
             if match:
-                authors = match.group(0)
-                break
         # ---- Extract Emails ----
         email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
@@ -117,7 +126,7 @@ def extract_metadata(pdf_path):
         # ---- Extract Affiliations ----
         affiliations = "Unknown"
         for i, line in enumerate(lines):
-            if "@" in line:  # Affiliations are usually after emails
                 if i + 1 < len(lines):
                     affiliations = lines[i + 1].strip()
                 break
@@ -129,7 +138,6 @@ def extract_metadata(pdf_path):
             "Affiliations": affiliations
         }
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)

 # ----------------- Metadata Extraction -----------------
 def extract_metadata(pdf_path):
+    """Extracts metadata (Title, Authors, Emails, Affiliations) from the first page of a PDF."""
     with pdfplumber.open(pdf_path) as pdf:
         if not pdf.pages:
             return {
             }
         # Extract text from the first page
+        first_page_text = pdf.pages[0].extract_text()
+        if not first_page_text:
+            return {
+                "Title": "Unknown",
+                "Author": "Unknown",
+                "Emails": "No emails found",
+                "Affiliations": "No affiliations found"
+            }
         cleaned_text = clean_extracted_text(first_page_text)
         lines = cleaned_text.split("\n")
         # ---- Extract Title ----
         title = "Unknown"
+        for line in lines[:5]:  # Check only the first few lines
             clean_line = line.strip()
             if len(clean_line.split()) > 5 and not clean_line.lower().startswith(("abstract", "introduction", "keywords")):
                 title = clean_line
                 break
         # ---- Extract Authors ----
+        author_pattern = re.compile(r"([A-Z][a-z]+(?:\s[A-Z][a-z]+)*),?\s?([A-Z][a-z]+)?")  # Detects first-name last-name patterns
+        authors = []
         for line in lines:
+            if "@" in line:  # Stop if we reach the emails section
                 break
+            match = author_pattern.findall(line)
             if match:
+                authors.extend([" ".join(name).strip() for name in match])
+        authors = ", ".join(authors) if authors else "Unknown"
         # ---- Extract Emails ----
         email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
         # ---- Extract Affiliations ----
         affiliations = "Unknown"
         for i, line in enumerate(lines):
+            if "@" in line:  # Affiliations usually appear after emails
                 if i + 1 < len(lines):
                     affiliations = lines[i + 1].strip()
                 break
             "Affiliations": affiliations
         }
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)