Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Files Community

DrishtiSharma committed on Feb 14

Commit

5b5e7ef

verified ·

1 Parent(s): ce2e765

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -12

app.py CHANGED Viewed

@@ -50,30 +50,32 @@ if "vector_store" not in st.session_state:
 # ----------------- Improved Metadata Extraction -----------------
 def extract_metadata(pdf_path):
     """Extracts title, author, emails, and affiliations from PDF.

     Title and author are taken from the PDF metadata when present; otherwise
     they are searched for in the text of the first page. Emails and
     affiliations are always searched for in the first-page text.

     Args:
         pdf_path: Value handed to ``pdfplumber.open``.

     Returns:
         Tuple ``(title, author, email_str, affiliation_str)``. Fields that
         cannot be determined fall back to "Untitled Document",
         "Unknown Author", "No emails found" and "No affiliations found".
     """
     with pdfplumber.open(pdf_path) as pdf:
         metadata = pdf.metadata or {}
         # Read the first page exactly once. Falls back to "" when the PDF has
         # no pages (previously an IndexError further down) or when the page
         # yields no text.
         text = (pdf.pages[0].extract_text() or "") if pdf.pages else ""

         # Extract title
         title = metadata.get("Title", "").strip()
         if not title:
             title_match = re.search(r"(?i)title[:\-]?\s*(.*)", text)
             if title_match:
                 title = title_match.group(1)
             elif text:
                 # No explicit "title" marker: use the first line of the page.
                 title = text.split("\n")[0]
             else:
                 title = "Untitled Document"

         # Extract author
         author = metadata.get("Author", "").strip()
         if not author:
             author_match = re.search(r"(?i)by\s+([A-Za-z\s,]+)", text)
             author = author_match.group(1).strip() if author_match else "Unknown Author"

         # Extract emails
         emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
         email_str = ", ".join(emails) if emails else "No emails found"

         # Extract affiliations
         affiliations = re.findall(
             r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+",
             text,
         )
         affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"

     return title, author, email_str, affiliation_str

 # ----------------- Improved Metadata Extraction -----------------
 def extract_metadata(pdf_path):
     """Extract title, author, emails, and affiliations from a PDF.

     The title and author come from the PDF's embedded metadata when it
     provides them; otherwise the text of the first page is searched. Emails
     and affiliations are always taken from the first-page text.

     Args:
         pdf_path: Value handed to ``pdfplumber.open``.

     Returns:
         Tuple ``(title, author, email_str, affiliation_str)`` of strings.
         Fields that cannot be determined fall back to "Untitled Document",
         "Unknown Author", "No emails found" and "No affiliations found".
     """
     with pdfplumber.open(pdf_path) as pdf:
         metadata = pdf.metadata or {}
         # Normalise a missing page or a None/empty extraction result to ""
         # so every regex call below receives a string.
         first_page_text = (pdf.pages[0].extract_text() or "") if pdf.pages else ""

     # Extract title
     title = str(metadata.get("Title") or "").strip()
     if not title and first_page_text:
         title_match = re.search(r"(?i)title[:\-]?\s*(.*)", first_page_text)
         title = title_match.group(1) if title_match else first_page_text.split("\n")[0]
     # Strip first, then default, so a whitespace-only title is also replaced.
     title = title.strip() or "Untitled Document"

     # Extract author
     author = str(metadata.get("Author") or "").strip()
     if not author and first_page_text:
         # Word boundaries keep "by"/"author" from matching inside other
         # words (e.g. "nearby").
         author_match = re.search(
             r"(?i)\b(?:by|authors?)\b[:\-]?\s*([\w\s,]+)", first_page_text
         )
         if author_match:
             # The capture may span line breaks; collapse runs of whitespace.
             author = " ".join(author_match.group(1).split())
     author = author or "Unknown Author"

     # Extract emails (dict.fromkeys de-duplicates while keeping the order of
     # first appearance, so the output is stable between runs).
     emails = list(dict.fromkeys(
         re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_page_text)
     ))
     email_str = ", ".join(emails) if emails else "No emails found"

     # Extract affiliations; whitespace is collapsed before de-duplication so
     # matches differing only in line breaks or trailing spaces are merged.
     raw_affiliations = re.findall(
         r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+",
         first_page_text,
     )
     affiliations = list(dict.fromkeys(" ".join(item.split()) for item in raw_affiliations))
     affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"

     return title, author, email_str, affiliation_str