Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Files Community

DrishtiSharma committed on Feb 14

Commit

fd8e822

verified ·

1 Parent(s): 264abd1

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -54

app.py CHANGED Viewed

@@ -74,70 +74,58 @@ def extract_title_manually(text):
     return "Unknown"
 # ----------------- Metadata Extraction -----------------
-def extract_metadata(pdf_path):
-    """Extracts Title, Authors, Emails, and Affiliations from the first page of a PDF with improved accuracy."""
     with pdfplumber.open(pdf_path) as pdf:
         if not pdf.pages:
-            return {
-                "Title": "Unknown",
-                "Author": "Unknown",
-                "Emails": "No emails found",
-                "Affiliations": "No affiliations found"
-            }
         # Extract text from the first page
         first_page_text = pdf.pages[0].extract_text()
         if not first_page_text:
-            return {
-                "Title": "Unknown",
-                "Author": "Unknown",
-                "Emails": "No emails found",
-                "Affiliations": "No affiliations found"
-            }
         cleaned_text = first_page_text.strip()
-        lines = cleaned_text.split("\n")
-        # ---- Extract Title ----
-        title = "Unknown"
-        for line in lines[:5]:  # Only check the first few lines
-            clean_line = line.strip()
-            if 6 < len(clean_line.split()) < 20 and not clean_line.lower().startswith(("abstract", "keywords", "introduction")):
-                title = clean_line
-                break
-        # ---- Extract Authors ----
-        author_candidates = []
-        name_pattern = re.compile(r"\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)+\b")  # Firstname Lastname format
-        for line in lines:
-            if "@" in line:  # Stop when reaching emails
-                break
-            matches = name_pattern.findall(line)
-            if matches and len(matches) < 5:  # Avoid false positives
-                author_candidates.extend(matches)
-        authors = ", ".join(author_candidates) if author_candidates else "Unknown"
-        # ---- Extract Emails ----
-        email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
-        emails = ", ".join(email_pattern.findall(cleaned_text)) or "No emails found"
-        # ---- Extract Affiliations ----
-        affiliations = "Unknown"
-        for i, line in enumerate(lines):
-            if "@" in line:  # Look for affiliations after email section
-                if i + 1 < len(lines):
-                    affiliations = lines[i + 1].strip()
-                break
-        return {
-            "Title": title,
-            "Author": authors,
-            "Emails": emails,
-            "Affiliations": affiliations
         }
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)

     return "Unknown"
 # ----------------- Metadata Extraction -----------------
+def extract_metadata_llm(pdf_path):
+    """Extracts metadata using LLM for better accuracy."""
     with pdfplumber.open(pdf_path) as pdf:
         if not pdf.pages:
+            return {"Title": "Unknown", "Author": "Unknown", "Emails": "No emails found", "Affiliations": "No affiliations found"}
         # Extract text from the first page
         first_page_text = pdf.pages[0].extract_text()
         if not first_page_text:
+            return {"Title": "Unknown", "Author": "Unknown", "Emails": "No emails found", "Affiliations": "No affiliations found"}
         cleaned_text = first_page_text.strip()
+    # Define a structured prompt for the LLM
+    metadata_prompt = PromptTemplate(
+        input_variables=["text"],
+        template="""
+        Extract the following metadata from the research paper's first page:
+        - Title
+        - Authors (comma-separated)
+        - Emails (comma-separated)
+        - Affiliations
+        Ensure the output is in **valid JSON format** with keys: "Title", "Author", "Emails", "Affiliations".
+        Here is the text:
+        {text}
+        Provide the JSON output only, no extra text.
+        """
+    )
+    # Run the LLM Metadata Extraction
+    metadata_chain = LLMChain(llm=llm, prompt=metadata_prompt, output_key="metadata")
+    try:
+        metadata_response = metadata_chain.invoke({"text": cleaned_text})
+        # Convert the LLM response into a dictionary
+        metadata_dict = json.loads(metadata_response["metadata"])
+    except Exception as e:
+        metadata_dict = {
+            "Title": "Unknown",
+            "Author": "Unknown",
+            "Emails": "No emails found",
+            "Affiliations": "No affiliations found"
         }
+    return metadata_dict
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)