Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Community

DrishtiSharma committed on Feb 14

Commit

aeca549

verified ·

1 Parent(s): dba1813

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -36

app.py CHANGED Viewed

@@ -49,43 +49,32 @@ if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
 # ----------------- Improved Metadata Extraction -----------------
-def extract_metadata(pdf_path):
-    """Extracts title, author, emails, and affiliations from the first page if metadata is missing."""
     with pdfplumber.open(pdf_path) as pdf:
-        metadata = pdf.metadata or {}
-        first_page_text = pdf.pages[0].extract_text() if pdf.pages else ""
-        # Normalize and split first-page text into lines
-        lines = first_page_text.split("\n") if first_page_text else []
-        # Extract title (Try multiple strategies)
-        title = metadata.get("Title", "").strip()
-        if not title:
-            for line in lines[:5]:  # Check the first few lines
-                if len(line.strip()) > 5 and not line.isdigit():  # Avoid numbers (page numbers)
-                    title = line.strip()
-                    break
-        title = title if title else "Untitled Document"
-        # Extract author
-        author = metadata.get("Author", "").strip()
-        if not author:
-            for line in lines:
-                author_match = re.search(r"(?i)(?:by|authors?)[:\-]?\s*([\w\s,]+)", line)
-                if author_match:
-                    author = author_match.group(1).strip()
-                    break
-        author = author if author else "Unknown Author"
-        # Extract emails
-        emails = set(re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_page_text))
-        email_str = ", ".join(emails) if emails else "No emails found"
-        # Extract affiliations
-        affiliations = set(re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", first_page_text))
-        affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"
-    return title, author, email_str, affiliation_str
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)

     st.session_state.vector_store = None
 # ----------------- Improved Metadata Extraction -----------------
+def extract_metadata_llm(pdf_path):
+    """Extracts metadata using LLM instead of regex."""
     with pdfplumber.open(pdf_path) as pdf:
+        first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
+    # LLM prompt for extracting metadata
+    metadata_prompt = PromptTemplate(
+        input_variables=["text"],
+        template="""
+        Given the following first page of a research paper, extract:
+        - The title of the paper
+        - The authors' names
+        - Any email addresses present
+        - The affiliations of the authors
+        Ensure accurate extraction.
+        First page content:
+        {text}
+        """
+    )
+    metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
+    metadata_response = metadata_chain.invoke({"text": first_page_text})
+    return metadata_response["metadata"]
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)