Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Files Community

DrishtiSharma committed on Feb 14

Commit

8b24191

verified ·

1 Parent(s): 8ec9856

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -8

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import streamlit as st
 import os
 import requests
 import pdfplumber
 import chromadb
@@ -48,7 +49,7 @@ if "processed_chunks" not in st.session_state:
 if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
-# ----------------- Improved Metadata Extraction -----------------
 def extract_metadata_llm(pdf_path):
     """Extracts metadata using LLM instead of regex."""
     with pdfplumber.open(pdf_path) as pdf:
@@ -58,12 +59,13 @@ def extract_metadata_llm(pdf_path):
     metadata_prompt = PromptTemplate(
         input_variables=["text"],
         template="""
-        Given the following first page of a research paper, extract:
-        - The title of the paper
-        - The authors' names
-        - Any email addresses present
-        - The affiliations of the authors
         Ensure accurate extraction.
         First page content:
@@ -74,7 +76,19 @@ def extract_metadata_llm(pdf_path):
     metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
     metadata_response = metadata_chain.invoke({"text": first_page_text})
-    return metadata_response["metadata"]
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)

 import streamlit as st
 import os
+import json
 import requests
 import pdfplumber
 import chromadb
 if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
+# ----------------- Metadata Extraction -----------------
 def extract_metadata_llm(pdf_path):
     """Extracts metadata using LLM instead of regex."""
     with pdfplumber.open(pdf_path) as pdf:
     metadata_prompt = PromptTemplate(
         input_variables=["text"],
         template="""
+        Given the following first page of a research paper, extract metadata in JSON format with these fields:
+        {
+            "Title": "Paper Title",
+            "Author": "Author Name(s)",
+            "Emails": "List of Emails",
+            "Affiliations": "Author Affiliation(s)"
+        }
         Ensure accurate extraction.
         First page content:
     metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
     metadata_response = metadata_chain.invoke({"text": first_page_text})
+    try:
+        # Ensure response is a valid JSON string and convert it to a dictionary
+        metadata_dict = json.loads(metadata_response["metadata"])
+    except json.JSONDecodeError:
+        metadata_dict = {
+            "Title": "Unknown",
+            "Author": "Unknown",
+            "Emails": "No emails found",
+            "Affiliations": "No affiliations found"
+        }
+    return metadata_dict
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)