Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Files Community

DrishtiSharma committed on Feb 14

Commit

0240de7

verified ·

1 Parent(s): d291cf6

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -20

app.py CHANGED Viewed

@@ -50,16 +50,15 @@ if "vector_store" not in st.session_state:
 # ----------------- Metadata Extraction -----------------
 def extract_metadata_llm(pdf_path):
-    """Extracts metadata using LLM"""
     with pdfplumber.open(pdf_path) as pdf:
         first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
-    # Define metadata_prompt
     metadata_prompt = PromptTemplate(
         input_variables=["text"],
         template="""
         Given the following first page of a research paper, extract metadata **strictly in JSON format**.
         - If no data is found for a field, return `"Unknown"` instead.
         - The response must be valid JSON.
@@ -81,28 +80,34 @@ def extract_metadata_llm(pdf_path):
         """
     )
     metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
     metadata_response = metadata_chain.invoke({"text": first_page_text})
-    # Ensure LLM Output is a Proper JSON String
-    json_match = re.search(r"```json\n(.*?)\n```", metadata_response["metadata"], re.DOTALL)
-    json_text = json_match.group(1) if json_match else metadata_response["metadata"]
     try:
-        metadata_dict = json.loads(json_text)
-        # Ensure all expected fields are present
-        required_fields = ["Title", "Author", "Emails", "Affiliations"]
-        for field in required_fields:
-            if field not in metadata_dict:
-                metadata_dict[field] = "Unknown"
     except json.JSONDecodeError:
-        metadata_dict = {
-            "Title": "Unknown",
-            "Author": "Unknown",
-            "Emails": "No emails found",
-            "Affiliations": "No affiliations found"
-        }
     return metadata_dict

 # ----------------- Metadata Extraction -----------------
 def extract_metadata_llm(pdf_path):
+    """Extracts metadata using LLM instead of regex."""
     with pdfplumber.open(pdf_path) as pdf:
         first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
+    # Define metadata prompt
     metadata_prompt = PromptTemplate(
         input_variables=["text"],
         template="""
         Given the following first page of a research paper, extract metadata **strictly in JSON format**.
         - If no data is found for a field, return `"Unknown"` instead.
         - The response must be valid JSON.
         """
     )
+    # Run LLM Metadata Extraction
     metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
     metadata_response = metadata_chain.invoke({"text": first_page_text})
+    # Handle JSON extraction from LLM response
     try:
+        # Try parsing directly as JSON
+        metadata_dict = json.loads(metadata_response["metadata"])
     except json.JSONDecodeError:
+        # Fallback: Extract JSON using regex
+        json_match = re.search(r"```json\n(.*?)\n```", metadata_response["metadata"], re.DOTALL)
+        json_text = json_match.group(1) if json_match else metadata_response["metadata"]
+        try:
+            metadata_dict = json.loads(json_text)
+        except json.JSONDecodeError:
+            metadata_dict = {
+                "Title": "Unknown",
+                "Author": "Unknown",
+                "Emails": "No emails found",
+                "Affiliations": "No affiliations found"
+            }
+    # Ensure all required fields exist
+    required_fields = ["Title", "Author", "Emails", "Affiliations"]
+    for field in required_fields:
+        if field not in metadata_dict or not metadata_dict[field].strip():
+            metadata_dict[field] = "Unknown"
     return metadata_dict