Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Files Community

DrishtiSharma committed on Feb 14

Commit

a609396

verified ·

1 Parent(s): 0240de7

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -28

app.py CHANGED Viewed

@@ -30,7 +30,7 @@ rag_llm.verbose = True
 # Clear ChromaDB cache to fix tenant issue
 chromadb.api.client.SharedSystemClient.clear_system_cache()
-st.title("Blah")
 # ----------------- ChromaDB Persistent Directory -----------------
 CHROMA_DB_DIR = "/mnt/data/chroma_db"
@@ -48,11 +48,17 @@ if "processed_chunks" not in st.session_state:
 if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
 # ----------------- Metadata Extraction -----------------
 def extract_metadata_llm(pdf_path):
-    """Extracts metadata using LLM instead of regex."""
     with pdfplumber.open(pdf_path) as pdf:
-        first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
     # Define metadata prompt
     metadata_prompt = PromptTemplate(
@@ -60,54 +66,67 @@ def extract_metadata_llm(pdf_path):
         template="""
         Given the following first page of a research paper, extract metadata **strictly in JSON format**.
         - If no data is found for a field, return `"Unknown"` instead.
-        - The response must be valid JSON.
         Example output:
-        ```json
         {
             "Title": "Example Paper Title",
             "Author": "John Doe, Jane Smith",
             "Emails": "[email protected], [email protected]",
             "Affiliations": "School of AI, University of Example"
         }
-        ```
         Now, extract the metadata from this document:
-        ```
         {text}
-        ```
         """
     )
     # Run LLM Metadata Extraction
     metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
-    metadata_response = metadata_chain.invoke({"text": first_page_text})
-    # Handle JSON extraction from LLM response
     try:
-        # Try parsing directly as JSON
-        metadata_dict = json.loads(metadata_response["metadata"])
-    except json.JSONDecodeError:
-        # Fallback: Extract JSON using regex
-        json_match = re.search(r"```json\n(.*?)\n```", metadata_response["metadata"], re.DOTALL)
-        json_text = json_match.group(1) if json_match else metadata_response["metadata"]
         try:
-            metadata_dict = json.loads(json_text)
         except json.JSONDecodeError:
-            metadata_dict = {
-                "Title": "Unknown",
-                "Author": "Unknown",
-                "Emails": "No emails found",
-                "Affiliations": "No affiliations found"
-            }
     # Ensure all required fields exist
     required_fields = ["Title", "Author", "Emails", "Affiliations"]
     for field in required_fields:
-        if field not in metadata_dict or not metadata_dict[field].strip():
-            metadata_dict[field] = "Unknown"
     return metadata_dict

 # Clear ChromaDB cache to fix tenant issue
 chromadb.api.client.SharedSystemClient.clear_system_cache()
+# st.title("Blah")
 # ----------------- ChromaDB Persistent Directory -----------------
 CHROMA_DB_DIR = "/mnt/data/chroma_db"
 if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
+# ----------------- Metadata Extraction -----------------
 # ----------------- Metadata Extraction -----------------
 def extract_metadata_llm(pdf_path):
+    """Extracts metadata using LLM instead of regex and logs progress in Streamlit UI."""
     with pdfplumber.open(pdf_path) as pdf:
+        first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."  # parses as: (extract_text() or fallback) if pdf.pages else fallback
+    # Streamlit Debugging: Show extracted text
+    st.subheader("📄 Extracted First Page Text for Metadata")
+    st.text_area("First Page Text:", first_page_text, height=200)
     # Define metadata prompt
     metadata_prompt = PromptTemplate(  # NOTE(review): the example JSON in the template uses unescaped braces; confirm they are not parsed as template variables
         template="""
         Given the following first page of a research paper, extract metadata **strictly in JSON format**.
         - If no data is found for a field, return `"Unknown"` instead.
+        - Ensure the output is valid JSON (do not include markdown syntax).
         Example output:
         {
             "Title": "Example Paper Title",
             "Author": "John Doe, Jane Smith",
             "Emails": "[email protected], [email protected]",
             "Affiliations": "School of AI, University of Example"
         }
         Now, extract the metadata from this document:
         {text}
         """
     )
     # Run LLM Metadata Extraction
     metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")  # the response is read back below under the "metadata" key
+    # Debugging: Log the LLM input
+    st.subheader("🔍 LLM Input for Metadata Extraction")
+    st.json({"text": first_page_text})
     try:
+        metadata_response = metadata_chain.invoke({"text": first_page_text})
+        # Debugging: Log raw LLM response
+        st.subheader("🔍 Raw LLM Response")
+        st.json(metadata_response)
+        # Handle JSON extraction from LLM response
         try:
+            metadata_dict = json.loads(metadata_response["metadata"])  # first attempt: parse the raw response text as-is
         except json.JSONDecodeError:
+            try:
+                # Attempt to clean up JSON if needed
+                metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))  # NOTE(review): str.strip removes any of these characters from both ends, not the literal fence; confirm intended
+            except json.JSONDecodeError:
+                metadata_dict = {  # both parse attempts failed: fall back to placeholder values
+                    "Title": "Unknown",
+                    "Author": "Unknown",
+                    "Emails": "No emails found",
+                    "Affiliations": "No affiliations found"
+                }
+    except Exception as e:  # any other failure inside the outer try (invoke, st.* calls, key lookup) is shown in the UI and replaced by placeholders
+        st.error(f"❌ LLM Metadata Extraction Failed: {e}")
+        metadata_dict = {
+            "Title": "Unknown",
+            "Author": "Unknown",
+            "Emails": "No emails found",
+            "Affiliations": "No affiliations found"
+        }
     # Ensure all required fields exist
     required_fields = ["Title", "Author", "Emails", "Affiliations"]
     for field in required_fields:
+        metadata_dict.setdefault(field, "Unknown")  # fills only absent keys; existing empty or non-string values are kept. NOTE(review): raises AttributeError if the parsed JSON was not an object
+    # Streamlit Debugging: Display Final Extracted Metadata
+    st.subheader("✅ Extracted Metadata")
+    st.json(metadata_dict)
     return metadata_dict