Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Files Community

DrishtiSharma committed on Feb 14

Commit

c0bbae3

verified ·

1 Parent(s): b9040c0

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -16

app.py CHANGED Viewed

@@ -48,34 +48,68 @@ if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
 # ----------------- Metadata Extraction -----------------
 def extract_metadata_llm(pdf_path):
-    """Extracts metadata using LLM instead of regex and logs progress in Streamlit UI."""
     with pdfplumber.open(pdf_path) as pdf:
         first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."
     # Streamlit Debugging: Show extracted text
-    st.subheader("📄 Extracted First Page Text for Metadata")
-    st.text_area("First Page Text:", first_page_text, height=200)
     # Define metadata prompt
     metadata_prompt = PromptTemplate(
-        input_variables=["text"],
         template="""
-        Given the following first page of a research paper, extract metadata **strictly in JSON format**.
-        - If no data is found for a field, return `"Unknown"` instead.
-        - Ensure the output is valid JSON (do not include markdown syntax).
         Example output:
-        {
             "Title": "Example Paper Title",
             "Author": "John Doe, Jane Smith",
             "Emails": "[email protected], [email protected]",
             "Affiliations": "School of AI, University of Example"
-        }
-        Now, extract the metadata from this document:
         {text}
         """
     )
@@ -85,10 +119,10 @@ def extract_metadata_llm(pdf_path):
     # Debugging: Log the LLM input
     st.subheader("🔍 LLM Input for Metadata Extraction")
-    st.json({"text": first_page_text})
     try:
-        metadata_response = metadata_chain.invoke({"text": first_page_text})
         # Debugging: Log raw LLM response
         st.subheader("🔍 Raw LLM Response")
@@ -103,7 +137,7 @@ def extract_metadata_llm(pdf_path):
                 metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))
             except json.JSONDecodeError:
                 metadata_dict = {
-                    "Title": "Unknown",
                     "Author": "Unknown",
                     "Emails": "No emails found",
                     "Affiliations": "No affiliations found"
@@ -112,7 +146,7 @@ def extract_metadata_llm(pdf_path):
     except Exception as e:
         st.error(f"❌ LLM Metadata Extraction Failed: {e}")
         metadata_dict = {
-            "Title": "Unknown",
             "Author": "Unknown",
             "Emails": "No emails found",
             "Affiliations": "No affiliations found"
@@ -128,6 +162,8 @@ def extract_metadata_llm(pdf_path):
     st.json(metadata_dict)
     return metadata_dict
 # ----------------- Step 1: Choose PDF Source -----------------

     st.session_state.vector_store = None
+# ----------------- Text Cleaning Functions -----------------
def clean_extracted_text(text):
    """Normalize whitespace in text extracted from a PDF page.

    Line breaks are preserved: consecutive breaks collapse to a single
    newline and whitespace touching a break is trimmed. Runs of in-line
    whitespace collapse to one space, and words hyphenated across a line
    break are re-joined.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # `[^\S\n]` means "any whitespace character except a newline".
    # Trim whitespace on both sides of every line break and squash blank
    # lines, so each remaining "\n" separates two non-empty lines.
    text = re.sub(r'[^\S\n]*\n\s*', '\n', text)
    # Collapse in-line whitespace only. Matching `\s{2,}` here would also
    # swallow runs containing a newline (e.g. " \n") and merge adjacent
    # lines, which breaks line-based processing and the hyphen fix below.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)
    # Re-join words that were hyphenated across a line break.
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)
    return text.strip()
def extract_title_manually(text):
    """Guess a paper title from the first few lines of text.

    The first line (among the first five) that has more than five words and
    contains none of the header keywords is returned with surrounding
    whitespace stripped.

    Args:
        text: Text whose lines are separated by "\\n".

    Returns:
        The first qualifying line, or "Unknown" if none qualifies.
    """
    # Local import keeps this function self-contained.
    import re

    # Match header keywords as whole words only. A plain substring test
    # rejects genuine titles such as "Scalable ..." (contains "table") or
    # "Configure ..." (contains "figure").
    header_word = re.compile(
        r"\b(?:abstracts?|introductions?|keywords|contents|tables?|figures?)\b",
        re.IGNORECASE,
    )

    for line in text.split("\n")[:5]:  # Only the first 5 lines are considered
        candidate = line.strip()
        if len(candidate.split()) > 5 and not header_word.search(candidate):
            return candidate  # First line that looks like a title
    return "Unknown"
 # ----------------- Metadata Extraction -----------------
 def extract_metadata_llm(pdf_path):
+    """Extracts metadata using LLM with improved title detection and JSON handling."""
     with pdfplumber.open(pdf_path) as pdf:
         first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."
+    # Apply text cleaning
+    cleaned_text = clean_extracted_text(first_page_text)
+    # Attempt manual title extraction before LLM
+    pre_extracted_title = extract_title_manually(cleaned_text)
     # Streamlit Debugging: Show extracted text
+    st.subheader("📄 Extracted First Page Text (Cleaned)")
+    st.text_area("Cleaned Text:", cleaned_text, height=200)
     # Define metadata prompt
     metadata_prompt = PromptTemplate(
+        input_variables=["text", "pre_title"],
         template="""
+        Given the first page of a research paper, extract metadata **strictly in JSON format**.
+        - The title is typically in the first few lines and is often in a larger font or bold.
+        - If a phrase like "Short Paper:" appears, the actual title follows.
+        - If no clear title is found, use the pre-extracted title: "{pre_title}".
+        - If a field is missing, return `"Unknown"`.
+        - Ensure the JSON format is **valid**.
         Example output:
+        {{
             "Title": "Example Paper Title",
             "Author": "John Doe, Jane Smith",
             "Emails": "[email protected], [email protected]",
             "Affiliations": "School of AI, University of Example"
+        }}
+        Now, extract metadata from this document:
         {text}
         """
     )
     # Debugging: Log the LLM input
     st.subheader("🔍 LLM Input for Metadata Extraction")
+    st.json({"text": cleaned_text, "pre_title": pre_extracted_title})
     try:
+        metadata_response = metadata_chain.invoke({"text": cleaned_text, "pre_title": pre_extracted_title})
         # Debugging: Log raw LLM response
         st.subheader("🔍 Raw LLM Response")
                 metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))
             except json.JSONDecodeError:
                 metadata_dict = {
+                    "Title": pre_extracted_title,  # Use pre-extracted title as fallback
                     "Author": "Unknown",
                     "Emails": "No emails found",
                     "Affiliations": "No affiliations found"
     except Exception as e:
         st.error(f"❌ LLM Metadata Extraction Failed: {e}")
         metadata_dict = {
+            "Title": pre_extracted_title,  # Use pre-extracted title
             "Author": "Unknown",
             "Emails": "No emails found",
             "Affiliations": "No affiliations found"
     st.json(metadata_dict)
     return metadata_dict
 # ----------------- Step 1: Choose PDF Source -----------------