Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

DrishtiSharma committed on Feb 14

Commit

f8ed37f

verified ·

1 Parent(s): 97424be

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -54,30 +54,16 @@ def extract_metadata_llm(pdf_path):
     with pdfplumber.open(pdf_path) as pdf:
         first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
-    # LLM prompt for extracting metadata
-    metadata_prompt = PromptTemplate(
-        input_variables=["text"],
-        template="""
-        Given the following first page of a research paper, extract metadata in JSON format with these fields:
-        {
-            "Title": "Paper Title",
-            "Author": "Author Name(s)",
-            "Emails": "List of Emails",
-            "Affiliations": "Author Affiliation(s)"
-        }
-        Ensure accurate extraction.
-        First page content:
-        {text}
-        """
-    )
     metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
     metadata_response = metadata_chain.invoke({"text": first_page_text})
     try:
-        # Ensure response is a valid JSON string and convert it to a dictionary
-        metadata_dict = json.loads(metadata_response["metadata"])
     except json.JSONDecodeError:
         metadata_dict = {
             "Title": "Unknown",
@@ -89,6 +75,7 @@ def extract_metadata_llm(pdf_path):
     return metadata_dict
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)

     with pdfplumber.open(pdf_path) as pdf:
         first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
+    # Run LLM Metadata Extraction
     metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
     metadata_response = metadata_chain.invoke({"text": first_page_text})
+    # Ensure LLM Output is a Proper JSON String
+    json_match = re.search(r"```json\n(.*?)\n```", metadata_response["metadata"], re.DOTALL)
+    json_text = json_match.group(1) if json_match else metadata_response["metadata"]
     try:
+        metadata_dict = json.loads(json_text)
     except json.JSONDecodeError:
         metadata_dict = {
             "Title": "Unknown",
     return metadata_dict
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)