DrishtiSharma committed on
Commit
0240de7
·
verified ·
1 Parent(s): d291cf6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -20
app.py CHANGED
@@ -50,16 +50,15 @@ if "vector_store" not in st.session_state:
50
 
51
  # ----------------- Metadata Extraction -----------------
52
  def extract_metadata_llm(pdf_path):
53
- """Extracts metadata using LLM"""
54
  with pdfplumber.open(pdf_path) as pdf:
55
  first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
56
 
57
- # Define metadata_prompt
58
  metadata_prompt = PromptTemplate(
59
  input_variables=["text"],
60
  template="""
61
  Given the following first page of a research paper, extract metadata **strictly in JSON format**.
62
-
63
  - If no data is found for a field, return `"Unknown"` instead.
64
  - The response must be valid JSON.
65
 
@@ -81,28 +80,34 @@ def extract_metadata_llm(pdf_path):
81
  """
82
  )
83
 
84
-
85
  metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
86
  metadata_response = metadata_chain.invoke({"text": first_page_text})
87
 
88
- # Ensure LLM Output is a Proper JSON String
89
- json_match = re.search(r"```json\n(.*?)\n```", metadata_response["metadata"], re.DOTALL)
90
- json_text = json_match.group(1) if json_match else metadata_response["metadata"]
91
-
92
  try:
93
- metadata_dict = json.loads(json_text)
94
- # Ensure all expected fields are present
95
- required_fields = ["Title", "Author", "Emails", "Affiliations"]
96
- for field in required_fields:
97
- if field not in metadata_dict:
98
- metadata_dict[field] = "Unknown"
99
  except json.JSONDecodeError:
100
- metadata_dict = {
101
- "Title": "Unknown",
102
- "Author": "Unknown",
103
- "Emails": "No emails found",
104
- "Affiliations": "No affiliations found"
105
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  return metadata_dict
108
 
 
50
 
51
  # ----------------- Metadata Extraction -----------------
52
  def extract_metadata_llm(pdf_path):
53
+ """Extracts metadata using LLM instead of regex."""
54
  with pdfplumber.open(pdf_path) as pdf:
55
  first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
56
 
57
+ # Define metadata prompt
58
  metadata_prompt = PromptTemplate(
59
  input_variables=["text"],
60
  template="""
61
  Given the following first page of a research paper, extract metadata **strictly in JSON format**.
 
62
  - If no data is found for a field, return `"Unknown"` instead.
63
  - The response must be valid JSON.
64
 
 
80
  """
81
  )
82
 
83
+ # Run LLM Metadata Extraction
84
  metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
85
  metadata_response = metadata_chain.invoke({"text": first_page_text})
86
 
87
+ # Handle JSON extraction from LLM response
 
 
 
88
  try:
89
+ # Try parsing directly as JSON
90
+ metadata_dict = json.loads(metadata_response["metadata"])
 
 
 
 
91
  except json.JSONDecodeError:
92
+ # Fallback: Extract JSON using regex
93
+ json_match = re.search(r"```json\n(.*?)\n```", metadata_response["metadata"], re.DOTALL)
94
+ json_text = json_match.group(1) if json_match else metadata_response["metadata"]
95
+
96
+ try:
97
+ metadata_dict = json.loads(json_text)
98
+ except json.JSONDecodeError:
99
+ metadata_dict = {
100
+ "Title": "Unknown",
101
+ "Author": "Unknown",
102
+ "Emails": "No emails found",
103
+ "Affiliations": "No affiliations found"
104
+ }
105
+
106
+ # Ensure all required fields exist
107
+ required_fields = ["Title", "Author", "Emails", "Affiliations"]
108
+ for field in required_fields:
109
+ if field not in metadata_dict or not metadata_dict[field].strip():
110
+ metadata_dict[field] = "Unknown"
111
 
112
  return metadata_dict
113