DrishtiSharma committed on
Commit
a609396
·
verified ·
1 Parent(s): 0240de7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -28
app.py CHANGED
@@ -30,7 +30,7 @@ rag_llm.verbose = True
30
  # Clear ChromaDB cache to fix tenant issue
31
  chromadb.api.client.SharedSystemClient.clear_system_cache()
32
 
33
- st.title("Blah")
34
 
35
  # ----------------- ChromaDB Persistent Directory -----------------
36
  CHROMA_DB_DIR = "/mnt/data/chroma_db"
@@ -48,11 +48,17 @@ if "processed_chunks" not in st.session_state:
48
  if "vector_store" not in st.session_state:
49
  st.session_state.vector_store = None
50
 
 
51
  # ----------------- Metadata Extraction -----------------
52
  def extract_metadata_llm(pdf_path):
53
- """Extracts metadata using LLM instead of regex."""
 
54
  with pdfplumber.open(pdf_path) as pdf:
55
- first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
 
 
 
 
56
 
57
  # Define metadata prompt
58
  metadata_prompt = PromptTemplate(
@@ -60,54 +66,67 @@ def extract_metadata_llm(pdf_path):
60
  template="""
61
  Given the following first page of a research paper, extract metadata **strictly in JSON format**.
62
  - If no data is found for a field, return `"Unknown"` instead.
63
- - The response must be valid JSON.
64
-
65
  Example output:
66
- ```json
67
  {
68
  "Title": "Example Paper Title",
69
  "Author": "John Doe, Jane Smith",
70
71
  "Affiliations": "School of AI, University of Example"
72
  }
73
- ```
74
-
75
  Now, extract the metadata from this document:
76
-
77
- ```
78
  {text}
79
- ```
80
  """
81
  )
82
 
83
  # Run LLM Metadata Extraction
84
  metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
85
- metadata_response = metadata_chain.invoke({"text": first_page_text})
86
 
87
- # Handle JSON extraction from LLM response
 
 
 
88
  try:
89
- # Try parsing directly as JSON
90
- metadata_dict = json.loads(metadata_response["metadata"])
91
- except json.JSONDecodeError:
92
- # Fallback: Extract JSON using regex
93
- json_match = re.search(r"```json\n(.*?)\n```", metadata_response["metadata"], re.DOTALL)
94
- json_text = json_match.group(1) if json_match else metadata_response["metadata"]
95
 
 
96
  try:
97
- metadata_dict = json.loads(json_text)
98
  except json.JSONDecodeError:
99
- metadata_dict = {
100
- "Title": "Unknown",
101
- "Author": "Unknown",
102
- "Emails": "No emails found",
103
- "Affiliations": "No affiliations found"
104
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  # Ensure all required fields exist
107
  required_fields = ["Title", "Author", "Emails", "Affiliations"]
108
  for field in required_fields:
109
- if field not in metadata_dict or not metadata_dict[field].strip():
110
- metadata_dict[field] = "Unknown"
 
 
 
111
 
112
  return metadata_dict
113
 
 
30
  # Clear ChromaDB cache to fix tenant issue
31
  chromadb.api.client.SharedSystemClient.clear_system_cache()
32
 
33
+ # st.title("Blah")
34
 
35
  # ----------------- ChromaDB Persistent Directory -----------------
36
  CHROMA_DB_DIR = "/mnt/data/chroma_db"
 
48
  if "vector_store" not in st.session_state:
49
  st.session_state.vector_store = None
50
 
51
+ # ----------------- Metadata Extraction -----------------
52
  # ----------------- Metadata Extraction -----------------
53
  def extract_metadata_llm(pdf_path):
54
+ """Extracts metadata using LLM instead of regex and logs progress in Streamlit UI."""
55
+
56
  with pdfplumber.open(pdf_path) as pdf:
57
+ first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."
58
+
59
+ # Streamlit Debugging: Show extracted text
60
+ st.subheader("πŸ“„ Extracted First Page Text for Metadata")
61
+ st.text_area("First Page Text:", first_page_text, height=200)
62
 
63
  # Define metadata prompt
64
  metadata_prompt = PromptTemplate(
 
66
  template="""
67
  Given the following first page of a research paper, extract metadata **strictly in JSON format**.
68
  - If no data is found for a field, return `"Unknown"` instead.
69
+ - Ensure the output is valid JSON (do not include markdown syntax).
70
+
71
  Example output:
 
72
  {
73
  "Title": "Example Paper Title",
74
  "Author": "John Doe, Jane Smith",
75
76
  "Affiliations": "School of AI, University of Example"
77
  }
78
+
 
79
  Now, extract the metadata from this document:
 
 
80
  {text}
 
81
  """
82
  )
83
 
84
  # Run LLM Metadata Extraction
85
  metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
 
86
 
87
+ # Debugging: Log the LLM input
88
+ st.subheader("πŸ” LLM Input for Metadata Extraction")
89
+ st.json({"text": first_page_text})
90
+
91
  try:
92
+ metadata_response = metadata_chain.invoke({"text": first_page_text})
93
+
94
+ # Debugging: Log raw LLM response
95
+ st.subheader("πŸ” Raw LLM Response")
96
+ st.json(metadata_response)
 
97
 
98
+ # Handle JSON extraction from LLM response
99
  try:
100
+ metadata_dict = json.loads(metadata_response["metadata"])
101
  except json.JSONDecodeError:
102
+ try:
103
+ # Attempt to clean up JSON if needed
104
+ metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))
105
+ except json.JSONDecodeError:
106
+ metadata_dict = {
107
+ "Title": "Unknown",
108
+ "Author": "Unknown",
109
+ "Emails": "No emails found",
110
+ "Affiliations": "No affiliations found"
111
+ }
112
+
113
+ except Exception as e:
114
+ st.error(f"❌ LLM Metadata Extraction Failed: {e}")
115
+ metadata_dict = {
116
+ "Title": "Unknown",
117
+ "Author": "Unknown",
118
+ "Emails": "No emails found",
119
+ "Affiliations": "No affiliations found"
120
+ }
121
 
122
  # Ensure all required fields exist
123
  required_fields = ["Title", "Author", "Emails", "Affiliations"]
124
  for field in required_fields:
125
+ metadata_dict.setdefault(field, "Unknown")
126
+
127
+ # Streamlit Debugging: Display Final Extracted Metadata
128
+ st.subheader("βœ… Extracted Metadata")
129
+ st.json(metadata_dict)
130
 
131
  return metadata_dict
132