timeki committed · Commit 90eb0dc · 2 parents: 0bbee33 f3408f9

Merged in main (pull request #13)

climateqa/constants.py CHANGED
@@ -65,4 +65,39 @@ OWID_CATEGORIES = ['Access to Energy', 'Agricultural Production',
     'Oil Spills', 'Outdoor Air Pollution', 'Ozone Layer', 'Pandemics',
     'Pesticides', 'Plastic Pollution', 'Renewable Energy', 'Soil',
     'Transport', 'Urbanization', 'Waste Management', 'Water Pollution',
-    'Water Use & Stress', 'Wildfires']
+    'Water Use & Stress', 'Wildfires']
+
+
+DOCUMENT_METADATA_DEFAULT_VALUES = {
+    "chunk_type": "",
+    "document_id": "",
+    "document_number": 0.0,
+    "element_id": "",
+    "figure_code": "",
+    "file_size": "",
+    "image_path": "",
+    "n_pages": 0.0,
+    "name": "",
+    "num_characters": 0.0,
+    "num_tokens": 0.0,
+    "num_tokens_approx": 0.0,
+    "num_words": 0.0,
+    "page_number": 0,
+    "release_date": 0.0,
+    "report_type": "",
+    "section_header": "",
+    "short_name": "",
+    "source": "",
+    "toc_level0": "",
+    "toc_level1": "",
+    "toc_level2": "",
+    "toc_level3": "",
+    "url": "",
+    "similarity_score": 0.0,
+    "content": "",
+    "reranking_score": 0.0,
+    "query_used_for_retrieval": "",
+    "sources_used": [""],
+    "question_used": "",
+    "index_used": ""
+}
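For illustration, a minimal sketch of the normalization this defaults map enables (the raw dict and assertions are invented; the coercion mirrors the type(default_value)(value) pattern used by serialize_docs in climateqa/logging.py below):

from climateqa.constants import DOCUMENT_METADATA_DEFAULT_VALUES

# Coerce an arbitrary metadata dict to the canonical schema:
# missing fields fall back to their defaults, and present values
# are cast to the type of the corresponding default.
raw = {"page_number": "12", "name": "IPCC AR6 WGII", "n_pages": 3068}

normalized = {}
for field, default in DOCUMENT_METADATA_DEFAULT_VALUES.items():
    value = raw.get(field, default)
    try:
        normalized[field] = type(default)(value)
    except (TypeError, ValueError):
        normalized[field] = default

assert normalized["page_number"] == 12   # "12" cast via int
assert normalized["n_pages"] == 3068.0   # 3068 cast via float
assert normalized["chunk_type"] == ""    # absent field -> default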
climateqa/logging.py CHANGED
@@ -4,14 +4,45 @@ import json
 from huggingface_hub import HfApi
 import gradio as gr
 import csv
+import pandas as pd
+import io
+from typing import TypedDict, List
+from climateqa.constants import DOCUMENT_METADATA_DEFAULT_VALUES
+from langchain_core.documents import Document
 
-def serialize_docs(docs:list)->list:
+def serialize_docs(docs:list[Document])->list:
+    """Convert document objects to a simplified format compatible with Hugging Face datasets.
+
+    This function processes document objects by extracting their page content and metadata,
+    normalizing the metadata structure to ensure consistency. It applies default values
+    from DOCUMENT_METADATA_DEFAULT_VALUES for any missing metadata fields.
+
+    Args:
+        docs (list): List of document objects, each with page_content and metadata attributes
+
+    Returns:
+        list: List of dictionaries with standardized "page_content" and "metadata" fields
+    """
     new_docs = []
     for doc in docs:
-        new_doc = {}
-        new_doc["page_content"] = doc.page_content
-        new_doc["metadata"] = doc.metadata
+        # Make sure we have a clean doc format
+        new_doc = {
+            "page_content": doc.page_content,
+            "metadata": {}
+        }
+
+        # Ensure all metadata fields exist with defaults if missing
+        for field, default_value in DOCUMENT_METADATA_DEFAULT_VALUES.items():
+            new_value = doc.metadata.get(field, default_value)
+            try:
+                new_doc["metadata"][field] = type(default_value)(new_value)
+            except:
+                new_doc["metadata"][field] = default_value
+
         new_docs.append(new_doc)
+
+    if new_docs == []:
+        new_docs = [{"page_content": "No documents found", "metadata": DOCUMENT_METADATA_DEFAULT_VALUES}]
     return new_docs
 
 ## AZURE LOGGING - DEPRECATED
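A hedged usage sketch of the new serialize_docs (the Document below is invented for illustration; only its page_content and metadata attributes matter):

from langchain_core.documents import Document
from climateqa.logging import serialize_docs

doc = Document(
    page_content="Global surface temperature has risen faster since 1970 ...",
    metadata={"name": "IPCC AR6 SPM", "page_number": "7"},  # note: str, not int
)

serialized = serialize_docs([doc])
# Every field of DOCUMENT_METADATA_DEFAULT_VALUES is present, and
# page_number was coerced to int to match its default (0).
assert serialized[0]["metadata"]["page_number"] == 7
assert serialized[0]["metadata"]["chunk_type"] == ""

# An empty input yields the placeholder record added at the end of the function:
assert serialize_docs([])[0]["page_content"] == "No documents found"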
@@ -93,22 +124,37 @@ def serialize_docs(docs:list)->list:
 
 ## HUGGING FACE LOGGING
 
-def log_on_huggingface(log_filename, logs):
+def log_on_huggingface(log_filename, logs, log_type="chat"):
     """Log data to Hugging Face dataset repository.
 
     Args:
         log_filename (str): Name of the file to store logs
         logs (dict): Log data to store
+        log_type (str): Type of log to store
     """
     try:
-        # Get Hugging Face token from environment
-        hf_token = os.getenv("HF_LOGS_TOKEN")
-        if not hf_token:
-            print("HF_LOGS_TOKEN not found in environment variables")
-            return
+        if log_type =="chat":
+            # Get Hugging Face token from environment
+            hf_token = os.getenv("HF_LOGS_TOKEN")
+            if not hf_token:
+                print("HF_LOGS_TOKEN not found in environment variables")
+                return
+
+            # Get repository name from environment or use default
+            repo_id = os.getenv("HF_DATASET_REPO", "Ekimetrics/climateqa_logs")
+
+        elif log_type =="drias":
+            # Get Hugging Face token from environment
+            hf_token = os.getenv("HF_LOGS_DRIAS_TOKEN")
+            if not hf_token:
+                print("HF_LOGS_DRIAS_TOKEN not found in environment variables")
+                return
 
-        # Get repository name from environment or use default
-        repo_id = os.getenv("HF_DATASET_REPO", "timeki/climateqa_logs")
+            # Get repository name from environment or use default
+            repo_id = os.getenv("HF_DATASET_REPO_DRIAS", "Ekimetrics/climateqa_logs_talk_to_data")
+
+        else:
+            raise ValueError(f"Invalid log type: {log_type}")
 
         # Initialize HfApi
         api = HfApi(token=hf_token)
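The hunk stops right after the HfApi client is created; the upload itself is unchanged and outside this diff. For orientation, a plausible continuation using the huggingface_hub API (an assumption about the surrounding code, not part of this commit) would be:

import io
import json

# Hypothetical sketch of the upload step that follows in the function body.
buffer = io.BytesIO(json.dumps(logs).encode("utf-8"))
api.upload_file(
    path_or_fileobj=buffer,
    path_in_repo=log_filename,  # e.g. "chat/<timestamp>.json" or "drias/drias_<timestamp>.json"
    repo_id=repo_id,            # resolved above per log_type
    repo_type="dataset",
)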
@@ -158,10 +204,13 @@ def log_interaction_to_huggingface(history, output_query, sources, docs, share_c
                 "time": timestamp,
             }
             # Log to Hugging Face
-            log_on_huggingface(f"chat/{timestamp}.json", logs)
+            log_on_huggingface(f"chat/{timestamp}.json", logs, log_type="chat")
+            print(f"Logged interaction to Hugging Face")
+        else:
+            print("Did not log to Hugging Face because GRADIO_ENV is local")
     except Exception as e:
         print(f"Error logging to Hugging Face: {e}")
-        error_msg = f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
+        error_msg = f"ClimateQ&A Error: {str(e)[:100]})"
         raise gr.Error(error_msg)
 
 def log_drias_interaction_to_huggingface(query, sql_query, user_id):
@@ -182,7 +231,7 @@ def log_drias_interaction_to_huggingface(query, sql_query, user_id):
             "sql_query": sql_query,
             "time": timestamp,
         }
-        log_on_huggingface(f"drias/drias_{timestamp}.json", logs)
+        log_on_huggingface(f"drias/drias_{timestamp}.json", logs, log_type="drias")
        print(f"Logged Drias interaction to Hugging Face: {logs}")
     else:
         print("share_client or user_id is None, or GRADIO_ENV is local")
data/drias/drias.db DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1e29ba55d0122dc034b76113941769b44214355d4528bcc5b3d8f71f3c50bf59
-size 280621056