ai / sms_spam /utils.py
kevinhug's picture
llm evals
4c3d0df
import re
import json
# sometimes models will enclose the JSON in markdown! (e.g. ```json)
# this function removes those delimiters should they be there
def json_completion(completion):
completion = re.sub(r'^```json\n', '', completion.strip())
completion = re.sub(r'\n```$', '', completion)
return completion
class InvalidQueryException(Exception):
def __init__(self, message, query=None):
self.message = message
self.query = query
if query:
self.message += f"\nQuery: {self.query}"
super().__init__(self.message)
def is_valid(query_spec:str, columns:str, check_runnable=True):
"Test if a query is valid"
try:
check_query(query_spec, columns, check_runnable)
return True
except (KeyError, InvalidQueryException):
return False
def check_query(query_spec:str, columns:str, check_runnable=True):
"Raise an exception if a query is invalid."
query_spec = query_spec.replace("'", '"')
try:
spec = json.loads(query_spec)
except json.decoder.JSONDecodeError:
raise InvalidQueryException(f"JSON parsing error:\n{query_spec}", query_spec)
valid_calculate_ops = [
"COUNT",
"COUNT_DISTINCT",
"HEATMAP",
"CONCURRENCY",
"SUM",
"AVG",
"MAX",
"MIN",
"P001",
"P01",
"P05",
"P10",
"P25",
"P50",
"P75",
"P90",
"P95",
"P99",
"P999",
"RATE_AVG",
"RATE_SUM",
"RATE_MAX",
]
valid_filter_ops = [
"=",
"!=",
">",
">=",
"<",
"<=",
"starts-with",
"does-not-start-with",
"exists",
"does-not-exist",
"contains",
"does-not-contain",
"in",
"not-in",
]
if spec == {} or isinstance(spec, float):
raise InvalidQueryException("Query spec cannot be empty.", query_spec)
if isinstance(spec, str):
raise InvalidQueryException("Query spec was not parsed to json.", query_spec)
if "calculations" in spec:
for calc in spec["calculations"]:
if "op" not in calc:
raise InvalidQueryException(f"{calc}: Calculation must have an op.", query_spec)
if calc["op"] not in valid_calculate_ops:
raise InvalidQueryException(f"Invalid calculation: {calc['op']}", query_spec)
if calc["op"] == "COUNT" or calc["op"] == "CONCURRENCY":
if "column" in calc:
raise InvalidQueryException(f"{calc}: {calc['op']} cannot take a column as input.", query_spec)
else:
if "column" not in calc:
raise InvalidQueryException(f"{calc}: {calc['op']} must take a column as input.", query_spec)
if check_runnable and calc["column"] not in columns:
raise InvalidQueryException(f"Invalid column: {calc['column']}", query_spec)
if "filters" in spec:
for filter in spec["filters"]:
if not isinstance(filter, dict):
raise InvalidQueryException("filter of type other than dict found in query.", query_spec)
if "op" not in filter:
raise InvalidQueryException("No op found in filter.", query_spec)
if filter["op"] not in valid_filter_ops:
raise InvalidQueryException(f"Invalid filter: {filter['op']}", query_spec)
if check_runnable and filter["column"] not in columns:
raise InvalidQueryException(f"Invalid column: {filter['column']}", query_spec)
if filter["op"] == "exists" or filter["op"] == "does-not-exist":
if "value" in filter:
raise InvalidQueryException(f"{filter}: {filter['op']} cannot take a value as input.", query_spec)
else:
if filter["op"] == "in" or filter["op"] == "not-in":
if not isinstance(filter["value"], list):
raise InvalidQueryException(f"{filter}: {filter['op']} must take a list as input.", query_spec)
else:
if "value" not in filter:
raise InvalidQueryException(f"{filter}: {filter['op']} must take a value as input.", query_spec)
if "filter_combination" in spec:
if isinstance(spec["filter_combination"], str) and spec[
"filter_combination"
].lower() not in ["and", "or"]:
raise InvalidQueryException(f"Invalid filter combination: {spec['filter_combination']}", query_spec)
if "breakdowns" in spec:
for breakdown in spec["breakdowns"]:
if check_runnable and breakdown not in columns:
raise InvalidQueryException(f"Invalid column: {breakdown}", query_spec)
if "orders" in spec:
for order in spec["orders"]:
if "order" not in order:
raise InvalidQueryException(f"Invalid order without orders key: {query_spec}")
if order["order"] != "ascending" and order["order"] != "descending":
raise InvalidQueryException(f"Invalid order: {order['order']}", query_spec)
if "op" in order:
if order["op"] not in valid_calculate_ops:
raise InvalidQueryException(f"Invalid order: {order['op']}", query_spec)
if not any(calc["op"] == order["op"] for calc in spec.get("calculations", [])):
raise InvalidQueryException(f"{order}: Order op must be present in calculations: {order['op']}", query_spec)
if order["op"] == "COUNT" or order["op"] == "CONCURRENCY":
if "column" in order:
raise InvalidQueryException(f"{order}: {order['op']} cannot take a column as input.", query_spec)
else:
if "column" not in order:
raise InvalidQueryException(f"{order}: {order['op']} must take a column as input.", query_spec)
if check_runnable and order["column"] not in columns:
raise InvalidQueryException(f"{order}: Invalid column in order: {order['column']}", query_spec)
else:
if "column" not in order:
raise InvalidQueryException(f"{order}: Order must take a column or op as input.", query_spec)
if check_runnable and order["column"] not in columns:
raise InvalidQueryException(f"{order}: Invalid column in order: {order['column']}", query_spec)
if "havings" in spec:
for having in spec["havings"]:
if "calculate_op" not in having:
raise InvalidQueryException(f"{having}: Having must have a calculate_op.", query_spec)
if "value" not in having:
raise InvalidQueryException(f"{having}: Having must have a value.", query_spec)
if "op" not in having:
raise InvalidQueryException(f"{having}: Having must have an op.", query_spec)
if having["calculate_op"] == "HEATMAP":
raise InvalidQueryException("HEATMAP is not supported in having.", query_spec)
if (
having["calculate_op"] == "COUNT"
or having["calculate_op"] == "CONCURRENCY"
):
if "column" in having:
raise InvalidQueryException(f"{having}: {having['calculate_op']} cannot take a column as input.", query_spec)
else:
if "column" not in having:
raise InvalidQueryException(f"{having}: {having['calculate_op']} must take a column as input.", query_spec)
if check_runnable and having["column"] not in columns:
raise InvalidQueryException(f"{having}: Invalid column in having: {having['column']}", query_spec)
if "time_range" in spec:
if "start_time" in spec and "end_time" in spec:
raise InvalidQueryException("Time range cannot be specified with start_time and end_time.", query_spec)
if not isinstance(spec["time_range"], int):
raise InvalidQueryException(f"time_range must be an int: {spec['time_range']}", query_spec)
if "start_time" in spec:
if not isinstance(spec["start_time"], int):
raise InvalidQueryException(f"start_time must be an int: {spec['start_time']}", query_spec)
if "end_time" in spec:
if not isinstance(spec["end_time"], int):
raise InvalidQueryException(f"end_time must be an int: {spec['end_time']}", query_spec)
if "granularity" in spec:
if not isinstance(spec["granularity"], int):
raise InvalidQueryException(f"granularity must be an int: {spec['granularity']}", query_spec)
time_range = (
spec["time_range"]
if "time_range" in spec
else spec["end_time"] - spec["start_time"]
if "start_time" in spec and "end_time" in spec
else 7200
)
if spec["granularity"] > time_range / 10:
raise InvalidQueryException(f"granularity must be <= time_range / 10: {spec['granularity']}", query_spec)
if spec["granularity"] < time_range / 1000:
raise InvalidQueryException(f"granularity must be >= time_range / 1000: {spec['granularity']}", query_spec)
if "limit" in spec:
if not isinstance(spec["limit"], int):
raise InvalidQueryException(f"limit must be an int: {spec['limit']}", query_spec)