Spaces:

griptape
/

uw-teaching-effectiveness

Running

App Files Files

uw-teaching-effectiveness / griptape_statemachine /parsers /uw_csv_parser.py

kateforsberg

first commit

d477d5c 6 months ago

raw

history blame

8.41 kB

	# CSV parser: converts the sectioned CSV files of a project directory into entries of its config.yaml.
	from __future__ import annotations

	from ast import Lambda
	import contextlib
	import csv
	from pathlib import Path
	from typing import TYPE_CHECKING, Callable

	import yaml

	if TYPE_CHECKING:
	from io import TextIOWrapper


	class CsvParser:
	    """Build the sections of ``config.yaml`` from a directory of CSV files.

	    Each CSV file is one "section": the first cell of its first row names the
	    section (for example ``"Ruleset ID"``), and every following row is stored
	    as ``{first_cell: remaining_cells}``. The ``csv_*`` builder methods turn
	    those row lists into the dictionaries written under the ``rulesets``,
	    ``structures``, ``states``, ``prompts`` and ``events`` keys of the YAML
	    file.
	    """

	    # Model assigned to an agent whose row carries no model override.
	    DEFAULT_MODEL = "gpt-4o"

	    def __init__(self, directory: str) -> None:
	        """Locate ``config.yaml`` and the CSV files below ``directory``.

	        Args:
	            directory: Project directory, resolved against the current
	                working directory. It is expected to contain ``config.yaml``
	                and a ``csv_files`` sub-directory.
	        """
	        base = Path.cwd() / directory
	        self.yaml_path = base / "config.yaml"
	        self.csv_directory = base / "csv_files"
	        # Sorted so the files are processed in a deterministic order
	        # (glob order is arbitrary and a later file with the same section
	        # header replaces an earlier one).
	        self.csv_file_paths = sorted(
	            path for path in self.csv_directory.glob("*") if path.is_file()
	        )

	    @staticmethod
	    def _cell(cells: list, index: int) -> str:
	        """Return ``cells[index]``, or ``""`` when the row is too short."""
	        return cells[index] if index < len(cells) else ""

	    @staticmethod
	    def _split_ids(text: str) -> list:
	        """Split a comma-separated cell into stripped, non-empty identifiers."""
	        return [item.strip() for item in text.split(",") if item.strip()]

	    @staticmethod
	    def _entries(rows: list) -> list:
	        """Return ``(key, cells)`` pairs for every row that has a non-empty key.

	        The rows are read without being modified, so the same section data
	        can be passed to a builder more than once.
	        """
	        return [(key, cells) for row in rows for key, cells in row.items() if key]

	    def csv_parser(self) -> None:
	        """Read every CSV file, rebuild the YAML sections and save the file.

	        Sections whose CSV is missing are left untouched in ``config.yaml``
	        and a short notice is printed instead.

	        Raises:
	            FileNotFoundError: If ``config.yaml`` does not exist.
	        """
	        sections: dict = {}
	        for csv_path in self.csv_file_paths:
	            with csv_path.open("r", newline="") as handle:
	                self.split_csv(handle, sections)

	        # An empty config.yaml loads as None; start from an empty mapping.
	        yaml_data = yaml.safe_load(self.yaml_path.read_text()) or {}

	        # Only the presence of a section is checked here, so an error raised
	        # inside a builder is not mistaken for a missing section.
	        if "Ruleset ID" in sections:
	            yaml_data["rulesets"] = self.csv_rulesets(sections["Ruleset ID"])
	        else:
	            print("No rulesets")

	        if "Agent ID" in sections:
	            yaml_data["structures"] = self.csv_agents(sections["Agent ID"])
	        else:
	            print("No structures")

	        # Tailoring is optional and only affects the states section.
	        if "State ID" in sections:
	            yaml_data["states"] = self.csv_states(
	                sections["State ID"],
	                sections.get("State ID to Tailor", []),
	            )
	        else:
	            print(" no states")

	        if "Prompt ID" in sections:
	            yaml_data["prompts"] = self.csv_prompts(sections["Prompt ID"])
	        else:
	            print("no prompts")

	        if "Transition ID" in sections:
	            yaml_data["events"] = self.csv_transition_id(sections["Transition ID"])
	        else:
	            print("No transitions")

	        self.update_and_save(yaml_data)

	    def split_csv(self, csv_file: TextIOWrapper, all_information: dict) -> None:
	        """Read one CSV section and store its rows in ``all_information``.

	        The first cell of the first row is used as the section name. Every
	        later row becomes ``{first_cell: remaining_cells}``. Blank lines and
	        rows with an empty first cell are skipped, and a file without a
	        usable header is ignored.

	        Args:
	            csv_file: Open text handle (or any iterable of CSV lines).
	            all_information: Mapping updated in place with
	                ``section name -> list of row dicts``.
	        """
	        reader = csv.reader(csv_file)
	        header_row = next(reader, None)
	        if not header_row or not header_row[0]:
	            return
	        # csv.reader yields [] for blank lines, so check the row itself
	        # before looking at its first cell.
	        all_information[header_row[0]] = [
	            {row[0]: row[1:]} for row in reader if row and row[0]
	        ]

	    def csv_kbs(self, kb_info: list) -> dict:
	        """Map each key to ``{"file_path", "file_type"}`` from its first two cells.

	        Rows missing either cell are skipped.
	        """
	        knowledge_bases = {}
	        for key, cells in self._entries(kb_info):
	            file_path = self._cell(cells, 0)
	            file_type = self._cell(cells, 1)
	            if file_path and file_type:
	                knowledge_bases[key] = {
	                    "file_path": file_path,
	                    "file_type": file_type,
	                }
	        return knowledge_bases

	    def csv_rulesets(self, ruleset_info: list) -> dict:
	        """Map each ruleset id to ``{"name", "rules"}``.

	        Cell 0 is the ruleset name and cell 1 holds one rule per line. Each
	        line is stripped of surrounding whitespace, surrounding double quotes
	        and leading ``-``/space characters. Rows missing either cell are
	        skipped.
	        """
	        rulesets = {}
	        for key, cells in self._entries(ruleset_info):
	            name = self._cell(cells, 0)
	            raw_rules = self._cell(cells, 1)
	            if not (name and raw_rules):
	                continue
	            rules = [
	                line.strip().strip('"').lstrip("- ")
	                for line in raw_rules.split("\n")
	                if line.strip()
	            ]
	            rulesets[key] = {"name": name, "rules": rules}
	        return rulesets

	    def csv_prompts(self, prompt_info: list) -> dict:
	        """Map each prompt id to ``{"prompt"}`` plus an optional ``"author_intent"``.

	        Cell 0 is the prompt text (rows without it are skipped) and cell 1,
	        when present and non-empty, is stored as ``author_intent``.
	        """
	        prompts = {}
	        for key, cells in self._entries(prompt_info):
	            prompt = self._cell(cells, 0)
	            if not prompt:
	                continue
	            entry = {"prompt": prompt}
	            author_intent = self._cell(cells, 1)
	            if author_intent:
	                entry["author_intent"] = author_intent
	            prompts[key] = entry
	        return prompts

	    def csv_agents(self, agent_info: list) -> dict:
	        """Map each agent id to its configuration dictionary.

	        Cells read per row: 0 = comma-separated ruleset ids, 1 = vector store
	        used by the agent, 2 = default prompt id, 4 = model override. Cell 3
	        is not read by this parser. Missing or empty cells leave the
	        corresponding setting out (the model falls back to
	        ``DEFAULT_MODEL``).
	        """
	        agents = {}
	        for key, cells in self._entries(agent_info):
	            config = {
	                "model": self._cell(cells, 4) or self.DEFAULT_MODEL,
	                "ruleset_ids": self._split_ids(self._cell(cells, 0)),
	            }
	            vector_store = self._cell(cells, 1)
	            if vector_store:
	                config["vector_stores"] = [vector_store]
	            # Agent-wide prompt; a state-specific tailoring may set its own.
	            prompt_id = self._cell(cells, 2)
	            if prompt_id:
	                config["prompt_id"] = prompt_id
	            agents[key] = config
	        return agents

	    def csv_states(self, state_info: list, tailor_info: list) -> dict:
	        """Build the ``states`` section, applying per-state agent tailoring.

	        Args:
	            state_info: State rows; cell 0 is a comma-separated agent list, or
	                ``"none"``/empty for a state without agents. The ids
	                ``"start"`` and ``"end"`` are flagged ``initial`` and
	                ``final`` respectively.
	            tailor_info: Tailoring rows keyed by state id; cell 0 is the agent
	                name, cell 1 an optional comma-separated ruleset id list and
	                cell 2 an optional prompt id.

	        Returns:
	            Mapping of state id to its configuration.
	        """
	        states = {}
	        for key, cells in self._entries(state_info):
	            if key == "start":
	                states[key] = {"initial": True}
	            elif key == "end":
	                states[key] = {"final": True}
	            else:
	                states[key] = {}
	            agents_cell = self._cell(cells, 0)
	            if agents_cell and agents_cell != "none":
	                states[key]["structures"] = {
	                    name: {} for name in self._split_ids(agents_cell)
	                }

	        for state_id, cells in self._entries(tailor_info):
	            agent_name = self._cell(cells, 0).strip()
	            if not agent_name:
	                continue
	            # Extend the existing state entry instead of replacing it, so the
	            # "initial"/"final" flags of a tailored state are preserved.
	            structures = states.setdefault(state_id, {}).setdefault("structures", {})
	            agent_config = structures.setdefault(agent_name, {})
	            ruleset_ids = self._split_ids(self._cell(cells, 1))
	            if ruleset_ids:
	                agent_config["ruleset_ids"] = ruleset_ids
	            prompt_id = self._cell(cells, 2)
	            if prompt_id:
	                agent_config["prompt_id"] = prompt_id
	        return states

	    def csv_transition_id(self, transition_info: list) -> dict:
	        """Group transitions by event id.

	        Cell 0 is the source state and cell 1 the target state; rows missing
	        either are skipped. Rows sharing an event id are collected, in order,
	        in that event's ``"transitions"`` list.
	        """
	        events = {}
	        for key, cells in self._entries(transition_info):
	            source = self._cell(cells, 0)
	            target = self._cell(cells, 1)
	            if source and target:
	                event = events.setdefault(key, {"transitions": []})
	                event["transitions"].append({"from": source, "to": target})
	        return events

	    def update_and_save(self, config: dict) -> None:
	        """Overwrite ``config.yaml`` with ``config`` in block-style YAML."""
	        with self.yaml_path.open("w") as file:
	            yaml.dump(config, file, default_flow_style=False, line_break="\n")


	if __name__ == "__main__":
	    # Script entry point: rebuild config.yaml for the "uw_programmatic" project.
	    parser = CsvParser("uw_programmatic")
	    parser.csv_parser()