to_delete / dreamcoder /domains /list /main.py

Fraser-Greenlee

add dreamcoder codebase

e1c1753 over 3 years ago

16 kB

	import random
	from collections import defaultdict
	import json
	import math
	import os
	import datetime

	from dreamcoder.dreamcoder import explorationCompression
	from dreamcoder.utilities import eprint, flatten, testTrainSplit
	from dreamcoder.grammar import Grammar
	from dreamcoder.task import Task
	from dreamcoder.type import Context, arrow, tbool, tlist, tint, t0, UnificationFailure
	from dreamcoder.domains.list.listPrimitives import basePrimitives, primitives, McCarthyPrimitives, bootstrapTarget_extra, no_length
	from dreamcoder.domains.list.makeListTasks import make_list_bootstrap_tasks, sortBootstrap, EASYLISTTASKS


	def retrieveJSONTasks(filename, features=False):
	"""
	For JSON of the form:
	{"name": str,
	"type": {"input" : bool\|int\|list-of-bool\|list-of-int,
	"output": bool\|int\|list-of-bool\|list-of-int},
	"examples": [{"i": data, "o": data}]}
	"""
	with open(filename, "r") as f:
	loaded = json.load(f)
	TP = {
	"bool": tbool,
	"int": tint,
	"list-of-bool": tlist(tbool),
	"list-of-int": tlist(tint),
	}
	return [Task(
	item["name"],
	arrow(TP[item["type"]["input"]], TP[item["type"]["output"]]),
	[((ex["i"],), ex["o"]) for ex in item["examples"]],
	features=(None if not features else list_features(
	[((ex["i"],), ex["o"]) for ex in item["examples"]])),
	cache=False,
	) for item in loaded]


	def list_features(examples):
	if any(isinstance(i, int) for (i,), _ in examples):
	# obtain features for number inputs as list of numbers
	examples = [(([i],), o) for (i,), o in examples]
	elif any(not isinstance(i, list) for (i,), _ in examples):
	# can't handle non-lists
	return []
	elif any(isinstance(x, list) for (xs,), _ in examples for x in xs):
	# nested lists are hard to extract features for, so we'll
	# obtain features as if flattened
	examples = [(([x for xs in ys for x in xs],), o)
	for (ys,), o in examples]

	# assume all tasks have the same number of examples
	# and all inputs are lists
	features = []
	ot = type(examples[0][1])

	def mean(l): return 0 if not l else sum(l) / len(l)
	imean = [mean(i) for (i,), o in examples]
	ivar = [sum((v - imean[idx])**2
	for v in examples[idx][0][0])
	for idx in range(len(examples))]

	# DISABLED length of each input and output
	# total difference between length of input and output
	# DISABLED normalized count of numbers in input but not in output
	# total normalized count of numbers in input but not in output
	# total difference between means of input and output
	# total difference between variances of input and output
	# output type (-1=bool, 0=int, 1=list)
	# DISABLED outputs if integers, else -1s
	# DISABLED outputs if bools (-1/1), else 0s
	if ot == list: # lists of ints or bools
	omean = [mean(o) for (i,), o in examples]
	ovar = [sum((v - omean[idx])**2
	for v in examples[idx][1])
	for idx in range(len(examples))]

	def cntr(
	l, o): return 0 if not l else len(
	set(l).difference(
	set(o))) / len(l)
	cnt_not_in_output = [cntr(i, o) for (i,), o in examples]

	#features += [len(i) for (i,), o in examples]
	#features += [len(o) for (i,), o in examples]
	features.append(sum(len(i) - len(o) for (i,), o in examples))
	#features += cnt_not_int_output
	features.append(sum(cnt_not_in_output))
	features.append(sum(om - im for im, om in zip(imean, omean)))
	features.append(sum(ov - iv for iv, ov in zip(ivar, ovar)))
	features.append(1)
	# features += [-1 for _ in examples]
	# features += [0 for _ in examples]
	elif ot == bool:
	outs = [o for (i,), o in examples]

	#features += [len(i) for (i,), o in examples]
	#features += [-1 for _ in examples]
	features.append(sum(len(i) for (i,), o in examples))
	#features += [0 for _ in examples]
	features.append(0)
	features.append(sum(imean))
	features.append(sum(ivar))
	features.append(-1)
	# features += [-1 for _ in examples]
	# features += [1 if o else -1 for o in outs]
	else: # int
	def cntr(
	l, o): return 0 if not l else len(
	set(l).difference(
	set(o))) / len(l)
	cnt_not_in_output = [cntr(i, [o]) for (i,), o in examples]
	outs = [o for (i,), o in examples]

	#features += [len(i) for (i,), o in examples]
	#features += [1 for (i,), o in examples]
	features.append(sum(len(i) for (i,), o in examples))
	#features += cnt_not_int_output
	features.append(sum(cnt_not_in_output))
	features.append(sum(o - im for im, o in zip(imean, outs)))
	features.append(sum(ivar))
	features.append(0)
	# features += outs
	# features += [0 for _ in examples]

	return features


	def isListFunction(tp):
	try:
	Context().unify(tp, arrow(tlist(tint), t0))
	return True
	except UnificationFailure:
	return False


	def isIntFunction(tp):
	try:
	Context().unify(tp, arrow(tint, t0))
	return True
	except UnificationFailure:
	return False

	try:
	from dreamcoder.recognition import RecurrentFeatureExtractor
	class LearnedFeatureExtractor(RecurrentFeatureExtractor):
	H = 64

	special = None

	def tokenize(self, examples):
	def sanitize(l): return [z if z in self.lexicon else "?"
	for z_ in l
	for z in (z_ if isinstance(z_, list) else [z_])]

	tokenized = []
	for xs, y in examples:
	if isinstance(y, list):
	y = ["LIST_START"] + y + ["LIST_END"]
	else:
	y = [y]
	y = sanitize(y)
	if len(y) > self.maximumLength:
	return None

	serializedInputs = []
	for xi, x in enumerate(xs):
	if isinstance(x, list):
	x = ["LIST_START"] + x + ["LIST_END"]
	else:
	x = [x]
	x = sanitize(x)
	if len(x) > self.maximumLength:
	return None
	serializedInputs.append(x)

	tokenized.append((tuple(serializedInputs), y))

	return tokenized

	def __init__(self, tasks, testingTasks=[], cuda=False):
	self.lexicon = set(flatten((t.examples for t in tasks + testingTasks), abort=lambda x: isinstance(
	x, str))).union({"LIST_START", "LIST_END", "?"})

	# Calculate the maximum length
	self.maximumLength = float('inf') # Believe it or not this is actually important to have here
	self.maximumLength = max(len(l)
	for t in tasks + testingTasks
	for xs, y in self.tokenize(t.examples)
	for l in [y] + [x for x in xs])

	self.recomputeTasks = True

	super(
	LearnedFeatureExtractor,
	self).__init__(
	lexicon=list(
	self.lexicon),
	tasks=tasks,
	cuda=cuda,
	H=self.H,
	bidirectional=True)
	except: pass

	def train_necessary(t):
	if t.name in {"head", "is-primes", "len", "pop", "repeat-many", "tail", "keep primes", "keep squares"}:
	return True
	if any(t.name.startswith(x) for x in {
	"add-k", "append-k", "bool-identify-geq-k", "count-k", "drop-k",
	"empty", "evens", "has-k", "index-k", "is-mod-k", "kth-largest",
	"kth-smallest", "modulo-k", "mult-k", "remove-index-k",
	"remove-mod-k", "repeat-k", "replace-all-with-index-k", "rotate-k",
	"slice-k-n", "take-k",
	}):
	return "some"
	return False


	def list_options(parser):
	parser.add_argument(
	"--noMap", action="store_true", default=False,
	help="Disable built-in map primitive")
	parser.add_argument(
	"--noUnfold", action="store_true", default=False,
	help="Disable built-in unfold primitive")
	parser.add_argument(
	"--noLength", action="store_true", default=False,
	help="Disable built-in length primitive")
	parser.add_argument(
	"--dataset",
	type=str,
	default="Lucas-old",
	choices=[
	"bootstrap",
	"sorting",
	"Lucas-old",
	"Lucas-depth1",
	"Lucas-depth2",
	"Lucas-depth3"])
	parser.add_argument("--maxTasks", type=int,
	default=None,
	help="truncate tasks to fit within this boundary")
	parser.add_argument("--primitives",
	default="common",
	help="Which primitive set to use",
	choices=["McCarthy", "base", "rich", "common", "noLength"])
	parser.add_argument("--extractor", type=str,
	choices=["hand", "deep", "learned"],
	default="learned")
	parser.add_argument("--split", metavar="TRAIN_RATIO",
	type=float,
	help="split test/train")
	parser.add_argument("-H", "--hidden", type=int,
	default=64,
	help="number of hidden units")
	parser.add_argument("--random-seed", type=int, default=17)


	def main(args):
	"""
	Takes the return value of the `commandlineArguments()` function as input and
	trains/tests the model on manipulating sequences of numbers.
	"""
	random.seed(args.pop("random_seed"))

	dataset = args.pop("dataset")
	tasks = {
	"Lucas-old": lambda: retrieveJSONTasks("data/list_tasks.json") + sortBootstrap(),
	"bootstrap": make_list_bootstrap_tasks,
	"sorting": sortBootstrap,
	"Lucas-depth1": lambda: retrieveJSONTasks("data/list_tasks2.json")[:105],
	"Lucas-depth2": lambda: retrieveJSONTasks("data/list_tasks2.json")[:4928],
	"Lucas-depth3": lambda: retrieveJSONTasks("data/list_tasks2.json"),
	}[dataset]()

	maxTasks = args.pop("maxTasks")
	if maxTasks and len(tasks) > maxTasks:
	necessaryTasks = [] # maxTasks will not consider these
	if dataset.startswith("Lucas2.0") and dataset != "Lucas2.0-depth1":
	necessaryTasks = tasks[:105]

	eprint("Unwilling to handle {} tasks, truncating..".format(len(tasks)))
	random.shuffle(tasks)
	del tasks[maxTasks:]
	tasks = necessaryTasks + tasks

	if dataset.startswith("Lucas"):
	# extra tasks for filter
	tasks.extend([
	Task("remove empty lists",
	arrow(tlist(tlist(tbool)), tlist(tlist(tbool))),
	[((ls,), list(filter(lambda l: len(l) > 0, ls)))
	for _ in range(15)
	for ls in [[[random.random() < 0.5 for _ in range(random.randint(0, 3))]
	for _ in range(4)]]]),
	Task("keep squares",
	arrow(tlist(tint), tlist(tint)),
	[((xs,), list(filter(lambda x: int(math.sqrt(x)) ** 2 == x,
	xs)))
	for _ in range(15)
	for xs in [[random.choice([0, 1, 4, 9, 16, 25])
	if random.random() < 0.5
	else random.randint(0, 9)
	for _ in range(7)]]]),
	Task("keep primes",
	arrow(tlist(tint), tlist(tint)),
	[((xs,), list(filter(lambda x: x in {2, 3, 5, 7, 11, 13, 17,
	19, 23, 29, 31, 37}, xs)))
	for _ in range(15)
	for xs in [[random.choice([2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37])
	if random.random() < 0.5
	else random.randint(0, 9)
	for _ in range(7)]]]),
	])
	for i in range(4):
	tasks.extend([
	Task("keep eq %s" % i,
	arrow(tlist(tint), tlist(tint)),
	[((xs,), list(filter(lambda x: x == i, xs)))
	for _ in range(15)
	for xs in [[random.randint(0, 6) for _ in range(5)]]]),
	Task("remove eq %s" % i,
	arrow(tlist(tint), tlist(tint)),
	[((xs,), list(filter(lambda x: x != i, xs)))
	for _ in range(15)
	for xs in [[random.randint(0, 6) for _ in range(5)]]]),
	Task("keep gt %s" % i,
	arrow(tlist(tint), tlist(tint)),
	[((xs,), list(filter(lambda x: x > i, xs)))
	for _ in range(15)
	for xs in [[random.randint(0, 6) for _ in range(5)]]]),
	Task("remove gt %s" % i,
	arrow(tlist(tint), tlist(tint)),
	[((xs,), list(filter(lambda x: not x > i, xs)))
	for _ in range(15)
	for xs in [[random.randint(0, 6) for _ in range(5)]]])
	])

	def isIdentityTask(t):
	return all( len(xs) == 1 and xs[0] == y for xs, y in t.examples )
	eprint("Removed", sum(isIdentityTask(t) for t in tasks), "tasks that were just the identity function")
	tasks = [t for t in tasks if not isIdentityTask(t) ]

	prims = {"base": basePrimitives,
	"McCarthy": McCarthyPrimitives,
	"common": bootstrapTarget_extra,
	"noLength": no_length,
	"rich": primitives}[args.pop("primitives")]()
	haveLength = not args.pop("noLength")
	haveMap = not args.pop("noMap")
	haveUnfold = not args.pop("noUnfold")
	eprint(f"Including map as a primitive? {haveMap}")
	eprint(f"Including length as a primitive? {haveLength}")
	eprint(f"Including unfold as a primitive? {haveUnfold}")
	baseGrammar = Grammar.uniform([p
	for p in prims
	if (p.name != "map" or haveMap) and \
	(p.name != "unfold" or haveUnfold) and \
	(p.name != "length" or haveLength)])

	extractor = {
	"learned": LearnedFeatureExtractor,
	}[args.pop("extractor")]
	extractor.H = args.pop("hidden")

	timestamp = datetime.datetime.now().isoformat()
	outputDirectory = "experimentOutputs/list/%s"%timestamp
	os.system("mkdir -p %s"%outputDirectory)

	args.update({
	"featureExtractor": extractor,
	"outputPrefix": "%s/list"%outputDirectory,
	"evaluationTimeout": 0.0005,
	})


	eprint("Got {} list tasks".format(len(tasks)))
	split = args.pop("split")
	if split:
	train_some = defaultdict(list)
	for t in tasks:
	necessary = train_necessary(t)
	if not necessary:
	continue
	if necessary == "some":
	train_some[t.name.split()[0]].append(t)
	else:
	t.mustTrain = True
	for k in sorted(train_some):
	ts = train_some[k]
	random.shuffle(ts)
	ts.pop().mustTrain = True

	test, train = testTrainSplit(tasks, split)
	if True:
	test = [t for t in test
	if t.name not in EASYLISTTASKS]

	eprint(
	"Alotted {} tasks for training and {} for testing".format(
	len(train), len(test)))
	else:
	train = tasks
	test = []

	explorationCompression(baseGrammar, train, testingTasks=test, **args)