# grammar_correction/dataset_preprocess.py
import os
import csv
from datasets import load_dataset

# Cleanup pairs applied to the raw dataset text: drop the stray space the
# corpora insert before punctuation and rejoin a handful of digit sequences
# that appear split with spaces (e.g. "2 0 0 6" -> "2006").
REPLACEMENTS = [
    (" .", "."),
    (" ,", ","),
    (" '", "'"),
    (" ?", "?"),
    (" !", "!"),
    (" :", ":"),
    (" ;", ";"),
    (" n't", "n't"),
    ("2 0 0 6", "2006"),
    ("5 5", "55"),
    ("4 0 0", "400"),
    ("1 7-5 0", "1750"),
    ("2 0 %", "20%"),
    ("5 0", "50"),
    ("1 2", "12"),
    ("1 0", "10"),
    ('" ballast water', '"ballast water'),
]


def remove_excess_spaces(text):
    """Apply every (old, new) pair in REPLACEMENTS to clean up a sentence."""
    for old, new in REPLACEMENTS:
        text = text.replace(old, new)
    return text


def generate_csv(csv_path, dataset):
    """Write a JFLEG split to csv_path as (input, target) rows.

    Every sentence is prefixed with "grammar: " and paired with each of its
    human corrections, so one source sentence can produce several rows.
    """
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for case in dataset:
            input_text = "grammar: " + case["sentence"]
            input_text = remove_excess_spaces(input_text)
            for correction in case["corrections"]:
                correction = remove_excess_spaces(correction)
                # Skip rows where either side is empty.
                if input_text and correction:
                    writer.writerow([input_text, correction])


# JFLEG has no train split, so its validation split is used for training and
# its test split for evaluation.
train_dataset = load_dataset("jfleg", split="validation[:]")
eval_dataset = load_dataset("jfleg", split="test[:]")

generate_csv("Dataset/JFLEG/train.csv", train_dataset)
generate_csv("Dataset/JFLEG/eval.csv", eval_dataset)

# Stream the C4_200M grammar-correction corpus so the full dataset is never
# downloaded; only the examples that are actually consumed get fetched.
c4_dataset = load_dataset("liweili/c4_200m", split="train", streaming=True)


def c4_generate_csv(csv_path, iterator, num_examples):
    """Write up to num_examples (input, target) rows from the streamed C4_200M iterator."""
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for _ in range(num_examples):
            try:
                data = next(iterator)
            except StopIteration:
                # The stream was exhausted before num_examples rows were written.
                break
            input_text = remove_excess_spaces("grammar: " + data["input"])
            correction = remove_excess_spaces(data["output"])
            if input_text and correction:
                writer.writerow([input_text, correction])


c4_iterator = iter(c4_dataset)
c4_generate_csv("Dataset/C4_200M/c4data.csv", c4_iterator, num_examples=3500)
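
# Quick sanity check of the generated files (a minimal sketch, not part of the
# original preprocessing pipeline). It reuses the csv module already imported
# above to print each file's row count and first data row, so the
# "input"/"target" layout can be verified without opening the files by hand.
for path in ("Dataset/JFLEG/train.csv", "Dataset/JFLEG/eval.csv", "Dataset/C4_200M/c4data.csv"):
    with open(path, newline='', encoding='utf-8') as csvfile:
        rows = list(csv.reader(csvfile))
    print(f"{path}: {len(rows) - 1} data rows")
    if len(rows) > 1:
        print("  sample:", rows[1])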