Spaces:
Running
Running
import csv
import os
from itertools import islice

from datasets import load_dataset
# Ordered (old, new) substring pairs consumed by remove_excess_spaces().
# The pairs are applied one after another with str.replace, so each pair
# operates on the text produced by the pairs above it; keep the order stable.
REPLACEMENTS = [
    # Drop the space that precedes a punctuation mark.
    (" .", "."),
    (" ,", ","),
    (" '", "'"),
    (" ?", "?"),
    (" !", "!"),
    (" :", ":"),
    (" ;", ";"),
    # Re-attach a detached "n't" suffix to the preceding word.
    (" n't", "n't"),
    # Literal digit sequences that appear split by spaces.
    # NOTE(review): these look like fixes for specific sentences in the source
    # data; they match anywhere in the text -- confirm that is acceptable.
    ("2 0 0 6", "2006"),
    ("5 5", "55"),
    ("4 0 0", "400"),
    # NOTE(review): this pair also removes the hyphen -- confirm intended.
    ("1 7-5 0", "1750"),
    ("2 0 %", "20%"),
    ("5 0", "50"),
    ("1 2", "12"),
    ("1 0", "10"),
    # Drop the space after an opening double quote for this one phrase.
    ('" ballast water', '"ballast water')
]
def remove_excess_spaces(text, replacements=None):
    """Return *text* with every ``(old, new)`` replacement applied in order.

    The pairs are applied sequentially with ``str.replace``, so a later pair
    sees the output of the earlier ones.

    Args:
        text: The string to clean up.
        replacements: Optional iterable of ``(old, new)`` string pairs. When
            omitted (``None``), the module-level ``REPLACEMENTS`` table is
            used. An empty iterable leaves *text* unchanged.

    Returns:
        The cleaned string.
    """
    if replacements is None:
        replacements = REPLACEMENTS
    for old, new in replacements:
        text = text.replace(old, new)
    return text
def generate_csv(csv_path, dataset):
    """Write ``input``/``target`` rows built from *dataset* to a CSV file.

    For every item, the ``"sentence"`` value is prefixed with ``"grammar: "``
    and cleaned with ``remove_excess_spaces``; one row is then written per
    non-empty cleaned entry of the item's ``"corrections"``. The file always
    starts with the header row ``input,target``.

    Args:
        csv_path: Destination file. Missing parent directories are created;
            a bare filename (no directory part) is accepted.
        dataset: Iterable of mappings with a ``"sentence"`` string and a
            ``"corrections"`` iterable of strings. Items whose sentence is
            empty or ``None`` are skipped.
    """
    parent_dir = os.path.dirname(csv_path)
    # dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for case in dataset:
            sentence = case["sentence"]
            # The prefixed text can never be empty, so emptiness has to be
            # checked on the raw sentence before the prefix is added.
            if not sentence:
                continue
            input_text = remove_excess_spaces("grammar: " + sentence)
            for correction in case["corrections"]:
                correction = remove_excess_spaces(correction)
                if correction:
                    writer.writerow([input_text, correction])
# JFLEG export: the "validation" split is written out as the training CSV and
# the "test" split as the evaluation CSV.
train_dataset = load_dataset("jfleg", split="validation[:]")
eval_dataset = load_dataset("jfleg", split="test[:]")
generate_csv("Dataset/JFLEG/train.csv", train_dataset)
generate_csv("Dataset/JFLEG/eval.csv", eval_dataset)

# C4_200M source: the train split is requested with streaming=True and is
# consumed through iter() further down in this script.
c4_dataset = load_dataset("liweili/c4_200m", split="train", streaming=True)
def c4_generate_csv(csv_path, iterator, num_examples):
    """Write up to *num_examples* ``input``/``target`` rows to a CSV file.

    At most *num_examples* items are taken from *iterator*. For each one the
    ``"input"`` value is prefixed with ``"grammar: "`` and cleaned with
    ``remove_excess_spaces``, the ``"output"`` value is cleaned the same way,
    and a row is written when both are non-empty. Skipped items still count
    towards *num_examples*. The file always starts with the header row
    ``input,target``.

    Args:
        csv_path: Destination file. Missing parent directories are created;
            a bare filename (no directory part) is accepted.
        iterator: Iterator (or any iterable) of mappings with ``"input"`` and
            ``"output"`` strings. Iteration stops early if it is exhausted.
        num_examples: Maximum number of items to consume; values below zero
            are treated as zero.
    """
    parent_dir = os.path.dirname(csv_path)
    # dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        # islice stops after num_examples items or when the source runs out,
        # whichever comes first; it rejects negative counts, hence max().
        for data in islice(iterator, max(num_examples, 0)):
            source_text = data["input"]
            # The prefixed text can never be empty, so emptiness has to be
            # checked on the raw input before the prefix is added.
            if not source_text:
                continue
            input_text = remove_excess_spaces("grammar: " + source_text)
            correction = remove_excess_spaces(data["output"])
            if correction:
                writer.writerow([input_text, correction])
# C4_200M export: read up to 3500 examples from the streamed dataset and write
# them to the C4 CSV.
c4_iterator = iter(c4_dataset)
c4_generate_csv("Dataset/C4_200M/c4data.csv", c4_iterator, num_examples=3500)