# grammar_correction/dataset_preprocess.py
import os
import csv
from datasets import load_dataset

# Cleanup pairs applied to the raw dataset text: drop the stray space the
# corpora insert before punctuation and rejoin a handful of digit sequences
# that appear split with spaces (e.g. "2 0 0 6" -> "2006").
REPLACEMENTS = [
    (" .", "."),
    (" ,", ","),
    (" '", "'"),
    (" ?", "?"),
    (" !", "!"),
    (" :", ":"),
    (" ;", ";"),
    (" n't", "n't"),
    ("2 0 0 6", "2006"),
    ("5 5", "55"),
    ("4 0 0", "400"),
    ("1 7-5 0", "1750"),
    ("2 0 %", "20%"),
    ("5 0", "50"),
    ("1 2", "12"),
    ("1 0", "10"),
    ('" ballast water', '"ballast water'),
]


def remove_excess_spaces(text):
    """Apply every (old, new) pair in REPLACEMENTS to clean up a sentence."""
    for old, new in REPLACEMENTS:
        text = text.replace(old, new)
    return text


def generate_csv(csv_path, dataset):
    """Write a JFLEG split to csv_path as (input, target) rows.

    Every sentence is prefixed with "grammar: " and paired with each of its
    human corrections, so one source sentence can produce several rows.
    """
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for case in dataset:
            input_text = "grammar: " + case["sentence"]
            input_text = remove_excess_spaces(input_text)
            for correction in case["corrections"]:
                correction = remove_excess_spaces(correction)
                # Skip rows where either side is empty.
                if input_text and correction:
                    writer.writerow([input_text, correction])


# JFLEG has no train split, so its validation split is used for training and
# its test split for evaluation.
train_dataset = load_dataset("jfleg", split="validation[:]")
eval_dataset = load_dataset("jfleg", split="test[:]")

generate_csv("Dataset/JFLEG/train.csv", train_dataset)
generate_csv("Dataset/JFLEG/eval.csv", eval_dataset)

# Stream the C4_200M grammar-correction corpus so the full dataset is never
# downloaded; only the examples that are actually consumed get fetched.
c4_dataset = load_dataset("liweili/c4_200m", split="train", streaming=True)


def c4_generate_csv(csv_path, iterator, num_examples):
    """Write up to num_examples (input, target) rows from the streamed C4_200M iterator."""
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for _ in range(num_examples):
            try:
                data = next(iterator)
            except StopIteration:
                # The stream was exhausted before num_examples rows were written.
                break
            input_text = remove_excess_spaces("grammar: " + data["input"])
            correction = remove_excess_spaces(data["output"])
            if input_text and correction:
                writer.writerow([input_text, correction])


c4_iterator = iter(c4_dataset)
c4_generate_csv("Dataset/C4_200M/c4data.csv", c4_iterator, num_examples=3500)
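
# Quick sanity check of the generated files (a minimal sketch, not part of the
# original preprocessing pipeline). It reuses the csv module already imported
# above to print each file's row count and first data row, so the
# "input"/"target" layout can be verified without opening the files by hand.
for path in ("Dataset/JFLEG/train.csv", "Dataset/JFLEG/eval.csv", "Dataset/C4_200M/c4data.csv"):
    with open(path, newline='', encoding='utf-8') as csvfile:
        rows = list(csv.reader(csvfile))
    print(f"{path}: {len(rows) - 1} data rows")
    if len(rows) > 1:
        print("  sample:", rows[1])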