Spaces:
Running
Running
File size: 2,299 Bytes
03c77e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import os
import csv
from datasets import load_dataset
# Ordered (old, new) substring substitutions. They are applied sequentially, so
# order matters: longer patterns such as "2 0 0 6" must come before shorter
# ones such as "1 0" that could otherwise match inside them.
# NOTE(review): the digit and "ballast water" entries look like fixes for
# specific sentences in one corpus rather than general rules -- confirm before
# reusing this table on other data.
REPLACEMENTS = [
    # Re-attach punctuation and contractions that were split off by a space.
    (" .", "."),
    (" ,", ","),
    (" '", "'"),
    (" ?", "?"),
    (" !", "!"),
    (" :", ":"),
    (" ;", ";"),
    (" n't", "n't"),
    # Re-join specific numbers whose digits were separated by spaces.
    ("2 0 0 6", "2006"),
    ("5 5", "55"),
    ("4 0 0", "400"),
    ("1 7-5 0", "1750"),
    ("2 0 %", "20%"),
    ("5 0", "50"),
    ("1 2", "12"),
    ("1 0", "10"),
    ('" ballast water', '"ballast water'),
]


def remove_excess_spaces(text, replacements=None):
    """Return *text* with each (old, new) substitution applied in order.

    Args:
        text: The string to clean.
        replacements: Optional iterable of ``(old, new)`` string pairs. When
            omitted (``None``), the module-level ``REPLACEMENTS`` table is
            used, which is the original behaviour. An empty iterable leaves
            the text unchanged.

    Returns:
        The cleaned string.
    """
    if replacements is None:
        # Looked up at call time so later changes to REPLACEMENTS are honoured.
        replacements = REPLACEMENTS
    for old, new in replacements:
        text = text.replace(old, new)
    return text
def generate_csv(csv_path, dataset):
    """Write (input, target) training pairs from *dataset* to a CSV file.

    Each item of *dataset* must provide ``case["sentence"]`` (a string) and
    ``case["corrections"]`` (an iterable of strings). One row is written per
    non-empty correction, with the cleaned sentence prefixed by ``"grammar: "``
    as the input and the cleaned correction as the target.

    Args:
        csv_path: Destination file path. Its parent directory is created when
            missing; a bare filename writes into the current directory.
        dataset: Iterable of mapping-like records as described above.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for case in dataset:
            # Clean and test the sentence BEFORE adding the task prefix:
            # once prefixed the string is never empty, so the emptiness check
            # would always pass, and the prefix's trailing space could be
            # swallowed by a replacement (e.g. "grammar: ." -> "grammar:.").
            sentence = remove_excess_spaces(case["sentence"])
            if not sentence:
                continue
            input_text = "grammar: " + sentence
            for correction in case["corrections"]:
                correction = remove_excess_spaces(correction)
                if correction:
                    writer.writerow([input_text, correction])
# Load JFLEG. Its "validation" split is used here as the training set and its
# "test" split as the evaluation set; the "[:]" slice selects the whole split.
train_dataset = load_dataset("jfleg", split="validation[:]")
eval_dataset = load_dataset("jfleg", split="test[:]")
# Export both splits as (input, target) CSV files. This runs at module
# execution time, not behind a main guard.
generate_csv("Dataset/JFLEG/train.csv", train_dataset)
generate_csv("Dataset/JFLEG/eval.csv", eval_dataset)
# Open the C4_200M train split with streaming=True; it is consumed further
# down through iter()/next() rather than being indexed.
c4_dataset = load_dataset("liweili/c4_200m", split="train", streaming=True)
def c4_generate_csv(csv_path, iterator, num_examples):
    """Write up to *num_examples* records from *iterator* to a CSV file.

    Each record must provide ``data["input"]`` and ``data["output"]`` strings.
    The cleaned input is prefixed with ``"grammar: "`` and written together
    with the cleaned output as one ``input,target`` row.

    At most *num_examples* records are consumed from *iterator*. Records whose
    cleaned input or output is empty are skipped but still count toward that
    limit, so the file may contain fewer than *num_examples* rows.

    Args:
        csv_path: Destination file path. Its parent directory is created when
            missing; a bare filename writes into the current directory.
        iterator: An iterator (something accepted by ``next()``) of records.
        num_examples: Maximum number of records to read.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for _ in range(num_examples):
            # Keep the try body to the one call that signals exhaustion, so a
            # StopIteration raised elsewhere is not silently treated as
            # "end of data".
            try:
                data = next(iterator)
            except StopIteration:
                break
            # Clean and test the raw text BEFORE adding the task prefix:
            # once prefixed the string is never empty, and the prefix's
            # trailing space could be swallowed by a replacement.
            source = remove_excess_spaces(data["input"])
            correction = remove_excess_spaces(data["output"])
            if source and correction:
                writer.writerow(["grammar: " + source, correction])
# Read up to 3500 records from the start of the C4_200M stream and export them
# to a single CSV (fewer are read if the stream ends early).
c4_iterator = iter(c4_dataset)
c4_generate_csv("Dataset/C4_200M/c4data.csv", c4_iterator, num_examples=3500)
|