# File size: 2,299 Bytes
# 03c77e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import csv
import os
from itertools import islice

from datasets import load_dataset

# Ordered (old, new) substring pairs used to undo tokenizer-style spacing.
# The pairs are applied sequentially, so their order is significant: e.g.
# "1 7-5 0" must be handled before the shorter "5 0" pattern it contains.
REPLACEMENTS = [
    # Re-attach punctuation marks that are preceded by a stray space.
    *((" " + mark, mark) for mark in ".,'?!:;"),
    # Re-attach the detached negative contraction.
    (" n't", "n't"),
    # Hard-coded numbers whose digits were separated by spaces.
    ("2 0 0 6", "2006"),
    ("5 5", "55"),
    ("4 0 0", "400"),
    ("1 7-5 0", "1750"),
    ("2 0 %", "20%"),
    ("5 0", "50"),
    ("1 2", "12"),
    ("1 0", "10"),
    # One specific quoted phrase with a space after the opening quote.
    ('" ballast water', '"ballast water'),
]

def remove_excess_spaces(text):
    """Return *text* with every REPLACEMENTS pair applied, in list order.

    Each ``(old, new)`` pair is applied with ``str.replace`` to the result of
    the previous pair, so earlier replacements can affect later ones.
    """
    cleaned = text
    for pattern, substitute in REPLACEMENTS:
        cleaned = cleaned.replace(pattern, substitute)
    return cleaned

def generate_csv(csv_path, dataset):
    """Write grammar-correction pairs from *dataset* to a two-column CSV.

    The file starts with an ``input,target`` header. For every item of
    *dataset*, one row is written per entry of ``case["corrections"]``; the
    input column is ``"grammar: "`` followed by ``case["sentence"]``. Both
    columns are passed through ``remove_excess_spaces``. Items with an empty
    sentence and individual empty corrections are skipped.

    Args:
        csv_path: Destination path. Its parent directory is created when the
            path has one; a bare filename is written to the current directory.
        dataset: Iterable of mappings providing ``"sentence"`` (str) and
            ``"corrections"`` (iterable of str).
    """
    parent_dir = os.path.dirname(csv_path)
    # dirname() is "" for a bare filename and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for case in dataset:
            sentence = case["sentence"]
            # Test the raw sentence: once "grammar: " is prepended the input
            # is always truthy and an emptiness check could never fire.
            if not sentence:
                continue
            input_text = remove_excess_spaces("grammar: " + sentence)
            for correction in case["corrections"]:
                correction = remove_excess_spaces(correction)
                if correction:
                    writer.writerow([input_text, correction])

# Export JFLEG to CSV: the "validation" split is written as the training file
# and the "test" split as the evaluation file.
# NOTE(review): the "[:]" slice presumably selects the whole split — confirm.
train_dataset = load_dataset("jfleg", split="validation[:]")
eval_dataset = load_dataset("jfleg", split="test[:]")

generate_csv("Dataset/JFLEG/train.csv", train_dataset)
generate_csv("Dataset/JFLEG/eval.csv", eval_dataset)

# Open the C4_200M "train" split with streaming=True; it is iterated further
# down in this script rather than indexed.
c4_dataset = load_dataset("liweili/c4_200m", split="train", streaming=True)

def c4_generate_csv(csv_path, iterator, num_examples):
    """Write up to *num_examples* records from *iterator* to a two-column CSV.

    The file starts with an ``input,target`` header. Each record contributes
    at most one row: ``"grammar: "`` plus ``data["input"]`` as the input and
    ``data["output"]`` as the target, both passed through
    ``remove_excess_spaces``. Records whose input or output is empty are
    skipped but still count toward *num_examples*, so fewer rows than
    *num_examples* may be written. Iteration stops early if *iterator* runs
    out.

    Args:
        csv_path: Destination path. Its parent directory is created when the
            path has one; a bare filename is written to the current directory.
        iterator: Iterator (or any iterable) of mappings providing
            ``"input"`` and ``"output"`` strings.
        num_examples: Maximum number of records to consume; values <= 0
            produce a header-only file.
    """
    parent_dir = os.path.dirname(csv_path)
    # dirname() is "" for a bare filename and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        # islice stops at exhaustion on its own, so no StopIteration handler
        # is needed (and none can mask an error raised while processing).
        # max() keeps the old "non-positive count means nothing" behaviour,
        # since islice rejects negative stop values.
        for data in islice(iterator, max(num_examples, 0)):
            source = data["input"]
            # Test the raw text: once "grammar: " is prepended the input is
            # always truthy and an emptiness check could never fire.
            if not source:
                continue
            input_text = remove_excess_spaces("grammar: " + source)
            correction = remove_excess_spaces(data["output"])
            if correction:
                writer.writerow([input_text, correction])

# Consume at most the first 3500 records of the C4_200M stream and write them
# to a CSV file.
c4_iterator = iter(c4_dataset)
c4_generate_csv("Dataset/C4_200M/c4data.csv", c4_iterator, num_examples=3500)