arampacha
/

wav2vec2-large-xlsr-ukrainian

@@ -17,7 +17,7 @@ model-index:
     dataset:
       name: Common Voice uk
       type: common_voice
-      args: cs
     metrics:
        - name: Test WER
          type: wer
@@ -93,12 +93,12 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays and normalize characters
 def speech_file_to_array_fn(batch):
     batch["sentence"] = re.sub(re.compile(chars_to_ignore_regex), '', batch["sentence"]).lower().strip()
     batch["sentence"] = re.sub(re.compile('i'), 'і', batch['sentence'])
     batch["sentence"] = re.sub(re.compile('o'), 'о', batch['sentence'])
     batch["sentence"] = re.sub(re.compile('a'), 'а', batch['sentence'])
     batch["sentence"] = re.sub(re.compile('ы'), 'и', batch['sentence'])
-    batch["sentence"] = re.sub(re.compile("['`]"), '’', batch['sentence'])
     batch["sentence"] = re.sub(re.compile("–"), '', batch['sentence'])
     batch['sentence'] = re.sub('  ', ' ', batch['sentence'])
     speech_array, sampling_rate = torchaudio.load(batch["path"])

     dataset:
       name: Common Voice uk
       type: common_voice
+      args: uk
     metrics:
        - name: Test WER
          type: wer
 # Preprocessing the datasets.
 # We need to read the audio files as arrays and normalize characters
 def speech_file_to_array_fn(batch):
+    batch["sentence"] = re.sub(re.compile("['`]"), '’', batch['sentence'])
     batch["sentence"] = re.sub(re.compile(chars_to_ignore_regex), '', batch["sentence"]).lower().strip()
     batch["sentence"] = re.sub(re.compile('i'), 'і', batch['sentence'])
     batch["sentence"] = re.sub(re.compile('o'), 'о', batch['sentence'])
     batch["sentence"] = re.sub(re.compile('a'), 'а', batch['sentence'])
     batch["sentence"] = re.sub(re.compile('ы'), 'и', batch['sentence'])
     batch["sentence"] = re.sub(re.compile("–"), '', batch['sentence'])
     batch['sentence'] = re.sub('  ', ' ', batch['sentence'])
     speech_array, sampling_rate = torchaudio.load(batch["path"])