arampacha committed on
Commit
016d068
·
2 Parent(s): 5bdae8b 3b222ca

Merge branch 'main' of https://huggingface.co/arampacha/wav2vec2-large-xlsr-ukrainian into main

Browse files
Files changed (1) hide show
  1. README.md +2 -2
README.md CHANGED
@@ -17,7 +17,7 @@ model-index:
17
  dataset:
18
  name: Common Voice uk
19
  type: common_voice
20
- args: cs
21
  metrics:
22
  - name: Test WER
23
  type: wer
@@ -93,12 +93,12 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
93
  # Preprocessing the datasets.
94
  # We need to read the audio files as arrays and normalize characters
95
  def speech_file_to_array_fn(batch):
 
96
  batch["sentence"] = re.sub(re.compile(chars_to_ignore_regex), '', batch["sentence"]).lower().strip()
97
  batch["sentence"] = re.sub(re.compile('i'), 'і', batch['sentence'])
98
  batch["sentence"] = re.sub(re.compile('o'), 'о', batch['sentence'])
99
  batch["sentence"] = re.sub(re.compile('a'), 'а', batch['sentence'])
100
  batch["sentence"] = re.sub(re.compile('ы'), 'и', batch['sentence'])
101
- batch["sentence"] = re.sub(re.compile("['`]"), '’', batch['sentence'])
102
  batch["sentence"] = re.sub(re.compile("–"), '', batch['sentence'])
103
  batch['sentence'] = re.sub(' ', ' ', batch['sentence'])
104
  speech_array, sampling_rate = torchaudio.load(batch["path"])
 
17
  dataset:
18
  name: Common Voice uk
19
  type: common_voice
20
+ args: uk
21
  metrics:
22
  - name: Test WER
23
  type: wer
 
93
  # Preprocessing the datasets.
94
  # We need to read the audio files as arrays and normalize characters
95
  def speech_file_to_array_fn(batch):
96
+ batch["sentence"] = re.sub(re.compile("['`]"), '’', batch['sentence'])
97
  batch["sentence"] = re.sub(re.compile(chars_to_ignore_regex), '', batch["sentence"]).lower().strip()
98
  batch["sentence"] = re.sub(re.compile('i'), 'і', batch['sentence'])
99
  batch["sentence"] = re.sub(re.compile('o'), 'о', batch['sentence'])
100
  batch["sentence"] = re.sub(re.compile('a'), 'а', batch['sentence'])
101
  batch["sentence"] = re.sub(re.compile('ы'), 'и', batch['sentence'])
 
102
  batch["sentence"] = re.sub(re.compile("–"), '', batch['sentence'])
103
  batch['sentence'] = re.sub(' ', ' ', batch['sentence'])
104
  speech_array, sampling_rate = torchaudio.load(batch["path"])