marcel
/

wav2vec2-large-xlsr-german-demo

@@ -50,15 +50,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-\\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
-\\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
-\\treturn batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 with torch.no_grad():
-\tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 predicted_ids = torch.argmax(logits, dim=-1)
@@ -90,52 +90,52 @@ model.to("cuda")
 chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\”\�\カ\æ\無\ན\カ\臣\ѹ\…\«\»\ð\ı\„\幺\א\ב\比\ш\ע\)\ứ\в\œ\ч\+\—\ш\‚\נ\м\ń\乡\$\=\ש\ф\支\(\°\и\к\̇]'
 substitutions = {
-\t'e' : '[\ə\é\ě\ę\ê\ế\ế\ë\ė\е]',
-\t'o' : '[\ō\ô\ô\ó\ò\ø\ọ\ŏ\õ\ő\о]',
-\t'a' : '[\á\ā\ā\ă\ã\å\â\à\ą\а]',
-\t'c' : '[\č\ć\ç\с]',
-\t'l' : '[\ł]',
-\t'u' : '[\ú\ū\ứ\ů]',
-\t'und' : '[\&]',
-\t'r' : '[\ř]',
-\t'y' : '[\ý]',
-\t's' : '[\ś\š\ș\ş]',
-\t'i' : '[\ī\ǐ\í\ï\î\ï]',
-\t'z' : '[\ź\ž\ź\ż]',
-\t'n' : '[\ñ\ń\ņ]',
-\t'g' : '[\ğ]',
-\t'ss' : '[\ß]',
-\t't' : '[\ț\ť]',
-\t'd' : '[\ď\đ]',
-\t"'": '[\ʿ\་\’\`\´\ʻ\`\‘]',
-\t'p': '\р'
 }
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-\tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
-\tfor x in substitutions:
-\t\tbatch["sentence"] = re.sub(substitutions[x], x, batch["sentence"])
-\t\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
-\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
-\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
-\treturn batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 # Running inference on the preprocessed dataset.
 # We feed each batch of speech arrays to the model and decode the predicted ids
 def evaluate(batch):
-\tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
-\twith torch.no_grad():
-\t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
-\tpred_ids = torch.argmax(logits, dim=-1)
-\tbatch["pred_strings"] = processor.batch_decode(pred_ids)
-\treturn batch
 result = test_dataset.map(evaluate, batched=True, batch_size=8)

 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
+	speech_array, sampling_rate = torchaudio.load(batch["path"])
+	batch["speech"] = resampler(speech_array).squeeze().numpy()
+	return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 with torch.no_grad():
+	logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 predicted_ids = torch.argmax(logits, dim=-1)
 chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\”\�\カ\æ\無\ན\カ\臣\ѹ\…\«\»\ð\ı\„\幺\א\ב\比\ш\ע\)\ứ\в\œ\ч\+\—\ш\‚\נ\м\ń\乡\$\=\ש\ф\支\(\°\и\к\̇]'
 substitutions = {
+	'e' : '[\ə\é\ě\ę\ê\ế\ế\ë\ė\е]',
+	'o' : '[\ō\ô\ô\ó\ò\ø\ọ\ŏ\õ\ő\о]',
+	'a' : '[\á\ā\ā\ă\ã\å\â\à\ą\а]',
+	'c' : '[\č\ć\ç\с]',
+	'l' : '[\ł]',
+	'u' : '[\ú\ū\ứ\ů]',
+	'und' : '[\&]',
+	'r' : '[\ř]',
+	'y' : '[\ý]',
+	's' : '[\ś\š\ș\ş]',
+	'i' : '[\ī\ǐ\í\ï\î\ï]',
+	'z' : '[\ź\ž\ź\ż]',
+	'n' : '[\ñ\ń\ņ]',
+	'g' : '[\ğ]',
+	'ss' : '[\ß]',
+	't' : '[\ț\ť]',
+	'd' : '[\ď\đ]',
+	"'": '[\ʿ\་\’\`\´\ʻ\`\‘]',
+	'p': '\р'
 }
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
+	batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
+	for x in substitutions:
+		batch["sentence"] = re.sub(substitutions[x], x, batch["sentence"])
+		speech_array, sampling_rate = torchaudio.load(batch["path"])
+	speech_array, sampling_rate = torchaudio.load(batch["path"])
+	batch["speech"] = resampler(speech_array).squeeze().numpy()
+	return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 # Running inference on the preprocessed dataset.
 # We feed each batch of speech arrays to the model and decode the predicted ids
 def evaluate(batch):
+	inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+	with torch.no_grad():
+		logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+	pred_ids = torch.argmax(logits, dim=-1)
+	batch["pred_strings"] = processor.batch_decode(pred_ids)
+	return batch
 result = test_dataset.map(evaluate, batched=True, batch_size=8)