Rolv-Arild committed on
Commit
57f88c8
·
1 Parent(s): 8773566

Fix vocabulary creation

Browse files
Files changed (1) hide show
  1. run_speech_recognition_ctc.py +8 -8
run_speech_recognition_ctc.py CHANGED
@@ -314,14 +314,14 @@ def create_vocabulary_from_data(
314
  all_text = " ".join(batch["target_text"])
315
  alphabet.update(all_text)
316
 
317
- # vocabs = datasets.map(
318
- # extract_all_chars,
319
- # batched=True,
320
- # batch_size=-1,
321
- # keep_in_memory=True,
322
- # remove_columns=datasets["train"].column_names,
323
- # )
324
- #
325
  # # take union of all unique characters in each dataset
326
  # vocab_set = functools.reduce(
327
  # lambda vocab_1, vocab_2: {"vocab": list(set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]))}, vocabs.values()
 
314
  all_text = " ".join(batch["target_text"])
315
  alphabet.update(all_text)
316
 
317
+ datasets.map(
318
+ extract_all_chars,
319
+ batched=True,
320
+ batch_size=-1,
321
+ keep_in_memory=True,
322
+ remove_columns=datasets["train"].column_names,
323
+ )
324
+
325
  # # take union of all unique characters in each dataset
326
  # vocab_set = functools.reduce(
327
  # lambda vocab_1, vocab_2: {"vocab": list(set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]))}, vocabs.values()