Commit
·
57f88c8
1
Parent(s):
8773566
Fix vocabulary creation
Browse files
run_speech_recognition_ctc.py
CHANGED
@@ -314,14 +314,14 @@ def create_vocabulary_from_data(
|
|
314 |
all_text = " ".join(batch["target_text"])
|
315 |
alphabet.update(all_text)
|
316 |
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
# # take union of all unique characters in each dataset
|
326 |
# vocab_set = functools.reduce(
|
327 |
# lambda vocab_1, vocab_2: {"vocab": list(set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]))}, vocabs.values()
|
|
|
314 |
all_text = " ".join(batch["target_text"])
|
315 |
alphabet.update(all_text)
|
316 |
|
317 |
+
datasets.map(
|
318 |
+
extract_all_chars,
|
319 |
+
batched=True,
|
320 |
+
batch_size=-1,
|
321 |
+
keep_in_memory=True,
|
322 |
+
remove_columns=datasets["train"].column_names,
|
323 |
+
)
|
324 |
+
|
325 |
# # take union of all unique characters in each dataset
|
326 |
# vocab_set = functools.reduce(
|
327 |
# lambda vocab_1, vocab_2: {"vocab": list(set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]))}, vocabs.values()
|