|
from datasets import load_dataset |
|
from datasets import Audio |
|
|
|
|
|
|
|
class Dataset:
    """Loader for streaming ASR evaluation datasets from the Hugging Face Hub.

    Wraps ``datasets.load_dataset`` for a fixed set of English speech
    benchmarks, truncates each stream to the first ``n`` samples, and
    resamples the audio column to 16 kHz.
    """

    def __init__(self, n: int = 100):
        """Create a loader that will keep the first ``n`` samples.

        Args:
            n: number of samples taken from the (streaming) test split.
        """
        self.n = n
        self.options = ['LibriSpeech Clean', 'LibriSpeech Other', 'Common Voice', 'VoxPopuli', 'TEDLIUM', 'GigaSpeech', 'SPGISpeech', 'AMI', 'OWN']
        self.selected = None  # name of the currently loaded option
        self.dataset = None   # underlying (streaming) dataset, set by load()
        self.text = None      # name of the transcript column, detected lazily

    def get_options(self):
        """Return the list of supported dataset names."""
        return self.options

    def _check_text(self):
        """Detect the transcript column by probing the first sample.

        Returns:
            str: the transcript column name (also cached in ``self.text``).

        BUGFIX: the original had no ``return`` statement (and a debug
        ``print``), so ``filter()`` received ``None`` and passed
        ``input_columns=[None]`` to ``dataset.filter``.
        """
        sample = next(iter(self.dataset))
        self._get_text(sample)  # sets self.text as a side effect
        return self.text

    def _get_text(self, sample):
        """Return the transcript string of ``sample``.

        Also records the matching column name in ``self.text``.

        Raises:
            ValueError: if no known transcript column is present.
        """
        # The transcript column name differs per corpus; probe the known
        # variants in a fixed order.
        for column in ("text", "sentence", "normalized_text", "transcript"):
            if column in sample:
                self.text = column
                return sample[column]
        raise ValueError(f"Sample: {sample.keys()} has no transcript.")

    def filter(self, input_column: str = None):
        """Drop samples whose transcript is empty or an unscored marker.

        Args:
            input_column: transcript column name; auto-detected when omitted.

        Returns:
            The filtered dataset (also stored on ``self.dataset``).
        """
        if input_column is None:
            # Reuse a previously detected column, otherwise probe one sample.
            input_column = self.text if self.text is not None else self._check_text()

        def is_target_text_in_range(ref):
            # TEDLIUM marks unscored segments with this exact sentinel.
            if ref.strip() == "ignore time segment in scoring":
                return False
            return ref.strip() != ""

        self.dataset = self.dataset.filter(is_target_text_in_range, input_columns=[input_column])
        return self.dataset

    def normalised(self, normalise):
        """Apply a text-normalisation callable to every sample via ``map``."""
        self.dataset = self.dataset.map(normalise)

    def _select(self, option: str):
        """Validate ``option`` and remember it as the current selection.

        Raises:
            ValueError: if ``option`` is not one of ``self.options``.
        """
        if option not in self.options:
            raise ValueError(f"This value is not an option, please see: {self.options}")
        self.selected = option

    def _preprocess(self):
        """Truncate the stream to ``n`` samples and resample audio to 16 kHz."""
        self.dataset = self.dataset.take(self.n)
        self.dataset = self.dataset.cast_column("audio", Audio(sampling_rate=16000))

    def load(self, option: str = None):
        """Load the streaming test split for ``option`` and preprocess it.

        Args:
            option: one of ``self.options``. "OWN" is a placeholder: the
                caller is expected to attach ``self.dataset`` manually.

        Raises:
            ValueError: if ``option`` is not a known choice.
        """
        self._select(option)

        # Map a friendly name to its Hub repo / config / test split.
        if option == "OWN":
            pass
        elif option == "LibriSpeech Clean":
            self.dataset = load_dataset("librispeech_asr", "all", split="test.clean", streaming=True)
        elif option == "LibriSpeech Other":
            self.dataset = load_dataset("librispeech_asr", "all", split="test.other", streaming=True)
        elif option == "Common Voice":
            self.dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", revision="streaming", split="test", streaming=True, token=True, trust_remote_code=True)
        elif option == "VoxPopuli":
            self.dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
        elif option == "TEDLIUM":
            self.dataset = load_dataset("LIUM/tedlium", "release3", split="test", streaming=True, trust_remote_code=True)
        elif option == "GigaSpeech":
            self.dataset = load_dataset("speechcolab/gigaspeech", "xs", split="test", streaming=True, token=True, trust_remote_code=True)
        elif option == "SPGISpeech":
            self.dataset = load_dataset("kensho/spgispeech", "S", split="test", streaming=True, token=True, trust_remote_code=True)
        elif option == "AMI":
            self.dataset = load_dataset("edinburghcstr/ami", "ihm", split="test", streaming=True, trust_remote_code=True)

        # BUGFIX: the original unconditionally called _preprocess(), which
        # crashed with AttributeError for "OWN" (dataset still None).
        if self.dataset is not None:
            self._preprocess()