# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
from typing import TYPE_CHECKING, Union

import numpy as np

from ..utils import logging
from .base import Pipeline


if TYPE_CHECKING:
    from ...feature_extraction_sequence_utils import SequenceFeatureExtractor

logger = logging.get_logger(__name__)

def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """
    Helper function to read an audio file through ffmpeg.

    The payload is piped to ffmpeg on stdin and read back as raw mono float32 PCM at the requested sampling rate.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    format_for_conversion = "f32le"
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",
        "-ac",
        ac,
        "-ar",
        ar,
        "-f",
        format_for_conversion,
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]

    try:
        ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    except FileNotFoundError:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename")
    output_stream = ffmpeg_process.communicate(bpayload)
    out_bytes = output_stream[0]

    audio = np.frombuffer(out_bytes, np.float32)
    if audio.shape[0] == 0:
        raise ValueError("Malformed soundfile")
    return audio
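

# A minimal usage sketch for ``ffmpeg_read`` (assumes ffmpeg is installed; the
# filename "sample.flac" and the 16 kHz rate are illustrative, not part of the API):
#
#     with open("sample.flac", "rb") as f:
#         waveform = ffmpeg_read(f.read(), sampling_rate=16000)
#     # `waveform` is a mono np.float32 array resampled to 16 kHz.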


class AutomaticSpeechRecognitionPipeline(Pipeline):
    """
    Pipeline that aims at extracting spoken text contained within some audio.

    The input can be either a raw waveform or an audio file. In the case of an audio file, ffmpeg should be installed
    to support multiple audio formats.
    """

    def __init__(self, feature_extractor: "SequenceFeatureExtractor", *args, **kwargs):
        """
        Arguments:
            feature_extractor (:obj:`~transformers.SequenceFeatureExtractor`):
                The feature extractor that will be used by the pipeline to encode the waveform for the model.
            model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
                The model that will be used by the pipeline to make predictions. This needs to be a model inheriting
                from :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel`
                for TensorFlow.
            tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
                The tokenizer that will be used by the pipeline to encode data for the model. This object inherits
                from :class:`~transformers.PreTrainedTokenizer`.
            modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`):
                Model card attributed to the model for this pipeline.
            framework (:obj:`str`, `optional`):
                The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified
                framework must be installed.

                If no framework is specified, will default to the one currently installed. If no framework is
                specified and both frameworks are installed, will default to the framework of the :obj:`model`, or to
                PyTorch if no model is provided.
            device (:obj:`int`, `optional`, defaults to -1):
                Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU; a non-negative integer will
                run the model on the associated CUDA device id.
        """
        super().__init__(*args, **kwargs)
        self.feature_extractor = feature_extractor

        if self.framework == "tf":
            raise ValueError("The AutomaticSpeechRecognitionPipeline is only available in PyTorch.")

    def __call__(
        self,
        inputs: Union[np.ndarray, bytes, str],
        **kwargs,
    ):
        """
        Transcribe the audio sequence(s) given as inputs to text. See the
        :obj:`~transformers.AutomaticSpeechRecognitionPipeline` documentation for more information.

        Args:
            inputs (:obj:`np.ndarray` or :obj:`bytes` or :obj:`str`):
                The input is either a raw waveform (:obj:`np.ndarray` of shape (n, ) of type :obj:`np.float32` or
                :obj:`np.float64`) at the correct sampling rate (no further check will be done), or a :obj:`str` that
                is the filename of an audio file. The file will be read at the correct sampling rate to get the
                waveform using `ffmpeg`; this requires `ffmpeg` to be installed on the system. If :obj:`inputs` is
                :obj:`bytes`, it is supposed to be the content of an audio file and is interpreted by `ffmpeg` in the
                same way.

        Return:
            A :obj:`dict` with the following keys:

            - **text** (:obj:`str`) -- The recognized text.
        """
        # A filename is read as raw bytes; raw bytes are then decoded to a waveform through ffmpeg.
        if isinstance(inputs, str):
            with open(inputs, "rb") as f:
                inputs = f.read()

        if isinstance(inputs, bytes):
            inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)

        assert isinstance(inputs, np.ndarray), "We expect a numpy ndarray as input"
        assert len(inputs.shape) == 1, "We expect a single channel audio input for AutomaticSpeechRecognitionPipeline"
        processed = self.feature_extractor(
            inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
        )
        processed = self.ensure_tensor_on_device(**processed)

        name = self.model.__class__.__name__
        if name.endswith("ForConditionalGeneration"):
            # Encoder-decoder models (e.g. Speech2Text) generate the token ids autoregressively.
            input_ids = processed["input_features"]
            tokens = self.model.generate(input_ids=input_ids)
            tokens = tokens.squeeze(0)
        elif name.endswith("ForCTC"):
            # CTC models (e.g. Wav2Vec2) emit frame-level logits; take the argmax token per frame.
            outputs = self.model(**processed)
            tokens = outputs.logits.squeeze(0).argmax(dim=-1)

        # CTC tokenizers handle special tokens (such as the blank/pad token) themselves while
        # collapsing repeated frames, so they must not be skipped during decoding.
        skip_special_tokens = "CTC" not in self.tokenizer.__class__.__name__
        recognized_string = self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
        return {"text": recognized_string}