Spaces:
Sleeping
Sleeping
File size: 1,255 Bytes
aaa3b8b 41d9375 1534a11 41d9375 aaa3b8b 1534a11 41d9375 1534a11 41d9375 1534a11 41d9375 1534a11 41d9375 1534a11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import os
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
# Load the pretrained wav2vec2 model and its processor once at import time.
MODEL_NAME = "facebook/wav2vec2-large-xlsr-53-french"

# Prefer GPU when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME).to(device)
model.eval()  # inference mode: disables dropout/batch-norm updates
def transcribe_audio(audio_path, sampling_rate=16000):
    """Transcribe a speech audio file to text with the wav2vec2 CTC model.

    Parameters
    ----------
    audio_path : str
        Path to an audio file readable by librosa.
    sampling_rate : int, optional
        Target sampling rate the audio is resampled to (wav2vec2 models
        expect 16 kHz input).

    Returns
    -------
    str
        The decoded transcription for the (single) input utterance.
    """
    # Load the audio, resampling to the model's expected rate.
    # librosa returns the actual rate too; it equals `sampling_rate` here.
    audio, _sr = librosa.load(audio_path, sr=sampling_rate)

    # Turn the raw waveform into model input tensors on the right device.
    input_values = processor(
        audio, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_values.to(device)

    # Forward pass without gradient tracking (inference only).
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token at each frame, then collapse.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
# Example usage
if __name__ == "__main__":
    # Resolve ../data relative to this script's own location so the demo
    # works regardless of the current working directory.
    base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data"))
    audio_path = os.path.join(base_path, "colere", "c1af.wav")
    texte = transcribe_audio(audio_path)
    print(f"Transcription : {texte}")
|