import librosa
# Imported explicitly: librosa releases without lazy submodule loading do not
# expose `librosa.display` through a plain `import librosa`.
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC


# Speech analysis script: transcribe a recording with a pretrained Wav2Vec2
# CTC model, then compute, plot and combine a few acoustic descriptors
# (pitch track, RMS intensity, tempo estimate) obtained with librosa.

# --- Pretrained processor and acoustic model ---------------------------------
model_name = "facebook/wav2vec2-large-xlsr-53-french"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# --- Audio loading -----------------------------------------------------------
audio_file = "C:\\Users\\fkpamegan\\Downloads\\datasets_oreau2_m_sessp_07a01Pa.wav"
# librosa resamples to the requested rate while loading, so `sr` is 16000 here.
y, sr = librosa.load(audio_file, sr=16000)

# --- Transcription -----------------------------------------------------------
# Pass the sampling rate explicitly so the processor can check it against the
# rate its feature extractor was configured with instead of assuming a match.
input_values = processor(y, sampling_rate=sr, return_tensors="pt").input_values

# Inference only: no gradients are needed, which saves memory and time.
with torch.no_grad():
    logits = model(input_values).logits

# Greedy CTC decoding: keep the highest-scoring token id for every frame and
# let the processor turn the id sequence of the first (only) item into text.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])

print("Transcription:", transcription)

# --- Acoustic descriptors ----------------------------------------------------
# `piptrack` returns two (frequency bins x frames) arrays: candidate pitch
# frequencies and their magnitudes. Only the pitch array is used below.
pitch, magnitudes = librosa.piptrack(y=y, sr=sr)
# Frame-wise root-mean-square energy, returned as a (1 x frames) array.
intensity = librosa.feature.rms(y=y)

# Global tempo estimate; the beat frame positions are not needed.
tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

# --- Plots -------------------------------------------------------------------
# `sr` is forwarded to specshow so the time (and frequency) axes are labelled
# for the real 16 kHz signal rather than specshow's 22050 Hz default.
plt.figure(figsize=(10, 6))
librosa.display.specshow(pitch, sr=sr, x_axis='time', y_axis='log')
plt.colorbar()
plt.title("Pitch (Hauteur Tonale)")
plt.show()

plt.figure(figsize=(10, 6))
librosa.display.specshow(intensity, sr=sr, x_axis='time')
plt.colorbar()
plt.title("Intensité")
plt.show()

# --- Combined feature vector -------------------------------------------------
# Time-averaged intensity, time-averaged pitch per frequency bin, and tempo,
# concatenated into one flat vector (np.hstack promotes a scalar tempo to 1-D).
features = np.hstack([
    np.mean(intensity, axis=1),
    np.mean(pitch, axis=1),
    tempo,
])

print("Caractéristiques combinées :")
print(features)
|