andgrt committed on
Commit
c8128c2
·
1 Parent(s): cfcd1f4

upd: resample audio for whisper

Browse files
Files changed (1) hide show
  1. app.py +16 -4
app.py CHANGED
@@ -7,7 +7,7 @@ from transformers import (
7
  pipeline,
8
  )
9
  import torch
10
- import numpy as np
11
 
12
  processor = AutoProcessor.from_pretrained(
13
  "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
@@ -78,13 +78,25 @@ def transcribe(image, audio):
78
 
79
  sr, y = audio
80
 
81
- # Convert to mono if stereo
82
  if y.ndim > 1:
83
  y = y.mean(axis=1)
84
 
85
- y = y.astype(np.float32)
86
- y /= np.max(np.abs(y))
87
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  input_features = transcriber.feature_extractor(
89
  y, sampling_rate=sr, return_tensors="pt"
90
  ).input_features
 
7
  pipeline,
8
  )
9
  import torch
10
+ import torchaudio
11
 
12
  processor = AutoProcessor.from_pretrained(
13
  "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
 
78
 
79
  sr, y = audio
80
 
81
+ # Convert stereo to mono if necessary
82
  if y.ndim > 1:
83
  y = y.mean(axis=1)
84
 
85
+ # Convert the numpy array to a PyTorch tensor for torchaudio processing
86
+ y_tensor = torch.tensor(y, dtype=torch.float32)
87
 
88
+ if sr != 16000:
89
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
90
+ y_tensor = resampler(y_tensor)
91
+ sr = 16000
92
+
93
+ # Normalize the audio
94
+ y_tensor /= torch.max(torch.abs(y_tensor))
95
+
96
+ # Convert back to a numpy array for compatibility with the feature extractor
97
+ y = y_tensor.numpy()
98
+
99
+ # Create input features for the Whisper model
100
  input_features = transcriber.feature_extractor(
101
  y, sampling_rate=sr, return_tensors="pt"
102
  ).input_features