Spaces:

andgrt
/

doc-qa-demo-gradio

Runtime error

andgrt committed on Nov 3, 2024

Commit

c8128c2

1 Parent(s): cfcd1f4

upd: resample audio for whisper

Files changed (1) hide show

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ from transformers import (
     pipeline,
 )
 import torch
-import numpy as np
 processor = AutoProcessor.from_pretrained(
     "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
@@ -78,13 +78,25 @@ def transcribe(image, audio):
     sr, y = audio
-    # Convert to mono if stereo
     if y.ndim > 1:
         y = y.mean(axis=1)
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
     input_features = transcriber.feature_extractor(
         y, sampling_rate=sr, return_tensors="pt"
     ).input_features

     pipeline,
 )
 import torch
+import torchaudio
 processor = AutoProcessor.from_pretrained(
     "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
     sr, y = audio
+    # Convert stereo to mono if necessary
     if y.ndim > 1:
         y = y.mean(axis=1)
+    # Convert the numpy array to a PyTorch tensor for torchaudio processing
+    y_tensor = torch.tensor(y, dtype=torch.float32)
+    if sr != 16000:
+        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
+        y_tensor = resampler(y_tensor)
+        sr = 16000
+    # Normalize the audio
+    y_tensor /= torch.max(torch.abs(y_tensor))
+    # Convert back to a numpy array for compatibility with the feature extractor
+    y = y_tensor.numpy()
+    # Create input features for the Whisper model
     input_features = transcriber.feature_extractor(
         y, sampling_rate=sr, return_tensors="pt"
     ).input_features