andgrt committed on
Commit 1e19f64 · 1 Parent(s): 86e6582

upd: speech-to-text

Files changed (1)
1. app.py +2 -26
app.py CHANGED
@@ -7,7 +7,6 @@ from transformers import (
     pipeline,
 )
 import torch
-import torchaudio
 
 processor = AutoProcessor.from_pretrained(
     "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
@@ -22,7 +21,7 @@ tokenizer_en2ru = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
 model_en2ru = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
 
 transcriber = pipeline(
-    "automatic-speech-recognition", model="artyomboyko/whisper-base-fine_tuned-ru"
+    "automatic-speech-recognition", model="lorenzoncina/whisper-medium-ru"
 )
 
 
@@ -80,30 +79,7 @@ def transcribe(image, audio):
 
     if y.ndim > 1:
         y = y.mean(axis=1)
-
-    y_tensor = torch.tensor(y, dtype=torch.float32)
-    print(y.shape)
-
-    if sr != 16000:
-        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
-        y_tensor = resampler(y_tensor)
-        sr = 16000
-
-    y_tensor /= torch.max(torch.abs(y_tensor))
-
-    y = y_tensor.numpy()
-    print(y.shape)
-
-    input_features = transcriber.feature_extractor(
-        y, sampling_rate=sr, return_tensors="pt"
-    ).input_features
-    print(input_features.shape)
-    print(input_features)
-
-    transcription = transcriber.model.generate(input_features)
-    transcription_text = transcriber.tokenizer.decode(
-        transcription[0], skip_special_tokens=True
-    )
+    transcription_text = transcriber({"sampling_rate": sr, "raw": y})["text"]
 
     return generate_answer(image, transcription_text)
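
For reference, a minimal standalone sketch of the simplified transcription path (the dummy audio below is illustrative, not from the app; the model name is taken from the diff). The transformers ASR pipeline accepts raw numpy audio together with its sampling rate and handles resampling, feature extraction, and decoding internally, which is what makes the removed manual torchaudio and feature-extractor steps unnecessary:

import numpy as np
from transformers import pipeline

transcriber = pipeline(
    "automatic-speech-recognition", model="lorenzoncina/whisper-medium-ru"
)

sr = 44100                          # source rate, e.g. from a Gradio audio input
y = np.zeros(sr, dtype=np.float32)  # one second of silence as stand-in audio

if y.ndim > 1:
    y = y.mean(axis=1)              # downmix to mono, as in app.py

# The pipeline resamples to the model's expected 16 kHz and runs the
# feature extractor and decoder itself, replacing the removed manual steps.
text = transcriber({"sampling_rate": sr, "raw": y})["text"]
print(text)

Note that in recent transformers versions the pipeline still uses torchaudio under the hood to resample when the input rate differs from 16 kHz, so dropping the explicit import removes the manual code but not necessarily the dependency.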