Lguyogiro committed
Commit 5a57f1f · 1 Parent(s): a5809d0
Files changed (2):
  1. app.py +2 -7
  2. asr.py +4 -3
app.py CHANGED
@@ -10,12 +10,7 @@ from audio_recorder_streamlit import audio_recorder
 def load_asr_model():
     return load_model()
 
-asr_model = load_asr_model()
-
-
-def transcribe(audio_file):
-    transcript = openai.Audio.transcribe("whisper-1", audio_file)
-    return transcript
+processor, asr_model = load_asr_model()
 
 
 def save_audio_file(audio_bytes, file_extension):
@@ -43,7 +38,7 @@ def transcribe_audio(file_path):
     :return: The transcribed text
     """
     with open(file_path, "rb") as audio_file:
-        transcript = inference(asr_model, audio_file)
+        transcript = inference(processor, asr_model, audio_file)
     return transcript
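Taken together, the app.py change drops the leftover OpenAI Whisper path (openai.Audio.transcribe) and threads the processor returned by load_model() through to inference. A minimal sketch of the resulting flow, assuming load_model and inference are imported from asr.py (the import sits outside the diff hunks and is not shown):

from asr import load_model, inference  # assumed import; not visible in the diff

def load_asr_model():
    return load_model()

# Load the processor and model once at startup instead of per request.
processor, asr_model = load_asr_model()

def transcribe_audio(file_path):
    """
    Transcribe the audio file at the specified path.

    :return: The transcribed text
    """
    with open(file_path, "rb") as audio_file:
        transcript = inference(processor, asr_model, audio_file)
    return transcript

One thing to verify: inference now reads audio via read_audio_data(audio_path), so the open file handle passed here works only if read_audio_data accepts file-like objects as well as paths, as loaders such as torchaudio.load and soundfile.read do.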
asr.py CHANGED
@@ -14,12 +14,13 @@ def load_model():
     target_lang = "oji"
     processor = AutoProcessor.from_pretrained(model_id, target_lang=target_lang, use_auth_token=hf_token)
     model = Wav2Vec2ForCTC.from_pretrained(model_id, target_lang=target_lang, ignore_mismatched_sizes=True, use_safetensors=True, use_auth_token=hf_token)
+    return processor, model
 
 
-def inference(model, raw_data):
-    # arr, rate = read_audio_data(audio_path)
+def inference(processor, model, audio_path):
+    arr, rate = read_audio_data(audio_path)
     # arr.squeeze().numpy(), ...
-    inputs = processor(raw_data, sampling_rate=16_000, return_tensors="pt")
+    inputs = processor(arr.squeeze().numpy(), sampling_rate=16_000, return_tensors="pt")
 
     with torch.no_grad():
         outputs = model(**inputs).logits
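The asr.py hunk stops at the logits computation, so the CTC decode that presumably follows is not shown. For orientation, here is a sketch of the whole module after this commit; the model_id and hf_token values, the read_audio_data helper, and the argmax/decode tail are assumptions based on the standard MMS (Wav2Vec2ForCTC) recipe, not part of the diff:

import torch
import torchaudio
from transformers import AutoProcessor, Wav2Vec2ForCTC

hf_token = "..."                    # assumed: defined above the hunk
model_id = "facebook/mms-1b-all"    # assumed: the real checkpoint id is above the hunk

def load_model():
    target_lang = "oji"  # Ojibwa (ISO 639-3)
    processor = AutoProcessor.from_pretrained(
        model_id, target_lang=target_lang, use_auth_token=hf_token)
    model = Wav2Vec2ForCTC.from_pretrained(
        model_id, target_lang=target_lang, ignore_mismatched_sizes=True,
        use_safetensors=True, use_auth_token=hf_token)
    return processor, model

def read_audio_data(audio_path):
    # Assumed helper: load the file and resample to the 16 kHz the processor expects.
    arr, rate = torchaudio.load(audio_path)
    arr = torchaudio.functional.resample(arr, orig_freq=rate, new_freq=16_000)
    return arr, 16_000

def inference(processor, model, audio_path):
    arr, rate = read_audio_data(audio_path)
    inputs = processor(arr.squeeze().numpy(), sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs).logits
    # Greedy CTC decode (assumed; the diff ends before this step).
    ids = torch.argmax(outputs, dim=-1)[0]
    return processor.decode(ids)

Greedy argmax decoding is the usual baseline for these CTC models; a language-model-backed decoder could be swapped in later without touching the calling code in app.py.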