pratikshahp committed
Commit e1cc7bb · verified · Parent(s): 6bee2d3

Update app.py

Files changed (1): app.py (+53 −8)
app.py CHANGED
@@ -2,6 +2,51 @@
 # Author: Pratiksha Patel
 # Description: This script records the audio, transforms it to text, detects the language of the file, and saves it to a txt file.
 # import required modules
+#import torch
+#import streamlit as st
+#from audio_recorder_streamlit import audio_recorder
+#import numpy as np
+#from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+
+#def transcribe_audio(audio_bytes):
+#    processor = AutoProcessor.from_pretrained("openai/whisper-large")
+#    model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")
+
+#    # Convert audio bytes to numpy array
+#    audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
+
+#    # Normalize audio array
+#    audio_tensor = torch.tensor(audio_array, dtype=torch.float64) / 32768.0
+
+#    # Provide inputs to the processor
+#    #inputs = processor(audio=audio_tensor, sampling_rate=16000, return_tensors="pt")
+#    input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features
+
+#    # generate token ids
+#    predicted_ids = model.generate(input_features)
+#    # decode token ids to text
+#    #transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
+
+#    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+#    return transcription
+# Streamlit app
+#st.title("Audio to Text Transcription..")
+
+#audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
+
+#if audio_bytes:
+#    st.audio(audio_bytes, format="audio/wav")
+
+#    transcription = transcribe_audio(audio_bytes)
+
+#    if transcription:
+#        st.write("Transcription:")
+#        st.write(transcription)
+#    else:
+#        st.write("Error: Failed to transcribe audio.")
+#else:
+#    st.write("No audio recorded.")
+
 import torch
 import streamlit as st
 from audio_recorder_streamlit import audio_recorder
@@ -9,18 +54,18 @@ import numpy as np
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
 def transcribe_audio(audio_bytes):
-    processor = AutoProcessor.from_pretrained("openai/whisper-large")
-    model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")
-
+    processor = AutoProcessor.from_pretrained('facebook/s2t-small-librispeech-asr')
+    model = AutoModelForSpeechSeq2Seq.from_pretrained('facebook/s2t-small-librispeech-asr')
+
     # Convert audio bytes to numpy array
     audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
-
+
     # Normalize audio array
     audio_tensor = torch.tensor(audio_array, dtype=torch.float64) / 32768.0
-
+
     # Provide inputs to the processor
     #inputs = processor(audio=audio_tensor, sampling_rate=16000, return_tensors="pt")
-    input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features
+    input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features
 
     # generate token ids
     predicted_ids = model.generate(input_features)
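
Reviewer note: the hunk above swaps openai/whisper-large for facebook/s2t-small-librispeech-asr, but the function still reloads the processor and model on every call and hands the feature extractor a float64 tensor. Since Streamlit reruns the whole script on each interaction, that means re-instantiating the model per recording. A minimal sketch of the same function with the checkpoint cached once per process and the waveform cast to float32; the model name comes from the diff, while st.cache_resource and the module-level refactor are suggestions, not part of this commit:

import numpy as np
import streamlit as st
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

@st.cache_resource  # load the checkpoint once per process, not on every rerun
def load_asr():
    processor = AutoProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
    model = AutoModelForSpeechSeq2Seq.from_pretrained("facebook/s2t-small-librispeech-asr")
    return processor, model

def transcribe_audio(audio_bytes):
    processor, model = load_asr()
    # 16-bit PCM -> float32 in [-1, 1]; the feature extractor accepts a NumPy
    # array directly, so the intermediate torch.tensor step is unnecessary
    audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
    audio = audio_array.astype(np.float32) / 32768.0
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    predicted_ids = model.generate(inputs.input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)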
@@ -34,10 +79,11 @@ def transcribe_audio(audio_bytes):
 st.title("Audio to Text Transcription..")
 
 audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
+st.write(audio_bytes)
 
 if audio_bytes:
     st.audio(audio_bytes, format="audio/wav")
-
+
     transcription = transcribe_audio(audio_bytes)
 
     if transcription:
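
One thing the new st.write(audio_bytes) debug line makes visible: the st.audio(..., format="audio/wav") playback suggests audio_recorder returns a complete WAV file as bytes, header included. If so, the np.frombuffer(audio_bytes, dtype=np.int16) call inside transcribe_audio also reinterprets the RIFF/WAV header as audio samples. A sketch that decodes the container with the standard-library wave module instead, assuming the widget records 16-bit mono PCM (an assumption, not confirmed by this diff):

import io
import wave
import numpy as np

def wav_bytes_to_float32(audio_bytes: bytes) -> np.ndarray:
    """Decode WAV bytes to a float32 waveform in [-1, 1], skipping the header."""
    with wave.open(io.BytesIO(audio_bytes)) as wav:
        assert wav.getsampwidth() == 2, "expects 16-bit PCM"
        assert wav.getnchannels() == 1, "expects mono audio"
        frames = wav.readframes(wav.getnframes())
    samples = np.frombuffer(frames, dtype=np.int16)
    return samples.astype(np.float32) / 32768.0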
@@ -47,4 +93,3 @@ if audio_bytes:
         st.write("Error: Failed to transcribe audio.")
 else:
     st.write("No audio recorded.")
-