Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -2,6 +2,51 @@
 # Author: Pratiksha Patel
 # Description: This script records the audio, transforms it to text, detects the language of the file and saves it to a txt file.
 # import required modules
+#import torch
+#import streamlit as st
+#from audio_recorder_streamlit import audio_recorder
+#import numpy as np
+#from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+
+#def transcribe_audio(audio_bytes):
+# processor = AutoProcessor.from_pretrained("openai/whisper-large")
+# model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")
+
+# Convert audio bytes to numpy array
+# audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
+
+# Normalize audio array
+#audio_tensor = torch.tensor(audio_array, dtype=torch.float64) / 32768.0
+
+# Provide inputs to the processor
+##inputs = processor(audio=audio_tensor, sampling_rate=16000, return_tensors="pt")
+#input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features
+
+# generate token ids
+#predicted_ids = model.generate(input_features)
+# decode token ids to text
+#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
+
+#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+#return transcription
+# Streamlit app
+#st.title("Audio to Text Transcription..")
+
+#audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
+
+#if audio_bytes:
+# st.audio(audio_bytes, format="audio/wav")
+
+# transcription = transcribe_audio(audio_bytes)
+
+# if transcription:
+# st.write("Transcription:")
+# st.write(transcription)
+#else:
+# st.write("Error: Failed to transcribe audio.")
+#else:
+# st.write("No audio recorded.")
+
 import torch
 import streamlit as st
 from audio_recorder_streamlit import audio_recorder
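Note: the header comment promises language detection and saving the transcript to a txt file, but neither the commented-out Whisper version nor the new code implements either step. If the txt export is still planned, Streamlit's built-in download_button is one option. A minimal sketch, assuming transcription holds the decoded list of strings; this is not part of the commit:

if transcription:
    # Hypothetical export step: offer the decoded text as a .txt download.
    st.download_button(
        label="Download transcript",
        data="\n".join(transcription),
        file_name="transcription.txt",
    )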
@@ -9,18 +54,18 @@ import numpy as np
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
 def transcribe_audio(audio_bytes):
-    processor = AutoProcessor.from_pretrained("openai/whisper-large")
-    model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")
-
+    processor = AutoProcessor.from_pretrained('facebook/s2t-small-librispeech-asr')
+    model = AutoModelForSpeechSeq2Seq.from_pretrained('facebook/s2t-small-librispeech-asr')
+
     # Convert audio bytes to numpy array
     audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
-
+
     # Normalize audio array
     audio_tensor = torch.tensor(audio_array, dtype=torch.float64) / 32768.0
-
+
     # Provide inputs to the processor
     #inputs = processor(audio=audio_tensor, sampling_rate=16000, return_tensors="pt")
-    input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features
+    input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features
 
     # generate token ids
     predicted_ids = model.generate(input_features)
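Note: this hunk swaps openai/whisper-large for the much smaller facebook/s2t-small-librispeech-asr checkpoint, but two likely causes of the Space's runtime error survive the swap: np.frombuffer is applied to the full WAV byte string returned by audio_recorder (RIFF header included), and the processor is handed a float64 torch tensor, while Hugging Face feature extractors expect float32 arrays in [-1, 1]. A minimal corrected sketch of transcribe_audio, not the committed code, assuming 16 kHz recordings; the s2t checkpoint may additionally need sentencepiece (and, on older transformers versions, torchaudio):

import io
import wave

import numpy as np
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

MODEL_ID = "facebook/s2t-small-librispeech-asr"  # same checkpoint as the commit

def transcribe_audio(audio_bytes):
    # audio_recorder returns a complete WAV file; parse it instead of
    # treating the raw bytes (header included) as PCM samples.
    with wave.open(io.BytesIO(audio_bytes), "rb") as wf:
        n_channels = wf.getnchannels()
        sample_rate = wf.getframerate()  # should be 16000 for this checkpoint
        pcm = wf.readframes(wf.getnframes())

    audio_array = np.frombuffer(pcm, dtype=np.int16)
    if n_channels > 1:
        # Downmix to mono; the model expects a single channel.
        audio_array = audio_array.reshape(-1, n_channels).mean(axis=1)

    # Feature extractors expect float32 in [-1, 1], not a float64 tensor.
    audio_array = audio_array.astype(np.float32) / 32768.0

    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)

    inputs = processor(audio_array, sampling_rate=sample_rate, return_tensors="pt")
    predicted_ids = model.generate(
        inputs["input_features"], attention_mask=inputs["attention_mask"]
    )
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)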
@@ -34,10 +79,11 @@ def transcribe_audio(audio_bytes):
 st.title("Audio to Text Transcription..")
 
 audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
+st.write(audio_bytes)
 
 if audio_bytes:
     st.audio(audio_bytes, format="audio/wav")
-
+
     transcription = transcribe_audio(audio_bytes)
 
     if transcription:
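Note: the added st.write(audio_bytes) dumps the raw WAV byte string into the page on every rerun, including when nothing has been recorded and audio_bytes is None. If the intent is only to confirm that audio arrived, a guarded size check is quieter. A sketch, not part of the commit:

if audio_bytes:
    # Hypothetical debug line: show how much audio was captured.
    st.write(f"Recorded {len(audio_bytes)} bytes of audio")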
@@ -47,4 +93,3 @@ if audio_bytes:
     st.write("Error: Failed to transcribe audio.")
 else:
     st.write("No audio recorded.")
-
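Note: in both versions the processor and model are constructed inside transcribe_audio, so every recording triggers a fresh from_pretrained load; that is slow and memory-hungry on a Space, especially with whisper-large. One way to hoist the load in Streamlit, assuming st.cache_resource is available (Streamlit 1.18+); a sketch, not the committed code:

import streamlit as st
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

MODEL_ID = "facebook/s2t-small-librispeech-asr"

@st.cache_resource  # load once per process and reuse across reruns
def load_asr():
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)
    return processor, model

processor, model = load_asr()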