Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -2,6 +2,51 @@
 # Author: Pratiksha Patel
 # Description: This script records the audio, transforms it to text, detects the language of the file and saves it to a txt file.
 # import required modules
+#import torch
+#import streamlit as st
+#from audio_recorder_streamlit import audio_recorder
+#import numpy as np
+#from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+
+#def transcribe_audio(audio_bytes):
+# processor = AutoProcessor.from_pretrained("openai/whisper-large")
+# model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")
+
+# Convert audio bytes to numpy array
+# audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
+
+# Normalize audio array
+#audio_tensor = torch.tensor(audio_array, dtype=torch.float64) / 32768.0
+
+# Provide inputs to the processor
+##inputs = processor(audio=audio_tensor, sampling_rate=16000, return_tensors="pt")
+#input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features
+
+# generate token ids
+#predicted_ids = model.generate(input_features)
+# decode token ids to text
+#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
+
+#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+#return transcription
+# Streamlit app
+#st.title("Audio to Text Transcription..")
+
+#audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
+
+#if audio_bytes:
+# st.audio(audio_bytes, format="audio/wav")
+
+# transcription = transcribe_audio(audio_bytes)
+
+# if transcription:
+# st.write("Transcription:")
+# st.write(transcription)
+#else:
+# st.write("Error: Failed to transcribe audio.")
+#else:
+# st.write("No audio recorded.")
+
 import torch
 import streamlit as st
 from audio_recorder_streamlit import audio_recorder
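Note: the header comment promises language detection and saving the transcript to a txt file, but neither the commented-out Whisper version nor the new code implements either step. If the txt export is still planned, Streamlit's built-in download_button is one option. A minimal sketch, assuming transcription holds the decoded list of strings; this is not part of the commit:

if transcription:
    # Hypothetical export step: offer the decoded text as a .txt download.
    st.download_button(
        label="Download transcript",
        data="\n".join(transcription),
        file_name="transcription.txt",
    )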
@@ -9,18 +54,18 @@ import numpy as np
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
 def transcribe_audio(audio_bytes):
-    processor = AutoProcessor.from_pretrained("openai/whisper-large")
-    model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")
-
+    processor = AutoProcessor.from_pretrained('facebook/s2t-small-librispeech-asr')
+    model = AutoModelForSpeechSeq2Seq.from_pretrained('facebook/s2t-small-librispeech-asr')
+
     # Convert audio bytes to numpy array
     audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
-
+
     # Normalize audio array
     audio_tensor = torch.tensor(audio_array, dtype=torch.float64) / 32768.0
-
+
     # Provide inputs to the processor
     #inputs = processor(audio=audio_tensor, sampling_rate=16000, return_tensors="pt")
-    input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features
+    input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features
 
     # generate token ids
     predicted_ids = model.generate(input_features)
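Note: this hunk swaps openai/whisper-large for the much smaller facebook/s2t-small-librispeech-asr checkpoint, but two likely causes of the Space's runtime error survive the swap: np.frombuffer is applied to the full WAV byte string returned by audio_recorder (RIFF header included), and the processor is handed a float64 torch tensor, while Hugging Face feature extractors expect float32 arrays in [-1, 1]. A minimal corrected sketch of transcribe_audio, not the committed code, assuming 16 kHz recordings; the s2t checkpoint may additionally need sentencepiece (and, on older transformers versions, torchaudio):

import io
import wave

import numpy as np
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

MODEL_ID = "facebook/s2t-small-librispeech-asr"  # same checkpoint as the commit

def transcribe_audio(audio_bytes):
    # audio_recorder returns a complete WAV file; parse it instead of
    # treating the raw bytes (header included) as PCM samples.
    with wave.open(io.BytesIO(audio_bytes), "rb") as wf:
        n_channels = wf.getnchannels()
        sample_rate = wf.getframerate()  # should be 16000 for this checkpoint
        pcm = wf.readframes(wf.getnframes())

    audio_array = np.frombuffer(pcm, dtype=np.int16)
    if n_channels > 1:
        # Downmix to mono; the model expects a single channel.
        audio_array = audio_array.reshape(-1, n_channels).mean(axis=1)

    # Feature extractors expect float32 in [-1, 1], not a float64 tensor.
    audio_array = audio_array.astype(np.float32) / 32768.0

    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)

    inputs = processor(audio_array, sampling_rate=sample_rate, return_tensors="pt")
    predicted_ids = model.generate(
        inputs["input_features"], attention_mask=inputs["attention_mask"]
    )
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)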
@@ -34,10 +79,11 @@ def transcribe_audio(audio_bytes):
 st.title("Audio to Text Transcription..")
 
 audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
+st.write(audio_bytes)
 
 if audio_bytes:
     st.audio(audio_bytes, format="audio/wav")
-
+
     transcription = transcribe_audio(audio_bytes)
 
     if transcription:
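Note: the added st.write(audio_bytes) dumps the raw WAV byte string into the page on every rerun, including when nothing has been recorded and audio_bytes is None. If the intent is only to confirm that audio arrived, a guarded size check is quieter. A sketch, not part of the commit:

if audio_bytes:
    # Hypothetical debug line: show how much audio was captured.
    st.write(f"Recorded {len(audio_bytes)} bytes of audio")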
@@ -47,4 +93,3 @@ if audio_bytes:
     st.write("Error: Failed to transcribe audio.")
 else:
     st.write("No audio recorded.")
-
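Note: in both versions the processor and model are constructed inside transcribe_audio, so every recording triggers a fresh from_pretrained load; that is slow and memory-hungry on a Space, especially with whisper-large. One way to hoist the load in Streamlit, assuming st.cache_resource is available (Streamlit 1.18+); a sketch, not the committed code:

import streamlit as st
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

MODEL_ID = "facebook/s2t-small-librispeech-asr"

@st.cache_resource  # load once per process and reuse across reruns
def load_asr():
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)
    return processor, model

processor, model = load_asr()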