pratikshahp committed · verified
Commit 1d97cff · 1 Parent(s): 19f09f4

Update app.py

Files changed (1): app.py (+30 -9)
app.py CHANGED
@@ -6,9 +6,25 @@
 import os
 import streamlit as st
 from audio_recorder_streamlit import audio_recorder
-import whisper
 from langdetect import detect
+# Use a pipeline as a high-level helper
+from transformers import pipeline
+pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large")
+# Load model directly
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
+def transcribe_audio(audio_bytes):
+    processor = AutoProcessor.from_pretrained("openai/whisper-large")
+    model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")
+    audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
+    # Cast audio array to double precision and normalize
+    audio_tensor = torch.tensor(audio_array, dtype=torch.float64) / 32768.0
+    input_values = processor(audio_tensor, return_tensors="pt", sampling_rate=16000).input_values
+    logits = model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.decode(predicted_ids[0])
+    return transcription
+
 # Function to open a file
 def startfile(fn):
     os.system('open %s' % fn)
@@ -20,18 +36,23 @@ def create_and_open_txt(text, filename):
         file.write(text)
     startfile(filename)
 
-# Ask user to record audio
+# Streamlit app
 st.title("Audio to Text Transcription..")
+
 audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
 
-# Load the base model and transcribe the audio
-model = whisper.load_model("base")
-result = model.transcribe(audio_bytes)
-transcribed_text = result["text"]
-print(transcribed_text)
-st.write("Transcription:")
-st.write(transcribed_text)
+if audio_bytes:
+    st.audio(audio_bytes, format="audio/wav")
+
+    transcription = transcribe_audio(audio_bytes)
 
+    if transcription:
+        st.write("Transcription:")
+        st.write(transcription)
+    else:
+        st.write("Error: Failed to transcribe audio.")
+else:
+    st.write("No audio recorded.")
 # Detect the language
 language = detect(transcribed_text)
 st.write(f"Detected language: {language}")