# Audio-to-text transcription script with language detection.
# Author: Pratiksha Patel
# Description: This script records audio, transcribes it to text, detects the language, and saves the transcription to a .txt file.
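# Requirements (assumed pip package names for the imports below):
#   pip install streamlit audio-recorder-streamlit transformers torch numpy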
# import required modules
import io
import wave
import torch
import streamlit as st
from audio_recorder_streamlit import audio_recorder
import numpy as np
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
# Load the processor and model once and cache them so Streamlit does not
# reload them on every rerun (assumes Streamlit >= 1.18 for st.cache_resource).
# A multilingual model such as "openai/whisper-large" could be substituted here;
# facebook/s2t-small-librispeech-asr is English-only.
@st.cache_resource
def load_model():
    processor = AutoProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
    model = AutoModelForSpeechSeq2Seq.from_pretrained("facebook/s2t-small-librispeech-asr")
    return processor, model

def transcribe_audio(audio_bytes):
    processor, model = load_model()
    # audio_recorder returns WAV-encoded bytes; parse the container rather than
    # reading the raw buffer, which would include the WAV header as samples
    with wave.open(io.BytesIO(audio_bytes)) as wav_file:
        frames = wav_file.readframes(wav_file.getnframes())
    # Convert 16-bit PCM samples to a numpy array
    audio_array = np.frombuffer(frames, dtype=np.int16)
    # Normalize to float32 in [-1.0, 1.0], the range the feature extractor expects
    audio_float = audio_array.astype(np.float32) / 32768.0
    # Extract input features for the model
    input_features = processor(audio_float, sampling_rate=16000, return_tensors="pt").input_features
    # Generate token ids without tracking gradients
    with torch.no_grad():
        predicted_ids = model.generate(input_features)
    # Decode token ids to text, skipping special tokens
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0] if transcription else ""
# Streamlit app
st.title("Audio to Text Transcription")
audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
if audio_bytes:
    st.audio(audio_bytes, format="audio/wav")
    transcription = transcribe_audio(audio_bytes)
    if transcription:
        st.write("Transcription:")
        st.write(transcription)
    else:
        st.write("Error: Failed to transcribe audio.")
else:
    st.write("No audio recorded.")