Spaces:
Runtime error
Runtime error
File size: 2,230 Bytes
3c4ad65 d1035a0 3c4ad65 19f09f4 71a635a 1d97cff 6134bda 1d97cff 3c4ad65 1d97cff 3c4ad65 d2cfc63 3c4ad65 d2cfc63 3c4ad65 d2cfc63 3c4ad65 1d97cff 3c4ad65 1d97cff 19f09f4 3c4ad65 1d97cff 3c4ad65 1d97cff 3c4ad65 d2cfc63 3c4ad65 d2cfc63 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# Transcribe recorded audio to text (language detection is currently disabled).
# Author: Pratiksha Patel
# Description: This script records audio and transcribes it to text. Detecting the language of the transcription and saving it to a .txt file exist only as commented-out code below.
# import required modules
import os
import torch
import streamlit as st
from audio_recorder_streamlit import audio_recorder
from langdetect import detect
import numpy as np
# Use a pipeline as a high-level helper
#from transformers import pipeline
#pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large")
# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
_WHISPER_MODEL_ID = "openai/whisper-large"
_WHISPER_SAMPLE_RATE = 16000  # sampling rate passed to the Whisper processor
_PCM16_SAMPLE_WIDTH = 2  # bytes per 16-bit PCM sample


def _load_whisper():
    """Load and return the ``(processor, model)`` pair for the Whisper checkpoint."""
    processor = AutoProcessor.from_pretrained(_WHISPER_MODEL_ID)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(_WHISPER_MODEL_ID)
    return processor, model


def _pcm_from_audio_bytes(audio_bytes):
    """Decode recorder output into mono float32 samples in [-1, 1) at 16 kHz.

    Bytes that start with a RIFF header are parsed as a WAV file, so the
    header is not misread as audio and the channel count and sample rate
    come from the file itself. Anything else is treated as headerless
    16-bit mono PCM at 16 kHz.

    Raises:
        ValueError: if the WAV data is not 16-bit PCM.
        wave.Error: if the WAV container cannot be parsed.
    """
    # Function-scope imports keep this block self-contained.
    import io
    import wave

    if audio_bytes[:4] == b"RIFF":
        with wave.open(io.BytesIO(audio_bytes), "rb") as wav_file:
            channels = wav_file.getnchannels()
            sample_width = wav_file.getsampwidth()
            rate = wav_file.getframerate()
            frames = wav_file.readframes(wav_file.getnframes())
        if sample_width != _PCM16_SAMPLE_WIDTH:
            raise ValueError(
                f"Unsupported WAV sample width: {sample_width} bytes "
                f"(expected {_PCM16_SAMPLE_WIDTH})"
            )
    else:
        channels = 1
        rate = _WHISPER_SAMPLE_RATE
        frames = bytes(audio_bytes)

    # Drop a trailing partial frame so the buffer maps cleanly onto int16.
    frame_size = _PCM16_SAMPLE_WIDTH * channels
    frames = frames[: len(frames) - len(frames) % frame_size]
    if not frames:
        return np.zeros(0, dtype=np.float32)

    samples = np.frombuffer(frames, dtype="<i2").astype(np.float32) / 32768.0
    if channels > 1:
        # Samples are interleaved per frame; average the channels to mono.
        samples = samples.reshape(-1, channels).mean(axis=1)

    if rate != _WHISPER_SAMPLE_RATE:
        # Crude linear-interpolation resample (no anti-alias filter); it only
        # runs when the recording was not already captured at 16 kHz.
        target_len = max(1, int(round(samples.size * _WHISPER_SAMPLE_RATE / rate)))
        source_times = np.arange(samples.size) / rate
        target_times = np.arange(target_len) / _WHISPER_SAMPLE_RATE
        samples = np.interp(target_times, source_times, samples)

    return samples.astype(np.float32)


def transcribe_audio(audio_bytes):
    """Transcribe recorded audio to text with the Whisper model.

    Args:
        audio_bytes: Audio as returned by the recorder widget. The app plays
            these same bytes back as ``audio/wav``, so a WAV container is
            expected; headerless 16-bit mono PCM at 16 kHz is also accepted.

    Returns:
        The transcription with surrounding whitespace stripped, or ``""``
        when there is no audio to transcribe.

    Raises:
        ValueError: if the WAV data is not 16-bit PCM.
        wave.Error: if the WAV container cannot be parsed.
    """
    if not audio_bytes:
        return ""

    samples = _pcm_from_audio_bytes(audio_bytes)
    if samples.size == 0:
        return ""

    # Streamlit reruns the script on every interaction, so share one loaded
    # model across reruns when the installed Streamlit offers cache_resource.
    cache_resource = getattr(st, "cache_resource", None)
    load = cache_resource(_load_whisper) if cache_resource else _load_whisper
    processor, model = load()

    # Whisper's processor yields log-mel `input_features` (not `input_values`).
    features = processor(
        samples, sampling_rate=_WHISPER_SAMPLE_RATE, return_tensors="pt"
    ).input_features
    # Whisper is an encoder-decoder model: text comes from autoregressive
    # generation, not from an argmax over a single forward pass.
    # NOTE(review): Whisper works on 30-second windows; longer recordings are
    # presumably truncated by the feature extractor - confirm if that matters.
    predicted_ids = model.generate(features)
    text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return text.strip()
# Function to open a file
#def startfile(fn):
# os.system('open %s' % fn)
# Function to create and open a txt file
#def create_and_open_txt(text, filename):
# Create and write the text to a txt file
# with open(filename, "w") as file:
# file.write(text)
# startfile(filename)
# Streamlit app: show the recorder widget, play the captured clip back,
# and display its transcription (or a message when nothing usable came out).
st.title("Audio to Text Transcription..")
audio_bytes = audio_recorder(sample_rate=16_000, pause_threshold=3.0)
if not audio_bytes:
    # The recorder returned nothing on this run.
    st.write("No audio recorded.")
else:
    st.audio(audio_bytes, format="audio/wav")
    transcription = transcribe_audio(audio_bytes)
    if not transcription:
        st.write("Error: Failed to transcribe audio.")
    else:
        st.write("Transcription:")
        st.write(transcription)
# Detect the language
#language = detect(transcription)
#st.write(f"Detected language: {language}")
# Create and open a txt file with the text
#create_and_open_txt(transcription, f"output_{language}.txt")
|