File size: 2,230 Bytes
3c4ad65
 
 
 
 
 
d1035a0
3c4ad65
 
19f09f4
71a635a
1d97cff
6134bda
 
1d97cff
 
3c4ad65
1d97cff
 
 
 
 
 
 
 
 
 
 
 
3c4ad65
d2cfc63
 
3c4ad65
 
d2cfc63
3c4ad65
d2cfc63
 
 
3c4ad65
1d97cff
3c4ad65
1d97cff
19f09f4
3c4ad65
1d97cff
 
 
 
3c4ad65
1d97cff
 
 
 
 
 
 
3c4ad65
d2cfc63
 
3c4ad65
 
d2cfc63
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# Audio-to-text transcription script with language detection.
# Author: Pratiksha Patel

# Description: This script records audio, transcribes it to text, detects the language of the transcription, and saves it to a txt file.
# import required modules
import os
import torch
import streamlit as st
from audio_recorder_streamlit import audio_recorder
from langdetect import detect
import numpy as np
# Use a pipeline as a high-level helper
#from transformers import pipeline
#pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large")
# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

def _pcm_from_audio_bytes(audio_bytes, target_rate=16000):
    """Decode recorded audio bytes into a mono float32 waveform in [-1, 1).

    RIFF/WAVE input is parsed with the stdlib ``wave`` module so the header is
    not mistaken for samples; anything else is treated as raw 16-bit
    little-endian mono PCM already sampled at ``target_rate``.

    Args:
        audio_bytes: WAV file content or raw 16-bit PCM.
        target_rate: Sample rate (Hz) the returned waveform must have.

    Returns:
        A 1-D ``numpy.float32`` array (possibly empty).

    Raises:
        ValueError: If the WAV data is malformed or is not 16-bit PCM.
    """
    # Local imports: the file's import block does not provide these modules.
    import io
    import wave

    channels, frame_rate, frames = 1, target_rate, audio_bytes
    if bytes(audio_bytes[:4]) == b"RIFF":
        try:
            with wave.open(io.BytesIO(audio_bytes), "rb") as wav_file:
                channels = wav_file.getnchannels()
                sample_width = wav_file.getsampwidth()
                frame_rate = wav_file.getframerate()
                frames = wav_file.readframes(wav_file.getnframes())
        except (wave.Error, EOFError) as err:
            raise ValueError(f"Could not parse WAV audio: {err}") from err
        if sample_width != 2:
            raise ValueError(
                f"Unsupported audio format: {8 * sample_width}-bit samples "
                "(expected 16-bit PCM)."
            )

    # Drop a trailing partial frame so the buffer length is a whole number
    # of (channels x int16) frames.
    frame_size = 2 * channels
    usable = len(frames) - len(frames) % frame_size
    samples = np.frombuffer(frames[:usable], dtype="<i2").astype(np.float32)
    samples /= 32768.0

    if channels > 1:
        # Samples are interleaved per frame; average the channels to mono.
        samples = samples.reshape(-1, channels).mean(axis=1)

    if samples.size and frame_rate != target_rate:
        # Fallback linear resampling (no anti-alias filter); only used when
        # the recording was not captured at the target rate.
        new_len = max(1, int(round(samples.size * target_rate / frame_rate)))
        src_t = np.arange(samples.size) / frame_rate
        dst_t = np.arange(new_len) / target_rate
        samples = np.interp(dst_t, src_t, samples).astype(np.float32)

    return samples


def _load_whisper(model_id="openai/whisper-large"):
    """Load the Whisper processor and model for ``model_id`` in eval mode."""
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
    model.eval()
    return processor, model


def transcribe_audio(audio_bytes):
    """Transcribe recorded audio to text with ``openai/whisper-large``.

    Args:
        audio_bytes: Audio as returned by the recorder widget (WAV bytes), or
            raw 16-bit mono PCM sampled at 16 kHz.

    Returns:
        The transcription with special tokens removed and surrounding
        whitespace stripped. Returns ``""`` when ``audio_bytes`` is empty or
        contains no samples, so callers can treat a falsy result as failure.

    Raises:
        ValueError: If the audio is a malformed WAV or is not 16-bit PCM.
    """
    if not audio_bytes:
        return ""

    waveform = _pcm_from_audio_bytes(audio_bytes, target_rate=16000)
    if waveform.size == 0:
        return ""

    # Streamlit re-runs the whole script on every interaction; cache_resource
    # keeps the (very large) model in memory instead of reloading it per call.
    processor, model = st.cache_resource(_load_whisper)()

    # Whisper's processor produces log-mel ``input_features`` (not
    # ``input_values``), and the model is an encoder-decoder, so text must be
    # produced with ``generate`` rather than an argmax over forward() logits.
    features = processor(
        waveform, sampling_rate=16000, return_tensors="pt"
    ).input_features
    with torch.no_grad():
        predicted_ids = model.generate(features)

    texts = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return texts[0].strip()
    
# Function to open a file
#def startfile(fn):
 #   os.system('open %s' % fn)
    
# Function to create and open a txt file
#def create_and_open_txt(text, filename):
    # Create and write the text to a txt file
 #   with open(filename, "w") as file:
  #      file.write(text)
   # startfile(filename)

# Streamlit app: record from the microphone, play the clip back, and show
# the transcription (or a status message when there is nothing to show).
st.title("Audio to Text Transcription..")

# The recorder widget yields a falsy value until a recording is available.
audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)

if not audio_bytes:
    st.write("No audio recorded.")
else:
    # Let the user replay what was captured before showing the text.
    st.audio(audio_bytes, format="audio/wav")

    transcription = transcribe_audio(audio_bytes)

    if not transcription:
        st.write("Error: Failed to transcribe audio.")
    else:
        st.write("Transcription:")
        st.write(transcription)
# Detect the language
#language = detect(transcription)
#st.write(f"Detected language: {language}")

# Create and open a txt file with the text
#create_and_open_txt(transcription, f"output_{language}.txt")