# Transform audio to text, with language detection.
# Author: Pratiksha Patel
# Description: This script records audio, transcribes it to text, detects the
# language, and saves the result to a .txt file (the last two steps are
# sketched at the end of the file).
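#
# Dependencies (assumed install names, inferred from the imports below):
# streamlit, audio-recorder-streamlit, torch, numpy, transformers; the
# Speech2Text checkpoint also needs the sentencepiece package.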
# Import required modules
import io
import wave

import torch
import streamlit as st
from audio_recorder_streamlit import audio_recorder
import numpy as np
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

def transcribe_audio(audio_bytes):
    # Load the pretrained speech-to-text processor and model
    processor = AutoProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
    model = AutoModelForSpeechSeq2Seq.from_pretrained("facebook/s2t-small-librispeech-asr")

    # audio_recorder returns a complete WAV file, so parse it instead of
    # treating the raw bytes (header included) as PCM samples
    with wave.open(io.BytesIO(audio_bytes), "rb") as wav_file:
        frames = wav_file.readframes(wav_file.getnframes())

    # Convert the 16-bit PCM frames to a numpy array
    audio_array = np.frombuffer(frames, dtype=np.int16)

    # Normalize to float32 in [-1.0, 1.0], the range the feature extractor expects
    audio_float = audio_array.astype(np.float32) / 32768.0

    # Extract model input features from the waveform
    input_features = processor(audio_float, sampling_rate=16000, return_tensors="pt").input_features

    # Generate token ids and decode them to text
    with torch.no_grad():
        predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0] if transcription else ""

# Streamlit app
st.title("Audio to Text Transcription..")

audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
if audio_bytes:
    st.audio(audio_bytes, format="audio/wav")

    transcription = transcribe_audio(audio_bytes)

    if transcription:
        st.write("Transcription:")
        st.write(transcription)
    else:
        st.write("Error: Failed to transcribe audio.")
else:
    st.write("No audio recorded.")