pratikshahp committed · verified
Commit 1d97cff · 1 Parent(s): 19f09f4

Update app.py

Files changed (1): app.py (+30 -9)
app.py CHANGED
@@ -6,9 +6,25 @@
 import os
 import streamlit as st
 from audio_recorder_streamlit import audio_recorder
-import whisper
 from langdetect import detect
+# Use a pipeline as a high-level helper
+from transformers import pipeline
+pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large")
+# Load model directly
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
+def transcribe_audio(audio_bytes):
+    processor = AutoProcessor.from_pretrained("openai/whisper-large")
+    model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")
+    audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
+    # Cast audio array to double precision and normalize
+    audio_tensor = torch.tensor(audio_array, dtype=torch.float64) / 32768.0
+    input_values = processor(audio_tensor, return_tensors="pt", sampling_rate=16000).input_values
+    logits = model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.decode(predicted_ids[0])
+    return transcription
+
 # Function to open a file
 def startfile(fn):
     os.system('open %s' % fn)
@@ -20,18 +36,23 @@ def create_and_open_txt(text, filename):
         file.write(text)
     startfile(filename)
 
-# Ask user to record audio
+# Streamlit app
 st.title("Audio to Text Transcription..")
+
 audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
 
-# Load the base model and transcribe the audio
-model = whisper.load_model("base")
-result = model.transcribe(audio_bytes)
-transcribed_text = result["text"]
-print(transcribed_text)
-st.write("Transcription:")
-st.write(transcribed_text)
+if audio_bytes:
+    st.audio(audio_bytes, format="audio/wav")
+
+    transcription = transcribe_audio(audio_bytes)
 
+    if transcription:
+        st.write("Transcription:")
+        st.write(transcription)
+    else:
+        st.write("Error: Failed to transcribe audio.")
+else:
+    st.write("No audio recorded.")
 # Detect the language
 language = detect(transcribed_text)
 st.write(f"Detected language: {language}")