aar2dee2 committed on
Commit
5042d26
·
1 Parent(s): ae363d1

convert numpy array to audio segment

Browse files
Files changed (2) hide show
  1. app.py +16 -2
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,5 +1,7 @@
1
  # # Import required libraries
2
 
 
 
3
  import vocode
4
  from vocode import getenv
5
  import gradio as gr
@@ -66,6 +68,18 @@ logger = logging.getLogger(__name__)
66
  logger.setLevel(logging.DEBUG)
67
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def main(input_audio):
70
  logger.info(f"Type of input_audio: {type(input_audio)}")
71
  logger.info(f"input_audio: {input_audio}")
@@ -88,7 +102,8 @@ def main(input_audio):
88
  while True:
89
  try:
90
  # Transcribe the input_audio using WhisperTranscriber
91
- transcript = transcriber.transcribe(input_audio)
 
92
  logger.info(f"Transcription: {transcript}")
93
  response = agent.generate_response(transcript)
94
  logger.info(f"Agent response: {response}")
@@ -118,7 +133,6 @@ file_translate = gr.Interface(
118
  fn=main,
119
  inputs=gr.Audio(source="upload", type="filepath", format="wav"),
120
  outputs=gr.Audio(label="Generated Speech", type="filepath", format="wav"),
121
- examples=[["./data/example/strauss-oppenheimer.wav"]],
122
  title=title,
123
  description=description,
124
  )
 
1
  # # Import required libraries
2
 
3
+ import numpy as np
4
+ from pydub import AudioSegment
5
  import vocode
6
  from vocode import getenv
7
  import gradio as gr
 
68
  logger.setLevel(logging.DEBUG)
69
 
70
 
71
def convert_to_audio_segment(input_audio):
    """Convert Gradio audio input into a pydub ``AudioSegment``.

    Args:
        input_audio: Either a ``(sample_rate, audio_data)`` pair, where
            ``audio_data`` is a NumPy array of samples shaped ``(n,)`` (mono)
            or ``(n, channels)``, or a path to an audio file on disk.

    Returns:
        An ``AudioSegment``. For array input it always holds 16-bit samples;
        for path input it is whatever ``AudioSegment.from_file`` decodes.
    """
    import os

    # The Interface in this file is declared with gr.Audio(type="filepath"),
    # in which case the callback presumably receives a path rather than a
    # (rate, array) pair -- unpacking a string would raise. Load it directly.
    if isinstance(input_audio, (str, os.PathLike)):
        return AudioSegment.from_file(input_audio)

    sample_rate, audio_data = input_audio
    audio_data = np.asarray(audio_data)

    # A 2-D array carries one column per channel; hard-coding mono would make
    # pydub misread interleaved stereo frames as a mono clip twice as long.
    channels = 1 if audio_data.ndim < 2 else audio_data.shape[1]

    if np.issubdtype(audio_data.dtype, np.floating):
        # Assumes float samples are normalised to [-1.0, 1.0] -- TODO confirm.
        # A bare astype(int16) would truncate every such sample to 0.
        audio_data = np.clip(audio_data, -1.0, 1.0) * 32767.0
    elif (
        np.issubdtype(audio_data.dtype, np.signedinteger)
        and audio_data.dtype.itemsize > 2
    ):
        # Keep the most significant 16 bits instead of letting astype wrap.
        audio_data = audio_data >> (8 * (audio_data.dtype.itemsize - 2))

    audio_data = audio_data.astype(np.int16)  # Convert to 16-bit data

    return AudioSegment(
        audio_data.tobytes(),  # C-order bytes: frames interleaved by channel
        frame_rate=sample_rate,
        sample_width=audio_data.dtype.itemsize,  # 2 bytes for 16-bit audio
        channels=channels,
    )
81
+
82
+
83
  def main(input_audio):
84
  logger.info(f"Type of input_audio: {type(input_audio)}")
85
  logger.info(f"input_audio: {input_audio}")
 
102
  while True:
103
  try:
104
  # Transcribe the input_audio using WhisperTranscriber
105
+ input_audio_segment = convert_to_audio_segment(input_audio)
106
+ transcript = transcriber.transcribe(input_audio_segment)
107
  logger.info(f"Transcription: {transcript}")
108
  response = agent.generate_response(transcript)
109
  logger.info(f"Agent response: {response}")
 
133
  fn=main,
134
  inputs=gr.Audio(source="upload", type="filepath", format="wav"),
135
  outputs=gr.Audio(label="Generated Speech", type="filepath", format="wav"),
 
136
  title=title,
137
  description=description,
138
  )
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  vocode[io]
2
  google-cloud-texttospeech
3
- gradio==3.43.2
 
 
1
  vocode[io]
2
  google-cloud-texttospeech
3
+ gradio==3.43.2
4
+ numpy