Spaces:
Runtime error
Runtime error
aar2dee2
committed on
Commit
·
5042d26
1
Parent(s):
ae363d1
convert numpy array to audio segment
Browse files
- app.py +16 -2
- requirements.txt +2 -1
app.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
# # Import required libraries
|
| 2 |
|
|
|
|
|
|
|
| 3 |
import vocode
|
| 4 |
from vocode import getenv
|
| 5 |
import gradio as gr
|
|
@@ -66,6 +68,18 @@ logger = logging.getLogger(__name__)
|
|
| 66 |
logger.setLevel(logging.DEBUG)
|
| 67 |
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
def main(input_audio):
|
| 70 |
logger.info(f"Type of input_audio: {type(input_audio)}")
|
| 71 |
logger.info(f"input_audio: {input_audio}")
|
|
@@ -88,7 +102,8 @@ def main(input_audio):
|
|
| 88 |
while True:
|
| 89 |
try:
|
| 90 |
# Transcribe the input_audio using WhisperTranscriber
|
| 91 |
-
|
|
|
|
| 92 |
logger.info(f"Transcription: {transcript}")
|
| 93 |
response = agent.generate_response(transcript)
|
| 94 |
logger.info(f"Agent response: {response}")
|
|
@@ -118,7 +133,6 @@ file_translate = gr.Interface(
|
|
| 118 |
fn=main,
|
| 119 |
inputs=gr.Audio(source="upload", type="filepath", format="wav"),
|
| 120 |
outputs=gr.Audio(label="Generated Speech", type="filepath", format="wav"),
|
| 121 |
-
examples=[["./data/example/strauss-oppenheimer.wav"]],
|
| 122 |
title=title,
|
| 123 |
description=description,
|
| 124 |
)
|
|
|
|
| 1 |
# # Import required libraries
|
| 2 |
|
| 3 |
+
import numpy as np
|
| 4 |
+
from pydub import AudioSegment
|
| 5 |
import vocode
|
| 6 |
from vocode import getenv
|
| 7 |
import gradio as gr
|
|
|
|
| 68 |
logger.setLevel(logging.DEBUG)
|
| 69 |
|
| 70 |
|
| 71 |
+
def convert_to_audio_segment(input_audio):
    """Build a pydub ``AudioSegment`` from Gradio audio input.

    Args:
        input_audio: Either a ``(sample_rate, audio_data)`` tuple, where
            ``audio_data`` is a numpy array of samples, or a ``str`` path to
            an audio file (what ``gr.Audio(type="filepath")`` passes in).

    Returns:
        An ``AudioSegment`` holding the audio as 16-bit PCM (tuple input) or
        as decoded by pydub (path input).
    """
    # The interface in this file is configured with type="filepath", so the
    # callback receives a path string rather than a (rate, samples) tuple.
    # Unpacking a string would fail, so let pydub load the file directly.
    if isinstance(input_audio, str):
        return AudioSegment.from_file(input_audio)

    sample_rate, audio_data = input_audio

    # NOTE(review): float input is truncated by this cast exactly as before;
    # if normalized floats in [-1, 1] can arrive here they need scaling first.
    audio_data = np.asarray(audio_data).astype(np.int16)  # 16-bit PCM samples

    # A 1-D array is mono. A 2-D array is assumed to be (samples, channels),
    # whose C-order bytes are already interleaved the way raw PCM expects.
    # Previously this was hard-coded to 1, which misreports stereo data.
    channels = 1 if audio_data.ndim < 2 else audio_data.shape[1]

    return AudioSegment(
        audio_data.tobytes(),  # raw interleaved PCM bytes
        frame_rate=sample_rate,
        sample_width=audio_data.dtype.itemsize,  # 2 bytes for int16
        channels=channels,
    )
|
| 81 |
+
|
| 82 |
+
|
| 83 |
def main(input_audio):
|
| 84 |
logger.info(f"Type of input_audio: {type(input_audio)}")
|
| 85 |
logger.info(f"input_audio: {input_audio}")
|
|
|
|
| 102 |
while True:
|
| 103 |
try:
|
| 104 |
# Transcribe the input_audio using WhisperTranscriber
|
| 105 |
+
input_audio_segment = convert_to_audio_segment(input_audio)
|
| 106 |
+
transcript = transcriber.transcribe(input_audio_segment)
|
| 107 |
logger.info(f"Transcription: {transcript}")
|
| 108 |
response = agent.generate_response(transcript)
|
| 109 |
logger.info(f"Agent response: {response}")
|
|
|
|
| 133 |
fn=main,
|
| 134 |
inputs=gr.Audio(source="upload", type="filepath", format="wav"),
|
| 135 |
outputs=gr.Audio(label="Generated Speech", type="filepath", format="wav"),
|
|
|
|
| 136 |
title=title,
|
| 137 |
description=description,
|
| 138 |
)
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
vocode[io]
|
| 2 |
google-cloud-texttospeech
|
| 3 |
-
gradio==3.43.2
|
|
|
|
|
|
| 1 |
vocode[io]
|
| 2 |
google-cloud-texttospeech
|
| 3 |
+
gradio==3.43.2
|
| 4 |
+
numpy
|