Spaces:

Compumacy
/

wisalQA_P1

Runtime error

File size: 13,524 Bytes

a059ad0

import asyncio
import base64
import os
import time
from io import BytesIO
from google.genai import types
from google.genai.types import (
    LiveConnectConfig,
    SpeechConfig,
    VoiceConfig,
    PrebuiltVoiceConfig,
    Content,
    Part,
)
import gradio as gr
import numpy as np
import websockets
from dotenv import load_dotenv
from fastrtc import (
    AsyncAudioVideoStreamHandler,
    Stream,
    WebRTC,
    get_cloudflare_turn_credentials_async,
    wait_for_item,
)
from google import genai
from gradio.utils import get_space
from PIL import Image

# ------------------------------------------
import asyncio
import base64
import json
import os
import pathlib
from typing import AsyncGenerator, Literal

import gradio as gr
import numpy as np
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from fastrtc import (
    AsyncStreamHandler,
    Stream,
    get_cloudflare_turn_credentials_async,
    wait_for_item,
)
from google import genai
from google.genai.types import (
    LiveConnectConfig,
    PrebuiltVoiceConfig,
    SpeechConfig,
    VoiceConfig,
)
from gradio.utils import get_space
from pydantic import BaseModel
# ------------------------------------------------
from dotenv import load_dotenv
load_dotenv()
import os
import io
import asyncio
from pydub import AudioSegment

# Gemini: google-genai
from google import genai
# ---------------------------------------------------
# VAD imports from reference code
import collections
import webrtcvad
import time

# helper functions
GEMINI_API_KEY="AIzaSyCUCivstFpC9pq_jMHMYdlPrmh9Bx97dFo"

TAVILY_API_KEY="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"

OPENAI_API_KEY="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"

QDRANT_API_KEY="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIiwiZXhwIjoxNzUxMDUxNzg4fQ.I9J-K7OM0BtcNKgj2d4uVM8QYAHYfFCVAyP4rlZkK2E"

QDRANT_URL="https://6a3aade6-e8ad-4a6c-a579-21f5af90b7e8.us-east4-0.gcp.cloud.qdrant.io"

OPENAI_API_KEY="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"

WEAVIATE_URL="yorcqe2sqswhcaivxvt9a.c0.us-west3.gcp.weaviate.cloud"

WEAVIATE_API_KEY="d2d0VGdZQTBmdTFlOWdDZl9tT2h3WDVWd1NpT1dQWHdGK0xjR1hYeWxicUxHVnFRazRUSjY2VlRUVlkwPV92MjAw"

DEEPINFRA_API_KEY="285LUJulGIprqT6hcPhiXtcrphU04FG4"

DEEPINFRA_BASE_URL="https://api.deepinfra.com/v1/openai"
def encode_audio(data: np.ndarray) -> dict:
    """Encode Audio data to send to the server"""
    return {
        "mime_type": "audio/pcm",
        "data": base64.b64encode(data.tobytes()).decode("UTF-8"),
    }
def encode_audio2(data: np.ndarray) -> bytes:
    """Encode Audio data to send to the server"""
    return data.tobytes()

import soundfile as sf

def numpy_array_to_wav_bytes(audio_array, sample_rate=16000):
    buffer = io.BytesIO()
    sf.write(buffer, audio_array, sample_rate, format='WAV')
    return buffer.getvalue()


def numpy_array_to_wav_bytes(audio_array, sample_rate=16000):
    """

    Convert a NumPy audio array to WAV bytes.



    Args:

        audio_array (np.ndarray): Audio signal (1D or 2D).

        sample_rate (int): Sample rate in Hz.



    Returns:

        bytes: WAV-formatted audio data.

    """
    buffer = io.BytesIO()
    sf.write(buffer, audio_array, sample_rate, format='WAV')
    buffer.seek(0)
    return buffer.read()
# webrtc handler class
class GeminiHandler(AsyncStreamHandler):
    """Handler for the Gemini API with chained latency calculation."""

    def __init__(

        self,

        expected_layout: Literal["mono"] = "mono",

        output_sample_rate: int = 24000,prompt_dict: dict = {"prompt":"PHQ-9"},

    ) -> None:
        super().__init__(
            expected_layout,
            output_sample_rate,
            input_sample_rate=16000,
        )
        self.input_queue: asyncio.Queue = asyncio.Queue()
        self.output_queue: asyncio.Queue = asyncio.Queue()
        self.quit: asyncio.Event = asyncio.Event()
        self.prompt_dict = prompt_dict
        # self.model = "gemini-2.5-flash-preview-tts"
        self.model = "gemini-2.0-flash-live-001"
        self.t2t_model = "gemini-2.0-flash"
        self.s2t_model = "gemini-2.0-flash"

        # --- VAD Initialization ---
        self.vad = webrtcvad.Vad(3)
        self.VAD_RATE = 16000
        self.VAD_FRAME_MS = 20
        self.VAD_FRAME_SAMPLES = int(self.VAD_RATE * (self.VAD_FRAME_MS / 1000.0))
        self.VAD_FRAME_BYTES = self.VAD_FRAME_SAMPLES * 2
        padding_ms = 300
        self.vad_padding_frames = padding_ms // self.VAD_FRAME_MS
        self.vad_ring_buffer = collections.deque(maxlen=self.vad_padding_frames)
        self.vad_ratio = 0.9
        self.vad_triggered = False
        self.wav_data = bytearray()
        self.internal_buffer = bytearray()
        
        self.end_of_speech_time: float | None = None
        self.first_latency_calculated: bool = False

    def copy(self) -> "GeminiHandler":
        return GeminiHandler(
            expected_layout="mono",
            output_sample_rate=self.output_sample_rate,
            prompt_dict=self.prompt_dict,
        )

    def t2t(self, text: str) -> str:
        print(f"Sending text to Gemini: {text}")
        response = self.chat.send_message(text)
        print(f"Received response from Gemini: {response.text}")
        return response.text

    def s2t(self, audio) -> str:
        response = self.s2t_client.models.generate_content(
            model=self.s2t_model,
            contents=[
                types.Part.from_bytes(data=audio, mime_type='audio/wav'),
                'Generate a transcript of the speech.'
            ]
        )
        return response.text

    async def start_up(self):
        # Flag for if we are using text-to-text in the middle of the chain or not.
        self.t2t_bool = False
        self.sys_prompt = None
        
        self.t2t_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
        self.s2t_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))#, http_options={"api_version": "v1alpha"})
        if self.sys_prompt is not None: 
            chat_config = types.GenerateContentConfig(system_instruction=self.sys_prompt)
        else:
            chat_config = types.GenerateContentConfig(system_instruction="You are a helpful assistant.")
        self.chat = self.t2t_client.chats.create(model=self.t2t_model, config=chat_config)
        self.t2s_client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))

        voice_name = "Puck"
        if self.t2t_bool:
            sys_instruction = f""" You are Wisal, an AI assistant developed by Compumacy AI , and a knowledgeable Autism .

                                Your sole purpose is to provide helpful, respectful, and easy-to-understand answers about Autism Spectrum Disorder (ASD).

                                Always be clear, non-judgmental, and supportive."""
        else:
            sys_instruction = self.sys_prompt
            
        if sys_instruction is not None:
            config = LiveConnectConfig(
            response_modalities=["AUDIO"],
            speech_config=SpeechConfig(
                voice_config=VoiceConfig(
                    prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=voice_name)
                )
            ),
            system_instruction=Content(parts=[Part.from_text(text=sys_instruction)])
            )
        else:
            config = LiveConnectConfig(
                response_modalities=["AUDIO"],
                speech_config=SpeechConfig(
                    voice_config=VoiceConfig(
                        prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=voice_name)
                    )
                ),
            )
        
        async with self.t2s_client.aio.live.connect(model=self.model, config=config) as session:
            async for text_from_user in self.stream():
                print("--------------------------------------------")
                print(f"Received text from user and reading aloud: {text_from_user}")
                print("--------------------------------------------")
                if text_from_user and text_from_user.strip():
                    if self.t2t_bool:
                        prompt = f"""

                              You are Wisal, an AI assistant developed by Compumacy AI , and a knowledgeable Autism .

                                Your sole purpose is to provide helpful, respectful, and easy-to-understand answers about Autism Spectrum Disorder (ASD).

                                Always be clear, non-judgmental, and supportive.



                        {text_from_user}

                        """
                    else:
                        prompt = text_from_user
                    await session.send_client_content(
                        turns=types.Content(
                        role='user', parts=[types.Part(text=prompt)]))
                    async for resp_chunk in session.receive():
                        if resp_chunk.data:
                            array = np.frombuffer(resp_chunk.data, dtype=np.int16)
                            self.output_queue.put_nowait((self.output_sample_rate, array))
                    

    async def stream(self) -> AsyncGenerator[bytes, None]:
        while not self.quit.is_set():
            try:
                # Get the text message to be converted to speech
                text_to_speak = await self.input_queue.get()
                yield text_to_speak
            except (asyncio.TimeoutError, TimeoutError):
                pass
    
    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        sr, array = frame
        audio_bytes = array.tobytes()
        self.internal_buffer.extend(audio_bytes)

        while len(self.internal_buffer) >= self.VAD_FRAME_BYTES:
            vad_frame = self.internal_buffer[:self.VAD_FRAME_BYTES]
            self.internal_buffer = self.internal_buffer[self.VAD_FRAME_BYTES:]
            is_speech = self.vad.is_speech(vad_frame, self.VAD_RATE)

            if not self.vad_triggered:
                self.vad_ring_buffer.append((vad_frame, is_speech))
                num_voiced = len([f for f, speech in self.vad_ring_buffer if speech])
                if num_voiced > self.vad_ratio * self.vad_ring_buffer.maxlen:
                    print("Speech detected, starting to record...")
                    self.vad_triggered = True
                    for f, s in self.vad_ring_buffer:
                        self.wav_data.extend(f)
                    self.vad_ring_buffer.clear()
            else:
                self.wav_data.extend(vad_frame)
                self.vad_ring_buffer.append((vad_frame, is_speech))
                num_unvoiced = len([f for f, speech in self.vad_ring_buffer if not speech])
                if num_unvoiced > self.vad_ratio * self.vad_ring_buffer.maxlen:
                    print("End of speech detected.")
                    

                    self.end_of_speech_time = time.monotonic()
                    
                    self.vad_triggered = False
                    full_utterance_np = np.frombuffer(self.wav_data, dtype=np.int16)
                    audio_input_wav = numpy_array_to_wav_bytes(full_utterance_np, sr)

                    text_input = self.s2t(audio_input_wav)
                    if text_input and text_input.strip():
                        if self.t2t_bool:
                            text_message = self.t2t(text_input)           
                        else:
                            text_message = text_input
                        self.input_queue.put_nowait(text_message)
                    else:
                        print("STT returned empty transcript, skipping.")

                    self.vad_ring_buffer.clear()
                    self.wav_data = bytearray()

    async def emit(self) -> tuple[int, np.ndarray] | None:

        return await wait_for_item(self.output_queue)

    def shutdown(self) -> None:

        self.quit.set()
        

with gr.Blocks() as demo:
    gr.Markdown("# Gemini Chained Speech-to-Speech Demo")
    
    # for audio modality
    # with gr.Row(visible=(modality_selector.value == "audio")) as row2:
    with gr.Row() as row2:
        with gr.Column():  # Optional, can be removed if not needed
            webrtc2 = WebRTC(
                label="Audio Chat",
                modality="audio",
                mode="send-receive",
                elem_id="audio-source",
                rtc_configuration=get_cloudflare_turn_credentials_async,
                icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
                pulse_color="rgb(255, 255, 255)",
                icon_button_color="rgb(255, 255, 255)",
            )
            # Corrected inputs and outputs for webrtc2.stream to use webrtc2
            webrtc2.stream(
                GeminiHandler(),
                inputs=[webrtc2], # Was webrtc
                outputs=[webrtc2],# Was webrtc
                time_limit=180 if get_space() else None,
                concurrency_limit=2 if get_space() else None,
            )

if __name__ == "__main__":
    demo.launch(server_port=7860)