"""FastAPI service that answers a text prompt with synthesized speech.

The prompt is sent to a hosted chat model for a one-line reply; the reply is
then synthesized with a Kokoro pipeline and returned as raw 16-bit PCM.
"""

import os
import threading
from typing import Optional

import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI, Response
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import FileResponse
from huggingface_hub import InferenceClient
from kokoro import KPipeline

# Chat model queried for the text reply.
LLM_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# Instruction appended to every user prompt.
ONE_LINE_INSTRUCTION = "describe in one line only"
# Sample rate advertised to clients in the X-Sample-Rate header.
SAMPLE_RATE_HZ = 24000


def llm_chat_response(text: str) -> str:
    """Ask the hosted chat model for a one-line reply to *text*.

    The Hugging Face token is read from the ``HF_TOKEN`` environment
    variable on every call.

    Args:
        text: User prompt. The one-line instruction is appended to it,
            separated by a space so the two do not run together.

    Returns:
        The message content of the first completion choice.
    """
    client = InferenceClient(api_key=os.getenv("HF_TOKEN"))
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": f"{text} {ONE_LINE_INSTRUCTION}"},
            ],
        }
    ]
    completion = client.chat.completions.create(
        model=LLM_MODEL,
        messages=messages,
        max_tokens=500,
    )
    return completion.choices[0].message["content"]


app = FastAPI()

# Initialize pipeline once at startup
pipeline = KPipeline(lang_code='a')

# Synthesis runs in worker threads (see generate_audio). The shared pipeline
# is not known to be safe for concurrent use, so only one request at a time
# is allowed to drive it.
_pipeline_lock = threading.Lock()


def _first_segment_pcm(text: str, voice: str, speed: float) -> Optional[bytes]:
    """Synthesize *text* and return the first segment as 16-bit PCM bytes.

    Only the first segment yielded by the pipeline is used; later segments
    are never generated.

    Args:
        text: Text to speak.
        voice: Voice identifier passed through to the pipeline.
        speed: Speed factor passed through to the pipeline.

    Returns:
        Little-endian signed 16-bit samples, or ``None`` when the pipeline
        yields no segment at all.
    """
    with _pipeline_lock:
        segments = pipeline(
            text,
            voice=voice,
            speed=speed,
            split_pattern=r'\n+',
        )
        first = next(iter(segments), None)
    if first is None:
        return None

    _gs, _ps, audio = first
    # detach() guards against grad-tracking tensors, on which .numpy() raises.
    samples = audio.detach().cpu().numpy()
    # Keep samples inside [-1, 1] so scaling cannot overflow int16.
    samples = np.clip(samples, -1, 1)
    # "<i2" pins little-endian int16 regardless of host byte order, matching
    # the X-Endianness header sent with the response.
    return (samples * 32767).astype("<i2").tobytes()


@app.post("/generate")
async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
    """Generate a spoken one-line reply to *text*.

    Args:
        text: Prompt forwarded to the chat model.
        voice: Voice identifier for synthesis.
        speed: Speed factor for synthesis.

    Returns:
        A response carrying raw PCM audio (format described by the
        ``X-Sample-Rate``, ``X-Bits-Per-Sample`` and ``X-Endianness``
        headers), or a 400 response when no audio was generated.
    """
    # Both steps block (network call, then CPU-bound synthesis); running them
    # in the thread pool keeps the event loop free for other requests.
    text_reply = await run_in_threadpool(llm_chat_response, text)
    raw_audio = await run_in_threadpool(
        _first_segment_pcm, text_reply, voice, speed
    )
    if raw_audio is None:
        return Response("No audio generated", status_code=400)

    # Return PCM data with minimal necessary headers
    return Response(
        content=raw_audio,
        media_type="application/octet-stream",
        headers={
            "Content-Disposition": 'attachment; filename="output.pcm"',
            "X-Sample-Rate": str(SAMPLE_RATE_HZ),
            "X-Bits-Per-Sample": "16",
            "X-Endianness": "little",
        },
    )