Spaces:

diabolic6045
/

tts-api

Sleeping

tts-api / app.py

Avinyaa

1b567fa 3 months ago

6.21 kB

	# Import configuration first so it can set up the environment before the other imports run
	import app_config

	from fastapi import FastAPI, HTTPException, Form
	from fastapi.responses import FileResponse
	from pydantic import BaseModel
	from kokoro import KPipeline
	import soundfile as sf
	import torch
	import os
	import tempfile
	import uuid
	import logging
	from typing import Optional

	# Configure root logging at INFO level and create this module's logger,
	# which the service class and the route handlers below write to.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# FastAPI application object; the @app.get / @app.post decorators further
	# down register their handlers on it.
	app = FastAPI(title="Kokoro TTS API", description="Text-to-Speech API using Kokoro", version="1.0.0")

	class TTSRequest(BaseModel):
	    """JSON request body accepted by the ``/tts-json`` endpoint."""

	    # Text to synthesize. Required; the endpoint rejects blank values.
	    text: str
	    # Kokoro voice name; the endpoint checks it against the service's voice list.
	    voice: str = "af_heart"
	    # Language code handed to ``KPipeline``; "a" is the code the service
	    # loads at start-up.
	    lang_code: str = "a"

	class KokoroTTSService:
	    """Wrap a Kokoro ``KPipeline`` and render text to WAV files on disk.

	    Attributes set by ``__init__``:
	        device: ``"cuda"`` when torch reports a CUDA device, else ``"cpu"``.
	        pipeline: the currently loaded ``KPipeline``; replaced when a request
	            asks for a different ``lang_code``.
	    """

	    # Sample rate (Hz) passed to soundfile when writing the generated audio.
	    SAMPLE_RATE = 24000

	    def __init__(self):
	        """Pick the torch device and load the default (``lang_code='a'``) pipeline.

	        Any exception raised while constructing the pipeline is logged and
	        re-raised unchanged, so a failed model load aborts start-up.
	        """
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"
	        logger.info(f"Using device: {self.device}")

	        if app_config.is_hf_spaces():
	            logger.info("Running on Hugging Face Spaces")

	        try:
	            # Initialize Kokoro pipeline following the working example pattern
	            logger.info("Initializing Kokoro TTS pipeline...")
	            self.pipeline = KPipeline(lang_code='a')
	            logger.info("Kokoro TTS pipeline loaded successfully")
	        except Exception as e:
	            logger.error(f"Failed to load Kokoro TTS pipeline: {e}")
	            # Bare raise keeps the original traceback intact.
	            raise

	    def generate_speech(self, text: str, voice: str = "af_heart", lang_code: str = "a") -> str:
	        """Synthesize ``text`` and return the path of the WAV file written.

	        Args:
	            text: Text to speak.
	            voice: Kokoro voice name forwarded to the pipeline.
	            lang_code: Language code; when it differs from the loaded
	                pipeline's ``lang_code`` a new pipeline is created first.

	        Returns:
	            Path to a uniquely named ``.wav`` file inside
	            ``app_config.get_temp_dir()``. The caller owns the file and is
	            responsible for deleting it.

	        Raises:
	            HTTPException: status 500 if synthesis fails for any reason,
	                including the pipeline producing no audio at all.
	        """
	        try:
	            # Create a unique filename for the output
	            output_filename = f"kokoro_output_{uuid.uuid4().hex}.wav"
	            output_path = os.path.join(app_config.get_temp_dir(), output_filename)

	            # Update pipeline language if different
	            if self.pipeline.lang_code != lang_code:
	                logger.info(f"Switching language from {self.pipeline.lang_code} to {lang_code}")
	                self.pipeline = KPipeline(lang_code=lang_code)

	            # The pipeline is a generator that can yield several segments for
	            # one input; collect all of them so that longer or multi-line
	            # text is not cut off after the first segment.
	            segments = []
	            for i, (gs, ps, audio) in enumerate(self.pipeline(text, voice=voice)):
	                logger.info(f"Generated audio segment {i}: gs={gs}, ps={ps}")
	                if audio is None:
	                    # Defensive: skip a segment that carries no audio.
	                    continue
	                # as_tensor accepts both torch tensors and numpy arrays.
	                segments.append(torch.as_tensor(audio).detach().cpu().flatten())

	            if not segments:
	                # Without this the caller would receive a path to a file
	                # that was never written.
	                raise RuntimeError("Kokoro produced no audio for the given text")

	            waveform = torch.cat(segments).numpy()
	            sf.write(output_path, waveform, self.SAMPLE_RATE)

	            return output_path
	        except Exception as e:
	            logger.error(f"Error generating speech: {e}")
	            raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}") from e

	    def get_available_voices(self):
	        """Return the list of voice names this API accepts.

	        The list is hard-coded here; a new list object is returned on every
	        call, so callers may mutate the result freely.
	        """
	        # Extended list based on the working example
	        return [
	            "af_heart", "af_bella", "af_nicole", "af_aoede", "af_kore",
	            "af_sarah", "af_nova", "af_sky", "af_alloy", "af_jessica", "af_river",
	            "am_michael", "am_fenrir", "am_puck", "am_echo", "am_eric",
	            "am_liam", "am_onyx", "am_santa", "am_adam",
	            "bf_emma", "bf_isabella", "bf_alice", "bf_lily",
	            "bm_george", "bm_fable", "bm_lewis", "bm_daniel",
	        ]

	# Create the single shared TTS service at import time. Its constructor
	# loads the Kokoro pipeline and re-raises on failure, so a broken model
	# load stops the module from importing.
	tts_service = KokoroTTSService()

	@app.get("/")
	async def root():
	    # Landing route: a fixed payload confirming that the API is up.
	    payload = {"message": "Kokoro TTS API is running", "status": "healthy"}
	    return payload

	@app.get("/health")
	async def health_check():
	    # Liveness probe that also reports the device chosen by the shared service.
	    active_device = tts_service.device
	    return dict(status="healthy", device=active_device)

	@app.get("/voices")
	async def get_voices():
	    """Get list of available voices"""
	    # Ask the shared service for its voice names and wrap them in a JSON object.
	    voice_names = tts_service.get_available_voices()
	    return {"voices": voice_names}

	@app.post("/tts")
	async def text_to_speech(
	    text: str = Form(...),
	    voice: str = Form("af_heart"),
	    lang_code: str = Form("a"),
	):
	    """
	    Convert text to speech using Kokoro TTS

	    - text: The text to convert to speech
	    - voice: Voice to use (default: "af_heart")
	    - lang_code: Kokoro language code (default: "a", American English)
	    """
	    # Imported locally so this handler does not depend on a change to the
	    # module's import block.
	    from fastapi import BackgroundTasks

	    def _discard(path: str) -> None:
	        # Best-effort removal of the temporary WAV once it has been sent.
	        try:
	            os.remove(path)
	        except OSError as exc:
	            logger.warning(f"Could not remove temporary file {path}: {exc}")

	    if not text.strip():
	        raise HTTPException(status_code=400, detail="Text cannot be empty")

	    # Validate voice
	    available_voices = tts_service.get_available_voices()
	    if voice not in available_voices:
	        raise HTTPException(
	            status_code=400,
	            detail=f"Voice '{voice}' not available. Available voices: {available_voices}"
	        )

	    try:
	        # Generate speech
	        output_path = tts_service.generate_speech(text, voice, lang_code)
	    except HTTPException:
	        # Already carries the right status and detail; re-wrapping it would
	        # turn the detail into "500: ...".
	        raise
	    except Exception as e:
	        logger.error(f"Error in TTS endpoint: {e}")
	        raise HTTPException(status_code=500, detail=str(e)) from e

	    # Delete the generated file after the response body has been sent so
	    # temp files do not pile up on disk.
	    cleanup = BackgroundTasks()
	    cleanup.add_task(_discard, output_path)

	    # No explicit Content-Disposition header here: FileResponse derives
	    # 'attachment; filename="..."' from `filename`, and a pre-set header
	    # would suppress the filename part.
	    return FileResponse(
	        output_path,
	        media_type="audio/wav",
	        filename=f"kokoro_tts_{voice}_{uuid.uuid4().hex}.wav",
	        background=cleanup,
	    )

	@app.post("/tts-json")
	async def text_to_speech_json(request: TTSRequest):
	    """
	    Convert text to speech using JSON request body

	    - request: TTSRequest containing text, voice, and lang_code
	      (lang_code is a Kokoro language code; the default "a" is American English)
	    """
	    # Imported locally so this handler does not depend on a change to the
	    # module's import block.
	    from fastapi import BackgroundTasks

	    def _discard(path: str) -> None:
	        # Best-effort removal of the temporary WAV once it has been sent.
	        try:
	            os.remove(path)
	        except OSError as exc:
	            logger.warning(f"Could not remove temporary file {path}: {exc}")

	    if not request.text.strip():
	        raise HTTPException(status_code=400, detail="Text cannot be empty")

	    # Validate voice
	    available_voices = tts_service.get_available_voices()
	    if request.voice not in available_voices:
	        raise HTTPException(
	            status_code=400,
	            detail=f"Voice '{request.voice}' not available. Available voices: {available_voices}"
	        )

	    try:
	        # Generate speech
	        output_path = tts_service.generate_speech(request.text, request.voice, request.lang_code)
	    except HTTPException:
	        # Already carries the right status and detail; re-wrapping it would
	        # turn the detail into "500: ...".
	        raise
	    except Exception as e:
	        logger.error(f"Error in TTS JSON endpoint: {e}")
	        raise HTTPException(status_code=500, detail=str(e)) from e

	    # Delete the generated file after the response body has been sent so
	    # temp files do not pile up on disk.
	    cleanup = BackgroundTasks()
	    cleanup.add_task(_discard, output_path)

	    # No explicit Content-Disposition header here: FileResponse derives
	    # 'attachment; filename="..."' from `filename`, and a pre-set header
	    # would suppress the filename part.
	    return FileResponse(
	        output_path,
	        media_type="audio/wav",
	        filename=f"kokoro_tts_{request.voice}_{uuid.uuid4().hex}.wav",
	        background=cleanup,
	    )