Commit 6efcad4
sachin committed · Parent(s): 7222675

test

Files changed:
- requirements.txt +162 -1
- src/server/main.py +97 -236
requirements.txt CHANGED

@@ -33,4 +33,165 @@ uvicorn
 fastapi
 pydub
 python-multipart
-hf_xet
+hf_xet
+
+
+accelerate==1.6.0
+aiofiles==23.2.1
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.16
+aiosignal==1.3.2
+annotated-types==0.7.0
+antlr4-python3-runtime==4.9.3
+anyio==4.9.0
+async-timeout==5.0.1
+attrs==25.3.0
+audioread==3.0.1
+bitsandbytes==0.45.5
+boto3==1.37.29
+botocore==1.37.29
+cached_path==1.7.1
+cachetools==5.5.2
+certifi==2025.1.31
+cffi==1.17.1
+charset-normalizer==3.4.1
+click==8.1.8
+contourpy==1.3.1
+cycler==0.12.1
+datasets==3.5.0
+decorator==5.2.1
+dill==0.3.8
+docker-pycreds==0.4.0
+einops==0.8.1
+einx==0.3.0
+ema-pytorch==0.7.7
+encodec==0.1.1
+exceptiongroup==1.2.2
+f5-tts==1.1.0
+fastapi==0.115.12
+ffmpy==0.5.0
+filelock==3.18.0
+fonttools==4.57.0
+frozendict==2.4.6
+frozenlist==1.5.0
+fsspec==2024.12.0
+gitdb==4.0.12
+GitPython==3.1.44
+google-api-core==2.24.2
+google-auth==2.38.0
+google-cloud-core==2.4.3
+google-cloud-storage==2.19.0
+google-crc32c==1.7.1
+google-resumable-media==2.7.2
+googleapis-common-protos==1.69.2
+gradio==5.14.0
+gradio_client==1.7.0
+groovy==0.1.2
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.27.1
+hydra-core==1.3.2
+idna==3.10
+jieba==0.42.1
+Jinja2==3.1.6
+jmespath==1.0.1
+joblib==1.4.2
+kiwisolver==1.4.8
+lazy_loader==0.4
+librosa==0.11.0
+llvmlite==0.44.0
+loguru==0.7.3
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.10.1
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.3.2
+multiprocess==0.70.16
+networkx==3.4.2
+numba==0.61.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+omegaconf==2.3.0
+orjson==3.10.16
+packaging==24.2
+pandas==2.2.3
+pillow==11.1.0
+platformdirs==4.3.7
+pooch==1.8.2
+propcache==0.3.1
+proto-plus==1.26.1
+protobuf==5.29.4
+psutil==7.0.0
+pyarrow==19.0.1
+pyasn1==0.6.1
+pyasn1_modules==0.4.2
+pycparser==2.22
+pydantic==2.10.6
+pydantic_core==2.27.2
+pydub==0.25.1
+Pygments==2.19.1
+pyparsing==3.2.3
+pypinyin==0.54.0
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2025.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rsa==4.9
+ruff==0.11.4
+s3transfer==0.11.4
+safehttpx==0.1.6
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.15.2
+semantic-version==2.10.0
+sentry-sdk==2.25.1
+setproctitle==1.3.5
+shellingham==1.5.4
+six==1.17.0
+smmap==5.0.2
+sniffio==1.3.1
+soundfile==0.13.1
+soxr==0.5.0.post1
+starlette==0.46.1
+sympy==1.13.1
+threadpoolctl==3.6.0
+tokenizers==0.21.1
+tomli==2.2.1
+tomlkit==0.13.2
+torch==2.6.0
+torchaudio==2.6.0
+torchdiffeq==0.2.5
+tqdm==4.67.1
+transformers==4.50.3
+transformers-stream-generator==0.0.5
+triton==3.2.0
+typer==0.15.2
+typing-inspection==0.4.0
+typing_extensions==4.13.1
+tzdata==2025.2
+urllib3==2.3.0
+uvicorn==0.34.0
+vocos==0.1.0
+wandb==0.19.9
+websockets==14.2
+x-transformers==2.2.8
+xxhash==3.5.0
+yarl==1.19.0
src/server/main.py CHANGED

@@ -30,7 +30,6 @@ import zipfile
 import soundfile as sf
 import torch
 from fastapi import Body, FastAPI, HTTPException, Response
-from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 import numpy as np
 from config import SPEED, ResponseFormat, config
@@ -64,98 +63,12 @@ if torch.cuda.is_available():
 else:
     print("CUDA is not available on this system.")
 
-class TTSModelManager:
-    def __init__(self):
-        self.model_tokenizer: OrderedDict[
-            str, tuple[ParlerTTSForConditionalGeneration, AutoTokenizer, AutoTokenizer]
-        ] = OrderedDict()
-        self.max_length = 50
-
-    def load_model(
-        self, model_name: str
-    ) -> tuple[ParlerTTSForConditionalGeneration, AutoTokenizer, AutoTokenizer]:
-        logger.debug(f"Loading {model_name}...")
-        start = time.perf_counter()
-
-        model_name = "ai4bharat/indic-parler-tts"
-        attn_implementation = "flash_attention_2"
-
-        model = ParlerTTSForConditionalGeneration.from_pretrained(
-            model_name,
-            attn_implementation=attn_implementation
-        ).to(device, dtype=torch_dtype)
-
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
-
-        # Set pad tokens
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-        if description_tokenizer.pad_token is None:
-            description_tokenizer.pad_token = description_tokenizer.eos_token
-
-        # TODO - temporary disable -torch.compile
-        '''
-        # Update model configuration
-        model.config.pad_token_id = tokenizer.pad_token_id
-        # Update for deprecation: use max_batch_size instead of batch_size
-        if hasattr(model.generation_config.cache_config, 'max_batch_size'):
-            model.generation_config.cache_config.max_batch_size = 1
-        model.generation_config.cache_implementation = "static"
-        '''
-        # Compile the model
-        compile_mode = "default"
-        #compile_mode = "reduce-overhead"
-
-        model.forward = torch.compile(model.forward, mode=compile_mode)
-
-        # Warmup
-        warmup_inputs = tokenizer("Warmup text for compilation",
-                                  return_tensors="pt",
-                                  padding="max_length",
-                                  max_length=self.max_length).to(device)
-
-        model_kwargs = {
-            "input_ids": warmup_inputs["input_ids"],
-            "attention_mask": warmup_inputs["attention_mask"],
-            "prompt_input_ids": warmup_inputs["input_ids"],
-            "prompt_attention_mask": warmup_inputs["attention_mask"],
-        }
-
-        n_steps = 1 if compile_mode == "default" else 2
-        for _ in range(n_steps):
-            _ = model.generate(**model_kwargs)
-
-        logger.info(
-            f"Loaded {model_name} with Flash Attention and compilation in {time.perf_counter() - start:.2f} seconds"
-        )
-        return model, tokenizer, description_tokenizer
-
-    def get_or_load_model(
-        self, model_name: str
-    ) -> tuple[ParlerTTSForConditionalGeneration, AutoTokenizer, AutoTokenizer]:
-        if model_name not in self.model_tokenizer:
-            logger.info(f"Model {model_name} isn't already loaded")
-            if len(self.model_tokenizer) == config.max_models:
-                logger.info("Unloading the oldest loaded model")
-                del self.model_tokenizer[next(iter(self.model_tokenizer))]
-            self.model_tokenizer[model_name] = self.load_model(model_name)
-        return self.model_tokenizer[model_name]
-
-tts_model_manager = TTSModelManager()
-
-@asynccontextmanager
-async def lifespan(_: FastAPI):
-    if not config.lazy_load_model:
-        tts_model_manager.get_or_load_model(config.model)
-    yield
-
 app = FastAPI(
     title="Dhwani API",
     description="AI Chat API supporting Indian languages",
     version="1.0.0",
     redirect_slashes=False,
-    lifespan=lifespan
+    #lifespan=lifespan
 )
 
 def chunk_text(text, chunk_size):
@@ -165,158 +78,106 @@ def chunk_text(text, chunk_size):
         chunks.append(' '.join(words[i:i + chunk_size]))
     return chunks
 
-@app.post("/v1/audio/speech")
-async def generate_audio(
-    input: Annotated[str, Body()] = config.input,
-    voice: Annotated[str, Body()] = config.voice,
-    model: Annotated[str, Body()] = config.model,
-    response_format: Annotated[ResponseFormat, Body(include_in_schema=False)] = config.response_format,
-    speed: Annotated[float, Body(include_in_schema=False)] = SPEED,
-) -> StreamingResponse:
-    tts, tokenizer, description_tokenizer = tts_model_manager.get_or_load_model(model)
-    if speed != SPEED:
-        logger.warning(
-            "Specifying speed isn't supported by this model. Audio will be generated with the default speed"
-        )
-    start = time.perf_counter()
-
-    chunk_size = 15
-    all_chunks = chunk_text(input, chunk_size)
-
-    if len(all_chunks) <= chunk_size:
-        desc_inputs = description_tokenizer(voice,
-                                            return_tensors="pt",
-                                            padding="max_length",
-                                            max_length=tts_model_manager.max_length).to(device)
-        prompt_inputs = tokenizer(input,
-                                  return_tensors="pt",
-                                  padding="max_length",
-                                  max_length=tts_model_manager.max_length).to(device)
-
-        input_ids = desc_inputs["input_ids"]
-        attention_mask = desc_inputs["attention_mask"]
-        prompt_input_ids = prompt_inputs["input_ids"]
-        prompt_attention_mask = prompt_inputs["attention_mask"]
-
-        generation = tts.generate(
-            input_ids=input_ids,
-            prompt_input_ids=prompt_input_ids,
-            attention_mask=attention_mask,
-            prompt_attention_mask=prompt_attention_mask
-        ).to(torch.float32)
-
-        audio_arr = generation.cpu().float().numpy().squeeze()
-    else:
-        all_descriptions = [voice] * len(all_chunks)
-        description_inputs = description_tokenizer(all_descriptions,
-                                                   return_tensors="pt",
-                                                   padding=True).to(device)
-        prompts = tokenizer(all_chunks,
-                            return_tensors="pt",
-                            padding=True).to(device)
-
-        set_seed(0)
-        generation = tts.generate(
-            input_ids=description_inputs["input_ids"],
-            attention_mask=description_inputs["attention_mask"],
-            prompt_input_ids=prompts["input_ids"],
-            prompt_attention_mask=prompts["attention_mask"],
-            do_sample=True,
-            return_dict_in_generate=True,
-        )
-
-        chunk_audios = []
-        for i, audio in enumerate(generation.sequences):
-            audio_data = audio[:generation.audios_length[i]].cpu().float().numpy().squeeze()
-            chunk_audios.append(audio_data)
-        audio_arr = np.concatenate(chunk_audios)
-
-    device_str = str(device)
-    logger.info(
-        f"Took {time.perf_counter() - start:.2f} seconds to generate audio for {len(input.split())} words using {device_str.upper()}"
-    )
 
+import io
+import torch
+import requests
+import tempfile
+import numpy as np
+import soundfile as sf
+from fastapi import FastAPI, HTTPException
+from transformers import AutoModel
+from pydantic import BaseModel
+from typing import Optional
+from starlette.responses import StreamingResponse
+
+
+tts_repo_id = "ai4bharat/IndicF5"
+tts_model = AutoModel.from_pretrained(tts_repo_id, trust_remote_code=True)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("Device:", device)
+tts_model = tts_model.to(device)
+
+EXAMPLES = [
+    {
+        "audio_name": "KAN_F (Happy)",
+        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
+        "ref_text": "ನಮ್ ಫ್ರಿಜ್ಜಲ್ಲಿ ಕೂಲಿಂಗ್ ಸಮಸ್ಯೆ ಆಗಿ ನಾನ್ ಭಾಳ ದಿನದಿಂದ ಒದ್ದಾಡ್ತಿದ್ದೆ, ಆದ್ರೆ ಅದ್ನೀಗ ಮೆಕಾನಿಕ್ ಆಗಿರೋ ನಿಮ್ ಸಹಾಯ್ದಿಂದ ಬಗೆಹರಿಸ್ಕೋಬೋದು ಅಂತಾಗಿ ನಿರಾಳ ಆಯ್ತು ನಂಗೆ.",
+        "synth_text": "ಚೆನ್ನೈನ ಶೇರ್ ಆಟೋ ಪ್ರಯಾಣಿಕರ ನಡುವೆ ಆಹಾರವನ್ನು ಹಂಚಿಕೊಂಡು ತಿನ್ನುವುದು ನನಗೆ ಮನಸ್ಸಿಗೆ ತುಂಬಾ ಒಳ್ಳೆಯದೆನಿಸುವ ವಿಷಯ."
+    },
+]
+
+
+# Pydantic model for request body
+class SynthesizeRequest(BaseModel):
+    text: str  # Text to synthesize (expected in Kannada)
+    ref_audio_name: str  # Dropdown of audio names from EXAMPLES
+    ref_text: Optional[str] = None  # Optional, defaults to example ref_text if not provided
+
+# Function to load audio from URL
+def load_audio_from_url(url: str):
+    response = requests.get(url)
+    if response.status_code == 200:
+        audio_data, sample_rate = sf.read(io.BytesIO(response.content))
+        return sample_rate, audio_data
+    raise HTTPException(status_code=500, detail="Failed to load reference audio from URL.")
+
+# Function to synthesize speech
+def synthesize_speech(text: str, ref_audio_name: str, ref_text: str):
+    # Find the matching example
+    ref_audio_url = None
+    for example in EXAMPLES:
+        if example["audio_name"] == ref_audio_name:
+            ref_audio_url = example["audio_url"]
+            if not ref_text:
+                ref_text = example["ref_text"]
+            break
+
+    if not ref_audio_url:
+        raise HTTPException(status_code=400, detail="Invalid reference audio name.")
+
+    if not text.strip():
+        raise HTTPException(status_code=400, detail="Text to synthesize cannot be empty.")
+
+    if not ref_text or not ref_text.strip():
+        raise HTTPException(status_code=400, detail="Reference text cannot be empty.")
+
+    # Load reference audio from URL
+    sample_rate, audio_data = load_audio_from_url(ref_audio_url)
+
+    # Save reference audio to a temporary file
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
+        sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
+        temp_audio.flush()
 
+    # Generate speech
+    audio = tts_model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
+
+    # Normalize output
+    if audio.dtype == np.int16:
+        audio = audio.astype(np.float32) / 32768.0
+
+    # Save generated audio to a BytesIO buffer
+    buffer = io.BytesIO()
+    sf.write(buffer, audio, 24000, format='WAV')
+    buffer.seek(0)
+
+    return buffer
+
+
+@app.post("/v1/audio/speech")
+async def synthesize(request: SynthesizeRequest):
+    # If ref_text is not provided, it will default to the example's ref_text in synthesize_speech
+    audio_buffer = synthesize_speech(request.text, request.ref_audio_name, request.ref_text)
+
+    # Return the audio as a streaming response
+    return StreamingResponse(
+        audio_buffer,
+        media_type="audio/wav",
+        headers={"Content-Disposition": "attachment; filename=synthesized_speech.wav"}
     )
 
+
 # Supported language codes
 SUPPORTED_LANGUAGES = {
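
For reference, here is a minimal sketch of a client for the reworked endpoint. The route, the JSON fields, the "KAN_F (Happy)" reference name, and the sample Kannada text all come from the diff above; the host and port are hypothetical and depend on how the server is deployed.

import requests

# Hypothetical base URL; adjust to wherever this FastAPI app is served.
url = "http://localhost:8000/v1/audio/speech"

payload = {
    # Kannada text to synthesize (the synth_text sample from EXAMPLES)
    "text": "ಚೆನ್ನೈನ ಶೇರ್ ಆಟೋ ಪ್ರಯಾಣಿಕರ ನಡುವೆ ಆಹಾರವನ್ನು ಹಂಚಿಕೊಂಡು ತಿನ್ನುವುದು ನನಗೆ ಮನಸ್ಸಿಗೆ ತುಂಬಾ ಒಳ್ಳೆಯದೆನಿಸುವ ವಿಷಯ.",
    # Must match an audio_name entry in EXAMPLES
    "ref_audio_name": "KAN_F (Happy)",
    # "ref_text" is omitted, so the server falls back to the example's ref_text
}

response = requests.post(url, json=payload)
response.raise_for_status()

# The endpoint streams back a WAV file.
with open("synthesized_speech.wav", "wb") as f:
    f.write(response.content)

Note that after this change the IndicF5 model is loaded at import time (the lifespan hook is commented out), so the download and device placement cost is paid at startup rather than on the first request.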