Spaces:

nampham1106
/

tts-kokoro

Sleeping

App Files Files Community

nam pham committed on Feb 7

Commit

2573d67

1 Parent(s): acbb39e

feat: init app

Browse files

Files changed (7) hide show

Dockerfile +37 -0
app/audio_utils.py +41 -0
app/server.py +170 -0
bin/api-start.sh +3 -0
client.py +28 -0
poetry.lock +0 -0
pyproject.toml +17 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,37 @@

+# The interpreter must satisfy the exact pin in pyproject.toml (python = "3.10.15"),
+# otherwise `poetry install` refuses to use it.
+ARG PYTHON_VERSION=3.10.15
+# ---- Stage 1: resolve and install dependencies into an in-project virtualenv ----
+FROM python:$PYTHON_VERSION-slim AS builder
+RUN pip install poetry==1.8.2
+# Non-interactive Poetry that creates the virtualenv at /app/.venv
+ENV POETRY_NO_INTERACTION=1 \
+    POETRY_VIRTUALENVS_IN_PROJECT=1 \
+    POETRY_VIRTUALENVS_CREATE=1 \
+    POETRY_CACHE_DIR=/tmp/poetry_cache
+WORKDIR /app
+COPY pyproject.toml poetry.lock ./
+# Install dependencies only (no project package) and drop the download cache
+RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR
+# ---- Stage 2: runtime image carrying the sources plus the prebuilt virtualenv ----
+FROM python:$PYTHON_VERSION-slim AS runtime
+####### Add your own installation commands here #######
+# RUN pip install some-package
+# RUN wget https://path/to/some/data/or/weights
+# RUN apt-get update && apt-get install -y <package-name>
+RUN mkdir -p /app/cache && chmod 777 /app/cache
+# Run as an unprivileged user (uid 1000) that owns the application tree
+RUN useradd -m -u 1000 user
+COPY . /app
+RUN chown -R user:user /app
+USER user
+ENV VIRTUAL_ENV=/app/.venv \
+    PATH="/app/.venv/bin:$PATH"
+# litserve and the other requirements come from the builder stage's virtualenv
+COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
+EXPOSE 7860
+# The launcher committed with this image is bin/api-start.sh
+CMD ["sh", "/app/bin/api-start.sh"]

app/audio_utils.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import os
+import soundfile as sf
+import numpy as np
+def combine_audio_files(folder, index):
+    """
+    Combine all audio files from the folder and return the concatenated audio data.
+    Args:
+        - folder (str): Path to the folder containing the .wav files.
+        - index (int): The index of the last file to be concatenated.
+    Returns: None
+    """
+    # Get a sorted list of .wav files by their numeric names
+    wav_files = sorted(
+        [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".wav")],
+        key=lambda x: int(os.path.splitext(os.path.basename(x))[0]),
+    )
+    # Raise an error if no .wav files are found
+    if not wav_files:
+        raise ValueError("No .wav files found in the input folder.")
+    # Combine all .wav files
+    audio_data = []
+    samplerate = None
+    for wav_file in wav_files[:index+1]:
+        data, samplerate = sf.read(wav_file)
+        audio_data.append(data)
+        os.remove(wav_file)  # Clean up each file after reading
+    # Return the concatenated audio data and the sampling rate
+    final_audio = np.concatenate(audio_data)
+    return final_audio, samplerate

app/server.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import os
+import io
+import soundfile as sf
+import litserve as ls
+from fastapi.responses import Response
+from kokoro import KPipeline
+from audio_utils import combine_audio_files
+class KokoroAPI(ls.LitAPI):
+    """
+    KokoroAPI is a subclass of ls.LitAPI that provides an interface to the Kokoro model for text-to-speech task.
+    Methods:
+        - setup(device): Called once at startup for the task-specific setup.
+        - decode_request(request): Convert the request payload to model input.
+        - predict(inputs): Uses the model to generate audio from the input text.
+        - encode_response(output): Convert the model output to a response payload.
+    """
+    def __init__(self):
+        super().__init__()
+        self.pipeline = None
+        self.current_lang = None
+    def setup(self, device):
+        self.device = device
+    def decode_request(self, request):
+        """
+        Convert the request payload to model input.
+        """
+        # Extract the inputs from request payload
+        language_code = request.get("language_code", "a")
+        text = request.get("text", "")
+        voice = request.get("voice", "af_heart")
+        # Initialize or update pipeline if language changes
+        if self.current_lang != language_code:
+            self.current_lang = language_code
+            self.pipeline = KPipeline(lang_code=language_code, device=self.device)
+        # Return the inputs
+        return text, voice
+    def predict(self, inputs):
+        """
+        Run inference and generate audio file using the Kokoro model.
+        """
+        # Get the inputs
+        text, voice = inputs
+        try:
+            # Generate audio files
+            generator = self.pipeline(text, voice=voice, speed=1, split_pattern=r"\n+")
+            # Create the output directory if it does not exist
+            output_dir = "output"
+            if not os.path.exists(output_dir):
+                os.makedirs(output_dir)
+            # Save each audio file
+            for i, (gs, ps, audio) in enumerate(generator):
+                file_path = f"{output_dir}/{i}.wav"
+                sf.write(file_path, audio, 24000)
+            # Combine all audio files
+            final_audio, samplerate = combine_audio_files(output_dir, i)
+            # Save the final audio to a buffer
+            audio_buffer = io.BytesIO()
+            sf.write(audio_buffer, final_audio, samplerate, format="WAV")
+            audio_buffer.seek(0)
+            audio_data = audio_buffer.getvalue()
+            audio_buffer.close()
+            return audio_data
+        finally:
+            # Clean up output directory if it exists
+            if os.path.exists(output_dir):
+                for file in os.listdir(output_dir):
+                    file_path = os.path.join(output_dir, file)
+                    try:
+                        os.remove(file_path)
+                    except:
+                        pass
+                try:
+                    os.rmdir(output_dir)
+                except:
+                    pass
+    def encode_response(self, output):
+        """
+        Convert the model output to a response payload.
+        """
+        # Package the generated audio data into a response
+        return Response(content=output, headers={"Content-Type": "audio/wav"})
+if __name__ == "__main__":
+    # Create an instance of the KokoroAPI class and run the server
+    api = KokoroAPI()
+    server = ls.LitServer(api, track_requests=True)
+    server.run(port=8000)

bin/api-start.sh ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ #!/bin/bash
2	+
3	+ # Locate server.py relative to this script so the launch works from any working directory.
4	+ # exec replaces the shell so signals sent to the container reach the Python process.
5	+ exec python "$(dirname "$0")/../app/server.py"

client.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import requests
+url = "http://localhost:8000/predict"
+# Example inputs
+lang_code = "a"  # American English
+text = """
+The sky above the port was the color of television, tuned to a dead channel.
+"It's not like I'm using," Case heard someone say, as he shouldered his way through the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."
+It was a Sprawl voice and a Sprawl joke. The Chatsubo was a bar for professional expatriates; you could drink there for a week and never hear two words in Japanese.
+These were to have an enormous impact, not only because they were associated with Constantine, but also because, as in so many other areas, the decisions taken by Constantine (or in his name) were to have great significance for centuries to come. One of the main issues was the shape that Christian churches were to take, since there was not, apparently, a tradition of monumental church buildings when Constantine decided to help the Christian church build a series of truly spectacular structures. The main form that these churches took was that of the basilica, a multipurpose rectangular structure, based ultimately on the earlier Greek stoa, which could be found in most of the great cities of the empire. Christianity, unlike classical polytheism, needed a large interior space for the celebration of its religious services, and the basilica aptly filled that need. We naturally do not know the degree to which the emperor was involved in the design of new churches, but it is tempting to connect this with the secular basilica that Constantine completed in the Roman forum (the so-called Basilica of Maxentius) and the one he probably built in Trier, in connection with his residence in the city at a time when he was still caesar.
+"""
+voice = "af_heart"  # Make sure lang_code matches voice
+# Prepare the request payload
+payload = {
+    "language_code": lang_code,
+    "text": text,
+    "voice": voice,
+}
+# Send the request to the server
+response = requests.post(url, json=payload)
+# Save the output to a file
+with open("output.wav", "wb") as f:
+    f.write(response.content)

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,17 @@

+[tool.poetry]
+# Hyphenated name: package names should not contain spaces ("speed" was a typo for "speech")
+name = "text-to-speech-service"
+version = "0.1.0"
+description = ""
+authors = ["nam pham <[email protected]>"]
+readme = "README.md"
+[tool.poetry.dependencies]
+# Exact interpreter pin: `poetry install` must run under Python 3.10.15
+python = "3.10.15"
+litserve = "^0.2.6"
+soundfile = "^0.13.1"
+kokoro = "^0.7.9"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"