nam pham committed on
Commit
2573d67
·
1 Parent(s): acbb39e

feat: init app

Browse files
Files changed (7) hide show
  1. Dockerfile +37 -0
  2. app/audio_utils.py +41 -0
  3. app/server.py +170 -0
  4. bin/api-start.sh +3 -0
  5. client.py +28 -0
  6. poetry.lock +0 -0
  7. pyproject.toml +17 -0
Dockerfile ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Two-stage build: the "builder" stage resolves dependencies with Poetry into
# an in-project virtualenv (/app/.venv); the "runtime" stage copies that
# virtualenv next to the application code and runs as a non-root user.

# Must satisfy the exact interpreter pin in pyproject.toml
# (python = "3.10.15"); Poetry cannot install the project with an interpreter
# outside that constraint.
ARG PYTHON_VERSION=3.10.15
FROM python:$PYTHON_VERSION-slim AS builder

RUN pip install --no-cache-dir poetry==1.8.2

ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app

COPY pyproject.toml poetry.lock ./

RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR

FROM python:$PYTHON_VERSION-slim AS runtime

####### Add your own installation commands here #######
# RUN pip install some-package
# RUN wget https://path/to/some/data/or/weights
# RUN apt-get update && apt-get install -y <package-name>
RUN mkdir -p /app/cache && chmod 777 /app/cache

RUN useradd -m -u 1000 user
COPY . /app
RUN chown -R user:user /app
USER user
ENV VIRTUAL_ENV=/app/.venv \
    PATH="/app/.venv/bin:$PATH"

COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}

# app/server.py listens on port 8000.
# NOTE(review): if the hosting platform requires a different port (the
# original value here was 7860), change the server port as well.
EXPOSE 8000

# The start script committed to the repository is bin/api-start.sh.
# No WORKDIR is set in this stage, so the script runs with "/" as its
# working directory.
CMD ["sh", "/app/bin/api-start.sh"]
37
+
app/audio_utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import soundfile as sf
4
+
5
+ import numpy as np
6
+
7
+
8
+
9
+
10
+
11
def combine_audio_files(folder, index):
    """
    Concatenate the numerically named .wav files of a folder, up to ``index``.

    Files are ordered by the integer value of their base name (``0.wav``,
    ``1.wav``, ...). Only the first ``index + 1`` files in that order are
    read, and each of them is deleted from disk right after it has been read.
    Files whose base name is not a plain number are ignored.

    Args:
    - folder (str): Path to the folder containing the .wav files.
    - index (int): Zero-based position of the last file to be concatenated.

    Returns:
    - tuple: ``(final_audio, samplerate)`` where ``final_audio`` is the
      concatenated sample array and ``samplerate`` is the sampling rate
      shared by the files that were read.

    Raises:
    - ValueError: If ``index`` is negative, if no numerically named .wav
      file exists in ``folder``, or if the files do not share a sample rate.
    """
    # A negative index would silently slice from the end of the list.
    if index < 0:
        raise ValueError("index must be a non-negative integer.")

    # Collect (number, path) pairs; skipping non-numeric names keeps a stray
    # file from crashing the numeric sort.
    numbered_files = []
    for name in os.listdir(folder):
        stem, extension = os.path.splitext(name)
        if extension == ".wav" and stem.isdecimal():
            numbered_files.append((int(stem), os.path.join(folder, name)))

    # Raise an error if no .wav files are found
    if not numbered_files:
        raise ValueError("No .wav files found in the input folder.")

    numbered_files.sort()

    # Read the selected files in numeric order
    audio_data = []
    samplerate = None
    for _, wav_file in numbered_files[: index + 1]:
        data, file_samplerate = sf.read(wav_file)
        if samplerate is not None and file_samplerate != samplerate:
            # Concatenating raw samples recorded at different rates would
            # produce audio that plays back at the wrong speed.
            raise ValueError(
                f"Sample rate mismatch in {wav_file}: "
                f"expected {samplerate}, got {file_samplerate}."
            )
        samplerate = file_samplerate
        audio_data.append(data)
        os.remove(wav_file)  # Clean up each file after reading

    # Return the concatenated audio data and the sampling rate
    final_audio = np.concatenate(audio_data)
    return final_audio, samplerate
app/server.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import io
4
+
5
+ import soundfile as sf
6
+
7
+ import litserve as ls
8
+
9
+ from fastapi.responses import Response
10
+
11
+ from kokoro import KPipeline
12
+
13
+ from audio_utils import combine_audio_files
14
+
15
+
16
+
17
+
18
+
19
class KokoroAPI(ls.LitAPI):
    """
    KokoroAPI is a subclass of ls.LitAPI that provides an interface to the
    Kokoro model for the text-to-speech task.

    Methods:
    - setup(device): Called once at startup for the task-specific setup.
    - decode_request(request): Convert the request payload to model input.
    - predict(inputs): Uses the model to generate audio from the input text.
    - encode_response(output): Convert the model output to a response payload.
    """

    # Sample rate used when writing the generated audio segments.
    SAMPLE_RATE = 24000

    def __init__(self):
        super().__init__()
        # The pipeline is created lazily in decode_request, once the language
        # of the first request is known.
        self.pipeline = None
        self.current_lang = None

    def setup(self, device):
        """
        Remember the device the pipeline should be created on.
        """
        self.device = device

    def decode_request(self, request):
        """
        Convert the request payload to model input.

        Reads ``language_code`` (default ``"a"``), ``text`` (default ``""``)
        and ``voice`` (default ``"af_heart"``) from the payload and rebuilds
        the pipeline when the requested language differs from the current one.

        Returns:
        - tuple: ``(text, voice)``.

        Raises:
        - HTTPException: With status 400 if ``text`` is missing or blank.
        """
        from fastapi import HTTPException

        # Extract the inputs from request payload
        language_code = request.get("language_code", "a")
        text = request.get("text", "")
        voice = request.get("voice", "af_heart")

        # Reject blank input up front instead of failing later in predict.
        if not isinstance(text, str) or not text.strip():
            raise HTTPException(
                status_code=400, detail="'text' must be a non-empty string."
            )

        # Initialize or update pipeline if language changes. The language is
        # recorded only after the pipeline was built successfully, so a failed
        # switch cannot leave the old pipeline labelled with the new language.
        if self.current_lang != language_code:
            self.pipeline = KPipeline(lang_code=language_code, device=self.device)
            self.current_lang = language_code

        # Return the inputs
        return text, voice

    def predict(self, inputs):
        """
        Run inference and generate audio using the Kokoro model.

        Each segment produced by the pipeline is written to its own .wav file
        in a per-request temporary directory, the segments are concatenated
        with combine_audio_files, and the result is returned as WAV bytes.

        Returns:
        - bytes: The complete WAV file content.

        Raises:
        - ValueError: If the pipeline produces no audio segment.
        """
        import shutil
        import tempfile

        # Get the inputs
        text, voice = inputs

        # A private directory per request: concurrent requests cannot mix
        # their segment files, and it does not depend on the working
        # directory being writable.
        output_dir = tempfile.mkdtemp(prefix="kokoro_")
        try:
            # Generate audio segments
            generator = self.pipeline(
                text, voice=voice, speed=1, split_pattern=r"\n+"
            )

            # Save each audio segment; last_index stays -1 if nothing is yielded
            last_index = -1
            for last_index, (_graphemes, _phonemes, audio) in enumerate(generator):
                segment_path = os.path.join(output_dir, f"{last_index}.wav")
                sf.write(segment_path, audio, self.SAMPLE_RATE)

            if last_index < 0:
                raise ValueError("The pipeline produced no audio for the given text.")

            # Combine all audio segments
            final_audio, samplerate = combine_audio_files(output_dir, last_index)

            # Encode the final audio as WAV in memory
            with io.BytesIO() as audio_buffer:
                sf.write(audio_buffer, final_audio, samplerate, format="WAV")
                return audio_buffer.getvalue()
        finally:
            # Best-effort cleanup of the temporary directory and its contents.
            shutil.rmtree(output_dir, ignore_errors=True)

    def encode_response(self, output):
        """
        Convert the model output to a response payload.
        """
        # Package the generated audio data into a response
        return Response(content=output, media_type="audio/wav")
157
+
158
+
159
+
160
+
161
+
162
if __name__ == "__main__":
    # Create an instance of the KokoroAPI class and run the server.
    # The listening port can be overridden with the PORT environment
    # variable; it defaults to 8000, the port client.py connects to.
    api = KokoroAPI()
    server = ls.LitServer(api, track_requests=True)
    server.run(port=int(os.environ.get("PORT", "8000")))
bin/api-start.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
#!/bin/bash
# Start the text-to-speech API server.
set -e

# Change to the repository root (the parent of this script's directory) so
# the server path below resolves no matter where the script is invoked from.
cd "$(dirname "$0")/.."

# exec replaces the shell so signals reach the Python process directly.
exec python app/server.py
client.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Example client: sends a text-to-speech request to the local server and
# saves the returned audio as output.wav.
import requests

url = "http://localhost:8000/predict"

# Example inputs
lang_code = "a"  # American English

text = """
The sky above the port was the color of television, tuned to a dead channel.
"It's not like I'm using," Case heard someone say, as he shouldered his way through the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."
It was a Sprawl voice and a Sprawl joke. The Chatsubo was a bar for professional expatriates; you could drink there for a week and never hear two words in Japanese.
These were to have an enormous impact, not only because they were associated with Constantine, but also because, as in so many other areas, the decisions taken by Constantine (or in his name) were to have great significance for centuries to come. One of the main issues was the shape that Christian churches were to take, since there was not, apparently, a tradition of monumental church buildings when Constantine decided to help the Christian church build a series of truly spectacular structures. The main form that these churches took was that of the basilica, a multipurpose rectangular structure, based ultimately on the earlier Greek stoa, which could be found in most of the great cities of the empire. Christianity, unlike classical polytheism, needed a large interior space for the celebration of its religious services, and the basilica aptly filled that need. We naturally do not know the degree to which the emperor was involved in the design of new churches, but it is tempting to connect this with the secular basilica that Constantine completed in the Roman forum (the so-called Basilica of Maxentius) and the one he probably built in Trier, in connection with his residence in the city at a time when he was still caesar.
"""
voice = "af_heart"  # Make sure lang_code matches voice

# Prepare the request payload
payload = {
    "language_code": lang_code,
    "text": text,
    "voice": voice,
}

# Send the request to the server. The timeout is (connect, read) in seconds;
# the read timeout is generous because synthesis of long text can take a while.
response = requests.post(url, json=payload, timeout=(10, 600))

# Fail loudly on an HTTP error instead of writing the error body to a .wav file
response.raise_for_status()

# Save the output to a file
with open("output.wav", "wb") as f:
    f.write(response.content)
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
[tool.poetry]
name = "text-to-speech-service"
version = "0.1.0"
description = "Kokoro text-to-speech API served with LitServe"
authors = ["nam pham <[email protected]>"]
readme = "README.md"

[tool.poetry.dependencies]
# Exact interpreter pin: the Docker image must use this Python version.
# Changing this constraint requires regenerating poetry.lock.
python = "3.10.15"
litserve = "^0.2.6"
soundfile = "^0.13.1"
kokoro = "^0.7.9"
# NOTE(review): app/ also imports numpy and fastapi directly; they are not
# declared here and are presumably pulled in transitively — confirm.


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"