Spaces:
Sleeping
Sleeping
Divax
committed on
Commit
·
94fd4b0
1
Parent(s):
71905d8
test
Browse files
- Dockerfile +13 -19
- Dockerfile.coqui +0 -51
- README.md +64 -288
- README_coqui.md +0 -351
- app.py +0 -414
- app_config.py +0 -54
- client_example.py +0 -269
- requirements.txt +8 -13
- requirements_coqui.txt +0 -12
- start_c3po_api.py +17 -136
- startup.py +0 -120
- test.py +0 -144
- test_build.py +69 -0
- test_coqui_api.py +0 -146
- test_coqui_tts.py +0 -99
- test_kokoro_install.py +0 -86
Dockerfile
CHANGED
@@ -1,13 +1,12 @@
|
|
1 |
-
FROM python:3.11
|
2 |
|
3 |
# Set up a new user named "user" with user ID 1000
|
4 |
RUN useradd -m -u 1000 user
|
5 |
|
6 |
-
# Install system dependencies
|
7 |
RUN apt-get update && apt-get install -y \
|
8 |
git \
|
9 |
git-lfs \
|
10 |
-
espeak-ng \
|
11 |
ffmpeg \
|
12 |
&& rm -rf /var/lib/apt/lists/*
|
13 |
|
@@ -17,35 +16,30 @@ RUN git lfs install
|
|
17 |
# Switch to the "user" user
|
18 |
USER user
|
19 |
|
20 |
-
# Set
|
21 |
ENV HOME=/home/user \
|
22 |
PATH=/home/user/.local/bin:$PATH \
|
23 |
COQUI_TOS_AGREED=1 \
|
24 |
-
|
25 |
-
FORCE_CPU=true \
|
26 |
-
CUDA_VISIBLE_DEVICES=""
|
27 |
|
28 |
-
# Set the working directory
|
29 |
WORKDIR $HOME/app
|
30 |
|
31 |
-
#
|
32 |
RUN pip install --no-cache-dir --upgrade pip
|
33 |
|
34 |
-
# Copy
|
35 |
COPY --chown=user requirements.txt .
|
36 |
RUN pip install --no-cache-dir -r requirements.txt
|
37 |
|
38 |
-
#
|
39 |
-
RUN python -
|
40 |
|
41 |
-
#
|
42 |
-
|
43 |
-
|
44 |
-
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
45 |
-
COPY --chown=user . $HOME/app
|
46 |
|
47 |
# Expose the port
|
48 |
EXPOSE 7860
|
49 |
|
50 |
-
# Start the API
|
51 |
-
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
1 |
+
FROM python:3.11-slim
|
2 |
|
3 |
# Set up a new user named "user" with user ID 1000
|
4 |
RUN useradd -m -u 1000 user
|
5 |
|
6 |
+
# Install only essential system dependencies
|
7 |
RUN apt-get update && apt-get install -y \
|
8 |
git \
|
9 |
git-lfs \
|
|
|
10 |
ffmpeg \
|
11 |
&& rm -rf /var/lib/apt/lists/*
|
12 |
|
|
|
16 |
# Switch to the "user" user
|
17 |
USER user
|
18 |
|
19 |
+
# Set environment variables
|
20 |
ENV HOME=/home/user \
|
21 |
PATH=/home/user/.local/bin:$PATH \
|
22 |
COQUI_TOS_AGREED=1 \
|
23 |
+
HF_HUB_DISABLE_TELEMETRY=1
|
|
|
|
|
24 |
|
25 |
+
# Set the working directory
|
26 |
WORKDIR $HOME/app
|
27 |
|
28 |
+
# Upgrade pip
|
29 |
RUN pip install --no-cache-dir --upgrade pip
|
30 |
|
31 |
+
# Copy and install requirements
|
32 |
COPY --chown=user requirements.txt .
|
33 |
RUN pip install --no-cache-dir -r requirements.txt
|
34 |
|
35 |
+
# Pre-download the C-3PO model to speed up startup
|
36 |
+
RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='Borcherding/XTTS-v2_C3PO', local_dir='./models/XTTS-v2_C3PO', local_dir_use_symlinks=False)"
|
37 |
|
38 |
+
# Copy the API file
|
39 |
+
COPY --chown=user coqui_api.py .
|
|
|
|
|
|
|
40 |
|
41 |
# Expose the port
|
42 |
EXPOSE 7860
|
43 |
|
44 |
+
# Start the C-3PO TTS API
|
45 |
+
CMD ["uvicorn", "coqui_api:app", "--host", "0.0.0.0", "--port", "7860"]
|
Dockerfile.coqui
DELETED
@@ -1,51 +0,0 @@
|
|
1 |
-
FROM python:3.11
|
2 |
-
|
3 |
-
# Set up a new user named "user" with user ID 1000
|
4 |
-
RUN useradd -m -u 1000 user
|
5 |
-
|
6 |
-
# Install system dependencies as root
|
7 |
-
RUN apt-get update && apt-get install -y \
|
8 |
-
git \
|
9 |
-
git-lfs \
|
10 |
-
espeak-ng \
|
11 |
-
ffmpeg \
|
12 |
-
libsndfile1 \
|
13 |
-
&& rm -rf /var/lib/apt/lists/*
|
14 |
-
|
15 |
-
# Initialize git lfs
|
16 |
-
RUN git lfs install
|
17 |
-
|
18 |
-
# Switch to the "user" user
|
19 |
-
USER user
|
20 |
-
|
21 |
-
# Set home to the user's home directory
|
22 |
-
ENV HOME=/home/user \
|
23 |
-
PATH=/home/user/.local/bin:$PATH \
|
24 |
-
COQUI_TOS_AGREED=1 \
|
25 |
-
HF_HUB_DISABLE_TELEMETRY=1 \
|
26 |
-
HF_HOME=/home/user/.cache/huggingface
|
27 |
-
|
28 |
-
# Set the working directory to the user's home directory
|
29 |
-
WORKDIR $HOME/app
|
30 |
-
|
31 |
-
# Upgrade pip
|
32 |
-
RUN pip install --no-cache-dir --upgrade pip
|
33 |
-
|
34 |
-
# Install PyTorch with CPU support for Hugging Face Spaces
|
35 |
-
RUN pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu
|
36 |
-
|
37 |
-
# Copy requirements and install dependencies
|
38 |
-
COPY --chown=user requirements.txt .
|
39 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
40 |
-
|
41 |
-
# Copy the API file
|
42 |
-
COPY --chown=user coqui_api.py .
|
43 |
-
|
44 |
-
# Create necessary directories
|
45 |
-
RUN mkdir -p $HOME/.cache $HOME/app/models
|
46 |
-
|
47 |
-
# Expose the port
|
48 |
-
EXPOSE 7860
|
49 |
-
|
50 |
-
# Start the Coqui TTS API
|
51 |
-
CMD ["uvicorn", "coqui_api:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
@@ -1,334 +1,110 @@
|
|
1 |
-
|
2 |
-
title: XTTS C3PO Voice Cloning API
|
3 |
-
emoji: 🤖
|
4 |
-
colorFrom: indigo
|
5 |
-
colorTo: yellow
|
6 |
-
sdk: docker
|
7 |
-
pinned: false
|
8 |
-
---
|
9 |
|
10 |
-
|
11 |
|
12 |
-
|
13 |
|
14 |
-
|
|
|
|
|
|
|
|
|
15 |
|
16 |
-
|
17 |
-
- **Custom Voice Cloning**: Upload your own reference audio for voice cloning
|
18 |
-
- **Multilingual Support**: 16+ languages with C3PO voice
|
19 |
-
- **No Upload Required**: Use C3PO voice without any file uploads
|
20 |
-
- **RESTful API**: Clean API with automatic documentation
|
21 |
-
- **Docker Support**: Optimized for Hugging Face Spaces deployment
|
22 |
-
- **PyTorch 2.6 Compatible**: Includes compatibility fixes
|
23 |
-
|
24 |
-
## About the C3PO Model
|
25 |
-
|
26 |
-
This API uses the XTTS-v2 C3PO model from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO), which provides the iconic voice of C-3PO from Star Wars. The model supports:
|
27 |
-
|
28 |
-
- High-quality C3PO voice synthesis
|
29 |
-
- Multilingual C3PO speech (16+ languages)
|
30 |
-
- Custom voice cloning capabilities
|
31 |
-
- Real-time speech generation
|
32 |
-
|
33 |
-
## Quick Start
|
34 |
-
|
35 |
-
### Using C3PO Voice (No Upload Required)
|
36 |
|
|
|
37 |
```bash
|
38 |
-
|
39 |
-
|
40 |
-
-F "language=en" \
|
41 |
-
--output c3po_speech.wav
|
42 |
-
```
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
```bash
|
47 |
-
curl -X POST "http://localhost:7860/tts" \
|
48 |
-
-F "text=This will be spoken in your custom voice!" \
|
49 |
-
-F "language=en" \
|
50 |
-
-F "speaker_file=@your_reference_voice.wav" \
|
51 |
-
--output custom_speech.wav
|
52 |
```
|
53 |
|
54 |
-
|
55 |
-
|
56 |
-
### C3PO Voice Only
|
57 |
-
- **POST** `/tts-c3po` - Generate speech using C3PO voice (no file upload needed)
|
58 |
-
- **Parameters:**
|
59 |
-
- `text` (form): Text to convert to speech (max 500 characters)
|
60 |
-
- `language` (form): Language code (default: "en")
|
61 |
-
- `no_lang_auto_detect` (form): Disable automatic language detection
|
62 |
-
|
63 |
-
### Voice Cloning with Fallback
|
64 |
-
- **POST** `/tts` - Convert text to speech with optional custom voice
|
65 |
-
- **Parameters:**
|
66 |
-
- `text` (form): Text to convert to speech (max 500 characters)
|
67 |
-
- `language` (form): Language code (default: "en")
|
68 |
-
- `voice_cleanup` (form): Apply audio cleanup to reference voice
|
69 |
-
- `no_lang_auto_detect` (form): Disable automatic language detection
|
70 |
-
- `speaker_file` (file, optional): Reference speaker audio file (uses C3PO if not provided)
|
71 |
-
|
72 |
-
### JSON API
|
73 |
-
- **POST** `/tts-json` - Convert text to speech using JSON request body
|
74 |
-
- **Body:** JSON object with `text`, `language`, `voice_cleanup`, `no_lang_auto_detect`
|
75 |
-
- **File:** `speaker_file` (optional) - Reference speaker audio file
|
76 |
-
|
77 |
-
### Information Endpoints
|
78 |
-
- **GET** `/health` - Check API status, device info, and supported languages
|
79 |
-
- **GET** `/languages` - Get list of supported languages
|
80 |
-
- **GET** `/docs` - Interactive API documentation (Swagger UI)
|
81 |
-
|
82 |
-
## Usage Examples
|
83 |
-
|
84 |
-
### Python - C3PO Voice
|
85 |
-
|
86 |
-
```python
|
87 |
-
import requests
|
88 |
-
|
89 |
-
# Generate C3PO speech
|
90 |
-
url = "http://localhost:7860/tts-c3po"
|
91 |
-
data = {
|
92 |
-
"text": "Hello there! I am C-3PO, human-cyborg relations.",
|
93 |
-
"language": "en"
|
94 |
-
}
|
95 |
-
|
96 |
-
response = requests.post(url, data=data)
|
97 |
-
|
98 |
-
if response.status_code == 200:
|
99 |
-
with open("c3po_speech.wav", "wb") as f:
|
100 |
-
f.write(response.content)
|
101 |
-
print("C3PO speech generated!")
|
102 |
-
```
|
103 |
-
|
104 |
-
### Python - Custom Voice with C3PO Fallback
|
105 |
-
|
106 |
-
```python
|
107 |
-
import requests
|
108 |
-
|
109 |
-
url = "http://localhost:7860/tts"
|
110 |
-
data = {
|
111 |
-
"text": "This will use C3PO voice if no speaker file is provided.",
|
112 |
-
"language": "en"
|
113 |
-
}
|
114 |
-
|
115 |
-
# No speaker_file provided - will use C3PO voice
|
116 |
-
response = requests.post(url, data=data)
|
117 |
-
|
118 |
-
if response.status_code == 200:
|
119 |
-
with open("speech_output.wav", "wb") as f:
|
120 |
-
f.write(response.content)
|
121 |
-
```
|
122 |
-
|
123 |
-
### Multilingual C3PO
|
124 |
-
|
125 |
-
```python
|
126 |
-
# C3PO speaking Spanish
|
127 |
-
data = {
|
128 |
-
"text": "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación.",
|
129 |
-
"language": "es"
|
130 |
-
}
|
131 |
-
response = requests.post("http://localhost:7860/tts-c3po", data=data)
|
132 |
-
```
|
133 |
-
|
134 |
-
## Supported Languages
|
135 |
-
|
136 |
-
The C3PO model supports all XTTS-v2 languages:
|
137 |
-
|
138 |
-
- **en** - English
|
139 |
-
- **es** - Spanish
|
140 |
-
- **fr** - French
|
141 |
-
- **de** - German
|
142 |
-
- **it** - Italian
|
143 |
-
- **pt** - Portuguese (Brazilian)
|
144 |
-
- **pl** - Polish
|
145 |
-
- **tr** - Turkish
|
146 |
-
- **ru** - Russian
|
147 |
-
- **nl** - Dutch
|
148 |
-
- **cs** - Czech
|
149 |
-
- **ar** - Arabic
|
150 |
-
- **zh-cn** - Mandarin Chinese
|
151 |
-
- **ja** - Japanese
|
152 |
-
- **ko** - Korean
|
153 |
-
- **hu** - Hungarian
|
154 |
-
- **hi** - Hindi
|
155 |
-
|
156 |
-
## Setup
|
157 |
-
|
158 |
-
### CPU-Only Installation (Recommended for most users)
|
159 |
-
|
160 |
-
For CPU-only usage (no GPU required):
|
161 |
-
```bash
|
162 |
-
# Ubuntu/Debian
|
163 |
-
sudo apt-get install espeak-ng ffmpeg git git-lfs
|
164 |
-
|
165 |
-
# macOS
|
166 |
-
brew install espeak ffmpeg git git-lfs
|
167 |
-
```
|
168 |
-
|
169 |
-
2. **Install CPU-only PyTorch and dependencies:**
|
170 |
```bash
|
171 |
-
#
|
172 |
-
chmod +x install_cpu.sh
|
173 |
-
./install_cpu.sh
|
174 |
-
|
175 |
-
# Option 2: Manual installation
|
176 |
-
pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
|
177 |
pip install -r requirements.txt
|
178 |
-
python -m unidic download
|
179 |
-
```
|
180 |
|
181 |
-
|
182 |
-
|
183 |
-
export FORCE_CPU=true
|
184 |
-
export CUDA_VISIBLE_DEVICES=""
|
185 |
```
|
186 |
|
187 |
-
|
188 |
-
```bash
|
189 |
-
uvicorn app:app --host 0.0.0.0 --port 7860
|
190 |
-
```
|
191 |
|
192 |
-
|
193 |
|
194 |
-
|
195 |
-
- Automatic C3PO model downloading
|
196 |
-
- Proper user permissions (user ID 1000)
|
197 |
-
- PyTorch 2.6 compatibility fixes
|
198 |
-
- COQUI license agreement handling
|
199 |
-
|
200 |
-
### Local Development
|
201 |
-
|
202 |
-
1. **Install system dependencies:**
|
203 |
```bash
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
brew install espeak ffmpeg git git-lfs
|
209 |
```
|
210 |
|
211 |
-
|
212 |
```bash
|
213 |
-
|
214 |
-
|
|
|
|
|
215 |
```
|
216 |
|
217 |
-
|
218 |
```bash
|
219 |
-
|
|
|
|
|
|
|
220 |
```
|
221 |
|
222 |
-
|
223 |
```bash
|
224 |
-
|
225 |
```
|
226 |
|
227 |
-
|
228 |
|
229 |
-
|
230 |
-
# Build and run
|
231 |
-
docker build -t xtts-c3po-api .
|
232 |
-
docker run -p 7860:7860 xtts-c3po-api
|
233 |
-
```
|
234 |
|
235 |
-
##
|
236 |
|
237 |
-
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
5. **Speaker**: Single speaker, clear pronunciation
|
244 |
|
245 |
-
##
|
246 |
|
247 |
-
|
248 |
-
- **Voice**: C3PO from Star Wars
|
249 |
-
- **Source**: [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO)
|
250 |
-
- **Languages**: 16+ supported
|
251 |
-
- **License**: CPML (Coqui Public Model License)
|
252 |
|
253 |
-
## Testing
|
254 |
|
255 |
-
Run the test suite:
|
256 |
```bash
|
257 |
-
#
|
258 |
-
python
|
259 |
-
|
260 |
-
# Test API endpoints
|
261 |
-
python client_example.py
|
262 |
-
```
|
263 |
-
|
264 |
-
## Environment Variables
|
265 |
-
|
266 |
-
Automatically configured:
|
267 |
-
- `COQUI_TOS_AGREED=1` - Agrees to CPML license
|
268 |
-
- `NUMBA_DISABLE_JIT=1` - Disables Numba JIT compilation
|
269 |
-
|
270 |
-
## API Response Examples
|
271 |
-
|
272 |
-
### Health Check Response
|
273 |
-
```json
|
274 |
-
{
|
275 |
-
"status": "healthy",
|
276 |
-
"device": "cuda",
|
277 |
-
"model": "XTTS-v2 C3PO",
|
278 |
-
"default_voice": "C3PO",
|
279 |
-
"supported_languages": ["en", "es", "fr", ...]
|
280 |
-
}
|
281 |
-
```
|
282 |
-
|
283 |
-
### Languages Response
|
284 |
-
```json
|
285 |
-
{
|
286 |
-
"languages": ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu", "hi"]
|
287 |
-
}
|
288 |
```
|
289 |
|
290 |
-
##
|
291 |
-
|
292 |
-
### CPU Performance
|
293 |
-
When running on CPU:
|
294 |
-
- Speech generation will be slower than GPU (30-60 seconds vs 3-5 seconds)
|
295 |
-
- Memory usage is lower (2-4GB RAM vs 4-8GB VRAM)
|
296 |
-
- No CUDA installation required
|
297 |
-
- Works on any system with sufficient RAM
|
298 |
-
|
299 |
-
### PyTorch Loading Issues
|
300 |
-
The API includes fixes for PyTorch 2.6's `weights_only=True` default. If you encounter loading issues, ensure the compatibility fix is applied.
|
301 |
-
|
302 |
-
### Model Download Issues
|
303 |
-
If the C3PO model fails to download:
|
304 |
-
1. Check internet connection
|
305 |
-
2. Verify git and git-lfs are installed
|
306 |
-
3. Manually clone: `git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO XTTS-v2_C3PO`
|
307 |
-
|
308 |
-
### Audio Quality Issues
|
309 |
-
- Use high-quality reference audio for custom voices
|
310 |
-
- Enable `voice_cleanup` for noisy reference audio
|
311 |
-
- Ensure reference audio is 3-10 seconds long
|
312 |
|
313 |
-
|
314 |
-
- **CPU Mode**: Requires 2-4GB RAM, works on most modern computers
|
315 |
-
- **GPU Mode**: Requires 4GB+ VRAM for optimal performance
|
316 |
-
- Reduce text length for batch processing
|
317 |
-
- Use CPU mode with `FORCE_CPU=true` environment variable
|
318 |
|
319 |
-
|
320 |
-
|
321 |
-
1. Set environment variables: `export FORCE_CPU=true CUDA_VISIBLE_DEVICES=""`
|
322 |
-
2. Install CPU-only PyTorch: `pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu`
|
323 |
-
3. Restart the API after setting environment variables
|
324 |
|
325 |
-
##
|
326 |
|
327 |
-
|
|
|
|
|
|
|
|
|
328 |
|
329 |
-
## Credits
|
330 |
|
331 |
-
-
|
332 |
-
-
|
333 |
-
-
|
334 |
|
|
|
1 |
+
# 🤖 C-3PO TTS API
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
A FastAPI-based text-to-speech service using the **C-3PO fine-tuned XTTS v2 model** from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO) for authentic C-3PO voice synthesis.
|
4 |
|
5 |
+
## ✨ Features
|
6 |
|
7 |
+
- 🤖 **Authentic C-3PO Voice**: Fine-tuned XTTS v2 model with 20 unique C-3PO voice lines
|
8 |
+
- 🌍 **17+ Languages**: Multilingual support while maintaining C-3PO characteristics
|
9 |
+
- 🎭 **Voice Cloning**: Optional custom voice cloning capabilities
|
10 |
+
- 🚀 **FastAPI**: Modern API with automatic documentation
|
11 |
+
- 🐳 **Docker Ready**: Containerized for easy deployment
|
12 |
|
13 |
+
## 🚀 Quick Start
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
+
### Docker Deployment
|
16 |
```bash
|
17 |
+
# Build the container
|
18 |
+
docker build -t c3po-tts .
|
|
|
|
|
|
|
19 |
|
20 |
+
# Run the container
|
21 |
+
docker run -p 7860:7860 c3po-tts
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
```
|
23 |
|
24 |
+
### Local Development
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
```bash
|
26 |
+
# Install dependencies
|
|
|
|
|
|
|
|
|
|
|
27 |
pip install -r requirements.txt
|
|
|
|
|
28 |
|
29 |
+
# Run the API
|
30 |
+
python coqui_api.py
|
|
|
|
|
31 |
```
|
32 |
|
33 |
+
The API will be available at `http://localhost:7860`
|
|
|
|
|
|
|
34 |
|
35 |
+
## 📡 API Endpoints
|
36 |
|
37 |
+
### C-3PO Text-to-Speech
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
```bash
|
39 |
+
curl -X POST "http://localhost:7860/tts-c3po" \
|
40 |
+
-F "text=I am C-3PO, human-cyborg relations." \
|
41 |
+
-F "language=en" \
|
42 |
+
--output c3po_voice.wav
|
|
|
43 |
```
|
44 |
|
45 |
+
### General Text-to-Speech (with C-3PO voice by default)
|
46 |
```bash
|
47 |
+
curl -X POST "http://localhost:7860/tts" \
|
48 |
+
-F "text=The odds of successfully navigating an asteroid field are approximately 3,720 to 1." \
|
49 |
+
-F "language=en" \
|
50 |
+
--output c3po_output.wav
|
51 |
```
|
52 |
|
53 |
+
### JSON API
|
54 |
```bash
|
55 |
+
curl -X POST "http://localhost:7860/tts-json" \
|
56 |
+
-H "Content-Type: application/json" \
|
57 |
+
-d '{"text": "R2-D2, you know better than to trust a strange computer!", "language": "en"}' \
|
58 |
+
--output c3po_json.wav
|
59 |
```
|
60 |
|
61 |
+
### Health Check
|
62 |
```bash
|
63 |
+
curl http://localhost:7860/health
|
64 |
```
|
65 |
|
66 |
+
## 🌍 Supported Languages
|
67 |
|
68 |
+
English, Spanish, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese, Hungarian, Korean, Hindi
|
|
|
|
|
|
|
|
|
69 |
|
70 |
+
## 🎨 Example C-3PO Phrases
|
71 |
|
72 |
+
Perfect texts for demonstrating C-3PO's voice:
|
73 |
|
74 |
+
- "I am C-3PO, human-cyborg relations."
|
75 |
+
- "The odds of successfully navigating an asteroid field are approximately 3,720 to 1."
|
76 |
+
- "R2-D2, you know better than to trust a strange computer!"
|
77 |
+
- "Oh my! How interesting!"
|
|
|
78 |
|
79 |
+
## 📖 API Documentation
|
80 |
|
81 |
+
Visit `http://localhost:7860/docs` for interactive API documentation.
|
|
|
|
|
|
|
|
|
82 |
|
83 |
+
## 🧪 Testing
|
84 |
|
|
|
85 |
```bash
|
86 |
+
# Run the C-3PO test suite
|
87 |
+
python test_c3po_model.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
```
|
89 |
|
90 |
+
## 🔧 Configuration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
+
The API automatically downloads the C-3PO model on first run. Environment variables:
|
|
|
|
|
|
|
|
|
93 |
|
94 |
+
- `COQUI_TOS_AGREED=1`: Accepts Coqui TTS terms
|
95 |
+
- `HF_HUB_DISABLE_TELEMETRY=1`: Disables telemetry
|
|
|
|
|
|
|
96 |
|
97 |
+
## 📦 Files
|
98 |
|
99 |
+
- `coqui_api.py`: Main C-3PO TTS API
|
100 |
+
- `test_c3po_model.py`: Test suite for C-3PO functionality
|
101 |
+
- `start_c3po_api.py`: Startup script with dependency checks
|
102 |
+
- `Dockerfile`: Container configuration
|
103 |
+
- `requirements.txt`: Python dependencies
|
104 |
|
105 |
+
## 🎭 Credits
|
106 |
|
107 |
+
- [C-3PO Fine-tuned Model](https://huggingface.co/Borcherding/XTTS-v2_C3PO) by Borcherding
|
108 |
+
- [Coqui TTS](https://github.com/coqui-ai/TTS) - The underlying TTS engine
|
109 |
+
- [FastAPI](https://fastapi.tiangolo.com/) - Web framework
|
110 |
|
README_coqui.md
DELETED
@@ -1,351 +0,0 @@
|
|
1 |
-
# 🤖 Coqui TTS C-3PO API for Hugging Face Spaces
|
2 |
-
|
3 |
-
A FastAPI-based text-to-speech service using the Coqui TTS library with the **C-3PO fine-tuned XTTS v2 model** from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO) for authentic C-3PO voice synthesis.
|
4 |
-
|
5 |
-
## ✨ Features
|
6 |
-
|
7 |
-
- 🤖 **C-3PO Voice**: Authentic C-3PO voice using fine-tuned XTTS v2 model
|
8 |
-
- 🎯 **Text-to-Speech**: Convert text to natural-sounding speech
|
9 |
-
- 🎭 **Voice Cloning**: Clone any voice from a reference audio sample
|
10 |
-
- 🌍 **Multilingual**: Support for 17+ languages with C-3PO voice characteristics
|
11 |
-
- 🚀 **FastAPI**: Modern, fast API with automatic documentation
|
12 |
-
- 🐳 **Docker Ready**: Containerized for easy deployment
|
13 |
-
- ☁️ **Hugging Face Spaces**: Optimized for HF Spaces deployment
|
14 |
-
|
15 |
-
## 🎭 C-3PO Model Information
|
16 |
-
|
17 |
-
This API uses the fine-tuned C-3PO voice model from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO), which features:
|
18 |
-
|
19 |
-
- **Fine-tuned on 20 unique C-3PO voice lines** from Star Wars
|
20 |
-
- **Multi-lingual support** (17 languages) while maintaining C-3PO's distinctive voice
|
21 |
-
- **Emotion & Style Transfer** capturing C-3PO's formal, protocol droid characteristics
|
22 |
-
- **High-Quality Audio** output at 24kHz sampling rate
|
23 |
-
|
24 |
-
## 📡 API Endpoints
|
25 |
-
|
26 |
-
### 1. Health Check
|
27 |
-
```bash
|
28 |
-
GET /health
|
29 |
-
```
|
30 |
-
Returns API status, model information, and C-3PO voice availability.
|
31 |
-
|
32 |
-
### 2. List Models
|
33 |
-
```bash
|
34 |
-
GET /models
|
35 |
-
```
|
36 |
-
Returns available TTS models.
|
37 |
-
|
38 |
-
### 3. C-3PO Text-to-Speech (Dedicated)
|
39 |
-
```bash
|
40 |
-
POST /tts-c3po
|
41 |
-
```
|
42 |
-
**Parameters:**
|
43 |
-
- `text` (string): Text to convert to C-3PO voice (2-500 characters)
|
44 |
-
- `language` (string): Language code (default: "en")
|
45 |
-
|
46 |
-
**Example using curl:**
|
47 |
-
```bash
|
48 |
-
curl -X POST "http://localhost:7860/tts-c3po" \
|
49 |
-
-F "text=I am C-3PO, human-cyborg relations." \
|
50 |
-
-F "language=en" \
|
51 |
-
--output c3po_voice.wav
|
52 |
-
```
|
53 |
-
|
54 |
-
### 4. General Text-to-Speech
|
55 |
-
```bash
|
56 |
-
POST /tts
|
57 |
-
```
|
58 |
-
**Parameters:**
|
59 |
-
- `text` (string): Text to convert to speech (2-500 characters)
|
60 |
-
- `language` (string): Language code (default: "en")
|
61 |
-
- `speaker_file` (file, optional): Reference audio for voice cloning
|
62 |
-
- `use_c3po_voice` (boolean): Use C-3PO voice if no speaker file provided (default: true)
|
63 |
-
|
64 |
-
**Example using curl:**
|
65 |
-
```bash
|
66 |
-
# C-3PO voice (default)
|
67 |
-
curl -X POST "http://localhost:7860/tts" \
|
68 |
-
-F "text=The odds of successfully navigating an asteroid field are approximately 3,720 to 1." \
|
69 |
-
-F "language=en" \
|
70 |
-
--output c3po_output.wav
|
71 |
-
|
72 |
-
# Custom voice cloning
|
73 |
-
curl -X POST "http://localhost:7860/tts" \
|
74 |
-
-F "text=This will sound like the reference voice." \
|
75 |
-
-F "language=en" \
|
76 |
-
-F "speaker_file=@reference_voice.wav" \
|
77 |
-
-F "use_c3po_voice=false" \
|
78 |
-
--output cloned_voice.wav
|
79 |
-
```
|
80 |
-
|
81 |
-
### 5. JSON TTS (C-3PO Voice)
|
82 |
-
```bash
|
83 |
-
POST /tts-json
|
84 |
-
```
|
85 |
-
**JSON Body:**
|
86 |
-
```json
|
87 |
-
{
|
88 |
-
"text": "R2-D2, you know better than to trust a strange computer!",
|
89 |
-
"language": "en"
|
90 |
-
}
|
91 |
-
```
|
92 |
-
|
93 |
-
## 🚀 Deployment on Hugging Face Spaces
|
94 |
-
|
95 |
-
### Step 1: Create a new Space
|
96 |
-
1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
|
97 |
-
2. Click "Create new Space"
|
98 |
-
3. Choose "Docker" as the SDK
|
99 |
-
4. Set your space name and visibility
|
100 |
-
|
101 |
-
### Step 2: Add files to your Space
|
102 |
-
Upload these files to your Hugging Face Space repository:
|
103 |
-
|
104 |
-
```
|
105 |
-
your-space/
|
106 |
-
├── coqui_api.py # Main API file with C-3PO integration
|
107 |
-
├── requirements.txt # Dependencies (includes huggingface_hub)
|
108 |
-
├── Dockerfile.coqui # Docker configuration
|
109 |
-
├── test_c3po_model.py # Test script for C-3PO functionality
|
110 |
-
└── README.md # This file
|
111 |
-
```
|
112 |
-
|
113 |
-
### Step 3: Configure your Space
|
114 |
-
Rename the files in your Space:
|
115 |
-
- `Dockerfile.coqui` → `Dockerfile`
|
116 |
-
|
117 |
-
### Step 4: Deploy
|
118 |
-
Your Space will automatically build and deploy. The build process may take 15-20 minutes as it downloads the C-3PO fine-tuned model from Hugging Face.
|
119 |
-
|
120 |
-
## 💻 Local Development
|
121 |
-
|
122 |
-
### Requirements
|
123 |
-
- Python 3.11+
|
124 |
-
- PyTorch
|
125 |
-
- Coqui TTS library
|
126 |
-
- Hugging Face Hub
|
127 |
-
|
128 |
-
### Installation
|
129 |
-
```bash
|
130 |
-
# Clone the repository
|
131 |
-
git clone <your-repo>
|
132 |
-
cd <your-repo>
|
133 |
-
|
134 |
-
# Install dependencies
|
135 |
-
pip install -r requirements.txt
|
136 |
-
|
137 |
-
# Run the API
|
138 |
-
python coqui_api.py
|
139 |
-
```
|
140 |
-
|
141 |
-
The API will be available at `http://localhost:7860`
|
142 |
-
|
143 |
-
### Testing
|
144 |
-
```bash
|
145 |
-
# Run the C-3PO model test suite
|
146 |
-
python test_c3po_model.py
|
147 |
-
|
148 |
-
# Run the general test client
|
149 |
-
python test_coqui_api.py
|
150 |
-
```
|
151 |
-
|
152 |
-
## 🎪 Usage Examples
|
153 |
-
|
154 |
-
### Python Client - C-3PO Voice
|
155 |
-
```python
|
156 |
-
import requests
|
157 |
-
|
158 |
-
# C-3PO voice synthesis
|
159 |
-
data = {"text": "I am C-3PO, human-cyborg relations.", "language": "en"}
|
160 |
-
response = requests.post("http://localhost:7860/tts-c3po", data=data)
|
161 |
-
|
162 |
-
with open("c3po_output.wav", "wb") as f:
|
163 |
-
f.write(response.content)
|
164 |
-
|
165 |
-
# JSON API
|
166 |
-
import json
|
167 |
-
headers = {'Content-Type': 'application/json'}
|
168 |
-
data = {"text": "The odds are approximately 3,720 to 1!", "language": "en"}
|
169 |
-
response = requests.post("http://localhost:7860/tts-json", json=data, headers=headers)
|
170 |
-
|
171 |
-
with open("c3po_json.wav", "wb") as f:
|
172 |
-
f.write(response.content)
|
173 |
-
```
|
174 |
-
|
175 |
-
### JavaScript/Web - C-3PO Voice
|
176 |
-
```javascript
|
177 |
-
// C-3PO voice synthesis
|
178 |
-
const formData = new FormData();
|
179 |
-
formData.append('text', 'Oh my! How interesting!');
|
180 |
-
formData.append('language', 'en');
|
181 |
-
|
182 |
-
fetch('http://localhost:7860/tts-c3po', {
|
183 |
-
method: 'POST',
|
184 |
-
body: formData
|
185 |
-
})
|
186 |
-
.then(response => response.blob())
|
187 |
-
.then(blob => {
|
188 |
-
const url = URL.createObjectURL(blob);
|
189 |
-
const audio = new Audio(url);
|
190 |
-
audio.play();
|
191 |
-
});
|
192 |
-
|
193 |
-
// JSON API
|
194 |
-
fetch('http://localhost:7860/tts-json', {
|
195 |
-
method: 'POST',
|
196 |
-
headers: {'Content-Type': 'application/json'},
|
197 |
-
body: JSON.stringify({
|
198 |
-
text: 'R2-D2, you know better than to trust a strange computer!',
|
199 |
-
language: 'en'
|
200 |
-
})
|
201 |
-
})
|
202 |
-
.then(response => response.blob())
|
203 |
-
.then(blob => {
|
204 |
-
const url = URL.createObjectURL(blob);
|
205 |
-
const audio = new Audio(url);
|
206 |
-
audio.play();
|
207 |
-
});
|
208 |
-
```
|
209 |
-
|
210 |
-
## 🎨 C-3PO Voice Examples
|
211 |
-
|
212 |
-
Perfect texts for demonstrating C-3PO's voice characteristics:
|
213 |
-
|
214 |
-
```bash
|
215 |
-
# Classic C-3PO phrases
|
216 |
-
curl -X POST "http://localhost:7860/tts-c3po" \
|
217 |
-
-F "text=I am C-3PO, human-cyborg relations." \
|
218 |
-
-F "language=en" --output c3po_intro.wav
|
219 |
-
|
220 |
-
curl -X POST "http://localhost:7860/tts-c3po" \
|
221 |
-
-F "text=The odds of successfully navigating an asteroid field are approximately 3,720 to 1." \
|
222 |
-
-F "language=en" --output c3po_odds.wav
|
223 |
-
|
224 |
-
curl -X POST "http://localhost:7860/tts-c3po" \
|
225 |
-
-F "text=R2-D2, you know better than to trust a strange computer!" \
|
226 |
-
-F "language=en" --output c3po_r2d2.wav
|
227 |
-
|
228 |
-
curl -X POST "http://localhost:7860/tts-c3po" \
|
229 |
-
-F "text=Oh my! How interesting!" \
|
230 |
-
-F "language=en" --output c3po_oh_my.wav
|
231 |
-
```
|
232 |
-
|
233 |
-
## 🌍 Multilingual C-3PO Support
|
234 |
-
|
235 |
-
The C-3PO model maintains its distinctive voice characteristics across multiple languages:
|
236 |
-
|
237 |
-
```python
|
238 |
-
# Multilingual examples
|
239 |
-
languages = [
|
240 |
-
("Hello, I am C-3PO", "en"),
|
241 |
-
("Hola, soy C-3PO", "es"),
|
242 |
-
("Bonjour, je suis C-3PO", "fr"),
|
243 |
-
("Guten Tag, ich bin C-3PO", "de"),
|
244 |
-
("Ciao, sono C-3PO", "it"),
|
245 |
-
("Olá, eu sou C-3PO", "pt")
|
246 |
-
]
|
247 |
-
|
248 |
-
for text, lang in languages:
|
249 |
-
response = requests.post("http://localhost:7860/tts-c3po",
|
250 |
-
data={"text": text, "language": lang})
|
251 |
-
with open(f"c3po_{lang}.wav", "wb") as f:
|
252 |
-
f.write(response.content)
|
253 |
-
```
|
254 |
-
|
255 |
-
## 🔧 Voice Cloning Guide
|
256 |
-
|
257 |
-
1. **Prepare Reference Audio:**
|
258 |
-
- Duration: 5-10 seconds (optimal)
|
259 |
-
- Format: WAV, MP3, or M4A
|
260 |
-
- Quality: Clear speech, minimal background noise
|
261 |
-
- Content: Natural speaking, preferably in target language
|
262 |
-
|
263 |
-
2. **API Request:**
|
264 |
-
```bash
|
265 |
-
curl -X POST "http://your-space.hf.space/tts" \
|
266 |
-
-F "text=Your text to synthesize" \
|
267 |
-
-F "language=en" \
|
268 |
-
-F "speaker_file=@your_reference.wav" \
|
269 |
-
--output result.wav
|
270 |
-
```
|
271 |
-
|
272 |
-
3. **Tips for Best Results:**
|
273 |
-
- Use high-quality reference audio
|
274 |
-
- Match the language of reference and target text
|
275 |
-
- Keep text length reasonable (under 500 characters)
|
276 |
-
- Experiment with different reference samples
|
277 |
-
|
278 |
-
## Supported Languages
|
279 |
-
|
280 |
-
The XTTS v2 model supports multiple languages including:
|
281 |
-
- English (en)
|
282 |
-
- Spanish (es)
|
283 |
-
- French (fr)
|
284 |
-
- German (de)
|
285 |
-
- Italian (it)
|
286 |
-
- Portuguese (pt)
|
287 |
-
- Polish (pl)
|
288 |
-
- Turkish (tr)
|
289 |
-
- Russian (ru)
|
290 |
-
- Dutch (nl)
|
291 |
-
- Czech (cs)
|
292 |
-
- Arabic (ar)
|
293 |
-
- Chinese (zh-cn)
|
294 |
-
- Japanese (ja)
|
295 |
-
- Hungarian (hu)
|
296 |
-
- Korean (ko)
|
297 |
-
|
298 |
-
## Troubleshooting
|
299 |
-
|
300 |
-
### Common Issues
|
301 |
-
|
302 |
-
1. **Model Download Errors:**
|
303 |
-
- The first run downloads ~1.7GB model files
|
304 |
-
- Ensure stable internet connection
|
305 |
-
- Check Hugging Face Spaces logs
|
306 |
-
|
307 |
-
2. **Audio Quality Issues:**
|
308 |
-
- Use high-quality reference audio for voice cloning
|
309 |
-
- Ensure reference audio matches target language
|
310 |
-
- Try different reference samples
|
311 |
-
|
312 |
-
3. **Memory Issues on HF Spaces:**
|
313 |
-
- The model requires significant memory
|
314 |
-
- Consider upgrading to a higher-tier Space if needed
|
315 |
-
|
316 |
-
4. **API Timeouts:**
|
317 |
-
- Initial model loading takes time
|
318 |
-
- Subsequent requests are faster
|
319 |
-
- Consider warming up the model with a test request
|
320 |
-
|
321 |
-
### Environment Variables
|
322 |
-
|
323 |
-
- `COQUI_TOS_AGREED=1`: Accepts Coqui TTS terms of service
|
324 |
-
- `HF_HUB_DISABLE_TELEMETRY=1`: Disables telemetry
|
325 |
-
- `TORCH_HOME`: PyTorch cache directory
|
326 |
-
|
327 |
-
## API Documentation
|
328 |
-
|
329 |
-
Once deployed, visit your Space URL and add `/docs` to access the interactive API documentation:
|
330 |
-
```
|
331 |
-
https://your-username-your-space-name.hf.space/docs
|
332 |
-
```
|
333 |
-
|
334 |
-
## Contributing
|
335 |
-
|
336 |
-
1. Fork the repository
|
337 |
-
2. Create a feature branch
|
338 |
-
3. Make your changes
|
339 |
-
4. Test thoroughly
|
340 |
-
5. Submit a pull request
|
341 |
-
|
342 |
-
## License
|
343 |
-
|
344 |
-
This project uses the Coqui TTS library. Please check the [Coqui TTS license](https://github.com/coqui-ai/TTS) for usage terms.
|
345 |
-
|
346 |
-
## Credits
|
347 |
-
|
348 |
-
- [Coqui TTS](https://github.com/coqui-ai/TTS) - The underlying TTS engine
|
349 |
-
- [XTTS v2](https://arxiv.org/abs/2309.11321) - The voice cloning model
|
350 |
-
- [FastAPI](https://fastapi.tiangolo.com/) - Web framework
|
351 |
-
- [Hugging Face Spaces](https://huggingface.co/spaces) - Deployment platform
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
DELETED
@@ -1,414 +0,0 @@
|
|
1 |
-
# Import configuration first to setup environment
|
2 |
-
import app_config
|
3 |
-
|
4 |
-
import os
|
5 |
-
import sys
|
6 |
-
import io
|
7 |
-
import subprocess
|
8 |
-
import uuid
|
9 |
-
import time
|
10 |
-
import torch
|
11 |
-
import torchaudio
|
12 |
-
import tempfile
|
13 |
-
import logging
|
14 |
-
from typing import Optional
|
15 |
-
|
16 |
-
# Fix PyTorch weights_only issue for XTTS
|
17 |
-
import torch.serialization
|
18 |
-
from TTS.tts.configs.xtts_config import XttsConfig
|
19 |
-
torch.serialization.add_safe_globals([XttsConfig])
|
20 |
-
|
21 |
-
# Set environment variables
|
22 |
-
os.environ["COQUI_TOS_AGREED"] = "1"
|
23 |
-
os.environ["NUMBA_DISABLE_JIT"] = "1"
|
24 |
-
|
25 |
-
# Force CPU usage if specified
|
26 |
-
if os.environ.get("FORCE_CPU", "false").lower() == "true":
|
27 |
-
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
28 |
-
|
29 |
-
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
|
30 |
-
from fastapi.responses import FileResponse
|
31 |
-
from pydantic import BaseModel
|
32 |
-
import langid
|
33 |
-
from scipy.io.wavfile import write
|
34 |
-
from pydub import AudioSegment
|
35 |
-
|
36 |
-
from TTS.api import TTS
|
37 |
-
from TTS.tts.configs.xtts_config import XttsConfig
|
38 |
-
from TTS.tts.models.xtts import Xtts
|
39 |
-
from TTS.utils.generic_utils import get_user_data_dir
|
40 |
-
|
41 |
-
# Configure logging
|
42 |
-
logging.basicConfig(level=logging.INFO)
|
43 |
-
logger = logging.getLogger(__name__)
|
44 |
-
|
45 |
-
app = FastAPI(title="XTTS C3PO API", description="Text-to-Speech API using XTTS-v2 C3PO model", version="1.0.0")
|
46 |
-
|
47 |
-
class TTSRequest(BaseModel):
    # Request body model used by the JSON text-to-speech endpoint.

    # Text to synthesize.
    text: str
    # Language code handed to the model; English when omitted.
    language: str = "en"
    # Ask the service to run its reference-audio cleanup step first.
    voice_cleanup: bool = False
    # When True, the langid-based language detection check is skipped.
    no_lang_auto_detect: bool = False
|
52 |
-
|
53 |
-
class XTTSService:
    """Load the XTTS-v2 C3PO checkpoint once and synthesize speech with it.

    Construction picks the compute device, clones the model repository when
    ``config.json`` is missing, loads the checkpoint and selects a default
    reference clip from the model directory (if one exists).
    """

    # File extensions accepted when searching for a fallback reference clip.
    _REFERENCE_EXTENSIONS = ('.wav', '.mp3', '.m4a')

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        # Use the C3PO model path
        self.model_path = "XTTS-v2_C3PO/"
        self.config_path = "XTTS-v2_C3PO/config.json"

        # Check if model files exist, if not download them
        if not os.path.exists(self.config_path):
            logger.info("C3PO model not found locally, downloading...")
            self._download_c3po_model()

        # Load configuration
        config = XttsConfig()
        config.load_json(self.config_path)

        # Initialize and load model
        self.model = Xtts.init_from_config(config)
        self.model.load_checkpoint(
            config,
            checkpoint_path=os.path.join(self.model_path, "model.pth"),
            vocab_path=os.path.join(self.model_path, "vocab.json"),
            eval=True,
        )

        if self.device == "cuda":
            self.model.cuda()

        self.supported_languages = config.languages
        logger.info(f"XTTS C3PO model loaded successfully. Supported languages: {self.supported_languages}")

        # Set default reference audio (C3PO voice)
        self.default_reference = self._find_default_reference()
        if self.default_reference:
            logger.info(f"Default C3PO reference audio: {self.default_reference}")
        else:
            logger.warning("No default reference audio found in C3PO model directory")

    def _find_default_reference(self) -> Optional[str]:
        """Return the default reference clip path, or None when there is none.

        ``reference.wav`` inside the model directory wins; otherwise the first
        directory entry (in ``os.listdir`` order) with an audio extension is used.
        """
        preferred = os.path.join(self.model_path, "reference.wav")
        if os.path.exists(preferred):
            return preferred
        for entry in os.listdir(self.model_path):
            if entry.endswith(self._REFERENCE_EXTENSIONS):
                return os.path.join(self.model_path, entry)
        return None

    def _download_c3po_model(self):
        """Clone the C3PO model repository from Hugging Face into ``XTTS-v2_C3PO``.

        Raises:
            HTTPException: status 500 when ``git clone`` exits with an error.
        """
        try:
            logger.info("Downloading C3PO model from Hugging Face...")
            subprocess.run([
                "git", "clone",
                "https://huggingface.co/Borcherding/XTTS-v2_C3PO",
                "XTTS-v2_C3PO"
            ], check=True)
            logger.info("C3PO model downloaded successfully")
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to download C3PO model: {e}")
            raise HTTPException(status_code=500, detail="Failed to download C3PO model") from e

    def generate_speech(self, text: str, speaker_wav_path: Optional[str] = None, language: str = "en",
                        voice_cleanup: bool = False, no_lang_auto_detect: bool = False) -> str:
        """Generate speech and return the path to the output WAV file.

        Args:
            text: Text to synthesize; must be 2 to 500 characters long.
            speaker_wav_path: Reference audio to clone. ``None`` selects the
                default C3PO reference clip.
            language: Language code; must be in ``self.supported_languages``.
            voice_cleanup: Run ``_cleanup_audio`` on the reference first.
            no_lang_auto_detect: Skip the langid check (which only logs a
                warning on mismatch; it never overrides ``language``).

        Returns:
            Path of a newly written WAV file in the system temp directory.
            The caller owns this file; nothing in this class deletes it.

        Raises:
            HTTPException: 400 for invalid input or unusable reference audio,
                500 for any other failure during synthesis.
        """
        try:
            # Use default C3PO voice if no speaker file provided
            if speaker_wav_path is None:
                if self.default_reference is None:
                    raise HTTPException(status_code=400, detail="No reference audio available. Please upload a speaker file.")
                speaker_wav_path = self.default_reference
                logger.info("Using default C3PO voice")

            # Validate language
            if language not in self.supported_languages:
                raise HTTPException(status_code=400, detail=f"Language '{language}' not supported. Supported: {self.supported_languages}")

            # Language detection for longer texts
            if len(text) > 15 and not no_lang_auto_detect:
                language_predicted = langid.classify(text)[0].strip()
                if language_predicted == "zh":
                    language_predicted = "zh-cn"

                if language_predicted != language:
                    logger.warning(f"Detected language: {language_predicted}, chosen: {language}")

            # Text length validation
            if len(text) < 2:
                raise HTTPException(status_code=400, detail="Text too short, please provide longer text")

            if len(text) > 500:  # Increased limit for API
                raise HTTPException(status_code=400, detail="Text too long, maximum 500 characters")

            # Voice cleanup if requested
            processed_speaker_wav = speaker_wav_path
            if voice_cleanup:
                processed_speaker_wav = self._cleanup_audio(speaker_wav_path)

            # Generate conditioning latents
            try:
                gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
                    audio_path=processed_speaker_wav,
                    gpt_cond_len=30,
                    gpt_cond_chunk_len=4,
                    max_ref_length=60
                )
            except Exception as e:
                logger.error(f"Speaker encoding error: {e}")
                raise HTTPException(status_code=400, detail="Error processing reference audio. Please check the audio file.") from e

            # Generate speech
            logger.info("Generating speech...")
            start_time = time.time()

            out = self.model.inference(
                text,
                language,
                gpt_cond_latent,
                speaker_embedding,
                repetition_penalty=5.0,
                temperature=0.75,
            )

            inference_time = time.time() - start_time
            logger.info(f"Speech generation completed in {inference_time:.2f} seconds")

            # Save output
            output_filename = f"xtts_c3po_output_{uuid.uuid4().hex}.wav"
            output_path = os.path.join(tempfile.gettempdir(), output_filename)

            torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)

            return output_path

        except HTTPException as e:
            # Already carries the right status code; log and let it through untouched.
            logger.error(f"Error generating speech: {e}")
            raise
        except Exception as e:
            logger.error(f"Error generating speech: {e}")
            raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}") from e

    def _cleanup_audio(self, audio_path: str) -> str:
        """Apply audio cleanup filters.

        Placeholder: no filtering is implemented yet, so the input path is
        returned unchanged.
        """
        return audio_path
|
207 |
-
|
208 |
-
# Initialize XTTS service
|
209 |
-
logger.info("Initializing XTTS C3PO service...")
|
210 |
-
tts_service = XTTSService()
|
211 |
-
|
212 |
-
@app.get("/")
async def root():
    # Liveness endpoint: a fixed payload naming the service and its voice model.
    payload = {
        "message": "XTTS C3PO API is running",
        "status": "healthy",
        "model": "C3PO",
    }
    return payload
|
215 |
-
|
216 |
-
@app.get("/health")
async def health_check():
    # Health report built from the module-level service: device, languages and
    # whether a default reference clip was found.
    default_voice = "C3PO" if tts_service.default_reference else "None"
    return {
        "status": "healthy",
        "device": tts_service.device,
        "model": "XTTS-v2 C3PO",
        "supported_languages": tts_service.supported_languages,
        "default_voice": default_voice,
    }
|
225 |
-
|
226 |
-
@app.get("/languages")
async def get_languages():
    """Get list of supported languages"""
    # The list comes straight from the loaded model configuration.
    languages = tts_service.supported_languages
    return {"languages": languages}
|
230 |
-
|
231 |
-
@app.post("/tts")
async def text_to_speech(
    text: str = Form(...),
    language: str = Form("en"),
    voice_cleanup: bool = Form(False),
    no_lang_auto_detect: bool = Form(False),
    speaker_file: UploadFile = File(None)
):
    """
    Convert text to speech using XTTS C3PO voice cloning

    - **text**: The text to convert to speech (max 500 characters)
    - **language**: Language code (default: "en")
    - **voice_cleanup**: Apply audio cleanup to reference voice
    - **no_lang_auto_detect**: Disable automatic language detection
    - **speaker_file**: Reference speaker audio file (optional, uses C3PO voice if not provided)
    """

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    speaker_temp_path = None

    try:
        # Handle speaker file if provided
        if speaker_file is not None:
            # Validate file type; a missing content type counts as "not audio"
            # instead of crashing on None.startswith.
            if not (speaker_file.content_type or "").startswith('audio/'):
                raise HTTPException(status_code=400, detail="Speaker file must be an audio file")

            # Save uploaded speaker file temporarily
            speaker_temp_path = os.path.join(tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav")
            content = await speaker_file.read()
            with open(speaker_temp_path, "wb") as buffer:
                buffer.write(content)

        # Generate speech (will use C3PO voice if no speaker file provided)
        output_path = tts_service.generate_speech(
            text,
            speaker_temp_path,
            language,
            voice_cleanup,
            no_lang_auto_detect
        )

        # Return the generated audio file. No explicit Content-Disposition
        # header is passed: FileResponse builds an "attachment" header that
        # includes the filename, and a caller-supplied header would suppress it.
        # NOTE(review): output_path is never deleted after the response is sent.
        voice_type = "custom" if speaker_file is not None else "c3po"
        return FileResponse(
            output_path,
            media_type="audio/wav",
            filename=f"xtts_{voice_type}_output_{uuid.uuid4().hex}.wav",
        )

    except HTTPException as e:
        logger.error(f"Error in TTS endpoint: {e}")
        raise
    except Exception as e:
        logger.error(f"Error in TTS endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Single cleanup path for the uploaded reference, on success and on error.
        if speaker_temp_path and os.path.exists(speaker_temp_path):
            try:
                os.remove(speaker_temp_path)
            except OSError as cleanup_error:
                # Best effort: a leftover temp file must not fail the request.
                logger.warning(f"Could not remove temporary speaker file {speaker_temp_path}: {cleanup_error}")
|
305 |
-
|
306 |
-
@app.post("/tts-json")
async def text_to_speech_json(
    request: TTSRequest,
    speaker_file: UploadFile = File(None)
):
    """
    Convert text to speech using JSON request body

    - **request**: TTSRequest containing text, language, and options
    - **speaker_file**: Reference speaker audio file (optional, uses C3PO voice if not provided)
    """
    # NOTE(review): this signature combines a body model with a File() upload.
    # FastAPI's documentation says File/Form parameters cannot be combined with
    # a body that is expected as JSON -- confirm that JSON-only clients can
    # actually reach this handler.

    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    speaker_temp_path = None

    try:
        # Handle speaker file if provided
        if speaker_file is not None:
            # Validate file type; a missing content type counts as "not audio"
            # instead of crashing on None.startswith.
            if not (speaker_file.content_type or "").startswith('audio/'):
                raise HTTPException(status_code=400, detail="Speaker file must be an audio file")

            # Save uploaded speaker file temporarily
            speaker_temp_path = os.path.join(tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav")
            content = await speaker_file.read()
            with open(speaker_temp_path, "wb") as buffer:
                buffer.write(content)

        # Generate speech
        output_path = tts_service.generate_speech(
            request.text,
            speaker_temp_path,
            request.language,
            request.voice_cleanup,
            request.no_lang_auto_detect
        )

        # Return the generated audio file. No explicit Content-Disposition
        # header is passed: FileResponse builds an "attachment" header that
        # includes the filename, and a caller-supplied header would suppress it.
        # NOTE(review): output_path is never deleted after the response is sent.
        voice_type = "custom" if speaker_file is not None else "c3po"
        return FileResponse(
            output_path,
            media_type="audio/wav",
            filename=f"xtts_{voice_type}_{request.language}_{uuid.uuid4().hex}.wav",
        )

    except HTTPException as e:
        logger.error(f"Error in TTS JSON endpoint: {e}")
        raise
    except Exception as e:
        logger.error(f"Error in TTS JSON endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Single cleanup path for the uploaded reference, on success and on error.
        if speaker_temp_path and os.path.exists(speaker_temp_path):
            try:
                os.remove(speaker_temp_path)
            except OSError as cleanup_error:
                # Best effort: a leftover temp file must not fail the request.
                logger.warning(f"Could not remove temporary speaker file {speaker_temp_path}: {cleanup_error}")
|
374 |
-
|
375 |
-
@app.post("/tts-c3po")
async def text_to_speech_c3po_only(
    text: str = Form(...),
    language: str = Form("en"),
    no_lang_auto_detect: bool = Form(False)
):
    """
    Convert text to speech using C3PO voice only (no file upload needed)

    - **text**: The text to convert to speech (max 500 characters)
    - **language**: Language code (default: "en")
    - **no_lang_auto_detect**: Disable automatic language detection
    """

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    try:
        # Generate speech using C3PO voice
        output_path = tts_service.generate_speech(
            text,
            None,  # Use default C3PO voice
            language,
            False,  # No voice cleanup needed for default voice
            no_lang_auto_detect
        )

        # Return the generated audio file. No explicit Content-Disposition
        # header is passed: FileResponse builds an "attachment" header that
        # includes the filename, and a caller-supplied header would suppress it.
        # NOTE(review): output_path is never deleted after the response is sent.
        return FileResponse(
            output_path,
            media_type="audio/wav",
            filename=f"c3po_voice_{uuid.uuid4().hex}.wav",
        )

    except HTTPException as e:
        logger.error(f"Error in C3PO TTS endpoint: {e}")
        raise
    except Exception as e:
        logger.error(f"Error in C3PO TTS endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app_config.py
DELETED
@@ -1,54 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Configuration for the XTTS C3PO TTS API, especially for Hugging Face Spaces deployment.
|
3 |
-
"""
|
4 |
-
|
5 |
-
import os
|
6 |
-
import tempfile
|
7 |
-
import logging
|
8 |
-
|
9 |
-
# Configure logging
|
10 |
-
logging.basicConfig(level=logging.INFO)
|
11 |
-
logger = logging.getLogger(__name__)
|
12 |
-
|
13 |
-
def setup_hf_cache():
    """Point the Hugging Face, PyTorch and Numba caches at ``~/.cache``.

    Exports the cache-related environment variables (overwriting any existing
    values), disables Numba JIT and Hugging Face Hub telemetry, and then
    creates the cache directories. A directory that cannot be created is
    logged as a warning rather than raised.
    """
    cache_root = os.path.join(os.path.expanduser("~"), ".cache")
    numba_cache = os.path.join(cache_root, 'numba')

    env_overrides = {
        'HF_HOME': cache_root,
        'TRANSFORMERS_CACHE': cache_root,
        'HF_HUB_CACHE': cache_root,
        'TORCH_HOME': cache_root,
        'NUMBA_CACHE_DIR': numba_cache,
        'NUMBA_DISABLE_JIT': '1',
        'HF_HUB_DISABLE_TELEMETRY': '1',
    }

    # Export every setting, logging each one as it is applied.
    for name in env_overrides:
        setting = env_overrides[name]
        os.environ[name] = setting
        logger.info(f"Set {name} to {setting}")

    # Create the directories the variables point at.
    for directory in (cache_root, numba_cache):
        try:
            os.makedirs(directory, exist_ok=True)
            logger.info(f"Created cache directory: {directory}")
        except Exception as e:
            logger.warning(f"Could not create {directory}: {e}")

    logger.info("Cache environment setup completed")
|
44 |
-
|
45 |
-
def get_temp_dir():
    """Return the directory Python's ``tempfile`` module uses for temporary files."""
    temp_dir = tempfile.gettempdir()
    return temp_dir
|
48 |
-
|
49 |
-
def is_hf_spaces():
    """Report whether the ``SPACE_ID`` environment variable is set.

    Any value counts, including an empty string.
    """
    return 'SPACE_ID' in os.environ
|
52 |
-
|
53 |
-
# Initialize cache setup
|
54 |
-
setup_hf_cache()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
client_example.py
DELETED
@@ -1,269 +0,0 @@
|
|
1 |
-
import requests
|
2 |
-
import os
|
3 |
-
|
4 |
-
def test_c3po_voice():
    """Request a C3PO sample from the local ``/tts-c3po`` endpoint and save it.

    Sends form data only (no reference upload) and writes the response body
    to ``c3po_voice_sample.wav``. Failures are printed, not raised.
    """
    endpoint = "http://localhost:7860/tts-c3po"
    text = "Hello there! I am C-3PO, human-cyborg relations. How may I assist you today?"
    form = {
        "text": text,
        "language": "en",
        "no_lang_auto_detect": False
    }

    try:
        print("Testing C3PO voice...")
        print(f"Text: {text}")

        response = requests.post(endpoint, data=form)

        # Anything but 200 is reported and ends the test early.
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            print(response.text)
            return

        output_filename = "c3po_voice_sample.wav"
        with open(output_filename, "wb") as audio_out:
            audio_out.write(response.content)
        print(f"Success! C3PO voice sample saved as {output_filename}")

    except requests.exceptions.ConnectionError:
        print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
    except Exception as e:
        print(f"Error: {e}")
|
40 |
-
|
41 |
-
def test_xtts_with_custom_voice():
    """Example of using XTTS with custom voice upload.

    Posts ``reference.wav`` from the working directory to the local ``/tts``
    endpoint as the speaker reference and saves the result to
    ``custom_voice_clone.wav``. If the reference file is missing, setup
    instructions are printed and the test is skipped. Failures are printed,
    not raised.
    """

    # API endpoint
    url = "http://localhost:7860/tts"

    # Text to convert to speech
    text = "This is a test of XTTS voice cloning with a custom reference voice."

    # Path to your speaker reference audio file
    speaker_file_path = "reference.wav"  # Update this path to your reference audio

    # Check if speaker file exists
    if not os.path.exists(speaker_file_path):
        print(f"Custom voice test skipped: Speaker file not found at {speaker_file_path}")
        print("To test custom voice cloning:")
        print("1. Record 3-10 seconds of clear speech")
        print("2. Save as 'reference.wav' in this directory")
        print("3. Run this test again")
        return

    # Prepare the request data
    data = {
        "text": text,
        "language": "en",
        "voice_cleanup": False,
        "no_lang_auto_detect": False
    }

    # The context manager closes the reference file on every exit path,
    # replacing the manual close() in a finally block.
    with open(speaker_file_path, "rb") as speaker_handle:
        files = {"speaker_file": speaker_handle}

        try:
            print("Testing XTTS with custom voice...")
            print(f"Text: {text}")
            print(f"Speaker file: {speaker_file_path}")

            response = requests.post(url, data=data, files=files)

            if response.status_code == 200:
                # Save the generated audio
                output_filename = "custom_voice_clone.wav"
                with open(output_filename, "wb") as f:
                    f.write(response.content)
                print(f"Success! Custom voice clone saved as {output_filename}")
            else:
                print(f"Error: {response.status_code}")
                print(response.text)

        except requests.exceptions.ConnectionError:
            print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
        except Exception as e:
            print(f"Error: {e}")
|
97 |
-
|
98 |
-
def test_xtts_fallback_to_c3po():
    """Call ``/tts`` without a speaker file and save the resulting audio.

    With no reference uploaded the server is expected to answer in the C3PO
    voice; the response body is written to ``xtts_c3po_fallback.wav``.
    Failures are printed, not raised.
    """
    endpoint = "http://localhost:7860/tts"
    text = "When no custom voice is provided, I will speak in the C3PO voice by default."

    # Form fields only -- deliberately no speaker file.
    form = {
        "text": text,
        "language": "en",
        "voice_cleanup": False,
        "no_lang_auto_detect": False
    }

    try:
        print("Testing XTTS fallback to C3PO voice...")
        print(f"Text: {text}")

        response = requests.post(endpoint, data=form)

        # Anything but 200 is reported and ends the test early.
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            print(response.text)
            return

        output_filename = "xtts_c3po_fallback.wav"
        with open(output_filename, "wb") as audio_out:
            audio_out.write(response.content)
        print(f"Success! XTTS with C3PO fallback saved as {output_filename}")

    except requests.exceptions.ConnectionError:
        print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
    except Exception as e:
        print(f"Error: {e}")
|
135 |
-
|
136 |
-
def test_multilingual_c3po():
|
137 |
-
"""Test C3PO voice in different languages"""
|
138 |
-
|
139 |
-
# API endpoint for C3PO voice only
|
140 |
-
url = "http://localhost:7860/tts-c3po"
|
141 |
-
|
142 |
-
# Test different languages
|
143 |
-
test_cases = [
|
144 |
-
("en", "Hello, I am C-3PO. I am fluent in over six million forms of communication."),
|
145 |
-
("es", "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación."),
|
146 |
-
("fr", "Bonjour, je suis C-3PO. Je maîtrise plus de six millions de formes de communication."),
|
147 |
-
("de", "Hallo, ich bin C-3PO. Ich beherrsche über sechs Millionen Kommunikationsformen."),
|
148 |
-
]
|
149 |
-
|
150 |
-
for language, text in test_cases:
|
151 |
-
data = {
|
152 |
-
"text": text,
|
153 |
-
"language": language,
|
154 |
-
"no_lang_auto_detect": True # Force the specified language
|
155 |
-
}
|
156 |
-
|
157 |
-
try:
|
158 |
-
print(f"Testing C3PO voice in {language.upper()}...")
|
159 |
-
print(f"Text: {text}")
|
160 |
-
|
161 |
-
response = requests.post(url, data=data)
|
162 |
-
|
163 |
-
if response.status_code == 200:
|
164 |
-
# Save the generated audio
|
165 |
-
output_filename = f"c3po_voice_{language}.wav"
|
166 |
-
with open(output_filename, "wb") as f:
|
167 |
-
f.write(response.content)
|
168 |
-
print(f"Success! C3PO {language} voice saved as {output_filename}")
|
169 |
-
else:
|
170 |
-
print(f"Error: {response.status_code}")
|
171 |
-
print(response.text)
|
172 |
-
|
173 |
-
except requests.exceptions.ConnectionError:
|
174 |
-
print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
|
175 |
-
except Exception as e:
|
176 |
-
print(f"Error: {e}")
|
177 |
-
|
178 |
-
print() # Add spacing between tests
|
179 |
-
|
180 |
-
def get_supported_languages():
|
181 |
-
"""Get list of supported languages"""
|
182 |
-
try:
|
183 |
-
response = requests.get("http://localhost:7860/languages")
|
184 |
-
if response.status_code == 200:
|
185 |
-
languages = response.json()
|
186 |
-
print("Supported languages:", languages["languages"])
|
187 |
-
return languages["languages"]
|
188 |
-
else:
|
189 |
-
print("Failed to get languages:", response.status_code)
|
190 |
-
return []
|
191 |
-
except requests.exceptions.ConnectionError:
|
192 |
-
print("API is not running. Start it with: uvicorn app:app --host 0.0.0.0 --port 7860")
|
193 |
-
return []
|
194 |
-
|
195 |
-
def check_api_health():
|
196 |
-
"""Check if the API is running"""
|
197 |
-
try:
|
198 |
-
response = requests.get("http://localhost:7860/health")
|
199 |
-
if response.status_code == 200:
|
200 |
-
health_info = response.json()
|
201 |
-
print("API Health Check:")
|
202 |
-
print(f" Status: {health_info['status']}")
|
203 |
-
print(f" Device: {health_info['device']}")
|
204 |
-
print(f" Model: {health_info['model']}")
|
205 |
-
print(f" Default Voice: {health_info['default_voice']}")
|
206 |
-
print(f" Languages: {len(health_info['supported_languages'])} supported")
|
207 |
-
return True
|
208 |
-
else:
|
209 |
-
print("API health check failed:", response.status_code)
|
210 |
-
return False
|
211 |
-
except requests.exceptions.ConnectionError:
|
212 |
-
print("API is not running. Start it with: uvicorn app:app --host 0.0.0.0 --port 7860")
|
213 |
-
return False
|
214 |
-
|
215 |
-
def create_sample_reference():
|
216 |
-
"""Instructions for creating a reference audio file"""
|
217 |
-
print("\n" + "="*50)
|
218 |
-
print("REFERENCE AUDIO SETUP")
|
219 |
-
print("="*50)
|
220 |
-
print("To use XTTS voice cloning, you need a reference audio file:")
|
221 |
-
print("1. Record 3-10 seconds of clear speech")
|
222 |
-
print("2. Save as WAV format (recommended)")
|
223 |
-
print("3. Ensure good audio quality (no background noise)")
|
224 |
-
print("4. Place the file in the same directory as this script")
|
225 |
-
print("5. Update the 'speaker_file_path' variable in the functions above")
|
226 |
-
print("\nExample recording text:")
|
227 |
-
print("'Hello, this is my voice. I'm recording this sample for voice cloning.'")
|
228 |
-
print("="*50)
|
229 |
-
|
230 |
-
if __name__ == "__main__":
|
231 |
-
print("XTTS C3PO API Client Example")
|
232 |
-
print("=" * 40)
|
233 |
-
|
234 |
-
# First check if API is running
|
235 |
-
if check_api_health():
|
236 |
-
print()
|
237 |
-
|
238 |
-
# Get supported languages
|
239 |
-
languages = get_supported_languages()
|
240 |
-
print()
|
241 |
-
|
242 |
-
# Test C3PO voice (no file upload needed)
|
243 |
-
print("1. Testing C3PO voice (no upload required)...")
|
244 |
-
test_c3po_voice()
|
245 |
-
print()
|
246 |
-
|
247 |
-
# Test XTTS fallback to C3PO
|
248 |
-
print("2. Testing XTTS endpoint without speaker file (C3PO fallback)...")
|
249 |
-
test_xtts_fallback_to_c3po()
|
250 |
-
print()
|
251 |
-
|
252 |
-
# Test custom voice if reference file exists
|
253 |
-
print("3. Testing custom voice cloning...")
|
254 |
-
test_xtts_with_custom_voice()
|
255 |
-
print()
|
256 |
-
|
257 |
-
# Test multilingual C3PO
|
258 |
-
print("4. Testing multilingual C3PO voice...")
|
259 |
-
test_multilingual_c3po()
|
260 |
-
|
261 |
-
print("All tests completed!")
|
262 |
-
print("\nGenerated files:")
|
263 |
-
for file in os.listdir("."):
|
264 |
-
if file.endswith(".wav") and ("c3po" in file or "custom" in file or "xtts" in file):
|
265 |
-
print(f" - {file}")
|
266 |
-
|
267 |
-
else:
|
268 |
-
print("\nPlease start the API server first:")
|
269 |
-
print("uvicorn app:app --host 0.0.0.0 --port 7860")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,13 +1,8 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
num2words>=0.5.14
|
10 |
-
pysbd>=0.3.4
|
11 |
-
tqdm>=4.64.1
|
12 |
-
coqui-tts == 0.26.2
|
13 |
-
huggingface_hub>=0.17.0
|
|
|
1 |
+
fastapi>=0.104.1
|
2 |
+
uvicorn>=0.24.0
|
3 |
+
python-multipart>=0.0.6
|
4 |
+
torch>=2.0.0
|
5 |
+
torchaudio>=2.0.0
|
6 |
+
coqui-tts>=0.22.0
|
7 |
+
huggingface_hub>=0.17.0
|
8 |
+
pydantic>=2.0.0
|
|
|
|
|
|
|
|
|
|
requirements_coqui.txt
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
fastapi>=0.104.1
|
2 |
-
uvicorn[standard]>=0.24.0
|
3 |
-
python-multipart>=0.0.6
|
4 |
-
coqui-tts==0.26.2
|
5 |
-
torch>=2.0.0
|
6 |
-
torchaudio>=2.0.0
|
7 |
-
numpy>=1.24.0
|
8 |
-
scipy>=1.11.0
|
9 |
-
pydub>=0.25.1
|
10 |
-
librosa>=0.10.0
|
11 |
-
soundfile>=0.12.1
|
12 |
-
typing-extensions>=4.8.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
start_c3po_api.py
CHANGED
@@ -1,171 +1,52 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
-
|
4 |
-
Handles model download, initialization, and server startup
|
5 |
"""
|
6 |
|
7 |
import os
|
8 |
import sys
|
9 |
-
import subprocess
|
10 |
import logging
|
11 |
-
import time
|
12 |
-
from pathlib import Path
|
13 |
|
14 |
# Configure logging
|
15 |
-
logging.basicConfig(
|
16 |
-
level=logging.INFO,
|
17 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
18 |
-
)
|
19 |
logger = logging.getLogger(__name__)
|
20 |
|
21 |
-
def check_dependencies():
|
22 |
-
"""Check if all required dependencies are installed"""
|
23 |
-
logger.info("🔍 Checking dependencies...")
|
24 |
-
|
25 |
-
try:
|
26 |
-
import torch
|
27 |
-
import TTS
|
28 |
-
import fastapi
|
29 |
-
import huggingface_hub
|
30 |
-
logger.info("✅ All core dependencies found")
|
31 |
-
return True
|
32 |
-
except ImportError as e:
|
33 |
-
logger.error(f"❌ Missing dependency: {e}")
|
34 |
-
logger.info("💡 Install with: pip install -r requirements.txt")
|
35 |
-
return False
|
36 |
-
|
37 |
-
def check_gpu():
|
38 |
-
"""Check GPU availability"""
|
39 |
-
try:
|
40 |
-
import torch
|
41 |
-
if torch.cuda.is_available():
|
42 |
-
gpu_name = torch.cuda.get_device_name(0)
|
43 |
-
logger.info(f"🎮 GPU available: {gpu_name}")
|
44 |
-
return True
|
45 |
-
else:
|
46 |
-
logger.info("💻 No GPU available, using CPU")
|
47 |
-
return False
|
48 |
-
except Exception as e:
|
49 |
-
logger.warning(f"⚠️ GPU check failed: {e}")
|
50 |
-
return False
|
51 |
-
|
52 |
-
def check_disk_space():
|
53 |
-
"""Check available disk space for model download"""
|
54 |
-
try:
|
55 |
-
import shutil
|
56 |
-
free_space = shutil.disk_usage('.').free / (1024**3) # GB
|
57 |
-
|
58 |
-
if free_space < 5:
|
59 |
-
logger.warning(f"⚠️ Low disk space: {free_space:.1f}GB available")
|
60 |
-
logger.warning("💽 C-3PO model requires ~2GB space")
|
61 |
-
else:
|
62 |
-
logger.info(f"💾 Disk space: {free_space:.1f}GB available")
|
63 |
-
|
64 |
-
return free_space > 2
|
65 |
-
except Exception as e:
|
66 |
-
logger.warning(f"⚠️ Disk space check failed: {e}")
|
67 |
-
return True
|
68 |
-
|
69 |
def setup_environment():
|
70 |
-
"""Set up environment variables"""
|
71 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
72 |
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
|
73 |
-
|
74 |
-
# Create models directory
|
75 |
-
models_dir = Path("./models")
|
76 |
-
models_dir.mkdir(exist_ok=True)
|
77 |
-
|
78 |
logger.info("🌍 Environment configured")
|
79 |
|
80 |
-
def
|
81 |
-
"""
|
82 |
-
logger.info("
|
83 |
-
|
84 |
-
try:
|
85 |
-
subprocess.check_call([
|
86 |
-
sys.executable, "-m", "pip", "install", "-r", "requirements.txt"
|
87 |
-
])
|
88 |
-
logger.info("✅ Dependencies installed successfully")
|
89 |
-
return True
|
90 |
-
except subprocess.CalledProcessError as e:
|
91 |
-
logger.error(f"❌ Failed to install dependencies: {e}")
|
92 |
-
return False
|
93 |
-
|
94 |
-
def test_model_download():
|
95 |
-
"""Test if the C-3PO model can be downloaded"""
|
96 |
-
logger.info("🤖 Testing C-3PO model availability...")
|
97 |
|
98 |
try:
|
99 |
-
from huggingface_hub import repo_info
|
100 |
-
|
101 |
-
# Check if the repo exists and is accessible
|
102 |
-
info = repo_info(repo_id="Borcherding/XTTS-v2_C3PO")
|
103 |
-
logger.info(f"✅ C-3PO model accessible: {info.id}")
|
104 |
-
logger.info(f" Last modified: {info.last_modified}")
|
105 |
-
|
106 |
-
return True
|
107 |
-
except Exception as e:
|
108 |
-
logger.error(f"❌ C-3PO model not accessible: {e}")
|
109 |
-
return False
|
110 |
-
|
111 |
-
def start_api_server():
|
112 |
-
"""Start the FastAPI server"""
|
113 |
-
logger.info("🚀 Starting C-3PO TTS API server...")
|
114 |
-
|
115 |
-
try:
|
116 |
-
# Import and run the API
|
117 |
import uvicorn
|
118 |
from coqui_api import app
|
119 |
|
120 |
logger.info("🎭 C-3PO TTS API starting on http://localhost:7860")
|
121 |
-
logger.info("📖 API documentation
|
122 |
|
123 |
-
uvicorn.run(
|
124 |
-
app,
|
125 |
-
host="0.0.0.0",
|
126 |
-
port=7860,
|
127 |
-
log_level="info"
|
128 |
-
)
|
129 |
|
|
|
|
|
|
|
|
|
130 |
except Exception as e:
|
131 |
-
logger.error(f"❌ Failed to start API
|
132 |
-
|
133 |
|
134 |
def main():
|
135 |
"""Main startup sequence"""
|
136 |
-
print("🤖 C-3PO TTS API
|
137 |
-
print("=" *
|
138 |
|
139 |
-
# Step 1: Check dependencies
|
140 |
-
if not check_dependencies():
|
141 |
-
logger.info("📦 Attempting to install dependencies...")
|
142 |
-
if not install_dependencies():
|
143 |
-
logger.error("❌ Failed to install dependencies. Exiting.")
|
144 |
-
sys.exit(1)
|
145 |
-
|
146 |
-
# Step 2: Setup environment
|
147 |
setup_environment()
|
148 |
|
149 |
-
# Step 3: Check system resources
|
150 |
-
has_gpu = check_gpu()
|
151 |
-
has_space = check_disk_space()
|
152 |
-
|
153 |
-
if not has_space:
|
154 |
-
logger.error("❌ Insufficient disk space. Exiting.")
|
155 |
-
sys.exit(1)
|
156 |
-
|
157 |
-
# Step 4: Test model availability
|
158 |
-
if not test_model_download():
|
159 |
-
logger.warning("⚠️ C-3PO model may not be accessible")
|
160 |
-
logger.warning(" The API will fall back to standard XTTS v2")
|
161 |
-
|
162 |
-
# Step 5: Start the server
|
163 |
-
print("\n" + "=" * 50)
|
164 |
-
logger.info("🎬 All checks passed! Starting C-3PO TTS API...")
|
165 |
-
print("=" * 50)
|
166 |
-
|
167 |
try:
|
168 |
-
|
169 |
except KeyboardInterrupt:
|
170 |
logger.info("\n🛑 Server stopped by user")
|
171 |
except Exception as e:
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
+
Simple startup script for C-3PO TTS API
|
|
|
4 |
"""
|
5 |
|
6 |
import os
|
7 |
import sys
|
|
|
8 |
import logging
|
|
|
|
|
9 |
|
10 |
# Configure logging
|
11 |
+
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
|
|
12 |
logger = logging.getLogger(__name__)
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
def setup_environment():
|
15 |
+
"""Set up required environment variables"""
|
16 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
17 |
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
|
|
|
|
|
|
|
|
|
|
|
18 |
logger.info("🌍 Environment configured")
|
19 |
|
20 |
+
def start_api():
|
21 |
+
"""Start the C-3PO TTS API"""
|
22 |
+
logger.info("🤖 Starting C-3PO TTS API...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
import uvicorn
|
26 |
from coqui_api import app
|
27 |
|
28 |
logger.info("🎭 C-3PO TTS API starting on http://localhost:7860")
|
29 |
+
logger.info("📖 API documentation: http://localhost:7860/docs")
|
30 |
|
31 |
+
uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
+
except ImportError as e:
|
34 |
+
logger.error(f"❌ Missing dependency: {e}")
|
35 |
+
logger.info("💡 Install with: pip install -r requirements.txt")
|
36 |
+
sys.exit(1)
|
37 |
except Exception as e:
|
38 |
+
logger.error(f"❌ Failed to start API: {e}")
|
39 |
+
sys.exit(1)
|
40 |
|
41 |
def main():
|
42 |
"""Main startup sequence"""
|
43 |
+
print("🤖 C-3PO TTS API")
|
44 |
+
print("=" * 30)
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
setup_environment()
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
try:
|
49 |
+
start_api()
|
50 |
except KeyboardInterrupt:
|
51 |
logger.info("\n🛑 Server stopped by user")
|
52 |
except Exception as e:
|
startup.py
DELETED
@@ -1,120 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
"""
|
3 |
-
Startup script for Kokoro TTS API on Hugging Face Spaces
|
4 |
-
"""
|
5 |
-
|
6 |
-
import os
|
7 |
-
import sys
|
8 |
-
import logging
|
9 |
-
import subprocess
|
10 |
-
|
11 |
-
# Configure logging
|
12 |
-
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
13 |
-
logger = logging.getLogger(__name__)
|
14 |
-
|
15 |
-
def check_environment():
|
16 |
-
"""Check the environment and permissions"""
|
17 |
-
logger.info("=== Environment Check ===")
|
18 |
-
|
19 |
-
# Check if running on HF Spaces
|
20 |
-
space_id = os.environ.get('SPACE_ID')
|
21 |
-
if space_id:
|
22 |
-
logger.info(f"Running on Hugging Face Spaces: {space_id}")
|
23 |
-
else:
|
24 |
-
logger.info("Not running on Hugging Face Spaces")
|
25 |
-
|
26 |
-
# Check Python version
|
27 |
-
logger.info(f"Python version: {sys.version}")
|
28 |
-
|
29 |
-
# Check current user and home directory
|
30 |
-
logger.info(f"Current user: {os.getenv('USER', 'unknown')}")
|
31 |
-
logger.info(f"Home directory: {os.path.expanduser('~')}")
|
32 |
-
logger.info(f"Current working directory: {os.getcwd()}")
|
33 |
-
|
34 |
-
# Check available disk space
|
35 |
-
try:
|
36 |
-
result = subprocess.run(['df', '-h', '/tmp'], capture_output=True, text=True)
|
37 |
-
logger.info(f"Disk space in /tmp:\n{result.stdout}")
|
38 |
-
except Exception as e:
|
39 |
-
logger.warning(f"Could not check disk space: {e}")
|
40 |
-
|
41 |
-
# Check write permissions for important directories
|
42 |
-
test_dirs = ['/tmp', os.path.expanduser('~'), os.getcwd()]
|
43 |
-
for test_dir in test_dirs:
|
44 |
-
try:
|
45 |
-
test_file = os.path.join(test_dir, 'test_write.tmp')
|
46 |
-
with open(test_file, 'w') as f:
|
47 |
-
f.write('test')
|
48 |
-
os.remove(test_file)
|
49 |
-
logger.info(f"✅ Write permission OK: {test_dir}")
|
50 |
-
except Exception as e:
|
51 |
-
logger.warning(f"❌ Write permission failed: {test_dir} - {e}")
|
52 |
-
|
53 |
-
def check_dependencies():
|
54 |
-
"""Check if required packages are installed"""
|
55 |
-
logger.info("=== Checking dependencies ===")
|
56 |
-
|
57 |
-
required_packages = [
|
58 |
-
'kokoro',
|
59 |
-
'soundfile',
|
60 |
-
'torch',
|
61 |
-
'fastapi',
|
62 |
-
'uvicorn'
|
63 |
-
]
|
64 |
-
|
65 |
-
for package in required_packages:
|
66 |
-
try:
|
67 |
-
__import__(package)
|
68 |
-
logger.info(f"✅ {package} is available")
|
69 |
-
except ImportError:
|
70 |
-
logger.error(f"❌ {package} is not available")
|
71 |
-
|
72 |
-
def test_kokoro():
|
73 |
-
"""Test Kokoro TTS functionality"""
|
74 |
-
logger.info("=== Testing Kokoro TTS ===")
|
75 |
-
|
76 |
-
try:
|
77 |
-
# Import after setting up environment
|
78 |
-
import app_config # This will setup environment
|
79 |
-
from kokoro import KPipeline
|
80 |
-
|
81 |
-
logger.info("Initializing Kokoro pipeline...")
|
82 |
-
pipeline = KPipeline(lang_code='a')
|
83 |
-
logger.info("✅ Kokoro pipeline initialized successfully")
|
84 |
-
|
85 |
-
# Test generation
|
86 |
-
logger.info("Testing speech generation...")
|
87 |
-
text = "Hello, this is a test."
|
88 |
-
generator = pipeline(text, voice='af_heart')
|
89 |
-
|
90 |
-
for i, (gs, ps, audio) in enumerate(generator):
|
91 |
-
logger.info(f"✅ Generated audio segment {i}: gs={gs}, ps={ps}, audio shape: {audio.shape}")
|
92 |
-
break
|
93 |
-
|
94 |
-
logger.info("✅ Kokoro TTS test completed successfully")
|
95 |
-
return True
|
96 |
-
|
97 |
-
except Exception as e:
|
98 |
-
logger.error(f"❌ Kokoro TTS test failed: {e}")
|
99 |
-
import traceback
|
100 |
-
logger.error(f"Full traceback: {traceback.format_exc()}")
|
101 |
-
return False
|
102 |
-
|
103 |
-
def main():
|
104 |
-
"""Main startup function"""
|
105 |
-
logger.info("🚀 Starting Kokoro TTS API setup...")
|
106 |
-
|
107 |
-
check_environment()
|
108 |
-
check_dependencies()
|
109 |
-
|
110 |
-
if test_kokoro():
|
111 |
-
logger.info("🎉 All checks passed! Starting the API...")
|
112 |
-
# Import and start the app
|
113 |
-
import uvicorn
|
114 |
-
uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info")
|
115 |
-
else:
|
116 |
-
logger.error("❌ Setup failed. Please check the logs above.")
|
117 |
-
sys.exit(1)
|
118 |
-
|
119 |
-
if __name__ == "__main__":
|
120 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test.py
DELETED
@@ -1,144 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import torch
|
3 |
-
import torchaudio
|
4 |
-
import subprocess
|
5 |
-
|
6 |
-
# Set environment variables for CPU-only usage
|
7 |
-
os.environ['COQUI_TOS_AGREED'] = '1'
|
8 |
-
os.environ['NUMBA_DISABLE_JIT'] = '1'
|
9 |
-
os.environ['FORCE_CPU'] = 'true'
|
10 |
-
os.environ['CUDA_VISIBLE_DEVICES'] = ''
|
11 |
-
|
12 |
-
# Fix PyTorch weights_only issue for XTTS
|
13 |
-
import torch.serialization
|
14 |
-
from TTS.tts.configs.xtts_config import XttsConfig
|
15 |
-
torch.serialization.add_safe_globals([XttsConfig])
|
16 |
-
|
17 |
-
from TTS.api import TTS
|
18 |
-
from TTS.tts.configs.xtts_config import XttsConfig
|
19 |
-
from TTS.tts.models.xtts import Xtts
|
20 |
-
from TTS.utils.generic_utils import get_user_data_dir
|
21 |
-
|
22 |
-
print("Testing XTTS C3PO voice cloning...")
|
23 |
-
|
24 |
-
# C3PO model path
|
25 |
-
model_path = "XTTS-v2_C3PO/"
|
26 |
-
config_path = "XTTS-v2_C3PO/config.json"
|
27 |
-
|
28 |
-
# Check if model files exist, if not download them
|
29 |
-
if not os.path.exists(config_path):
|
30 |
-
print("C3PO model not found locally, downloading...")
|
31 |
-
try:
|
32 |
-
subprocess.run([
|
33 |
-
"git", "clone",
|
34 |
-
"https://huggingface.co/Borcherding/XTTS-v2_C3PO",
|
35 |
-
"XTTS-v2_C3PO"
|
36 |
-
], check=True)
|
37 |
-
print("C3PO model downloaded successfully")
|
38 |
-
except subprocess.CalledProcessError as e:
|
39 |
-
print(f"Failed to download C3PO model: {e}")
|
40 |
-
exit(1)
|
41 |
-
|
42 |
-
# Load configuration
|
43 |
-
config = XttsConfig()
|
44 |
-
config.load_json(config_path)
|
45 |
-
|
46 |
-
# Initialize and load model
|
47 |
-
model = Xtts.init_from_config(config)
|
48 |
-
model.load_checkpoint(
|
49 |
-
config,
|
50 |
-
checkpoint_path=os.path.join(model_path, "model.pth"),
|
51 |
-
vocab_path=os.path.join(model_path, "vocab.json"),
|
52 |
-
eval=True,
|
53 |
-
)
|
54 |
-
|
55 |
-
device = "cpu" # Force CPU usage
|
56 |
-
print(f"C3PO model loaded on {device} (forced CPU mode)")
|
57 |
-
|
58 |
-
# Text to convert to speech
|
59 |
-
text = "Hello there! I am C-3PO, human-cyborg relations. How may I assist you today?"
|
60 |
-
|
61 |
-
# Look for reference audio in the C3PO model directory
|
62 |
-
reference_audio_path = None
|
63 |
-
for file in os.listdir(model_path):
|
64 |
-
if file.endswith(('.wav', '.mp3', '.m4a')):
|
65 |
-
reference_audio_path = os.path.join(model_path, file)
|
66 |
-
print(f"Found C3PO reference audio: {file}")
|
67 |
-
break
|
68 |
-
|
69 |
-
# If no reference audio found, create a simple test reference
|
70 |
-
if reference_audio_path is None:
|
71 |
-
print("No reference audio found in C3PO model, creating test reference...")
|
72 |
-
reference_audio_path = "test_reference.wav"
|
73 |
-
|
74 |
-
# Generate a simple sine wave as placeholder
|
75 |
-
import numpy as np
|
76 |
-
sample_rate = 24000
|
77 |
-
duration = 3 # seconds
|
78 |
-
frequency = 440 # Hz
|
79 |
-
t = np.linspace(0, duration, int(sample_rate * duration))
|
80 |
-
audio_data = 0.3 * np.sin(2 * np.pi * frequency * t)
|
81 |
-
|
82 |
-
# Save as WAV
|
83 |
-
torchaudio.save(reference_audio_path, torch.tensor(audio_data).unsqueeze(0), sample_rate)
|
84 |
-
print(f"Test reference audio created: {reference_audio_path}")
|
85 |
-
|
86 |
-
try:
|
87 |
-
# Generate conditioning latents
|
88 |
-
print("Processing reference audio...")
|
89 |
-
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
|
90 |
-
audio_path=reference_audio_path,
|
91 |
-
gpt_cond_len=30,
|
92 |
-
gpt_cond_chunk_len=4,
|
93 |
-
max_ref_length=60
|
94 |
-
)
|
95 |
-
|
96 |
-
# Generate speech
|
97 |
-
print("Generating C3PO speech...")
|
98 |
-
out = model.inference(
|
99 |
-
text,
|
100 |
-
"en", # language
|
101 |
-
gpt_cond_latent,
|
102 |
-
speaker_embedding,
|
103 |
-
repetition_penalty=5.0,
|
104 |
-
temperature=0.75,
|
105 |
-
)
|
106 |
-
|
107 |
-
# Save output
|
108 |
-
output_path = "c3po_test_output.wav"
|
109 |
-
torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
110 |
-
print(f"C3PO speech generated successfully! Saved as: {output_path}")
|
111 |
-
|
112 |
-
# Test multilingual capabilities
|
113 |
-
print("\nTesting multilingual C3PO...")
|
114 |
-
multilingual_tests = [
|
115 |
-
("es", "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación."),
|
116 |
-
("fr", "Bonjour, je suis C-3PO. Je maîtrise plus de six millions de formes de communication."),
|
117 |
-
("de", "Hallo, ich bin C-3PO. Ich beherrsche über sechs Millionen Kommunikationsformen."),
|
118 |
-
]
|
119 |
-
|
120 |
-
for lang, test_text in multilingual_tests:
|
121 |
-
print(f"Generating {lang.upper()} speech...")
|
122 |
-
out = model.inference(
|
123 |
-
test_text,
|
124 |
-
lang,
|
125 |
-
gpt_cond_latent,
|
126 |
-
speaker_embedding,
|
127 |
-
repetition_penalty=5.0,
|
128 |
-
temperature=0.75,
|
129 |
-
)
|
130 |
-
|
131 |
-
output_path = f"c3po_test_{lang}.wav"
|
132 |
-
torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
133 |
-
print(f"C3PO {lang.upper()} speech saved as: {output_path}")
|
134 |
-
|
135 |
-
except Exception as e:
|
136 |
-
print(f"Error during speech generation: {e}")
|
137 |
-
import traceback
|
138 |
-
traceback.print_exc()
|
139 |
-
|
140 |
-
print("XTTS C3PO test completed!")
|
141 |
-
print("\nGenerated files:")
|
142 |
-
for file in os.listdir("."):
|
143 |
-
if file.startswith("c3po_test") and file.endswith(".wav"):
|
144 |
-
print(f" - {file}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_build.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Simple build test for C-3PO TTS API
|
4 |
+
Tests if all dependencies can be imported
|
5 |
+
"""
|
6 |
+
|
7 |
+
def test_imports():
|
8 |
+
"""Test if all required packages can be imported"""
|
9 |
+
print("🔍 Testing imports...")
|
10 |
+
|
11 |
+
try:
|
12 |
+
import fastapi
|
13 |
+
print("✅ FastAPI")
|
14 |
+
|
15 |
+
import uvicorn
|
16 |
+
print("✅ Uvicorn")
|
17 |
+
|
18 |
+
import torch
|
19 |
+
print("✅ PyTorch")
|
20 |
+
|
21 |
+
import torchaudio
|
22 |
+
print("✅ TorchAudio")
|
23 |
+
|
24 |
+
import TTS
|
25 |
+
print("✅ Coqui TTS")
|
26 |
+
|
27 |
+
import huggingface_hub
|
28 |
+
print("✅ Hugging Face Hub")
|
29 |
+
|
30 |
+
import pydantic
|
31 |
+
print("✅ Pydantic")
|
32 |
+
|
33 |
+
return True
|
34 |
+
|
35 |
+
except ImportError as e:
|
36 |
+
print(f"❌ Import failed: {e}")
|
37 |
+
return False
|
38 |
+
|
39 |
+
def test_api_creation():
|
40 |
+
"""Test if the API can be created without errors"""
|
41 |
+
print("\n🚀 Testing API creation...")
|
42 |
+
|
43 |
+
try:
|
44 |
+
from coqui_api import app
|
45 |
+
print("✅ API created successfully")
|
46 |
+
return True
|
47 |
+
except Exception as e:
|
48 |
+
print(f"❌ API creation failed: {e}")
|
49 |
+
return False
|
50 |
+
|
51 |
+
def main():
|
52 |
+
"""Run build tests"""
|
53 |
+
print("🧪 C-3PO TTS Build Test")
|
54 |
+
print("=" * 30)
|
55 |
+
|
56 |
+
import_ok = test_imports()
|
57 |
+
api_ok = test_api_creation()
|
58 |
+
|
59 |
+
print("\n" + "=" * 30)
|
60 |
+
|
61 |
+
if import_ok and api_ok:
|
62 |
+
print("🎉 All tests passed! Ready to deploy.")
|
63 |
+
return 0
|
64 |
+
else:
|
65 |
+
print("❌ Some tests failed. Check dependencies.")
|
66 |
+
return 1
|
67 |
+
|
68 |
+
if __name__ == "__main__":
|
69 |
+
exit(main())
|
test_coqui_api.py
DELETED
@@ -1,146 +0,0 @@
|
|
1 |
-
import requests
|
2 |
-
import os
|
3 |
-
import time
|
4 |
-
|
5 |
-
# API base URL (update this to your deployed Hugging Face Space URL)
|
6 |
-
BASE_URL = "http://localhost:7860" # Change to your HF Space URL when deployed
|
7 |
-
|
8 |
-
def test_health():
|
9 |
-
"""Test the health endpoint"""
|
10 |
-
print("🔍 Testing health endpoint...")
|
11 |
-
try:
|
12 |
-
response = requests.get(f"{BASE_URL}/health")
|
13 |
-
if response.status_code == 200:
|
14 |
-
print("✅ Health check passed!")
|
15 |
-
print(f"Response: {response.json()}")
|
16 |
-
else:
|
17 |
-
print(f"❌ Health check failed: {response.status_code}")
|
18 |
-
print(f"Response: {response.text}")
|
19 |
-
except Exception as e:
|
20 |
-
print(f"❌ Health check error: {e}")
|
21 |
-
|
22 |
-
def test_list_models():
|
23 |
-
"""Test the models endpoint"""
|
24 |
-
print("\n🔍 Testing models endpoint...")
|
25 |
-
try:
|
26 |
-
response = requests.get(f"{BASE_URL}/models")
|
27 |
-
if response.status_code == 200:
|
28 |
-
models = response.json()
|
29 |
-
print("✅ Models endpoint working!")
|
30 |
-
print(f"Found {len(models.get('models', []))} models")
|
31 |
-
# Show first 5 models
|
32 |
-
for i, model in enumerate(models.get('models', [])[:5]):
|
33 |
-
print(f" {i+1}. {model}")
|
34 |
-
else:
|
35 |
-
print(f"❌ Models endpoint failed: {response.status_code}")
|
36 |
-
except Exception as e:
|
37 |
-
print(f"❌ Models endpoint error: {e}")
|
38 |
-
|
39 |
-
def test_simple_tts():
|
40 |
-
"""Test simple text-to-speech without voice cloning"""
|
41 |
-
print("\n🔍 Testing simple TTS...")
|
42 |
-
try:
|
43 |
-
data = {
|
44 |
-
"text": "Hello world! This is a test of Coqui TTS.",
|
45 |
-
"language": "en"
|
46 |
-
}
|
47 |
-
|
48 |
-
response = requests.post(f"{BASE_URL}/tts", data=data)
|
49 |
-
|
50 |
-
if response.status_code == 200:
|
51 |
-
# Save the audio file
|
52 |
-
output_file = "simple_tts_output.wav"
|
53 |
-
with open(output_file, "wb") as f:
|
54 |
-
f.write(response.content)
|
55 |
-
print(f"✅ Simple TTS successful! Audio saved to: {output_file}")
|
56 |
-
print(f"File size: {len(response.content)} bytes")
|
57 |
-
else:
|
58 |
-
print(f"❌ Simple TTS failed: {response.status_code}")
|
59 |
-
print(f"Response: {response.text}")
|
60 |
-
except Exception as e:
|
61 |
-
print(f"❌ Simple TTS error: {e}")
|
62 |
-
|
63 |
-
def test_voice_cloning(speaker_file_path=None):
|
64 |
-
"""Test voice cloning with uploaded speaker file"""
|
65 |
-
if not speaker_file_path or not os.path.exists(speaker_file_path):
|
66 |
-
print("\n⚠️ Skipping voice cloning test - no speaker file provided")
|
67 |
-
print(" To test voice cloning, provide a .wav file path")
|
68 |
-
return
|
69 |
-
|
70 |
-
print(f"\n🔍 Testing voice cloning with: {speaker_file_path}")
|
71 |
-
try:
|
72 |
-
data = {
|
73 |
-
"text": "This is voice cloning using Coqui TTS. The voice should match the reference audio.",
|
74 |
-
"language": "en"
|
75 |
-
}
|
76 |
-
|
77 |
-
with open(speaker_file_path, "rb") as f:
|
78 |
-
files = {"speaker_file": f}
|
79 |
-
response = requests.post(f"{BASE_URL}/tts", data=data, files=files)
|
80 |
-
|
81 |
-
if response.status_code == 200:
|
82 |
-
# Save the cloned audio
|
83 |
-
output_file = "voice_cloned_output.wav"
|
84 |
-
with open(output_file, "wb") as f:
|
85 |
-
f.write(response.content)
|
86 |
-
print(f"✅ Voice cloning successful! Audio saved to: {output_file}")
|
87 |
-
print(f"File size: {len(response.content)} bytes")
|
88 |
-
else:
|
89 |
-
print(f"❌ Voice cloning failed: {response.status_code}")
|
90 |
-
print(f"Response: {response.text}")
|
91 |
-
except Exception as e:
|
92 |
-
print(f"❌ Voice cloning error: {e}")
|
93 |
-
|
94 |
-
def test_json_tts():
|
95 |
-
"""Test JSON endpoint"""
|
96 |
-
print("\n🔍 Testing JSON TTS endpoint...")
|
97 |
-
try:
|
98 |
-
import json
|
99 |
-
|
100 |
-
data = {
|
101 |
-
"text": "This is a JSON request test for Coqui TTS API.",
|
102 |
-
"language": "en"
|
103 |
-
}
|
104 |
-
|
105 |
-
response = requests.post(
|
106 |
-
f"{BASE_URL}/tts-json",
|
107 |
-
headers={"Content-Type": "application/json"},
|
108 |
-
data=json.dumps(data)
|
109 |
-
)
|
110 |
-
|
111 |
-
if response.status_code == 200:
|
112 |
-
output_file = "json_tts_output.wav"
|
113 |
-
with open(output_file, "wb") as f:
|
114 |
-
f.write(response.content)
|
115 |
-
print(f"✅ JSON TTS successful! Audio saved to: {output_file}")
|
116 |
-
print(f"File size: {len(response.content)} bytes")
|
117 |
-
else:
|
118 |
-
print(f"❌ JSON TTS failed: {response.status_code}")
|
119 |
-
print(f"Response: {response.text}")
|
120 |
-
except Exception as e:
|
121 |
-
print(f"❌ JSON TTS error: {e}")
|
122 |
-
|
123 |
-
def main():
|
124 |
-
print("🐸 Testing Coqui TTS API")
|
125 |
-
print("=" * 50)
|
126 |
-
|
127 |
-
# Test all endpoints
|
128 |
-
test_health()
|
129 |
-
test_list_models()
|
130 |
-
test_simple_tts()
|
131 |
-
test_json_tts()
|
132 |
-
|
133 |
-
# Test voice cloning if speaker file is available
|
134 |
-
# You can specify a speaker file path here
|
135 |
-
speaker_file = None # Change to your speaker file path
|
136 |
-
test_voice_cloning(speaker_file)
|
137 |
-
|
138 |
-
print("\n🎉 API testing completed!")
|
139 |
-
print("\nTo test voice cloning:")
|
140 |
-
print("1. Record a short audio sample (5-10 seconds)")
|
141 |
-
print("2. Save it as a .wav file")
|
142 |
-
print("3. Update speaker_file variable with the file path")
|
143 |
-
print("4. Run the test again")
|
144 |
-
|
145 |
-
if __name__ == "__main__":
|
146 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_coqui_tts.py
DELETED
@@ -1,99 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
from TTS.api import TTS
|
3 |
-
import os
|
4 |
-
|
5 |
-
def test_coqui_tts():
|
6 |
-
"""Test Coqui TTS functionality"""
|
7 |
-
|
8 |
-
# Get device
|
9 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
10 |
-
print(f"Using device: {device}")
|
11 |
-
|
12 |
-
try:
|
13 |
-
# List available 🐸TTS models
|
14 |
-
print("\n=== Available TTS Models ===")
|
15 |
-
tts_instance = TTS()
|
16 |
-
models = tts_instance.list_models()
|
17 |
-
|
18 |
-
# Print first 10 models to avoid overwhelming output
|
19 |
-
print("First 10 available models:")
|
20 |
-
for i, model in enumerate(models[:10]):
|
21 |
-
print(f"{i+1}. {model}")
|
22 |
-
|
23 |
-
if len(models) > 10:
|
24 |
-
print(f"... and {len(models) - 10} more models")
|
25 |
-
|
26 |
-
except Exception as e:
|
27 |
-
print(f"Error listing models: {e}")
|
28 |
-
return
|
29 |
-
|
30 |
-
try:
|
31 |
-
# Initialize TTS with XTTS v2 model
|
32 |
-
print("\n=== Initializing XTTS v2 Model ===")
|
33 |
-
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
|
34 |
-
print("XTTS v2 model loaded successfully!")
|
35 |
-
|
36 |
-
# List speakers if available
|
37 |
-
print("\n=== Available Speakers ===")
|
38 |
-
if hasattr(tts, 'speakers') and tts.speakers:
|
39 |
-
print("Available speakers:")
|
40 |
-
for speaker in tts.speakers[:10]: # Show first 10
|
41 |
-
print(f"- {speaker}")
|
42 |
-
if len(tts.speakers) > 10:
|
43 |
-
print(f"... and {len(tts.speakers) - 10} more speakers")
|
44 |
-
else:
|
45 |
-
print("No preset speakers available or speakers list is empty")
|
46 |
-
|
47 |
-
except Exception as e:
|
48 |
-
print(f"Error initializing XTTS v2 model: {e}")
|
49 |
-
print("This might be due to model download requirements or missing dependencies")
|
50 |
-
return
|
51 |
-
|
52 |
-
try:
|
53 |
-
# Test TTS to file with preset speaker (if available)
|
54 |
-
print("\n=== Testing TTS to File ===")
|
55 |
-
output_file = "test_output.wav"
|
56 |
-
|
57 |
-
# Check if we have speakers available
|
58 |
-
if hasattr(tts, 'speakers') and tts.speakers:
|
59 |
-
# Use first available speaker
|
60 |
-
speaker_name = tts.speakers[0]
|
61 |
-
print(f"Using speaker: {speaker_name}")
|
62 |
-
|
63 |
-
tts.tts_to_file(
|
64 |
-
text="Hello world! This is a test of Coqui TTS library.",
|
65 |
-
speaker=speaker_name,
|
66 |
-
language="en",
|
67 |
-
file_path=output_file
|
68 |
-
)
|
69 |
-
else:
|
70 |
-
# Try without speaker specification
|
71 |
-
print("No speakers available, trying without speaker specification...")
|
72 |
-
tts.tts_to_file(
|
73 |
-
text="Hello world! This is a test of Coqui TTS library.",
|
74 |
-
language="en",
|
75 |
-
file_path=output_file
|
76 |
-
)
|
77 |
-
|
78 |
-
if os.path.exists(output_file):
|
79 |
-
print(f"✅ TTS successful! Audio saved to: {output_file}")
|
80 |
-
file_size = os.path.getsize(output_file)
|
81 |
-
print(f"File size: {file_size} bytes")
|
82 |
-
else:
|
83 |
-
print("❌ TTS failed - output file not created")
|
84 |
-
|
85 |
-
except Exception as e:
|
86 |
-
print(f"Error during TTS generation: {e}")
|
87 |
-
|
88 |
-
# Note about voice cloning
|
89 |
-
print("\n=== Voice Cloning Information ===")
|
90 |
-
print("To test voice cloning, you would need:")
|
91 |
-
print("1. A reference audio file (speaker_wav parameter)")
|
92 |
-
print("2. Use tts.tts() method instead of tts_to_file()")
|
93 |
-
print("Example:")
|
94 |
-
print('wav = tts.tts(text="Hello!", speaker_wav="reference.wav", language="en")')
|
95 |
-
|
96 |
-
if __name__ == "__main__":
|
97 |
-
print("🐸 Testing Coqui TTS Library")
|
98 |
-
print("=" * 50)
|
99 |
-
test_coqui_tts()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_kokoro_install.py
DELETED
@@ -1,86 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
"""
|
3 |
-
Simple test script to verify Kokoro TTS installation and functionality.
|
4 |
-
"""
|
5 |
-
|
6 |
-
import os
|
7 |
-
|
8 |
-
# Set basic environment variables
|
9 |
-
os.environ['NUMBA_DISABLE_JIT'] = '1'
|
10 |
-
|
11 |
-
def test_kokoro_import():
|
12 |
-
"""Test if Kokoro can be imported"""
|
13 |
-
try:
|
14 |
-
from kokoro import KPipeline
|
15 |
-
import soundfile as sf
|
16 |
-
import torch
|
17 |
-
print("✅ All required packages imported successfully!")
|
18 |
-
return True
|
19 |
-
except ImportError as e:
|
20 |
-
print(f"❌ Import error: {e}")
|
21 |
-
return False
|
22 |
-
|
23 |
-
def test_kokoro_pipeline():
|
24 |
-
"""Test if Kokoro pipeline can be initialized"""
|
25 |
-
try:
|
26 |
-
from kokoro import KPipeline
|
27 |
-
pipeline = KPipeline(lang_code='a')
|
28 |
-
print("✅ Kokoro pipeline initialized successfully!")
|
29 |
-
return True
|
30 |
-
except Exception as e:
|
31 |
-
print(f"❌ Pipeline initialization error: {e}")
|
32 |
-
return False
|
33 |
-
|
34 |
-
def test_kokoro_generation():
|
35 |
-
"""Test if Kokoro can generate speech"""
|
36 |
-
try:
|
37 |
-
from kokoro import KPipeline
|
38 |
-
import soundfile as sf
|
39 |
-
|
40 |
-
pipeline = KPipeline(lang_code='a')
|
41 |
-
text = "Hello, this is a test of Kokoro TTS."
|
42 |
-
|
43 |
-
generator = pipeline(text, voice='af_heart')
|
44 |
-
|
45 |
-
for i, (gs, ps, audio) in enumerate(generator):
|
46 |
-
print(f"✅ Generated audio segment {i}: gs={gs}, ps={ps}")
|
47 |
-
# Save test audio
|
48 |
-
sf.write('test_kokoro.wav', audio, 24000)
|
49 |
-
print("✅ Test audio saved as 'test_kokoro.wav'")
|
50 |
-
break # Just test the first segment
|
51 |
-
|
52 |
-
return True
|
53 |
-
except Exception as e:
|
54 |
-
print(f"❌ Speech generation error: {e}")
|
55 |
-
return False
|
56 |
-
|
57 |
-
def main():
|
58 |
-
"""Run all tests"""
|
59 |
-
print("🎤 Testing Kokoro TTS Installation")
|
60 |
-
print("=" * 40)
|
61 |
-
|
62 |
-
tests = [
|
63 |
-
("Import Test", test_kokoro_import),
|
64 |
-
("Pipeline Test", test_kokoro_pipeline),
|
65 |
-
("Generation Test", test_kokoro_generation)
|
66 |
-
]
|
67 |
-
|
68 |
-
passed = 0
|
69 |
-
total = len(tests)
|
70 |
-
|
71 |
-
for test_name, test_func in tests:
|
72 |
-
print(f"\n🔍 Running {test_name}...")
|
73 |
-
if test_func():
|
74 |
-
passed += 1
|
75 |
-
else:
|
76 |
-
print(f"❌ {test_name} failed!")
|
77 |
-
|
78 |
-
print(f"\n📊 Results: {passed}/{total} tests passed")
|
79 |
-
|
80 |
-
if passed == total:
|
81 |
-
print("🎉 All tests passed! Kokoro TTS is ready to use.")
|
82 |
-
else:
|
83 |
-
print("⚠️ Some tests failed. Please check the installation.")
|
84 |
-
|
85 |
-
if __name__ == "__main__":
|
86 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|