Divax committed on
Commit
94fd4b0
·
1 Parent(s): 71905d8
Files changed (16) hide show
  1. Dockerfile +13 -19
  2. Dockerfile.coqui +0 -51
  3. README.md +64 -288
  4. README_coqui.md +0 -351
  5. app.py +0 -414
  6. app_config.py +0 -54
  7. client_example.py +0 -269
  8. requirements.txt +8 -13
  9. requirements_coqui.txt +0 -12
  10. start_c3po_api.py +17 -136
  11. startup.py +0 -120
  12. test.py +0 -144
  13. test_build.py +69 -0
  14. test_coqui_api.py +0 -146
  15. test_coqui_tts.py +0 -99
  16. test_kokoro_install.py +0 -86
Dockerfile CHANGED
@@ -1,13 +1,12 @@
1
- FROM python:3.11
2
 
3
  # Set up a new user named "user" with user ID 1000
4
  RUN useradd -m -u 1000 user
5
 
6
- # Install system dependencies as root
7
  RUN apt-get update && apt-get install -y \
8
  git \
9
  git-lfs \
10
- espeak-ng \
11
  ffmpeg \
12
  && rm -rf /var/lib/apt/lists/*
13
 
@@ -17,35 +16,30 @@ RUN git lfs install
17
  # Switch to the "user" user
18
  USER user
19
 
20
- # Set home to the user's home directory
21
  ENV HOME=/home/user \
22
  PATH=/home/user/.local/bin:$PATH \
23
  COQUI_TOS_AGREED=1 \
24
- NUMBA_DISABLE_JIT=1 \
25
- FORCE_CPU=true \
26
- CUDA_VISIBLE_DEVICES=""
27
 
28
- # Set the working directory to the user's home directory
29
  WORKDIR $HOME/app
30
 
31
- # Try and run pip command after setting the user with `USER user` to avoid permission issues with Python
32
  RUN pip install --no-cache-dir --upgrade pip
33
 
34
- # Copy requirements first and install dependencies
35
  COPY --chown=user requirements.txt .
36
  RUN pip install --no-cache-dir -r requirements.txt
37
 
38
- # Download unidic for mecab (required for some TTS features)
39
- RUN python -m unidic download
40
 
41
- # Clone the C3PO XTTS model
42
- RUN git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO XTTS-v2_C3PO
43
-
44
- # Copy the current directory contents into the container at $HOME/app setting the owner to the user
45
- COPY --chown=user . $HOME/app
46
 
47
  # Expose the port
48
  EXPOSE 7860
49
 
50
- # Start the API directly
51
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ FROM python:3.11-slim
2
 
3
  # Set up a new user named "user" with user ID 1000
4
  RUN useradd -m -u 1000 user
5
 
6
+ # Install only essential system dependencies
7
  RUN apt-get update && apt-get install -y \
8
  git \
9
  git-lfs \
 
10
  ffmpeg \
11
  && rm -rf /var/lib/apt/lists/*
12
 
 
16
  # Switch to the "user" user
17
  USER user
18
 
19
+ # Set environment variables
20
  ENV HOME=/home/user \
21
  PATH=/home/user/.local/bin:$PATH \
22
  COQUI_TOS_AGREED=1 \
23
+ HF_HUB_DISABLE_TELEMETRY=1
 
 
24
 
25
+ # Set the working directory
26
  WORKDIR $HOME/app
27
 
28
+ # Upgrade pip
29
  RUN pip install --no-cache-dir --upgrade pip
30
 
31
+ # Copy and install requirements
32
  COPY --chown=user requirements.txt .
33
  RUN pip install --no-cache-dir -r requirements.txt
34
 
35
+ # Pre-download the C-3PO model to speed up startup
36
+ RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='Borcherding/XTTS-v2_C3PO', local_dir='./models/XTTS-v2_C3PO', local_dir_use_symlinks=False)"
37
 
38
+ # Copy the API file
39
+ COPY --chown=user coqui_api.py .
 
 
 
40
 
41
  # Expose the port
42
  EXPOSE 7860
43
 
44
+ # Start the C-3PO TTS API
45
+ CMD ["uvicorn", "coqui_api:app", "--host", "0.0.0.0", "--port", "7860"]
Dockerfile.coqui DELETED
@@ -1,51 +0,0 @@
1
- FROM python:3.11
2
-
3
- # Set up a new user named "user" with user ID 1000
4
- RUN useradd -m -u 1000 user
5
-
6
- # Install system dependencies as root
7
- RUN apt-get update && apt-get install -y \
8
- git \
9
- git-lfs \
10
- espeak-ng \
11
- ffmpeg \
12
- libsndfile1 \
13
- && rm -rf /var/lib/apt/lists/*
14
-
15
- # Initialize git lfs
16
- RUN git lfs install
17
-
18
- # Switch to the "user" user
19
- USER user
20
-
21
- # Set home to the user's home directory
22
- ENV HOME=/home/user \
23
- PATH=/home/user/.local/bin:$PATH \
24
- COQUI_TOS_AGREED=1 \
25
- HF_HUB_DISABLE_TELEMETRY=1 \
26
- HF_HOME=/home/user/.cache/huggingface
27
-
28
- # Set the working directory to the user's home directory
29
- WORKDIR $HOME/app
30
-
31
- # Upgrade pip
32
- RUN pip install --no-cache-dir --upgrade pip
33
-
34
- # Install PyTorch with CPU support for Hugging Face Spaces
35
- RUN pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu
36
-
37
- # Copy requirements and install dependencies
38
- COPY --chown=user requirements.txt .
39
- RUN pip install --no-cache-dir -r requirements.txt
40
-
41
- # Copy the API file
42
- COPY --chown=user coqui_api.py .
43
-
44
- # Create necessary directories
45
- RUN mkdir -p $HOME/.cache $HOME/app/models
46
-
47
- # Expose the port
48
- EXPOSE 7860
49
-
50
- # Start the Coqui TTS API
51
- CMD ["uvicorn", "coqui_api:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,334 +1,110 @@
1
- ---
2
- title: XTTS C3PO Voice Cloning API
3
- emoji: 🤖
4
- colorFrom: indigo
5
- colorTo: yellow
6
- sdk: docker
7
- pinned: false
8
- ---
9
 
10
- # XTTS C3PO Voice Cloning API
11
 
12
- A FastAPI-based Text-to-Speech API using XTTS-v2 with the iconic C3PO voice from Star Wars.
13
 
14
- ## Features
 
 
 
 
15
 
16
- - **C3PO Voice**: Pre-loaded with the iconic C3PO voice from Star Wars
17
- - **Custom Voice Cloning**: Upload your own reference audio for voice cloning
18
- - **Multilingual Support**: 16+ languages with C3PO voice
19
- - **No Upload Required**: Use C3PO voice without any file uploads
20
- - **RESTful API**: Clean API with automatic documentation
21
- - **Docker Support**: Optimized for Hugging Face Spaces deployment
22
- - **PyTorch 2.6 Compatible**: Includes compatibility fixes
23
-
24
- ## About the C3PO Model
25
-
26
- This API uses the XTTS-v2 C3PO model from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO), which provides the iconic voice of C-3PO from Star Wars. The model supports:
27
-
28
- - High-quality C3PO voice synthesis
29
- - Multilingual C3PO speech (16+ languages)
30
- - Custom voice cloning capabilities
31
- - Real-time speech generation
32
-
33
- ## Quick Start
34
-
35
- ### Using C3PO Voice (No Upload Required)
36
 
 
37
  ```bash
38
- curl -X POST "http://localhost:7860/tts-c3po" \
39
- -F "text=Hello there! I am C-3PO, human-cyborg relations." \
40
- -F "language=en" \
41
- --output c3po_speech.wav
42
- ```
43
 
44
- ### Using Custom Voice Cloning
45
-
46
- ```bash
47
- curl -X POST "http://localhost:7860/tts" \
48
- -F "text=This will be spoken in your custom voice!" \
49
- -F "language=en" \
50
- -F "speaker_file=@your_reference_voice.wav" \
51
- --output custom_speech.wav
52
  ```
53
 
54
- ## API Endpoints
55
-
56
- ### C3PO Voice Only
57
- - **POST** `/tts-c3po` - Generate speech using C3PO voice (no file upload needed)
58
- - **Parameters:**
59
- - `text` (form): Text to convert to speech (max 500 characters)
60
- - `language` (form): Language code (default: "en")
61
- - `no_lang_auto_detect` (form): Disable automatic language detection
62
-
63
- ### Voice Cloning with Fallback
64
- - **POST** `/tts` - Convert text to speech with optional custom voice
65
- - **Parameters:**
66
- - `text` (form): Text to convert to speech (max 500 characters)
67
- - `language` (form): Language code (default: "en")
68
- - `voice_cleanup` (form): Apply audio cleanup to reference voice
69
- - `no_lang_auto_detect` (form): Disable automatic language detection
70
- - `speaker_file` (file, optional): Reference speaker audio file (uses C3PO if not provided)
71
-
72
- ### JSON API
73
- - **POST** `/tts-json` - Convert text to speech using JSON request body
74
- - **Body:** JSON object with `text`, `language`, `voice_cleanup`, `no_lang_auto_detect`
75
- - **File:** `speaker_file` (optional) - Reference speaker audio file
76
-
77
- ### Information Endpoints
78
- - **GET** `/health` - Check API status, device info, and supported languages
79
- - **GET** `/languages` - Get list of supported languages
80
- - **GET** `/docs` - Interactive API documentation (Swagger UI)
81
-
82
- ## Usage Examples
83
-
84
- ### Python - C3PO Voice
85
-
86
- ```python
87
- import requests
88
-
89
- # Generate C3PO speech
90
- url = "http://localhost:7860/tts-c3po"
91
- data = {
92
- "text": "Hello there! I am C-3PO, human-cyborg relations.",
93
- "language": "en"
94
- }
95
-
96
- response = requests.post(url, data=data)
97
-
98
- if response.status_code == 200:
99
- with open("c3po_speech.wav", "wb") as f:
100
- f.write(response.content)
101
- print("C3PO speech generated!")
102
- ```
103
-
104
- ### Python - Custom Voice with C3PO Fallback
105
-
106
- ```python
107
- import requests
108
-
109
- url = "http://localhost:7860/tts"
110
- data = {
111
- "text": "This will use C3PO voice if no speaker file is provided.",
112
- "language": "en"
113
- }
114
-
115
- # No speaker_file provided - will use C3PO voice
116
- response = requests.post(url, data=data)
117
-
118
- if response.status_code == 200:
119
- with open("speech_output.wav", "wb") as f:
120
- f.write(response.content)
121
- ```
122
-
123
- ### Multilingual C3PO
124
-
125
- ```python
126
- # C3PO speaking Spanish
127
- data = {
128
- "text": "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación.",
129
- "language": "es"
130
- }
131
- response = requests.post("http://localhost:7860/tts-c3po", data=data)
132
- ```
133
-
134
- ## Supported Languages
135
-
136
- The C3PO model supports all XTTS-v2 languages:
137
-
138
- - **en** - English
139
- - **es** - Spanish
140
- - **fr** - French
141
- - **de** - German
142
- - **it** - Italian
143
- - **pt** - Portuguese (Brazilian)
144
- - **pl** - Polish
145
- - **tr** - Turkish
146
- - **ru** - Russian
147
- - **nl** - Dutch
148
- - **cs** - Czech
149
- - **ar** - Arabic
150
- - **zh-cn** - Mandarin Chinese
151
- - **ja** - Japanese
152
- - **ko** - Korean
153
- - **hu** - Hungarian
154
- - **hi** - Hindi
155
-
156
- ## Setup
157
-
158
- ### CPU-Only Installation (Recommended for most users)
159
-
160
- For CPU-only usage (no GPU required):
161
- ```bash
162
- # Ubuntu/Debian
163
- sudo apt-get install espeak-ng ffmpeg git git-lfs
164
-
165
- # macOS
166
- brew install espeak ffmpeg git git-lfs
167
- ```
168
-
169
- 2. **Install CPU-only PyTorch and dependencies:**
170
  ```bash
171
- # Option 1: Use the provided script
172
- chmod +x install_cpu.sh
173
- ./install_cpu.sh
174
-
175
- # Option 2: Manual installation
176
- pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
177
  pip install -r requirements.txt
178
- python -m unidic download
179
- ```
180
 
181
- 3. **Set CPU-only environment variables:**
182
- ```bash
183
- export FORCE_CPU=true
184
- export CUDA_VISIBLE_DEVICES=""
185
  ```
186
 
187
- 4. **Run the API:**
188
- ```bash
189
- uvicorn app:app --host 0.0.0.0 --port 7860
190
- ```
191
 
192
- ### Hugging Face Spaces Deployment
193
 
194
- This API is optimized for Hugging Face Spaces with:
195
- - Automatic C3PO model downloading
196
- - Proper user permissions (user ID 1000)
197
- - PyTorch 2.6 compatibility fixes
198
- - COQUI license agreement handling
199
-
200
- ### Local Development
201
-
202
- 1. **Install system dependencies:**
203
  ```bash
204
- # Ubuntu/Debian
205
- sudo apt-get install espeak-ng ffmpeg git git-lfs
206
-
207
- # macOS
208
- brew install espeak ffmpeg git git-lfs
209
  ```
210
 
211
- 2. **Install Python dependencies:**
212
  ```bash
213
- pip install -r requirements.txt
214
- python -m unidic download
 
 
215
  ```
216
 
217
- 3. **Clone C3PO model (optional - auto-downloaded on first run):**
218
  ```bash
219
- git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO XTTS-v2_C3PO
 
 
 
220
  ```
221
 
222
- 4. **Run the API:**
223
  ```bash
224
- uvicorn app:app --host 0.0.0.0 --port 7860
225
  ```
226
 
227
- ### Using Docker
228
 
229
- ```bash
230
- # Build and run
231
- docker build -t xtts-c3po-api .
232
- docker run -p 7860:7860 xtts-c3po-api
233
- ```
234
 
235
- ## Reference Audio Guidelines
236
 
237
- For custom voice cloning:
238
 
239
- 1. **Duration**: 3-10 seconds of clear speech
240
- 2. **Quality**: High-quality audio, minimal background noise
241
- 3. **Format**: WAV format recommended (MP3, M4A also supported)
242
- 4. **Content**: Natural speech, avoid music or effects
243
- 5. **Speaker**: Single speaker, clear pronunciation
244
 
245
- ## Model Information
246
 
247
- - **Base Model**: XTTS-v2
248
- - **Voice**: C3PO from Star Wars
249
- - **Source**: [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO)
250
- - **Languages**: 16+ supported
251
- - **License**: CPML (Coqui Public Model License)
252
 
253
- ## Testing
254
 
255
- Run the test suite:
256
  ```bash
257
- # Test C3PO model functionality
258
- python test.py
259
-
260
- # Test API endpoints
261
- python client_example.py
262
- ```
263
-
264
- ## Environment Variables
265
-
266
- Automatically configured:
267
- - `COQUI_TOS_AGREED=1` - Agrees to CPML license
268
- - `NUMBA_DISABLE_JIT=1` - Disables Numba JIT compilation
269
-
270
- ## API Response Examples
271
-
272
- ### Health Check Response
273
- ```json
274
- {
275
- "status": "healthy",
276
- "device": "cuda",
277
- "model": "XTTS-v2 C3PO",
278
- "default_voice": "C3PO",
279
- "supported_languages": ["en", "es", "fr", ...]
280
- }
281
- ```
282
-
283
- ### Languages Response
284
- ```json
285
- {
286
- "languages": ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu", "hi"]
287
- }
288
  ```
289
 
290
- ## Troubleshooting
291
-
292
- ### CPU Performance
293
- When running on CPU:
294
- - Speech generation will be slower than GPU (30-60 seconds vs 3-5 seconds)
295
- - Memory usage is lower (2-4GB RAM vs 4-8GB VRAM)
296
- - No CUDA installation required
297
- - Works on any system with sufficient RAM
298
-
299
- ### PyTorch Loading Issues
300
- The API includes fixes for PyTorch 2.6's `weights_only=True` default. If you encounter loading issues, ensure the compatibility fix is applied.
301
-
302
- ### Model Download Issues
303
- If the C3PO model fails to download:
304
- 1. Check internet connection
305
- 2. Verify git and git-lfs are installed
306
- 3. Manually clone: `git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO XTTS-v2_C3PO`
307
-
308
- ### Audio Quality Issues
309
- - Use high-quality reference audio for custom voices
310
- - Enable `voice_cleanup` for noisy reference audio
311
- - Ensure reference audio is 3-10 seconds long
312
 
313
- ### Memory Issues
314
- - **CPU Mode**: Requires 2-4GB RAM, works on most modern computers
315
- - **GPU Mode**: Requires 4GB+ VRAM for optimal performance
316
- - Reduce text length for batch processing
317
- - Use CPU mode with `FORCE_CPU=true` environment variable
318
 
319
- ### CPU-Only Installation Issues
320
- If you encounter GPU-related errors:
321
- 1. Set environment variables: `export FORCE_CPU=true CUDA_VISIBLE_DEVICES=""`
322
- 2. Install CPU-only PyTorch: `pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu`
323
- 3. Restart the API after setting environment variables
324
 
325
- ## License
326
 
327
- This project uses XTTS-v2 which is licensed under the Coqui Public Model License (CPML). The C3PO model is provided by the community. See https://coqui.ai/cpml for license details.
 
 
 
 
328
 
329
- ## Credits
330
 
331
- - **XTTS-v2**: Coqui AI
332
- - **C3PO Model**: [Borcherding](https://huggingface.co/Borcherding)
333
- - **Original Character**: C-3PO from Star Wars (Lucasfilm/Disney)
334
 
 
1
+ # 🤖 C-3PO TTS API
 
 
 
 
 
 
 
2
 
3
+ A FastAPI-based text-to-speech service using the **C-3PO fine-tuned XTTS v2 model** from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO) for authentic C-3PO voice synthesis.
4
 
5
+ ## Features
6
 
7
+ - 🤖 **Authentic C-3PO Voice**: XTTS v2 model fine-tuned on 20 unique C-3PO voice lines
8
+ - 🌍 **17+ Languages**: Multilingual support while maintaining C-3PO characteristics
9
+ - 🎭 **Voice Cloning**: Optional custom voice cloning capabilities
10
+ - 🚀 **FastAPI**: Modern API with automatic documentation
11
+ - 🐳 **Docker Ready**: Containerized for easy deployment
12
 
13
+ ## 🚀 Quick Start
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
+ ### Docker Deployment
16
  ```bash
17
+ # Build the container
18
+ docker build -t c3po-tts .
 
 
 
19
 
20
+ # Run the container
21
+ docker run -p 7860:7860 c3po-tts
 
 
 
 
 
 
22
  ```
23
 
24
+ ### Local Development
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  ```bash
26
+ # Install dependencies
 
 
 
 
 
27
  pip install -r requirements.txt
 
 
28
 
29
+ # Run the API
30
+ python coqui_api.py
 
 
31
  ```
32
 
33
+ The API will be available at `http://localhost:7860`
 
 
 
34
 
35
+ ## 📡 API Endpoints
36
 
37
+ ### C-3PO Text-to-Speech
 
 
 
 
 
 
 
 
38
  ```bash
39
+ curl -X POST "http://localhost:7860/tts-c3po" \
40
+ -F "text=I am C-3PO, human-cyborg relations." \
41
+ -F "language=en" \
42
+ --output c3po_voice.wav
 
43
  ```
44
 
45
+ ### General Text-to-Speech (with C-3PO voice by default)
46
  ```bash
47
+ curl -X POST "http://localhost:7860/tts" \
48
+ -F "text=The odds of successfully navigating an asteroid field are approximately 3,720 to 1." \
49
+ -F "language=en" \
50
+ --output c3po_output.wav
51
  ```
52
 
53
+ ### JSON API
54
  ```bash
55
+ curl -X POST "http://localhost:7860/tts-json" \
56
+ -H "Content-Type: application/json" \
57
+ -d '{"text": "R2-D2, you know better than to trust a strange computer!", "language": "en"}' \
58
+ --output c3po_json.wav
59
  ```
60
 
61
+ ### Health Check
62
  ```bash
63
+ curl http://localhost:7860/health
64
  ```
65
 
66
+ ## 🌍 Supported Languages
67
 
68
+ English, Spanish, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese, Hungarian, Korean, Hindi
 
 
 
 
69
 
70
+ ## 🎨 Example C-3PO Phrases
71
 
72
+ Perfect texts for demonstrating C-3PO's voice:
73
 
74
+ - "I am C-3PO, human-cyborg relations."
75
+ - "The odds of successfully navigating an asteroid field are approximately 3,720 to 1."
76
+ - "R2-D2, you know better than to trust a strange computer!"
77
+ - "Oh my! How interesting!"
 
78
 
79
+ ## 📖 API Documentation
80
 
81
+ Visit `http://localhost:7860/docs` for interactive API documentation.
 
 
 
 
82
 
83
+ ## 🧪 Testing
84
 
 
85
  ```bash
86
+ # Run the C-3PO test suite
87
+ python test_c3po_model.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  ```
89
 
90
+ ## 🔧 Configuration
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ The API automatically downloads the C-3PO model on first run. Environment variables:
 
 
 
 
93
 
94
+ - `COQUI_TOS_AGREED=1`: Accepts Coqui TTS terms
95
+ - `HF_HUB_DISABLE_TELEMETRY=1`: Disables telemetry
 
 
 
96
 
97
+ ## 📦 Files
98
 
99
+ - `coqui_api.py`: Main C-3PO TTS API
100
+ - `test_c3po_model.py`: Test suite for C-3PO functionality
101
+ - `start_c3po_api.py`: Startup script with dependency checks
102
+ - `Dockerfile`: Container configuration
103
+ - `requirements.txt`: Python dependencies
104
 
105
+ ## 🎭 Credits
106
 
107
+ - [C-3PO Fine-tuned Model](https://huggingface.co/Borcherding/XTTS-v2_C3PO) by Borcherding
108
+ - [Coqui TTS](https://github.com/coqui-ai/TTS) - The underlying TTS engine
109
+ - [FastAPI](https://fastapi.tiangolo.com/) - Web framework
110
 
README_coqui.md DELETED
@@ -1,351 +0,0 @@
1
- # 🤖 Coqui TTS C-3PO API for Hugging Face Spaces
2
-
3
- A FastAPI-based text-to-speech service using the Coqui TTS library with the **C-3PO fine-tuned XTTS v2 model** from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO) for authentic C-3PO voice synthesis.
4
-
5
- ## ✨ Features
6
-
7
- - 🤖 **C-3PO Voice**: Authentic C-3PO voice using fine-tuned XTTS v2 model
8
- - 🎯 **Text-to-Speech**: Convert text to natural-sounding speech
9
- - 🎭 **Voice Cloning**: Clone any voice from a reference audio sample
10
- - 🌍 **Multilingual**: Support for 17+ languages with C-3PO voice characteristics
11
- - 🚀 **FastAPI**: Modern, fast API with automatic documentation
12
- - 🐳 **Docker Ready**: Containerized for easy deployment
13
- - ☁️ **Hugging Face Spaces**: Optimized for HF Spaces deployment
14
-
15
- ## 🎭 C-3PO Model Information
16
-
17
- This API uses the fine-tuned C-3PO voice model from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO), which features:
18
-
19
- - **Fine-tuned on 20 unique C-3PO voice lines** from Star Wars
20
- - **Multi-lingual support** (17 languages) while maintaining C-3PO's distinctive voice
21
- - **Emotion & Style Transfer** capturing C-3PO's formal, protocol droid characteristics
22
- - **High-Quality Audio** output at 24kHz sampling rate
23
-
24
- ## 📡 API Endpoints
25
-
26
- ### 1. Health Check
27
- ```bash
28
- GET /health
29
- ```
30
- Returns API status, model information, and C-3PO voice availability.
31
-
32
- ### 2. List Models
33
- ```bash
34
- GET /models
35
- ```
36
- Returns available TTS models.
37
-
38
- ### 3. C-3PO Text-to-Speech (Dedicated)
39
- ```bash
40
- POST /tts-c3po
41
- ```
42
- **Parameters:**
43
- - `text` (string): Text to convert to C-3PO voice (2-500 characters)
44
- - `language` (string): Language code (default: "en")
45
-
46
- **Example using curl:**
47
- ```bash
48
- curl -X POST "http://localhost:7860/tts-c3po" \
49
- -F "text=I am C-3PO, human-cyborg relations." \
50
- -F "language=en" \
51
- --output c3po_voice.wav
52
- ```
53
-
54
- ### 4. General Text-to-Speech
55
- ```bash
56
- POST /tts
57
- ```
58
- **Parameters:**
59
- - `text` (string): Text to convert to speech (2-500 characters)
60
- - `language` (string): Language code (default: "en")
61
- - `speaker_file` (file, optional): Reference audio for voice cloning
62
- - `use_c3po_voice` (boolean): Use C-3PO voice if no speaker file provided (default: true)
63
-
64
- **Example using curl:**
65
- ```bash
66
- # C-3PO voice (default)
67
- curl -X POST "http://localhost:7860/tts" \
68
- -F "text=The odds of successfully navigating an asteroid field are approximately 3,720 to 1." \
69
- -F "language=en" \
70
- --output c3po_output.wav
71
-
72
- # Custom voice cloning
73
- curl -X POST "http://localhost:7860/tts" \
74
- -F "text=This will sound like the reference voice." \
75
- -F "language=en" \
76
- -F "speaker_file=@reference_voice.wav" \
77
- -F "use_c3po_voice=false" \
78
- --output cloned_voice.wav
79
- ```
80
-
81
- ### 5. JSON TTS (C-3PO Voice)
82
- ```bash
83
- POST /tts-json
84
- ```
85
- **JSON Body:**
86
- ```json
87
- {
88
- "text": "R2-D2, you know better than to trust a strange computer!",
89
- "language": "en"
90
- }
91
- ```
92
-
93
- ## 🚀 Deployment on Hugging Face Spaces
94
-
95
- ### Step 1: Create a new Space
96
- 1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
97
- 2. Click "Create new Space"
98
- 3. Choose "Docker" as the SDK
99
- 4. Set your space name and visibility
100
-
101
- ### Step 2: Add files to your Space
102
- Upload these files to your Hugging Face Space repository:
103
-
104
- ```
105
- your-space/
106
- ├── coqui_api.py # Main API file with C-3PO integration
107
- ├── requirements.txt # Dependencies (includes huggingface_hub)
108
- ├── Dockerfile.coqui # Docker configuration
109
- ├── test_c3po_model.py # Test script for C-3PO functionality
110
- └── README.md # This file
111
- ```
112
-
113
- ### Step 3: Configure your Space
114
- Rename the files in your Space:
115
- - `Dockerfile.coqui` → `Dockerfile`
116
-
117
- ### Step 4: Deploy
118
- Your Space will automatically build and deploy. The build process may take 15-20 minutes as it downloads the C-3PO fine-tuned model from Hugging Face.
119
-
120
- ## 💻 Local Development
121
-
122
- ### Requirements
123
- - Python 3.11+
124
- - PyTorch
125
- - Coqui TTS library
126
- - Hugging Face Hub
127
-
128
- ### Installation
129
- ```bash
130
- # Clone the repository
131
- git clone <your-repo>
132
- cd <your-repo>
133
-
134
- # Install dependencies
135
- pip install -r requirements.txt
136
-
137
- # Run the API
138
- python coqui_api.py
139
- ```
140
-
141
- The API will be available at `http://localhost:7860`
142
-
143
- ### Testing
144
- ```bash
145
- # Run the C-3PO model test suite
146
- python test_c3po_model.py
147
-
148
- # Run the general test client
149
- python test_coqui_api.py
150
- ```
151
-
152
- ## 🎪 Usage Examples
153
-
154
- ### Python Client - C-3PO Voice
155
- ```python
156
- import requests
157
-
158
- # C-3PO voice synthesis
159
- data = {"text": "I am C-3PO, human-cyborg relations.", "language": "en"}
160
- response = requests.post("http://localhost:7860/tts-c3po", data=data)
161
-
162
- with open("c3po_output.wav", "wb") as f:
163
- f.write(response.content)
164
-
165
- # JSON API
166
- import json
167
- headers = {'Content-Type': 'application/json'}
168
- data = {"text": "The odds are approximately 3,720 to 1!", "language": "en"}
169
- response = requests.post("http://localhost:7860/tts-json", json=data, headers=headers)
170
-
171
- with open("c3po_json.wav", "wb") as f:
172
- f.write(response.content)
173
- ```
174
-
175
- ### JavaScript/Web - C-3PO Voice
176
- ```javascript
177
- // C-3PO voice synthesis
178
- const formData = new FormData();
179
- formData.append('text', 'Oh my! How interesting!');
180
- formData.append('language', 'en');
181
-
182
- fetch('http://localhost:7860/tts-c3po', {
183
- method: 'POST',
184
- body: formData
185
- })
186
- .then(response => response.blob())
187
- .then(blob => {
188
- const url = URL.createObjectURL(blob);
189
- const audio = new Audio(url);
190
- audio.play();
191
- });
192
-
193
- // JSON API
194
- fetch('http://localhost:7860/tts-json', {
195
- method: 'POST',
196
- headers: {'Content-Type': 'application/json'},
197
- body: JSON.stringify({
198
- text: 'R2-D2, you know better than to trust a strange computer!',
199
- language: 'en'
200
- })
201
- })
202
- .then(response => response.blob())
203
- .then(blob => {
204
- const url = URL.createObjectURL(blob);
205
- const audio = new Audio(url);
206
- audio.play();
207
- });
208
- ```
209
-
210
- ## 🎨 C-3PO Voice Examples
211
-
212
- Perfect texts for demonstrating C-3PO's voice characteristics:
213
-
214
- ```bash
215
- # Classic C-3PO phrases
216
- curl -X POST "http://localhost:7860/tts-c3po" \
217
- -F "text=I am C-3PO, human-cyborg relations." \
218
- -F "language=en" --output c3po_intro.wav
219
-
220
- curl -X POST "http://localhost:7860/tts-c3po" \
221
- -F "text=The odds of successfully navigating an asteroid field are approximately 3,720 to 1." \
222
- -F "language=en" --output c3po_odds.wav
223
-
224
- curl -X POST "http://localhost:7860/tts-c3po" \
225
- -F "text=R2-D2, you know better than to trust a strange computer!" \
226
- -F "language=en" --output c3po_r2d2.wav
227
-
228
- curl -X POST "http://localhost:7860/tts-c3po" \
229
- -F "text=Oh my! How interesting!" \
230
- -F "language=en" --output c3po_oh_my.wav
231
- ```
232
-
233
- ## 🌍 Multilingual C-3PO Support
234
-
235
- The C-3PO model maintains its distinctive voice characteristics across multiple languages:
236
-
237
- ```python
238
- # Multilingual examples
239
- languages = [
240
- ("Hello, I am C-3PO", "en"),
241
- ("Hola, soy C-3PO", "es"),
242
- ("Bonjour, je suis C-3PO", "fr"),
243
- ("Guten Tag, ich bin C-3PO", "de"),
244
- ("Ciao, sono C-3PO", "it"),
245
- ("Olá, eu sou C-3PO", "pt")
246
- ]
247
-
248
- for text, lang in languages:
249
- response = requests.post("http://localhost:7860/tts-c3po",
250
- data={"text": text, "language": lang})
251
- with open(f"c3po_{lang}.wav", "wb") as f:
252
- f.write(response.content)
253
- ```
254
-
255
- ## 🔧 Voice Cloning Guide
256
-
257
- 1. **Prepare Reference Audio:**
258
- - Duration: 5-10 seconds (optimal)
259
- - Format: WAV, MP3, or M4A
260
- - Quality: Clear speech, minimal background noise
261
- - Content: Natural speaking, preferably in target language
262
-
263
- 2. **API Request:**
264
- ```bash
265
- curl -X POST "http://your-space.hf.space/tts" \
266
- -F "text=Your text to synthesize" \
267
- -F "language=en" \
268
- -F "speaker_file=@your_reference.wav" \
269
- --output result.wav
270
- ```
271
-
272
- 3. **Tips for Best Results:**
273
- - Use high-quality reference audio
274
- - Match the language of reference and target text
275
- - Keep text length reasonable (under 500 characters)
276
- - Experiment with different reference samples
277
-
278
- ## Supported Languages
279
-
280
- The XTTS v2 model supports multiple languages including:
281
- - English (en)
282
- - Spanish (es)
283
- - French (fr)
284
- - German (de)
285
- - Italian (it)
286
- - Portuguese (pt)
287
- - Polish (pl)
288
- - Turkish (tr)
289
- - Russian (ru)
290
- - Dutch (nl)
291
- - Czech (cs)
292
- - Arabic (ar)
293
- - Chinese (zh-cn)
294
- - Japanese (ja)
295
- - Hungarian (hu)
296
- - Korean (ko)
297
-
298
- ## Troubleshooting
299
-
300
- ### Common Issues
301
-
302
- 1. **Model Download Errors:**
303
- - The first run downloads ~1.7GB model files
304
- - Ensure stable internet connection
305
- - Check Hugging Face Spaces logs
306
-
307
- 2. **Audio Quality Issues:**
308
- - Use high-quality reference audio for voice cloning
309
- - Ensure reference audio matches target language
310
- - Try different reference samples
311
-
312
- 3. **Memory Issues on HF Spaces:**
313
- - The model requires significant memory
314
- - Consider upgrading to a higher-tier Space if needed
315
-
316
- 4. **API Timeouts:**
317
- - Initial model loading takes time
318
- - Subsequent requests are faster
319
- - Consider warming up the model with a test request
320
-
321
- ### Environment Variables
322
-
323
- - `COQUI_TOS_AGREED=1`: Accepts Coqui TTS terms of service
324
- - `HF_HUB_DISABLE_TELEMETRY=1`: Disables telemetry
325
- - `TORCH_HOME`: PyTorch cache directory
326
-
327
- ## API Documentation
328
-
329
- Once deployed, visit your Space URL and add `/docs` to access the interactive API documentation:
330
- ```
331
- https://your-username-your-space-name.hf.space/docs
332
- ```
333
-
334
- ## Contributing
335
-
336
- 1. Fork the repository
337
- 2. Create a feature branch
338
- 3. Make your changes
339
- 4. Test thoroughly
340
- 5. Submit a pull request
341
-
342
- ## License
343
-
344
- This project uses the Coqui TTS library. Please check [Coqui TTS license](https://github.com/coqui-ai/TTS) for usage terms.
345
-
346
- ## Credits
347
-
348
- - [Coqui TTS](https://github.com/coqui-ai/TTS) - The underlying TTS engine
349
- - [XTTS v2](https://arxiv.org/abs/2309.11321) - The voice cloning model
350
- - [FastAPI](https://fastapi.tiangolo.com/) - Web framework
351
- - [Hugging Face Spaces](https://huggingface.co/spaces) - Deployment platform
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py DELETED
@@ -1,414 +0,0 @@
1
- # Import configuration first to setup environment
2
- import app_config
3
-
4
- import os
5
- import sys
6
- import io
7
- import subprocess
8
- import uuid
9
- import time
10
- import torch
11
- import torchaudio
12
- import tempfile
13
- import logging
14
- from typing import Optional
15
-
16
- # Fix PyTorch weights_only issue for XTTS
17
- import torch.serialization
18
- from TTS.tts.configs.xtts_config import XttsConfig
19
- torch.serialization.add_safe_globals([XttsConfig])
20
-
21
- # Set environment variables
22
- os.environ["COQUI_TOS_AGREED"] = "1"
23
- os.environ["NUMBA_DISABLE_JIT"] = "1"
24
-
25
- # Force CPU usage if specified
26
- if os.environ.get("FORCE_CPU", "false").lower() == "true":
27
- os.environ["CUDA_VISIBLE_DEVICES"] = ""
28
-
29
- from fastapi import FastAPI, HTTPException, UploadFile, File, Form
30
- from fastapi.responses import FileResponse
31
- from pydantic import BaseModel
32
- import langid
33
- from scipy.io.wavfile import write
34
- from pydub import AudioSegment
35
-
36
- from TTS.api import TTS
37
- from TTS.tts.configs.xtts_config import XttsConfig
38
- from TTS.tts.models.xtts import Xtts
39
- from TTS.utils.generic_utils import get_user_data_dir
40
-
41
- # Configure logging
42
- logging.basicConfig(level=logging.INFO)
43
- logger = logging.getLogger(__name__)
44
-
45
- app = FastAPI(title="XTTS C3PO API", description="Text-to-Speech API using XTTS-v2 C3PO model", version="1.0.0")
46
-
47
- class TTSRequest(BaseModel):
48
- text: str
49
- language: str = "en"
50
- voice_cleanup: bool = False
51
- no_lang_auto_detect: bool = False
52
-
53
- class XTTSService:
54
- def __init__(self):
55
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
56
- logger.info(f"Using device: {self.device}")
57
-
58
- # Use the C3PO model path
59
- self.model_path = "XTTS-v2_C3PO/"
60
- self.config_path = "XTTS-v2_C3PO/config.json"
61
-
62
- # Check if model files exist, if not download them
63
- if not os.path.exists(self.config_path):
64
- logger.info("C3PO model not found locally, downloading...")
65
- self._download_c3po_model()
66
-
67
- # Load configuration
68
- config = XttsConfig()
69
- config.load_json(self.config_path)
70
-
71
- # Initialize and load model
72
- self.model = Xtts.init_from_config(config)
73
- self.model.load_checkpoint(
74
- config,
75
- checkpoint_path=os.path.join(self.model_path, "model.pth"),
76
- vocab_path=os.path.join(self.model_path, "vocab.json"),
77
- eval=True,
78
- )
79
-
80
- if self.device == "cuda":
81
- self.model.cuda()
82
-
83
- self.supported_languages = config.languages
84
- logger.info(f"XTTS C3PO model loaded successfully. Supported languages: {self.supported_languages}")
85
-
86
- # Set default reference audio (C3PO voice)
87
- self.default_reference = os.path.join(self.model_path, "reference.wav")
88
- if not os.path.exists(self.default_reference):
89
- # Look for any reference audio in the model directory
90
- for file in os.listdir(self.model_path):
91
- if file.endswith(('.wav', '.mp3', '.m4a')):
92
- self.default_reference = os.path.join(self.model_path, file)
93
- break
94
- else:
95
- self.default_reference = None
96
-
97
- if self.default_reference:
98
- logger.info(f"Default C3PO reference audio: {self.default_reference}")
99
- else:
100
- logger.warning("No default reference audio found in C3PO model directory")
101
-
102
- def _download_c3po_model(self):
103
- """Download the C3PO model from Hugging Face"""
104
- try:
105
- logger.info("Downloading C3PO model from Hugging Face...")
106
- subprocess.run([
107
- "git", "clone",
108
- "https://huggingface.co/Borcherding/XTTS-v2_C3PO",
109
- "XTTS-v2_C3PO"
110
- ], check=True)
111
- logger.info("C3PO model downloaded successfully")
112
- except subprocess.CalledProcessError as e:
113
- logger.error(f"Failed to download C3PO model: {e}")
114
- raise HTTPException(status_code=500, detail="Failed to download C3PO model")
115
-
116
- def generate_speech(self, text: str, speaker_wav_path: str = None, language: str = "en",
117
- voice_cleanup: bool = False, no_lang_auto_detect: bool = False) -> str:
118
- """Generate speech and return the path to the output file"""
119
- try:
120
- # Use default C3PO voice if no speaker file provided
121
- if speaker_wav_path is None:
122
- if self.default_reference is None:
123
- raise HTTPException(status_code=400, detail="No reference audio available. Please upload a speaker file.")
124
- speaker_wav_path = self.default_reference
125
- logger.info("Using default C3PO voice")
126
-
127
- # Validate language
128
- if language not in self.supported_languages:
129
- raise HTTPException(status_code=400, detail=f"Language '{language}' not supported. Supported: {self.supported_languages}")
130
-
131
- # Language detection for longer texts
132
- if len(text) > 15 and not no_lang_auto_detect:
133
- language_predicted = langid.classify(text)[0].strip()
134
- if language_predicted == "zh":
135
- language_predicted = "zh-cn"
136
-
137
- if language_predicted != language:
138
- logger.warning(f"Detected language: {language_predicted}, chosen: {language}")
139
-
140
- # Text length validation
141
- if len(text) < 2:
142
- raise HTTPException(status_code=400, detail="Text too short, please provide longer text")
143
-
144
- if len(text) > 500: # Increased limit for API
145
- raise HTTPException(status_code=400, detail="Text too long, maximum 500 characters")
146
-
147
- # Voice cleanup if requested
148
- processed_speaker_wav = speaker_wav_path
149
- if voice_cleanup:
150
- processed_speaker_wav = self._cleanup_audio(speaker_wav_path)
151
-
152
- # Generate conditioning latents
153
- try:
154
- gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
155
- audio_path=processed_speaker_wav,
156
- gpt_cond_len=30,
157
- gpt_cond_chunk_len=4,
158
- max_ref_length=60
159
- )
160
- except Exception as e:
161
- logger.error(f"Speaker encoding error: {e}")
162
- raise HTTPException(status_code=400, detail="Error processing reference audio. Please check the audio file.")
163
-
164
- # Generate speech
165
- logger.info("Generating speech...")
166
- start_time = time.time()
167
-
168
- out = self.model.inference(
169
- text,
170
- language,
171
- gpt_cond_latent,
172
- speaker_embedding,
173
- repetition_penalty=5.0,
174
- temperature=0.75,
175
- )
176
-
177
- inference_time = time.time() - start_time
178
- logger.info(f"Speech generation completed in {inference_time:.2f} seconds")
179
-
180
- # Save output
181
- output_filename = f"xtts_c3po_output_{uuid.uuid4().hex}.wav"
182
- output_path = os.path.join(tempfile.gettempdir(), output_filename)
183
-
184
- torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
185
-
186
- return output_path
187
-
188
- except Exception as e:
189
- logger.error(f"Error generating speech: {e}")
190
- if isinstance(e, HTTPException):
191
- raise e
192
- raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}")
193
-
194
- def _cleanup_audio(self, audio_path: str) -> str:
195
- """Apply audio cleanup filters"""
196
- try:
197
- output_path = audio_path + "_cleaned.wav"
198
-
199
- # Basic audio cleanup using ffmpeg-python or similar
200
- # For now, just return the original path
201
- # You can implement more sophisticated cleanup here
202
-
203
- return audio_path
204
- except Exception as e:
205
- logger.warning(f"Audio cleanup failed: {e}, using original audio")
206
- return audio_path
207
-
208
- # Initialize XTTS service
209
- logger.info("Initializing XTTS C3PO service...")
210
- tts_service = XTTSService()
211
-
212
- @app.get("/")
213
- async def root():
214
- return {"message": "XTTS C3PO API is running", "status": "healthy", "model": "C3PO"}
215
-
216
- @app.get("/health")
217
- async def health_check():
218
- return {
219
- "status": "healthy",
220
- "device": tts_service.device,
221
- "model": "XTTS-v2 C3PO",
222
- "supported_languages": tts_service.supported_languages,
223
- "default_voice": "C3PO" if tts_service.default_reference else "None"
224
- }
225
-
226
- @app.get("/languages")
227
- async def get_languages():
228
- """Get list of supported languages"""
229
- return {"languages": tts_service.supported_languages}
230
-
231
- @app.post("/tts")
232
- async def text_to_speech(
233
- text: str = Form(...),
234
- language: str = Form("en"),
235
- voice_cleanup: bool = Form(False),
236
- no_lang_auto_detect: bool = Form(False),
237
- speaker_file: UploadFile = File(None)
238
- ):
239
- """
240
- Convert text to speech using XTTS C3PO voice cloning
241
-
242
- - **text**: The text to convert to speech (max 500 characters)
243
- - **language**: Language code (default: "en")
244
- - **voice_cleanup**: Apply audio cleanup to reference voice
245
- - **no_lang_auto_detect**: Disable automatic language detection
246
- - **speaker_file**: Reference speaker audio file (optional, uses C3PO voice if not provided)
247
- """
248
-
249
- if not text.strip():
250
- raise HTTPException(status_code=400, detail="Text cannot be empty")
251
-
252
- speaker_temp_path = None
253
-
254
- try:
255
- # Handle speaker file if provided
256
- if speaker_file is not None:
257
- # Validate file type
258
- if not speaker_file.content_type.startswith('audio/'):
259
- raise HTTPException(status_code=400, detail="Speaker file must be an audio file")
260
-
261
- # Save uploaded speaker file temporarily
262
- speaker_temp_path = os.path.join(tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav")
263
-
264
- with open(speaker_temp_path, "wb") as buffer:
265
- content = await speaker_file.read()
266
- buffer.write(content)
267
-
268
- # Generate speech (will use C3PO voice if no speaker file provided)
269
- output_path = tts_service.generate_speech(
270
- text,
271
- speaker_temp_path,
272
- language,
273
- voice_cleanup,
274
- no_lang_auto_detect
275
- )
276
-
277
- # Clean up temporary speaker file
278
- if speaker_temp_path and os.path.exists(speaker_temp_path):
279
- try:
280
- os.remove(speaker_temp_path)
281
- except:
282
- pass
283
-
284
- # Return the generated audio file
285
- voice_type = "custom" if speaker_file else "c3po"
286
- return FileResponse(
287
- output_path,
288
- media_type="audio/wav",
289
- filename=f"xtts_{voice_type}_output_{uuid.uuid4().hex}.wav",
290
- headers={"Content-Disposition": "attachment"}
291
- )
292
-
293
- except Exception as e:
294
- # Clean up files in case of error
295
- if speaker_temp_path and os.path.exists(speaker_temp_path):
296
- try:
297
- os.remove(speaker_temp_path)
298
- except:
299
- pass
300
-
301
- logger.error(f"Error in TTS endpoint: {e}")
302
- if isinstance(e, HTTPException):
303
- raise e
304
- raise HTTPException(status_code=500, detail=str(e))
305
-
306
- @app.post("/tts-json")
307
- async def text_to_speech_json(
308
- request: TTSRequest,
309
- speaker_file: UploadFile = File(None)
310
- ):
311
- """
312
- Convert text to speech using JSON request body
313
-
314
- - **request**: TTSRequest containing text, language, and options
315
- - **speaker_file**: Reference speaker audio file (optional, uses C3PO voice if not provided)
316
- """
317
-
318
- if not request.text.strip():
319
- raise HTTPException(status_code=400, detail="Text cannot be empty")
320
-
321
- speaker_temp_path = None
322
-
323
- try:
324
- # Handle speaker file if provided
325
- if speaker_file is not None:
326
- # Validate file type
327
- if not speaker_file.content_type.startswith('audio/'):
328
- raise HTTPException(status_code=400, detail="Speaker file must be an audio file")
329
-
330
- # Save uploaded speaker file temporarily
331
- speaker_temp_path = os.path.join(tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav")
332
-
333
- with open(speaker_temp_path, "wb") as buffer:
334
- content = await speaker_file.read()
335
- buffer.write(content)
336
-
337
- # Generate speech
338
- output_path = tts_service.generate_speech(
339
- request.text,
340
- speaker_temp_path,
341
- request.language,
342
- request.voice_cleanup,
343
- request.no_lang_auto_detect
344
- )
345
-
346
- # Clean up temporary speaker file
347
- if speaker_temp_path and os.path.exists(speaker_temp_path):
348
- try:
349
- os.remove(speaker_temp_path)
350
- except:
351
- pass
352
-
353
- # Return the generated audio file
354
- voice_type = "custom" if speaker_file else "c3po"
355
- return FileResponse(
356
- output_path,
357
- media_type="audio/wav",
358
- filename=f"xtts_{voice_type}_{request.language}_{uuid.uuid4().hex}.wav",
359
- headers={"Content-Disposition": "attachment"}
360
- )
361
-
362
- except Exception as e:
363
- # Clean up files in case of error
364
- if speaker_temp_path and os.path.exists(speaker_temp_path):
365
- try:
366
- os.remove(speaker_temp_path)
367
- except:
368
- pass
369
-
370
- logger.error(f"Error in TTS JSON endpoint: {e}")
371
- if isinstance(e, HTTPException):
372
- raise e
373
- raise HTTPException(status_code=500, detail=str(e))
374
-
375
- @app.post("/tts-c3po")
376
- async def text_to_speech_c3po_only(
377
- text: str = Form(...),
378
- language: str = Form("en"),
379
- no_lang_auto_detect: bool = Form(False)
380
- ):
381
- """
382
- Convert text to speech using C3PO voice only (no file upload needed)
383
-
384
- - **text**: The text to convert to speech (max 500 characters)
385
- - **language**: Language code (default: "en")
386
- - **no_lang_auto_detect**: Disable automatic language detection
387
- """
388
-
389
- if not text.strip():
390
- raise HTTPException(status_code=400, detail="Text cannot be empty")
391
-
392
- try:
393
- # Generate speech using C3PO voice
394
- output_path = tts_service.generate_speech(
395
- text,
396
- None, # Use default C3PO voice
397
- language,
398
- False, # No voice cleanup needed for default voice
399
- no_lang_auto_detect
400
- )
401
-
402
- # Return the generated audio file
403
- return FileResponse(
404
- output_path,
405
- media_type="audio/wav",
406
- filename=f"c3po_voice_{uuid.uuid4().hex}.wav",
407
- headers={"Content-Disposition": "attachment"}
408
- )
409
-
410
- except Exception as e:
411
- logger.error(f"Error in C3PO TTS endpoint: {e}")
412
- if isinstance(e, HTTPException):
413
- raise e
414
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app_config.py DELETED
@@ -1,54 +0,0 @@
1
- """
2
- Configuration for Kokoro TTS API, especially for Hugging Face Spaces deployment.
3
- """
4
-
5
- import os
6
- import tempfile
7
- import logging
8
-
9
- # Configure logging
10
- logging.basicConfig(level=logging.INFO)
11
- logger = logging.getLogger(__name__)
12
-
13
- def setup_hf_cache():
14
- """Setup cache environment variables for Hugging Face Spaces"""
15
- # Use user's home directory for cache
16
- home_dir = os.path.expanduser("~")
17
- cache_dir = os.path.join(home_dir, ".cache")
18
-
19
- cache_settings = {
20
- 'HF_HOME': cache_dir,
21
- 'TRANSFORMERS_CACHE': cache_dir,
22
- 'HF_HUB_CACHE': cache_dir,
23
- 'TORCH_HOME': cache_dir,
24
- 'NUMBA_CACHE_DIR': os.path.join(cache_dir, 'numba'),
25
- 'NUMBA_DISABLE_JIT': '1',
26
- 'HF_HUB_DISABLE_TELEMETRY': '1'
27
- }
28
-
29
- # Set environment variables
30
- for key, value in cache_settings.items():
31
- os.environ[key] = value
32
- logger.info(f"Set {key} to {value}")
33
-
34
- # Create cache directories
35
- cache_dirs = [cache_dir, os.path.join(cache_dir, 'numba')]
36
- for cache_path in cache_dirs:
37
- try:
38
- os.makedirs(cache_path, exist_ok=True)
39
- logger.info(f"Created cache directory: {cache_path}")
40
- except Exception as e:
41
- logger.warning(f"Could not create {cache_path}: {e}")
42
-
43
- logger.info("Cache environment setup completed")
44
-
45
- def get_temp_dir():
46
- """Get a writable temporary directory"""
47
- return tempfile.gettempdir()
48
-
49
- def is_hf_spaces():
50
- """Check if running on Hugging Face Spaces"""
51
- return os.environ.get('SPACE_ID') is not None
52
-
53
- # Initialize cache setup
54
- setup_hf_cache()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
client_example.py DELETED
@@ -1,269 +0,0 @@
1
- import requests
2
- import os
3
-
4
- def test_c3po_voice():
5
- """Test the C3PO voice without uploading any files"""
6
-
7
- # API endpoint for C3PO voice only
8
- url = "http://localhost:7860/tts-c3po"
9
-
10
- # Text to convert to speech
11
- text = "Hello there! I am C-3PO, human-cyborg relations. How may I assist you today?"
12
-
13
- # Prepare the request data
14
- data = {
15
- "text": text,
16
- "language": "en",
17
- "no_lang_auto_detect": False
18
- }
19
-
20
- try:
21
- print("Testing C3PO voice...")
22
- print(f"Text: {text}")
23
-
24
- response = requests.post(url, data=data)
25
-
26
- if response.status_code == 200:
27
- # Save the generated audio
28
- output_filename = "c3po_voice_sample.wav"
29
- with open(output_filename, "wb") as f:
30
- f.write(response.content)
31
- print(f"Success! C3PO voice sample saved as {output_filename}")
32
- else:
33
- print(f"Error: {response.status_code}")
34
- print(response.text)
35
-
36
- except requests.exceptions.ConnectionError:
37
- print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
38
- except Exception as e:
39
- print(f"Error: {e}")
40
-
41
- def test_xtts_with_custom_voice():
42
- """Example of using XTTS with custom voice upload"""
43
-
44
- # API endpoint
45
- url = "http://localhost:7860/tts"
46
-
47
- # Text to convert to speech
48
- text = "This is a test of XTTS voice cloning with a custom reference voice."
49
-
50
- # Path to your speaker reference audio file
51
- speaker_file_path = "reference.wav" # Update this path to your reference audio
52
-
53
- # Check if speaker file exists
54
- if not os.path.exists(speaker_file_path):
55
- print(f"Custom voice test skipped: Speaker file not found at {speaker_file_path}")
56
- print("To test custom voice cloning:")
57
- print("1. Record 3-10 seconds of clear speech")
58
- print("2. Save as 'reference.wav' in this directory")
59
- print("3. Run this test again")
60
- return
61
-
62
- # Prepare the request data
63
- data = {
64
- "text": text,
65
- "language": "en",
66
- "voice_cleanup": False,
67
- "no_lang_auto_detect": False
68
- }
69
-
70
- files = {
71
- "speaker_file": open(speaker_file_path, "rb")
72
- }
73
-
74
- try:
75
- print("Testing XTTS with custom voice...")
76
- print(f"Text: {text}")
77
- print(f"Speaker file: {speaker_file_path}")
78
-
79
- response = requests.post(url, data=data, files=files)
80
-
81
- if response.status_code == 200:
82
- # Save the generated audio
83
- output_filename = "custom_voice_clone.wav"
84
- with open(output_filename, "wb") as f:
85
- f.write(response.content)
86
- print(f"Success! Custom voice clone saved as {output_filename}")
87
- else:
88
- print(f"Error: {response.status_code}")
89
- print(response.text)
90
-
91
- except requests.exceptions.ConnectionError:
92
- print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
93
- except Exception as e:
94
- print(f"Error: {e}")
95
- finally:
96
- files["speaker_file"].close()
97
-
98
- def test_xtts_fallback_to_c3po():
99
- """Test XTTS endpoint without speaker file (should use C3PO voice)"""
100
-
101
- # API endpoint
102
- url = "http://localhost:7860/tts"
103
-
104
- # Text to convert to speech
105
- text = "When no custom voice is provided, I will speak in the C3PO voice by default."
106
-
107
- # Prepare the request data (no speaker file)
108
- data = {
109
- "text": text,
110
- "language": "en",
111
- "voice_cleanup": False,
112
- "no_lang_auto_detect": False
113
- }
114
-
115
- try:
116
- print("Testing XTTS fallback to C3PO voice...")
117
- print(f"Text: {text}")
118
-
119
- response = requests.post(url, data=data)
120
-
121
- if response.status_code == 200:
122
- # Save the generated audio
123
- output_filename = "xtts_c3po_fallback.wav"
124
- with open(output_filename, "wb") as f:
125
- f.write(response.content)
126
- print(f"Success! XTTS with C3PO fallback saved as {output_filename}")
127
- else:
128
- print(f"Error: {response.status_code}")
129
- print(response.text)
130
-
131
- except requests.exceptions.ConnectionError:
132
- print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
133
- except Exception as e:
134
- print(f"Error: {e}")
135
-
136
- def test_multilingual_c3po():
137
- """Test C3PO voice in different languages"""
138
-
139
- # API endpoint for C3PO voice only
140
- url = "http://localhost:7860/tts-c3po"
141
-
142
- # Test different languages
143
- test_cases = [
144
- ("en", "Hello, I am C-3PO. I am fluent in over six million forms of communication."),
145
- ("es", "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación."),
146
- ("fr", "Bonjour, je suis C-3PO. Je maîtrise plus de six millions de formes de communication."),
147
- ("de", "Hallo, ich bin C-3PO. Ich beherrsche über sechs Millionen Kommunikationsformen."),
148
- ]
149
-
150
- for language, text in test_cases:
151
- data = {
152
- "text": text,
153
- "language": language,
154
- "no_lang_auto_detect": True # Force the specified language
155
- }
156
-
157
- try:
158
- print(f"Testing C3PO voice in {language.upper()}...")
159
- print(f"Text: {text}")
160
-
161
- response = requests.post(url, data=data)
162
-
163
- if response.status_code == 200:
164
- # Save the generated audio
165
- output_filename = f"c3po_voice_{language}.wav"
166
- with open(output_filename, "wb") as f:
167
- f.write(response.content)
168
- print(f"Success! C3PO {language} voice saved as {output_filename}")
169
- else:
170
- print(f"Error: {response.status_code}")
171
- print(response.text)
172
-
173
- except requests.exceptions.ConnectionError:
174
- print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
175
- except Exception as e:
176
- print(f"Error: {e}")
177
-
178
- print() # Add spacing between tests
179
-
180
- def get_supported_languages():
181
- """Get list of supported languages"""
182
- try:
183
- response = requests.get("http://localhost:7860/languages")
184
- if response.status_code == 200:
185
- languages = response.json()
186
- print("Supported languages:", languages["languages"])
187
- return languages["languages"]
188
- else:
189
- print("Failed to get languages:", response.status_code)
190
- return []
191
- except requests.exceptions.ConnectionError:
192
- print("API is not running. Start it with: uvicorn app:app --host 0.0.0.0 --port 7860")
193
- return []
194
-
195
- def check_api_health():
196
- """Check if the API is running"""
197
- try:
198
- response = requests.get("http://localhost:7860/health")
199
- if response.status_code == 200:
200
- health_info = response.json()
201
- print("API Health Check:")
202
- print(f" Status: {health_info['status']}")
203
- print(f" Device: {health_info['device']}")
204
- print(f" Model: {health_info['model']}")
205
- print(f" Default Voice: {health_info['default_voice']}")
206
- print(f" Languages: {len(health_info['supported_languages'])} supported")
207
- return True
208
- else:
209
- print("API health check failed:", response.status_code)
210
- return False
211
- except requests.exceptions.ConnectionError:
212
- print("API is not running. Start it with: uvicorn app:app --host 0.0.0.0 --port 7860")
213
- return False
214
-
215
- def create_sample_reference():
216
- """Instructions for creating a reference audio file"""
217
- print("\n" + "="*50)
218
- print("REFERENCE AUDIO SETUP")
219
- print("="*50)
220
- print("To use XTTS voice cloning, you need a reference audio file:")
221
- print("1. Record 3-10 seconds of clear speech")
222
- print("2. Save as WAV format (recommended)")
223
- print("3. Ensure good audio quality (no background noise)")
224
- print("4. Place the file in the same directory as this script")
225
- print("5. Update the 'speaker_file_path' variable in the functions above")
226
- print("\nExample recording text:")
227
- print("'Hello, this is my voice. I'm recording this sample for voice cloning.'")
228
- print("="*50)
229
-
230
- if __name__ == "__main__":
231
- print("XTTS C3PO API Client Example")
232
- print("=" * 40)
233
-
234
- # First check if API is running
235
- if check_api_health():
236
- print()
237
-
238
- # Get supported languages
239
- languages = get_supported_languages()
240
- print()
241
-
242
- # Test C3PO voice (no file upload needed)
243
- print("1. Testing C3PO voice (no upload required)...")
244
- test_c3po_voice()
245
- print()
246
-
247
- # Test XTTS fallback to C3PO
248
- print("2. Testing XTTS endpoint without speaker file (C3PO fallback)...")
249
- test_xtts_fallback_to_c3po()
250
- print()
251
-
252
- # Test custom voice if reference file exists
253
- print("3. Testing custom voice cloning...")
254
- test_xtts_with_custom_voice()
255
- print()
256
-
257
- # Test multilingual C3PO
258
- print("4. Testing multilingual C3PO voice...")
259
- test_multilingual_c3po()
260
-
261
- print("All tests completed!")
262
- print("\nGenerated files:")
263
- for file in os.listdir("."):
264
- if file.endswith(".wav") and ("c3po" in file or "custom" in file or "xtts" in file):
265
- print(f" - {file}")
266
-
267
- else:
268
- print("\nPlease start the API server first:")
269
- print("uvicorn app:app --host 0.0.0.0 --port 7860")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,13 +1,8 @@
1
- SpeechRecognition>=3.8.1
2
- gtts>=2.3.2
3
- openai-whisper>=20240930
4
- pygame>=2.5.2
5
- anyascii>=0.3.0
6
- einops>=0.6.0
7
- encodec>=0.1.1
8
- inflect>=5.6.0
9
- num2words>=0.5.14
10
- pysbd>=0.3.4
11
- tqdm>=4.64.1
12
- coqui-tts == 0.26.2
13
- huggingface_hub>=0.17.0
 
1
+ fastapi>=0.104.1
2
+ uvicorn>=0.24.0
3
+ python-multipart>=0.0.6
4
+ torch>=2.0.0
5
+ torchaudio>=2.0.0
6
+ coqui-tts>=0.22.0
7
+ huggingface_hub>=0.17.0
8
+ pydantic>=2.0.0
 
 
 
 
 
requirements_coqui.txt DELETED
@@ -1,12 +0,0 @@
1
- fastapi>=0.104.1
2
- uvicorn[standard]>=0.24.0
3
- python-multipart>=0.0.6
4
- coqui-tts==0.26.2
5
- torch>=2.0.0
6
- torchaudio>=2.0.0
7
- numpy>=1.24.0
8
- scipy>=1.11.0
9
- pydub>=0.25.1
10
- librosa>=0.10.0
11
- soundfile>=0.12.1
12
- typing-extensions>=4.8.0
 
 
 
 
 
 
 
 
 
 
 
 
 
start_c3po_api.py CHANGED
@@ -1,171 +1,52 @@
1
  #!/usr/bin/env python3
2
  """
3
- Startup script for C-3PO TTS API
4
- Handles model download, initialization, and server startup
5
  """
6
 
7
  import os
8
  import sys
9
- import subprocess
10
  import logging
11
- import time
12
- from pathlib import Path
13
 
14
  # Configure logging
15
- logging.basicConfig(
16
- level=logging.INFO,
17
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18
- )
19
  logger = logging.getLogger(__name__)
20
 
21
- def check_dependencies():
22
- """Check if all required dependencies are installed"""
23
- logger.info("🔍 Checking dependencies...")
24
-
25
- try:
26
- import torch
27
- import TTS
28
- import fastapi
29
- import huggingface_hub
30
- logger.info("✅ All core dependencies found")
31
- return True
32
- except ImportError as e:
33
- logger.error(f"❌ Missing dependency: {e}")
34
- logger.info("💡 Install with: pip install -r requirements.txt")
35
- return False
36
-
37
- def check_gpu():
38
- """Check GPU availability"""
39
- try:
40
- import torch
41
- if torch.cuda.is_available():
42
- gpu_name = torch.cuda.get_device_name(0)
43
- logger.info(f"🎮 GPU available: {gpu_name}")
44
- return True
45
- else:
46
- logger.info("💻 No GPU available, using CPU")
47
- return False
48
- except Exception as e:
49
- logger.warning(f"⚠️ GPU check failed: {e}")
50
- return False
51
-
52
- def check_disk_space():
53
- """Check available disk space for model download"""
54
- try:
55
- import shutil
56
- free_space = shutil.disk_usage('.').free / (1024**3) # GB
57
-
58
- if free_space < 5:
59
- logger.warning(f"⚠️ Low disk space: {free_space:.1f}GB available")
60
- logger.warning("💽 C-3PO model requires ~2GB space")
61
- else:
62
- logger.info(f"💾 Disk space: {free_space:.1f}GB available")
63
-
64
- return free_space > 2
65
- except Exception as e:
66
- logger.warning(f"⚠️ Disk space check failed: {e}")
67
- return True
68
-
69
  def setup_environment():
70
- """Set up environment variables"""
71
  os.environ["COQUI_TOS_AGREED"] = "1"
72
  os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
73
-
74
- # Create models directory
75
- models_dir = Path("./models")
76
- models_dir.mkdir(exist_ok=True)
77
-
78
  logger.info("🌍 Environment configured")
79
 
80
- def install_dependencies():
81
- """Install missing dependencies"""
82
- logger.info("📦 Installing dependencies...")
83
-
84
- try:
85
- subprocess.check_call([
86
- sys.executable, "-m", "pip", "install", "-r", "requirements.txt"
87
- ])
88
- logger.info("✅ Dependencies installed successfully")
89
- return True
90
- except subprocess.CalledProcessError as e:
91
- logger.error(f"❌ Failed to install dependencies: {e}")
92
- return False
93
-
94
- def test_model_download():
95
- """Test if the C-3PO model can be downloaded"""
96
- logger.info("🤖 Testing C-3PO model availability...")
97
 
98
  try:
99
- from huggingface_hub import repo_info
100
-
101
- # Check if the repo exists and is accessible
102
- info = repo_info(repo_id="Borcherding/XTTS-v2_C3PO")
103
- logger.info(f"✅ C-3PO model accessible: {info.id}")
104
- logger.info(f" Last modified: {info.last_modified}")
105
-
106
- return True
107
- except Exception as e:
108
- logger.error(f"❌ C-3PO model not accessible: {e}")
109
- return False
110
-
111
- def start_api_server():
112
- """Start the FastAPI server"""
113
- logger.info("🚀 Starting C-3PO TTS API server...")
114
-
115
- try:
116
- # Import and run the API
117
  import uvicorn
118
  from coqui_api import app
119
 
120
  logger.info("🎭 C-3PO TTS API starting on http://localhost:7860")
121
- logger.info("📖 API documentation available at http://localhost:7860/docs")
122
 
123
- uvicorn.run(
124
- app,
125
- host="0.0.0.0",
126
- port=7860,
127
- log_level="info"
128
- )
129
 
 
 
 
 
130
  except Exception as e:
131
- logger.error(f"❌ Failed to start API server: {e}")
132
- return False
133
 
134
  def main():
135
  """Main startup sequence"""
136
- print("🤖 C-3PO TTS API Startup")
137
- print("=" * 50)
138
 
139
- # Step 1: Check dependencies
140
- if not check_dependencies():
141
- logger.info("📦 Attempting to install dependencies...")
142
- if not install_dependencies():
143
- logger.error("❌ Failed to install dependencies. Exiting.")
144
- sys.exit(1)
145
-
146
- # Step 2: Setup environment
147
  setup_environment()
148
 
149
- # Step 3: Check system resources
150
- has_gpu = check_gpu()
151
- has_space = check_disk_space()
152
-
153
- if not has_space:
154
- logger.error("❌ Insufficient disk space. Exiting.")
155
- sys.exit(1)
156
-
157
- # Step 4: Test model availability
158
- if not test_model_download():
159
- logger.warning("⚠️ C-3PO model may not be accessible")
160
- logger.warning(" The API will fall back to standard XTTS v2")
161
-
162
- # Step 5: Start the server
163
- print("\n" + "=" * 50)
164
- logger.info("🎬 All checks passed! Starting C-3PO TTS API...")
165
- print("=" * 50)
166
-
167
  try:
168
- start_api_server()
169
  except KeyboardInterrupt:
170
  logger.info("\n🛑 Server stopped by user")
171
  except Exception as e:
 
1
  #!/usr/bin/env python3
2
  """
3
+ Simple startup script for C-3PO TTS API
 
4
  """
5
 
6
  import os
7
  import sys
 
8
  import logging
 
 
9
 
10
  # Configure logging
11
+ logging.basicConfig(level=logging.INFO)
 
 
 
12
  logger = logging.getLogger(__name__)
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def setup_environment():
15
+ """Set up required environment variables"""
16
  os.environ["COQUI_TOS_AGREED"] = "1"
17
  os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
 
 
 
 
 
18
  logger.info("🌍 Environment configured")
19
 
20
+ def start_api():
21
+ """Start the C-3PO TTS API"""
22
+ logger.info("🤖 Starting C-3PO TTS API...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  import uvicorn
26
  from coqui_api import app
27
 
28
  logger.info("🎭 C-3PO TTS API starting on http://localhost:7860")
29
+ logger.info("📖 API documentation: http://localhost:7860/docs")
30
 
31
+ uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")
 
 
 
 
 
32
 
33
+ except ImportError as e:
34
+ logger.error(f"❌ Missing dependency: {e}")
35
+ logger.info("💡 Install with: pip install -r requirements.txt")
36
+ sys.exit(1)
37
  except Exception as e:
38
+ logger.error(f"❌ Failed to start API: {e}")
39
+ sys.exit(1)
40
 
41
  def main():
42
  """Main startup sequence"""
43
+ print("🤖 C-3PO TTS API")
44
+ print("=" * 30)
45
 
 
 
 
 
 
 
 
 
46
  setup_environment()
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  try:
49
+ start_api()
50
  except KeyboardInterrupt:
51
  logger.info("\n🛑 Server stopped by user")
52
  except Exception as e:
startup.py DELETED
@@ -1,120 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Startup script for Kokoro TTS API on Hugging Face Spaces
4
- """
5
-
6
- import os
7
- import sys
8
- import logging
9
- import subprocess
10
-
11
- # Configure logging
12
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
13
- logger = logging.getLogger(__name__)
14
-
15
- def check_environment():
16
- """Check the environment and permissions"""
17
- logger.info("=== Environment Check ===")
18
-
19
- # Check if running on HF Spaces
20
- space_id = os.environ.get('SPACE_ID')
21
- if space_id:
22
- logger.info(f"Running on Hugging Face Spaces: {space_id}")
23
- else:
24
- logger.info("Not running on Hugging Face Spaces")
25
-
26
- # Check Python version
27
- logger.info(f"Python version: {sys.version}")
28
-
29
- # Check current user and home directory
30
- logger.info(f"Current user: {os.getenv('USER', 'unknown')}")
31
- logger.info(f"Home directory: {os.path.expanduser('~')}")
32
- logger.info(f"Current working directory: {os.getcwd()}")
33
-
34
- # Check available disk space
35
- try:
36
- result = subprocess.run(['df', '-h', '/tmp'], capture_output=True, text=True)
37
- logger.info(f"Disk space in /tmp:\n{result.stdout}")
38
- except Exception as e:
39
- logger.warning(f"Could not check disk space: {e}")
40
-
41
- # Check write permissions for important directories
42
- test_dirs = ['/tmp', os.path.expanduser('~'), os.getcwd()]
43
- for test_dir in test_dirs:
44
- try:
45
- test_file = os.path.join(test_dir, 'test_write.tmp')
46
- with open(test_file, 'w') as f:
47
- f.write('test')
48
- os.remove(test_file)
49
- logger.info(f"✅ Write permission OK: {test_dir}")
50
- except Exception as e:
51
- logger.warning(f"❌ Write permission failed: {test_dir} - {e}")
52
-
53
- def check_dependencies():
54
- """Check if required packages are installed"""
55
- logger.info("=== Checking dependencies ===")
56
-
57
- required_packages = [
58
- 'kokoro',
59
- 'soundfile',
60
- 'torch',
61
- 'fastapi',
62
- 'uvicorn'
63
- ]
64
-
65
- for package in required_packages:
66
- try:
67
- __import__(package)
68
- logger.info(f"✅ {package} is available")
69
- except ImportError:
70
- logger.error(f"❌ {package} is not available")
71
-
72
- def test_kokoro():
73
- """Test Kokoro TTS functionality"""
74
- logger.info("=== Testing Kokoro TTS ===")
75
-
76
- try:
77
- # Import after setting up environment
78
- import app_config # This will setup environment
79
- from kokoro import KPipeline
80
-
81
- logger.info("Initializing Kokoro pipeline...")
82
- pipeline = KPipeline(lang_code='a')
83
- logger.info("✅ Kokoro pipeline initialized successfully")
84
-
85
- # Test generation
86
- logger.info("Testing speech generation...")
87
- text = "Hello, this is a test."
88
- generator = pipeline(text, voice='af_heart')
89
-
90
- for i, (gs, ps, audio) in enumerate(generator):
91
- logger.info(f"✅ Generated audio segment {i}: gs={gs}, ps={ps}, audio shape: {audio.shape}")
92
- break
93
-
94
- logger.info("✅ Kokoro TTS test completed successfully")
95
- return True
96
-
97
- except Exception as e:
98
- logger.error(f"❌ Kokoro TTS test failed: {e}")
99
- import traceback
100
- logger.error(f"Full traceback: {traceback.format_exc()}")
101
- return False
102
-
103
- def main():
104
- """Main startup function"""
105
- logger.info("🚀 Starting Kokoro TTS API setup...")
106
-
107
- check_environment()
108
- check_dependencies()
109
-
110
- if test_kokoro():
111
- logger.info("🎉 All checks passed! Starting the API...")
112
- # Import and start the app
113
- import uvicorn
114
- uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info")
115
- else:
116
- logger.error("❌ Setup failed. Please check the logs above.")
117
- sys.exit(1)
118
-
119
- if __name__ == "__main__":
120
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test.py DELETED
@@ -1,144 +0,0 @@
1
- import os
2
- import torch
3
- import torchaudio
4
- import subprocess
5
-
6
- # Set environment variables for CPU-only usage
7
- os.environ['COQUI_TOS_AGREED'] = '1'
8
- os.environ['NUMBA_DISABLE_JIT'] = '1'
9
- os.environ['FORCE_CPU'] = 'true'
10
- os.environ['CUDA_VISIBLE_DEVICES'] = ''
11
-
12
- # Fix PyTorch weights_only issue for XTTS
13
- import torch.serialization
14
- from TTS.tts.configs.xtts_config import XttsConfig
15
- torch.serialization.add_safe_globals([XttsConfig])
16
-
17
- from TTS.api import TTS
18
- from TTS.tts.configs.xtts_config import XttsConfig
19
- from TTS.tts.models.xtts import Xtts
20
- from TTS.utils.generic_utils import get_user_data_dir
21
-
22
- print("Testing XTTS C3PO voice cloning...")
23
-
24
- # C3PO model path
25
- model_path = "XTTS-v2_C3PO/"
26
- config_path = "XTTS-v2_C3PO/config.json"
27
-
28
- # Check if model files exist, if not download them
29
- if not os.path.exists(config_path):
30
- print("C3PO model not found locally, downloading...")
31
- try:
32
- subprocess.run([
33
- "git", "clone",
34
- "https://huggingface.co/Borcherding/XTTS-v2_C3PO",
35
- "XTTS-v2_C3PO"
36
- ], check=True)
37
- print("C3PO model downloaded successfully")
38
- except subprocess.CalledProcessError as e:
39
- print(f"Failed to download C3PO model: {e}")
40
- exit(1)
41
-
42
- # Load configuration
43
- config = XttsConfig()
44
- config.load_json(config_path)
45
-
46
- # Initialize and load model
47
- model = Xtts.init_from_config(config)
48
- model.load_checkpoint(
49
- config,
50
- checkpoint_path=os.path.join(model_path, "model.pth"),
51
- vocab_path=os.path.join(model_path, "vocab.json"),
52
- eval=True,
53
- )
54
-
55
- device = "cpu" # Force CPU usage
56
- print(f"C3PO model loaded on {device} (forced CPU mode)")
57
-
58
- # Text to convert to speech
59
- text = "Hello there! I am C-3PO, human-cyborg relations. How may I assist you today?"
60
-
61
- # Look for reference audio in the C3PO model directory
62
- reference_audio_path = None
63
- for file in os.listdir(model_path):
64
- if file.endswith(('.wav', '.mp3', '.m4a')):
65
- reference_audio_path = os.path.join(model_path, file)
66
- print(f"Found C3PO reference audio: {file}")
67
- break
68
-
69
- # If no reference audio found, create a simple test reference
70
- if reference_audio_path is None:
71
- print("No reference audio found in C3PO model, creating test reference...")
72
- reference_audio_path = "test_reference.wav"
73
-
74
- # Generate a simple sine wave as placeholder
75
- import numpy as np
76
- sample_rate = 24000
77
- duration = 3 # seconds
78
- frequency = 440 # Hz
79
- t = np.linspace(0, duration, int(sample_rate * duration))
80
- audio_data = 0.3 * np.sin(2 * np.pi * frequency * t)
81
-
82
- # Save as WAV
83
- torchaudio.save(reference_audio_path, torch.tensor(audio_data).unsqueeze(0), sample_rate)
84
- print(f"Test reference audio created: {reference_audio_path}")
85
-
86
- try:
87
- # Generate conditioning latents
88
- print("Processing reference audio...")
89
- gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
90
- audio_path=reference_audio_path,
91
- gpt_cond_len=30,
92
- gpt_cond_chunk_len=4,
93
- max_ref_length=60
94
- )
95
-
96
- # Generate speech
97
- print("Generating C3PO speech...")
98
- out = model.inference(
99
- text,
100
- "en", # language
101
- gpt_cond_latent,
102
- speaker_embedding,
103
- repetition_penalty=5.0,
104
- temperature=0.75,
105
- )
106
-
107
- # Save output
108
- output_path = "c3po_test_output.wav"
109
- torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
110
- print(f"C3PO speech generated successfully! Saved as: {output_path}")
111
-
112
- # Test multilingual capabilities
113
- print("\nTesting multilingual C3PO...")
114
- multilingual_tests = [
115
- ("es", "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación."),
116
- ("fr", "Bonjour, je suis C-3PO. Je maîtrise plus de six millions de formes de communication."),
117
- ("de", "Hallo, ich bin C-3PO. Ich beherrsche über sechs Millionen Kommunikationsformen."),
118
- ]
119
-
120
- for lang, test_text in multilingual_tests:
121
- print(f"Generating {lang.upper()} speech...")
122
- out = model.inference(
123
- test_text,
124
- lang,
125
- gpt_cond_latent,
126
- speaker_embedding,
127
- repetition_penalty=5.0,
128
- temperature=0.75,
129
- )
130
-
131
- output_path = f"c3po_test_{lang}.wav"
132
- torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
133
- print(f"C3PO {lang.upper()} speech saved as: {output_path}")
134
-
135
- except Exception as e:
136
- print(f"Error during speech generation: {e}")
137
- import traceback
138
- traceback.print_exc()
139
-
140
- print("XTTS C3PO test completed!")
141
- print("\nGenerated files:")
142
- for file in os.listdir("."):
143
- if file.startswith("c3po_test") and file.endswith(".wav"):
144
- print(f" - {file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_build.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple build test for C-3PO TTS API
4
+ Tests if all dependencies can be imported
5
+ """
6
+
7
+ def test_imports():
8
+ """Test if all required packages can be imported"""
9
+ print("🔍 Testing imports...")
10
+
11
+ try:
12
+ import fastapi
13
+ print("✅ FastAPI")
14
+
15
+ import uvicorn
16
+ print("✅ Uvicorn")
17
+
18
+ import torch
19
+ print("✅ PyTorch")
20
+
21
+ import torchaudio
22
+ print("✅ TorchAudio")
23
+
24
+ import TTS
25
+ print("✅ Coqui TTS")
26
+
27
+ import huggingface_hub
28
+ print("✅ Hugging Face Hub")
29
+
30
+ import pydantic
31
+ print("✅ Pydantic")
32
+
33
+ return True
34
+
35
+ except ImportError as e:
36
+ print(f"❌ Import failed: {e}")
37
+ return False
38
+
39
+ def test_api_creation():
40
+ """Test if the API can be created without errors"""
41
+ print("\n🚀 Testing API creation...")
42
+
43
+ try:
44
+ from coqui_api import app
45
+ print("✅ API created successfully")
46
+ return True
47
+ except Exception as e:
48
+ print(f"❌ API creation failed: {e}")
49
+ return False
50
+
51
+ def main():
52
+ """Run build tests"""
53
+ print("🧪 C-3PO TTS Build Test")
54
+ print("=" * 30)
55
+
56
+ import_ok = test_imports()
57
+ api_ok = test_api_creation()
58
+
59
+ print("\n" + "=" * 30)
60
+
61
+ if import_ok and api_ok:
62
+ print("🎉 All tests passed! Ready to deploy.")
63
+ return 0
64
+ else:
65
+ print("❌ Some tests failed. Check dependencies.")
66
+ return 1
67
+
68
+ if __name__ == "__main__":
69
+ exit(main())
test_coqui_api.py DELETED
@@ -1,146 +0,0 @@
1
- import requests
2
- import os
3
- import time
4
-
5
- # API base URL (update this to your deployed Hugging Face Space URL)
6
- BASE_URL = "http://localhost:7860" # Change to your HF Space URL when deployed
7
-
8
- def test_health():
9
- """Test the health endpoint"""
10
- print("🔍 Testing health endpoint...")
11
- try:
12
- response = requests.get(f"{BASE_URL}/health")
13
- if response.status_code == 200:
14
- print("✅ Health check passed!")
15
- print(f"Response: {response.json()}")
16
- else:
17
- print(f"❌ Health check failed: {response.status_code}")
18
- print(f"Response: {response.text}")
19
- except Exception as e:
20
- print(f"❌ Health check error: {e}")
21
-
22
- def test_list_models():
23
- """Test the models endpoint"""
24
- print("\n🔍 Testing models endpoint...")
25
- try:
26
- response = requests.get(f"{BASE_URL}/models")
27
- if response.status_code == 200:
28
- models = response.json()
29
- print("✅ Models endpoint working!")
30
- print(f"Found {len(models.get('models', []))} models")
31
- # Show first 5 models
32
- for i, model in enumerate(models.get('models', [])[:5]):
33
- print(f" {i+1}. {model}")
34
- else:
35
- print(f"❌ Models endpoint failed: {response.status_code}")
36
- except Exception as e:
37
- print(f"❌ Models endpoint error: {e}")
38
-
39
- def test_simple_tts():
40
- """Test simple text-to-speech without voice cloning"""
41
- print("\n🔍 Testing simple TTS...")
42
- try:
43
- data = {
44
- "text": "Hello world! This is a test of Coqui TTS.",
45
- "language": "en"
46
- }
47
-
48
- response = requests.post(f"{BASE_URL}/tts", data=data)
49
-
50
- if response.status_code == 200:
51
- # Save the audio file
52
- output_file = "simple_tts_output.wav"
53
- with open(output_file, "wb") as f:
54
- f.write(response.content)
55
- print(f"✅ Simple TTS successful! Audio saved to: {output_file}")
56
- print(f"File size: {len(response.content)} bytes")
57
- else:
58
- print(f"❌ Simple TTS failed: {response.status_code}")
59
- print(f"Response: {response.text}")
60
- except Exception as e:
61
- print(f"❌ Simple TTS error: {e}")
62
-
63
- def test_voice_cloning(speaker_file_path=None):
64
- """Test voice cloning with uploaded speaker file"""
65
- if not speaker_file_path or not os.path.exists(speaker_file_path):
66
- print("\n⚠️ Skipping voice cloning test - no speaker file provided")
67
- print(" To test voice cloning, provide a .wav file path")
68
- return
69
-
70
- print(f"\n🔍 Testing voice cloning with: {speaker_file_path}")
71
- try:
72
- data = {
73
- "text": "This is voice cloning using Coqui TTS. The voice should match the reference audio.",
74
- "language": "en"
75
- }
76
-
77
- with open(speaker_file_path, "rb") as f:
78
- files = {"speaker_file": f}
79
- response = requests.post(f"{BASE_URL}/tts", data=data, files=files)
80
-
81
- if response.status_code == 200:
82
- # Save the cloned audio
83
- output_file = "voice_cloned_output.wav"
84
- with open(output_file, "wb") as f:
85
- f.write(response.content)
86
- print(f"✅ Voice cloning successful! Audio saved to: {output_file}")
87
- print(f"File size: {len(response.content)} bytes")
88
- else:
89
- print(f"❌ Voice cloning failed: {response.status_code}")
90
- print(f"Response: {response.text}")
91
- except Exception as e:
92
- print(f"❌ Voice cloning error: {e}")
93
-
94
- def test_json_tts():
95
- """Test JSON endpoint"""
96
- print("\n🔍 Testing JSON TTS endpoint...")
97
- try:
98
- import json
99
-
100
- data = {
101
- "text": "This is a JSON request test for Coqui TTS API.",
102
- "language": "en"
103
- }
104
-
105
- response = requests.post(
106
- f"{BASE_URL}/tts-json",
107
- headers={"Content-Type": "application/json"},
108
- data=json.dumps(data)
109
- )
110
-
111
- if response.status_code == 200:
112
- output_file = "json_tts_output.wav"
113
- with open(output_file, "wb") as f:
114
- f.write(response.content)
115
- print(f"✅ JSON TTS successful! Audio saved to: {output_file}")
116
- print(f"File size: {len(response.content)} bytes")
117
- else:
118
- print(f"❌ JSON TTS failed: {response.status_code}")
119
- print(f"Response: {response.text}")
120
- except Exception as e:
121
- print(f"❌ JSON TTS error: {e}")
122
-
123
- def main():
124
- print("🐸 Testing Coqui TTS API")
125
- print("=" * 50)
126
-
127
- # Test all endpoints
128
- test_health()
129
- test_list_models()
130
- test_simple_tts()
131
- test_json_tts()
132
-
133
- # Test voice cloning if speaker file is available
134
- # You can specify a speaker file path here
135
- speaker_file = None # Change to your speaker file path
136
- test_voice_cloning(speaker_file)
137
-
138
- print("\n🎉 API testing completed!")
139
- print("\nTo test voice cloning:")
140
- print("1. Record a short audio sample (5-10 seconds)")
141
- print("2. Save it as a .wav file")
142
- print("3. Update speaker_file variable with the file path")
143
- print("4. Run the test again")
144
-
145
- if __name__ == "__main__":
146
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_coqui_tts.py DELETED
@@ -1,99 +0,0 @@
1
- import torch
2
- from TTS.api import TTS
3
- import os
4
-
5
- def test_coqui_tts():
6
- """Test Coqui TTS functionality"""
7
-
8
- # Get device
9
- device = "cuda" if torch.cuda.is_available() else "cpu"
10
- print(f"Using device: {device}")
11
-
12
- try:
13
- # List available 🐸TTS models
14
- print("\n=== Available TTS Models ===")
15
- tts_instance = TTS()
16
- models = tts_instance.list_models()
17
-
18
- # Print first 10 models to avoid overwhelming output
19
- print("First 10 available models:")
20
- for i, model in enumerate(models[:10]):
21
- print(f"{i+1}. {model}")
22
-
23
- if len(models) > 10:
24
- print(f"... and {len(models) - 10} more models")
25
-
26
- except Exception as e:
27
- print(f"Error listing models: {e}")
28
- return
29
-
30
- try:
31
- # Initialize TTS with XTTS v2 model
32
- print("\n=== Initializing XTTS v2 Model ===")
33
- tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
34
- print("XTTS v2 model loaded successfully!")
35
-
36
- # List speakers if available
37
- print("\n=== Available Speakers ===")
38
- if hasattr(tts, 'speakers') and tts.speakers:
39
- print("Available speakers:")
40
- for speaker in tts.speakers[:10]: # Show first 10
41
- print(f"- {speaker}")
42
- if len(tts.speakers) > 10:
43
- print(f"... and {len(tts.speakers) - 10} more speakers")
44
- else:
45
- print("No preset speakers available or speakers list is empty")
46
-
47
- except Exception as e:
48
- print(f"Error initializing XTTS v2 model: {e}")
49
- print("This might be due to model download requirements or missing dependencies")
50
- return
51
-
52
- try:
53
- # Test TTS to file with preset speaker (if available)
54
- print("\n=== Testing TTS to File ===")
55
- output_file = "test_output.wav"
56
-
57
- # Check if we have speakers available
58
- if hasattr(tts, 'speakers') and tts.speakers:
59
- # Use first available speaker
60
- speaker_name = tts.speakers[0]
61
- print(f"Using speaker: {speaker_name}")
62
-
63
- tts.tts_to_file(
64
- text="Hello world! This is a test of Coqui TTS library.",
65
- speaker=speaker_name,
66
- language="en",
67
- file_path=output_file
68
- )
69
- else:
70
- # Try without speaker specification
71
- print("No speakers available, trying without speaker specification...")
72
- tts.tts_to_file(
73
- text="Hello world! This is a test of Coqui TTS library.",
74
- language="en",
75
- file_path=output_file
76
- )
77
-
78
- if os.path.exists(output_file):
79
- print(f"✅ TTS successful! Audio saved to: {output_file}")
80
- file_size = os.path.getsize(output_file)
81
- print(f"File size: {file_size} bytes")
82
- else:
83
- print("❌ TTS failed - output file not created")
84
-
85
- except Exception as e:
86
- print(f"Error during TTS generation: {e}")
87
-
88
- # Note about voice cloning
89
- print("\n=== Voice Cloning Information ===")
90
- print("To test voice cloning, you would need:")
91
- print("1. A reference audio file (speaker_wav parameter)")
92
- print("2. Use tts.tts() method instead of tts_to_file()")
93
- print("Example:")
94
- print('wav = tts.tts(text="Hello!", speaker_wav="reference.wav", language="en")')
95
-
96
- if __name__ == "__main__":
97
- print("🐸 Testing Coqui TTS Library")
98
- print("=" * 50)
99
- test_coqui_tts()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_kokoro_install.py DELETED
@@ -1,86 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Simple test script to verify Kokoro TTS installation and functionality.
4
- """
5
-
6
- import os
7
-
8
- # Set basic environment variables
9
- os.environ['NUMBA_DISABLE_JIT'] = '1'
10
-
11
- def test_kokoro_import():
12
- """Test if Kokoro can be imported"""
13
- try:
14
- from kokoro import KPipeline
15
- import soundfile as sf
16
- import torch
17
- print("✅ All required packages imported successfully!")
18
- return True
19
- except ImportError as e:
20
- print(f"❌ Import error: {e}")
21
- return False
22
-
23
- def test_kokoro_pipeline():
24
- """Test if Kokoro pipeline can be initialized"""
25
- try:
26
- from kokoro import KPipeline
27
- pipeline = KPipeline(lang_code='a')
28
- print("✅ Kokoro pipeline initialized successfully!")
29
- return True
30
- except Exception as e:
31
- print(f"❌ Pipeline initialization error: {e}")
32
- return False
33
-
34
- def test_kokoro_generation():
35
- """Test if Kokoro can generate speech"""
36
- try:
37
- from kokoro import KPipeline
38
- import soundfile as sf
39
-
40
- pipeline = KPipeline(lang_code='a')
41
- text = "Hello, this is a test of Kokoro TTS."
42
-
43
- generator = pipeline(text, voice='af_heart')
44
-
45
- for i, (gs, ps, audio) in enumerate(generator):
46
- print(f"✅ Generated audio segment {i}: gs={gs}, ps={ps}")
47
- # Save test audio
48
- sf.write('test_kokoro.wav', audio, 24000)
49
- print("✅ Test audio saved as 'test_kokoro.wav'")
50
- break # Just test the first segment
51
-
52
- return True
53
- except Exception as e:
54
- print(f"❌ Speech generation error: {e}")
55
- return False
56
-
57
- def main():
58
- """Run all tests"""
59
- print("🎤 Testing Kokoro TTS Installation")
60
- print("=" * 40)
61
-
62
- tests = [
63
- ("Import Test", test_kokoro_import),
64
- ("Pipeline Test", test_kokoro_pipeline),
65
- ("Generation Test", test_kokoro_generation)
66
- ]
67
-
68
- passed = 0
69
- total = len(tests)
70
-
71
- for test_name, test_func in tests:
72
- print(f"\n🔍 Running {test_name}...")
73
- if test_func():
74
- passed += 1
75
- else:
76
- print(f"❌ {test_name} failed!")
77
-
78
- print(f"\n📊 Results: {passed}/{total} tests passed")
79
-
80
- if passed == total:
81
- print("🎉 All tests passed! Kokoro TTS is ready to use.")
82
- else:
83
- print("⚠️ Some tests failed. Please check the installation.")
84
-
85
- if __name__ == "__main__":
86
- main()