Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

import soundfile as sf
import torch
import torchaudio
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
from transformers import VitsModel, AutoTokenizer

app = Flask(__name__)
CORS(app)  # Allow cross-origin calls from the Android client.

# Hugging Face Hub checkpoints for each supported language.
MODELS = {
    "kapampangan": "facebook/mms-tts-pam",
    "tagalog": "facebook/mms-tts-tgl",
    "english": "facebook/mms-tts-eng",
}

# Model/tokenizer pairs are loaded eagerly at startup so that request
# handling never pays the (large) download/initialization cost.
loaded_models = {}
loaded_processors = {}

for language_key, checkpoint in MODELS.items():
    print(f"Loading {language_key} model from Hugging Face: {checkpoint}...")
    loaded_models[language_key] = VitsModel.from_pretrained(checkpoint)
    loaded_processors[language_key] = AutoTokenizer.from_pretrained(checkpoint)
    print(f"{language_key.capitalize()} model loaded successfully!")

SAMPLE_RATE = 16000  # Default sample rate
OUTPUT_DIR = "/tmp/"  # Use /tmp for Hugging Face Spaces (limited storage)
|
32 |
+
@app.route("/tts", methods=["POST"])
def generate_tts():
    """Generate speech for the posted text in the selected language.

    Expects a JSON body: {"text": "...", "language": "kapampangan" |
    "tagalog" | "english"} (language defaults to "kapampangan").
    Writes a WAV file into OUTPUT_DIR and returns JSON containing a
    relative URL served by the /static/<filename> route, or a 400 error
    for an unknown language / missing text.
    """
    data = request.get_json()
    text_input = data.get("text", "")
    language = data.get("language", "kapampangan").lower()

    if language not in MODELS:
        return jsonify({"error": "Invalid language. Choose 'kapampangan', 'tagalog', or 'english'."}), 400

    if not text_input:
        return jsonify({"error": "No text provided"}), 400

    print(f"Generating speech for: '{text_input}' in {language}")

    # Select the model/tokenizer pair preloaded at startup.
    model = loaded_models[language]
    processor = loaded_processors[language]

    # Tokenize input text.
    inputs = processor(text_input, return_tensors="pt")

    # BUG FIX: VitsModel is not an autoregressive LM and has no usable
    # .generate() method; per the MMS-TTS documentation the waveform is
    # produced by a plain forward pass and read from `.waveform`.
    with torch.no_grad():
        output = model(**inputs).waveform

    waveform = output.cpu().numpy().flatten()

    # Save as a WAV file (MMS VITS checkpoints synthesize at 16 kHz,
    # matching SAMPLE_RATE).
    output_filename = os.path.join(OUTPUT_DIR, f"{language}_output.wav")
    sf.write(output_filename, waveform, SAMPLE_RATE)

    return jsonify({
        "message": "TTS audio generated",
        "file_url": f"/static/{language}_output.wav"
    })
|
68 |
+
|
69 |
+
@app.route("/static/<filename>")
def serve_audio(filename):
    """Serve a previously generated WAV file from OUTPUT_DIR.

    Returns the file with an audio/wav mimetype, or a JSON 404 when no
    such file exists.
    """
    # SECURITY FIX: reduce the request-supplied name to its basename so a
    # crafted path (e.g. encoded "../") cannot escape OUTPUT_DIR and read
    # arbitrary files from the host.
    safe_name = os.path.basename(filename)
    file_path = os.path.join(OUTPUT_DIR, safe_name)
    if os.path.exists(file_path):
        return send_file(file_path, mimetype="audio/wav")
    return jsonify({"error": "File not found"}), 404
|
76 |
+
|
77 |
+
if __name__ == "__main__":
    # Port 7860 is the conventional Hugging Face Spaces port.
    # NOTE(review): debug=True enables the Werkzeug interactive debugger,
    # which allows remote code execution when the server is reachable on
    # 0.0.0.0 -- disable it for any non-local deployment.
    app.run(host="0.0.0.0", port=7860, debug=True)
|