from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import torch
import soundfile as sf
import os
from transformers import VitsModel, AutoTokenizer

app = Flask(__name__)
CORS(app)  # Allow cross-origin requests (e.g., from the Android client)

# Model paths for different languages (Hugging Face Hub)
MODELS = {
    "kapampangan": "facebook/mms-tts-pam",
    "tagalog": "facebook/mms-tts-tgl",
    "english": "facebook/mms-tts-eng"
}

# Load models and processors
loaded_models = {}
loaded_processors = {}

for lang, path in MODELS.items():
    print(f"Loading {lang} model from Hugging Face: {path}...")
    loaded_models[lang] = VitsModel.from_pretrained(path)
    loaded_processors[lang] = AutoTokenizer.from_pretrained(path)
    print(f"{lang.capitalize()} model loaded successfully!")

SAMPLE_RATE = 16000  # MMS-TTS models generate 16 kHz audio (matches model.config.sampling_rate)
OUTPUT_DIR = "/tmp/"  # Write to /tmp on Hugging Face Spaces (ephemeral, limited storage)

@app.route("/tts", methods=["POST"])
def generate_tts():
    """ API endpoint to generate speech based on the selected language. """
    data = request.get_json()
    text_input = data.get("text", "")
    language = data.get("language", "kapampangan").lower()

    if language not in MODELS:
        return jsonify({"error": "Invalid language. Choose 'kapampangan', 'tagalog', or 'english'."}), 400

    if not text_input:
        return jsonify({"error": "No text provided"}), 400

    print(f"Generating speech for: '{text_input}' in {language}")

    # Select the correct model and processor
    model = loaded_models[language]
    processor = loaded_processors[language]

    # Tokenize input text
    inputs = processor(text_input, return_tensors="pt")

    # Generate audio; VitsModel returns the waveform directly from its forward pass
    with torch.no_grad():
        output = model(**inputs).waveform

    waveform = output.cpu().numpy().flatten()

    # Save as WAV file
    output_filename = os.path.join(OUTPUT_DIR, f"{language}_output.wav")
    sf.write(output_filename, waveform, SAMPLE_RATE)

    return jsonify({
        "message": "TTS audio generated",
        "file_url": f"/static/{language}_output.wav"
    })

@app.route("/static/<filename>")
def serve_audio(filename):
    """ Serve the generated WAV file. """
    file_path = os.path.join(OUTPUT_DIR, filename)
    if os.path.exists(file_path):
        return send_file(file_path, mimetype="audio/wav")
    return jsonify({"error": "File not found"}), 404

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860, debug=True)
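
# --- Example client call (illustrative sketch, not part of the server) ---
# Assumes the server is reachable at http://localhost:7860 and that the
# `requests` library is installed on the client; adjust the base URL for a
# deployed Hugging Face Space.
#
#   import requests
#
#   BASE_URL = "http://localhost:7860"
#   resp = requests.post(
#       f"{BASE_URL}/tts",
#       json={"text": "Hello from the TTS server", "language": "english"},
#   )
#   resp.raise_for_status()
#   file_url = resp.json()["file_url"]
#
#   audio = requests.get(f"{BASE_URL}{file_url}")
#   audio.raise_for_status()
#   with open("tts_output.wav", "wb") as f:
#       f.write(audio.content)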