Kapamtalk / app.py
Coco-18's picture
Update app.py
e085921 verified
raw
history blame
5.01 kB
import os
import tempfile

import soundfile as sf
import torch
import torchaudio
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
from transformers import Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
# Point the Hugging Face and Torch cache locations at directories under /tmp.
# NOTE(review): these are assigned after `transformers` has already been
# imported at the top of the file; libraries that read these variables at
# import time may not pick them up -- confirm.
os.environ.update(
    {
        "HF_HOME": "/tmp/hf_home",
        "TRANSFORMERS_CACHE": "/tmp/transformers_cache",
        "HUGGINGFACE_HUB_CACHE": "/tmp/huggingface_cache",
        "TORCH_HOME": "/tmp/torch_home",
    }
)

# Flask application with CORS enabled through flask_cors.
app = Flask(__name__)
CORS(app)
# ASR model and processor, loaded once at import time.
# NOTE(review): the previous comment named facebook/mms-1b-all, but the
# checkpoint actually loaded is the one below -- presumably a derivative.
ASR_MODEL_ID = "Coco-18/mms-asr-tgl-en-safetensor"
# Pass cache_dir explicitly, as the TTS loaders do, so the download location
# does not depend on environment variables being set before `transformers`
# was imported.
asr_processor = AutoProcessor.from_pretrained(
    ASR_MODEL_ID, cache_dir="/tmp/huggingface_cache"
)
asr_model = Wav2Vec2ForCTC.from_pretrained(
    ASR_MODEL_ID, cache_dir="/tmp/huggingface_cache"
)
# Maps the language names accepted by the API to the codes handed to the
# ASR processor.
LANGUAGE_CODES = dict(
    kapampangan="pam",
    tagalog="tgl",
    english="eng",
)

# Maps each language name to the Hugging Face model id used for TTS.
TTS_MODELS = dict(
    kapampangan="facebook/mms-tts-pam",
    tagalog="facebook/mms-tts-tgl",
    english="facebook/mms-tts-eng",
)
# Per-language TTS model and tokenizer, keyed by the names in TTS_MODELS.
# A language whose assets fail to load is kept in both maps with a None value
# so request handlers can detect that it is unavailable.
tts_models = {}
tts_processors = {}
_TTS_CACHE_DIR = "/tmp/huggingface_cache"
for lang, model_id in TTS_MODELS.items():
    try:
        tts_models[lang] = VitsModel.from_pretrained(
            model_id, cache_dir=_TTS_CACHE_DIR
        )
        tts_processors[lang] = AutoTokenizer.from_pretrained(
            model_id, cache_dir=_TTS_CACHE_DIR
        )
        # The success marker had been corrupted into mojibake; restored.
        print(f"✅ TTS Model loaded: {lang}")
    except Exception as e:
        # Best effort: one broken language must not stop the app from starting.
        print(f"❌ Error loading {lang} TTS model: {e}")
        tts_models[lang] = None
        # Keep both maps aligned so a half-loaded language is never used.
        tts_processors[lang] = None
# Directory where uploaded and generated audio files are written.
OUTPUT_DIR = "/tmp/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Sampling rate (Hz) used for ASR input and when writing TTS output.
SAMPLE_RATE = 16_000
@app.route("/", methods=["GET"])
def home():
    """Return a small JSON payload stating that the API is running."""
    status_payload = {"message": "Speech API is running."}
    return jsonify(status_payload)
def _prepare_asr_waveform(waveform, sr):
    """Downmix to mono, resample to SAMPLE_RATE and peak-normalize a waveform.

    Returns a 1-D tensor. Raises ValueError when the audio has no samples.
    """
    if waveform.numel() == 0:
        raise ValueError("audio file contains no samples")
    # Multi-channel audio would otherwise reach the processor as a 2-D array.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != SAMPLE_RATE:
        waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform)
    # Peak-normalize, but leave pure silence untouched to avoid 0/0 -> NaN.
    peak = torch.max(torch.abs(waveform))
    if peak > 0:
        waveform = waveform / peak
    return waveform.reshape(-1)


@app.route("/asr", methods=["POST"])
def transcribe_audio():
    """Transcribe an uploaded audio file with the ASR model.

    Expects a multipart form with an ``audio`` file part and an optional
    ``language`` field (default ``"english"``; must be a key of
    ``LANGUAGE_CODES``).

    Returns:
        JSON ``{"transcription": ...}`` on success. JSON ``{"error": ...}``
        with status 400 when the file is missing, the language is unsupported
        or the audio cannot be processed, and status 500 for other failures.
    """
    try:
        if "audio" not in request.files:
            return jsonify({"error": "No audio file uploaded"}), 400

        audio_file = request.files["audio"]
        language = request.form.get("language", "english").lower()

        # Validate language
        if language not in LANGUAGE_CODES:
            return jsonify({"error": f"Unsupported language: {language}"}), 400

        # Get the language code for the ASR model
        lang_code = LANGUAGE_CODES[language]

        # Save the upload to a unique temporary file: a fixed file name would
        # let concurrent requests overwrite each other's audio.
        fd, audio_path = tempfile.mkstemp(
            prefix="input_audio_", suffix=".wav", dir=OUTPUT_DIR
        )
        os.close(fd)
        try:
            audio_file.save(audio_path)

            # Load and process audio
            try:
                waveform, sr = torchaudio.load(audio_path)
                mono = _prepare_asr_waveform(waveform, sr)
                # NOTE(review): `language` is forwarded to the processor call
                # as in the original code; confirm the processor honours it.
                inputs = asr_processor(
                    mono.numpy(),
                    sampling_rate=SAMPLE_RATE,
                    return_tensors="pt",
                    language=lang_code,
                )
            except Exception as e:
                return jsonify({"error": f"Error processing audio: {str(e)}"}), 400
        finally:
            # The upload is only needed until it has been decoded.
            try:
                os.remove(audio_path)
            except OSError:
                pass

        # Transcribe
        with torch.no_grad():
            logits = asr_model(**inputs).logits
        ids = torch.argmax(logits, dim=-1)[0]
        transcription = asr_processor.decode(ids)

        # Log the transcription
        print(f"Transcription ({language}): {transcription}")
        return jsonify({"transcription": transcription})
    except Exception as e:
        print(f"ASR error: {str(e)}")
        return jsonify({"error": f"ASR failed: {str(e)}"}), 500
@app.route("/tts", methods=["POST"])
def generate_tts():
    """Synthesize speech for the given text and store it as a WAV file.

    Expects a JSON object with ``text`` (required, non-empty) and
    ``language`` (optional, default ``"kapampangan"``; must be a key of
    ``TTS_MODELS``).

    Returns:
        JSON ``{"file_url": "/download/<language>_tts.wav"}`` on success.
        JSON ``{"error": ...}`` with status 400 for a malformed body, an
        invalid language or missing text, and status 500 when the model is
        unavailable or synthesis fails.
    """
    try:
        # silent=True yields None instead of raising on a missing or invalid
        # JSON body, so the client gets a 400 rather than a generic 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400

        raw_text = data.get("text", "")
        raw_language = data.get("language", "kapampangan")
        text_input = raw_text.strip() if isinstance(raw_text, str) else ""
        language = raw_language.lower() if isinstance(raw_language, str) else ""

        if language not in TTS_MODELS:
            return jsonify({"error": "Invalid language"}), 400
        if not text_input:
            return jsonify({"error": "No text provided"}), 400
        if tts_models[language] is None:
            return jsonify({"error": "TTS model not available"}), 500

        processor = tts_processors[language]
        model = tts_models[language]
        inputs = processor(text_input, return_tensors="pt")

        # VITS synthesizes through its forward pass and exposes the audio as
        # `.waveform`; it is not driven through `generate()`.
        with torch.no_grad():
            output = model(**inputs).waveform
        waveform = output.cpu().numpy().flatten()

        # Prefer the rate the model declares; fall back to the app default.
        sample_rate = getattr(model.config, "sampling_rate", SAMPLE_RATE)

        # NOTE(review): one output file per language is reused across
        # requests, so concurrent requests for a language overwrite each other.
        output_filename = os.path.join(OUTPUT_DIR, f"{language}_tts.wav")
        sf.write(output_filename, waveform, sample_rate)
        return jsonify({"file_url": f"/download/{language}_tts.wav"})
    except Exception as e:
        return jsonify({"error": f"TTS failed: {e}"}), 500
@app.route("/download/<filename>", methods=["GET"])
def download_audio(filename):
    """Send a file from OUTPUT_DIR as a WAV attachment.

    Args:
        filename: Name of a file directly inside ``OUTPUT_DIR``.

    Returns:
        The file as an ``audio/wav`` attachment, or JSON
        ``{"error": "File not found"}`` with status 404.
    """
    # Accept plain file names only, so the joined path cannot leave
    # OUTPUT_DIR through separators or "..".
    is_plain_name = (
        filename == os.path.basename(filename) and filename not in ("", ".", "..")
    )
    if is_plain_name:
        file_path = os.path.join(OUTPUT_DIR, filename)
        # isfile, not exists: a directory name must give 404, not a 500.
        if os.path.isfile(file_path):
            return send_file(file_path, mimetype="audio/wav", as_attachment=True)
    return jsonify({"error": "File not found"}), 404
if __name__ == "__main__":
    # The server listens on all interfaces, and Flask's debug mode enables an
    # interactive debugger that can run arbitrary code. Debug is therefore off
    # unless explicitly requested through FLASK_DEBUG.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true"}
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)