import threading

import librosa
import requests
import torch
from flask import Flask, request, jsonify
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# GLOBAL concurrency counter & lock
concurrent_requests = 0
concurrent_requests_lock = threading.Lock()

app = Flask(__name__)

model_id = "ivrit-ai/whisper-large-v3-turbo"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

forced_decoder_ids = processor.get_decoder_prompt_ids(language="he", task="transcribe")

WEBHOOK_URL = "https://hook.eu1.make.com/86zogci73u394k2uqpulp5yjjwgm8b9x"


def transcribe_in_background(audio_url, file_id, company, user, file_name):
    global concurrent_requests
    try:
        # Download audio
        r = requests.get(audio_url)
        r.raise_for_status()
        audio_path = "/tmp/temp_audio.wav"
        with open(audio_path, "wb") as f:
            f.write(r.content)

        # Load audio & limit to 1 hour
        waveform, sr = librosa.load(audio_path, sr=16000)
        max_sec = 3600
        waveform = waveform[: sr * max_sec]
        call_duration = int(len(waveform) / sr)

        # Transcribe in 25-second chunks
        chunk_sec = 25
        chunk_size = sr * chunk_sec
        chunks = [waveform[i : i + chunk_size] for i in range(0, len(waveform), chunk_size)]

        partial_text = ""
        for chunk in chunks:
            inputs = processor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
            input_features = inputs.input_features.to(device)

            with torch.no_grad():
                predicted_ids = model.generate(
                    input_features,
                    forced_decoder_ids=forced_decoder_ids
                )

            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            partial_text += transcription + "\n"

        # Send result to webhook
        payload = {
            "Transcription": partial_text.strip(),
            "callDuration": call_duration,
            "fileId": file_id,
            "company": company,
            "user": user,
            "fileName": file_name
        }
        requests.post(WEBHOOK_URL, json=payload)

    except Exception as e:
        error_payload = {
            "error": str(e),
            "fileId": file_id,
            "company": company,
            "user": user,
            "fileName": file_name
        }
        requests.post(WEBHOOK_URL, json=error_payload)

    finally:
        # Decrement concurrency count
        with concurrent_requests_lock:
            concurrent_requests -= 1


@app.route("/transcribe", methods=["POST"])
def transcribe_endpoint():
    global concurrent_requests

    # We only allow ONE job at a time:
    with concurrent_requests_lock:
        if concurrent_requests >= 1:
            # Return a 200 (OK) and a JSON message
            return jsonify({
                "message": "Server is already processing another job, please try again later."
            }), 200

        # If it's free, occupy the slot
        concurrent_requests += 1

    # Parse the body defensively so a missing/invalid JSON payload
    # doesn't leave the slot claimed forever
    data = request.get_json(silent=True) or {}
    audio_url = data.get("audio_url")
    if not audio_url:
        # If missing the audio_url, free the slot we claimed
        with concurrent_requests_lock:
            concurrent_requests -= 1
        return jsonify({"error": "Missing 'audio_url' in request"}), 400

    # Read headers
    file_id = request.headers.get("fileId", "")
    company = request.headers.get("company", "")
    user = request.headers.get("user", "")
    file_name = request.headers.get("fileName", "")

    # Spawn a background thread
    thread = threading.Thread(
        target=transcribe_in_background,
        args=(audio_url, file_id, company, user, file_name)
    )
    thread.start()

    return jsonify({
        "status": "Received. Transcription in progress.",
        "note": "Results will be sent via webhook once done."
    }), 202


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
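
# --- Example client call (illustrative sketch, not part of the app) ---
# The endpoint expects a JSON body with "audio_url" plus optional metadata
# passed as HTTP headers (fileId, company, user, fileName). Assuming the
# server is reachable at http://localhost:7860 and using a placeholder
# audio URL, a request could look like this:
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/transcribe",
#       json={"audio_url": "https://example.com/recording.wav"},
#       headers={
#           "fileId": "12345",
#           "company": "ExampleCo",
#           "user": "example-user",
#           "fileName": "recording.wav",
#       },
#   )
#   print(resp.status_code, resp.json())
#
# A 202 response means the job was accepted and the transcription will be
# posted to WEBHOOK_URL later; a 200 with a "message" field means the single
# worker slot is currently busy.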