import os
import json
import requests
import threading
import torch
import librosa

from flask import Flask, request, jsonify
from transformers import WhisperProcessor, WhisperForConditionalGeneration

###############################################################################
# 1) Configure environment & set up model
###############################################################################
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_cache"
os.environ["XDG_CACHE_HOME"] = "/tmp"

app = Flask(__name__)

model_id = "ivrit-ai/whisper-large-v3-turbo"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Force Hebrew transcription (skip auto-detect)
forced_decoder_ids = processor.get_decoder_prompt_ids(language="he", task="transcribe")
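# Note: recent transformers releases also accept language="he" and
# task="transcribe" directly as keyword arguments to model.generate(),
# as an alternative to forced_decoder_ids.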

# Where we send the final transcription
WEBHOOK_URL = "https://hook.eu1.make.com/86zogci73u394k2uqpulp5yjjwgm8b9x"
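
# For reference, the success payload posted to this webhook has the shape
# below (illustrative values; the keys match the payload built in
# transcribe_in_background):
# {
#   "Transcription": "full transcript text...",
#   "callDuration": 42,
#   "fileId": "abc123",
#   "company": "Acme",
#   "user": "dana"
# }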

###############################################################################
# 2) Background transcription function
###############################################################################
def transcribe_in_background(audio_url, file_id, company, user):
    """
    Called by a background thread. Downloads & transcribes audio,
    then sends results to your Make.com webhook.
    """
    try:
        # 1) Download the audio
        r = requests.get(audio_url)
        audio_path = "/tmp/temp_audio.wav"
        with open(audio_path, "wb") as f:
            f.write(r.content)

        # 2) Load with librosa
        waveform, sr = librosa.load(audio_path, sr=16000)

        # Optional: cap processing at the first hour of audio
        max_sec = 3600
        waveform = waveform[: sr * max_sec]

        # Duration actually analyzed, truncated to whole seconds
        call_duration = int(len(waveform) / sr)

        # 3) Split audio into 25-second chunks, comfortably inside Whisper's
        #    30-second window so each chunk is handled in one generate() pass
        chunk_sec = 25
        chunk_size = sr * chunk_sec
        chunks = [waveform[i : i + chunk_size] for i in range(0, len(waveform), chunk_size)]

        partial_text = ""
        for chunk in chunks:
            # The feature extractor's default max_length padding fills each
            # chunk's mel features out to the full 30 s window Whisper expects
            # (padding=True pads only to the longest item in the batch, which
            # can leave the final short chunk under-length)
            inputs = processor(chunk, sampling_rate=sr, return_tensors="pt")
            input_features = inputs.input_features.to(device)

            with torch.no_grad():
                predicted_ids = model.generate(
                    input_features,
                    forced_decoder_ids=forced_decoder_ids
                )

            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            partial_text += transcription + "\n"

        # 4) Post final transcription back to Make.com, including extra fields
        payload = {
            "Transcription": partial_text.strip(),
            "callDuration": call_duration,
            "fileId": file_id,
            "company": company,
            "user": user
        }
        requests.post(WEBHOOK_URL, json=payload, timeout=30)

    except Exception as e:
        # In case of errors, notify the webhook
        error_payload = {
            "error": str(e),
            "fileId": file_id,
            "company": company,
            "user": user
            # You could optionally include "callDuration" here if relevant
        }
        requests.post(WEBHOOK_URL, json=error_payload, timeout=30)

###############################################################################
# 3) Flask route: returns immediately, transcribes in a separate thread
###############################################################################
@app.route("/transcribe", methods=["POST"])
def transcribe_endpoint():
    # 1) Get JSON data from the request body (tolerating a missing or
    #    malformed body instead of raising)
    data = request.get_json(silent=True) or {}
    audio_url = data.get("audio_url")
    if not audio_url:
        return jsonify({"error": "Missing 'audio_url' in request"}), 400

    # 2) Read custom headers (fileId, company, user)
    file_id = request.headers.get("fileId", "")
    company = request.headers.get("company", "")
    user = request.headers.get("user", "")

    # 3) Spawn a thread to handle transcription
    thread = threading.Thread(
        target=transcribe_in_background,
        args=(audio_url, file_id, company, user)
    )
    thread.start()

    # 4) Immediately return a JSON response
    return jsonify({
        "status": "Received. Transcription in progress.",
        "note": "Results will be sent via webhook once done."
    }), 202
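
# A sketch of how a client would call this endpoint (illustrative host and
# values; the audio URL must be downloadable by the server):
#
#   import requests
#   requests.post(
#       "http://localhost:7860/transcribe",
#       headers={"fileId": "abc123", "company": "Acme", "user": "dana"},
#       json={"audio_url": "https://example.com/call.wav"},
#   )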

###############################################################################
# 4) Run the app directly when local; on HF Spaces, gunicorn serves it
###############################################################################
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
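
# On Hugging Face Spaces the block above is skipped and the app is served by
# gunicorn; an assumed launch command (adjust to your Space's config, and
# assuming this file is saved as app.py) would be:
#
#   gunicorn --bind 0.0.0.0:7860 --timeout 600 app:app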