EladSpamson committed on
Commit
5f0d37b
·
verified ·
1 Parent(s): 57dbbfb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -20
app.py CHANGED
@@ -4,12 +4,12 @@ import requests
4
  import threading
5
  import torch
6
  import librosa
7
- import psutil
8
 
9
  from flask import Flask, request, jsonify
10
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
11
 
12
- # GLOBAL concurrency lock or counter
13
  concurrent_requests = 0
14
  concurrent_requests_lock = threading.Lock()
15
 
@@ -29,24 +29,24 @@ WEBHOOK_URL = "https://hook.eu1.make.com/86zogci73u394k2uqpulp5yjjwgm8b9x"
29
  def transcribe_in_background(audio_url, file_id, company, user, file_name):
30
  global concurrent_requests
31
  try:
32
- # 1) Download audio
33
  r = requests.get(audio_url)
34
  audio_path = "/tmp/temp_audio.wav"
35
  with open(audio_path, "wb") as f:
36
  f.write(r.content)
37
 
38
- # 2) Load audio
39
  waveform, sr = librosa.load(audio_path, sr=16000)
40
  max_sec = 3600
41
  waveform = waveform[: sr * max_sec]
42
 
43
  call_duration = int(len(waveform) / sr)
44
 
45
- # 3) Transcribe in chunks
46
  chunk_sec = 25
47
  chunk_size = sr * chunk_sec
48
  chunks = [waveform[i : i + chunk_size] for i in range(0, len(waveform), chunk_size)]
49
-
50
  partial_text = ""
51
  for chunk in chunks:
52
  inputs = processor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
@@ -57,7 +57,7 @@ def transcribe_in_background(audio_url, file_id, company, user, file_name):
57
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
58
  partial_text += transcription + "\n"
59
 
60
- # 4) Post final transcription
61
  payload = {
62
  "Transcription": partial_text.strip(),
63
  "callDuration": call_duration,
@@ -69,7 +69,6 @@ def transcribe_in_background(audio_url, file_id, company, user, file_name):
69
  requests.post(WEBHOOK_URL, json=payload)
70
 
71
  except Exception as e:
72
- # 5) Handle errors
73
  error_payload = {
74
  "error": str(e),
75
  "fileId": file_id,
@@ -80,47 +79,44 @@ def transcribe_in_background(audio_url, file_id, company, user, file_name):
80
  requests.post(WEBHOOK_URL, json=error_payload)
81
 
82
  finally:
83
- # Always decrement concurrency, even on error
84
  with concurrent_requests_lock:
85
- global concurrent_requests
86
  concurrent_requests -= 1
87
 
88
  @app.route("/transcribe", methods=["POST"])
89
  def transcribe_endpoint():
90
  global concurrent_requests
91
 
92
- # 1) Check concurrency
93
  with concurrent_requests_lock:
94
  if concurrent_requests >= 1:
95
- # We only allow ONE job at a time
96
- return jsonify({"error": "Server is busy with another transcription"}), 503
97
-
98
- # If it's free, claim the slot
99
  concurrent_requests += 1
100
 
101
- # 2) Parse request
102
  data = request.get_json()
103
  audio_url = data.get("audio_url")
104
  if not audio_url:
105
- # Since we've already claimed concurrency=1, we should free it
106
  with concurrent_requests_lock:
107
  concurrent_requests -= 1
108
  return jsonify({"error": "Missing 'audio_url' in request"}), 400
109
 
110
- # 3) Read custom headers
111
  file_id = request.headers.get("fileId", "")
112
  company = request.headers.get("company", "")
113
  user = request.headers.get("user", "")
114
  file_name = request.headers.get("fileName", "")
115
 
116
- # 4) Spawn a background thread
117
  thread = threading.Thread(
118
  target=transcribe_in_background,
119
  args=(audio_url, file_id, company, user, file_name)
120
  )
121
  thread.start()
122
 
123
- # 5) Return an immediate response
124
  return jsonify({
125
  "status": "Received. Transcription in progress.",
126
  "note": "Results will be sent via webhook once done."
 
4
  import threading
5
  import torch
6
  import librosa
7
+ #import psutil # Not needed for concurrency gating, only for CPU usage checks
8
 
9
  from flask import Flask, request, jsonify
10
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
11
 
12
+ # GLOBAL concurrency counter & lock
13
  concurrent_requests = 0
14
  concurrent_requests_lock = threading.Lock()
15
 
 
29
  def transcribe_in_background(audio_url, file_id, company, user, file_name):
30
  global concurrent_requests
31
  try:
32
+ # Download audio
33
  r = requests.get(audio_url)
34
  audio_path = "/tmp/temp_audio.wav"
35
  with open(audio_path, "wb") as f:
36
  f.write(r.content)
37
 
38
+ # Load & limit to 1 hour
39
  waveform, sr = librosa.load(audio_path, sr=16000)
40
  max_sec = 3600
41
  waveform = waveform[: sr * max_sec]
42
 
43
  call_duration = int(len(waveform) / sr)
44
 
45
+ # Split into 25-second chunks
46
  chunk_sec = 25
47
  chunk_size = sr * chunk_sec
48
  chunks = [waveform[i : i + chunk_size] for i in range(0, len(waveform), chunk_size)]
49
+
50
  partial_text = ""
51
  for chunk in chunks:
52
  inputs = processor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
 
57
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
58
  partial_text += transcription + "\n"
59
 
60
+ # Post final transcription
61
  payload = {
62
  "Transcription": partial_text.strip(),
63
  "callDuration": call_duration,
 
69
  requests.post(WEBHOOK_URL, json=payload)
70
 
71
  except Exception as e:
 
72
  error_payload = {
73
  "error": str(e),
74
  "fileId": file_id,
 
79
  requests.post(WEBHOOK_URL, json=error_payload)
80
 
81
  finally:
82
+ # Decrement concurrency counter
83
  with concurrent_requests_lock:
 
84
  concurrent_requests -= 1
85
 
86
  @app.route("/transcribe", methods=["POST"])
87
  def transcribe_endpoint():
88
  global concurrent_requests
89
 
90
+ # Concurrency check: only 1 job at a time
91
  with concurrent_requests_lock:
92
  if concurrent_requests >= 1:
93
+ # Return 429 if we already have a job in progress
94
+ return jsonify({"error": "Too many requests, server is already processing another job."}), 429
 
 
95
  concurrent_requests += 1
96
 
97
+ # Parse JSON
98
  data = request.get_json()
99
  audio_url = data.get("audio_url")
100
  if not audio_url:
101
+ # Free the concurrency slot since we're not using it
102
  with concurrent_requests_lock:
103
  concurrent_requests -= 1
104
  return jsonify({"error": "Missing 'audio_url' in request"}), 400
105
 
106
+ # Read custom headers
107
  file_id = request.headers.get("fileId", "")
108
  company = request.headers.get("company", "")
109
  user = request.headers.get("user", "")
110
  file_name = request.headers.get("fileName", "")
111
 
112
+ # Spawn a background thread
113
  thread = threading.Thread(
114
  target=transcribe_in_background,
115
  args=(audio_url, file_id, company, user, file_name)
116
  )
117
  thread.start()
118
 
119
+ # Return immediately
120
  return jsonify({
121
  "status": "Received. Transcription in progress.",
122
  "note": "Results will be sent via webhook once done."