Update app.py
app.py CHANGED
@@ -14,6 +14,8 @@ for path in ["/tmp/hf_home", "/tmp/transformers_cache", "/tmp/huggingface_hub_ca
 
 # Now import the rest of the libraries
 import torch
+from pydub import AudioSegment
+import tempfile
 import torchaudio
 import soundfile as sf
 from flask import Flask, request, jsonify, send_file
@@ -98,49 +100,51 @@ def transcribe_audio():
         audio_file = request.files["audio"]
         language = request.form.get("language", "english").lower()
 
-        # Validate language
         if language not in LANGUAGE_CODES:
             return jsonify({"error": f"Unsupported language: {language}"}), 400
 
-        # Get the language code for the ASR model
         lang_code = LANGUAGE_CODES[language]
 
-        # Save
-        [... removed lines 109-134 are not shown in this view ...]
+        # Save the uploaded file temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(audio_file.filename)[-1]) as temp_audio:
+            temp_audio.write(audio_file.read())
+            temp_audio_path = temp_audio.name
+
+        # Convert to WAV if necessary
+        wav_path = temp_audio_path
+        if not audio_file.filename.lower().endswith(".wav"):
+            wav_path = os.path.join(OUTPUT_DIR, "converted_audio.wav")
+            audio = AudioSegment.from_file(temp_audio_path)
+            audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1)
+            audio.export(wav_path, format="wav")
+
+        # Load and process the WAV file
+        waveform, sr = torchaudio.load(wav_path)
+
+        # Resample if needed
+        if sr != SAMPLE_RATE:
+            waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform)
+
+        waveform = waveform / torch.max(torch.abs(waveform))
+
+        # Process audio for ASR
+        inputs = asr_processor(
+            waveform.squeeze().numpy(),
+            sampling_rate=SAMPLE_RATE,
+            return_tensors="pt",
+            language=lang_code
+        )
+
+        # Perform ASR
         with torch.no_grad():
             logits = asr_model(**inputs).logits
             ids = torch.argmax(logits, dim=-1)[0]
             transcription = asr_processor.decode(ids)
 
-        # Log the transcription
         print(f"Transcription ({language}): {transcription}")
 
         return jsonify({"transcription": transcription})
+
     except Exception as e:
         print(f"ASR error: {str(e)}")
         return jsonify({"error": f"ASR failed: {str(e)}"}), 500
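
With this change the endpoint accepts any upload format pydub/ffmpeg can decode: non-WAV files are converted to mono WAV at SAMPLE_RATE before transcription. As a rough client-side sketch (the /transcribe route and the port are assumptions; the @app.route decorator is outside the hunk shown), a call against the updated handler could look like:

import requests

# Hypothetical endpoint; adjust the host, port, and route to match the Flask app.
url = "http://localhost:7860/transcribe"

with open("sample.mp3", "rb") as f:
    response = requests.post(
        url,
        files={"audio": f},            # field name must match request.files["audio"]
        data={"language": "english"},  # must be a key in LANGUAGE_CODES
    )

print(response.json())  # {"transcription": "..."} on success, {"error": "..."} otherwise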
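
Two notes on the added block: AudioSegment.from_file relies on an ffmpeg (or libav) binary being available for anything other than plain WAV input, and the temporary upload is created with delete=False but never removed inside the hunk. A minimal cleanup helper, purely as a sketch (not part of the commit), could be invoked from a finally: clause in the handler:

import os

# Hypothetical helper: remove the temporary upload once transcription has
# finished, so repeated requests do not accumulate files under /tmp.
def cleanup_temp_file(temp_audio_path: str) -> None:
    if temp_audio_path and os.path.exists(temp_audio_path):
        os.remove(temp_audio_path)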