IT2091024v2

Paused

App Files Files Community

Pijush2023 committed on Aug 7, 2024

Commit

1c2e53d

verified ·

1 Parent(s): cbe0a00

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -11

app.py CHANGED Viewed

@@ -425,19 +425,19 @@ def fetch_local_news():
     else:
         return "<p>Failed to fetch local news</p>"
-import numpy as np
-import torch
-from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
-model_id = 'openai/whisper-large-v3'
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
-processor = AutoProcessor.from_pretrained(model_id)
-pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)
-base_audio_drive = "/data/audio"
 #Normal Code with sample rate is 44100 Hz
@@ -496,7 +496,36 @@ base_audio_drive = "/data/audio"
 #Resample part -1
 import numpy as np
-from scipy.signal import resample
 def transcribe_function(stream, new_chunk):
     try:
         sr, y = new_chunk[0], new_chunk[1]
@@ -504,6 +533,10 @@ def transcribe_function(stream, new_chunk):
         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
         return stream, "", None
     # Ensure the sample rate is 16000 Hz
     target_sr = 16000
     if sr != target_sr:
@@ -525,6 +558,7 @@ def transcribe_function(stream, new_chunk):
     return stream, full_text, result
 def update_map_with_response(history):
     if not history:
         return ""

     else:
         return "<p>Failed to fetch local news</p>"
+# import numpy as np
+# import torch
+# from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
+# model_id = 'openai/whisper-large-v3'
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
+# processor = AutoProcessor.from_pretrained(model_id)
+# pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)
+# base_audio_drive = "/data/audio"
 #Normal Code with sample rate is 44100 Hz
 #Resample part -1
 import numpy as np
+import torch
+from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
+from scipy.signal import resample  # Import resample from scipy.signal
+import base64
+import io
+from pydub import AudioSegment
+model_id = 'openai/whisper-large-v3'
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
+processor = AutoProcessor.from_pretrained(model_id)
+pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)
+def is_base64_audio(data):
+    try:
+        if isinstance(data, str):
+            base64.b64decode(data)
+            return True
+        return False
+    except Exception:
+        return False
+def base64_to_float32(base64_str):
+    audio_bytes = base64.b64decode(base64_str)
+    audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes), format="wav")
+    samples = np.array(audio_segment.get_array_of_samples())
+    return audio_segment.frame_rate, samples.astype(np.float32)
 def transcribe_function(stream, new_chunk):
     try:
         sr, y = new_chunk[0], new_chunk[1]
         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
         return stream, "", None
+    # Check if input is base64 and convert to float32 if necessary
+    if is_base64_audio(y):
+        sr, y = base64_to_float32(y)
     # Ensure the sample rate is 16000 Hz
     target_sr = 16000
     if sr != target_sr:
     return stream, full_text, result
 def update_map_with_response(history):
     if not history:
         return ""