Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -425,19 +425,19 @@ def fetch_local_news():
|
|
425 |
else:
|
426 |
return "<p>Failed to fetch local news</p>"
|
427 |
|
428 |
-
import numpy as np
|
429 |
-
import torch
|
430 |
-
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
|
431 |
|
432 |
-
model_id = 'openai/whisper-large-v3'
|
433 |
-
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
434 |
-
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
435 |
-
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
|
436 |
-
processor = AutoProcessor.from_pretrained(model_id)
|
437 |
|
438 |
-
pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)
|
439 |
|
440 |
-
base_audio_drive = "/data/audio"
|
441 |
|
442 |
#Normal Code with sample rate is 44100 Hz
|
443 |
|
@@ -496,7 +496,36 @@ base_audio_drive = "/data/audio"
|
|
496 |
#Resample part -1
|
497 |
|
498 |
import numpy as np
|
499 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
500 |
def transcribe_function(stream, new_chunk):
|
501 |
try:
|
502 |
sr, y = new_chunk[0], new_chunk[1]
|
@@ -504,6 +533,10 @@ def transcribe_function(stream, new_chunk):
|
|
504 |
print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
|
505 |
return stream, "", None
|
506 |
|
|
|
|
|
|
|
|
|
507 |
# Ensure the sample rate is 16000 Hz
|
508 |
target_sr = 16000
|
509 |
if sr != target_sr:
|
@@ -525,6 +558,7 @@ def transcribe_function(stream, new_chunk):
|
|
525 |
return stream, full_text, result
|
526 |
|
527 |
|
|
|
528 |
def update_map_with_response(history):
|
529 |
if not history:
|
530 |
return ""
|
|
|
425 |
else:
|
426 |
return "<p>Failed to fetch local news</p>"
|
427 |
|
428 |
+
# import numpy as np
|
429 |
+
# import torch
|
430 |
+
# from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
|
431 |
|
432 |
+
# model_id = 'openai/whisper-large-v3'
|
433 |
+
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
434 |
+
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
435 |
+
# model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
|
436 |
+
# processor = AutoProcessor.from_pretrained(model_id)
|
437 |
|
438 |
+
# pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)
|
439 |
|
440 |
+
# base_audio_drive = "/data/audio"
|
441 |
|
442 |
#Normal Code with sample rate is 44100 Hz
|
443 |
|
|
|
496 |
#Resample part -1
|
497 |
|
498 |
import numpy as np
|
499 |
+
import torch
|
500 |
+
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
|
501 |
+
from scipy.signal import resample # Import resample from scipy.signal
|
502 |
+
import base64
|
503 |
+
import io
|
504 |
+
from pydub import AudioSegment
|
505 |
+
|
506 |
+
# Checkpoint identifier handed to both the model and the processor loaders.
model_id = 'openai/whisper-large-v3'

# Use the first CUDA device with half precision when CUDA is available;
# otherwise fall back to the CPU with full precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Load the speech seq2seq model in the chosen dtype, then move it to the device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype)
model = model.to(device)

# The processor supplies the tokenizer and feature extractor used below.
processor = AutoProcessor.from_pretrained(model_id)

# Module-level ASR pipeline built from the model and processor loaded above.
pipe_asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True,
)
|
513 |
+
|
514 |
+
def is_base64_audio(data):
    """Return True when *data* is a non-empty, well-formed base64 string.

    Only ``str`` input is considered; any other type (bytes, arrays, None)
    yields False. Whitespace such as line breaks in wrapped base64 is ignored
    before checking. The payload is only checked for base64 well-formedness;
    this function does not inspect whether the decoded bytes are audio.

    Args:
        data: Candidate value to test.

    Returns:
        bool: True if *data* is a str that decodes as strict base64.
    """
    if not isinstance(data, str):
        return False

    # Drop all whitespace so line-wrapped base64 is still accepted.
    compact = "".join(data.split())
    if not compact:
        # An empty payload decodes to b"" and cannot be audio.
        return False

    try:
        # validate=True rejects characters outside the base64 alphabet.
        # Without it, b64decode silently discards them, so arbitrary text
        # could be mistaken for base64.
        base64.b64decode(compact, validate=True)
    except ValueError:
        # binascii.Error (bad alphabet/padding) subclasses ValueError;
        # non-ASCII text raises ValueError directly.
        return False
    return True
|
522 |
+
|
523 |
+
def base64_to_float32(base64_str):
    """Decode base64-encoded WAV data into a sample rate and float32 samples.

    The decoded bytes are parsed by ``AudioSegment.from_file`` with
    ``format="wav"``. The samples returned by ``get_array_of_samples()`` are
    cast to ``np.float32`` as-is; no rescaling is applied here.

    Args:
        base64_str: Base64 text containing WAV file bytes.

    Returns:
        tuple: ``(frame_rate, samples)`` where ``samples`` is a float32
        NumPy array.
    """
    wav_buffer = io.BytesIO(base64.b64decode(base64_str))
    segment = AudioSegment.from_file(wav_buffer, format="wav")
    # NOTE(review): for multi-channel audio the sample layout is whatever
    # get_array_of_samples() returns — confirm callers expect that.
    pcm = np.array(segment.get_array_of_samples())
    return segment.frame_rate, pcm.astype(np.float32)
|
528 |
+
|
529 |
def transcribe_function(stream, new_chunk):
|
530 |
try:
|
531 |
sr, y = new_chunk[0], new_chunk[1]
|
|
|
533 |
print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
|
534 |
return stream, "", None
|
535 |
|
536 |
+
# Check if input is base64 and convert to float32 if necessary
|
537 |
+
if is_base64_audio(y):
|
538 |
+
sr, y = base64_to_float32(y)
|
539 |
+
|
540 |
# Ensure the sample rate is 16000 Hz
|
541 |
target_sr = 16000
|
542 |
if sr != target_sr:
|
|
|
558 |
return stream, full_text, result
|
559 |
|
560 |
|
561 |
+
|
562 |
def update_map_with_response(history):
|
563 |
if not history:
|
564 |
return ""
|