Pijush2023 committed on
Commit
1c2e53d
·
verified ·
1 Parent(s): cbe0a00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -11
app.py CHANGED
@@ -425,19 +425,19 @@ def fetch_local_news():
425
  else:
426
  return "<p>Failed to fetch local news</p>"
427
 
428
- import numpy as np
429
- import torch
430
- from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
431
 
432
- model_id = 'openai/whisper-large-v3'
433
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
434
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
435
- model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
436
- processor = AutoProcessor.from_pretrained(model_id)
437
 
438
- pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)
439
 
440
- base_audio_drive = "/data/audio"
441
 
442
  #Normal Code with sample rate is 44100 Hz
443
 
@@ -496,7 +496,36 @@ base_audio_drive = "/data/audio"
496
  #Resample part -1
497
 
498
  import numpy as np
499
- from scipy.signal import resample
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  def transcribe_function(stream, new_chunk):
501
  try:
502
  sr, y = new_chunk[0], new_chunk[1]
@@ -504,6 +533,10 @@ def transcribe_function(stream, new_chunk):
504
  print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
505
  return stream, "", None
506
 
 
 
 
 
507
  # Ensure the sample rate is 16000 Hz
508
  target_sr = 16000
509
  if sr != target_sr:
@@ -525,6 +558,7 @@ def transcribe_function(stream, new_chunk):
525
  return stream, full_text, result
526
 
527
 
 
528
  def update_map_with_response(history):
529
  if not history:
530
  return ""
 
425
  else:
426
  return "<p>Failed to fetch local news</p>"
427
 
428
+ # import numpy as np
429
+ # import torch
430
+ # from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
431
 
432
+ # model_id = 'openai/whisper-large-v3'
433
+ # device = "cuda:0" if torch.cuda.is_available() else "cpu"
434
+ # torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
435
+ # model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
436
+ # processor = AutoProcessor.from_pretrained(model_id)
437
 
438
+ # pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)
439
 
440
+ # base_audio_drive = "/data/audio"
441
 
442
  #Normal Code with sample rate is 44100 Hz
443
 
 
496
  #Resample part -1
497
 
498
  import numpy as np
499
+ import torch
500
+ from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
501
+ from scipy.signal import resample # Import resample from scipy.signal
502
+ import base64
503
+ import io
504
+ from pydub import AudioSegment
505
+
506
# Whisper checkpoint used for speech recognition.
model_id = 'openai/whisper-large-v3'

# Use the first CUDA GPU with half precision when one is available;
# otherwise fall back to the CPU with full precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Load the model weights in the selected dtype and move them to the device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype)
model = model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

# Module-level ASR pipeline built from the model and processor loaded above.
pipe_asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True,
)
513
+
514
def is_base64_audio(data):
    """Return True if *data* is a non-empty, well-formed base64 string.

    Only ``str`` inputs are considered; any other type (bytes, arrays,
    ``None``, ...) yields ``False``. Whitespace such as the line breaks of
    wrapped base64 is ignored before validation.

    Decoding uses ``validate=True``. Without it, ``base64.b64decode``
    silently discards characters outside the base64 alphabet, so ordinary
    text whose remaining characters happen to have a valid length would be
    misreported as base64.

    Args:
        data: The candidate payload.

    Returns:
        bool: ``True`` when *data* is a string that decodes as strict
        base64 to at least zero-padding-correct content and is not empty;
        ``False`` otherwise.
    """
    if not isinstance(data, str):
        return False

    # Drop whitespace so line-wrapped base64 is still accepted.
    compact = "".join(data.split())
    if not compact:
        # An empty payload decodes "successfully" to b"" but is not audio.
        return False

    try:
        base64.b64decode(compact, validate=True)
    except ValueError:
        # binascii.Error (bad alphabet/padding) is a ValueError subclass;
        # non-ASCII text raises ValueError directly.
        return False
    return True
522
+
523
def base64_to_float32(base64_str):
    """Decode base64-encoded WAV audio into a mono float32 sample array.

    The payload is base64-decoded, parsed as a WAV file with pydub, and its
    PCM samples are cast to ``float32``. The values are the raw integer
    sample values converted to float; no amplitude normalization is applied
    here.

    Args:
        base64_str: Base64 text (or bytes) containing a complete WAV file.

    Returns:
        tuple: ``(frame_rate, samples)`` where ``frame_rate`` is the sample
        rate reported by the decoded audio and ``samples`` is a 1-D
        ``numpy.float32`` array with one value per frame.

    Raises:
        binascii.Error: If *base64_str* is not decodable base64.
        Exception: Whatever ``AudioSegment.from_file`` raises when the
            decoded bytes are not readable as WAV.
    """
    audio_bytes = base64.b64decode(base64_str)
    audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes), format="wav")
    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32)

    # For multi-channel audio the samples are interleaved
    # (L, R, L, R, ...). Returning them flat would pair a doubled-length
    # stream with the per-channel frame rate, so de-interleave and average
    # the channels down to mono. Mono input is returned unchanged.
    channels = audio_segment.channels
    if channels > 1:
        samples = samples.reshape(-1, channels).mean(axis=1)

    return audio_segment.frame_rate, samples
528
+
529
  def transcribe_function(stream, new_chunk):
530
  try:
531
  sr, y = new_chunk[0], new_chunk[1]
 
533
  print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
534
  return stream, "", None
535
 
536
+ # Check if input is base64 and convert to float32 if necessary
537
+ if is_base64_audio(y):
538
+ sr, y = base64_to_float32(y)
539
+
540
  # Ensure the sample rate is 16000 Hz
541
  target_sr = 16000
542
  if sr != target_sr:
 
558
  return stream, full_text, result
559
 
560
 
561
+
562
  def update_map_with_response(history):
563
  if not history:
564
  return ""