archit11 committed
Commit b268601 · verified · 1 Parent(s): 70351e3

Update app.py

Files changed (1): app.py (+19 -8)
app.py CHANGED
@@ -2,7 +2,8 @@ import transformers
 import gradio as gr
 import torch
 import numpy as np
-from typing import Dict, List
+from typing import Dict, List, Tuple
+import librosa  # needed for the resampling step below
 import spaces
 
 # Constants
@@ -11,12 +12,15 @@ SAMPLE_RATE = 16000
 MAX_NEW_TOKENS = 256
 
 # Load the pipeline
-pipe = transformers.pipeline(
-    model=MODEL_NAME,
-    trust_remote_code=True,
-    device=0,
-    torch_dtype='bfloat16'
-)
+def load_pipeline():
+    return transformers.pipeline(
+        model=MODEL_NAME,
+        trust_remote_code=True,
+        device=0,
+        torch_dtype=torch.bfloat16
+    )
+
+pipe = load_pipeline()
 
 def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
     return [
@@ -25,17 +29,25 @@ def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
     ]
 
 @spaces.GPU(duration=120)
-def transcribe_and_respond(audio: np.ndarray) -> str:
+def transcribe_and_respond(audio_input: Tuple[int, np.ndarray]) -> str:
     try:
+        # Unpack the (sample_rate, samples) tuple that Gradio audio inputs provide
+        sample_rate, audio = audio_input
+
         # Ensure audio is float32
         if audio.dtype != np.float32:
             audio = audio.astype(np.float32)
 
+        # Resample to the model's expected rate if necessary
+        if sample_rate != SAMPLE_RATE:
+            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
+
         # Create input for the pipeline
         turns = create_conversation_turns("<|audio|>")
         inputs = {
             'audio': audio,
             'turns': turns,
+            'sampling_rate': SAMPLE_RATE
         }
 
         # Generate response
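
For context, a minimal sketch of how the updated handler could be wired up in the rest of app.py, which this diff does not show. The gr.Interface call and Audio component options below are assumptions (Gradio 4.x API), not part of this commit; with type="numpy", the Audio component passes exactly the (sample_rate, np.ndarray) tuple the new signature expects.

import gradio as gr

# Hypothetical wiring, not shown in the diff: with type="numpy", Gradio
# calls the handler with a (sample_rate, np.ndarray) tuple, matching the
# new transcribe_and_respond signature.
demo = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()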
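
And a quick self-contained check of the resampling step in isolation (the 440 Hz test tone and 44.1 kHz input rate are arbitrary; librosa's keyword-only orig_sr/target_sr signature is assumed):

import numpy as np
import librosa

SAMPLE_RATE = 16000  # target rate, as in app.py

# One second of a 440 Hz tone captured at 44.1 kHz...
sr_in = 44100
t = np.linspace(0, 1, sr_in, endpoint=False)
tone = np.sin(2 * np.pi * 440 * t).astype(np.float32)

# ...comes out as exactly one second of samples at 16 kHz after resampling.
resampled = librosa.resample(tone, orig_sr=sr_in, target_sr=SAMPLE_RATE)
print(resampled.shape)  # (16000,)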