owiedotch committed on
Commit
84f6dd0
·
verified ·
1 Parent(s): 6ea0ef3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -22
app.py CHANGED
@@ -13,7 +13,11 @@ from pathlib import Path
13
 
14
  # Initialize the model
15
  def load_model():
16
- return SemantiCodec(token_rate=100, semantic_vocab_size=32768) # 1.40 kbps
 
 
 
 
17
 
18
  semanticodec = load_model()
19
 
@@ -22,7 +26,7 @@ def encode_audio(audio_path):
22
  """Encode audio file to tokens and return them as a file"""
23
  try:
24
  tokens = semanticodec.encode(audio_path)
25
- # Move tokens to CPU before converting to numpy
26
  if isinstance(tokens, torch.Tensor):
27
  tokens = tokens.cpu().numpy()
28
 
@@ -75,21 +79,20 @@ def decode_tokens(token_file):
75
  # Reshape to match expected format [batch, seq_len, features]
76
  tokens = tokens.reshape(1, -1, 1)
77
 
78
- # Convert to torch tensor (on CPU first)
79
  tokens = torch.tensor(tokens)
 
80
 
81
- # Check if model is on CUDA
82
- model_device = next(semanticodec.parameters()).device if hasattr(semanticodec, 'parameters') else 'cpu'
83
-
84
- # Move tokens to the same device as the model
85
- tokens = tokens.to(model_device)
86
 
87
  # Decode the tokens
88
  waveform = semanticodec.decode(tokens)
89
 
90
- # Move waveform to CPU for audio processing
91
  if isinstance(waveform, torch.Tensor):
92
- waveform = waveform.cpu().numpy()
93
 
94
  # Create in-memory file for audio
95
  output_buffer = io.BytesIO()
@@ -102,7 +105,8 @@ def decode_tokens(token_file):
102
 
103
  return output_buffer, f"Decoded {tokens.shape[1]} tokens to audio"
104
  except Exception as e:
105
- return None, f"Error decoding tokens: {str(e)}"
 
106
 
107
  @spaces.GPU(duration=80)
108
  def process_both(audio_path):
@@ -118,21 +122,15 @@ def process_both(audio_path):
118
  # Reshape to match expected format [batch, seq_len, features]
119
  tokens = tokens.reshape(1, -1, 1)
120
 
121
- # Convert back to torch tensor (on CPU first)
122
- tokens_tensor = torch.tensor(tokens)
123
-
124
- # Check if model is on CUDA
125
- model_device = next(semanticodec.parameters()).device if hasattr(semanticodec, 'parameters') else 'cpu'
126
-
127
- # Move tokens to the same device as the model
128
- tokens_tensor = tokens_tensor.to(model_device)
129
 
130
  # Decode
131
  waveform = semanticodec.decode(tokens_tensor)
132
 
133
- # Move waveform to CPU for audio processing
134
  if isinstance(waveform, torch.Tensor):
135
- waveform = waveform.cpu().numpy()
136
 
137
  # Create in-memory file
138
  output_buffer = io.BytesIO()
@@ -145,7 +143,8 @@ def process_both(audio_path):
145
 
146
  return output_buffer, f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
147
  except Exception as e:
148
- return None, f"Error processing audio: {str(e)}"
 
149
 
150
  # Create Gradio interface
151
  with gr.Blocks(title="Oterin Audio Codec") as demo:
 
13
 
14
  # Initialize the model
15
  def load_model():
16
+ model = SemantiCodec(token_rate=100, semantic_vocab_size=32768) # 1.40 kbps
17
+ # Explicitly move model to CUDA
18
+ if torch.cuda.is_available():
19
+ model = model.cuda()
20
+ return model
21
 
22
  semanticodec = load_model()
23
 
 
26
  """Encode audio file to tokens and return them as a file"""
27
  try:
28
  tokens = semanticodec.encode(audio_path)
29
+ # Move tokens to CPU only for numpy conversion
30
  if isinstance(tokens, torch.Tensor):
31
  tokens = tokens.cpu().numpy()
32
 
 
79
  # Reshape to match expected format [batch, seq_len, features]
80
  tokens = tokens.reshape(1, -1, 1)
81
 
82
+ # Convert to torch tensor and move to CUDA explicitly
83
  tokens = torch.tensor(tokens)
84
+ tokens = tokens.cuda() # Force to CUDA
85
 
86
+ # Force any tensor objects to cuda to be safe
87
+ if isinstance(tokens, torch.Tensor) and not tokens.is_cuda:
88
+ tokens = tokens.cuda()
 
 
89
 
90
  # Decode the tokens
91
  waveform = semanticodec.decode(tokens)
92
 
93
+ # Move waveform to CPU ONLY at the end for audio processing
94
  if isinstance(waveform, torch.Tensor):
95
+ waveform = waveform.detach().cpu().numpy()
96
 
97
  # Create in-memory file for audio
98
  output_buffer = io.BytesIO()
 
105
 
106
  return output_buffer, f"Decoded {tokens.shape[1]} tokens to audio"
107
  except Exception as e:
108
+ import traceback
109
+ return None, f"Error decoding tokens: {str(e)}\n{traceback.format_exc()}"
110
 
111
  @spaces.GPU(duration=80)
112
  def process_both(audio_path):
 
122
  # Reshape to match expected format [batch, seq_len, features]
123
  tokens = tokens.reshape(1, -1, 1)
124
 
125
+ # Convert back to torch tensor and move to CUDA explicitly
126
+ tokens_tensor = torch.tensor(tokens).cuda() # Force to CUDA
 
 
 
 
 
 
127
 
128
  # Decode
129
  waveform = semanticodec.decode(tokens_tensor)
130
 
131
+ # Move waveform to CPU ONLY at the end for audio processing
132
  if isinstance(waveform, torch.Tensor):
133
+ waveform = waveform.detach().cpu().numpy()
134
 
135
  # Create in-memory file
136
  output_buffer = io.BytesIO()
 
143
 
144
  return output_buffer, f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
145
  except Exception as e:
146
+ import traceback
147
+ return None, f"Error processing audio: {str(e)}\n{traceback.format_exc()}"
148
 
149
  # Create Gradio interface
150
  with gr.Blocks(title="Oterin Audio Codec") as demo: