owiedotch committed
Commit 2c32151 · verified · 1 parent: 84f6dd0

Update app.py

Files changed (1)
  1. app.py +18 -21
app.py CHANGED
@@ -13,11 +13,7 @@ from pathlib import Path
 
 # Initialize the model
 def load_model():
-    model = SemantiCodec(token_rate=100, semantic_vocab_size=32768)  # 1.40 kbps
-    # Explicitly move model to CUDA
-    if torch.cuda.is_available():
-        model = model.cuda()
-    return model
+    return SemantiCodec(token_rate=100, semantic_vocab_size=32768)  # 1.40 kbps
 
 semanticodec = load_model()
 
@@ -26,7 +22,7 @@ def encode_audio(audio_path):
     """Encode audio file to tokens and return them as a file"""
     try:
         tokens = semanticodec.encode(audio_path)
-        # Move tokens to CPU only for numpy conversion
+        # Move tokens to CPU before converting to numpy
         if isinstance(tokens, torch.Tensor):
             tokens = tokens.cpu().numpy()
 
@@ -79,20 +75,19 @@ def decode_tokens(token_file):
         # Reshape to match expected format [batch, seq_len, features]
         tokens = tokens.reshape(1, -1, 1)
 
-        # Convert to torch tensor and move to CUDA explicitly
+        # Convert to torch tensor (on CPU first)
         tokens = torch.tensor(tokens)
-        tokens = tokens.cuda()  # Force to CUDA
 
-        # Force any tensor objects to cuda to be safe
-        if isinstance(tokens, torch.Tensor) and not tokens.is_cuda:
-            tokens = tokens.cuda()
+        # Explicitly move tokens to CUDA
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        tokens = tokens.to(device)
 
         # Decode the tokens
         waveform = semanticodec.decode(tokens)
 
-        # Move waveform to CPU ONLY at the end for audio processing
+        # Move waveform to CPU for audio processing
         if isinstance(waveform, torch.Tensor):
-            waveform = waveform.detach().cpu().numpy()
+            waveform = waveform.cpu().numpy()
 
         # Create in-memory file for audio
         output_buffer = io.BytesIO()
@@ -105,8 +100,7 @@ def decode_tokens(token_file):
 
         return output_buffer, f"Decoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
-        import traceback
-        return None, f"Error decoding tokens: {str(e)}\n{traceback.format_exc()}"
+        return None, f"Error decoding tokens: {str(e)}"
 
 @spaces.GPU(duration=80)
 def process_both(audio_path):
@@ -122,15 +116,19 @@ def process_both(audio_path):
         # Reshape to match expected format [batch, seq_len, features]
         tokens = tokens.reshape(1, -1, 1)
 
-        # Convert back to torch tensor and move to CUDA explicitly
-        tokens_tensor = torch.tensor(tokens).cuda()  # Force to CUDA
+        # Convert back to torch tensor (on CPU first)
+        tokens_tensor = torch.tensor(tokens)
+
+        # Explicitly move tokens to CUDA
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        tokens_tensor = tokens_tensor.to(device)
 
         # Decode
         waveform = semanticodec.decode(tokens_tensor)
 
-        # Move waveform to CPU ONLY at the end for audio processing
+        # Move waveform to CPU for audio processing
         if isinstance(waveform, torch.Tensor):
-            waveform = waveform.detach().cpu().numpy()
+            waveform = waveform.cpu().numpy()
 
         # Create in-memory file
         output_buffer = io.BytesIO()
@@ -143,8 +141,7 @@ def process_both(audio_path):
 
         return output_buffer, f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
-        import traceback
-        return None, f"Error processing audio: {str(e)}\n{traceback.format_exc()}"
+        return None, f"Error processing audio: {str(e)}"
 
 # Create Gradio interface
 with gr.Blocks(title="Oterin Audio Codec") as demo:
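
For context, the pattern this commit converges on: create tensors on CPU, move them to an explicitly chosen torch.device, and call .cpu() before any NumPy conversion. Below is a minimal, self-contained sketch of that pattern; fake_decode is a hypothetical stand-in for semanticodec.decode, which is not reproduced here.

import torch

# Hypothetical stand-in for semanticodec.decode (an assumption, not the real model).
def fake_decode(tokens: torch.Tensor) -> torch.Tensor:
    return tokens.float().mean(dim=-1)

# Choose the device once; calling .cuda() unconditionally raises on a
# CPU-only machine, e.g. before the @spaces.GPU decorator attaches a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Build on CPU, reshape to [batch, seq_len, features], then move explicitly.
tokens = torch.tensor([3, 1, 4, 1, 5, 9]).reshape(1, -1, 1).to(device)

waveform = fake_decode(tokens)

# Return to CPU before NumPy conversion; .numpy() fails on CUDA tensors.
if isinstance(waveform, torch.Tensor):
    waveform = waveform.cpu().numpy()

print(waveform.shape)  # (1, 6)

The same code then runs unchanged on CPU-only and GPU hosts, which is why the unconditional .cuda() calls and the redundant is_cuda re-check could be dropped.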