Spaces:

owiedotch
/

oac

Sleeping

App Files Community

owiedotch committed on Feb 27

Commit

84f6dd0

verified ·

1 Parent(s): 6ea0ef3

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -22

app.py CHANGED Viewed

@@ -13,7 +13,11 @@ from pathlib import Path
 # Initialize the model
 def load_model():
-    return SemantiCodec(token_rate=100, semantic_vocab_size=32768)  # 1.40 kbps
 semanticodec = load_model()
@@ -22,7 +26,7 @@ def encode_audio(audio_path):
     """Encode audio file to tokens and return them as a file"""
     try:
         tokens = semanticodec.encode(audio_path)
-        # Move tokens to CPU before converting to numpy
         if isinstance(tokens, torch.Tensor):
             tokens = tokens.cpu().numpy()
@@ -75,21 +79,20 @@ def decode_tokens(token_file):
                 # Reshape to match expected format [batch, seq_len, features]
                 tokens = tokens.reshape(1, -1, 1)
-            # Convert to torch tensor (on CPU first)
             tokens = torch.tensor(tokens)
-        # Check if model is on CUDA
-        model_device = next(semanticodec.parameters()).device if hasattr(semanticodec, 'parameters') else 'cpu'
-        # Move tokens to the same device as the model
-        tokens = tokens.to(model_device)
         # Decode the tokens
         waveform = semanticodec.decode(tokens)
-        # Move waveform to CPU for audio processing
         if isinstance(waveform, torch.Tensor):
-            waveform = waveform.cpu().numpy()
         # Create in-memory file for audio
         output_buffer = io.BytesIO()
@@ -102,7 +105,8 @@ def decode_tokens(token_file):
         return output_buffer, f"Decoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
-        return None, f"Error decoding tokens: {str(e)}"
 @spaces.GPU(duration=80)
 def process_both(audio_path):
@@ -118,21 +122,15 @@ def process_both(audio_path):
             # Reshape to match expected format [batch, seq_len, features]
             tokens = tokens.reshape(1, -1, 1)
-        # Convert back to torch tensor (on CPU first)
-        tokens_tensor = torch.tensor(tokens)
-        # Check if model is on CUDA
-        model_device = next(semanticodec.parameters()).device if hasattr(semanticodec, 'parameters') else 'cpu'
-        # Move tokens to the same device as the model
-        tokens_tensor = tokens_tensor.to(model_device)
         # Decode
         waveform = semanticodec.decode(tokens_tensor)
-        # Move waveform to CPU for audio processing
         if isinstance(waveform, torch.Tensor):
-            waveform = waveform.cpu().numpy()
         # Create in-memory file
         output_buffer = io.BytesIO()
@@ -145,7 +143,8 @@ def process_both(audio_path):
         return output_buffer, f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
-        return None, f"Error processing audio: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Oterin Audio Codec") as demo:

 # Initialize the model
 def load_model():
+    model = SemantiCodec(token_rate=100, semantic_vocab_size=32768)  # 1.40 kbps
+    # Explicitly move model to CUDA
+    if torch.cuda.is_available():
+        model = model.cuda()
+    return model
 semanticodec = load_model()
     """Encode audio file to tokens and return them as a file"""
     try:
         tokens = semanticodec.encode(audio_path)
+        # Move tokens to CPU only for numpy conversion
         if isinstance(tokens, torch.Tensor):
             tokens = tokens.cpu().numpy()
                 # Reshape to match expected format [batch, seq_len, features]
                 tokens = tokens.reshape(1, -1, 1)
+            # Convert to torch tensor and move to CUDA explicitly
             tokens = torch.tensor(tokens)
+            tokens = tokens.cuda()  # Force to CUDA
+        # Force any tensor objects to cuda to be safe
+        if isinstance(tokens, torch.Tensor) and not tokens.is_cuda:
+            tokens = tokens.cuda()
         # Decode the tokens
         waveform = semanticodec.decode(tokens)
+        # Move waveform to CPU ONLY at the end for audio processing
         if isinstance(waveform, torch.Tensor):
+            waveform = waveform.detach().cpu().numpy()
         # Create in-memory file for audio
         output_buffer = io.BytesIO()
         return output_buffer, f"Decoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
+        import traceback
+        return None, f"Error decoding tokens: {str(e)}\n{traceback.format_exc()}"
 @spaces.GPU(duration=80)
 def process_both(audio_path):
             # Reshape to match expected format [batch, seq_len, features]
             tokens = tokens.reshape(1, -1, 1)
+        # Convert back to torch tensor and move to CUDA explicitly
+        tokens_tensor = torch.tensor(tokens).cuda()  # Force to CUDA
         # Decode
         waveform = semanticodec.decode(tokens_tensor)
+        # Move waveform to CPU ONLY at the end for audio processing
         if isinstance(waveform, torch.Tensor):
+            waveform = waveform.detach().cpu().numpy()
         # Create in-memory file
         output_buffer = io.BytesIO()
         return output_buffer, f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
+        import traceback
+        return None, f"Error processing audio: {str(e)}\n{traceback.format_exc()}"
 # Create Gradio interface
 with gr.Blocks(title="Oterin Audio Codec") as demo: