Spaces:

owiedotch
/

oac

Sleeping

App Files Files Community

owiedotch committed on Feb 27

Commit

16120e1

verified ·

1 Parent(s): dc59c25

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -5

app.py CHANGED Viewed

@@ -10,11 +10,12 @@ import tempfile
 import io
 import uuid
 import pickle
 from pathlib import Path
 # Initialize the model and ensure it's on the correct device
 def load_model():
-    model = SemantiCodec(token_rate=100, semantic_vocab_size=32768)  # 1.40 kbps
     if torch.cuda.is_available():
         # Move the model to CUDA
         model.to("cuda:0")
@@ -26,6 +27,9 @@ semanticodec = load_model()
 model_device = "cuda:0" if torch.cuda.is_available() else "cpu"
 print(f"Model initialized on device: {model_device}")
 @spaces.GPU(duration=20)
 def encode_audio(audio_path):
     """Encode audio file to tokens and return them as a file"""
@@ -106,12 +110,11 @@ def decode_tokens(token_file):
         # Extract audio data - this should be a numpy array
         audio_data = waveform[0, 0]  # Shape should be [time]
-        sample_rate = 16000
         print(f"Audio data shape: {audio_data.shape}, dtype: {audio_data.dtype}")
         # Return in Gradio Audio compatible format: (sample_rate, audio_data)
-        return (sample_rate, audio_data), f"Decoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
         print(f"Decoding error: {str(e)}")
         return None, f"Error decoding tokens: {str(e)}"
@@ -155,16 +158,98 @@ def process_both(audio_path):
         # Extract audio data - this should be a numpy array
         audio_data = waveform[0, 0]  # Shape should be [time]
-        sample_rate = 16000
         print(f"Audio data shape: {audio_data.shape}, dtype: {audio_data.dtype}")
         # Return in Gradio Audio compatible format: (sample_rate, audio_data)
-        return (sample_rate, audio_data), f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
         print(f"Processing error: {str(e)}")
         return None, f"Error processing audio: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Oterin Audio Codec") as demo:
     gr.Markdown("# Oterin Audio Codec")
@@ -186,6 +271,19 @@ with gr.Blocks(title="Oterin Audio Codec") as demo:
         decode_btn = gr.Button("Decode")
         decode_btn.click(decode_tokens, inputs=decode_input, outputs=[decode_output, decode_status])
     with gr.Tab("Both (Encode & Decode)"):
         with gr.Row():
             both_input = gr.Audio(type="filepath", label="Input Audio")

 import io
 import uuid
 import pickle
+import time
 from pathlib import Path
 # Initialize the model and ensure it's on the correct device
 def load_model():
+    model = SemantiCodec(token_rate=100, semantic_vocab_size=16384)  # 1.35 kbps
     if torch.cuda.is_available():
         # Move the model to CUDA
         model.to("cuda:0")
 model_device = "cuda:0" if torch.cuda.is_available() else "cpu"
 print(f"Model initialized on device: {model_device}")
+# Define sample rate as a constant
+SAMPLE_RATE = 32000
 @spaces.GPU(duration=20)
 def encode_audio(audio_path):
     """Encode audio file to tokens and return them as a file"""
         # Extract audio data - this should be a numpy array
         audio_data = waveform[0, 0]  # Shape should be [time]
         print(f"Audio data shape: {audio_data.shape}, dtype: {audio_data.dtype}")
         # Return in Gradio Audio compatible format: (sample_rate, audio_data)
+        return (SAMPLE_RATE, audio_data), f"Decoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
         print(f"Decoding error: {str(e)}")
         return None, f"Error decoding tokens: {str(e)}"
         # Extract audio data - this should be a numpy array
         audio_data = waveform[0, 0]  # Shape should be [time]
         print(f"Audio data shape: {audio_data.shape}, dtype: {audio_data.dtype}")
         # Return in Gradio Audio compatible format: (sample_rate, audio_data)
+        return (SAMPLE_RATE, audio_data), f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
         print(f"Processing error: {str(e)}")
         return None, f"Error processing audio: {str(e)}"
+@spaces.GPU(duration=360)
+def stream_decode_tokens(token_file):
+    """Decode tokens to audio in streaming chunks"""
+    # Ensure the file exists and has content
+    if not token_file or not os.path.exists(token_file):
+        yield None, "Error: Empty or missing token file"
+        return
+    try:
+        # Load tokens using pickle instead of numpy load
+        with open(token_file, "rb") as f:
+            token_data = pickle.load(f)
+        tokens = token_data['tokens']
+        intended_device = token_data.get('device', model_device)
+        print(f"Loaded tokens with shape {tokens.shape}, intended device: {intended_device}")
+        # If tokens are too small, decode all at once
+        if tokens.shape[1] < 500:
+            # Convert to torch tensor with Long dtype for embedding
+            tokens_tensor = torch.tensor(tokens, dtype=torch.long)
+            tokens_tensor = tokens_tensor.to(model_device)
+            # Decode the tokens
+            waveform = semanticodec.decode(tokens_tensor)
+            if isinstance(waveform, torch.Tensor):
+                waveform = waveform.cpu().numpy()
+            audio_data = waveform[0, 0]
+            yield (SAMPLE_RATE, audio_data), f"Decoded {tokens.shape[1]} tokens to audio"
+            return
+        # Split tokens into chunks for streaming
+        chunk_size = 500  # Number of tokens per chunk
+        num_chunks = (tokens.shape[1] + chunk_size - 1) // chunk_size  # Ceiling division
+        # First status update
+        yield None, f"Starting decoding of {tokens.shape[1]} tokens in {num_chunks} chunks..."
+        all_audio_chunks = []
+        for i in range(num_chunks):
+            start_idx = i * chunk_size
+            end_idx = min((i + 1) * chunk_size, tokens.shape[1])
+            print(f"Decoding chunk {i+1}/{num_chunks}, tokens {start_idx} to {end_idx}")
+            # Extract chunk of tokens
+            token_chunk = tokens[:, start_idx:end_idx, :]
+            # Convert to torch tensor with Long dtype
+            tokens_tensor = torch.tensor(token_chunk, dtype=torch.long)
+            tokens_tensor = tokens_tensor.to(model_device)
+            # Ensure model is on the expected device
+            semanticodec.to(model_device)
+            # Decode the tokens
+            waveform = semanticodec.decode(tokens_tensor)
+            if isinstance(waveform, torch.Tensor):
+                waveform = waveform.cpu().numpy()
+            # Extract audio data
+            audio_chunk = waveform[0, 0]
+            all_audio_chunks.append(audio_chunk)
+            # Combine all chunks we have so far
+            combined_audio = np.concatenate(all_audio_chunks)
+            # Yield the combined audio for streaming playback
+            yield (SAMPLE_RATE, combined_audio), f"Decoded chunk {i+1}/{num_chunks} ({end_idx}/{tokens.shape[1]} tokens)"
+            # Small delay to allow Gradio to update UI
+            time.sleep(0.1)
+        # Final complete audio
+        combined_audio = np.concatenate(all_audio_chunks)
+        yield (SAMPLE_RATE, combined_audio), f"Completed decoding all {tokens.shape[1]} tokens"
+    except Exception as e:
+        print(f"Streaming decode error: {str(e)}")
+        yield None, f"Error decoding tokens: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Oterin Audio Codec") as demo:
     gr.Markdown("# Oterin Audio Codec")
         decode_btn = gr.Button("Decode")
         decode_btn.click(decode_tokens, inputs=decode_input, outputs=[decode_output, decode_status])
+    with gr.Tab("Stream Decode (Listen while decoding)"):
+        with gr.Row():
+            stream_decode_input = gr.File(label="Token File (.oterin)", file_types=[".oterin"])
+            stream_decode_output = gr.Audio(label="Streaming Audio Output")
+        stream_decode_status = gr.Textbox(label="Status")
+        stream_decode_btn = gr.Button("Start Streaming Decode")
+        stream_decode_btn.click(
+            stream_decode_tokens,
+            inputs=stream_decode_input,
+            outputs=[stream_decode_output, stream_decode_status],
+            show_progress=True
+        )
     with gr.Tab("Both (Encode & Decode)"):
         with gr.Row():
             both_input = gr.Audio(type="filepath", label="Input Audio")