owiedotch committed
Commit fc8b181 · verified · 1 Parent(s): 72b8ffa

Update app.py

Files changed (1)
  1. app.py +64 -164
app.py CHANGED
@@ -1,168 +1,68 @@
 
  import gradio as gr
- import spaces
- import torch
- import torchaudio
- from semanticodec import SemantiCodec
- import tempfile
  import numpy as np
- import lz4.frame
- import os
- from typing import Generator
- import asyncio # Import asyncio for cancellation
- import traceback # Import traceback for error handling
- import pickle
  import soundfile as sf

- # Initialize model with the specified parameters
- semanticodec = SemantiCodec(token_rate=100, semantic_vocab_size=32768) # 1.40 kbps
-
- # Move the entire model to GPU if available
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- semanticodec = semanticodec.to(device)
- print(f"Model moved to device: {device}")
-
- # Global variables for cancellation
- cancel_encode = False
- cancel_decode = False
- cancel_stream = False
-
- @spaces.GPU(duration=75)
- def encode_audio(filepath):
-     """Encode and decode audio file"""
-     try:
-         # Encode and decode directly as in the example
-         tokens = semanticodec.encode(filepath)
-         waveform = semanticodec.decode(tokens)
-
-         # Save using soundfile
-         sf.write("output.wav", waveform[0,0], 16000)
-         return "output.wav"
-
-     except Exception as e:
-         print(f"Error: {e}")
-         traceback.print_exc()
-         return None
-
- # Add this function to handle the output
- def handle_encode_output(file_path):
-     if file_path is None:
-         return None, gr.Markdown("Encoding failed. Please ensure you've uploaded an audio file and try again.", visible=True)
-     return file_path, gr.Markdown(visible=False)
-
- @spaces.GPU(duration=75)
- def decode_audio(encoded_file='encoded.pkl'):
-     """Decode tokens back to audio"""
-     try:
-         # Load the tokens
-         with open(encoded_file, 'rb') as f:
-             data = pickle.load(f)
-
-         tokens = data['tokens']
-         sample_rate = data['sample_rate']
-
-         # Move tokens to same device as model
-         device = next(semanticodec.parameters()).device
-         tokens = tokens.to(device)
-
-         # Decode
-         waveform = semanticodec.decode(tokens)
-
-         # Save the reconstruction file
-         torchaudio.save("output.wav", waveform[0,0].cpu(), sample_rate)
-         return "output.wav"
-
-     except Exception as e:
-         print(f"Decoding error: {e}")
-         traceback.print_exc()
-         return None
-
- @spaces.GPU(duration=75)
- async def stream_decode_audio(encoded_file_path) -> Generator[tuple, None, None]:
-     global cancel_stream
-
-     try:
-         # Load encoded data and sample rate from the .owie file
-         with open(encoded_file_path, 'rb') as temp_file:
-             sample_rate = int.from_bytes(temp_file.read(4), byteorder='little')
-             ndim = int.from_bytes(temp_file.read(4), byteorder='little')
-             shape = tuple(int.from_bytes(temp_file.read(4), byteorder='little') for _ in range(ndim))
-             compressed_size = int.from_bytes(temp_file.read(4), byteorder='little')
-             compressed_data = temp_file.read(compressed_size)
-             tokens_numpy_bytes = lz4.frame.decompress(compressed_data)
-             tokens_numpy = np.frombuffer(tokens_numpy_bytes, dtype=np.int64).reshape(shape)
-
-         # Create a tensor from the numpy array
-         tokens = torch.from_numpy(tokens_numpy)
-
-         # Determine the device of the model
-         model_device = next(semanticodec.parameters()).device
-         print(f"Model device: {model_device}")
-
-         # Move the tokens to the same device as the model
-         tokens = tokens.to(model_device)
-         print(f"Streaming tokens device: {tokens.device}")
-
-         # Decode the audio in chunks
-         chunk_size = sample_rate * 2 # Adjust chunk size as needed
-         with torch.no_grad():
-             for i in range(0, tokens.shape[1], chunk_size):
-                 if cancel_stream:
-                     break # Exit the loop if cancellation is requested
-
-                 tokens_chunk = tokens[:, i:i+chunk_size, :]
-                 audio_chunk = semanticodec.decode(tokens_chunk)
-                 # Convert to numpy array and transpose
-                 audio_data = audio_chunk.squeeze(0).cpu().numpy().T
-                 yield (sample_rate, audio_data)
-                 await asyncio.sleep(0) # Allow for cancellation check
-
-     except Exception as e:
-         print(f"Streaming decoding error: {e}")
-         print(f"Traceback: {traceback.format_exc()}")
-         yield (sample_rate, np.zeros((chunk_size, 1), dtype=np.float32)) # Return silence
-
-     finally:
-         cancel_stream = False
-
- # Gradio Interface
- with gr.Blocks() as demo:
-     gr.Markdown("## Audio Compression with SemantiCodec (GPU/CPU)")
-
-     with gr.Tab("Encode"):
-         input_audio = gr.Audio(label="Input Audio", type="filepath")
-         encode_button = gr.Button("Encode")
-         cancel_encode_button = gr.Button("Cancel")
-         encoded_output = gr.File(label="Encoded File (.owie)", type="filepath")
-         encode_error_message = gr.Markdown(visible=False)
-
-         def encode_wrapper(audio):
-             if audio is None:
-                 return None, gr.Markdown("Please upload an audio file before encoding.", visible=True)
-             return handle_encode_output(encode_audio(audio))
-
-         encode_button.click(
-             encode_wrapper,
-             inputs=input_audio,
-             outputs=[encoded_output, encode_error_message]
-         )
-         cancel_encode_button.click(lambda: globals().update(cancel_encode=True), outputs=None)
-
-     with gr.Tab("Decode"):
-         input_encoded = gr.File(label="Encoded File (.owie)", type="filepath")
-         decode_button = gr.Button("Decode")
-         cancel_decode_button = gr.Button("Cancel")
-         decoded_output = gr.Audio(label="Decoded Audio", type="filepath")
-
-         decode_button.click(decode_audio, inputs=input_encoded, outputs=decoded_output)
-         cancel_decode_button.click(lambda: globals().update(cancel_decode=True), outputs=None)
-
-     with gr.Tab("Streaming"):
-         input_encoded_stream = gr.File(label="Encoded File (.owie)", type="filepath")
-         stream_button = gr.Button("Start Streaming")
-         cancel_stream_button = gr.Button("Cancel")
-         audio_output = gr.Audio(label="Streaming Audio Output", streaming=True)
-
-         stream_button.click(stream_decode_audio, inputs=input_encoded_stream, outputs=audio_output)
-         cancel_stream_button.click(lambda: globals().update(cancel_stream=True), outputs=None)
-
- demo.queue().launch()
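Note on the removed streaming path: it parsed a small custom .owie container, i.e. little-endian uint32 header fields (sample rate, ndim, one field per shape dimension, compressed payload size) followed by an lz4-compressed int64 token array. The writer side never appears in this diff, so the pair below is a minimal sketch inferred from the reader's header layout; write_owie and read_owie are hypothetical names.

    import lz4.frame
    import numpy as np

    def write_owie(path, tokens, sample_rate):
        # Hypothetical writer mirroring the removed reader: sample rate, ndim,
        # each shape dimension, and payload size, all little-endian uint32.
        arr = np.asarray(tokens, dtype=np.int64)
        payload = lz4.frame.compress(arr.tobytes())
        with open(path, 'wb') as f:
            f.write(int(sample_rate).to_bytes(4, byteorder='little'))
            f.write(arr.ndim.to_bytes(4, byteorder='little'))
            for dim in arr.shape:
                f.write(dim.to_bytes(4, byteorder='little'))
            f.write(len(payload).to_bytes(4, byteorder='little'))
            f.write(payload)

    def read_owie(path):
        # Same parsing steps as the removed stream_decode_audio, minus the decode.
        with open(path, 'rb') as f:
            sample_rate = int.from_bytes(f.read(4), byteorder='little')
            ndim = int.from_bytes(f.read(4), byteorder='little')
            shape = tuple(int.from_bytes(f.read(4), byteorder='little') for _ in range(ndim))
            size = int.from_bytes(f.read(4), byteorder='little')
            tokens = np.frombuffer(lz4.frame.decompress(f.read(size)), dtype=np.int64).reshape(shape)
        return sample_rate, tokens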
 
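Side note on the "# 1.40 kbps" comment, which carries over into the new version below: it is at least the right order of magnitude. A 32768-entry vocabulary takes log2(32768) = 15 bits per token, so 100 tokens/s comes to roughly 1.5 kbps; the quoted 1.40 kbps presumably reflects SemantiCodec's own accounting, which this back-of-the-envelope figure only approximates.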
+ import os
  import gradio as gr
  import numpy as np
  import soundfile as sf
+ from semanticodec import SemantiCodec
+ from huggingface_hub import HfApi
+ import spaces

+ # Initialize the model
+ def load_model():
+     return SemantiCodec(token_rate=100, semantic_vocab_size=32768) # 1.40 kbps
+
+ semanticodec = load_model()
+
+ @spaces.GPU(duration=60)
+ def encode_audio(audio_path):
+     """Encode audio file to tokens and save them"""
+     tokens = semanticodec.encode(audio_path)
+     token_path = "encoded_audio.oterin"
+     np.save(token_path, tokens)
+     return token_path, f"Encoded to {len(tokens)} tokens"
+
+ @spaces.GPU(duration=60)
+ def decode_tokens(token_path):
+     """Decode tokens to audio"""
+     tokens = np.load(token_path)
+     waveform = semanticodec.decode(tokens)
+     output_path = "output.wav"
+     sf.write(output_path, waveform[0, 0], 32000)
+     return output_path, f"Decoded {len(tokens)} tokens to audio"
+
+ def process_both(audio_path):
+     """Encode and then decode the audio"""
+     token_path, encode_msg = encode_audio(audio_path)
+     output_path, decode_msg = decode_tokens(token_path)
+     return output_path, f"{encode_msg}\n{decode_msg}"
+
+ # Create Gradio interface
+ with gr.Blocks(title="Oterin Audio Codec") as demo:
+     gr.Markdown("# Oterin Audio Codec")
+     gr.Markdown("Upload an audio file to encode it to semantic tokens, decode tokens back to audio, or do both.")
+
+     with gr.Tab("Encode Audio"):
+         with gr.Row():
+             encode_input = gr.Audio(type="filepath", label="Input Audio")
+             encode_output = gr.File(label="Encoded Tokens (.oterin)")
+         encode_status = gr.Textbox(label="Status")
+         encode_btn = gr.Button("Encode")
+         encode_btn.click(encode_audio, inputs=encode_input, outputs=[encode_output, encode_status])
+
+     with gr.Tab("Decode Tokens"):
+         with gr.Row():
+             decode_input = gr.File(label="Token File (.oterin)")
+             decode_output = gr.Audio(label="Decoded Audio")
+         decode_status = gr.Textbox(label="Status")
+         decode_btn = gr.Button("Decode")
+         decode_btn.click(decode_tokens, inputs=decode_input, outputs=[decode_output, decode_status])
+
+     with gr.Tab("Both (Encode & Decode)"):
+         with gr.Row():
+             both_input = gr.Audio(type="filepath", label="Input Audio")
+             both_output = gr.Audio(label="Reconstructed Audio")
+         both_status = gr.Textbox(label="Status")
+         both_btn = gr.Button("Process")
+         both_btn.click(process_both, inputs=both_input, outputs=[both_output, both_status])
+
+ if __name__ == "__main__":
+     demo.launch()
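A caveat for anyone reusing the new token path (this is NumPy's documented behavior, not something the commit addresses): np.save appends ".npy" whenever the filename does not already end in it, so "encoded_audio.oterin" is actually written to disk as "encoded_audio.oterin.npy", while the suffix-less path is what encode_audio returns and decode_tokens later hands to np.load. A minimal sketch of keeping the saved and returned paths in sync; save_tokens is a hypothetical helper, not part of the commit:

    import numpy as np

    def save_tokens(tokens, path="encoded_audio.oterin"):
        # np.save only appends ".npy" when the name lacks it, so normalize
        # first and return the path that actually exists on disk.
        if not path.endswith(".npy"):
            path += ".npy"
        np.save(path, tokens)
        return path

    tokens = np.arange(10, dtype=np.int64)        # stand-in for semanticodec.encode(...)
    saved = save_tokens(tokens)                   # -> "encoded_audio.oterin.npy"
    assert np.array_equal(np.load(saved), tokens) # round-trips against the real file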