Spaces:
Running
Running
Martijn Bartelds
committed on
Commit
·
160f237
1
Parent(s):
b341c9c
Update app
Browse files- neural_acoustic_distance.py +60 -60
neural_acoustic_distance.py
CHANGED
@@ -27,66 +27,66 @@ model_id = st.selectbox(
|
|
27 |
# When "other" is picked in the select box, ask for a free-form model
# identifier instead (pre-filled with a known wav2vec 2.0 checkpoint).
if model_id == "other":
    model_id = st.text_input(
        "Enter the wav2vec 2.0 model you want to use:",
        value="facebook/wav2vec2-large-960h",
        key="model",
    )
|
29 |
|
30 |
-
try:
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
except OSError:
|
88 |
-
st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
|
89 |
-
featurizer_a = None
|
90 |
|
91 |
def aligner(x, y) -> Any:
    """Run ``dtw`` on ``x`` and ``y`` and return its result.

    ``keep_internals=True`` is always passed to ``dtw``; what that retains
    is defined by the ``dtw`` implementation imported elsewhere in the file.
    """
    alignment = dtw(x, y, keep_internals=True)
    return alignment
|
|
|
27 |
# When "other" is picked in the select box, ask for a free-form model
# identifier instead (pre-filled with a known wav2vec 2.0 checkpoint).
if model_id == "other":
    model_id = st.text_input(
        "Enter the wav2vec 2.0 model you want to use:",
        value="facebook/wav2vec2-large-960h",
        key="model",
    )
|
29 |
|
30 |
+
# try:
# NOTE: the try/except OSError wrapper is commented out (its handler is
# commented out further down as well), so any error raised while loading the
# configuration propagates instead of being shown as a friendly message.
cfg = AutoConfig.from_pretrained(model_id)

# Let the user pick a Transformer layer between 1 and the model's depth.
# The default is layer 10, clamped to the model's depth: Streamlit raises an
# exception when the default `value` is greater than `max_value`, which
# happened for any model with fewer than 10 hidden layers.
layer = st.number_input(
    "Select the layer you want to use:",
    min_value=1,
    max_value=cfg.num_hidden_layers,
    value=min(10, cfg.num_hidden_layers),
)
|
34 |
+
|
35 |
+
def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
    """Load a wav2vec 2.0 model and return a function that featurizes audio.

    Args:
        model_id: Identifier handed to ``Wav2Vec2Model.from_pretrained``.
        layer: Selects what the returned function produces.

            * ``None``: the full model is loaded and every hidden state is
              returned as a list of arrays.
            * ``>= 0``: the model is loaded with ``num_hidden_layers`` set to
              this value and its ``last_hidden_state`` is returned.
            * ``-1``: the model is loaded with zero Transformer layers and the
              output of ``feature_projection`` is returned.
            * any other negative value: the transposed output of
              ``feature_extractor`` is returned without projection.

    Returns:
        A callable ``_featurize(path)`` that reads the audio file at ``path``
        and returns NumPy array(s) with the batch dimension removed.
    """
    # Heavy dependencies are imported lazily, only when a model is loaded.
    from transformers.models.wav2vec2 import Wav2Vec2Model
    import soundfile as sf
    from scipy import signal
    import torch
    import numpy as np

    # Only surface errors from transformers; hide its warnings/info output.
    transformers.logging.set_verbosity(transformers.logging.ERROR)

    model_kwargs = {}
    if layer is not None:
        # Negative layers address the pre-Transformer stages, which need no
        # Transformer layers at all.
        model_kwargs["num_hidden_layers"] = max(layer, 0)

    use_cuda = torch.cuda.is_available()

    with st.spinner("Loading..."):
        model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
        model.eval()
        if use_cuda:
            model.cuda()
    st.success("Done!")

    target_rate = 16_000  # sampling rate the audio is resampled to

    @torch.no_grad()
    def _featurize(path):
        """Read the audio at ``path`` and return the selected representation."""
        input_values, rate = sf.read(path, dtype=np.float32)
        if input_values.ndim == 2:
            # Collapse 2-D audio to 1-D by averaging over axis 1
            # (presumably the channel axis -- confirm against soundfile).
            input_values = input_values.mean(1)
        if rate != target_rate:
            new_length = int(input_values.shape[0] / rate * target_rate)
            input_values = signal.resample(input_values, new_length)

        # The model weights are float32; make sure resampling did not leave
        # the signal in a wider dtype, which would fail inside the model.
        input_values = input_values.astype(np.float32, copy=False)

        # Add the batch dimension expected by the model.
        input_values = torch.from_numpy(input_values).unsqueeze(0)
        if use_cuda:
            input_values = input_values.cuda()

        if layer is None:
            outputs = model(input_values, output_hidden_states=True)
            return [s.squeeze(0).cpu().numpy() for s in outputs.hidden_states]

        if layer >= 0:
            # The model was truncated at load time, so its last hidden state
            # is the output of the requested layer.
            hidden_state = model(input_values).last_hidden_state
        else:
            hidden_state = model.feature_extractor(input_values)
            hidden_state = hidden_state.transpose(1, 2)
            if layer == -1:
                projected = model.feature_projection(hidden_state)
                # Newer transformers releases return a tuple whose first item
                # is the projected features; older ones return the tensor.
                if isinstance(projected, tuple):
                    projected = projected[0]
                hidden_state = projected

        return hidden_state.squeeze(0).cpu().numpy()

    return _featurize
|
85 |
+
|
86 |
+
# Build the featurizer for the chosen model identifier and layer.
featurizer_a = load_wav2vec2_featurizer(model_id=model_id, layer=layer)
# except OSError:
#     st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
#     featurizer_a = None
|
90 |
|
91 |
def aligner(x, y) -> Any:
    """Run ``dtw`` on ``x`` and ``y`` and return its result.

    ``keep_internals=True`` is always passed to ``dtw``; what that retains
    is defined by the ``dtw`` implementation imported elsewhere in the file.
    """
    alignment = dtw(x, y, keep_internals=True)
    return alignment
|