File size: 2,525 Bytes
ef910fd
522c171
8d49ccc
 
791f610
 
 
 
522c171
 
 
 
e13efdb
 
 
ef910fd
8d49ccc
 
 
 
 
 
 
 
 
 
85c8109
bd53b31
 
 
 
 
 
 
 
 
f3346e4
 
8d49ccc
 
 
 
 
 
 
 
 
 
 
 
791f610
8d49ccc
 
 
 
 
ef910fd
8e797fd
 
ef910fd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import gradio as gr
import librosa
import tensorflow as tf 
from huggingface_hub import from_pretrained_keras
from itertools import groupby
import numpy as np

# Load the pretrained Keras model from the Hugging Face Hub repo "CXDJY/snore_ai".
# This runs once at import time; greet() reuses this module-level object for every
# prediction. NOTE(review): presumably a snore detector, judging by the repo name.
model = from_pretrained_keras("CXDJY/snore_ai")

def load_audio_to_tensor(filename):
    """Read an audio file as mono, resample it to 16 kHz and measure its level.

    Args:
        filename: Audio source handed unchanged to ``librosa.load``.

    Returns:
        A ``(wave, volume)`` tuple where ``wave`` is the signal resampled to
        16000 Hz and ``volume`` is the mean of the RMS values computed on the
        signal *before* resampling.
    """
    # sr=None asks librosa not to resample on load; mono=True requests one channel.
    raw_signal, native_rate = librosa.load(filename, sr=None, mono=True)

    # Level estimate: average of the first row of librosa's RMS output.
    rms_values = librosa.feature.rms(y=raw_signal)[0]
    volume = np.mean(rms_values)

    # The rest of the pipeline works on 16 kHz audio.
    resampled = librosa.resample(raw_signal, orig_sr=native_rate, target_sr=16000)
    return resampled, volume

def preprocess_mp3(sample, index):
    """Turn one batched audio slice into a magnitude spectrogram.

    Args:
        sample: Batched audio slice; only its first element is used.
        index: Second element of the dataset pair; not used here.

    Returns:
        The absolute value of the STFT (frame_length=320, frame_step=32) of
        the slice, with an extra trailing axis added at position 2.
    """
    # Take the first element along the leading axis and work in float32.
    clip = tf.cast(sample[0], tf.float32)

    # Prepend zeros so the clip is 16000 samples long before the STFT.
    pad_length = [16000] - tf.shape(clip)
    silence = tf.zeros(pad_length, dtype=tf.float32)
    padded_clip = tf.concat([silence, clip], 0)

    # Magnitude of the short-time Fourier transform.
    magnitude = tf.abs(
        tf.signal.stft(padded_clip, frame_length=320, frame_step=32)
    )
    return tf.expand_dims(magnitude, axis=2)

def greet(name):
    """Run the module-level model over consecutive 16000-sample windows of a file.

    The audio is loaded via ``load_audio_to_tensor`` (16 kHz), cut into
    non-overlapping windows of 16000 samples, converted to spectrograms with
    ``preprocess_mp3`` and scored by ``model``. Trailing samples that do not
    fill a whole window are not scored.

    Args:
        name: Audio file to analyse; passed unchanged to
            ``load_audio_to_tensor``.

    Returns:
        A list of 0/1 labels with consecutive duplicates collapsed. A window
        is labelled 1 when its model score is greater than 0.99, else 0.
    """
    window = 16000  # samples per slice (one second at the 16 kHz target rate)

    wave, _ = load_audio_to_tensor(name)

    # A clip shorter than one window would produce an empty dataset (and the
    # old code crashed on it). Left-pad with zeros up to a full window, the
    # same front-padding convention preprocess_mp3 uses.
    if len(wave) < window:
        wave = np.pad(wave, (window - len(wave), 0))

    # Non-overlapping windows. The array is also passed as the targets so the
    # dataset yields pairs, matching preprocess_mp3's (sample, index) signature.
    audio_slices = tf.keras.utils.timeseries_dataset_from_array(
        wave,
        wave,
        sequence_length=window,
        sequence_stride=window,
        batch_size=1,
    )
    audio_slices = audio_slices.map(preprocess_mp3).batch(64)

    scores = model.predict(audio_slices)
    labels = [1 if prediction > 0.99 else 0 for prediction in scores]

    # Collapse runs of identical labels into a single entry each.
    return [key for key, _ in groupby(labels)]

# Web UI: a file upload is passed to greet() and the returned value is shown as text.
# (An alternative input component, inputs="audio", was previously tried here.)
iface = gr.Interface(
    fn=greet,
    inputs="file",
    outputs="text",
)
iface.launch()