Spaces:
Sleeping
Sleeping
File size: 4,608 Bytes
736e98d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
from fastapi import FastAPI, UploadFile, File, Body, Form
from pathlib import Path
from fastapi.middleware.cors import CORSMiddleware
from typing import List
import numpy as np
from resemblyzer import preprocess_wav, VoiceEncoder
from itertools import groupby
from pathlib import Path
from tqdm import tqdm
import os
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import glob
UPLOAD_DIR = Path() / "uploads"
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
app = FastAPI()
# Add a CORS middleware to allow cross-origin requests from the frontend
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_methods=["*"],
allow_headers=["*"],
)
# del all files in uploads folder
def delFiles():
files = glob.glob('uploads/*')
for f in files:
os.remove(f)
# main function which returns the name of person which has highest similarity index with test audio
async def predictor(names, file_uploads, usersNum, recordingsNum):
speaker_embed_list = []
encoder = VoiceEncoder()
# Iterating over list of files corresponding to each user
speaker_wavs_list = []
fileInd = 0
names.pop() # to remove key named "test"
for name in names:
wav_fpaths = []
for ind in range(int(recordingsNum)):
file_upload = file_uploads[fileInd]
data = await file_upload.read()
# appending person's name to the his/her recordings
filename = name+"¬"+file_upload.filename
file_path = UPLOAD_DIR / filename
with open(file_path, "wb") as file_object:
file_object.write(data)
wav_fpaths.append(Path(file_path))
fileInd += 1
try:
speaker_wavs = {speaker: list(map(preprocess_wav, wav_fpaths)) for speaker, wav_fpaths in
groupby(tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths), unit="wavs"),
lambda wav_fpath: os.path.basename(wav_fpath).split("¬")[0])} # extracting person's name from file name
speaker_wavs_list.append(speaker_wavs)
except Exception as e:
print("error ", e)
# make a list of the pre-processed audios ki arrays
for sp_wvs in speaker_wavs_list:
speaker_embed_list.append(
np.array([encoder.embed_speaker(wavs) for wavs in sp_wvs.values()]))
# making preprocessed test audio
wav_fpaths = []
file_upload = file_uploads[-1]
data = await file_upload.read()
filename = "test¬"+file_upload.filename
file_path = UPLOAD_DIR / filename
with open(file_path, "wb") as file_object:
file_object.write(data)
wav_fpaths.append(Path(file_path))
test_pos_wavs = {speaker: list(map(preprocess_wav, wav_fpaths)) for speaker, wav_fpaths in
groupby(tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths), unit="wavs"),
lambda wav_fpath: "test")}
test_pos_emb = np.array([encoder.embed_speaker(wavs)
for wavs in test_pos_wavs.values()])
# calculates cosine similarity between the ground truth (test file) and registered audios
speakers = {}
val = 0
for spkr_embd in speaker_embed_list:
key_val = names[val]
spkr_sim = cosine_similarity(spkr_embd, test_pos_emb)[0][0]
speakers[key_val] = spkr_sim
val += 1
norm = [float(i)/sum(speakers.values()) for i in speakers.values()]
for i in range(len(norm)):
key_val = names[i]
speakers[key_val] = norm[i]
identified = max(speakers, key=speakers.get)
print("\nThe identity of the test speaker:\n", identified, "with a similarity with test of",
speakers[identified]*100, "percent match as compared to all.")
return identified
# Update the function parameter to use the Body module and media_type
@app.post("/predict/")
async def resultGenerator(names: List[str] = Form(...), file_uploads: List[UploadFile] = File(...), usersNum: str = Form(...), recordingsNum: str = Form(...)):
# equal to 2 because names list is of the form [name1, name2,..., test]
try:
if (len(names) <= 2):
return {"error: ", "Incorrect data provided"}
else:
result = await predictor(names, file_uploads, usersNum, recordingsNum)
print('## Test Audio Belonged To: {}'.format(result))
delFiles() # to delete all files from backend, used in this identification
return {"result": result}
except:
return {"error": "Server not responding"}
|