|
""" |
|
This file contains some utilities functions used to find parallel sentences |
|
in two monolingual corpora. |
|
|
|
Code in this file has been adapted from the LASER repository: |
|
https://github.com/facebookresearch/LASER |
|
""" |
|
|
|
import faiss |
|
import numpy as np |
|
import time |
|
import gzip |
|
import lzma |
|
|
|
|
|
def score(x, y, fwd_mean, bwd_mean, margin): |
|
return margin(x.dot(y), (fwd_mean + bwd_mean) / 2) |
|
|
|
|
|
def score_candidates(x, y, candidate_inds, fwd_mean, bwd_mean, margin): |
|
scores = np.zeros(candidate_inds.shape) |
|
for i in range(scores.shape[0]): |
|
for j in range(scores.shape[1]): |
|
k = candidate_inds[i, j] |
|
scores[i, j] = score(x[i], y[k], fwd_mean[i], bwd_mean[k], margin) |
|
return scores |
|
|
|
|
|
def kNN(x, y, k, use_ann_search=False, ann_num_clusters=32768, ann_num_cluster_probe=3): |
|
start_time = time.time() |
|
if use_ann_search: |
|
print("Perform approx. kNN search") |
|
n_cluster = min(ann_num_clusters, int(y.shape[0]/1000)) |
|
quantizer = faiss.IndexFlatIP(y.shape[1]) |
|
index = faiss.IndexIVFFlat(quantizer, y.shape[1], n_cluster, faiss.METRIC_INNER_PRODUCT) |
|
index.nprobe = ann_num_cluster_probe |
|
index.train(y) |
|
index.add(y) |
|
sim, ind = index.search(x, k) |
|
else: |
|
print("Perform exact search") |
|
idx = faiss.IndexFlatIP(y.shape[1]) |
|
idx.add(y) |
|
sim, ind = idx.search(x, k) |
|
|
|
print("Done: {:.2f} sec".format(time.time()-start_time)) |
|
return sim, ind |
|
|
|
|
|
def file_open(filepath): |
|
|
|
if filepath.endswith('.gz'): |
|
return gzip.open(filepath, 'rt', encoding='utf8') |
|
elif filepath.endswith('xz'): |
|
return lzma.open(filepath, 'rt', encoding='utf8') |
|
else: |
|
return open(filepath, 'r', encoding='utf8') |
|
|