Alberto Carmona
Track error cloning the repo
ebd4e51
raw
history blame
1.08 kB
from random import uniform
import numpy as np
from collections import OrderedDict, defaultdict
from itertools import tee
import time
# -----------------------------------------------
def find_ngrams(input_list, n):
    """Return the consecutive n-grams of ``input_list`` as tuples.

    The sequence is sliced at offsets ``0 .. n-1`` and the slices are zipped
    together, so the i-th result is
    ``(input_list[i], ..., input_list[i + n - 1])``.  Zipping stops at the
    shortest slice, which means a sequence shorter than ``n`` produces no
    n-grams at all.

    Args:
        input_list: Sliceable sequence of items (e.g. a list of tokens).
        n: Size of each n-gram.

    Returns:
        The ``zip`` object over the shifted slices; each element is an
        ``n``-tuple of consecutive items.
    """
    shifted_views = (input_list[offset:] for offset in range(n))
    return zip(*shifted_views)
def compute_div_n(caps, n=1):
    """Compute the per-key distinct n-gram ratio of a set of captions.

    For every key in ``caps`` the captions are split on whitespace, all
    n-grams across that key's captions are pooled into one set, and the
    number of distinct n-grams is divided by the total number of tokens
    for that key.

    Args:
        caps: Mapping whose values are iterables of caption strings.
        n: Size of the n-grams to count (default 1).

    Returns:
        A ``(mean, per_key)`` tuple: ``per_key`` is a NumPy array holding one
        ratio per key, in the iteration order of ``caps``, and ``mean`` is
        the mean of that array.
    """
    ratios = []
    for key in caps:
        token_lists = [caption.split() for caption in caps[key]]
        total_tokens = float(sum(len(tokens) for tokens in token_lists))

        distinct = set()
        for tokens in token_lists:
            distinct.update(find_ngrams(tokens, n))

        # The small epsilon keeps the division defined when a key has no tokens.
        ratios.append(float(len(distinct)) / (1e-6 + total_tokens))

    per_key = np.array(ratios)
    return per_key.mean(), per_key
def compute_global_div_n(caps, n=1):
    """Compute a corpus-level distinct n-gram score over all captions.

    Every caption under every key is split on whitespace and its n-grams
    are pooled into a single set shared across the whole of ``caps``.

    Note that the score is not the same quantity for every ``n``:
    for ``n == 1`` it is the raw number of distinct unigrams, while for any
    other ``n`` it is the number of distinct n-grams divided by the total
    token count.

    Args:
        caps: Mapping whose values are iterables of caption strings.
        n: Size of the n-grams to count (default 1).

    Returns:
        A ``(score, repeated)`` tuple: ``score`` is the single global value
        and ``repeated`` is a NumPy array containing that value once per key
        in ``caps``.
    """
    distinct = set()
    total_tokens = 0.0
    for key in caps:
        for caption in caps[key]:
            tokens = caption.split()
            total_tokens += len(tokens)
            distinct.update(find_ngrams(tokens, n))

    n_distinct = float(len(distinct))
    # Unigrams report the raw distinct count; other sizes are normalised by
    # the total number of tokens (epsilon guards against an empty corpus).
    score = n_distinct if n == 1 else n_distinct / (1e-6 + total_tokens)

    return score, np.repeat(np.array([score]), len(caps))