Spaces:
Runtime error
Runtime error
from collections import defaultdict | |
import re | |
import six | |
from six.moves import xrange | |
from .ngram import NGram | |
class LangProfile(object):
    '''N-gram frequency profile of a single language.

    Attributes:
        name: Identifier of the profile; while it is None the mutating
            methods ``add`` and ``omit_less_freq`` do nothing.
        freq: ``defaultdict(int)`` mapping each n-gram to its count.
        n_words: List of totals per n-gram length; ``n_words[k - 1]`` is
            the number of occurrences of n-grams of length ``k``.
    '''

    # An n-gram whose count is not above this value is always dropped.
    MINIMUM_FREQ = 2
    # The drop threshold grows to n_words[0] // LESS_FREQ_RATIO on big profiles.
    LESS_FREQ_RATIO = 100000

    # Matches a key that is one ASCII Latin letter.
    ROMAN_CHAR_RE = re.compile(r'^[A-Za-z]$')
    # Matches a key that contains an ASCII Latin letter.
    ROMAN_SUBSTR_RE = re.compile(r'.*[A-Za-z].*')

    def __init__(self, name=None, freq=None, n_words=None):
        '''Create a profile, optionally seeded with existing counts.

        Args:
            name: Profile identifier, or None for an inert profile.
            freq: Optional mapping of n-gram to count, copied into ``freq``.
            n_words: Optional sequence of totals per n-gram length.
                Defaults to ``NGram.N_GRAM`` zeros.
        '''
        self.freq = defaultdict(int)
        if freq is not None:
            self.freq.update(freq)

        if n_words is None:
            n_words = [0] * NGram.N_GRAM
        else:
            # Copy so that later in-place updates never leak into the
            # caller's list (or into another profile sharing that list).
            n_words = list(n_words)

        self.name = name
        self.n_words = n_words

    def add(self, gram):
        '''Add n-gram to profile.

        Silently ignored when the profile has no name, when ``gram`` is
        None, or when its length is outside ``1..NGram.N_GRAM``.
        '''
        if self.name is None or gram is None:  # Illegal
            return
        length = len(gram)
        if not 1 <= length <= NGram.N_GRAM:  # Illegal
            return
        self.n_words[length - 1] += 1
        self.freq[gram] += 1

    def omit_less_freq(self):
        '''Eliminate below less frequency n-grams and noise Latin alphabets.

        First drops every n-gram whose count is not above the threshold
        ``max(n_words[0] // LESS_FREQ_RATIO, MINIMUM_FREQ)``. Then, if the
        surviving single Latin letters account for less than a third of
        the remaining single-character total, drops every n-gram that
        contains a Latin letter. Does nothing when the profile has no name.
        '''
        if self.name is None:  # Illegal
            return

        threshold = max(self.n_words[0] // self.LESS_FREQ_RATIO,
                        self.MINIMUM_FREQ)

        roman = 0
        # Iterate over a snapshot because entries are deleted on the way.
        for key, count in list(self.freq.items()):
            if count <= threshold:
                self._discard(key, count)
            elif self.ROMAN_CHAR_RE.match(key):
                roman += count

        # roman check (uses n_words[0] as already reduced by the pass above)
        if roman < self.n_words[0] // 3:
            for key, count in list(self.freq.items()):
                if self.ROMAN_SUBSTR_RE.match(key):
                    self._discard(key, count)

    def _discard(self, key, count):
        '''Delete ``key`` from ``freq`` and subtract ``count`` from its total.'''
        index = len(key) - 1
        # Keys of unexpected length (empty, or longer than the counter
        # list) have no total of their own; never touch another slot.
        if 0 <= index < len(self.n_words):
            self.n_words[index] -= count
        del self.freq[key]

    def update(self, text):
        '''Update the language profile with (fragmented) text.

        Extract n-grams from text and add their frequency into the profile.
        The text is first passed through ``NGram.normalize_vi``; a None
        text is ignored.
        '''
        if text is None:
            return
        text = NGram.normalize_vi(text)
        gram = NGram()
        sizes = range(1, NGram.N_GRAM + 1)  # loop-invariant, built once
        for ch in text:
            gram.add_char(ch)
            for n in sizes:
                self.add(gram.get(n))