File size: 8,658 Bytes
09b47fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import random
import re

import six
from six.moves import zip, xrange

from .lang_detect_exception import ErrorCode, LangDetectException
from .language import Language
from .utils.ngram import NGram
from .utils.unicode_block import unicode_block


class Detector(object):
    '''
    Detector class is to detect language from specified text.
    Its instance is able to be constructed via the factory class DetectorFactory.

    After appending a target text to the Detector instance with .append(string),
    the detector provides the language detection results for target text via .detect() or .get_probabilities().

    .detect() method returns a single language name which has the highest probability.
    .get_probabilities() method returns a list of multiple languages and their probabilities.

    The detector has some parameters for language detection.
    See .set_alpha(float), .set_max_text_length(int) and .set_prior_map(dict).

    Example:

        from langdetect.detector_factory import DetectorFactory
        factory = DetectorFactory()
        factory.load_profile('/path/to/profile/directory')

        def detect(text):
            detector = factory.create()
            detector.append(text)
            return detector.detect()

        def detect_langs(text):
            detector = factory.create()
            detector.append(text)
            return detector.get_probabilities()
    '''

    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05

    ITERATION_LIMIT = 1000
    PROB_THRESHOLD = 0.1
    CONV_THRESHOLD = 0.99999
    BASE_FREQ = 10000
    UNKNOWN_LANG = 'unknown'

    URL_RE = re.compile(r'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}')
    MAIL_RE = re.compile(r'[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}')

    def __init__(self, factory):
        self.word_lang_prob_map = factory.word_lang_prob_map
        self.langlist = factory.langlist
        self.seed = factory.seed
        self.random = random.Random()
        self.text = ''
        self.langprob = None

        self.alpha = self.ALPHA_DEFAULT
        self.n_trial = 7
        self.max_text_length = 10000
        self.prior_map = None
        self.verbose = False

    def set_verbose(self):
        self.verbose = True

    def set_alpha(self, alpha):
        self.alpha = alpha

    def set_prior_map(self, prior_map):
        '''Set prior information about language probabilities.'''
        self.prior_map = [0.0] * len(self.langlist)
        sump = 0.0
        for i in xrange(len(self.prior_map)):
            lang = self.langlist[i]
            if lang in prior_map:
                p = prior_map[lang]
                if p < 0:
                    raise LangDetectException(ErrorCode.InitParamError, 'Prior probability must be non-negative.')
                self.prior_map[i] = p
                sump += p
        if sump <= 0.0:
            raise LangDetectException(ErrorCode.InitParamError, 'More one of prior probability must be non-zero.')
        for i in xrange(len(self.prior_map)):
            self.prior_map[i] /= sump

    def set_max_text_length(self, max_text_length):
        '''Specify max size of target text to use for language detection.
        The default value is 10000(10KB).
        '''
        self.max_text_length = max_text_length

    def append(self, text):
        '''Append the target text for language detection.

        URLs and e-mail addresses are replaced by a single space, the text is
        passed through NGram.normalize_vi, and runs of consecutive spaces are
        collapsed into one space before the result is added to self.text.

        Only the first max_text_length characters of each processed text are
        considered (see set_max_text_length); the rest of that text is dropped.
        The limit applies per call, not to the accumulated text.
        '''
        text = self.URL_RE.sub(' ', text)
        text = self.MAIL_RE.sub(' ', text)
        text = NGram.normalize_vi(text)

        # A non-positive limit means "take nothing"; clamp it so the slice
        # below does not count from the end of the string.
        limit = max(min(len(text), self.max_text_length), 0)

        kept = []
        pre = None  # previous character seen in this call, kept or not
        for ch in text[:limit]:
            if ch != ' ' or pre != ' ':
                kept.append(ch)
            pre = ch
        # Join once instead of growing self.text one character at a time.
        self.text += ''.join(kept)

    def cleaning_text(self):
        '''Cleaning text to detect
        (eliminate URL, e-mail address and Latin sentence if it is not written in Latin alphabet).
        '''
        latin_count, non_latin_count = 0, 0
        for ch in self.text:
            if 'A' <= ch <= 'z':
                latin_count += 1
            elif ch >= six.u('\u0300') and unicode_block(ch) != 'Latin Extended Additional':
                non_latin_count += 1

        if latin_count * 2 < non_latin_count:
            text_without_latin = ''
            for ch in self.text:
                if ch < 'A' or 'z' < ch:
                    text_without_latin += ch
            self.text = text_without_latin

    def detect(self):
        '''Detect language of the target text and return the language name
        which has the highest probability.
        '''
        probabilities = self.get_probabilities()
        if probabilities:
            return probabilities[0].lang
        return self.UNKNOWN_LANG

    def get_probabilities(self):
        if self.langprob is None:
            self._detect_block()
        return self._sort_probability(self.langprob)

    def _detect_block(self):
        '''Run the randomized detection and store the averaged result in self.langprob.

        The text is cleaned and reduced to its known n-grams, then n_trial
        trials are run. Each trial starts from the initial probabilities and
        keeps multiplying in the probabilities of a randomly chosen n-gram
        until the top probability exceeds CONV_THRESHOLD or ITERATION_LIMIT is
        reached; both conditions are only checked on every fifth iteration.

        Raises LangDetectException (CantDetectError) when the text contains no
        known n-gram.
        '''
        self.cleaning_text()
        features = self._extract_ngrams()
        if not features:
            raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')

        self.langprob = [0.0] * len(self.langlist)

        # Re-seeding here makes the whole run reproducible for a fixed seed.
        self.random.seed(self.seed)
        for _ in xrange(self.n_trial):
            trial_prob = self._init_probability()
            # Every trial uses a slightly perturbed alpha.
            trial_alpha = self.alpha + self.random.gauss(0.0, 1.0) * self.ALPHA_WIDTH

            step = 0
            while True:
                self._update_lang_prob(trial_prob, self.random.choice(features), trial_alpha)
                if step % 5 == 0:
                    # Normalization happens as a side effect of the check.
                    top = self._normalize_prob(trial_prob)
                    if top > self.CONV_THRESHOLD or step >= self.ITERATION_LIMIT:
                        break
                    if self.verbose:
                        six.print_('>', self._sort_probability(trial_prob))
                step += 1

            # Accumulate this trial's share of the average.
            for idx in xrange(len(self.langprob)):
                self.langprob[idx] += trial_prob[idx] / self.n_trial
            if self.verbose:
                six.print_('==>', self._sort_probability(trial_prob))

    def _init_probability(self):
        '''Initialize the map of language probabilities.
        If there is the specified prior map, use it as initial map.
        '''
        if self.prior_map is not None:
            return list(self.prior_map)
        else:
            return [1.0 / len(self.langlist)] * len(self.langlist)

    def _extract_ngrams(self):
        '''Collect the n-grams of self.text that are known to the model.

        The text is fed character by character into an NGram instance. After
        each character, the trailing substrings of length 1..NGram.N_GRAM of
        its buffer are taken, and those present in word_lang_prob_map are
        returned in order of appearance. Nothing is taken while the NGram
        instance flags a capitalized word.
        '''
        sizes = list(xrange(1, NGram.N_GRAM + 1))

        found = []
        window = NGram()
        for ch in self.text:
            window.add_char(ch)
            if window.capitalword:
                continue
            for size in sizes:
                # Inlined equivalent of window.get(size).
                if size > len(window.grams):
                    break
                gram = window.grams[-size:]
                if not gram or gram == ' ':
                    continue
                if gram in self.word_lang_prob_map:
                    found.append(gram)
        return found

    def _update_lang_prob(self, prob, word, alpha):
        '''Update language probabilities with N-gram string(N=1,2,3).'''
        if word is None or word not in self.word_lang_prob_map:
            return False

        lang_prob_map = self.word_lang_prob_map[word]
        if self.verbose:
            six.print_('%s(%s): %s' % (word, self._unicode_encode(word), self._word_prob_to_string(lang_prob_map)))

        weight = alpha / self.BASE_FREQ
        for i in xrange(len(prob)):
            prob[i] *= weight + lang_prob_map[i]
        return True

    def _word_prob_to_string(self, prob):
        result = ''
        for j in xrange(len(prob)):
            p = prob[j]
            if p >= 0.00001:
                result += ' %s:%.5f' % (self.langlist[j], p)
        return result

    def _normalize_prob(self, prob):
        '''Normalize probabilities and check convergence by the maximun probability.
        '''
        maxp, sump = 0.0, sum(prob)
        for i in xrange(len(prob)):
            p = prob[i] / sump
            if maxp < p:
                maxp = p
            prob[i] = p
        return maxp

    def _sort_probability(self, prob):
        result = [Language(lang, p) for (lang, p) in zip(self.langlist, prob) if p > self.PROB_THRESHOLD]
        result.sort(reverse=True)
        return result

    def _unicode_encode(self, word):
        buf = ''
        for ch in word:
            if ch >= six.u('\u0080'):
                st = hex(0x10000 + ord(ch))[2:]
                while len(st) < 4:
                    st = '0' + st
                buf += r'\u' + st[1:5]
            else:
                buf += ch
        return buf