|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from collections import namedtuple |
|
|
|
from .charsetprober import CharSetProber |
|
from .enums import CharacterCategory, ProbingState, SequenceLikelihood |
|
|
|
# Immutable description of one single-byte charset/language pairing.
# Fields, in positional order (usage notes refer to SingleByteCharSetProber
# in this file):
#   charset_name            - returned by the prober's ``charset_name`` when
#                             no name prober is set
#   language                - returned by the prober's ``language`` when no
#                             name prober is set
#   char_to_order_map       - queried with ``.get(byte, default)`` to turn
#                             each fed byte into an "order" value
#   language_model          - two-level lookup indexed by a pair of orders;
#                             the result selects which sequence counter is
#                             incremented
#   typical_positive_ratio  - divisor applied when computing the confidence
#   keep_ascii_letters      - chooses which pre-filter ``feed`` applies
#   alphabet                - not read by the prober code visible in this file
SingleByteCharSetModel = namedtuple(
    "SingleByteCharSetModel",
    "charset_name language char_to_order_map language_model "
    "typical_positive_ratio keep_ascii_letters alphabet",
)
|
|
|
|
|
class SingleByteCharSetProber(CharSetProber):
    """Score a byte stream against one single-byte charset model.

    Every fed byte is translated to an "order" through the model's
    ``char_to_order_map``.  Whenever two consecutive orders are both below
    ``SAMPLE_SIZE`` the pair is looked up in the model's ``language_model``
    and the counter for the resulting category is incremented.  Those
    counters drive ``get_confidence`` and the early FOUND_IT / NOT_ME
    decisions made in ``feed``.
    """

    # Orders strictly below this value take part in the pair statistics.
    SAMPLE_SIZE = 64
    # Number of counted pairs that must be exceeded before ``feed`` consults
    # either shortcut threshold.
    SB_ENOUGH_REL_THRESHOLD = 1024
    # Confidence above which ``feed`` moves to ProbingState.FOUND_IT.
    POSITIVE_SHORTCUT_THRESHOLD = 0.95
    # Confidence below which ``feed`` moves to ProbingState.NOT_ME.
    NEGATIVE_SHORTCUT_THRESHOLD = 0.05

    def __init__(self, model, is_reversed=False, name_prober=None):
        """Create a prober bound to ``model``.

        Args:
            model: object exposing the ``SingleByteCharSetModel`` fields.
            is_reversed: when true, the two orders of every pair are swapped
                before indexing ``language_model``.
            name_prober: optional object whose ``charset_name`` and
                ``language`` are reported in place of the model's.
        """
        super().__init__()
        self._model = model
        self._reversed = is_reversed
        self._name_prober = name_prober
        # Declare the running statistics here; reset() assigns real values.
        self._last_order = self._seq_counters = self._total_seqs = None
        self._total_char = self._control_char = self._freq_char = None
        self.reset()

    def reset(self):
        """Return the prober to its initial state and zero all statistics."""
        super().reset()
        # 255 is not below SAMPLE_SIZE, so the first byte fed after a reset
        # can never complete a pair.
        self._last_order = 255
        self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
        # Pairs counted / bytes whose order is below CharacterCategory.CONTROL.
        self._total_seqs = self._total_char = 0
        # NOTE: nothing in this class increments _control_char after reset.
        # _freq_char counts bytes whose order is below SAMPLE_SIZE.
        self._control_char = self._freq_char = 0

    @property
    def charset_name(self):
        """Charset name from the name prober if one is set, else the model."""
        name_source = self._name_prober or self._model
        return name_source.charset_name

    @property
    def language(self):
        """Language from the name prober if one is set, else the model."""
        name_source = self._name_prober or self._model
        return name_source.language

    def feed(self, byte_str):
        """Update the statistics with ``byte_str`` and return the state.

        The input is first passed through ``remove_xml_tags`` (when the model
        keeps ASCII letters) or ``filter_international_words`` (otherwise);
        both helpers are defined outside this class.  If nothing is left the
        current state is returned unchanged.

        Args:
            byte_str: iterable of byte values accepted by the filter helpers.

        Returns:
            The value of ``self.state`` after processing.
        """
        model = self._model
        if model.keep_ascii_letters:
            byte_str = self.remove_xml_tags(byte_str)
        else:
            byte_str = self.filter_international_words(byte_str)
        if not byte_str:
            return self.state

        order_of = model.char_to_order_map
        pair_table = model.language_model
        sample_size = self.SAMPLE_SIZE
        for byte in byte_str:
            # Bytes missing from the map are treated as UNDEFINED.
            order = order_of.get(byte, CharacterCategory.UNDEFINED)
            if order < CharacterCategory.CONTROL:
                self._total_char += 1
            if order < sample_size:
                self._freq_char += 1
                previous = self._last_order
                if previous < sample_size:
                    # Both members of the pair are in sampling range.
                    self._total_seqs += 1
                    if self._reversed:
                        category = pair_table[order][previous]
                    else:
                        category = pair_table[previous][order]
                    self._seq_counters[category] += 1
            self._last_order = order

        # The log messages use the model's own name, not the name prober's.
        charset_name = model.charset_name
        still_detecting = self.state == ProbingState.DETECTING
        if not still_detecting or self._total_seqs <= self.SB_ENOUGH_REL_THRESHOLD:
            return self.state

        # Enough pairs have been seen: allow an early verdict either way.
        confidence = self.get_confidence()
        if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
            self.logger.debug(
                "%s confidence = %s, we have a winner", charset_name, confidence
            )
            self._state = ProbingState.FOUND_IT
        elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
            self.logger.debug(
                "%s confidence = %s, below negative shortcut threshold %s",
                charset_name,
                confidence,
                self.NEGATIVE_SHORTCUT_THRESHOLD,
            )
            self._state = ProbingState.NOT_ME

        return self.state

    def get_confidence(self):
        """Return the current confidence that the input matches the model.

        Returns:
            0.01 while no pair has been counted.  Otherwise the POSITIVE
            count plus a quarter of the LIKELY count, divided by the number
            of pairs and by the model's ``typical_positive_ratio``, then
            scaled by the non-control and in-range character fractions.
            A result of 1.0 or more is reported as 0.99.
        """
        if self._total_seqs <= 0:
            return 0.01

        counters = self._seq_counters
        weighted_hits = (
            counters[SequenceLikelihood.POSITIVE]
            + 0.25 * counters[SequenceLikelihood.LIKELY]
        )
        r = weighted_hits / self._total_seqs / self._model.typical_positive_ratio
        # Scale down by the share of control characters.  _control_char is
        # never incremented in this class, so this factor is 1 unless the
        # attribute is modified elsewhere.
        r = r * (self._total_char - self._control_char) / self._total_char
        # Scale down by the share of characters outside the sampling range.
        r = r * self._freq_char / self._total_char
        return 0.99 if r >= 1.0 else r
|
|