import string
from random import random, sample

from utilities_language_general.rus_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
from utilities_language_general.rus_utils import (get_tags, check_token, define_gender, convert_gender,
                                                  make_inflection, get_distractors_from_model)


class SENTENCE:
    def __init__(self, original: str, n_sentence: int, max_num_distractors: int):
        self.original = original
        self.n_sentence = n_sentence
        self.max_num_distractors = max_num_distractors
        self.parsed = nlp(self.original)
        self.sentence_lemma_pos = []
        self.sentence_phrases = []
        self.target_words = []

    def lemmatize_sentence(self):
        # Collect (lemma_POS, token) pairs, e.g. ('дом_NOUN', <Token дом>),
        # which bind_phrases() later matches against the PHRASES inventory.
        for token in self.parsed:
            lemma_pos = f'{token.lemma_}_{token.pos_}'
            self.sentence_lemma_pos.append((lemma_pos, token))

    def bind_phrases(self):
        # Merge adjacent tokens into known phrases; standalone tokens are kept as-is.
        # NB: the loop stops at len - 1, so the final token only enters the list
        # as the second element of a phrase.
        previous_was_phrase = False
        for i in range(len(self.sentence_lemma_pos) - 1):
            phrase_candidate = f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}'
            if phrase_candidate in PHRASES and not previous_was_phrase:
                # phrase is [phrase_lemma, {'original_token1': spacy.Token, 'original_token2': spacy.Token}]
                phrase = [
                    phrase_candidate,
                    {
                        'original_token1': self.sentence_lemma_pos[i][1],
                        'original_token2': self.sentence_lemma_pos[i + 1][1]
                    }
                ]
                self.sentence_phrases.append(phrase)
                previous_was_phrase = True
            else:
                # Skip the second token of a bound phrase; append standalone tokens.
                if not previous_was_phrase:
                    self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
                previous_was_phrase = False
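
    # Illustrative shape (hedged): if PHRASES contained a hypothetical key
    # 'добрый_ADJ_день_NOUN', a sentence whose consecutive tokens lemmatize to
    # 'добрый_ADJ' and 'день_NOUN' would add an entry like
    #     ['добрый_ADJ_день_NOUN', {'original_token1': <Token>, 'original_token2': <Token>}]
    # to sentence_phrases, while every other token is appended as a bare spaCy Token.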

    def search_target_words_automatically(self, model, target_minimum: set,
                                          frequency_dict: dict = None, summary: list = None):
        for token in self.sentence_phrases:
            # TODO: we still do not have a w2v model with phrases,
            # so there are no selection criteria for them yet
            if isinstance(token, list):  # token is a phrase
                original_token1 = token[1]['original_token1']
                original_token2 = token[1]['original_token2']
                original_token1_tags = get_tags(original_token1.text)[0]
                original_token2_tags = get_tags(original_token2.text)[0]
                tags = original_token1_tags | original_token2_tags
                not_ner = original_token1.ent_type == 0 and original_token2.ent_type == 0
                target_word = {
                    'sentence_number': self.n_sentence,
                    'sentence_text': self.original,
                    'original_text': f'{original_token1.text} {original_token2.text}',
                    'lemma': token[0],
                    'pos': ('phrase', [original_token1.pos_, original_token2.pos_]),
                    # NB: set order is arbitrary, so the pick is arbitrary when the two genders differ
                    'gender': list({define_gender(original_token1), define_gender(original_token2)})[0],
                    'tags': tags,
                    'position_in_sentence': self.original.find(original_token1.text),
                    'not_named_entity': not_ner,
                    'frequency_in_text': 0,
                    'in_summary': summary is not None and self.original in summary
                }
                self.target_words.append(target_word)
            else:  # token is a plain spaCy token
                if check_token(model=model, token=token, lemma_pos='auto', current_minimum=target_minimum):
                    target_word = {
                        'sentence_number': self.n_sentence,
                        'sentence_text': self.original,
                        'original_text': token.text,
                        'lemma': token.lemma_,
                        'pos': ('simple', token.pos_),
                        'gender': define_gender(token.lemma_),
                        'number_children': len(list(token.children)),
                        'tags': get_tags(token.text)[0],
                        'position_in_sentence': self.original.find(token.text),
                        'not_named_entity': token.ent_type == 0,
                        'frequency_in_text': frequency_dict.get(token.lemma_, 1) if frequency_dict else 1,
                        'in_summary': summary is not None and self.original in summary
                    }
                    self.target_words.append(target_word)

    def search_user_target_words(self, model, user_target_words: set = None,
                                 frequency_dict: dict = None, summary: list = None):
        for _utw in user_target_words:
            if _utw in self.original:
                parse_utw = nlp(_utw)
                gender = convert_gender(parse_utw[0].morph.to_dict().get('Gender'))
                if ' ' in _utw:  # the user target word is a phrase
                    tags = get_tags(parse_utw[0].text)[0] | get_tags(parse_utw[1].text)[0]
                    user_target_word_lemma = '_'.join([f'{token.lemma_}_{token.pos_}' for token in parse_utw])
                    user_target_word_pos = ('phrase', [token.pos_ for token in parse_utw])
                    user_target_word_tags = tags
                    not_ner = parse_utw[0].ent_type == 0 and parse_utw[1].ent_type == 0
                else:
                    user_target_word_lemma = f'{parse_utw[0].lemma_}_{parse_utw[0].pos_}'
                    user_target_word_pos = ('simple', parse_utw[0].pos_)
                    user_target_word_tags = get_tags(parse_utw[0].text)[0]
                    not_ner = parse_utw[0].ent_type == 0
                target_word = {
                    'sentence_number': self.n_sentence,
                    'sentence_text': self.original,
                    'original_text': _utw,
                    'lemma': user_target_word_lemma,
                    'pos': user_target_word_pos,
                    'gender': gender if gender else 'masc',  # default to masculine when undetected
                    'tags': user_target_word_tags,
                    'position_in_sentence': self.original.find(_utw),
                    'not_named_entity': not_ner,
                    'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1) if frequency_dict else 1,
                    'in_summary': summary is not None and self.original in summary
                }
                # Keep the word only if the embedding model knows its lemma.
                if not (model.has_index_for(user_target_word_lemma)
                        or model.has_index_for(f'{user_target_word_lemma}_{user_target_word_pos[1]}')):
                    BAD_USER_TARGET_WORDS.append(_utw)
                else:
                    self.target_words.append(target_word)

    def search_target_words(self, model, target_words_automatic_mode: bool, target_minimum,
                            user_target_words: set = None,
                            frequency_dict: dict = None, summary: list = None):
        if target_words_automatic_mode:
            self.search_target_words_automatically(model=model, target_minimum=target_minimum,
                                                   frequency_dict=frequency_dict, summary=summary)
        else:
            self.search_user_target_words(model=model, user_target_words=user_target_words,
                                          frequency_dict=frequency_dict, summary=summary)

    def attach_distractors_to_target_word(self, model, scaler, classifier, pos_dict, global_distractors,
                                          distractor_minimum, level_name, max_frequency, logs, progress):
        n_target_words = len(self.target_words)
        bad_target_words = []
        for i, target_word in enumerate(self.target_words):
            pos = target_word['pos'][0] if target_word['pos'][0] == 'phrase' else target_word['pos'][1]
            distractors = get_distractors_from_model(doc=self.parsed, model=model, scaler=scaler,
                                                     classifier=classifier, pos_dict=pos_dict,
                                                     target_text=target_word['original_text'],
                                                     lemma=target_word['lemma'], pos=pos,
                                                     gender=target_word['gender'],
                                                     lemma_index=target_word['position_in_sentence'],
                                                     global_distractors=global_distractors,
                                                     distractor_minimum=distractor_minimum,
                                                     level_name=level_name,
                                                     max_num_distractors=self.max_num_distractors)
            if distractors is None or target_word['frequency_in_text'] > max_frequency:
                bad_target_words.append(target_word)
            target_word['distractors'] = distractors
            target_word['distractors_number'] = len(distractors) if distractors is not None else 0
            progress.progress(i / n_target_words)
            logs.update(label=f'Processed {i}/{n_target_words} words in sentence {self.n_sentence + 1}',
                        state='running')
        # Discard target words that got no usable distractors or are too frequent.
        for btw in bad_target_words:
            BAD_USER_TARGET_WORDS.append(btw['original_text'])
            self.target_words.remove(btw)
        progress.progress(100)
        logs.update(label=f'Processed {n_target_words}/{n_target_words} words in sentence {self.n_sentence + 1}',
                    state='running')

    def inflect_distractors(self, level_name):
        bad_target_words = []
        for target_word in self.target_words:
            inflected_distractors = []
            for distractor_lemma, distractor_similarity in target_word['distractors']:
                # TODO: phrase distractors (lemmas with more than one '_') currently take
                # the same path as single words; a phrase-aware model is needed to test them.
                inflected = make_inflection(text=distractor_lemma, level=level_name,
                                            pos=target_word['pos'][1], tags=target_word['tags'])
                if inflected is not None:
                    inflected_distractors.append(inflected)
            num_distractors = min(4, self.max_num_distractors)
            if len(inflected_distractors) < num_distractors:
                bad_target_words.append(target_word)
            else:
                target_word['inflected_distractors'] = inflected_distractors
        for btw in bad_target_words:
            BAD_USER_TARGET_WORDS.append(btw['original_text'])
            self.target_words.remove(btw)

    def filter_target_words(self, target_words_automatic_mode):
        # In automatic mode, drop target words that start within the first five
        # characters of the sentence (but not at position 0); in user mode the
        # difference threshold is 0, so nothing is filtered out.
        c_position = 0
        bad_target_words = []
        for target_word in self.target_words:
            position_difference = 5 if target_words_automatic_mode else 0
            if not (target_word['position_in_sentence'] == 0
                    or abs(target_word['position_in_sentence'] - c_position) >= position_difference):
                bad_target_words.append(target_word)
        for btw in bad_target_words:
            BAD_USER_TARGET_WORDS.append(btw['original_text'])
            self.target_words.remove(btw)

    def sample_distractors(self, num_distractors):
        # Sample the final distractor set from at most the first ten candidates.
        for target_word in self.target_words:
            len_inflected_distractors = len(target_word['inflected_distractors'])
            num_distractors = min(len_inflected_distractors, num_distractors) \
                if num_distractors >= 4 else num_distractors
            target_word['inflected_distractors'] = sample(
                target_word['inflected_distractors'][:min(len_inflected_distractors, 10)], num_distractors)


class TASK:
    def __init__(self, task_data):
        self.task_data = task_data
        self.original_text = None
        self.sentence_text = None
        self.inflected_distractors = None
        self.sentence_number = task_data['sentence_number']
        self.position_in_sentence = task_data['position_in_sentence']
        self.result = ''
        self.variants = []
        # Promote every task_data field to an attribute, overwriting the defaults above.
        for key, value in task_data.items():
            self.__setattr__(key, value)

    def __repr__(self):
        return '\n'.join([f'{key}\t=\t{value}' for key, value in self.__dict__.items()])

    def compile_task(self, max_num_distractors):
        len_distractors = len(self.inflected_distractors)
        len_variants = min(len_distractors, max_num_distractors) if max_num_distractors > 4 \
            else max_num_distractors
        # One letter label per option, including the correct answer.
        letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
        try:
            distractors = sample(self.inflected_distractors, len_variants) + [self.original_text]
        except ValueError:
            # Fewer distractors available than requested: use them all.
            distractors = self.inflected_distractors + [self.original_text]
        # Shuffle the options and pair each with its letter label.
        self.variants.append(
            (self.original_text, [f'{item[0]} {item[1].replace("_", " ")}'
                                  for item in zip(letters, sorted(distractors, key=lambda _: random()))]))
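

# Hedged usage sketch, not part of the original module: assumes `nlp` loaded
# successfully and that the heavier steps (target-word search, distractor
# attachment, inflection) are configured elsewhere with a gensim
# KeyedVectors-style model, scaler, classifier and POS dictionary.
# All names and values below are illustrative only.
if __name__ == '__main__':
    sentence = SENTENCE(original='Пример предложения.', n_sentence=0, max_num_distractors=4)
    sentence.lemmatize_sentence()
    sentence.bind_phrases()
    # After the search/attach/inflect steps have filled sentence.target_words,
    # each entry can be turned into a gap-fill task:
    for target_word_data in sentence.target_words:
        task = TASK(task_data=target_word_data)
        task.compile_task(max_num_distractors=4)
        print(task)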