# Spaces: Running (page-viewer residue captured with the file; not part of the module)
import json | |
import spacy | |
import gensim | |
import pymorphy3 | |
import streamlit as st | |
from pickle import load | |
from transformers import pipeline | |
from summarizer import Summarizer | |
from torch import cuda, device | |
# Torch device handed to the fill-mask pipeline in `load_bert`: CUDA when a
# GPU is actually usable, otherwise CPU.
# `cuda.is_available` has to be *called*: the bare function object is always
# truthy, so testing it directly would pick 'cuda' even on CPU-only hosts.
# NOTE: this rebinds the imported `device` class name to the chosen instance.
device = device('cuda' if cuda.is_available() else 'cpu')
def load_morph():
    """Create and return a ``pymorphy3.MorphAnalyzer`` configured with ``lang='ru'``."""
    return pymorphy3.MorphAnalyzer(lang='ru')
def load_w2v(model):
    """Load binary word2vec vectors for the requested model.

    A Streamlit spinner is displayed while the function runs.

    Args:
        model: Model identifier. ``'model1'`` selects
            ``language_data/model1.gz``; any other value selects
            ``language_data/model2.gz``.

    Returns:
        The object returned by
        ``gensim.models.KeyedVectors.load_word2vec_format`` for that file.
    """
    with st.spinner('Загружаю языковую модель'):
        # Only 'model1' is special-cased; everything else uses the model2 file.
        model_path = r'language_data/model1.gz' if model == 'model1' else r'language_data/model2.gz'
        return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
def load_spacy():
    """Load and return the spaCy ``ru_core_news_lg`` pipeline.

    A Streamlit spinner is displayed while the pipeline is loading.
    """
    with st.spinner('Загружаю морфо-синтаксический парсер'):
        return spacy.load('ru_core_news_lg')
def load_bert():
    """Build and return the ``fill-mask`` transformers pipeline.

    The pipeline is created for the
    ``a-v-bely/ruBert-base-finetuned-russian-moshkov-child-corpus-pro`` model
    and placed on the module-level ``device``. A Streamlit spinner is
    displayed while it is being created.
    """
    with st.spinner('Загружаю языковую модель'):
        return pipeline(
            task="fill-mask",
            model="a-v-bely/ruBert-base-finetuned-russian-moshkov-child-corpus-pro",
            device=device,
        )
def load_summarizer():
    """Return a new ``Summarizer`` instance created with its default arguments."""
    summarizer = Summarizer()
    return summarizer
def load_classifiers(model):
    """Unpickle the scaler, the classifier and the POS dictionary for a model.

    Args:
        model: Model identifier. ``'model1'`` and ``'model2'`` select their
            own scaler/classifier files; any other value selects the
            ``model3`` files.

    Returns:
        A tuple ``(pos_dict, scaler, classifier)``. ``pos_dict`` is always
        read from ``language_data/pos_dict.pickle``.
    """
    if model == 'model1':
        artefact_paths = (
            'language_data/model1_no_wn_minmaxscaler.pickle',
            'language_data/model1_no_wn_catboost_classifier.pickle',
        )
    elif model == 'model2':
        artefact_paths = (
            'language_data/model2_no_wn_minmaxscaler.pickle',
            'language_data/model2_no_wn_catboost_classifier.pickle',
        )
    else:
        # Every other identifier falls back to the model3 files.
        artefact_paths = (
            'language_data/model3_no_wn_minmaxscaler.pickle',
            'language_data/model3_no_wn_catboost_classifier.pickle',
        )
    scaler_path, classifier_path = artefact_paths

    # All three files are opened before anything is unpickled, so a missing
    # file is reported before any object is loaded.
    # NOTE(review): unpickling can execute code stored in the file; these
    # files are presumably trusted project assets -- confirm.
    with open(scaler_path, 'rb') as scaler_file:
        with open(classifier_path, 'rb') as classifier_file:
            with open('language_data/pos_dict.pickle', 'rb') as pos_dict_file:
                scaler = load(scaler_file)
                classifier = load(classifier_file)
                pos_dict = load(pos_dict_file)
    return pos_dict, scaler, classifier
# Shared resources created once, when the module is imported. The loaders
# are called in this order: spaCy parser, morphological analyzer, summarizer.
nlp = load_spacy()
morph = load_morph()
summarization = load_summarizer()

# Bare archive file names of the two word2vec models.
w2v_model1_path, w2v_model2_path = r'model1.gz', r'model2.gz'
# Load the stop list: every line of the file, stripped of surrounding
# whitespace, becomes one entry of the set.
with open(r'language_data/stop_words.txt', 'r', encoding='utf-8') as read_file:
    stop_list = {line.strip() for line in read_file}
# Load the per-level word minimums (A1 ... C2). Each level has a source file
# and a set that receives one stripped line of that file per entry.
a1_path = r'language_data/A1_MINIMUM.txt'
a2_path = r'language_data/A2_MINIMUM.txt'
b1_path = r'language_data/B1_MINIMUM.txt'
b2_path = r'language_data/B2_MINIMUM.txt'
c1_path = r'language_data/C1_MINIMUM.txt'
c2_path = r'language_data/C2_MINIMUM.txt'

a1_target_set = set()
a2_target_set = set()
b1_target_set = set()
b2_target_set = set()
c1_target_set = set()
c2_target_set = set()

# The two tuples are parallel: position k of one belongs to position k of the other.
minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)

for minimum_path, minimum_set in zip(minimums_paths, minimums_sets):
    with open(minimum_path, 'r', encoding='utf-8') as read_file:
        # Fill the set in place so the *_target_set names see the words.
        minimum_set.update(line.strip() for line in read_file)
# Maps a level label to a pair of sets: (the level's own minimum, that minimum
# united with the minimum of the level directly below it). A1 has no lower
# level, so both entries are the very same set object. 'Без уровня' has no sets.
# NOTE(review): the second element adds only the adjacent lower level (e.g.
# the B1 entry is B1 | A2 and does not contain A1-only words) -- confirm that
# this is intended rather than a cumulative union.
MINIMUM_SETS = {
    'A1': (a1_target_set, a1_target_set),
    'A2': (a2_target_set, a2_target_set | a1_target_set),
    'B1': (b1_target_set, b1_target_set | a2_target_set),
    'B2': (b2_target_set, b2_target_set | b1_target_set),
    'C1': (c1_target_set, c1_target_set | b2_target_set),
    'C2': (c2_target_set, c2_target_set | c1_target_set),
    'Без уровня': (None, None),
}
# Ordinal number of each level label, counting from 1 (A1) up to 6 (C2).
LEVEL_NUMBERS = {
    level: number
    for number, level in enumerate(('A1', 'A2', 'B1', 'B2', 'C1', 'C2'), start=1)
}
# Phrase inventory: the 'PHRASES' entry of the JSON file, stored as a set.
with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
    PHRASES = set(json.loads(f.read())['PHRASES'])

# Starts out empty; nothing in the visible part of the module fills it.
BAD_USER_TARGET_WORDS = []
# Levels whose rule set also pairs ADV with ADJ and ADJ with ADV.
_ADJ_ADV_LEVELS = ('A1', 'A2', 'B1', 'Без уровня')


def _pos_rules(level):
    """Return a fresh part-of-speech rule dict for one level label."""
    rules = {'VERB': ['AUX'], '': ['VERB']}
    if level in _ADJ_ADV_LEVELS:
        rules['ADV'] = ['ADJ']
        rules['ADJ'] = ['ADV']
    return rules


# Part-of-speech pairing rules, keyed first by target kind ('simple' or
# 'phrase') and then by level label. Both kinds currently hold identical
# rules. Every entry is its own dict with its own lists, so changing one
# entry never affects another.
COMBINE_POS = {
    kind: {
        level: _pos_rules(level)
        for level in ('A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'Без уровня')
    }
    for kind in ('simple', 'phrase')
}