import json
import spacy
import gensim
import pymorphy2
import streamlit as st
from transformers import pipeline
# Cached resource loaders: Streamlit re-runs the script on every interaction,
# so the heavy models are created once and reused via st.cache_resource.
@st.cache_resource
def load_morph():
    """Russian morphological analyzer."""
    return pymorphy2.MorphAnalyzer(lang='ru')


@st.cache_resource
def load_w2v(model_path):
    """Pretrained word2vec vectors in binary format."""
    return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)


@st.cache_resource
def load_spacy():
    """Large Russian spaCy pipeline."""
    return spacy.load('ru_core_news_lg')


@st.cache_resource
def load_bert():
    """Fill-mask pipeline with a fine-tuned ruBERT model."""
    return pipeline("fill-mask", model="a-v-white/ruBert-base-finetuned-russian-moshkov-child-corpus-pro")
nlp = load_spacy()
morph = load_morph()
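# Path to the gzip-compressed word2vec vectors (presumably passed to load_w2v later in the app).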
w2v_model_path = r'ALL_TOGETGER_phrases_s300_cw10_mc100_w4_negative5-075_mean_e10_shr.bin.gz'
# Load the stop-word list (one word per line).
stop_list = set()
with open(r'language_data/stop_words.txt', 'r', encoding='utf-8') as read_file:
    for line in read_file:
        stop_list.add(line.strip())
# Load the CEFR lexical minimums (one entry per line for each level).
a1_path, a1_target_set = r'language_data/A1_MINIMUM.txt', set()
a2_path, a2_target_set = r'language_data/A2_MINIMUM.txt', set()
b1_path, b1_target_set = r'language_data/B1_MINIMUM.txt', set()
b2_path, b2_target_set = r'language_data/B2_MINIMUM.txt', set()
c1_path, c1_target_set = r'language_data/C1_MINIMUM.txt', set()
c2_path, c2_target_set = r'language_data/C2_MINIMUM.txt', set()
minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
for path, target_set in zip(minimums_paths, minimums_sets):
    with open(path, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            target_set.add(line.strip())
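# Each level's distractor pool combines its own minimum with the previous level's minimum.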
a1_distractor_set = a1_target_set
a2_distractor_set = a2_target_set.union(a1_target_set)
b1_distractor_set = b1_target_set.union(a2_target_set)
b2_distractor_set = b2_target_set.union(b1_target_set)
c1_distractor_set = c1_target_set.union(b2_target_set)
c2_distractor_set = c2_target_set.union(c1_target_set)
with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
    PHRASES = set(json.load(f)['PHRASES'])
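# Similarity thresholds are read from Streamlit secrets and turned into small lookup dicts.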
SIMILARITY_VALUES_w2v = st.secrets['SIM_VAL_w2v']
SIMILARITY_VALUES_w2v = {SIMILARITY_VALUES_w2v[i]: SIMILARITY_VALUES_w2v[i + 1] for i in range(6)}
SIMILARITY_VALUES_bert = st.secrets['SIM_VAL_bert']
SIMILARITY_VALUES_bert = {SIMILARITY_VALUES_bert[i]: SIMILARITY_VALUES_bert[i + 1] for i in range(6)}
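# Presumably collects user-supplied target words that could not be used.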
BAD_USER_TARGET_WORDS = []