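"""Streamlit demo: detecting contact information (URLs, e-mails, phone numbers)
in free-form text with three approaches: RuBERT embeddings + CatBoost,
TF-IDF + CatBoost, and plain regular expressions."""
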
import os
import pickle
import re
import string

import streamlit as st

import numpy as np
import pandas as pd

import catboost

import torch
from transformers import AutoTokenizer, AutoModel

# Not used directly below, but documents the scikit-learn dependency of the
# pickled vectorizer loaded in get_tf_idf_pred (presumably a fitted TfidfVectorizer).
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk import WordNetLemmatizer


def check_nltk():
	"""Download the WordNet data needed for lemmatization if it is not already present."""
	try:
		nltk.data.find('corpora/wordnet')
		nltk.data.find('corpora/omw-1.4')
	except LookupError:
		nltk.download("omw-1.4")
		nltk.download("wordnet")


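# Streamlit re-runs this script on every user interaction; moving these loads into
# a function decorated with @st.cache_resource would avoid reloading the models each time.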
with st.spinner('🌀 Загружаю данные...'):  # "Loading data..."
	check_nltk()
	data = pd.read_csv('data/data.csv')  # source texts with a 'description' column
	embeddings = pd.read_csv('data/features_emb.csv')  # precomputed embedding features
	preds = pd.read_csv('data/catboost_preds.csv')  # precomputed CatBoost predictions
	# Two CatBoost heads: one trained on BERT embeddings, one on TF-IDF features.
	catboost_bert_model = catboost.CatBoostClassifier().load_model('src/model/catboost.cbm')
	catboost_tf_idf_model = catboost.CatBoostClassifier().load_model('src/model/tf_idf_catboost.cbm')
	bert_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-conversational")
	bert_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased-conversational")
	# Fitted TF-IDF vocabulary (term -> feature index) exported to CSV.
	tf_idf_vocab = pd.read_csv('data/tf_idf_vocab.csv', index_col='Unnamed: 0')


def get_random_message() -> str:
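	"""Return a random 'description' from the loaded dataset."""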
	return data.sample(1)['description'].values[0]


def get_bert_prediction(text: str) -> str:
	"""Classify a text with the CatBoost model trained on RuBERT [CLS] embeddings."""

	res_mapper = {
		0: 'Контактная информация отсутствует',  # "No contact information"
		1: 'Есть контактная информация'  # "Contact information present"
	}

	tokens = bert_tokenizer.encode(
		text,
		add_special_tokens=True,
		truncation=True,
		max_length=512
	)

	n = 512  # maximum sequence length; shorter inputs are right-padded with zeros

	padded = torch.LongTensor([tokens + [0] * (n - len(tokens))])

	# Attend only to real tokens, not to the zero padding.
	attention_mask = (padded != 0).long()

	with torch.no_grad():
		# The last hidden state of the [CLS] token serves as the sentence embedding.
		batch_embeddings = bert_model(padded, attention_mask=attention_mask)[0][:, 0, :].numpy()

	return res_mapper.get(int(catboost_bert_model.predict(batch_embeddings)[0]))


def get_tf_idf_pred(text: str) -> str:
	"""Classify a text with the CatBoost model trained on TF-IDF features."""

	res_mapper = {
		0: 'Контактная информация отсутствует',  # "No contact information"
		1: 'Есть контактная информация'  # "Contact information present"
	}

	if not text:
		return res_mapper.get(0)

	def remove_symbols(data):
		return re.sub('[/*,;-]', '', data)

	def remove_punc(data):
		trans = str.maketrans('', '', string.punctuation)
		return data.translate(trans)

	def white_space(data):
		return ' '.join(data.split())

	def lemmatization(data):
		lemmatizer = WordNetLemmatizer()  # instantiate once, not per word
		return ' '.join(lemmatizer.lemmatize(word) for word in data.split())

	def complete_noise(data):
		# The same cleaning pipeline that was applied at training time.
		new_data = remove_symbols(data)
		new_data = remove_punc(new_data)
		new_data = white_space(new_data)
		new_data = lemmatization(new_data)
		return new_data

	text = complete_noise(text)
	# NB: the vectorizer is re-read on every call; it could be loaded once at startup.
	with open('src/model/tf_idf.pk', 'rb') as fin:
		tf_idf = pickle.load(fin)
	# Restore the fitted vocabulary (term -> feature index) saved as CSV.
	tf_idf.vocabulary_ = tf_idf_vocab.to_dict()['0']
	bag_of_words = tf_idf.transform([text])

	try:
		return res_mapper.get(int(catboost_tf_idf_model.predict(bag_of_words)[0]))
	except Exception:
		# "The message contains words that are missing from the TF-IDF vocabulary."
		return 'В сообщении встречаются слова, отсутствующие в вокабуляре TF-IDF.'


def get_re_pred(text: str) -> str:
	"""Rule-based check for URLs (incl. IPv4/IPv6 hosts), e-mail addresses and phone numbers."""

	# Matches http(s) URLs, bare domains, IPv4 and IPv6 addresses, with optional port and path.
	url_pattern = re.compile(r'\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b')
	email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+(?:\.[\w]+)+')
	# Common 7- and 10-digit phone formats, with optional separators and parentheses.
	phone_pattern = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')

	if url_pattern.search(text):
		return 'Есть контактная информация (url)'  # contact info found: URL
	elif email_pattern.search(text):
		return 'Есть контактная информация (mail)'  # contact info found: e-mail
	elif phone_pattern.search(text):
		return 'Есть контактная информация (phone)'  # contact info found: phone
	else:
		return 'Контактная информация отсутствует'  # no contact information
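

# A minimal UI sketch (hypothetical, not part of the original file) showing how the
# three predictors might be compared side by side on the same message:
#
# message = st.text_area('Текст сообщения', value=get_random_message())  # "Message text"
# if st.button('Проверить'):  # "Check"
# 	st.write('BERT + CatBoost:', get_bert_prediction(message))
# 	st.write('TF-IDF + CatBoost:', get_tf_idf_pred(message))
# 	st.write('RegExp:', get_re_pred(message))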