File size: 5,587 Bytes
8fb2bb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import time
import joblib
import re 
import string
import pymorphy3
import torch 
from transformers import BertModel, BertTokenizer
from torch import nn


# Hugging Face model id of a compact Russian BERT; used both for tokenizing
# input text and as the frozen feature extractor inside MyTinyBERT below.
model_name = "cointegrated/rubert-tiny2"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Base encoder; MyTinyBERT wraps this instance and freezes its parameters.
bert_model = BertModel.from_pretrained(model_name)


class MyTinyBERT(nn.Module):
    """Review classifier: a frozen rubert-tiny2 encoder followed by a small
    trainable MLP head mapping the [CLS] embedding to 6 class logits.

    NOTE: attribute names (``bert``, ``linear``) and the exact layer layout
    must not change — the saved state_dict is keyed on them.
    """

    def __init__(self):
        super().__init__()
        # Reuse the module-level encoder and freeze it: only the head trains.
        self.bert = bert_model
        for param in self.bert.parameters():
            param.requires_grad = False
        # Classification head; 312 is rubert-tiny2's hidden size.
        self.linear = nn.Sequential(
            nn.Linear(312, 256),
            nn.Sigmoid(),
            nn.Dropout(),
            nn.Linear(256, 6)
        )

    def forward(self, input_ids, attention_mask=None):
        """Return raw class logits of shape (batch, 6)."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # L2-normalize the [CLS] token embedding before the classifier head.
        cls_embedding = nn.functional.normalize(encoded.last_hidden_state[:, 0, :])
        return self.linear(cls_embedding)
    

# Fine-tuned weights for MyTinyBERT (encoder is frozen; only the head differs
# from the pretrained checkpoint).
weights_path = "models/clf_rewievs_bert.pt"

model = MyTinyBERT()
# CPU-only inference: remap any GPU-saved tensors onto the CPU.
model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu')))
model.to('cpu')

# Morphological analyzer used by lemmatize() for Russian normal forms.
morph = pymorphy3.MorphAnalyzer()

def lemmatize(text):
    """Return *text* with each whitespace-separated token replaced by its
    normal (dictionary) form according to pymorphy3."""
    normal_forms = (morph.parse(token)[0].normal_form for token in text.split())
    return " ".join(normal_forms)




# Classic TF-IDF + logistic-regression pipeline trained on restaurant reviews;
# both artifacts must come from the same training run.
logreg = joblib.load('models/logregmodel_restaurants.pkl')
vectorizer = joblib.load('models/tfidf_vectorizer_restaurants.pkl')

# Russian stop-word list: one word per whitespace-separated token.
with open(
    "funcs/stopwords-ru.txt", "r", encoding="utf-8"
) as file:
    stop_words = set(file.read().split())


# Maps predicted labels 1-5 to human-readable Russian ratings.
# NOTE(review): MyTinyBERT's head outputs 6 classes (0-5) while this dict
# only covers 1-5 — confirm how class 0 should be rendered.
rating_dict = {
    1: "Отвратительно",
    2: "Плохо",
    3: "Удовлетворительно",
    4: "Хорошо",
    5: "Великолепно",}


# Single character class covering the common emoji/pictograph Unicode blocks.
# NOTE(review): the Supplemental Symbols and Pictographs range appears twice
# (harmless inside a character class). Also note clean() strips all
# non-Cyrillic characters before applying this pattern, so it is currently
# redundant there.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    "\U00002700-\U000027BF"  # Dingbats
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002B50-\U00002B55"  # Miscellaneous Symbols and Pictographs
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs (duplicate)
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "]+",
    flags=re.UNICODE,
)

def clean(text, stopwords):
    """Normalize a raw Russian review for TF-IDF / lemmatization.

    Lower-cases the text, strips URLs, @mentions, #hashtags, digits,
    ASCII punctuation, HTML tags, typographic leftovers and every
    remaining non-Cyrillic character, then drops stop words.

    Parameters
    ----------
    text : str
        Raw review text.
    stopwords : collection of str
        Words dropped after cleaning (membership-tested per token).

    Returns
    -------
    str
        Space-joined cleaned tokens (may be empty).
    """
    text = text.lower()
    text = re.sub(r"http\S+", " ", text)  # URLs
    text = re.sub(r"@\w+", " ", text)     # user mentions
    text = re.sub(r"#\w+", " ", text)     # hashtags
    text = re.sub(r"\d+", " ", text)      # numbers
    # Strip ASCII punctuation in one C-level pass.
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"<.*?>", " ", text)    # HTML tags
    text = re.sub(r"[️«»—]", " ", text)   # typographic leftovers
    # Keep only Cyrillic letters and spaces. This also removes every emoji,
    # so the former emoji_pattern.sub() pass and the second .lower() call
    # were dead code and have been dropped (output is unchanged).
    text = re.sub(r"[^а-яё ]", " ", text)
    return " ".join(word for word in text.split() if word not in stopwords)


def predict_review(review):
    """Classify *review* with the TF-IDF + logistic-regression pipeline.

    Returns a tuple ``(label, human-readable rating, elapsed seconds)``
    and prints the same information to stdout.
    """
    started = time.time()

    # Normalize the text the same way the training data was prepared.
    lemmas = lemmatize(clean(review, stop_words))

    # Vectorize the single sample and predict its label.
    features = vectorizer.transform([lemmas])
    label = logreg.predict(features)[0]

    # Map the label to its display string, guarding unknown labels.
    rating = rating_dict.get(label, "Ошибка предсказания")

    elapsed = time.time() - started

    print(f"Лейбл: {label}")
    print(f"Оценка отзыва: {rating}")
    print(f"Затраченное время: {elapsed:.6f} seconds")
    return label, rating, elapsed


def preprocess_input(text):
    """Tokenize *text* into PyTorch tensors, truncated and padded to at
    most 512 tokens, ready for MyTinyBERT."""
    return tokenizer(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding=True,
    )


def predict_bert(text):
    """Classify *text* with the fine-tuned MyTinyBERT model.

    Returns
    -------
    tuple
        ``(predicted class index, human-readable rating, elapsed seconds)``.
    """
    start_time = time.time()

    model.eval()
    inputs = preprocess_input(text)

    # Inference is CPU-only here; keep the tensors on the CPU explicitly.
    inputs = {k: v.to('cpu') for k, v in inputs.items()}

    # Disable autograd for inference.
    with torch.no_grad():
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

    # The model returns raw logits directly (not a HF output object with
    # a .logits attribute), so argmax over the last dim gives the class.
    predicted_class = outputs.argmax(dim=-1).item()

    end_time = time.time()
    elapsed_time = end_time - start_time

    # BUG FIX: the head has 6 outputs (classes 0-5) but rating_dict only
    # covers keys 1-5; indexing raised KeyError for class 0. Fall back to
    # the same error string predict_review uses.
    rating = rating_dict.get(predicted_class, "Ошибка предсказания")
    return predicted_class, rating, elapsed_time