import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import re
import string
import time
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    "\U00002700-\U000027BF"  # Dingbats
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002B50-\U00002B55"  # Miscellaneous Symbols and Arrows (star, circle)
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "]+",
    flags=re.UNICODE,
)
def clean(text):
    text = text.lower()                   # lowercase
    text = re.sub(r"http\S+", " ", text)  # remove links
    text = re.sub(r"@\w+", " ", text)     # remove user mentions
    text = re.sub(r"#\w+", " ", text)     # remove hashtags
    text = re.sub(r"\d+", " ", text)      # remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    text = re.sub(r"<.*?>", " ", text)    # remove HTML tags
    text = re.sub(r"[️«»—]", " ", text)   # remove variation selectors, guillemets, dashes
    # Strip emoji before the Cyrillic-only filter below; in the original order
    # the emoji pass ran after it and was a no-op.
    text = emoji_pattern.sub("", text)
    text = re.sub(r"[^а-яё ]", " ", text)  # keep only Russian letters and spaces
    return text
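# Illustrative example (hypothetical input, not from the original file):
#   clean("Привет @user! Смотри https://t.co/abc #крипта 2024 🚀")
# returns roughly "привет   смотри" — each removed token leaves a space
# behind, which the tokenizer below treats as ordinary whitespace.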
def predict_class(text, model_to_embed, model_to_predict, tokenizer):
    start_time = time.time()
    text = clean(text)
    # Class labels (Russian): Crypto, Fashion, Sports, Technology, Finance
    class_list = ['Крипта', 'Мода', 'Спорт', 'Технологии', 'Финансы']
    encoded_input = tokenizer(text, max_length=64, truncation=True, padding='max_length', return_tensors='pt')
    encoded_input = {k: v.to(model_to_embed.device) for k, v in encoded_input.items()}
    with torch.no_grad():
        model_output = model_to_embed(**encoded_input)
    embeddings = model_output.last_hidden_state[:, 0, :]     # [CLS] token embedding
    embeddings = torch.nn.functional.normalize(embeddings)   # L2-normalize
    embeddings_np = embeddings.cpu().numpy()
    pred_class = model_to_predict.predict(embeddings_np)
    pred_proba = model_to_predict.predict_proba(embeddings_np)
    confidence = np.max(pred_proba)
    elapsed_time = time.time() - start_time
    return f'Predicted class: {class_list[pred_class[0]]}, Confidence: {confidence:.4f}, Time: {round(elapsed_time, 4)}s'
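# Minimal usage sketch. The embedder checkpoint ("cointegrated/rubert-tiny2")
# and the classifier path ("classifier.joblib") are assumptions for
# illustration — this file does not specify which models the Space loads;
# it only requires a Hugging Face encoder and a scikit-learn-style
# classifier with predict/predict_proba trained on the same embeddings.
import joblib

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    embedder = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
    embedder.eval()
    classifier = joblib.load("classifier.joblib")  # hypothetical saved model
    print(predict_class("Биткоин снова обновил максимум", embedder, classifier, tokenizer))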