Spaces:

Seppukku
/

nlp_project_gpt_team

Running

App Files Files Community

nlp_project_gpt_team / funcs /sasha_funcs.py

Seppukku

initial commit

8fb2bb2 11 months ago

raw

history blame contribute delete

2.61 kB

	import torch
	from transformers import AutoTokenizer, AutoModel
	import numpy as np
	import re
	import string
	import time

	# Matches one or more consecutive characters drawn from the Unicode ranges
	# below (emoji and related pictographic symbols). Ranges are listed in
	# code-point order and each range appears exactly once.
	emoji_pattern = re.compile(
	    "["
	    "\U00002600-\U000026FF"  # Miscellaneous Symbols
	    "\U00002700-\U000027BF"  # Dingbats
	    "\U00002B50-\U00002B55"  # Star/circle subset of Miscellaneous Symbols and Arrows
	    "\U0001F1E0-\U0001F1FF"  # Regional indicator symbols (flag sequences)
	    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
	    "\U0001F600-\U0001F64F"  # Emoticons
	    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
	    "\U0001F700-\U0001F77F"  # Alchemical Symbols
	    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
	    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
	    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
	    "\U0001FA00-\U0001FA6F"  # Chess Symbols
	    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
	    "]+",
	    flags=re.UNICODE,
	)


	# Patterns whose every match is replaced by a single space, applied in order.
	_SPACED_OUT_PATTERNS = (
	    re.compile(r"http\S+"),  # links
	    re.compile(r"@\w+"),  # user mentions
	    re.compile(r"#\w+"),  # hashtags
	    re.compile(r"\d+"),  # numbers
	)
	# Deletes ASCII punctuation outright (no replacement character).
	_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
	# Any single character that is neither a lowercase Russian letter nor a space.
	_NON_CYRILLIC_RE = re.compile(r"[^а-яё ]")


	def clean(text):
	    """Normalize raw text down to lowercase Russian letters and spaces.

	    Steps, in order:
	      1. Lowercase the text.
	      2. Replace each link, @mention, #hashtag and run of digits with one space.
	      3. Delete ASCII punctuation (``string.punctuation``) without inserting
	         a space, so ``"привет,мир"`` becomes ``"приветмир"``.
	      4. Replace every remaining character outside ``а-яё`` and space with a
	         space, one for one. Runs of spaces are not collapsed and the result
	         is not stripped.

	    The earlier version also ran an HTML-tag regex, a ``«»—`` replacement, a
	    second ``lower()`` and an emoji removal. None of them could change the
	    result: ``<`` and ``>`` were already deleted by step 3 before the tag
	    regex ran, and step 4 already turns every non-Cyrillic character
	    (quotes, dashes, emoji, Latin letters) into a space. They were removed;
	    the output is the same for every input.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string, containing only ``а-я``, ``ё`` and spaces.
	    """
	    text = text.lower()
	    for pattern in _SPACED_OUT_PATTERNS:
	        text = pattern.sub(" ", text)
	    text = text.translate(_PUNCTUATION_TABLE)
	    return _NON_CYRILLIC_RE.sub(" ", text)

	def predict_class(text, model_to_embed, model_to_predict, tokenizer):
	    """Classify a text into one of five topics and report confidence and timing.

	    The text is passed through ``clean``, tokenized to a fixed length of 64
	    tokens (truncated / padded), and embedded with ``model_to_embed``. The
	    hidden state at sequence position 0 is normalized and handed to
	    ``model_to_predict`` as a NumPy array.

	    Args:
	        text: Raw input string.
	        model_to_embed: Callable model exposing ``.device`` and returning an
	            object with ``last_hidden_state`` (presumably a Hugging Face
	            ``AutoModel`` -- confirm against the caller).
	        model_to_predict: Classifier exposing ``predict`` and
	            ``predict_proba``; its predicted label is used as an index into
	            the class-name list below.
	        tokenizer: Callable tokenizer returning a mapping of tensors when
	            called with ``return_tensors='pt'``.

	    Returns:
	        A string of the form
	        ``'Predicted class: <name>, Confidence: <p>, Time: <seconds>c'``.
	    """
	    # perf_counter is monotonic, so the measured interval cannot be distorted
	    # by system clock adjustments the way time.time() can.
	    started = time.perf_counter()

	    class_list = ('Крипта', 'Мода', 'Спорт', 'Технологии', 'Финансы')

	    encoded_input = tokenizer(
	        clean(text),
	        max_length=64,
	        truncation=True,
	        padding='max_length',
	        return_tensors='pt',
	    )
	    # Move every input tensor to the device the embedding model lives on.
	    encoded_input = {k: v.to(model_to_embed.device) for k, v in encoded_input.items()}

	    with torch.no_grad():
	        model_output = model_to_embed(**encoded_input)

	    # Take the hidden state at sequence position 0, then normalize it
	    # (torch default: L2 norm along dim 1).
	    embeddings = torch.nn.functional.normalize(model_output.last_hidden_state[:, 0, :])
	    embeddings_np = embeddings.cpu().numpy()

	    pred_class = model_to_predict.predict(embeddings_np)
	    pred_proba = model_to_predict.predict_proba(embeddings_np)

	    # Flatten and coerce to a plain int so the lookup also works when the
	    # classifier returns a column-shaped or float-typed label array.
	    class_index = int(np.ravel(pred_class)[0])
	    confidence = np.max(pred_proba)

	    elapsed_time = time.perf_counter() - started

	    return f'Predicted class: {class_list[class_index]}, Confidence: {confidence:.4f}, Time: {round(elapsed_time, 4)}c'