# Misconception1 / utils.py
# Uploaded by harrycool12 ("Upload 10 files", revision 0f5c20a, verified)
import torch
import numpy as np
from torch import nn
import random
import os
from datetime import datetime
import time
import math
import pandas as pd
import matplotlib.pyplot as plt
import yaml
from deepdiff import DeepDiff
from types import SimpleNamespace
from glob import glob
from sklearn.metrics import roc_auc_score
def sep():
    """Print a 100-character horizontal rule as a visual separator."""
    print("-" * 100)
def get_timediff(time1, time2):
    """Return the elapsed time between two timestamps as an 'MM:SS' string."""
    elapsed = time2 - time1
    minutes, seconds = divmod(elapsed, 60)
    return f"{int(minutes):02d}:{int(seconds):02d}"
def current_date_time():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    now = datetime.now()
    return now.strftime("%Y-%m-%d %H:%M:%S")
def seed_torch(seed=42):
    """Seed every RNG (Python, NumPy, PyTorch CPU/CUDA) for reproducibility.

    Parameters
    ----------
    seed : int, optional
        The seed applied to all random number generators (default 42).
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    torch.backends.cudnn.enabled = True
    # BUG FIX: benchmark autotuning picks algorithms nondeterministically,
    # which contradicts deterministic=True and defeats the seeding above.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
def init_logger(log_file='train.log'):
    """Create (or reconfigure) a message-only logger that writes to stdout and *log_file*.

    Parameters
    ----------
    log_file : str, optional
        Path of the log file to append to (default 'train.log').

    Returns
    -------
    logging.Logger
        The module logger, at INFO level, with exactly one stream handler
        and one file handler.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # BUG FIX: getLogger returns a cached logger, so calling this twice used
    # to stack duplicate handlers (every message logged N times) and kept
    # writing to the first call's file. Drop stale handlers first.
    logger.handlers.clear()
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger
def write_to_summary_log(summary_log_file, message):
    """Append *message* as a single line to the summary log file."""
    with open(summary_log_file, 'a+') as log_fh:
        print(message, file=log_fh)
class AverageMeter(object):
    """Tracks the most recent value plus a running sum, count, and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero out all accumulated statistics."""
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0

    def update(self, val, n=1):
        """Record *val* observed *n* times and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
def asMinutes(s: float):
    """Format a duration in seconds as 'Xm Ys'."""
    minutes, seconds = divmod(s, 60)
    return '%dm %ds' % (minutes, seconds)
def timeSince(since: float, percent: float):
    """Return elapsed time since *since* and the projected remaining time.

    *percent* is the fraction of total work completed; the ETA is a simple
    linear extrapolation of the elapsed time.
    """
    elapsed = time.time() - since
    projected_total = elapsed / percent
    remaining = projected_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))
def load_yaml(file_path):
    """Parse the YAML file at *file_path* and return its contents."""
    with open(file_path, 'r') as yaml_fh:
        return yaml.safe_load(yaml_fh)
def simple_namespace(cfg):
    """Convert a config dict (with one level of nested dicts) to SimpleNamespace.

    Only the top level and its immediate dict values are converted; deeper
    nesting is left as plain dicts. Note: nested values are also replaced
    inside *cfg* itself (in-place mutation), matching the original behavior.

    Parameters
    ----------
    cfg : dict
        Configuration mapping, e.g. loaded from YAML.

    Returns
    -------
    SimpleNamespace
        Attribute-style view of *cfg*.
    """
    for key, value in cfg.items():
        # BUG FIX: isinstance instead of `type(v) == dict`, so dict
        # subclasses (OrderedDict, YAML mapping types) are converted too.
        if isinstance(value, dict):
            cfg[key] = SimpleNamespace(**value)
    return SimpleNamespace(**cfg)
def get_parameter_number(model, unit='M'):
    """Summarize a model's total and trainable parameter counts.

    Parameters
    ----------
    model : torch.nn.Module
        The model to inspect.
    unit : {'M', 'K', 'B'}, optional
        Display unit: millions, thousands, or raw count (default 'M').

    Returns
    -------
    str
        e.g. 'Total params: 1.2M; Trainable params: 1.2M'.
    """
    total_num = sum(p.numel() for p in model.parameters())
    trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    div = {"M": 1e6, "K": 1e3, "B": 1}[unit]
    # BUG FIX: trainable count used floor division (`//div`), which rounded
    # any sub-unit count down to 0.0 before the `.1f` formatting.
    return f'Total params: {total_num/div:.1f}{unit}; Trainable params: {trainable_num/div:.1f}{unit}'
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)); works on scalars and numpy arrays."""
    neg_exp = np.exp(-x)
    return 1 / (1 + neg_exp)
import os
import zipfile
import json
def zip_files_in_directory(directory_path, zip_file_path, is_exclude_dirs=True):
    """Create a zip archive of the files under *directory_path*.

    Parameters
    ----------
    directory_path : str
        Directory whose files are archived. Archive member names are
        relative to this directory.
    zip_file_path : str
        Destination path of the zip archive (overwritten if present).
    is_exclude_dirs : bool, optional
        If True (default), only files directly in *directory_path* are
        archived and subdirectories are skipped entirely; if False, the
        whole tree is included.
    """
    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(directory_path):
            # Only archive files at the top level; do not descend into
            # subdirectories. (Deduplicates the original copy-pasted
            # branches into a single guard clause.)
            if is_exclude_dirs and root != directory_path:
                continue
            for file_name in files:
                file_path = os.path.join(root, file_name)
                zipf.write(file_path, os.path.relpath(file_path, directory_path))
# -----
# Credit: https://www.kaggle.com/code/abdullahmeda/eedi-map-k-metric
import numpy as np
def apk(actual, predicted, k=25):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
        A list of elements that are to be predicted (order doesn't matter)
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
        The average precision at k over the input lists
    """
    if not actual:
        return 0.0
    predicted = predicted[:k]  # only the first k predictions count
    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(predicted, start=1):
        # Count an item only if it is relevant AND not a duplicate of an
        # earlier prediction.
        if item in actual and item not in predicted[:rank - 1]:
            hits += 1.0
            precision_sum += hits / rank
    return precision_sum / min(len(actual), k)
def mapk(actual, predicted, k=25):
    """
    Computes the mean average precision at k.

    This function computes the mean average precision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
        A list of lists of elements that are to be predicted
        (order doesn't matter in the lists)
    predicted : list
        A list of lists of predicted elements
        (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
        The mean average precision at k over the input lists
    """
    per_query_scores = [apk(gold, preds, k) for gold, preds in zip(actual, predicted)]
    return np.mean(per_query_scores)
# ----
from torch import Tensor
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Pool each sequence's hidden state at its final non-padding token.

    Handles both padding conventions: if every row of the mask attends at
    the last position (left padding), the last position is taken directly;
    otherwise each row is indexed at its last attended token.
    """
    batch_size = last_hidden_states.shape[0]
    # Left padding iff the final mask column is all ones.
    is_left_padded = attention_mask[:, -1].sum() == batch_size
    if is_left_padded:
        return last_hidden_states[:, -1]
    last_positions = attention_mask.sum(dim=1) - 1
    row_indices = torch.arange(batch_size, device=last_hidden_states.device)
    return last_hidden_states[row_indices, last_positions]