# NOTE: removed scrape artifact ("Spaces: Sleeping Sleeping" hosting-status text)
# Standard library
import math
import os
import random
import time
from datetime import datetime
from glob import glob
from types import SimpleNamespace

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import yaml
from deepdiff import DeepDiff
from sklearn.metrics import roc_auc_score
from torch import nn
def sep():
    """Print a 100-character horizontal separator line."""
    print(100 * "-")
def get_timediff(time1, time2):
    """Format the elapsed time between two timestamps as 'MM:SS'."""
    elapsed = time2 - time1
    minutes, seconds = divmod(elapsed, 60)
    return f"{int(minutes):02d}:{int(seconds):02d}"
def current_date_time():
    """Return the current local date and time as 'YYYY-MM-DD HH:MM:SS'."""
    now = datetime.now()
    return now.strftime("%Y-%m-%d %H:%M:%S")
def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs for reproducible runs.

    Fixed: ``cudnn.benchmark`` is now False. Benchmark mode autotunes
    convolution algorithms at runtime, which is nondeterministic and
    defeats ``cudnn.deterministic = True`` set just below.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False  # was True: nondeterministic autotuning
    torch.backends.cudnn.deterministic = True
def init_logger(log_file='train.log'):
    """Return the module logger, writing both to stderr and to `log_file`.

    Fixed: the original added a fresh StreamHandler and FileHandler on
    every call, so repeated calls duplicated every logged line; handlers
    are now attached only once. (Also dropped a no-op f-string default.)
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    if not logger.handlers:  # guard against duplicate handlers on re-init
        stream_handler = StreamHandler()
        stream_handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(stream_handler)
        file_handler = FileHandler(filename=log_file)
        file_handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(file_handler)
    return logger
def write_to_summary_log(summary_log_file, message):
    """Append `message` (plus a newline) to the summary log file."""
    with open(summary_log_file, 'a+') as log_fh:
        log_fh.write(f"{message}\n")
class AverageMeter(object):
    """Track the latest value, running sum, count, and mean of a series."""

    def __init__(self):
        self.reset()

    def reset(self):
        # Zero out all accumulated statistics.
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        # Record observation `val` seen `n` times and refresh the mean.
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
def asMinutes(s: float):
    """Render a duration in seconds as 'Xm Ys'."""
    minutes, seconds = divmod(s, 60)
    return '%dm %ds' % (minutes, seconds)
def timeSince(since: float, percent: float):
    """Report elapsed time and an estimate of time remaining.

    `since` is a start timestamp; `percent` is the completed fraction of
    the work (e.g. 0.25 when a quarter done).
    """
    elapsed = time.time() - since
    total_estimate = elapsed / (percent)
    remaining = total_estimate - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))
def load_yaml(file_path):
    """Parse a YAML file with the safe loader and return its contents."""
    with open(file_path, 'r') as yaml_fh:
        return yaml.safe_load(yaml_fh)
def simple_namespace(cfg):
    """Convert a config dict (one level of nesting) into SimpleNamespace.

    Nested dict values are converted in place, so `cfg` is mutated.
    Fixed: `type(v) == dict` replaced with `isinstance`, so dict
    subclasses are converted too.
    """
    for key, value in cfg.items():
        if isinstance(value, dict):
            cfg[key] = SimpleNamespace(**value)
    return SimpleNamespace(**cfg)
def get_parameter_number(model, unit='M'):
    """Summarize total and trainable parameter counts of `model`.

    unit: 'M' (millions), 'K' (thousands) or 'B' (raw count).
    Fixed: trainable count used floor division (`//div`), which truncated
    the value to a whole number before the `.1f` formatting (e.g. 0.11K
    was reported as 0.0K); it now uses true division like the total.
    """
    total_num = sum(p.numel() for p in model.parameters())
    trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    div = {"M": 1e6, "K": 1e3, "B": 1}[unit]
    return f'Total params: {total_num/div:.1f}{unit}; Trainable params: {trainable_num/div:.1f}{unit}'
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x); works on scalars and arrays."""
    neg_exp = np.exp(-x)
    return 1 / (1 + neg_exp)
import os
import zipfile
import json
def zip_files_in_directory(directory_path, zip_file_path, is_exclude_dirs=True):
    """Zip the files under `directory_path` into `zip_file_path`.

    If `is_exclude_dirs` is True, only files directly in the root
    directory are archived and subdirectories are skipped; otherwise the
    whole tree is archived. Entries are stored with paths relative to
    `directory_path`.

    Fixed: the two branches duplicated the identical inner loop; they are
    collapsed into one loop behind a guard (behavior unchanged).
    """
    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(directory_path):
            # Only add files from the root directory, do not descend into subdirectories.
            if is_exclude_dirs and root != directory_path:
                continue
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, directory_path))
# -----
# Credit: https://www.kaggle.com/code/abdullahmeda/eedi-map-k-metric
import numpy as np
def apk(actual, predicted, k=25):
    """
    Compute the average precision at k.

    Parameters
    ----------
    actual : list
        Ground-truth items (order does not matter).
    predicted : list
        Ranked predictions (order matters).
    k : int, optional
        Cutoff; only the first k predictions are scored.

    Returns
    -------
    score : double
        AP@k over the input lists (0.0 when `actual` is empty).
    """
    if not actual:
        return 0.0
    predicted = predicted[:k]
    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(predicted):
        # Score only correct predictions that have not appeared earlier
        # in the ranking (repeats earn no additional credit).
        if item in actual and item not in predicted[:rank]:
            hits += 1.0
            precision_sum += hits / (rank + 1.0)
    return precision_sum / min(len(actual), k)
def mapk(actual, predicted, k=25):
    """
    Compute the mean average precision at k.

    Parameters
    ----------
    actual : list
        List of lists of ground-truth items (order irrelevant within
        each inner list).
    predicted : list
        List of lists of ranked predictions (order matters).
    k : int, optional
        Cutoff passed through to apk.

    Returns
    -------
    score : double
        Mean of the per-pair AP@k scores.
    """
    per_pair_scores = [apk(a, p, k) for a, p in zip(actual, predicted)]
    return np.mean(per_pair_scores)
# ----
from torch import Tensor
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Pool each sequence's final attended token from the hidden states.

    With left padding every sequence is attended at the last position, so
    that column is returned directly; otherwise each row's last
    non-padded position is gathered via the attention mask.
    """
    batch_size = last_hidden_states.shape[0]
    # All rows attend at the final position iff padding is on the left.
    is_left_padded = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if is_left_padded:
        return last_hidden_states[:, -1]
    last_positions = attention_mask.sum(dim=1) - 1
    row_idx = torch.arange(batch_size, device=last_hidden_states.device)
    return last_hidden_states[row_idx, last_positions]