| import timm | |
| from timm.models._factory import load_checkpoint | |
| import torch | |
| import os | |
| from typing import List, Union, Optional, Tuple | |
| from torch import nn | |
| from torch.jit import Final | |
| from einops import rearrange, repeat | |
| from einops.layers.torch import Rearrange | |
| from utils.dl.common.model import get_model_device, set_module, get_module, get_model_latency, get_model_size, LayerActivation3 | |
| import torch.nn.functional as F | |
| from utils.common.log import logger | |
| from transformers import AutoTokenizer | |
| from maskrcnn_benchmark.modeling.detector.generalized_vl_rcnn import GeneralizedVLRCNN | |
| from maskrcnn_benchmark.config import cfg | |
| from maskrcnn_benchmark.structures.bounding_box import BoxList | |
| from torchvision import transforms as T | |
| import matplotlib.pyplot as plt | |
| import nltk | |
| import re | |
| from copy import deepcopy | |
| from abc import ABC, abstractmethod | |
| from methods.elasticdnn.pipeline.offline.fm_to_md.base import FM_to_MD_Util | |
| from methods.elasticdnn.pipeline.offline.fm_lora.base import FMLoRA_Util, LoRA | |
| from new_impl.cv.elasticdnn.api.model import ElasticDNN_OfflineFMModel, ElasticDNN_OfflineMDModel | |
| from methods.elasticdnn.model.base import Abs, KTakesAll, ElasticDNNUtil, Layer_WrappedWithFBS | |
| from transformers.models.bert.modeling_bert import BertSelfAttention | |
| from transformers import BertConfig | |
| import math | |
| import time | |
| from timm.models.layers import DropPath, to_2tuple, trunc_normal_ | |
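| # Collate function for the multi-modal detection dataloader: groups images/targets (and optional info_imgs/ids) into a dict and skips samples with empty targets. | |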
| def collect_mm_fn(batch): | |
| if len(batch[0]) == 2: | |
| collated = {'images': [], 'targets': []} | |
| else: | |
| collated = {'images': [], 'targets': [], 'info_imgs': [], 'ids': []} | |
| for item in batch: | |
| if len(item) == 2: | |
| img, new_target = item | |
| if len(new_target) == 0: | |
| continue | |
| collated['images'].append(img) | |
| collated['targets'].append(new_target) | |
| else: | |
| img, new_target, info_imgs, ids = item | |
| if len(new_target) == 0: | |
| continue | |
| collated['images'].append(img) | |
| collated['targets'].append(new_target) | |
| collated['info_imgs'].append(info_imgs) | |
| collated['ids'].append(ids) | |
| return collated, torch.Tensor([0]) | |
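| # Find noun phrases in a caption and return their character spans; every occurrence of a phrase is treated as a separate entity. | |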
| def run_ner(caption): | |
| noun_phrases = find_noun_phrases(caption) | |
| noun_phrases = [remove_punctuation(phrase) for phrase in noun_phrases] | |
| noun_phrases = [phrase for phrase in noun_phrases if phrase != ''] | |
| relevant_phrases = noun_phrases | |
| labels = noun_phrases | |
| tokens_positive = [] | |
| for entity, label in zip(relevant_phrases, labels): | |
| try: | |
| # search all occurrences and mark them as different entities | |
| for m in re.finditer(entity, caption.lower()): | |
| tokens_positive.append([[m.start(), m.end()]]) | |
| except Exception: | |
| print("noun entities:", noun_phrases) | |
| print("entity:", entity) | |
| print("caption:", caption.lower()) | |
| return tokens_positive | |
| def build_transform(cfg, min_image_size): | |
| """ | |
| Creates a basic transformation that was used to train the models | |
| """ | |
| # we are loading images with OpenCV, so we don't need to convert them | |
| # to BGR, they are already! So all we need to do is to normalize | |
| # by 255 if we want to convert to BGR255 format, or flip the channels | |
| # if we want it to be in RGB in [0-1] range. | |
| if cfg.INPUT.TO_BGR255: | |
| to_bgr_transform = T.Lambda(lambda x: x * 255) | |
| else: | |
| to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]]) | |
| normalize_transform = T.Normalize( | |
| mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD | |
| ) | |
| transform = T.Compose( | |
| [ | |
| T.ToPILImage(), | |
| T.Resize(min_image_size) if min_image_size is not None else lambda x: x, | |
| T.ToTensor(), | |
| to_bgr_transform, | |
| normalize_transform, | |
| ] | |
| ) | |
| return transform | |
| def remove_punctuation(text: str) -> str: | |
| punct = ['|', ':', ';', '@', '(', ')', '[', ']', '{', '}', '^', | |
| '\'', '\"', '’', '`', '?', '$', '%', '#', '!', '&', '*', '+', ',', '.' | |
| ] | |
| for p in punct: | |
| text = text.replace(p, '') | |
| return text.strip() | |
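| # Convert a (box x token) positive map into a dict mapping each label index (offset by `plus`) to the token indices it grounds to. | |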
| def create_positive_map_label_to_token_from_positive_map(positive_map, plus=0): | |
| positive_map_label_to_token = {} | |
| for i in range(len(positive_map)): | |
| positive_map_label_to_token[i + plus] = torch.nonzero(positive_map[i], as_tuple=True)[0].tolist() | |
| return positive_map_label_to_token | |
| def create_positive_map(tokenized, tokens_positive): | |
| """construct a map such that positive_map[i,j] = True iff box i is associated to token j""" | |
| positive_map = torch.zeros((len(tokens_positive), 256), dtype=torch.float) | |
| for j, tok_list in enumerate(tokens_positive): | |
| for (beg, end) in tok_list: | |
| try: | |
| beg_pos = tokenized.char_to_token(beg) | |
| end_pos = tokenized.char_to_token(end - 1) | |
| except Exception as e: | |
| print("beg:", beg, "end:", end) | |
| print("token_positive:", tokens_positive) | |
| # print("beg_pos:", beg_pos, "end_pos:", end_pos) | |
| raise e | |
| if beg_pos is None: | |
| try: | |
| beg_pos = tokenized.char_to_token(beg + 1) | |
| if beg_pos is None: | |
| beg_pos = tokenized.char_to_token(beg + 2) | |
| except Exception: | |
| beg_pos = None | |
| if end_pos is None: | |
| try: | |
| end_pos = tokenized.char_to_token(end - 2) | |
| if end_pos is None: | |
| end_pos = tokenized.char_to_token(end - 3) | |
| except Exception: | |
| end_pos = None | |
| if beg_pos is None or end_pos is None: | |
| continue | |
| assert beg_pos is not None and end_pos is not None | |
| positive_map[j, beg_pos: end_pos + 1].fill_(1) | |
| return positive_map / (positive_map.sum(-1)[:, None] + 1e-6) | |
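| # Noun-phrase chunking with NLTK (requires the 'punkt' tokenizer and 'averaged_perceptron_tagger' data packages). | |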
| def find_noun_phrases(caption: str) -> List[str]: | |
| caption = caption.lower() | |
| tokens = nltk.word_tokenize(caption) | |
| pos_tags = nltk.pos_tag(tokens) | |
| grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}" | |
| cp = nltk.RegexpParser(grammar) | |
| result = cp.parse(pos_tags) | |
| noun_phrases = list() | |
| for subtree in result.subtrees(): | |
| if subtree.label() == 'NP': | |
| noun_phrases.append(' '.join(t[0] for t in subtree.leaves())) | |
| return noun_phrases | |
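| # Wrapper around GLIP's GeneralizedVLRCNN: loads a checkpoint, tokenizes captions, builds positive maps, and post-processes predictions. | |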
| class Glip(nn.Module): | |
| def __init__(self, config, pretrain_path, min_image_size=None, confidence_threshold=0.7): | |
| super(Glip, self).__init__() | |
| state_dict = torch.load(pretrain_path)['model'] | |
| self.min_image_size = min_image_size | |
| self.cfg = config | |
| self.confidence_threshold = confidence_threshold | |
| self.tokenizer = AutoTokenizer.from_pretrained(config.MODEL.LANGUAGE_BACKBONE.MODEL_PATH) | |
| self.device = torch.device(config.MODEL.DEVICE) | |
| for k in list(state_dict.keys()): | |
| if k.startswith('module'): | |
| new_k = k.replace('module.', '') | |
| state_dict[new_k] = state_dict.pop(k) | |
| self.model = GeneralizedVLRCNN(config) | |
| self.model.load_state_dict(state_dict, strict=False) | |
| # self.transform = build_transform(config, min_image_size) | |
| def forward(self, images, targets, for_training=None): | |
| # img_list = [] | |
| # for image in images: | |
| # img_list.append(self.transform(image).to(self.device)) | |
| # if isinstance(texts, list): | |
| # # we directly provided a list of category names | |
| # caption_string = "" | |
| # tokens_positive = [] | |
| # seperation_tokens = " . " | |
| # for word in texts: | |
| # tokens_positive.append([len(caption_string), len(caption_string) + len(word)]) | |
| # caption_string += word | |
| # caption_string += seperation_tokens | |
| # tokenized = self.tokenizer([caption_string], return_tensors="pt") | |
| # tokens_positive = [tokens_positive] | |
| # texts = [caption_string] | |
| # print(tokens_positive) | |
| # else: | |
| device = self.device | |
| images = [image.to(device) for image in images] | |
| targets = [target.to(device) for target in targets] | |
| texts = [t.get_field("caption") for t in targets if "caption" in t.fields()] | |
| positive_map = [] | |
| # if custom_entity is None: | |
| # tokens_positive = self.run_ner(texts) | |
| # print(tokens_positive) | |
| # process positive map | |
| if not self.training: | |
| # at inference time, derive grounding spans (character offsets of noun phrases) from the caption | |
| tokens_positive = run_ner(texts[0]) | |
| tokenized = self.tokenizer(texts, return_tensors="pt") | |
| positive_map = create_positive_map(tokenized, tokens_positive) | |
| if self.cfg.MODEL.RPN_ARCHITECTURE == "VLDYHEAD": | |
| plus = 1 | |
| else: | |
| plus = 0 | |
| positive_map = create_positive_map_label_to_token_from_positive_map(positive_map, plus=plus) | |
| else: | |
| for i, text in enumerate(texts): | |
| tokenized = self.tokenizer(text, return_tensors="pt") | |
| tokens_positive = targets[i].get_field('tokens_positive') | |
| positive_map.append(create_positive_map(tokenized, tokens_positive)) | |
| positive_map = torch.cat(positive_map, dim=0).to(device) | |
| if self.training: | |
| proposal_losses = self.model(images, targets, texts, positive_map=positive_map) | |
| return proposal_losses | |
| else: | |
| proposals, token_logits, dot_product_logits = self.model(images, targets, texts, positive_map=positive_map) | |
| proposal = self._post_process(proposals[0]) | |
| return proposal, token_logits, dot_product_logits | |
| def _post_process_fixed_thresh(self, predictions): | |
| scores = predictions.get_field("scores") | |
| labels = predictions.get_field("labels").tolist() | |
| thresh = scores.clone() | |
| for i, lb in enumerate(labels): | |
| if isinstance(self.confidence_threshold, float): | |
| thresh[i] = self.confidence_threshold | |
| elif len(self.confidence_threshold) == 1: | |
| thresh[i] = self.confidence_threshold[0] | |
| else: | |
| thresh[i] = self.confidence_threshold[lb - 1] | |
| keep = torch.nonzero(scores > thresh).squeeze(1) | |
| predictions = predictions[keep] | |
| scores = predictions.get_field("scores") | |
| _, idx = scores.sort(0, descending=True) | |
| return predictions[idx] | |
| def _post_process(self, predictions, threshold=0.5): | |
| scores = predictions.get_field("scores") | |
| labels = predictions.get_field("labels").tolist() | |
| thresh = scores.clone() | |
| for i, lb in enumerate(labels): | |
| if isinstance(self.confidence_threshold, float): | |
| thresh[i] = threshold | |
| elif len(self.confidence_threshold) == 1: | |
| thresh[i] = threshold | |
| else: | |
| thresh[i] = self.confidence_threshold[lb - 1] | |
| keep = torch.nonzero(scores > thresh).squeeze(1) | |
| predictions = predictions[keep] | |
| scores = predictions.get_field("scores") | |
| _, idx = scores.sort(0, descending=True) | |
| return predictions[idx] | |
| def glip_model(config_path, pretrain_path): | |
| # merge the YAML config into the global GLIP cfg, then build the wrapped detector | |
| cfg.merge_from_file(config_path) | |
| return cfg, Glip(cfg, pretrain_path) | |
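| # Wraps a linear projection with a LoRA branch: forward returns fc(x) + B(A(x)); A is Kaiming-initialized and B is zero-initialized, so the wrapped layer initially behaves exactly like the original. | |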
| class ToQKV_WrappedWithLoRA(nn.Module): | |
| def __init__(self, fc: nn.Linear, ab_r: int): | |
| super(ToQKV_WrappedWithLoRA, self).__init__() | |
| self.fc = fc | |
| self.ab = self.create_ab_as_linear(fc.weight.data, ab_r) | |
| def create_ab_as_linear(self, fc_weight: torch.Tensor, ab_r: int): | |
| res = nn.Sequential( | |
| LoRA(fc_weight.size(1), fc_weight.size(0) // ab_r, bias=False), | |
| LoRA(fc_weight.size(0) // ab_r, fc_weight.size(0), bias=False) | |
| ).to(fc_weight.device) | |
| nn.init.kaiming_uniform_(res[0].weight, a=5 ** 0.5) | |
| nn.init.zeros_(res[1].weight) | |
| return res | |
| def forward(self, x): | |
| x1 = self.fc(x) | |
| x2 = self.ab(x) | |
| return x1 + x2 | |
| def get_model_latency_2(model: torch.nn.Module, sample: dict, sample_num: int, | |
| device: str, warmup_sample_num: int, return_detail=False): | |
| """Get the latency (inference time) of a PyTorch model. | |
| Reference: https://deci.ai/resources/blog/measure-inference-time-deep-neural-networks/ | |
| Args: | |
| model (torch.nn.Module): A PyTorch model. | |
| sample (dict): Keyword arguments passed to the model as `model(**sample)`. | |
| sample_num (int): Number of timed inference runs; their average is returned as the latency. | |
| device (str): Typically 'cpu' or 'cuda'. | |
| warmup_sample_num (int): Number of untimed dummy runs used to warm up the device before measuring. | |
| return_detail (bool, optional): Besides the average latency, also return every individual measurement. Defaults to False. | |
| Returns: | |
| Union[float, Tuple[float, List[float]]]: The average latency (and, optionally, all latency measurements) of :attr:`model`. | |
| """ | |
| # if isinstance(model_input_size, tuple): | |
| # dummy_input = torch.rand(model_input_size).to(device) | |
| # else: | |
| # dummy_input = model_input_size | |
| model = model.to(device) | |
| model.eval() | |
| # warm up | |
| with torch.no_grad(): | |
| for _ in range(warmup_sample_num): | |
| model(**sample) | |
| infer_time_list = [] | |
| if device == 'cuda' or 'cuda' in str(device): | |
| with torch.no_grad(): | |
| for _ in range(sample_num): | |
| s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) | |
| s.record() | |
| model(**sample) | |
| e.record() | |
| torch.cuda.synchronize() | |
| cur_model_infer_time = s.elapsed_time(e) / 1000. | |
| infer_time_list += [cur_model_infer_time] | |
| else: | |
| with torch.no_grad(): | |
| for _ in range(sample_num): | |
| start = time.time() | |
| model(**sample) | |
| cur_model_infer_time = time.time() - start | |
| infer_time_list += [cur_model_infer_time] | |
| avg_infer_time = sum(infer_time_list) / sample_num | |
| if return_detail: | |
| return avg_infer_time, infer_time_list | |
| return avg_infer_time | |
| class WindowAttention(nn.Module): | |
| """ Window based multi-head self attention (W-MSA) module with relative position bias. | |
| It supports both of shifted and non-shifted window. | |
| Args: | |
| dim (int): Number of input channels. | |
| window_size (tuple[int]): The height and width of the window. | |
| num_heads (int): Number of attention heads. | |
| qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |
| qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set | |
| attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 | |
| proj_drop (float, optional): Dropout ratio of output. Default: 0.0 | |
| """ | |
| def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): | |
| super().__init__() | |
| self.dim = dim | |
| self.window_size = window_size # Wh, Ww | |
| self.num_heads = num_heads | |
| head_dim = dim // num_heads | |
| self.scale = qk_scale or head_dim ** -0.5 | |
| # define a parameter table of relative position bias | |
| self.relative_position_bias_table = nn.Parameter( | |
| torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH | |
| # get pair-wise relative position index for each token inside the window | |
| coords_h = torch.arange(self.window_size[0]) | |
| coords_w = torch.arange(self.window_size[1]) | |
| coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww | |
| coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww | |
| relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww | |
| relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 | |
| relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 | |
| relative_coords[:, :, 1] += self.window_size[1] - 1 | |
| relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 | |
| relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww | |
| self.register_buffer("relative_position_index", relative_position_index) | |
| self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) | |
| self.attn_drop = nn.Dropout(attn_drop) | |
| self.proj = nn.Linear(dim, dim) | |
| self.proj_drop = nn.Dropout(proj_drop) | |
| trunc_normal_(self.relative_position_bias_table, std=.02) | |
| self.softmax = nn.Softmax(dim=-1) | |
| def forward(self, x, mask=None): | |
| """ Forward function. | |
| Args: | |
| x: input features with shape of (num_windows*B, N, C) | |
| mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None | |
| """ | |
| B_, N, C = x.shape | |
| qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) | |
| q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) | |
| q = q * self.scale | |
| attn = (q @ k.transpose(-2, -1)) | |
| relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( | |
| self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH | |
| relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww | |
| attn = attn + relative_position_bias.unsqueeze(0) | |
| if mask is not None: | |
| nW = mask.shape[0] | |
| attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) | |
| attn = attn.view(-1, self.num_heads, N, N) | |
| attn = self.softmax(attn) | |
| else: | |
| attn = self.softmax(attn) | |
| attn = self.attn_drop(attn) | |
| x = (attn @ v).transpose(1, 2).reshape(B_, N, -1) | |
| x = self.proj(x) | |
| x = self.proj_drop(x) | |
| return x | |
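| # Bidirectional vision-language cross-attention (GLIP-style fusion): visual features attend to language features and vice versa, with optional clamping for fp16 numerical stability. | |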
| class BiMultiHeadAttention(nn.Module): | |
| def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None): | |
| super(BiMultiHeadAttention, self).__init__() | |
| self.embed_dim = embed_dim | |
| self.num_heads = num_heads | |
| self.head_dim = embed_dim // num_heads | |
| self.v_dim = v_dim | |
| self.l_dim = l_dim | |
| assert ( | |
| self.head_dim * self.num_heads == self.embed_dim | |
| ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." | |
| self.scale = self.head_dim ** (-0.5) | |
| self.dropout = dropout | |
| self.v_proj = nn.Linear(self.v_dim, self.embed_dim) | |
| self.l_proj = nn.Linear(self.l_dim, self.embed_dim) | |
| self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim) | |
| self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim) | |
| self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim) | |
| self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim) | |
| self.stable_softmax_2d = cfg.MODEL.DYHEAD.FUSE_CONFIG.STABLE_SOFTMAX_2D | |
| self.clamp_min_for_underflow = cfg.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MIN_FOR_UNDERFLOW | |
| self.clamp_max_for_overflow = cfg.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MAX_FOR_OVERFLOW | |
| self._reset_parameters() | |
| def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): | |
| return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() | |
| def _reset_parameters(self): | |
| nn.init.xavier_uniform_(self.v_proj.weight) | |
| self.v_proj.bias.data.fill_(0) | |
| nn.init.xavier_uniform_(self.l_proj.weight) | |
| self.l_proj.bias.data.fill_(0) | |
| nn.init.xavier_uniform_(self.values_v_proj.weight) | |
| self.values_v_proj.bias.data.fill_(0) | |
| nn.init.xavier_uniform_(self.values_l_proj.weight) | |
| self.values_l_proj.bias.data.fill_(0) | |
| nn.init.xavier_uniform_(self.out_v_proj.weight) | |
| self.out_v_proj.bias.data.fill_(0) | |
| nn.init.xavier_uniform_(self.out_l_proj.weight) | |
| self.out_l_proj.bias.data.fill_(0) | |
| def forward(self, v, l, attention_mask_l=None): | |
| bsz, tgt_len, embed_dim = v.size() | |
| query_states = self.v_proj(v) * self.scale | |
| key_states = self._shape(self.l_proj(l), -1, bsz) | |
| value_v_states = self._shape(self.values_v_proj(v), -1, bsz) | |
| value_l_states = self._shape(self.values_l_proj(l), -1, bsz) | |
| proj_shape = (bsz * self.num_heads, -1, self.head_dim) | |
| query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) | |
| key_states = key_states.view(*proj_shape) | |
| value_v_states = value_v_states.view(*proj_shape) | |
| value_l_states = value_l_states.view(*proj_shape) | |
| src_len = key_states.size(1) | |
| attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) | |
| if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): | |
| raise ValueError( | |
| f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" | |
| ) | |
| # attn_weights_l = nn.functional.softmax(attn_weights.transpose(1, 2), dim=-1) | |
| if self.stable_softmax_2d: | |
| attn_weights = attn_weights - attn_weights.max() | |
| if self.clamp_min_for_underflow: | |
| attn_weights = torch.clamp(attn_weights, min=-50000) # Do not increase -50000, data type half has quite limited range | |
| if self.clamp_max_for_overflow: | |
| attn_weights = torch.clamp(attn_weights, max=50000) # Do not increase 50000, data type half has quite limited range | |
| attn_weights_T = attn_weights.transpose(1, 2) | |
| attn_weights_l = (attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[ | |
| 0]) | |
| if self.clamp_min_for_underflow: | |
| attn_weights_l = torch.clamp(attn_weights_l, min=-50000) # Do not increase -50000, data type half has quite limited range | |
| if self.clamp_max_for_overflow: | |
| attn_weights_l = torch.clamp(attn_weights_l, max=50000) # Do not increase 50000, data type half has quite limited range | |
| attn_weights_l = attn_weights_l.softmax(dim=-1) | |
| if attention_mask_l is not None: | |
| assert (attention_mask_l.dim() == 2) | |
| attention_mask = attention_mask_l.unsqueeze(1).unsqueeze(1) | |
| attention_mask = attention_mask.expand(bsz, 1, tgt_len, src_len) | |
| attention_mask = attention_mask.masked_fill(attention_mask == 0, -9e15) | |
| if attention_mask.size() != (bsz, 1, tgt_len, src_len): | |
| raise ValueError( | |
| f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}" | |
| ) | |
| attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask | |
| attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) | |
| attn_weights_v = nn.functional.softmax(attn_weights, dim=-1) | |
| attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training) | |
| attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training) | |
| attn_output_v = torch.bmm(attn_probs_v, value_l_states) | |
| attn_output_l = torch.bmm(attn_probs_l, value_v_states) | |
| if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim): | |
| raise ValueError( | |
| f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}" | |
| ) | |
| if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim): | |
| raise ValueError( | |
| f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}" | |
| ) | |
| attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim) | |
| attn_output_v = attn_output_v.transpose(1, 2) | |
| attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim) | |
| attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim) | |
| attn_output_l = attn_output_l.transpose(1, 2) | |
| attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim) | |
| attn_output_v = self.out_v_proj(attn_output_v) | |
| attn_output_l = self.out_l_proj(attn_output_l) | |
| return attn_output_v, attn_output_l | |
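| # BertSelfAttention variant whose output reshape uses self.query.out_features instead of a fixed hidden size, so the q/k/v projections can be width-pruned. | |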
| class BertSelfAttentionPrunable(BertSelfAttention): | |
| def __init__(self): | |
| config = BertConfig.from_pretrained('new_impl/cv/glip/object_detection/bert-base-uncased') | |
| super(BertSelfAttentionPrunable, self).__init__(config) | |
| def transpose_for_scores(self, x): | |
| new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1) | |
| x = x.view(new_x_shape) | |
| return x.permute(0, 2, 1, 3) | |
| def forward( | |
| self, | |
| hidden_states: torch.Tensor, | |
| attention_mask: Optional[torch.FloatTensor] = None, | |
| head_mask: Optional[torch.FloatTensor] = None, | |
| encoder_hidden_states: Optional[torch.FloatTensor] = None, | |
| encoder_attention_mask: Optional[torch.FloatTensor] = None, | |
| past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, | |
| output_attentions: Optional[bool] = False, | |
| ) -> Tuple[torch.Tensor]: | |
| mixed_query_layer = self.query(hidden_states) | |
| # If this is instantiated as a cross-attention module, the keys | |
| # and values come from an encoder; the attention mask needs to be | |
| # such that the encoder's padding tokens are not attended to. | |
| is_cross_attention = encoder_hidden_states is not None | |
| if is_cross_attention and past_key_value is not None: | |
| # reuse k,v, cross_attentions | |
| key_layer = past_key_value[0] | |
| value_layer = past_key_value[1] | |
| attention_mask = encoder_attention_mask | |
| elif is_cross_attention: | |
| key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) | |
| value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) | |
| attention_mask = encoder_attention_mask | |
| elif past_key_value is not None: | |
| key_layer = self.transpose_for_scores(self.key(hidden_states)) | |
| value_layer = self.transpose_for_scores(self.value(hidden_states)) | |
| key_layer = torch.cat([past_key_value[0], key_layer], dim=2) | |
| value_layer = torch.cat([past_key_value[1], value_layer], dim=2) | |
| else: | |
| key_layer = self.transpose_for_scores(self.key(hidden_states)) | |
| value_layer = self.transpose_for_scores(self.value(hidden_states)) | |
| query_layer = self.transpose_for_scores(mixed_query_layer) | |
| use_cache = past_key_value is not None | |
| if self.is_decoder: | |
| # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. | |
| # Further calls to cross_attention layer can then reuse all cross-attention | |
| # key/value_states (first "if" case) | |
| # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of | |
| # all previous decoder key/value_states. Further calls to uni-directional self-attention | |
| # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) | |
| # if encoder bi-directional self-attention `past_key_value` is always `None` | |
| past_key_value = (key_layer, value_layer) | |
| # Take the dot product between "query" and "key" to get the raw attention scores. | |
| attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) | |
| if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": | |
| query_length, key_length = query_layer.shape[2], key_layer.shape[2] | |
| if use_cache: | |
| position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( | |
| -1, 1 | |
| ) | |
| else: | |
| position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) | |
| position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) | |
| distance = position_ids_l - position_ids_r | |
| positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) | |
| positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility | |
| if self.position_embedding_type == "relative_key": | |
| relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) | |
| attention_scores = attention_scores + relative_position_scores | |
| elif self.position_embedding_type == "relative_key_query": | |
| relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) | |
| relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) | |
| attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key | |
| attention_scores = attention_scores / math.sqrt(self.attention_head_size) | |
| if attention_mask is not None: | |
| # Apply the attention mask is (precomputed for all layers in BertModel forward() function) | |
| attention_scores = attention_scores + attention_mask | |
| # Normalize the attention scores to probabilities. | |
| attention_probs = nn.functional.softmax(attention_scores, dim=-1) | |
| # This is actually dropping out entire tokens to attend to, which might | |
| # seem a bit unusual, but is taken from the original Transformer paper. | |
| attention_probs = self.dropout(attention_probs) | |
| # Mask heads if we want to | |
| if head_mask is not None: | |
| attention_probs = attention_probs * head_mask | |
| context_layer = torch.matmul(attention_probs, value_layer) | |
| context_layer = context_layer.permute(0, 2, 1, 3).contiguous() | |
| new_context_layer_shape = context_layer.size()[:-2] + (self.query.out_features,) # NOTE: modified | |
| context_layer = context_layer.view(new_context_layer_shape) | |
| outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) | |
| if self.is_decoder: | |
| outputs = outputs + (past_key_value,) | |
| return outputs | |
| @staticmethod | |
| def init_from_exist_self_attn(attn: BertSelfAttention): | |
| # print(attn) | |
| res = BertSelfAttentionPrunable() | |
| for attr in dir(attn): | |
| # if str(attr) in ['transpose_for_scores'] or str(attr).startswith('_'): | |
| # continue | |
| # if isinstance(getattr(attn, attr), nn.Module): | |
| # print(attr) | |
| if isinstance(getattr(attn, attr), nn.Module): | |
| try: | |
| # print(attr, 'ok') | |
| setattr(res, attr, getattr(attn, attr)) | |
| except Exception as e: | |
| print(attr, str(e)) | |
| return res | |
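| # Builds a narrower "master DNN" from the GLIP foundation model by keeping the highest-L1-norm rows/columns of the attention and MLP projections in the Swin backbone, the BERT language backbone, and the VL fusion (dyhead) blocks. | |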
| class FM_to_MD_GLIP_Util(FM_to_MD_Util): | |
| def init_md_from_fm_by_reducing_width_with_perf_test_2(self, fm: nn.Module, reducing_width_ratio: int, | |
| samples: torch.Tensor) -> nn.Module: | |
| fm_size = get_model_size(fm, True) | |
| fm_latency = get_model_latency_2(fm, samples, 20, | |
| get_model_device(fm), 20, False) | |
| master_dnn = self.init_md_from_fm_by_reducing_width(fm, reducing_width_ratio) | |
| master_dnn_size = get_model_size(master_dnn, True) | |
| logger.debug(f'inited master DNN: {master_dnn}') | |
| # from utils.dl.common.model import get_module | |
| # print('after generating') | |
| # get_module(fm, 'head').debug() | |
| # get_module(master_dnn, 'head').debug() | |
| # print('test master latency') | |
| master_dnn_latency = get_model_latency_2(master_dnn, samples, 20, | |
| get_model_device(master_dnn), 20, False) | |
| logger.info(f'init master DNN (w/o FBS yet) by reducing foundation model\'s width (by {reducing_width_ratio:d}x)') | |
| logger.info(f'foundation model ({fm_size:.3f}MB, {fm_latency:.4f}s/sample) -> ' | |
| f'master DNN ({master_dnn_size:.3f}MB, {master_dnn_latency:.4f}s/sample)\n' | |
| f'(model size: ↓ {(fm_size / master_dnn_size):.2f}x, ' | |
| f'latency: ↓ {(fm_latency / master_dnn_latency):.2f}x)') | |
| return master_dnn | |
| def init_md_from_fm_by_reducing_width(self, fm: nn.Module, reducing_width_ratio: int, sparsity=0.0) -> nn.Module: | |
| # sparsity: mainly used to build the distilled model for the baseline algorithm, ensuring it has the same size as the model used by the online algorithm. | |
| fm_vit = deepcopy(fm) | |
| def _f(n): | |
| return int(n // reducing_width_ratio) | |
| # def _rand_indexes(n): | |
| # return torch.randperm(n)[0: int(n // reducing_width_ratio)] | |
| def l1_max_indexes(p: torch.Tensor, dim=0): | |
| assert dim in [0, 1] | |
| assert p.dim() in [1, 2, 4] | |
| if dim == 1: | |
| p = p.T | |
| p_norm = p.abs().contiguous().view(p.size(0), -1).sum(dim=1) | |
| n = p.size(0) | |
| return p_norm.argsort(descending=True)[0: int(n // reducing_width_ratio)].sort()[0] | |
| def l1_max_indexes_with_sparsity(p: torch.Tensor, dim=0): | |
| assert dim in [0, 1] | |
| assert p.dim() in [1, 2, 4] | |
| if dim == 1: | |
| p = p.T | |
| p_norm = p.abs().contiguous().view(p.size(0), -1).sum(dim=1) | |
| n = p.size(0) | |
| return p_norm.argsort(descending=True)[0: int(n // reducing_width_ratio * (1 - sparsity))].sort()[0] | |
| for layer_i, layer in enumerate(fm_vit.model.backbone.body.layers): | |
| for block in layer.blocks: | |
| ori_attn = block.attn | |
| new_attn = WindowAttention(ori_attn.dim, ori_attn.window_size, ori_attn.num_heads, True, ori_attn.scale, 0., 0.) | |
| new_attn.relative_position_index = ori_attn.relative_position_index | |
| new_attn.relative_position_bias_table = ori_attn.relative_position_bias_table | |
| new_attn.qkv = ori_attn.qkv | |
| new_attn.attn_drop = ori_attn.attn_drop | |
| new_attn.proj = ori_attn.proj | |
| new_attn.proj_drop = ori_attn.proj_drop | |
| set_module(block, 'attn', new_attn) | |
| # first_attn = True | |
| for layer_i, layer in enumerate(fm_vit.model.backbone.body.layers): | |
| for block_i, block in enumerate(layer.blocks): | |
| qkv = block.attn.qkv | |
| new_qkv = nn.Linear(qkv.in_features, _f(qkv.out_features), | |
| qkv.bias is not None, qkv.weight.device) | |
| indexes = l1_max_indexes(qkv.weight.data, 0) | |
| new_qkv.weight.data.copy_(qkv.weight.data[indexes]) | |
| if qkv.bias is not None: | |
| new_qkv.bias.data.copy_(qkv.bias.data[indexes]) | |
| # fm_vit.model.backbone.body.layers[0].blocks.0.attn.qkv | |
| set_module(fm_vit, f'model.backbone.body.layers.{layer_i}.blocks.{block_i}.attn.qkv', new_qkv) | |
| proj = block.attn.proj | |
| new_proj = nn.Linear(_f(proj.in_features), proj.out_features, | |
| proj.bias is not None, proj.weight.device) | |
| new_proj.weight.data.copy_(proj.weight.data[:, l1_max_indexes(proj.weight.data, 1)]) | |
| if proj.bias is not None: | |
| new_proj.bias.data.copy_(proj.bias.data) | |
| set_module(fm_vit, f'model.backbone.body.layers.{layer_i}.blocks.{block_i}.attn.proj', new_proj) | |
| fc1 = block.mlp.fc1 | |
| new_fc1 = nn.Linear(fc1.in_features, int(_f(fc1.out_features) * (1 - sparsity)), | |
| fc1.bias is not None, fc1.weight.device) | |
| indexes = l1_max_indexes_with_sparsity(fc1.weight.data, 0) | |
| new_fc1.weight.data.copy_(fc1.weight.data[indexes]) | |
| if fc1.bias is not None: | |
| new_fc1.bias.data.copy_(fc1.bias.data[indexes]) | |
| set_module(fm_vit, f'model.backbone.body.layers.{layer_i}.blocks.{block_i}.mlp.fc1', new_fc1) | |
| fc2 = block.mlp.fc2 | |
| new_fc2 = nn.Linear(int(_f(fc2.in_features) * (1 - sparsity)), fc2.out_features, | |
| fc2.bias is not None, fc2.weight.device) | |
| new_fc2.weight.data.copy_(fc2.weight.data[:, l1_max_indexes_with_sparsity(fc2.weight.data, 1)]) | |
| if fc2.bias is not None: | |
| new_fc2.bias.data.copy_(fc2.bias.data) | |
| set_module(fm_vit, f'model.backbone.body.layers.{layer_i}.blocks.{block_i}.mlp.fc2', new_fc2) | |
| for block in fm_vit.model.language_backbone.body.model.encoder.layer: | |
| set_module(block, 'attention.self', BertSelfAttentionPrunable.init_from_exist_self_attn(block.attention.self)) | |
| for block_i, block in enumerate(fm_vit.model.language_backbone.body.model.encoder.layer): | |
| for k in ['query', 'key', 'value']: | |
| qkv = get_module(block, f'attention.self.{k}') | |
| new_qkv = nn.Linear(qkv.in_features, _f(qkv.out_features), | |
| qkv.bias is not None, qkv.weight.device) | |
| indexes = l1_max_indexes(qkv.weight.data, 0) | |
| new_qkv.weight.data.copy_(qkv.weight.data[indexes]) | |
| if qkv.bias is not None: | |
| new_qkv.bias.data.copy_(qkv.bias.data[indexes]) | |
| set_module(block, f'attention.self.{k}', new_qkv) | |
| proj = get_module(block, f'attention.output.dense') | |
| new_proj = nn.Linear(_f(proj.in_features), proj.out_features, | |
| proj.bias is not None, proj.weight.device) | |
| new_proj.weight.data.copy_(proj.weight.data[:, l1_max_indexes(proj.weight.data, 1)]) | |
| if proj.bias is not None: | |
| new_proj.bias.data.copy_(proj.bias.data) | |
| set_module(block, f'attention.output.dense', new_proj) | |
| fc1 = get_module(block, f'intermediate.dense') | |
| new_fc1 = nn.Linear(fc1.in_features, int(_f(fc1.out_features) * (1 - sparsity)), | |
| fc1.bias is not None, fc1.weight.device) | |
| indexes = l1_max_indexes_with_sparsity(fc1.weight.data, 0) | |
| new_fc1.weight.data.copy_(fc1.weight.data[indexes]) | |
| if fc1.bias is not None: | |
| new_fc1.bias.data.copy_(fc1.bias.data[indexes]) | |
| set_module(block, f'intermediate.dense', new_fc1) | |
| fc2 = get_module(block, f'output.dense') | |
| new_fc2 = nn.Linear(int(_f(fc2.in_features) * (1 - sparsity)), fc2.out_features, | |
| fc2.bias is not None, fc2.weight.device) | |
| new_fc2.weight.data.copy_(fc2.weight.data[:, l1_max_indexes_with_sparsity(fc2.weight.data, 1)]) | |
| if fc2.bias is not None: | |
| new_fc2.bias.data.copy_(fc2.bias.data) | |
| set_module(block, f'output.dense', new_fc2) | |
| for block_i, block in enumerate(fm_vit.model.rpn.head.dyhead_tower): | |
| if block_i % 3 == 0: | |
| tmp = block.b_attn.attn | |
| tmp.head_dim = int(tmp.head_dim // reducing_width_ratio) | |
| tmp.embed_dim = int(tmp.embed_dim // reducing_width_ratio) | |
| set_module(block, 'b_attn.attn', tmp) | |
| for k in ['v_proj', 'l_proj', 'values_v_proj', 'values_l_proj']: | |
| qkv = get_module(block, f'b_attn.attn.{k}') | |
| new_qkv = nn.Linear(qkv.in_features, _f(qkv.out_features), | |
| qkv.bias is not None, qkv.weight.device) | |
| indexes = l1_max_indexes(qkv.weight.data, 0) | |
| new_qkv.weight.data.copy_(qkv.weight.data[indexes]) | |
| if qkv.bias is not None: | |
| new_qkv.bias.data.copy_(qkv.bias.data[indexes]) | |
| set_module(block, f'b_attn.attn.{k}', new_qkv) | |
| for k in ['out_v_proj', 'out_l_proj']: | |
| qkv = get_module(block, f'b_attn.attn.{k}') | |
| new_qkv = nn.Linear(_f(qkv.in_features), qkv.out_features, | |
| qkv.bias is not None, qkv.weight.device) | |
| new_qkv.weight.data.copy_(qkv.weight.data[:, l1_max_indexes(qkv.weight.data, 1)]) | |
| if qkv.bias is not None: | |
| new_qkv.bias.data.copy_(qkv.bias.data) | |
| set_module(block, f'b_attn.attn.{k}', new_qkv) | |
| elif block_i % 3 == 1: | |
| set_module(block, 'attention.self', BertSelfAttentionPrunable.init_from_exist_self_attn(block.attention.self)) | |
| for k in ['query', 'key', 'value']: | |
| qkv = get_module(block, f'attention.self.{k}') | |
| new_qkv = nn.Linear(qkv.in_features, _f(qkv.out_features), | |
| qkv.bias is not None, qkv.weight.device) | |
| indexes = l1_max_indexes(qkv.weight.data, 0) | |
| new_qkv.weight.data.copy_(qkv.weight.data[indexes]) | |
| if qkv.bias is not None: | |
| new_qkv.bias.data.copy_(qkv.bias.data[indexes]) | |
| set_module(block, f'attention.self.{k}', new_qkv) | |
| proj = get_module(block, f'attention.output.dense') | |
| new_proj = nn.Linear(_f(proj.in_features), proj.out_features, | |
| proj.bias is not None, proj.weight.device) | |
| new_proj.weight.data.copy_(proj.weight.data[:, l1_max_indexes(proj.weight.data, 1)]) | |
| if proj.bias is not None: | |
| new_proj.bias.data.copy_(proj.bias.data) | |
| set_module(block, f'attention.output.dense', new_proj) | |
| fc1 = get_module(block, f'intermediate.dense') | |
| new_fc1 = nn.Linear(fc1.in_features, int(_f(fc1.out_features) * (1 - sparsity)), | |
| fc1.bias is not None, fc1.weight.device) | |
| indexes = l1_max_indexes_with_sparsity(fc1.weight.data, 0) | |
| new_fc1.weight.data.copy_(fc1.weight.data[indexes]) | |
| if fc1.bias is not None: | |
| new_fc1.bias.data.copy_(fc1.bias.data[indexes]) | |
| set_module(block, f'intermediate.dense', new_fc1) | |
| fc2 = get_module(block, f'output.dense') | |
| new_fc2 = nn.Linear(int(_f(fc2.in_features) * (1 - sparsity)), fc2.out_features, | |
| fc2.bias is not None, fc2.weight.device) | |
| new_fc2.weight.data.copy_(fc2.weight.data[:, l1_max_indexes_with_sparsity(fc2.weight.data, 1)]) | |
| if fc2.bias is not None: | |
| new_fc2.bias.data.copy_(fc2.bias.data) | |
| set_module(block, f'output.dense', new_fc2) | |
| # reduce dim_embedding | |
| # if name.endswith('patch_embed.proj'): | |
| # continue | |
| # new_layer = nn.Conv2d(module.in_channels, _f(module.out_channels), module.kernel_size, module.stride, | |
| # module.padding, module.dilation, module.groups, module.bias is not None, module.padding_mode, | |
| # module.weight.device) | |
| # rand_indexes = l1_max_indexes(module.weight.data) | |
| # new_layer.weight.data.copy_(module.weight.data[rand_indexes]) | |
| # if new_layer.bias is not None: | |
| # new_layer.bias.data.copy_(module.bias.data[rand_indexes]) | |
| # fm_vit.cls_token.data = fm_vit.cls_token.data[:, :, rand_indexes] | |
| # fm_vit.pos_embed.data = fm_vit.pos_embed.data[:, :, rand_indexes] | |
| # elif isinstance(module, nn.Linear): | |
| # if 'head' in name: | |
| # continue | |
| # new_layer = nn.Linear(_f(module.in_features), module.out_features, | |
| # module.bias is not None, module.weight.device) | |
| # new_layer.weight.data.copy_(module.weight.data[:, l1_max_indexes(module.weight.data, 1)]) | |
| # if new_layer.bias is not None: | |
| # new_layer.bias.data.copy_(module.bias.data) | |
| # else: | |
| # first_attn = False | |
| # if first_attn: | |
| # first_attn = False | |
| # new_layer = nn.Linear(module.in_features, _f(module.out_features), | |
| # module.bias is not None, module.weight.device) | |
| # rand_indexes = l1_max_indexes(module.weight.data) | |
| # new_layer.weight.data.copy_(module.weight.data[rand_indexes]) | |
| # if new_layer.bias is not None: | |
| # new_layer.bias.data.copy_(module.bias.data[rand_indexes]) | |
| # else: | |
| # new_layer = nn.Linear(_f(module.in_features), _f(module.out_features), | |
| # module.bias is not None, module.weight.device) | |
| # rand_indexes = l1_max_indexes(module.weight.data) | |
| # new_layer.weight.data.copy_(module.weight.data[rand_indexes][:, l1_max_indexes(module.weight.data, 1)]) | |
| # if new_layer.bias is not None: | |
| # new_layer.bias.data.copy_(module.bias.data[rand_indexes]) | |
| # elif isinstance(module, nn.LayerNorm) and ('block' in name or name == 'norm' or name == 'norm.0'): | |
| # new_layer = nn.LayerNorm(_f(module.normalized_shape[0]), eps=module.eps, device=module.weight.device) | |
| # rand_indexes = l1_max_indexes(module.weight.data) | |
| # new_layer.weight.data.copy_(module.weight.data[rand_indexes]) | |
| # new_layer.bias.data.copy_(module.bias.data[rand_indexes]) | |
| # else: | |
| # continue | |
| # original_layer_str = str(module) | |
| # set_module(fm_vit, name, new_layer) | |
| # logger.debug(f'set_module, {name}, {new_layer}') | |
| # logger.debug(f'slim {name} from {original_layer_str} to {new_layer}') | |
| return fm_vit | |
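| # Injects LoRA branches into the query/key/value (and fusion) projections of the foundation model and later absorbs them back into the plain linear weights, asserting that the model outputs stay numerically unchanged. | |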
| class FMLoRA_GLIP_Util(FMLoRA_Util): | |
| def train_only_lora_and_conv(self, fm: nn.Module): | |
| res = [] | |
| for n, m in fm.named_modules(): | |
| if isinstance(m, LoRA) or isinstance(m, nn.Conv2d): | |
| for p in m.parameters(): | |
| p.requires_grad = True | |
| res += [p] | |
| else: | |
| for p in m.parameters(): | |
| p.requires_grad = False | |
| return res | |
| def add_lora_ab_to_fm(self, fm: nn.Module, ab_r: int, samples): | |
| fm.eval() | |
| # samples = {'images' : samples[0], 'targets' : samples[1]} | |
| for k, v in samples.items(): | |
| if isinstance(v, torch.Tensor) or isinstance(v, BoxList): | |
| samples[k] = v.to(get_model_device(fm)) | |
| print(k) | |
| _, o1_token_logits, o1_dot_product_logits = fm(**samples) | |
| mo_list = {k:v for k, v in fm.named_modules()} | |
| for name, module in fm.named_modules(): | |
| if '.proj' in name or 'out' in name: | |
| continue | |
| if name.endswith(('k_proj', 'q_proj', 'v_proj', 'qkv', 'attn.proj', 'l_proj', 'query', 'key', 'value')): | |
| set_module(fm, name, ToQKV_WrappedWithLoRA(module, ab_r)) | |
| _, o2_token_logits, o2_dot_product_logits = fm(**samples) | |
| output_diff = 0. | |
| for o1, o2 in list(zip(o1_dot_product_logits, o2_dot_product_logits)): | |
| output_diff += ((o1 - o2) ** 2).sum() | |
| if o1_token_logits is not None: | |
| output_diff += ((o1_token_logits - o2_token_logits) ** 2).sum() | |
| assert output_diff < 1e-5 | |
| return fm | |
| def absorb_lora_and_recover_net_structure(self, fm: nn.Module, samples: dict): | |
| fm.eval() | |
| # print('absorb lora before') | |
| for k, v in samples.items(): | |
| if isinstance(v, torch.Tensor): | |
| samples[k] = v.to(get_model_device(fm)) | |
| print(k) | |
| _, o1_token_logits, o1_dot_product_logits = fm(**samples) | |
| for name, module in fm.named_modules(): | |
| if not isinstance(module, ToQKV_WrappedWithLoRA): | |
| continue | |
| fc = module.fc | |
| ab = module.ab | |
| fc.weight.add_(ab[1].weight @ ab[0].weight) | |
| set_module(fm, name, fc) | |
| # print('absorb lora after') | |
| _, o2_token_logits, o2_dot_product_logits = fm(**samples) | |
| output_diff = 0. | |
| for o1, o2 in list(zip(o1_dot_product_logits, o2_dot_product_logits)): | |
| output_diff += ((o1 - o2) ** 2).sum() | |
| if o1_token_logits is not None: | |
| output_diff += ((o1_token_logits - o2_token_logits) ** 2).sum() | |
| assert output_diff < 1e-3, output_diff | |
| return fm | |
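| # Offline foundation-model wrapper for multi-modal detection: accuracy is reported as mAP@50 via COCOEvaluator (averaged over datasets when evaluating a MergedDataset). | |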
| class ElasticDNN_OfflineMMDetFMModel(ElasticDNN_OfflineFMModel): | |
| def __init__(self, name: str, models_dict_path: str, device: str, num_classes=10, collate_fn=None): | |
| super().__init__(name, models_dict_path, device) | |
| self.num_classes = num_classes | |
| self.collate_fn = collate_fn | |
| def get_accuracy(self, test_loader, *args, **kwargs): | |
| # print('DeeplabV3: start test acc') | |
| _d = test_loader.dataset | |
| from data import build_dataloader | |
| if _d.__class__.__name__ == 'MergedDataset': | |
| # print('\neval on merged datasets') | |
| datasets = _d.datasets | |
| if self.collate_fn is None: | |
| test_loaders = [build_dataloader(d, test_loader.batch_size, test_loader.num_workers, False, None, collate_fn=None) for d in datasets] | |
| else: | |
| test_loaders = [build_dataloader(d, test_loader.batch_size, test_loader.num_workers, False, None, collate_fn=self.collate_fn) for d in datasets] | |
| accs = [self.get_accuracy(loader) for loader in test_loaders] | |
| # print(accs) | |
| return sum(accs) / len(accs) | |
| # print('dataset len', len(test_loader.dataset)) | |
| model = self.models_dict['main'] | |
| device = self.device | |
| model.eval() | |
| # print('# classes', model.num_classes) | |
| model = model.to(device) | |
| from evaluator import COCOEvaluator, MMCOCODecoder | |
| from utils.common.others import HiddenPrints | |
| with torch.no_grad(): | |
| with HiddenPrints(): | |
| evaluator = COCOEvaluator( | |
| dataloader=test_loader, | |
| img_size=(416, 416), | |
| confthre=0.01, | |
| nmsthre=0.65, | |
| num_classes=len(test_loader.dataset.classes), | |
| testdev=True | |
| ) | |
| res = evaluator.evaluate(model, False, False, decoder=MMCOCODecoder) | |
| map50 = res[1] | |
| # print('eval info', res[-1]) | |
| return map50 | |
| def infer(self, x, *args, **kwargs): | |
| if len(args) > 0: | |
| print(args, len(args)) | |
| return self.models_dict['main'](x, *args) # forward(x, label) | |
| return self.models_dict['main'](**x) | |
| class ElasticDNN_OfflineMMDetMDModel(ElasticDNN_OfflineMDModel): | |
| def __init__(self, name: str, models_dict_path: str, device: str, num_classes=10, collate_fn=None): | |
| super().__init__(name, models_dict_path, device) | |
| self.num_classes = num_classes | |
| self.collate_fn = collate_fn | |
| def get_accuracy(self, test_loader, *args, **kwargs): | |
| # print('DeeplabV3: start test acc') | |
| _d = test_loader.dataset | |
| from data import build_dataloader | |
| if _d.__class__.__name__ == 'MergedDataset': | |
| # print('\neval on merged datasets') | |
| datasets = _d.datasets | |
| if self.collate_fn is None: | |
| test_loaders = [build_dataloader(d, test_loader.batch_size, test_loader.num_workers, False, None, collate_fn=None) for d in datasets] | |
| else: | |
| test_loaders = [build_dataloader(d, test_loader.batch_size, test_loader.num_workers, False, None, collate_fn=self.collate_fn) for d in datasets] | |
| accs = [self.get_accuracy(loader) for loader in test_loaders] | |
| # print(accs) | |
| return sum(accs) / len(accs) | |
| # print('dataset len', len(test_loader.dataset)) | |
| model = self.models_dict['main'] | |
| device = self.device | |
| model.eval() | |
| # print('# classes', model.num_classes) | |
| model = model.to(device) | |
| from evaluator import COCOEvaluator, MMCOCODecoder | |
| from utils.common.others import HiddenPrints | |
| with torch.no_grad(): | |
| with HiddenPrints(): | |
| evaluator = COCOEvaluator( | |
| dataloader=test_loader, | |
| img_size=(416, 416), | |
| confthre=0.01, | |
| nmsthre=0.65, | |
| num_classes=len(test_loader.dataset.classes), | |
| testdev=True | |
| ) | |
| res = evaluator.evaluate(model, False, False, decoder=MMCOCODecoder) | |
| map50 = res[1] | |
| # print('eval info', res[-1]) | |
| return map50 | |
| def infer(self, x, *args, **kwargs): | |
| if len(args) > 0: | |
| return self.models_dict['main'](x, *args) # forward(x, label) | |
| return self.models_dict['main'](**x) | |
| class SqueezeLast(nn.Module): | |
| def __init__(self): | |
| super(SqueezeLast, self).__init__() | |
| def forward(self, x): | |
| return x.squeeze(-1) | |
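| # Conv2d wrapped with an FBS channel gate: a small MLP predicts per-channel saliency from the input, k_takes_all keeps the top-k channels, and the conv output is scaled channel-wise. | |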
| class ProjConv_WrappedWithFBS(Layer_WrappedWithFBS): | |
| def __init__(self, raw_conv2d: nn.Conv2d, r): | |
| super(ProjConv_WrappedWithFBS, self).__init__() | |
| self.fbs = nn.Sequential( | |
| Abs(), | |
| nn.AdaptiveAvgPool2d(1), | |
| nn.Flatten(), | |
| nn.Linear(raw_conv2d.in_channels, raw_conv2d.out_channels // r), | |
| nn.ReLU(), | |
| nn.Linear(raw_conv2d.out_channels // r, raw_conv2d.out_channels), | |
| nn.ReLU() | |
| ) | |
| self.raw_conv2d = raw_conv2d | |
| # self.raw_bn = raw_bn # remember clear the original BNs in the network | |
| nn.init.constant_(self.fbs[5].bias, 1.) | |
| nn.init.kaiming_normal_(self.fbs[5].weight) | |
| def forward(self, x): | |
| raw_x = self.raw_conv2d(x) | |
| if self.use_cached_channel_attention and self.cached_channel_attention is not None: | |
| channel_attention = self.cached_channel_attention | |
| else: | |
| self.cached_raw_channel_attention = self.fbs(x) | |
| self.cached_channel_attention = self.k_takes_all(self.cached_raw_channel_attention) | |
| channel_attention = self.cached_channel_attention | |
| return raw_x * channel_attention.unsqueeze(2).unsqueeze(3) | |
| class Linear_WrappedWithFBS(Layer_WrappedWithFBS): | |
| def __init__(self, linear: nn.Linear, r): | |
| super(Linear_WrappedWithFBS, self).__init__() | |
| self.linear = linear | |
| # for conv: (B, C_in, H, W) -> (B, C_in) -> (B, C_out) | |
| # for mlp in ViT: (B, #patches, D: dim of patches embedding) -> (B, D) -> (B, C_out) | |
| self.fbs = nn.Sequential( | |
| Rearrange('b n d -> b d n'), | |
| Abs(), | |
| nn.AdaptiveAvgPool1d(1), | |
| SqueezeLast(), | |
| nn.Linear(linear.in_features, max(linear.out_features // r, 36)), | |
| nn.ReLU(), | |
| nn.Linear(max(linear.out_features // r, 36), linear.out_features), | |
| nn.ReLU() | |
| ) | |
| nn.init.constant_(self.fbs[6].bias, 1.) | |
| nn.init.kaiming_normal_(self.fbs[6].weight) | |
| def forward(self, x): | |
| if self.use_cached_channel_attention and self.cached_channel_attention is not None: | |
| channel_attention = self.cached_channel_attention | |
| else: | |
| self.cached_raw_channel_attention = self.fbs(x) | |
| self.cached_channel_attention = self.k_takes_all(self.cached_raw_channel_attention) | |
| channel_attention = self.cached_channel_attention | |
| raw_res = self.linear(x) | |
| return channel_attention.unsqueeze(1) * raw_res | |
| class ToQKV_WrappedWithFBS(Layer_WrappedWithFBS): | |
| """ | |
| This regards to_q/to_k/to_v as a whole (in fact it consists of multiple heads) and prunes it. | |
| It seems different channels of different heads are pruned according to the input. | |
| This is different from "removing some head" or "removing the same channels in each head". | |
| """ | |
| def __init__(self, to_qkv: nn.Linear, r): | |
| super(ToQKV_WrappedWithFBS, self).__init__() | |
| # self.to_qkv = to_qkv | |
| self.to_qk = nn.Linear(to_qkv.in_features, to_qkv.out_features // 3 * 2, bias=to_qkv.bias is not None) | |
| self.to_v = nn.Linear(to_qkv.in_features, to_qkv.out_features // 3, bias=to_qkv.bias is not None) | |
| self.to_qk.weight.data.copy_(to_qkv.weight.data[0: to_qkv.out_features // 3 * 2]) | |
| if to_qkv.bias is not None: | |
| self.to_qk.bias.data.copy_(to_qkv.bias.data[0: to_qkv.out_features // 3 * 2]) | |
| self.to_v.weight.data.copy_(to_qkv.weight.data[to_qkv.out_features // 3 * 2: ]) | |
| if to_qkv.bias is not None: | |
| self.to_v.bias.data.copy_(to_qkv.bias.data[to_qkv.out_features // 3 * 2: ]) | |
| self.fbs = nn.Sequential( | |
| Rearrange('b n d -> b d n'), | |
| Abs(), | |
| nn.AdaptiveAvgPool1d(1), | |
| SqueezeLast(), | |
| nn.Linear(to_qkv.in_features, to_qkv.out_features // 3 // r), | |
| nn.ReLU(), | |
| # nn.Linear(to_qkv.out_features // 3 // r, to_qkv.out_features // 3), | |
| nn.Linear(to_qkv.out_features // 3 // r, self.to_v.out_features), | |
| nn.ReLU() | |
| ) | |
| nn.init.constant_(self.fbs[6].bias, 1.) | |
| nn.init.kaiming_normal_(self.fbs[6].weight) | |
| def forward(self, x): | |
| if self.use_cached_channel_attention and self.cached_channel_attention is not None: | |
| channel_attention = self.cached_channel_attention | |
| else: | |
| self.cached_raw_channel_attention = self.fbs(x) | |
| # print() | |
| # for attn in self.cached_raw_channel_attention.chunk(3, dim=1)[0: 1]: | |
| # print(self.cached_raw_channel_attention.size(), attn.size()) | |
| # print(self.k_takes_all.k) | |
| # print(attn[0].nonzero(as_tuple=True)[0].size(), attn[0]) | |
| self.cached_channel_attention = self.k_takes_all(self.cached_raw_channel_attention) | |
| # for attn in self.cached_channel_attention.chunk(3, dim=1)[0: 1]: | |
| # print(self.cached_channel_attention.size(), attn.size()) | |
| # print(self.k_takes_all.k) | |
| # print(attn[0].nonzero(as_tuple=True)[0].size(), attn[0]) | |
| # print() | |
| channel_attention = self.cached_channel_attention | |
| qk = self.to_qk(x) | |
| v = channel_attention.unsqueeze(1) * self.to_v(x) | |
| return torch.cat([qk, v], dim=-1) | |
| # qkv = raw_res.chunk(3, dim = -1) | |
| # raw_v = qkv[2] | |
| # print('raw_k, raw_v', qkv[0].sum((0, 1))[0: 10], qkv[0].sum((0, 1)).nonzero(as_tuple=True)[0].size(), | |
| # qkv[1].sum((0, 1))[0: 10], qkv[1].sum((0, 1)).nonzero(as_tuple=True)[0].size(),) | |
| # print('raw_v', raw_v.size(), raw_v.sum((0, 1))[0: 10], raw_v.sum((0, 1)).nonzero(as_tuple=True)[0].size()) | |
| # qkv_attn = channel_attention.chunk(3, dim=-1) | |
| # print('attn', [attn[0][0: 10] for attn in qkv_attn]) | |
| # print(channel_attention.unsqueeze(1).size(), raw_res.size()) | |
| # print('fbs', channel_attention.size(), raw_res.size()) | |
| # return channel_attention.unsqueeze(1) * raw_res | |
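| # Applies a fixed (pre-computed) channel attention; used to replace the dynamic FBS gate when a static surrogate DNN is extracted. | |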
| class StaticFBS(nn.Module): | |
| def __init__(self, static_channel_attention): | |
| super(StaticFBS, self).__init__() | |
| assert static_channel_attention.dim() == 2 and static_channel_attention.size(0) == 1 | |
| self.static_channel_attention = nn.Parameter(static_channel_attention, requires_grad=False) # (1, dim) | |
| def forward(self, x): | |
| # print('staticfbs', x, self.static_channel_attention.unsqueeze(1)) | |
| return x * self.static_channel_attention.unsqueeze(1) | |
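| # ElasticDNNUtil specialization for GLIP: converts the raw model into a master DNN by wrapping the MLP/intermediate linear layers with FBS gates, and extracts a surrogate DNN from channel attentions computed on a representative sample. | |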
| class ElasticGLIPUtil(ElasticDNNUtil): | |
| def convert_raw_dnn_to_master_dnn(self, raw_dnn: nn.Module, r: float, ignore_layers=[]): | |
| assert len(ignore_layers) == 0, 'not supported yet' | |
| raw_vit = deepcopy(raw_dnn) | |
| for name, module in raw_vit.named_modules(): | |
| # if name.endswith('patch_embed'): | |
| # set_module(module, 'proj', ProjConv_WrappedWithFBS(module.proj, r)) | |
| # if name.endswith('attn') and not name.endswith('b_attn.attn') and not name.endswith('b_attn'): | |
| # set_module(module, 'qkv', ToQKV_WrappedWithFBS(module.qkv, r)) | |
| if name.endswith('intermediate'): # FFN of the BERT-style language-branch layers | |
| set_module(module, 'dense', Linear_WrappedWithFBS(module.dense, r)) | |
| elif name.endswith('mlp'): # FFN of the Swin-style vision-branch blocks | |
| set_module(module, 'fc1', Linear_WrappedWithFBS(module.fc1, r)) | |
| return raw_vit | |
| def set_master_dnn_sparsity(self, master_dnn: nn.Module, sparsity: float): | |
| # for name, module in master_dnn.named_modules(): | |
| # if not name.endswith('attn'): | |
| # continue | |
| # q_features = module.qkv.to_qk.out_features // 2 | |
| # if (q_features - int(q_features * sparsity)) % module.num_heads != 0: | |
| # # tune sparsity to ensure #unpruned channel % num_heads == 0 | |
| # # so that the pruning seems to reduce the dim_head of each head | |
| # tuned_sparsity = 1. - int((q_features - int(q_features * sparsity)) / module.num_heads) * module.num_heads / q_features | |
| # logger.debug(f'tune sparsity from {sparsity:.2f} to {tuned_sparsity}') | |
| # sparsity = tuned_sparsity | |
| # break | |
| return super().set_master_dnn_sparsity(master_dnn, sparsity) | |
| def select_most_rep_sample(self, master_dnn: nn.Module, samples: dict): | |
| # print(samples) | |
| sample = {} | |
| sample['images'] = [samples['images'][0]] | |
| sample['targets'] = [samples['targets'][0]] | |
| # return samples[0].unsqueeze(0) | |
| # res = {k: v[0: 1] for k, v in samples.items()} | |
| return sample | |
| def extract_surrogate_dnn_via_samples(self, master_dnn: nn.Module, samples: dict, return_detail=False): # steps for generating the surrogate (small) model | |
| sample = self.select_most_rep_sample(master_dnn, samples) | |
| # assert sample.dim() == 4 and sample.size(0) == 1 | |
| # print('before') | |
| master_dnn.eval() | |
| self.clear_cached_channel_attention_in_master_dnn(master_dnn) | |
| with torch.no_grad(): | |
| _, o1_token_logits, o1_dot_product_logits = master_dnn(**sample) | |
| # print('after') | |
| boosted_vit = deepcopy(master_dnn) | |
| def get_unpruned_indexes_from_channel_attn(channel_attn: torch.Tensor, k): | |
| assert channel_attn.size(0) == 1, 'use a single representative sample to generate channel attentions' | |
| # print('attn_in_unpruned', channel_attn[0][0: 10]) | |
| res = channel_attn[0].nonzero(as_tuple=True)[0] # should be one-dim | |
| # res = channel_attn[0].argsort(descending=True)[0: -int(channel_attn.size(1) * k)].sort()[0] | |
| # g = channel_attn | |
| # k = g.size(1) - int(g.size(1) * k) | |
| # res = g.topk(k, 1)[1][0].sort()[0] | |
| return res | |
| unpruned_indexes_of_layers = {} | |
| # for attn, ff in boosted_vit.transformer.layers: | |
| # for block_i, block in enumerate(boosted_vit.blocks): | |
| for layer_i, layer in enumerate(boosted_vit.model.backbone.body.layers): | |
| for block_i, block in enumerate(layer.blocks): | |
| # attn = block.attn | |
| # ff = block.mlp | |
| ff_0 = get_module(block, 'mlp.fc1') | |
| # ff_0_unpruned_indexes = get_unpruned_indexes_from_channel_attn(ff_0.cached_channel_attention, k) | |
| ff_0_pruned_indexes = ff_0.k_takes_all.cached_i[0].sort()[0] | |
| ff_0_unpruned_indexes = torch.LongTensor([ii for ii in range(ff_0.cached_channel_attention.size(1)) if ii not in ff_0_pruned_indexes]) | |
| new_ff_0 = nn.Linear(ff_0.linear.in_features, ff_0_unpruned_indexes.size(0), ff_0.linear.bias is not None) | |
| new_ff_0.weight.data.copy_(ff_0.linear.weight.data[ff_0_unpruned_indexes]) | |
| if ff_0.linear.bias is not None: | |
| new_ff_0.bias.data.copy_(ff_0.linear.bias.data[ff_0_unpruned_indexes]) | |
| set_module(block, 'mlp.fc1', nn.Sequential(new_ff_0, StaticFBS(ff_0.cached_channel_attention[:, ff_0_unpruned_indexes]))) | |
| ff_1 = get_module(block, 'mlp.fc2') | |
| new_ff_1 = nn.Linear(ff_0_unpruned_indexes.size(0), ff_1.out_features, ff_1.bias is not None) | |
| new_ff_1.weight.data.copy_(ff_1.weight.data[:, ff_0_unpruned_indexes]) | |
| if ff_1.bias is not None: | |
| new_ff_1.bias.data.copy_(ff_1.bias.data) | |
| set_module(block, 'mlp.fc2', new_ff_1) | |
| unpruned_indexes_of_layers[f'model.backbone.body.layers.{layer_i}.blocks.{block_i}.mlp.fc1.0.weight'] = ff_0_unpruned_indexes | |
| # for block_i,block in enumerate(boosted_vit.vision_model.encoder.layers): | |
| # attn = block.self_attn | |
| # ff = block.mlp | |
| # ff_0 = ff.fc1 | |
| # # ff_0_unpruned_indexes = get_unpruned_indexes_from_channel_attn(ff_0.cached_channel_attention, k) | |
| # ff_0_pruned_indexes = ff_0.k_takes_all.cached_i[0].sort()[0] | |
| # ff_0_unpruned_indexes = torch.LongTensor([ii for ii in range(ff_0.cached_channel_attention.size(1)) if ii not in ff_0_pruned_indexes]) | |
| # new_ff_0 = nn.Linear(ff_0.linear.in_features, ff_0_unpruned_indexes.size(0), ff_0.linear.bias is not None) | |
| # new_ff_0.weight.data.copy_(ff_0.linear.weight.data[ff_0_unpruned_indexes]) | |
| # if ff_0.linear.bias is not None: | |
| # new_ff_0.bias.data.copy_(ff_0.linear.bias.data[ff_0_unpruned_indexes]) | |
| # set_module(ff, 'fc1', nn.Sequential(new_ff_0, StaticFBS(ff_0.cached_channel_attention[:, ff_0_unpruned_indexes]))) | |
| # ff_1 = ff.fc2 | |
| # new_ff_1 = nn.Linear(ff_0_unpruned_indexes.size(0), ff_1.out_features, ff_1.bias is not None) | |
| # new_ff_1.weight.data.copy_(ff_1.weight.data[:, ff_0_unpruned_indexes]) | |
| # if ff_1.bias is not None: | |
| # new_ff_1.bias.data.copy_(ff_1.bias.data) | |
| # set_module(ff, 'fc2', new_ff_1) | |
| # unpruned_indexes_of_layers[f'vision_model.encoder.layers.{block_i}.mlp.fc1.0.weight'] = ff_0_unpruned_indexes | |
| # for block_i, block in enumerate(boosted_vit.text_decoder.bert.encoder.layer): | |
| # # attn = block.attn | |
| # # ff = block.mlp | |
| # ff_0 = get_module(block, f'intermediate.dense') | |
| # # ff_0_unpruned_indexes = get_unpruned_indexes_from_channel_attn(ff_0.cached_channel_attention, k) | |
| # ff_0_pruned_indexes = ff_0.k_takes_all.cached_i[0].sort()[0] | |
| # ff_0_unpruned_indexes = torch.LongTensor([ii for ii in range(ff_0.cached_channel_attention.size(1)) if ii not in ff_0_pruned_indexes]) | |
| # new_ff_0 = nn.Linear(ff_0.linear.in_features, ff_0_unpruned_indexes.size(0), ff_0.linear.bias is not None) | |
| # new_ff_0.weight.data.copy_(ff_0.linear.weight.data[ff_0_unpruned_indexes]) | |
| # if ff_0.linear.bias is not None: | |
| # new_ff_0.bias.data.copy_(ff_0.linear.bias.data[ff_0_unpruned_indexes]) | |
| # set_module(block, 'intermediate.dense', nn.Sequential(new_ff_0, StaticFBS(ff_0.cached_channel_attention[:, ff_0_unpruned_indexes]))) | |
| # ff_1 = get_module(block, f'output.dense') | |
| # new_ff_1 = nn.Linear(ff_0_unpruned_indexes.size(0), ff_1.out_features, ff_1.bias is not None) | |
| # new_ff_1.weight.data.copy_(ff_1.weight.data[:, ff_0_unpruned_indexes]) | |
| # if ff_1.bias is not None: | |
| # new_ff_1.bias.data.copy_(ff_1.bias.data) | |
| # set_module(block, 'output.dense', new_ff_1) | |
| # unpruned_indexes_of_layers[f'text_decoder.bert.encoder.layer.{block_i}.intermediate.dense.0.weight'] = ff_0_unpruned_indexes | |
| surrogate_dnn = boosted_vit | |
| surrogate_dnn.eval() | |
| surrogate_dnn = surrogate_dnn.to(get_model_device(master_dnn)) | |
| # logger.debug(surrogate_dnn) | |
| with torch.no_grad(): | |
| _, o2_token_logits, o2_dot_product_logits = surrogate_dnn(**sample) | |
| output_diff = 0. | |
| for o1, o2 in list(zip(o1_dot_product_logits, o2_dot_product_logits)): | |
| output_diff += ((o1 - o2) ** 2).sum() | |
| if o1_token_logits is not None: | |
| output_diff += ((o1_token_logits - o2_token_logits) ** 2).sum() | |
| # assert output_diff < 1e-4, output_diff | |
| logger.info(f'output diff of master and surrogate DNN: {output_diff}') | |
| # logger.debug(f'example output of master/surrogate: {master_dnn_output.sum(0)[0: 10]}, {surrogate_dnn_output.sum(0)[0: 10]}') | |
| # logger.info(f'\nonly prune mlp!!!!\n') | |
| if return_detail: | |
| return boosted_vit, unpruned_indexes_of_layers | |
| return boosted_vit | |
| def extract_surrogate_dnn_via_samples_with_perf_test(self, master_dnn: nn.Module, samples, return_detail=False): | |
| master_dnn_size = get_model_size(master_dnn, True) | |
| sample = {} | |
| sample['images'] = [samples['images'][0]] | |
| sample['targets'] = [samples['targets'][0]] | |
| master_dnn_latency = self._get_model_latency(master_dnn, sample, 50, | |
| get_model_device(master_dnn), 50, False) | |
| res = self.extract_surrogate_dnn_via_samples(master_dnn, samples, return_detail) | |
| if not return_detail: | |
| surrogate_dnn = res | |
| else: | |
| surrogate_dnn, unpruned_indexes_of_layers = res | |
| surrogate_dnn_size = get_model_size(surrogate_dnn, True) | |
| surrogate_dnn_latency = self._get_model_latency(surrogate_dnn, sample, 50, | |
| get_model_device(surrogate_dnn), 50, False) | |
| logger.info(f'master DNN ({master_dnn_size:.3f}MB, {master_dnn_latency:.4f}s/sample) -> ' | |
| f'surrogate DNN ({surrogate_dnn_size:.3f}MB, {surrogate_dnn_latency:.4f}s/sample)\n' | |
| f'(model size: ↓ {(master_dnn_size / surrogate_dnn_size):.2f}x, ' | |
| f'latency: ↓ {(master_dnn_latency / surrogate_dnn_latency):.2f}x)') | |
| return res | |
| def _get_model_latency(self, model: torch.nn.Module, sample, sample_num: int, | |
| device: str, warmup_sample_num: int, return_detail=False): | |
| import time | |
| model = model.to(device) | |
| model.eval() | |
| sample['images'] = [sample['images'][0]] | |
| sample['targets'] = [sample['targets'][0]] | |
| # warm up | |
| with torch.no_grad(): | |
| for _ in range(warmup_sample_num): | |
| model(**sample) | |
| infer_time_list = [] | |
| if 'cuda' in str(device): | |
| with torch.no_grad(): | |
| for _ in range(sample_num): | |
| s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) | |
| s.record() | |
| model(**sample) | |
| e.record() | |
| torch.cuda.synchronize() | |
| cur_model_infer_time = s.elapsed_time(e) / 1000. | |
| infer_time_list += [cur_model_infer_time] | |
| else: | |
| with torch.no_grad(): | |
| for _ in range(sample_num): | |
| start = time.time() | |
| model(**sample) | |
| cur_model_infer_time = time.time() - start | |
| infer_time_list += [cur_model_infer_time] | |
| avg_infer_time = sum(infer_time_list) / sample_num | |
| if return_detail: | |
| return avg_infer_time, infer_time_list | |
| return avg_infer_time | |
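| # A hypothetical end-to-end usage sketch of ElasticGLIPUtil (kept commented out, like the examples at the | |
| # bottom of this file). Values such as r=16 and sparsity=0.5 are illustrative only; `glip_model` and | |
| # `collect_mm_fn` are the helpers used elsewhere in this file, and `train_loader` is assumed to be a | |
| # DataLoader built with collate_fn=collect_mm_fn. | |
| # util = ElasticGLIPUtil() | |
| # fm = glip_model('new_impl/cv/glip/object_detection/pretrained_model/glip_Swin_T_O365_GoldG.yaml', | |
| #                 'new_impl/cv/glip/object_detection/pretrained_model/glip_tiny_model_o365_goldg_cc_sbu.pth').cuda() | |
| # master_dnn = util.convert_raw_dnn_to_master_dnn(fm, r=16)    # wrap mlp.fc1 / intermediate.dense with FBS | |
| # util.set_master_dnn_sparsity(master_dnn, 0.5)                # sparsity = fraction of channels to prune | |
| # samples, _ = next(iter(train_loader))                        # dict with 'images' and 'targets' | |
| # surrogate_dnn = util.extract_surrogate_dnn_via_samples_with_perf_test(master_dnn, samples) | |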
| # from typing import Any, Dict | |
| # from schema import Schema, Or | |
| # import schema | |
| # from data import Scenario, MergedDataset | |
| # from methods.base.alg import BaseAlg | |
| # from data import build_dataloader | |
| # from ..model import ElasticDNN_OfflineFMModel, ElasticDNN_OfflineMDModel | |
| # from ...model.base import ElasticDNNUtil | |
| # import torch.optim | |
| # import tqdm | |
| # import torch.nn.functional as F | |
| # from torch import nn | |
| # from utils.dl.common.env import create_tbwriter | |
| # import os | |
| # import random | |
| # import numpy as np | |
| # from copy import deepcopy | |
| # from utils.dl.common.model import LayerActivation2, get_module | |
| # from utils.common.log import logger | |
| # class ElasticDNN_Det_MDPretrainingWoFBSAlg(BaseAlg): | |
| # """ | |
| # TODO: fine-tuned FM -> init MD -> trained MD -> construct indexes (only between similar weights) and fine-tune | |
| # """ | |
| # def get_required_models_schema(self) -> Schema: | |
| # return Schema({ | |
| # 'fm': ElasticDNN_OfflineFMModel, | |
| # 'md': ElasticDNN_OfflineMDModel | |
| # }) | |
| # def get_required_hyp_schema(self) -> Schema: | |
| # return Schema({ | |
| # 'launch_tbboard': bool, | |
| # 'samples_size': any, | |
| # 'generate_md_width_ratio': int, | |
| # 'train_batch_size': int, | |
| # 'val_batch_size': int, | |
| # 'num_workers': int, | |
| # 'optimizer': str, | |
| # 'optimizer_args': dict, | |
| # 'scheduler': str, | |
| # 'scheduler_args': dict, | |
| # 'num_iters': int, | |
| # 'val_freq': int, | |
| # 'distill_loss_weight': float | |
| # }) | |
| # def run(self, scenario: Scenario, hyps: Dict) -> Dict[str, Any]: | |
| # super().run(scenario, hyps) | |
| # assert isinstance(self.models['md'], ElasticDNN_OfflineMDModel) # for auto completion | |
| # assert isinstance(self.models['fm'], ElasticDNN_OfflineFMModel) # for auto completion | |
| # # 1. add FBS | |
| # device = self.models['md'].device | |
| # if self.models['md'].models_dict['main'] == -1: | |
| # logger.info(f'init master DNN by reducing width of an adapted foundation model (already tuned by LoRA)...') | |
| # before_fm_model = deepcopy(self.models['fm'].models_dict['main']) | |
| # lora_util = self.models['fm'].get_lora_util() | |
| # sample = hyps['samples_size'] | |
| # if isinstance(sample, (tuple, list)) and isinstance(sample[0], int): | |
| # sample = torch.rand(hyps['samples_size']).to(device) | |
| # lora_absorbed_fm_model = lora_util.absorb_lora_and_recover_net_structure(self.models['fm'].models_dict['main'], | |
| # sample) | |
| # self.models['fm'].models_dict['main'] = lora_absorbed_fm_model | |
| # master_dnn = self.models['fm'].generate_md_by_reducing_width(hyps['generate_md_width_ratio'], | |
| # sample) | |
| # self.models['fm'].models_dict['main'] = before_fm_model | |
| # self.models['md'].models_dict['main'] = master_dnn | |
| # self.models['md'].to(device) | |
| # # 2. train (knowledge distillation, index relationship) | |
| # offline_datasets = scenario.get_offline_datasets() | |
| # train_dataset = MergedDataset([d['train'] for d in offline_datasets.values()]) | |
| # val_dataset = MergedDataset([d['val'] for d in offline_datasets.values()]) | |
| # train_loader = iter(build_dataloader(train_dataset, hyps['train_batch_size'], hyps['num_workers'], | |
| # True, None)) | |
| # val_loader = build_dataloader(val_dataset, hyps['val_batch_size'], hyps['num_workers'], | |
| # False, False) | |
| # # logger.info(f'FM acc: {self.models["fm"].get_accuracy(val_loader):.4f}') | |
| # # 2.1 train whole master DNN (knowledge distillation) | |
| # for p in master_dnn.parameters(): | |
| # p.requires_grad = True | |
| # self.models['md'].to_train_mode() | |
| # optimizer = torch.optim.__dict__[hyps['optimizer']]([ | |
| # {'params': self.models['md'].models_dict['main'].parameters(), **hyps['optimizer_args']} | |
| # ]) | |
| # scheduler = torch.optim.lr_scheduler.__dict__[hyps['scheduler']](optimizer, **hyps['scheduler_args']) | |
| # tb_writer = create_tbwriter(os.path.join(self.res_save_dir, 'tb_log'), launch_tbboard=hyps['launch_tbboard']) | |
| # pbar = tqdm.tqdm(range(hyps['num_iters']), dynamic_ncols=True) | |
| # best_avg_val_acc = 0. | |
| # md_output_hook = None | |
| # for iter_index in pbar: | |
| # self.models['md'].to_train_mode() | |
| # self.models['fm'].to_eval_mode() | |
| # # rand_sparsity = random.random() * (hyps['max_sparsity'] - hyps['min_sparsity']) + hyps['min_sparsity'] | |
| # # elastic_dnn_util.set_master_dnn_sparsity(self.models['md'].models_dict['main'], rand_sparsity) | |
| # if md_output_hook is None: | |
| # md_output_hook = self.models['md'].get_feature_hook() | |
| # fm_output_hook = self.models['fm'].get_feature_hook() | |
| # x, y = next(train_loader) | |
| # if isinstance(x, dict): | |
| # for k, v in x.items(): | |
| # if isinstance(v, torch.Tensor): | |
| # x[k] = v.to(device) | |
| # y = y.to(device) | |
| # else: | |
| # x, y = x.to(device), y.to(device) | |
| # with torch.no_grad(): | |
| # fm_output = self.models['fm'].infer(x) | |
| # task_loss = self.models['md'].forward_to_get_task_loss(x, y) | |
| # md_output = md_output_hook.output | |
| # fm_output = fm_output_hook.output | |
| # distill_loss = hyps['distill_loss_weight'] * self.models['md'].get_distill_loss(md_output, fm_output) | |
| # total_loss = task_loss + distill_loss | |
| # optimizer.zero_grad() | |
| # total_loss.backward() | |
| # optimizer.step() | |
| # scheduler.step() | |
| # if (iter_index + 1) % hyps['val_freq'] == 0: | |
| # # elastic_dnn_util.clear_cached_channel_attention_in_master_dnn(self.models['md'].models_dict['main']) | |
| # md_output_hook.remove() | |
| # md_output_hook = None | |
| # fm_output_hook.remove() | |
| # fm_output_hook = None | |
| # cur_md = self.models['md'].models_dict['main'] | |
| # md_for_test = deepcopy(self.models['md'].models_dict['main']) | |
| # val_acc = 0. | |
| # self.models['md'].models_dict['main'] = md_for_test | |
| # self.models['md'].to_eval_mode() | |
| # val_acc = self.models['md'].get_accuracy(val_loader) | |
| # self.models['md'].models_dict['main'] = cur_md | |
| # self.models['md'].save_model(os.path.join(self.res_save_dir, 'models/md_last.pt')) | |
| # self.models['fm'].save_model(os.path.join(self.res_save_dir, 'models/fm_last.pt')) | |
| # if val_acc > best_avg_val_acc: | |
| # best_avg_val_acc = val_acc | |
| # self.models['md'].save_model(os.path.join(self.res_save_dir, 'models/md_best.pt')) | |
| # self.models['fm'].save_model(os.path.join(self.res_save_dir, 'models/fm_best.pt')) | |
| # tb_writer.add_scalars(f'losses', dict(task=task_loss, distill=distill_loss, total=total_loss), iter_index) | |
| # pbar.set_description(f'loss: {total_loss:.6f}') | |
| # if (iter_index + 1) >= hyps['val_freq']: | |
| # tb_writer.add_scalar(f'accs/val_acc', val_acc, iter_index) | |
| # pbar.set_description(f'loss: {total_loss:.6f}, val_acc: {val_acc:.4f}') | |
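| # A hypothetical `hyps` dict matching get_required_hyp_schema above; every value is illustrative only | |
| # and not taken from any real experiment config: | |
| # hyps = { | |
| #     'launch_tbboard': False, | |
| #     'samples_size': (1, 3, 224, 224), | |
| #     'generate_md_width_ratio': 4, | |
| #     'train_batch_size': 8, | |
| #     'val_batch_size': 16, | |
| #     'num_workers': 4, | |
| #     'optimizer': 'AdamW', | |
| #     'optimizer_args': {'lr': 1e-4, 'weight_decay': 0.01}, | |
| #     'scheduler': 'LambdaLR', | |
| #     'scheduler_args': {'lr_lambda': lambda _: 1.0}, | |
| #     'num_iters': 80000, | |
| #     'val_freq': 4000, | |
| #     'distill_loss_weight': 1.0 | |
| # } | |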
| # if __name__ == '__main__': | |
| # model = glip_model('new_impl/cv/glip/object_detection/pretrained_model/glip_Swin_T_O365_GoldG.yaml','new_impl/cv/glip/object_detection/pretrained_model/glip_tiny_model_o365_goldg_cc_sbu.pth').cuda() | |
| # model.eval() | |
| # # print(model) | |
| # # exit() | |
| # # config = CLIPConfig.from_pretrained('openai/clip-vit-base-patch16') | |
| # # print(config) | |
| # # # test 1: single image inference | |
| # from PIL import Image, ImageDraw | |
| # import requests | |
| # import numpy as np | |
| # ori_image = Image.open('new_impl/cv/glip/object_detection/9472793441_b7822c00de_z.jpg').convert("RGB") | |
| # image = [np.asarray(ori_image)[:, :, [2, 1, 0]]] | |
| # text = 'sofa . remote . dog . person . car . sky . plane .' | |
| # target = torch.Tensor() | |
| # o = model(image, text) | |
| # o = model._post_process(o[0]) | |
| # print(o) | |
| # bboxes = o.bbox.cpu() | |
| # a = ImageDraw.ImageDraw(ori_image) | |
| # for box in bboxes: | |
| # box = box.int() | |
| # a.rectangle(((box[0], box[1]), (box[2], box[3])), fill=None, outline='red', width=2) | |
| # ori_image.save('test.jpg') | |
| # # print(o.logits_per_image.softmax(dim=1)) | |
| # # o = model(image, torch.load('dnns/clip/test_input_embed.pth'), False) | |
| # # # print(o) | |
| # # print(o.logits_per_image.softmax(dim=1)) | |
| # # exit() | |
| # # test 2: normal training using clip loss (batch) | |
| # from data import get_dataset, build_dataloader | |
| # from torchvision.transforms import Compose, ToTensor, Resize | |
| # dataset = get_dataset('Caltech256', '/data/zql/datasets/Caltech-256/data/caltech256/256_ObjectCategories/', 'train', transform=Compose([ | |
| # Resize((32, 32)), ToTensor() | |
| # ])) | |
| # dataloader = build_dataloader(dataset, 8, 0, True, None) | |
| # from PIL import Image | |
| # import requests | |
| # images, labels = next(iter(dataloader)) | |
| # # torch.save(images, 'dnns/clip/test_image.pth') | |
| # classes = dataset.classes | |
| # text = [f"a photo of a {classes[i]}" for i in labels] # should be ground truth | |
| # print(text) | |
| # print(images.size()) | |
| # o = model(images, text, True) | |
| # print(o) | |
| # print(o.logits_per_image.softmax(dim=1)) | |
| # # o = model(image, torch.load('dnns/clip/test_input_embed.pth'), False) | |
| # # # print(o) | |
| # # print(o.logits_per_image.softmax(dim=1)) |