# Hisab Cloud
# Uploaded via huggingface_hub (revision 45e92bd, verified)
import pandas as pd
import hashlib
from ..smp import *
from .dataset_config import dataset_URLs, dataset_md5_dict, DATASET_TYPE
from .custom_prompt import CustomPrompt
from .matching_util import can_infer
def isliststr(s):
    """Return True if *s* looks like a stringified Python list, i.e. '[...]'.

    Guards against empty input: the previous index-based check (``s[0]`` /
    ``s[-1]``) raised IndexError on an empty string; this returns False.
    """
    return len(s) > 1 and s.startswith('[') and s.endswith(']')
def check_md5(data_path, dataset):
    """Verify the md5 checksum of a downloaded dataset file.

    Returns True when no reference md5 is recorded for ``dataset`` (the check
    is skipped with a warning) or when the file's digest matches the record;
    returns False (with a warning) on mismatch, signalling a re-download.
    """
    if dataset not in dataset_md5_dict:
        warnings.warn(f'We do not have an md5 record for dataset {dataset}, skip the md5 check. ')
        return True
    assert osp.exists(data_path)
    # Stream the file in 1 MiB chunks so large tsv files are never fully
    # loaded into memory.  `digest` avoids shadowing the builtin `hash`.
    digest = hashlib.md5()
    with open(data_path, 'rb') as f:
        for chunk in iter(lambda: f.read(2**20), b''):
            digest.update(chunk)
    if digest.hexdigest() == dataset_md5_dict[dataset]:
        return True
    warnings.warn('this data file is incomplete, so it needs to be downloaded again.')
    return False
def split_MMMU(msgs):
    """Split an MMMU-style message list into interleaved text/image segments.

    The single text message contains '<image N>' placeholders; each one is
    replaced by the N-th (1-based) image message, yielding an alternating
    list of {'type': 'text'|'image', 'value': ...} dicts.
    """
    prompt, image_list = None, []
    for item in msgs:
        kind = item['type']
        if kind == 'image':
            image_list.append(item['value'])
        elif kind == 'text':
            # Exactly one text message is expected in the input.
            assert prompt is None
            prompt = item['value']

    pieces = prompt.split('<image ')
    result = [dict(type='text', value=pieces[0])]
    for piece in pieces[1:]:
        # Each remaining piece starts with a single-digit index and '>'.
        assert istype(piece[0], int) and piece[1] == '>'
        result.append(dict(type='image', value=image_list[int(piece[0]) - 1]))
        result.append(dict(type='text', value=piece[2:]))
    return result
def MMMU_result_transfer(result_path):
    """Convert an MMMU xlsx result file into an {id: prediction} json file.

    Multiple-choice rows (those with a non-null 'A' column) are resolved via
    ``can_infer`` against their option letters; open-ended rows keep the raw
    prediction. Returns the path of the written json file.
    """
    data = load(result_path)
    is_mcq = data['A'].notna()
    answers = {}
    for row_idx in range(len(data)):
        row = data.iloc[row_idx]
        pred = row['prediction']
        if is_mcq[row_idx]:
            # Collect the available option letters (A, B, C, ...) for this row.
            opts = {
                ch: row[ch]
                for ch in string.ascii_uppercase
                if ch in row and not pd.isna(row[ch])
            }
            answers[row['id']] = can_infer(pred, opts)
        else:
            answers[row['id']] = pred
    out_path = result_path.replace('.xlsx', '.json')
    dump(answers, out_path)
    return out_path
class TSVDataset(CustomPrompt):
    """TSV-backed evaluation dataset.

    Loads (downloading if necessary) a dataset tsv into a DataFrame and
    builds per-sample multimodal prompts as lists of
    {'type': 'image'|'text', 'value': ...} messages.
    """

    def __init__(self, dataset='MMBench', skip_noimg=True):
        # Root directory where dataset tsv files are stored/downloaded.
        self.data_root = LMUDataRoot()
        assert osp.exists(self.data_root)
        self.dataset = dataset
        self.dataset_type = DATASET_TYPE(dataset)

        if dataset in dataset_URLs:
            url = dataset_URLs[dataset]
            file_name = url.split('/')[-1]
            data_path = osp.join(self.data_root, file_name)
            # Reuse the local copy only if it exists and its md5 checks out.
            if osp.exists(data_path) and check_md5(data_path, dataset):
                pass
            elif osp.isfile(url):
                # If url is actually a file path, use it directly
                data_path = url
            else:
                warnings.warn('The dataset tsv is not downloaded')
                download_file(url, data_path)
        else:
            # Unknown dataset name: expect a local tsv named after the dataset.
            data_path = osp.join(self.data_root, dataset + '.tsv')
        assert osp.exists(data_path)

        data = load(data_path)
        self.skip_noimg = skip_noimg
        if skip_noimg and 'image' in data:
            # Drop rows that carry no image payload.
            data = data[~pd.isna(data['image'])]

        # Prompt for Captioning
        if listinstr(['COCO'], dataset):
            data['question'] = [(
                'Please describe this image in general. Directly provide the description, '
                'do not include prefix like "This image depicts". '
            )] * len(data)

        # Indices are normalized to str for the image_map lookups below.
        data['index'] = [str(x) for x in data['index']]

        # meta_only stays True when the tsv carries no inline 'image' column,
        # i.e. samples only reference images via 'image_path'.
        self.meta_only = True
        if 'image' in data:
            data['image'] = [str(x) for x in data['image']]
            image_map = {x: y for x, y in zip(data['index'], data['image'])}
            # Short 'image' values (<= 64 chars) are treated as references to
            # another row's index whose entry holds the actual long payload;
            # dereference them here.  NOTE(review): the 64-char threshold is a
            # heuristic taken as-is — confirm against the tsv format spec.
            for k in image_map:
                if len(image_map[k]) <= 64:
                    idx = image_map[k]
                    assert idx in image_map and len(image_map[idx]) > 64
                    image_map[k] = image_map[idx]
            # NOTE(review): eval() runs on tsv content and trusts the data
            # file; list-like strings become Python lists.  Consider
            # ast.literal_eval if the tsv source is not fully trusted.
            data['image'] = [
                eval(image_map[k]) if isliststr(image_map[k]) else image_map[k]
                for k in data['index']
            ]
            self.meta_only = False

        if 'image_path' in data:
            # Same trusted-eval caveat as for the 'image' column above.
            data['image_path'] = [
                eval(pths) if isliststr(pths) else pths for pths in data['image_path']
            ]

        if np.all([istype(x, int) for x in data['index']]):
            # Restore integer indices when every index string is numeric.
            data['index'] = [int(x) for x in data['index']]

        self.data = data

    def __len__(self):
        return len(self.data)

    def build_prompt(self, line, dataset=None):
        """Build the message list (images + text prompt) for one sample.

        Args:
            line: row position (int) into ``self.data``, or the row itself.
            dataset: optional dataset name overriding ``self.dataset``.

        Returns:
            list of {'type': 'image'|'text', 'value': ...} dicts — all images
            first, the assembled text prompt last.
        """
        if dataset is None:
            dataset = self.dataset

        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            # No inline image payload: the row stores path(s) directly.
            tgt_path = line['image_path']
        else:
            tgt_path = self.dump_image(line, dataset)

        prompt = line['question']
        if DATASET_TYPE(dataset) == 'multi-choice':
            question = line['question']
            # Gather present, non-null option columns (A, B, C, ...).
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = 'Options:\n'
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'
            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = ''
            if hint is not None:
                prompt += f'Hint: {hint}\n'
            prompt += f'Question: {question}\n'
            if len(options):
                prompt += options_prompt
                prompt += 'Please select the correct answer from the options above. \n'
        elif DATASET_TYPE(dataset) == 'VQA':
            # Short-answer nudge for datasets scored on exact/short matches.
            if listinstr(['ocrvqa', 'textvqa', 'chartqa', 'docvqa'], dataset.lower()):
                prompt += '\nPlease try to answer the question with short words or phrases if possible\n.'

        msgs = []
        if isinstance(tgt_path, list):
            # Multi-image sample: one image message per path.
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))
        return msgs

    def display(self, line):
        """Render one sample (row position or row) via ``mmqa_display``."""
        if isinstance(line, int):
            line = self.data.iloc[line]
        mmqa_display(line)