import argparse
import json
import math
import os

import torch
from tqdm import tqdm

from dc.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from dc.conversation import conv_templates
from dc.eval.model_utils import load_video
from dc.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from dc.model.builder import load_pretrained_model
from dc.utils import disable_torch_init

def llava_inference(video_frames, question, conv_mode, model, tokenizer, image_processor, image_sizes, args):
    """Run one round of video question answering and return the decoded answer."""
    # Prepend the image token(s) to the question in the format the model expects.
    if model.config.mm_use_im_start_end:
        qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + question
    else:
        qs = DEFAULT_IMAGE_TOKEN + '\n' + question

    # Build the full conversation prompt from the selected template.
    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    image_tensor = process_images(video_frames, image_processor, model.config)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
            image_sizes=image_sizes,
            do_sample=args.temperature > 0,
            temperature=args.temperature,
            top_p=args.top_p,
            num_beams=args.num_beams,
            max_new_tokens=512,
            use_cache=True)

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    print(outputs)
    return outputs
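
# For reference, with the default 'vicuna_v1' template the assembled prompt looks
# roughly like the sketch below; the exact wording is defined by
# dc.conversation.conv_templates, so treat this as illustrative, not authoritative:
#   "A chat between a curious user and an artificial intelligence assistant. [...]
#    USER: <image>\n{question} ASSISTANT:"
# tokenizer_image_token then swaps the '<image>' placeholder for IMAGE_TOKEN_INDEX.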

def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks."""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division, so no element is dropped
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    """Return the k-th of the n chunks produced by split_list."""
    chunks = split_list(lst, n)
    return chunks[k]
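
# For example, sharding 10 samples across 3 workers gives chunks of
# ceil(10 / 3) = 4 items each (the last chunk may be shorter):
#   get_chunk(list(range(10)), 3, 0) -> [0, 1, 2, 3]
#   get_chunk(list(range(10)), 3, 2) -> [8, 9]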

def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--video_dir', help='Directory containing video files.', required=True)
    parser.add_argument('--gt_file', help='Path to the ground-truth file.', required=True)
    parser.add_argument('--output_dir', help='Directory to save the model results JSON.', required=True)
    parser.add_argument('--output_name', help='Name of the file for storing results JSON.', required=True)
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--conv-mode', type=str, required=False, default='vicuna_v1')
    parser.add_argument('--num_chunks', type=int, default=1)
    parser.add_argument('--chunk_idx', type=int, default=0)
    parser.add_argument('--num_frames', type=int, default=100)
    parser.add_argument('--device', type=str, required=False, default='cuda:0')
    parser.add_argument('--model-base', type=str, default=None)
    parser.add_argument('--num_beams', type=int, default=1)
    parser.add_argument('--temperature', type=float, default=0.2)
    parser.add_argument('--top_p', type=float, default=None)
    parser.add_argument('--use_pool', action='store_true')
    return parser.parse_args()
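
# The ground-truth file is assumed to be a JSON list of samples, each carrying at
# least a 'video_name' and a question under 'Q'; any other keys are passed through
# to the output unchanged. An illustrative entry (field values are made up):
#   {"video_name": "v_001", "Q": "What is the person doing?"}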

def run_inference(args):
    """
    Run inference on a set of video files using the Dense Connector model.

    Args:
        args: Command-line arguments.
    """
    disable_torch_init()
    model_path = os.path.expanduser(args.model_name)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        model_path, args.model_base, model_name, is_video=True, if_pool=args.use_pool)

    # Load the ground-truth samples and keep only this worker's shard.
    with open(args.gt_file, "r") as f:
        gt_contents = json.load(f)
    gt_contents = get_chunk(gt_contents, args.num_chunks, args.chunk_idx)

    # Create the output directory if it doesn't exist, then open the answers file.
    # Answers are written incrementally, one JSON object per line.
    os.makedirs(args.output_dir, exist_ok=True)
    answers_file = os.path.join(args.output_dir, f"{args.output_name}.json")
    ans_file = open(answers_file, "w")

    output_list = []  # List to store the output results
    conv_mode = args.conv_mode
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    # Iterate over each sample in the ground-truth file.
    for sample in tqdm(gt_contents):
        video_name = sample['video_name']
        sample_set = sample
        question = sample['Q']

        # Load the video file under the first extension that exists on disk.
        for fmt in video_formats:
            temp_path = os.path.join(args.video_dir, f"{video_name}{fmt}")
            if os.path.exists(temp_path):
                video_frames, sizes = load_video(temp_path, num_frm=args.num_frames)

                # Run inference on the video and add the output to the list.
                output = llava_inference(video_frames, question, conv_mode, model,
                                         tokenizer, image_processor, sizes, args)
                sample_set['pred'] = output
                output_list.append(sample_set)
                ans_file.write(json.dumps(sample_set) + "\n")
                break

    ans_file.close()
    # Alternatively, save the accumulated output list as a single JSON file:
    # with open(os.path.join(args.output_dir, f"{args.output_name}.json"), 'w') as file:
    #     json.dump(output_list, file)

if __name__ == "__main__":
    args = parse_args()
    run_inference(args)
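
# Example invocation (the script name and all paths below are illustrative):
#   python run_inference_benchmark.py \
#       --video_dir ./videos \
#       --gt_file ./gt_qa.json \
#       --output_dir ./results \
#       --output_name preds_chunk0 \
#       --model_name ./checkpoints/dense-connector-7b \
#       --num_chunks 2 --chunk_idx 0
# Launching once per GPU with distinct --chunk_idx (and --output_name) values
# shards the benchmark; each worker writes one JSON object per line to its own
# <output_dir>/<output_name>.json.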