From dc56404e4bec79d8895e614ba77a2cd71ee9b04d Mon Sep 17 00:00:00 2001
From: yukang
Date: Wed, 10 Jul 2024 03:24:38 +0800
Subject: [PATCH] Updated VILA-benchmark with better scripts (#123)

* Add files via upload
* Add files via upload
* Add files via upload
* Rename eval_benchmark_1_correctness.py to eval_benchmark_correctness.py
* Rename eval_benchmark_2_detailed_orientation.py to eval_benchmark_detailed_orientation.py
* Rename eval_benchmark_3_context.py to eval_benchmark_context.py
* Rename eval_benchmark_4_temporal.py to eval_benchmark_temporal.py
* Rename eval_benchmark_5_consistency.py to eval_benchmark_consistency.py
* Update eval_all.sh
* Update eval_all.sh
* Update eval_all.sh
* Create run_vila_benchmark.sh
* Update slurm_run_all.sh
* Update eval_all.sh
* Update eval_all.sh
* Create eval_vila_benchmark_gpt4.sh
* Update eval_vila_benchmark_gpt4.sh
* Update eval_all.sh
* Update eval_all.sh
* Delete llava/eval/video/convert_pred_to_json.py
* Update model_vqa_videodemo_benchmark.py
* Update eval_all.sh
* Update run_vila_benchmark.sh
* Update slurm_run_all_sharded.sh
* Update slurm_run_all.sh
---
 llava/eval/video/convert_pred_to_json.py      |  29 ----
 ...tency.py => eval_benchmark_consistency.py} |   0
 ...3_context.py => eval_benchmark_context.py} |   0
 ...tness.py => eval_benchmark_correctness.py} |   0
 ...=> eval_benchmark_detailed_orientation.py} |   0
 ...temporal.py => eval_benchmark_temporal.py} |   0
 .../video/model_vqa_videodemo_benchmark.py    | 147 ++++++++++--------
 scripts/v1_5/eval/video_chatgpt/eval_all.sh   |  29 +++-
 .../video_chatgpt/eval_vila_benchmark_gpt4.sh |  14 ++
 .../eval/video_chatgpt/run_vila_benchmark.sh  |  17 ++
 .../v1_5/eval/video_chatgpt/slurm_run_all.sh  |   1 +
 .../video_chatgpt/slurm_run_all_sharded.sh    |   3 +-
 12 files changed, 145 insertions(+), 95 deletions(-)
 delete mode 100644 llava/eval/video/convert_pred_to_json.py
 rename llava/eval/video/{eval_benchmark_5_consistency.py => eval_benchmark_consistency.py} (100%)
 rename llava/eval/video/{eval_benchmark_3_context.py => eval_benchmark_context.py} (100%)
 rename llava/eval/video/{eval_benchmark_1_correctness.py => eval_benchmark_correctness.py} (100%)
 rename llava/eval/video/{eval_benchmark_2_detailed_orientation.py => eval_benchmark_detailed_orientation.py} (100%)
 rename llava/eval/video/{eval_benchmark_4_temporal.py => eval_benchmark_temporal.py} (100%)
 create mode 100644 scripts/v1_5/eval/video_chatgpt/eval_vila_benchmark_gpt4.sh
 create mode 100644 scripts/v1_5/eval/video_chatgpt/run_vila_benchmark.sh

diff --git a/llava/eval/video/convert_pred_to_json.py b/llava/eval/video/convert_pred_to_json.py
deleted file mode 100644
index 0e34a23a..00000000
--- a/llava/eval/video/convert_pred_to_json.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import os
-import json
-
-labels_json = json.load(open("/lustre/fs2/portfolios/nvr/users/yukangc/datasets/Video-Benchmark-Label-0607.json"))
-
-videos = [item['video_name'] for item in labels_json]
-
-model = "VILA1.5-40B-shortvideos"
-pred_path = "./eval_output/%s/Demo_Zero_Shot_QA_eval2/%s_bin_20_80"%(model, model)
-
-pred_dict = {}
-for pred in open(os.path.join(pred_path, "detailed_captions.txt")):
-    if ".mp4: " in pred:
-        video_name = pred.split(".mp4: ")[0]
-        if not video_name in videos:
-            continue
-        content = pred.split(".mp4: ")[1].rstrip("\n")
-        pred_dict[video_name] = content
-    else:
-        pred_dict[video_name] = pred_dict[video_name] + "\n%s"%pred
-
-output_json = []
-for item in labels_json:
-    video_name = item["video_name"]
-    item_output = item
-    item_output['pred'] = pred_dict[video_name]
-    output_json.append(item_output)
-
-json.dump(output_json, open(os.path.join(pred_path, "pred.json"), "w"))
diff --git a/llava/eval/video/eval_benchmark_5_consistency.py b/llava/eval/video/eval_benchmark_consistency.py
similarity index 100%
rename from llava/eval/video/eval_benchmark_5_consistency.py
rename to llava/eval/video/eval_benchmark_consistency.py
diff --git a/llava/eval/video/eval_benchmark_3_context.py b/llava/eval/video/eval_benchmark_context.py
similarity index 100%
rename from llava/eval/video/eval_benchmark_3_context.py
rename to llava/eval/video/eval_benchmark_context.py
diff --git a/llava/eval/video/eval_benchmark_1_correctness.py b/llava/eval/video/eval_benchmark_correctness.py
similarity index 100%
rename from llava/eval/video/eval_benchmark_1_correctness.py
rename to llava/eval/video/eval_benchmark_correctness.py
diff --git a/llava/eval/video/eval_benchmark_2_detailed_orientation.py b/llava/eval/video/eval_benchmark_detailed_orientation.py
similarity index 100%
rename from llava/eval/video/eval_benchmark_2_detailed_orientation.py
rename to llava/eval/video/eval_benchmark_detailed_orientation.py
diff --git a/llava/eval/video/eval_benchmark_4_temporal.py b/llava/eval/video/eval_benchmark_temporal.py
similarity index 100%
rename from llava/eval/video/eval_benchmark_4_temporal.py
rename to llava/eval/video/eval_benchmark_temporal.py
diff --git a/llava/eval/video/model_vqa_videodemo_benchmark.py b/llava/eval/video/model_vqa_videodemo_benchmark.py
index e8c36b63..31f93a56 100644
--- a/llava/eval/video/model_vqa_videodemo_benchmark.py
+++ b/llava/eval/video/model_vqa_videodemo_benchmark.py
@@ -5,24 +5,32 @@
 import os
 import json
 from tqdm import tqdm
-import shortuuid
 
 from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
 from llava.conversation import conv_templates, SeparatorStyle
+from llava import conversation as conversation_lib
 from llava.model.builder import load_pretrained_model
 from llava.data.dataset import LazySupervisedDataset
 from llava.utils import disable_torch_init
 from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
-from llava.mm_utils import process_images
+from llava.mm_utils import process_images, process_image
 
-from PIL import Image
 import math
-import numpy as np
+import signal
 
-from torchvision.transforms import Resize
-from pytorchvideo.data.encoded_video import EncodedVideo
+LABEL_PATHS = {
+    "pexels": "/home/yukangc/VILA-Benchmark/label/pexels.json",
+    "robotics": "/home/yukangc/VILA-Benchmark/label/robotics.json",
+    "av": "/home/yukangc/VILA-Benchmark/label/av.json",
+    "long": "/home/yukangc/VILA-Benchmark/label/long.json",
+}
 
-import signal
+VIDEO_DIR = {
+    "pexels": "/home/yukangc/VILA-Benchmark/data/pexels",
+    "robotics": "/home/yukangc/VILA-Benchmark/data/robotics",
+    "av": "/home/yukangc/VILA-Benchmark/data/av",
+    "long": "/home/yukangc/VILA-Benchmark/data/long",
+}
 
 # This function will be called when the timeout is reached
 def handler(signum, frame):
@@ -42,13 +50,30 @@ def get_chunk(lst, n, k):
 
 
 def get_model_output(model, image_processor, tokenizer, video_path, qs, args):
+    conversation_lib.default_conversation = conversation_lib.conv_templates[
+        args.conv_mode
+    ]
+    if hasattr(model.config, 'num_video_frames') and model.config.num_video_frames is not None:
+        num_video_frames = model.config.num_video_frames
+    else:
+        num_video_frames = 8
 
-    num_video_frames = model.config.num_video_frames
-    print("num_video_frames", num_video_frames)
-    images, video_loading_succeed = LazySupervisedDataset._load_video(video_path, num_video_frames, data_args=args)
-    image_tensor = process_images(images, image_processor, model.config)
+    if hasattr(model.config, 'fps') and model.config.fps is not None:
+        fps = model.config.fps
+    else:
+        fps = 0.0
 
-    qs = '<image>\n' * num_video_frames + qs
+    # print(fps)
+    images, frames_loaded = LazySupervisedDataset._load_video(video_path, num_video_frames, fps, args)
+    # image_tensor = process_images(images, image_processor, model.config)
+    image_tensor = torch.stack(
+        [process_image(image, args, None) for image in images]
+    )
+    num_frames_loaded_successfully = len(images)
+    # print(f"Number of frames loaded successfully: {num_frames_loaded_successfully}")
+    qs = qs.replace("<image>\n", "").replace("\n<image>", "").replace("<image>", "")
+    qs = qs.replace("