From 16e49cc499249ec7ddb7510339526fd3a3a6006c Mon Sep 17 00:00:00 2001
From: Pablo Monteagudo Lago
Date: Mon, 20 Jan 2025 18:41:30 +0000
Subject: [PATCH 1/2] LLM benchmark proposal

---
 .../llm/benchmark/benchmark_config.yaml | 170 ++++++++++++++++++
 .../llm/benchmark/llm_benchmark.py      | 153 ++++++++++++++++
 2 files changed, 323 insertions(+)
 create mode 100644 src/brevitas_examples/llm/benchmark/benchmark_config.yaml
 create mode 100644 src/brevitas_examples/llm/benchmark/llm_benchmark.py

diff --git a/src/brevitas_examples/llm/benchmark/benchmark_config.yaml b/src/brevitas_examples/llm/benchmark/benchmark_config.yaml
new file mode 100644
index 000000000..bab226113
--- /dev/null
+++ b/src/brevitas_examples/llm/benchmark/benchmark_config.yaml
@@ -0,0 +1,170 @@
+act_calibration:
+- false
+act_equalization:
+- null
+- layerwise
+- fx
+act_equalization_alpha:
+- 0.5
+bias_corr:
+- false
+checkpoint_name:
+- null
+convert_layernorm_to_rmsnorm:
+- false
+dataset:
+- wikitext2
+- c4
+eval:
+- false
+export_prefix:
+- null
+export_target:
+- null
+- onnx_qcdq
+- torch_qcdq
+- sharded_torchmlir_group_weight
+- sharded_packed_torchmlir_group_weight
+few_shot_compile:
+- false
+few_shot_eval:
+- false
+few_shot_limit:
+- null
+few_shot_tasks:
+- - arc_challenge
+  - arc_easy
+  - winogrande
+  - piqa
+few_shot_zeroshot:
+- false
+functional_sdpa_quant:
+- false
+fuse_sequences:
+- false
+gpfq:
+- false
+gptq:
+- false
+gpxq_act_order:
+- false
+gpxq_block_name:
+- null
+gpxq_create_weight_orig:
+- false
+gpxq_max_accumulator_bit_width:
+- null
+gpxq_max_accumulator_tile_size:
+- null
+gpxq_use_quant_activations:
+- false
+input_bit_width:
+- null
+input_group_size:
+- 64
+input_param_method:
+- stats
+- mse
+input_quant_format:
+- int
+input_quant_granularity:
+- per_tensor
+- per_row
+- per_group
+input_quant_type:
+- sym
+- asym
+input_scale_precision:
+- float_scale
+- po2_scale
+input_scale_type:
+- static
+- dynamic
+- no_scale
+learned_round:
+- null
+- linear_round
+learned_round_fast_update:
+- false
+learned_round_iters:
+- 200
+learned_round_lr:
+- 0.005
+learned_round_scale:
+- false
+learned_round_scale_lr:
+- 0.01
+learned_round_scale_momentum:
+- 0.9
+ln_affine_merge:
+- false
+load_awq:
+- null
+load_checkpoint:
+- false
+model:
+- facebook/opt-125m
+no_float16:
+- false
+no_quantize:
+- false
+nsamples:
+- 128
+quant_sdpa:
+- false
+quantize_input_zero_point:
+- false
+quantize_last_layer:
+- false
+quantize_weight_zero_point:
+- false
+replace_mha:
+- false
+replace_rmsnorm:
+- false
+rotation:
+- fx
+- layerwise
+- fused_no_fx
+rotation_mode:
+- had
+- ort
+rotation_orphan_sink:
+- false
+rotation_sdpa_regions:
+- false
+scale_rounding_func_type:
+- round
+- ceil
+- floor
+scaling_min_val:
+- 0.0001
+seed:
+- 0
+seqlen:
+- 2048
+weight_bit_width:
+- 8
+weight_equalization:
+- false
+weight_group_dim:
+- 1
+- 0
+weight_group_size:
+- 128
+weight_param_method:
+- stats
+- mse
+- hqo
+weight_quant_format:
+- int
+weight_quant_granularity:
+- per_channel
+- per_tensor
+- per_group
+weight_quant_type:
+- sym
+- asym
+weight_scale_precision:
+- float_scale
+- po2_scale
diff --git a/src/brevitas_examples/llm/benchmark/llm_benchmark.py b/src/brevitas_examples/llm/benchmark/llm_benchmark.py
new file mode 100644
index 000000000..981214a8e
--- /dev/null
+++ b/src/brevitas_examples/llm/benchmark/llm_benchmark.py
@@ -0,0 +1,153 @@
+from argparse import ArgumentParser
+from argparse import Namespace
+import itertools
+import os
+import re
+import subprocess
+import sys
+import threading
+from types import SimpleNamespace
+from typing import Dict, List
+
+import pandas as pd
+import randomname as rn
+import yaml
+
+from brevitas_examples.llm.main import instantiate_llm_parser
+from brevitas_examples.llm.main import validate
+
+# Set appropriately for your system
+PYTHON_BIN = "CONDA_ENV_DIR/bin/python"
+LLM_ENTRYPOINT = "BREVITAS_DIR/brevitas/src/brevitas_examples/llm/main.py"
+RESULTS_FOLDER = "RESULTS_DIR"
+CUDA_AVAILABLE_DEVICES = [0, 1]
+NUM_GPUS_PER_PROCESS = 1
+
+
+def run_args_bucket(id: int, args_dicts_bucket: List[Dict]):
+    # Visible devices for the thread
+    thread_cuda_visible_devices = ",".join(
+        map(str, CUDA_AVAILABLE_DEVICES[id * NUM_GPUS_PER_PROCESS:(id + 1) * NUM_GPUS_PER_PROCESS]))
+    # Iterate over the combinations launching the LLM entrypoint
+    for i in range(len(args_dicts_bucket)):
+        print(f"Thread {id}, starting process {i+1}/{len(args_dicts_bucket)}")
+        # Generate name for the experiment
+        job_name = rn.get_name()
+        job_folder = f"{RESULTS_FOLDER}/{job_name}"
+        # Create folder to store the results of the experiment
+        os.mkdir(job_folder)
+        # Save yaml file for reproducibility
+        with open(f"{job_folder}/config.yaml", 'w') as f:
+            yaml.dump(args_dicts_bucket[i], f)
+        # Run process
+        stdout_file = open(f"{job_folder}/stdout.out", 'w')
+        stderr_file = open(f"{job_folder}/stderr.out", 'w')
+        process = subprocess.Popen(
+            [PYTHON_BIN, LLM_ENTRYPOINT, "--config", f"{job_folder}/config.yaml"],
+            env={"CUDA_VISIBLE_DEVICES": thread_cuda_visible_devices},
+            stdout=stdout_file,
+            stderr=stderr_file,
+        )
+        # Wait before starting a new process to prevent using the same GPUs
+        process.wait()
+        stdout_file.close()
+        stderr_file.close()
+
+
+def parse_config_args(args: List[str]) -> Namespace:
+    parser = ArgumentParser(add_help=False)
+    parser.add_argument(
+        '--config',
+        type=str,
+        default=None,
+        help=
+        'Specify alternative default commandline args (e.g., config/default_template.yml). Default: %(default)s.'
+    )
+    return parser.parse_args(args)
+
+
+def parse_results(columns: List[str], results_folder: str = RESULTS_FOLDER) -> pd.DataFrame:
+    df = pd.DataFrame(columns=columns)
+    for entry in os.scandir(results_folder):
+        if entry.is_dir():
+            # Get the identifier of the job
+            job_name = os.path.basename(entry.path)
+            # Retrieve the configuration from the YAML file
+            with open(f"{results_folder}/{job_name}/config.yaml", 'r') as f:
+                job_config = yaml.safe_load(f)
+            # Load the log file
+            with open(f"{results_folder}/{job_name}/stdout.out", 'r') as f:
+                job_log = f.read()
+            # Find the line containing the Float PPL number
+            float_ppl_line = re.search(r"Float perplexity \((.*?)\): (\d+\.\d+)", job_log)
+            float_ppl = float(float_ppl_line.group(2)) if float_ppl_line is not None else None
+            # Find the line containing the Quant PPL number
+            quant_ppl_line = re.search(r"Quantized perplexity \((.*?)\): (\d+\.\d+)", job_log)
+            quant_ppl = float(quant_ppl_line.group(2)) if quant_ppl_line is not None else None
+            # Add entry to DataFrame
+            row_data = {
+                "job_id": job_name, **job_config, "float_ppl": float_ppl, "quant_ppl": quant_ppl}
+            df.loc[len(df)] = list(row_data.values())
+    return df
+
+
+if __name__ == "__main__":
+    # Create the directory for storing the results
+    if not os.path.exists(RESULTS_FOLDER):
+        os.makedirs(RESULTS_FOLDER)
+    if len(sys.argv) > 1:
+        args = parse_config_args(sys.argv[1:])
+        # Load argument combinations from the specified YAML
+        with open(args.config, 'r') as f:
+            args_dict = yaml.safe_load(f)
+    else:
+        # Generate a YAML benchmark from default arguments
+        llm_parser = instantiate_llm_parser()
+        args_dict = {
+            action.dest: [action.default] if action.choices is None else action.choices
+            for action in llm_parser._actions}
+        del args_dict["help"]  # The help action cannot be specified via YAML
+        del args_dict["config"]  # Config file cannot be specified via YAML
+        # Save YAML in the results folder
+        with open(f"{RESULTS_FOLDER}/benchmark_config.yaml", 'w') as f:
+            yaml.dump(args_dict, f)
+
+    # Generate combinations of arguments
+    args_keys, args_values = zip(*args_dict.items())
+    # Retrieve argument combinations that are valid for the LLM entrypoint
+    args_combinations = []
+    for v in itertools.product(*args_values):
+        args_combination = dict(zip(args_keys, v))
+        try:
+            # Check if the arguments are valid
+            validate(SimpleNamespace(**args_combination))
+            args_combinations.append(args_combination)
+        except AssertionError:
+            # Invalid configuration
+            pass
+    # Number of argument combinations
+    num_combinations = len(args_combinations)
+    num_buckets = len(CUDA_AVAILABLE_DEVICES) // NUM_GPUS_PER_PROCESS
+    bucket_size = num_combinations // num_buckets
+    # Split the combinations into different buckets, each belonging to a different thread
+    args_combinations_buckets = [
+        args_combinations[i * bucket_size:(i + 1) * bucket_size] for i in range(num_buckets)]
+
+    # Instantiate threads to run the arguments in each bucket
+    threads = []
+    for i in range(num_buckets):
+        thread = threading.Thread(
+            target=run_args_bucket, args=(
+                i,
+                args_combinations_buckets[i],
+            ))
+        thread.start()
+        threads.append(thread)
+
+    # Wait for all threads to complete
+    for thread in threads:
+        thread.join()
+
+    # Parse results
+    df = parse_results(columns=["job_id"] + list(args_keys) + ["float_ppl", "quant_ppl"])
+    df.to_csv(f"{RESULTS_FOLDER}/results.csv", index=False)

From f7879ea317f4369a8030639a93ac7566e91c6ae3 Mon Sep 17 00:00:00 2001
From: Pablo Monteagudo Lago
Date: Tue, 21 Jan 2025 18:14:26 +0000
Subject: [PATCH 2/2] Remove buckets

---
 .../llm/benchmark/llm_benchmark.py | 60 ++++++++++---------
 src/brevitas_examples/llm/main.py  |  8 ++-
 2 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/src/brevitas_examples/llm/benchmark/llm_benchmark.py b/src/brevitas_examples/llm/benchmark/llm_benchmark.py
index 981214a8e..092566e3d 100644
--- a/src/brevitas_examples/llm/benchmark/llm_benchmark.py
+++ b/src/brevitas_examples/llm/benchmark/llm_benchmark.py
@@ -13,7 +13,7 @@
 import randomname as rn
 import yaml
 
-from brevitas_examples.llm.main import instantiate_llm_parser
+from brevitas_examples.llm.main import create_llm_args_parser
 from brevitas_examples.llm.main import validate
 
 # Set appropriately for your system
@@ -22,15 +22,21 @@
 RESULTS_FOLDER = "RESULTS_DIR"
 CUDA_AVAILABLE_DEVICES = [0, 1]
 NUM_GPUS_PER_PROCESS = 1
+NUM_RETRIES = 1
 
 
-def run_args_bucket(id: int, args_dicts_bucket: List[Dict]):
+def run_args_bucket(id: int, args_dicts_queue: List[Dict]):
     # Visible devices for the thread
     thread_cuda_visible_devices = ",".join(
         map(str, CUDA_AVAILABLE_DEVICES[id * NUM_GPUS_PER_PROCESS:(id + 1) * NUM_GPUS_PER_PROCESS]))
     # Iterate over the combinations launching the LLM entrypoint
-    for i in range(len(args_dicts_bucket)):
-        print(f"Thread {id}, starting process {i+1}/{len(args_dicts_bucket)}")
+    while True:
+        try:
+            # .pop is an atomic operation
+            args_dict = args_dicts_queue.pop()
+        except IndexError:
+            break
+        print(f"Thread {id}, remaining combinations {len(args_dicts_queue)}")
         # Generate name for the experiment
         job_name = rn.get_name()
         job_folder = f"{RESULTS_FOLDER}/{job_name}"
@@ -38,20 +44,24 @@ def run_args_bucket(id: int, args_dicts_bucket: List[Dict]):
         os.mkdir(job_folder)
         # Save yaml file for reproducibility
         with open(f"{job_folder}/config.yaml", 'w') as f:
-            yaml.dump(args_dicts_bucket[i], f)
-        # Run process
-        stdout_file = open(f"{job_folder}/stdout.out", 'w')
-        stderr_file = open(f"{job_folder}/stderr.out", 'w')
-        process = subprocess.Popen(
-            [PYTHON_BIN, LLM_ENTRYPOINT, "--config", f"{job_folder}/config.yaml"],
-            env={"CUDA_VISIBLE_DEVICES": thread_cuda_visible_devices},
-            stdout=stdout_file,
-            stderr=stderr_file,
-        )
-        # Wait before starting a new process to prevent using the same GPUs
-        process.wait()
-        stdout_file.close()
-        stderr_file.close()
+            yaml.dump(args_dict, f)
+        # Rerun the process if it crashes, retrying at most NUM_RETRIES
+        # times until it exits with a zero return code
+        for _ in range(NUM_RETRIES):
+            stdout_file = open(f"{job_folder}/stdout.out", 'w')
+            stderr_file = open(f"{job_folder}/stderr.out", 'w')
+            process = subprocess.Popen(
+                [PYTHON_BIN, LLM_ENTRYPOINT, "--config", f"{job_folder}/config.yaml"],
+                env={"CUDA_VISIBLE_DEVICES": thread_cuda_visible_devices},
+                stdout=stdout_file,
+                stderr=stderr_file,
+            )
+            # Wait before starting a new process to prevent using the same GPUs
+            return_code = process.wait()
+            stdout_file.close()
+            stderr_file.close()
+            if return_code is not None and return_code == 0:
+                break
 
 
 def parse_config_args(args: List[str]) -> Namespace:
@@ -102,7 +112,7 @@ def parse_results(columns: List[str], results_folder: str = RESULTS_FOLDER) -> p
             args_dict = yaml.safe_load(f)
     else:
         # Generate a YAML benchmark from default arguments
-        llm_parser = instantiate_llm_parser()
+        llm_parser = create_llm_args_parser()
         args_dict = {
             action.dest: [action.default] if action.choices is None else action.choices
             for action in llm_parser._actions}
@@ -126,20 +136,14 @@ def parse_results(columns: List[str], results_folder: str = RESULTS_FOLDER) -> p
             # Invalid configuration
             pass
     # Number of argument combinations
-    num_combinations = len(args_combinations)
-    num_buckets = len(CUDA_AVAILABLE_DEVICES) // NUM_GPUS_PER_PROCESS
-    bucket_size = num_combinations // num_buckets
-    # Split the combinations into different buckets, each belonging to a different thread
-    args_combinations_buckets = [
-        args_combinations[i * bucket_size:(i + 1) * bucket_size] for i in range(num_buckets)]
-
+    num_threads = len(CUDA_AVAILABLE_DEVICES) // NUM_GPUS_PER_PROCESS
     # Instantiate threads to run the arguments in each bucket
     threads = []
-    for i in range(num_buckets):
+    for i in range(num_threads):
         thread = threading.Thread(
             target=run_args_bucket, args=(
                 i,
-                args_combinations_buckets[i],
+                args_combinations,
             ))
         thread.start()
         threads.append(thread)
diff --git a/src/brevitas_examples/llm/main.py b/src/brevitas_examples/llm/main.py
index 8c4fe1968..d7aac0e9b 100644
--- a/src/brevitas_examples/llm/main.py
+++ b/src/brevitas_examples/llm/main.py
@@ -585,7 +585,7 @@ def override_defaults(args):
     return defaults
 
 
-def parse_args(args, override_defaults={}):
+def create_llm_args_parser():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         '--config',
@@ -906,8 +906,12 @@
         type=str,
         nargs='*',
         help='A list of tasks for zero_shot evaluation. Default: %(default)s')
-    parser.set_defaults(**override_defaults)
-
+    return parser
+
+
+def parse_args(args, override_defaults={}):
+    parser = create_llm_args_parser()
+    parser.set_defaults(**override_defaults)
     return parser.parse_args(args)