Feat (llm): benchmark for llm entrypoint #1166

Draft
wants to merge 2 commits into base: dev
170 changes: 170 additions & 0 deletions src/brevitas_examples/llm/benchmark/benchmark_config.yaml
@@ -0,0 +1,170 @@
act_calibration:
- false
act_equalization:
- null
- layerwise
- fx
act_equalization_alpha:
- 0.5
bias_corr:
- false
checkpoint_name:
- null
convert_layernorm_to_rmsnorm:
- false
dataset:
- wikitext2
- c4
eval:
- false
export_prefix:
- null
export_target:
- null
- onnx_qcdq
- torch_qcdq
- sharded_torchmlir_group_weight
- sharded_packed_torchmlir_group_weight
few_shot_compile:
- false
few_shot_eval:
- false
few_shot_limit:
- null
few_shot_tasks:
- - arc_challenge
  - arc_easy
  - winogrande
  - piqa
few_shot_zeroshot:
- false
functional_sdpa_quant:
- false
fuse_sequences:
- false
gpfq:
- false
gptq:
- false
gpxq_act_order:
- false
gpxq_block_name:
- null
gpxq_create_weight_orig:
- false
gpxq_max_accumulator_bit_width:
- null
gpxq_max_accumulator_tile_size:
- null
gpxq_use_quant_activations:
- false
input_bit_width:
- null
input_group_size:
- 64
input_param_method:
- stats
- mse
input_quant_format:
- int
input_quant_granularity:
- per_tensor
- per_row
- per_group
input_quant_type:
- sym
- asym
input_scale_precision:
- float_scale
- po2_scale
input_scale_type:
- static
- dynamic
- no_scale
learned_round:
- null
- linear_round
learned_round_fast_update:
- false
learned_round_iters:
- 200
learned_round_lr:
- 0.005
learned_round_scale:
- false
learned_round_scale_lr:
- 0.01
learned_round_scale_momentum:
- 0.9
ln_affine_merge:
- false
load_awq:
- null
load_checkpoint:
- false
model:
- facebook/opt-125m
no_float16:
- false
no_quantize:
- false
nsamples:
- 128
quant_sdpa:
- false
quantize_input_zero_point:
- false
quantize_last_layer:
- false
quantize_weight_zero_point:
- false
replace_mha:
- false
replace_rmsnorm:
- false
rotation:
- fx
- layerwise
- fused_no_fx
rotation_mode:
- had
- ort
rotation_orphan_sink:
- false
rotation_sdpa_regions:
- false
scale_rounding_func_type:
- round
- ceil
- floor
scaling_min_val:
- 0.0001
seed:
- 0
seqlen:
- 2048
weight_bit_width:
- 8
weight_equalization:
- false
weight_group_dim:
- 1
- 0
weight_group_size:
- 128
weight_param_method:
- stats
- mse
- hqo
weight_quant_format:
- int
weight_quant_granularity:
- per_channel
- per_tensor
- per_group
weight_quant_type:
- sym
- asym
weight_scale_precision:
- float_scale
- po2_scale
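
Each key in this configuration maps to a list of candidate values, and the benchmark script below sweeps the Cartesian product of those lists, launching one run per valid combination. As a reference, a minimal sketch of that expansion (the two-key grid is illustrative only, not part of this PR):

# Sketch: how list-valued config entries expand into per-run argument dictionaries
import itertools

grid = {"dataset": ["wikitext2", "c4"], "weight_bit_width": [8]}
keys, values = zip(*grid.items())
combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
# -> [{'dataset': 'wikitext2', 'weight_bit_width': 8},
#     {'dataset': 'c4', 'weight_bit_width': 8}]
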
157 changes: 157 additions & 0 deletions src/brevitas_examples/llm/benchmark/llm_benchmark.py
@@ -0,0 +1,157 @@
from argparse import ArgumentParser
from argparse import Namespace
import itertools
import os
import re
import subprocess
import sys
import threading
from types import SimpleNamespace
from typing import Dict, List

import pandas as pd
import randomname as rn
import yaml

from brevitas_examples.llm.main import create_llm_args_parser
from brevitas_examples.llm.main import validate

# Set appropriately for your system
PYTHON_BIN = "CONDA_ENV_DIR/bin/python"
LLM_ENTRYPOINT = "BREVITAS_DIR/brevitas/src/brevitas_examples/llm/main.py"
RESULTS_FOLDER = "RESULTS_DIR"
CUDA_AVAILABLE_DEVICES = [0, 1]
NUM_GPUS_PER_PROCESS = 1
NUM_RETRIES = 1


def run_args_bucket(id: int, args_dicts_queue: List[Dict]):
# Visible devices for the thread
    thread_cuda_visible_devices = ",".join(
        map(str, CUDA_AVAILABLE_DEVICES[id * NUM_GPUS_PER_PROCESS:(id + 1) * NUM_GPUS_PER_PROCESS]))
# Iterate over the combinations launching the LLM entrypoint
while True:
try:
# .pop is an atomic operation
args_dict = args_dicts_queue.pop()
except IndexError:
break
print(f"Thread {id}, remaining combinations {len(args_dicts_queue)}")
# Generate name for the experiment
job_name = rn.get_name()
job_folder = f"{RESULTS_FOLDER}/{job_name}"
# Create folder to store the results of the experiment
os.mkdir(job_folder)
# Save yaml file for reproducibility
with open(f"{job_folder}/config.yaml", 'w') as f:
yaml.dump(args_dict, f)
        # Enable rerunning the process if there was a crash
num_retries = 0
while num_retries < NUM_RETRIES:
stdout_file = open(f"{job_folder}/stdout.out", 'w')
stderr_file = open(f"{job_folder}/stderr.out", 'w')
process = subprocess.Popen(
[PYTHON_BIN, LLM_ENTRYPOINT, "--config", f"{job_folder}/config.yaml"],
                env={**os.environ, "CUDA_VISIBLE_DEVICES": thread_cuda_visible_devices},
stdout=stdout_file,
stderr=stderr_file,
)
# Wait before starting a new process to prevent using the same GPUs
return_code = process.wait()
stdout_file.close()
stderr_file.close()
            if return_code is not None and return_code == 0:
                break
            num_retries += 1


def parse_config_args(args: List[str]) -> Namespace:
parser = ArgumentParser(add_help=False)
parser.add_argument(
'--config',
type=str,
default=None,
help=
'Specify alternative default commandline args (e.g., config/default_template.yml). Default: %(default)s.'
)
return parser.parse_args(args)


def parse_results(columns: List[str], results_folder: str = RESULTS_FOLDER) -> pd.DataFrame:
df = pd.DataFrame(columns=columns)
for entry in os.scandir(results_folder):
if entry.is_dir():
# Get the identifier of the job
job_name = os.path.basename(entry.path)
# Retrieve the configuration from the YAML file
with open(f"{results_folder}/{job_name}/config.yaml", 'r') as f:
job_config = yaml.safe_load(f)
# Load the log file
with open(f"{results_folder}/{job_name}/stdout.out", 'r') as f:
job_log = f.read()
# Find the line containing Float PPL number
float_ppl_line = re.search(r"Float perplexity \((.*?)\): (\d+\.\d+)", job_log)
float_ppl = float(float_ppl_line.group(2)) if float_ppl_line is not None else None
# Find the line containing Quant PPL number
quant_ppl_line = re.search(r"Quantized perplexity \((.*?)\): (\d+\.\d+)", job_log)
quant_ppl = float(quant_ppl_line.group(2)) if quant_ppl_line is not None else None
# Add entry to DataFrame
row_data = {
"job_id": job_name, **job_config, "float_ppl": float_ppl, "quant_ppl": quant_ppl}
df.loc[len(df)] = list(row_data.values())
return df


if __name__ == "__main__":
# Instantiate directory for storing the results
if not os.path.exists(RESULTS_FOLDER):
os.makedirs(RESULTS_FOLDER)
if len(sys.argv) > 1:
args = parse_config_args(sys.argv[1:])
# Load argument combinations from specified YAML
with open(args.config, 'r') as f:
args_dict = yaml.safe_load(f)
else:
# Generate a YAML benchmark from default arguments
llm_parser = create_llm_args_parser()
args_dict = {
action.dest: [action.default] if action.choices is None else action.choices
for action in llm_parser._actions}
del args_dict["help"] # Config file cannot be specified via YAML
del args_dict["config"] # Config file cannot be specified via YAML
# Save YAML in the results folder
with open(f"{RESULTS_FOLDER}/benchmark_config.yaml", 'w') as f:
yaml.dump(args_dict, f)

# Generate combinations of arguments
args_keys, args_values = zip(*args_dict.items())
# Retrieve argument combinations that are valid for the LLM entrypoint
args_combinations = []
for v in itertools.product(*args_values):
args_combination = dict(zip(args_keys, v))
try:
# Check if the arguments are valid
validate(SimpleNamespace(**args_combination))
args_combinations.append(args_combination)
except AssertionError:
# Invalid configuration
pass
    # Number of worker threads, one per group of GPUs
num_threads = len(CUDA_AVAILABLE_DEVICES) // NUM_GPUS_PER_PROCESS
    # Instantiate threads that pop argument combinations from the shared queue
    threads = []
    for i in range(num_threads):
        thread = threading.Thread(target=run_args_bucket, args=(i, args_combinations))
        thread.start()
        threads.append(thread)

# Wait for all threads to complete
for thread in threads:
thread.join()

# Parse results
df = parse_results(columns=["job_id"] + list(args_keys) + ["float_ppl", "quant_ppl"])
df.to_csv(f"{RESULTS_FOLDER}/results.csv", index=False)
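
Once the sweep finishes, each job's configuration and its parsed float/quantized perplexities are aggregated into results.csv under RESULTS_FOLDER. A minimal sketch of inspecting that file (RESULTS_DIR is the placeholder path defined at the top of the script; the selected columns assume the default argument grid):

# Sketch: load the aggregated benchmark results written by llm_benchmark.py
import pandas as pd

df = pd.read_csv("RESULTS_DIR/results.csv")
# Each row is one job: its YAML configuration plus the parsed perplexities
print(df[["job_id", "model", "weight_bit_width", "float_ppl", "quant_ppl"]].head())
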
8 changes: 6 additions & 2 deletions src/brevitas_examples/llm/main.py
@@ -585,7 +585,7 @@ def override_defaults(args):
return defaults


def parse_args(args, override_defaults={}):
def create_llm_args_parser():
parser = argparse.ArgumentParser()
parser.add_argument(
'--config',
@@ -906,8 +906,12 @@ def parse_args(args, override_defaults={}):
type=str,
nargs='*',
help='A list of tasks for zero_shot evaluation. Default: %(default)s')
parser.set_defaults(**override_defaults)
return parser


def parse_args(args, override_defaults={}):
parser = create_llm_args_parser()
parser.set_defaults(**override_defaults)
return parser.parse_args(args)


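
For context, splitting the parser construction out of parse_args is what lets the benchmark script enumerate every argument's default value and choices without parsing a command line. A minimal sketch of that usage (the help/config filtering mirrors llm_benchmark.py above):

# Sketch: build a grid of sweepable values from the shared argument parser
from brevitas_examples.llm.main import create_llm_args_parser

parser = create_llm_args_parser()
# For each argument, sweep over its declared choices, or fall back to its default
grid = {
    action.dest: [action.default] if action.choices is None else list(action.choices)
    for action in parser._actions}
grid.pop("help", None)  # not a real entrypoint argument
grid.pop("config", None)  # a config file cannot itself be specified via YAML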