Feat (llm): benchmark for llm entrypoint #1166

Draft
wants to merge 2 commits into base: dev
170 changes: 170 additions & 0 deletions src/brevitas_examples/llm/benchmark/benchmark_config.yaml
@@ -0,0 +1,170 @@
act_calibration:
- false
act_equalization:
- null
- layerwise
- fx
act_equalization_alpha:
- 0.5
bias_corr:
- false
checkpoint_name:
- null
convert_layernorm_to_rmsnorm:
- false
dataset:
- wikitext2
- c4
eval:
- false
export_prefix:
- null
export_target:
- null
- onnx_qcdq
- torch_qcdq
- sharded_torchmlir_group_weight
- sharded_packed_torchmlir_group_weight
few_shot_compile:
- false
few_shot_eval:
- false
few_shot_limit:
- null
few_shot_tasks:
- - arc_challenge
  - arc_easy
  - winogrande
  - piqa
few_shot_zeroshot:
- false
functional_sdpa_quant:
- false
fuse_sequences:
- false
gpfq:
- false
gptq:
- false
gpxq_act_order:
- false
gpxq_block_name:
- null
gpxq_create_weight_orig:
- false
gpxq_max_accumulator_bit_width:
- null
gpxq_max_accumulator_tile_size:
- null
gpxq_use_quant_activations:
- false
input_bit_width:
- null
input_group_size:
- 64
input_param_method:
- stats
- mse
input_quant_format:
- int
input_quant_granularity:
- per_tensor
- per_row
- per_group
input_quant_type:
- sym
- asym
input_scale_precision:
- float_scale
- po2_scale
input_scale_type:
- static
- dynamic
- no_scale
learned_round:
- null
- linear_round
learned_round_fast_update:
- false
learned_round_iters:
- 200
learned_round_lr:
- 0.005
learned_round_scale:
- false
learned_round_scale_lr:
- 0.01
learned_round_scale_momentum:
- 0.9
ln_affine_merge:
- false
load_awq:
- null
load_checkpoint:
- false
model:
- facebook/opt-125m
no_float16:
- false
no_quantize:
- false
nsamples:
- 128
quant_sdpa:
- false
quantize_input_zero_point:
- false
quantize_last_layer:
- false
quantize_weight_zero_point:
- false
replace_mha:
- false
replace_rmsnorm:
- false
rotation:
- fx
- layerwise
- fused_no_fx
rotation_mode:
- had
- ort
rotation_orphan_sink:
- false
rotation_sdpa_regions:
- false
scale_rounding_func_type:
- round
- ceil
- floor
scaling_min_val:
- 0.0001
seed:
- 0
seqlen:
- 2048
weight_bit_width:
- 8
weight_equalization:
- false
weight_group_dim:
- 1
- 0
weight_group_size:
- 128
weight_param_method:
- stats
- mse
- hqo
weight_quant_format:
- int
weight_quant_granularity:
- per_channel
- per_tensor
- per_group
weight_quant_type:
- sym
- asym
weight_scale_precision:
- float_scale
- po2_scale
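
Each key in this configuration maps to a list of candidate values, and the benchmark script below sweeps the Cartesian product of those lists, launching one run per valid combination. As a reference, a minimal sketch of that expansion (the two-key grid is illustrative only, not part of this PR):

# Sketch: how list-valued config entries expand into per-run argument dictionaries
import itertools

grid = {"dataset": ["wikitext2", "c4"], "weight_bit_width": [8]}
keys, values = zip(*grid.items())
combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
# -> [{'dataset': 'wikitext2', 'weight_bit_width': 8},
#     {'dataset': 'c4', 'weight_bit_width': 8}]
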
157 changes: 157 additions & 0 deletions src/brevitas_examples/llm/benchmark/llm_benchmark.py
@@ -0,0 +1,157 @@
from argparse import ArgumentParser
from argparse import Namespace
import itertools
import os
import re
import subprocess
import sys
import threading
from types import SimpleNamespace
from typing import Dict, List

import pandas as pd
import randomname as rn
import yaml

from brevitas_examples.llm.main import create_llm_args_parser
from brevitas_examples.llm.main import validate

# Set appropriately for your system
PYTHON_BIN = "CONDA_ENV_DIR/bin/python"
LLM_ENTRYPOINT = "BREVITAS_DIR/brevitas/src/brevitas_examples/llm/main.py"
RESULTS_FOLDER = "RESULTS_DIR"
CUDA_AVAILABLE_DEVICES = [0, 1]
NUM_GPUS_PER_PROCESS = 1
NUM_RETRIES = 1


def run_args_bucket(id: int, args_dicts_queue: List[Dict]):
# Visible devices for the thread
    thread_cuda_visible_devices = ",".join(
        map(str, CUDA_AVAILABLE_DEVICES[id * NUM_GPUS_PER_PROCESS:(id + 1) * NUM_GPUS_PER_PROCESS]))
# Iterate over the combinations launching the LLM entrypoint
while True:
try:
# .pop is an atomic operation
args_dict = args_dicts_queue.pop()
except IndexError:
break
print(f"Thread {id}, remaining combinations {len(args_dicts_queue)}")
# Generate name for the experiment
job_name = rn.get_name()
job_folder = f"{RESULTS_FOLDER}/{job_name}"
# Create folder to store the results of the experiment
os.mkdir(job_folder)
# Save yaml file for reproducibility
with open(f"{job_folder}/config.yaml", 'w') as f:
yaml.dump(args_dict, f)
        # Enable rerunning the process if there was a crash
num_retries = 0
while num_retries < NUM_RETRIES:
stdout_file = open(f"{job_folder}/stdout.out", 'w')
stderr_file = open(f"{job_folder}/stderr.out", 'w')
process = subprocess.Popen(
[PYTHON_BIN, LLM_ENTRYPOINT, "--config", f"{job_folder}/config.yaml"],
                env={**os.environ, "CUDA_VISIBLE_DEVICES": thread_cuda_visible_devices},
stdout=stdout_file,
stderr=stderr_file,
)
# Wait before starting a new process to prevent using the same GPUs
return_code = process.wait()
stdout_file.close()
stderr_file.close()
            if return_code is not None and return_code == 0:
                break
            num_retries += 1


def parse_config_args(args: List[str]) -> Namespace:
parser = ArgumentParser(add_help=False)
parser.add_argument(
'--config',
type=str,
default=None,
help=
'Specify alternative default commandline args (e.g., config/default_template.yml). Default: %(default)s.'
)
return parser.parse_args(args)


def parse_results(columns: List[str], results_folder: str = RESULTS_FOLDER) -> pd.DataFrame:
df = pd.DataFrame(columns=columns)
for entry in os.scandir(results_folder):
if entry.is_dir():
# Get the identifier of the job
job_name = os.path.basename(entry.path)
# Retrieve the configuration from the YAML file
with open(f"{results_folder}/{job_name}/config.yaml", 'r') as f:
job_config = yaml.safe_load(f)
# Load the log file
with open(f"{results_folder}/{job_name}/stdout.out", 'r') as f:
job_log = f.read()
# Find the line containing Float PPL number
float_ppl_line = re.search(r"Float perplexity \((.*?)\): (\d+\.\d+)", job_log)
float_ppl = float(float_ppl_line.group(2)) if float_ppl_line is not None else None
# Find the line containing Quant PPL number
quant_ppl_line = re.search(r"Quantized perplexity \((.*?)\): (\d+\.\d+)", job_log)
quant_ppl = float(quant_ppl_line.group(2)) if quant_ppl_line is not None else None
# Add entry to DataFrame
row_data = {
"job_id": job_name, **job_config, "float_ppl": float_ppl, "quant_ppl": quant_ppl}
df.loc[len(df)] = list(row_data.values())
return df


if __name__ == "__main__":
# Instantiate directory for storing the results
if not os.path.exists(RESULTS_FOLDER):
os.makedirs(RESULTS_FOLDER)
if len(sys.argv) > 1:
args = parse_config_args(sys.argv[1:])
# Load argument combinations from specified YAML
with open(args.config, 'r') as f:
args_dict = yaml.safe_load(f)
else:
# Generate a YAML benchmark from default arguments
llm_parser = create_llm_args_parser()
args_dict = {
action.dest: [action.default] if action.choices is None else action.choices
for action in llm_parser._actions}
del args_dict["help"] # Config file cannot be specified via YAML
del args_dict["config"] # Config file cannot be specified via YAML
# Save YAML in the results folder
with open(f"{RESULTS_FOLDER}/benchmark_config.yaml", 'w') as f:
yaml.dump(args_dict, f)

# Generate combinations of arguments
args_keys, args_values = zip(*args_dict.items())
# Retrieve argument combinations that are valid for the LLM entrypoint
args_combinations = []
for v in itertools.product(*args_values):
args_combination = dict(zip(args_keys, v))
try:
# Check if the arguments are valid
validate(SimpleNamespace(**args_combination))
args_combinations.append(args_combination)
except AssertionError:
# Invalid configuration
pass
    # Number of worker threads, one per group of GPUs
num_threads = len(CUDA_AVAILABLE_DEVICES) // NUM_GPUS_PER_PROCESS
    # Instantiate threads that pop argument combinations from the shared queue
    threads = []
    for i in range(num_threads):
        thread = threading.Thread(target=run_args_bucket, args=(i, args_combinations))
        thread.start()
        threads.append(thread)

# Wait for all threads to complete
for thread in threads:
thread.join()

# Parse results
df = parse_results(columns=["job_id"] + list(args_keys) + ["float_ppl", "quant_ppl"])
df.to_csv(f"{RESULTS_FOLDER}/results.csv", index=False)
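
Once the sweep finishes, each job's configuration and its parsed float/quantized perplexities are aggregated into results.csv under RESULTS_FOLDER. A minimal sketch of inspecting that file (RESULTS_DIR is the placeholder path defined at the top of the script; the selected columns assume the default argument grid):

# Sketch: load the aggregated benchmark results written by llm_benchmark.py
import pandas as pd

df = pd.read_csv("RESULTS_DIR/results.csv")
# Each row is one job: its YAML configuration plus the parsed perplexities
print(df[["job_id", "model", "weight_bit_width", "float_ppl", "quant_ppl"]].head())
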
8 changes: 6 additions & 2 deletions src/brevitas_examples/llm/main.py
@@ -585,7 +585,7 @@ def override_defaults(args):
return defaults


def parse_args(args, override_defaults={}):
def create_llm_args_parser():
parser = argparse.ArgumentParser()
parser.add_argument(
'--config',
@@ -906,8 +906,12 @@ def parse_args(args, override_defaults={}):
type=str,
nargs='*',
help='A list of tasks for zero_shot evaluation. Default: %(default)s')
parser.set_defaults(**override_defaults)
return parser


def parse_args(args, override_defaults={}):
parser = create_llm_args_parser()
parser.set_defaults(**override_defaults)
return parser.parse_args(args)


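
For context, splitting the parser construction out of parse_args is what lets the benchmark script enumerate every argument's default value and choices without parsing a command line. A minimal sketch of that usage (the help/config filtering mirrors llm_benchmark.py above):

# Sketch: build a grid of sweepable values from the shared argument parser
from brevitas_examples.llm.main import create_llm_args_parser

parser = create_llm_args_parser()
# For each argument, sweep over its declared choices, or fall back to its default
grid = {
    action.dest: [action.default] if action.choices is None else list(action.choices)
    for action in parser._actions}
grid.pop("help", None)  # not a real entrypoint argument
grid.pop("config", None)  # a config file cannot itself be specified via YAML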