
Merge branch 'main' into egor/ipex_config
Egor-Krivov committed Dec 27, 2023
2 parents 839ca17 + 9f706d2 commit f73e5ad
Showing 9 changed files with 496 additions and 380 deletions.
13 changes: 9 additions & 4 deletions .github/workflows/execute-test-script.yml
@@ -82,9 +82,9 @@ jobs:
       shell: bash -el {0}
       run: |
         case "${{ inputs.compiler }}" in
-          torch_mlir)
+          torch_mlir*)
             echo conda_env=mlir-dev >> $GITHUB_OUTPUT;;
-          ipex)
+          ipex*)
             echo conda_env=ipex >> $GITHUB_OUTPUT;;
           *)
             if [[ ${{ inputs.device }} = "cuda" ]]; then
@@ -113,9 +113,14 @@ jobs:
           URL="--url ${{ secrets.DB_URL }}"
         fi
-        export DL_BENCH_ARGS="--host ${{ inputs.runner_type }} --compiler ${{ inputs.compiler }} --device ${{ inputs.device }} --tag ${{ inputs.tag }} -v ${URL}"
+        export DL_BENCH_ARGS="--host ${{ inputs.runner_type }} --compiler ${{ inputs.compiler }} --device ${{ inputs.device }} --tag ${{ inputs.tag }} -v ${URL} --skip_verification"
         export KMP_AFFINITY=respect,noreset,granularity=fine,balanced
-        numactl -N 1 ${{ inputs.test_script }}
+        export OMP_NUM_THREADS=32
+        if [[ ${LABELS} = *glados* ]]; then
+          numactl -N 1 ${{ inputs.test_script }}
+        else
+          source ${{ inputs.test_script }}
+        fi
     - name: Upload results.db to artifacts when running in AWS
       if: contains(fromJSON(needs.get_runner_labels.outputs.runner_labels), 'aws')
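The widened torch_mlir*) and ipex*) patterns let one case arm serve the new ipex_onednn_graph compiler (and any future variants) without enumerating every name. A minimal Python sketch of the same selection logic, for illustration only; the fallback branch is truncated in this excerpt, so the env names in the final return are hypothetical:

    from fnmatch import fnmatch

    def conda_env_for(compiler: str, device: str) -> str:
        # Shell-style globs, mirroring the `case` patterns above.
        if fnmatch(compiler, "torch_mlir*"):
            return "mlir-dev"
        if fnmatch(compiler, "ipex*"):
            return "ipex"  # now also matches ipex_onednn_graph
        # Fallback arm is not shown in the diff; these names are assumptions.
        return "cuda-env" if device == "cuda" else "default-env"

    assert conda_env_for("ipex_onednn_graph", "cpu") == "ipex"
    assert conda_env_for("torch_mlir", "cpu") == "mlir-dev"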
3 changes: 2 additions & 1 deletion .github/workflows/test-single-config.yml
@@ -20,10 +20,11 @@ on:
         options:
           - torch
           - dynamo
-          - torch_mlir
           - torchscript
           - torchscript_onednn
           - ipex
+          - torch_mlir
+          - ipex_onednn_graph
     tag:
       description: tag to label this run in DB
       required: true
1 change: 1 addition & 0 deletions .github/workflows/test.yml
@@ -51,6 +51,7 @@ jobs:
           {device: 'cpu', compiler: 'torchscript'},
           {device: 'cpu', compiler: 'torchscript_onednn'},
           {device: 'cpu', compiler: 'ipex'},
+          # {device: 'cpu', compiler: 'ipex_onednn_graph'},
           # {device: 'xpu', compiler: 'ipex'},
           {device: 'cpu', compiler: 'torch_mlir'}
         ]
51 changes: 44 additions & 7 deletions dl_bench/cli/launcher.py
@@ -8,6 +8,7 @@
 from dl_bench.mlp_basic import MlpBasicBenchmark
 from dl_bench.report.report import BenchmarkDb
 from dl_bench.utils import Backend
+from dl_bench.tools.compare_tensors import compare
 
 benchmarks_table = {
     "mlp_oneiter": MlpBasicBenchmark,
@@ -16,6 +17,19 @@
 }
 
 
+def fix_lengths(outputs, ref_outputs):
+    """To speed up benchmarking we pass a different number of batches to different
+    backends, so the lengths need to be matched before comparison."""
+    min_lengths = min(len(outputs), len(ref_outputs))
+    if len(outputs) != len(ref_outputs):
+        print(
+            f"Slicing passed batches to smallest size {len(outputs)}->{min_lengths}; {len(ref_outputs)}->{min_lengths}"
+        )
+        return outputs[:min_lengths], ref_outputs[:min_lengths]
+    else:
+        return outputs, ref_outputs
+
+
 def parse_args():
     parser = argparse.ArgumentParser()
     # Benchmark
@@ -67,6 +81,7 @@ def parse_args():
             "torchscript",
             "torchscript_onednn",
             "ipex",
+            "ipex_onednn_graph",
             "torch_mlir",
         ],
         help="Compilation mode to use. No compilation by default.",
@@ -95,6 +110,12 @@ def parse_args():
     parser.add_argument(
         "-v", "--verbose", required=False, action="store_true", help="Verbose mode."
     )
+    parser.add_argument(
+        "--skip_verification",
+        required=False,
+        action="store_true",
+        help="Skip output verification.",
+    )
     return parser.parse_args()


@@ -123,16 +144,23 @@ def main():
         compiler = "torch"
     dtype = args.dtype
     backend_desc = args.backend_desc or f"{host}_{device}_{compiler}"
-    if dtype != 'float32':
-        backend_desc += '_' + str(dtype)
+    if dtype != "float32":
+        backend_desc += "_" + str(dtype)
 
     backend = Backend(device=device, compiler=compiler, dtype=dtype)
-    benchmark = benchmarks_table[benchmark_name]()
-    results = benchmark.run(backend=backend, params=benchmark_params)
+    benchmark = benchmarks_table[benchmark_name](benchmark_params)
+    if args.skip_verification:
+        results, _ = benchmark.inference(backend)
+    else:
+        ref_device = "cpu" if device != "cuda" else device
+        reference_backend = Backend(device=ref_device, compiler="torch", dtype=dtype)
+        _, ref_outputs = benchmark.inference(reference_backend)
+        results, outputs = benchmark.inference(backend)
+        outputs, ref_outputs = fix_lengths(outputs, ref_outputs)
+        cmp_res = compare(outputs, ref_outputs)
 
     print(f"Benchmark {benchmark_name} completed")
 
report = {
"tag": args.tag,
"benchmark": benchmark_name,
@@ -143,14 +171,23 @@ def main():
         "device": device,
         "compiler": compiler,
         "dtype": dtype,
-        **{c: results.get(c, 0) for c in ["warmup_s", "duration_s", "samples_per_s", "flops_per_sample"]},
+        **{
+            c: results.get(c, 0)
+            for c in ["warmup_s", "duration_s", "samples_per_s", "flops_per_sample"]
+        },
     }
 
     db = BenchmarkDb(args.url)
 
     if args.verbose:
         print("Report:")
-        print("TFLOPS: {:.3}".format(results.get("flops_per_sample", 0) * results.get('samples_per_s', 0) / (10**12)))
+        print(
+            "TFLOPS: {:.3}".format(
+                results.get("flops_per_sample", 0)
+                * results.get("samples_per_s", 0)
+                / (10**12)
+            )
+        )
         pprint.pprint(report)
 
     if args.output is not None:
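The new verification path hinges on compare from dl_bench.tools.compare_tensors, which this diff only imports. As a rough sketch of what such a checker could look like — an assumption, since the module body is not shown here — it would take the two length-matched batch lists returned by fix_lengths and test numerical closeness per batch:

    import torch

    def compare(outputs, ref_outputs, rtol=1e-3, atol=1e-3):
        # Hypothetical stand-in for dl_bench.tools.compare_tensors.compare:
        # flag each batch whose outputs drift from the torch/CPU reference.
        assert len(outputs) == len(ref_outputs), "run fix_lengths first"
        return [
            torch.allclose(out.float(), ref.float(), rtol=rtol, atol=atol)
            for out, ref in zip(outputs, ref_outputs)
        ]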
178 changes: 26 additions & 152 deletions dl_bench/cnn.py
@@ -1,172 +1,46 @@
-import time
-from typing import List
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from torch.utils.data import DataLoader, Dataset
-
-from dl_bench.utils import Backend, Benchmark, TimerManager
-
-from dl_bench.mlp import get_macs
-
-
-def get_time():
-    return time.perf_counter()
-
-
-# PARAMS
-class RandomInfDataset(Dataset):
-    def __init__(self, n, in_shape):
-        super().__init__()
-
-        self.values = np.random.randn(n, *in_shape).astype(np.float32)
-
-    def __len__(self):
-        return len(self.values)
-
-    def __getitem__(self, index):
-        return self.values[index]
-
-
-def get_inf_loaders(n, in_shape, batch_size, device: str):
-    # This speeds up data copy for cuda devices
-    pin_memory = device == "cuda"
-
-    ds = RandomInfDataset(n, in_shape)
-    train_loader = DataLoader(
-        ds, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=pin_memory
-    )
-    test_loader = DataLoader(
-        ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=pin_memory
-    )
-    return train_loader, test_loader
+from dl_bench.utils import Benchmark, RandomInfDataset
 
 
 def get_cnn(name):
-    from torchvision.models import vgg16, resnet18, resnet50, resnext50_32x4d, resnext101_32x8d, densenet121, efficientnet_v2_m, mobilenet_v3_large
+    from torchvision.models import (
+        vgg16,
+        resnet18,
+        resnet50,
+        resnext50_32x4d,
+        resnext101_32x8d,
+        densenet121,
+        efficientnet_v2_m,
+        mobilenet_v3_large,
+    )
 
     name2model = {
-        'vgg16': vgg16,
-        'resnet18': resnet18,
-        'resnet50': resnet50,
-        'resnext50': resnext50_32x4d,
-        'resnext101': resnext101_32x8d,
-        'densenet121': densenet121,
-        'efficientnet_v2m': efficientnet_v2_m,
-        'mobilenet_v3l': mobilenet_v3_large,
+        "vgg16": vgg16,
+        "resnet18": resnet18,
+        "resnet50": resnet50,
+        "resnext50": resnext50_32x4d,
+        "resnext101": resnext101_32x8d,
+        "densenet121": densenet121,
+        "efficientnet_v2m": efficientnet_v2_m,
+        "mobilenet_v3_large": mobilenet_v3_large,
     }
     if name in name2model:
         return name2model[name]()
     else:
         raise ValueError(f"Unknown name {name}")
 
 
-def build_mlp(
-    n_chans_in: int,
-    n_chans_out: int,
-    struct: List[int],
-    norm_layer=None,
-    activ_layer=nn.ReLU,
-    inplace=None,
-    dropout: float = None,
-):
-    params = {} if inplace is None else {"inplace": inplace}
-    bias = True if dropout is None else False
-
-    layers = []
-    in_dim = n_chans_in
-    for hidden_dim in struct:
-        layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias))
-        if norm_layer is not None:
-            layers.append(norm_layer(hidden_dim))
-        if activ_layer is not None:
-            layers.append(activ_layer(**params))
-        if dropout is not None:
-            layers.append(torch.nn.Dropout(dropout, **params))
-        in_dim = hidden_dim
-
-    layers.append(torch.nn.Linear(in_dim, n_chans_out, bias=True))
-
-    return nn.Sequential(*layers)
-
-
-def get_mlp(n_chans_in, n_chans_out, name):
-    params = name2params[name]
-
-    # net = nn.Sequential(nn.Flatten(), build_mlp(n_chans_in, **params))
-    net = build_mlp(n_chans_in, **params, n_chans_out=n_chans_out)
-    return net
-
-
-IN_SHAPE = (3, 224, 224)
-
 class CnnBenchmark(Benchmark):
-    def run(self, backend: Backend, params):
-        tm = TimerManager()
-
-        # PARAMS
-        name = params.get("name", "resnet50")
+    def __init__(self, params) -> None:
         batch_size = int(params.get("batch_size", 1024))
 
         # Do early stopping once we hit min_batches & min_seconds to accelerate measurement
+        in_shape = (3, 224, 224)
         min_batches = 10
         min_seconds = 10
         DATASET_SIZE = max(10_240, batch_size * min_batches)
+        dataset = RandomInfDataset(DATASET_SIZE, in_shape)
 
-        trainloader, testloader = get_inf_loaders(
-            DATASET_SIZE,
-            in_shape=IN_SHAPE,
-            batch_size=batch_size,
-            device=backend.device_name,
-        )
+        name = params.get("name", "resnet50")
         net = get_cnn(name=name)
-        # flops_per_sample = get_macs(net, IN_SHAPE, backend) * 2
-        flops_per_sample = 2 * get_macs(net, IN_SHAPE, backend)
-
-        sample = backend.to_device(torch.rand(batch_size, *IN_SHAPE))
-        net = backend.prepare_eval_model(net, sample_input=sample)
-        print("Warmup started")
-        with torch.no_grad():
-            net.eval()
-            with tm.timeit("warmup_s"):
-                net(sample)
-        print("Warmup done")
-
-        correct = 0
-        total = 0
-        n_items = 0
-
-        net.eval()
-        with torch.no_grad():
-            start = time.perf_counter()
-            with tm.timeit("duration_s"):
-                for x in testloader:
-                    x = backend.to_device(x)
-                    if backend.dtype == torch.float32:
-                        output = net(x)
-                        assert output.dtype is backend.dtype, f"{output.dtype}!={backend.dtype}"
-                        _, predicted = torch.max(output.data, 1)
-
-                    else:
-                        with torch.autocast(device_type=backend.device_name, dtype=backend.dtype):
-                            output = net(x)
-                            assert output.dtype is backend.dtype, f"{output.dtype}!={backend.dtype}"
-                            _, predicted = torch.max(output.data, 1)
-
-                    n_items += len(x)
-
-                    # early stopping
-                    if (time.perf_counter() - start) > min_seconds and n_items > batch_size * min_batches:
-                        break
-
-        print(f"{n_items} were processed in {tm.name2time['duration_s']}s")
-
-        results = tm.get_results()
-        results["samples_per_s"] = n_items / results["duration_s"]
-        results["flops_per_sample"] = flops_per_sample
-
-        return results
-
-        # print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
+        super().__init__(
+            net=net, in_shape=in_shape, dataset=dataset, batch_size=batch_size
+        )
