
Merge branch 'main' into egor/ipex_config
Egor-Krivov committed Dec 27, 2023
2 parents 839ca17 + 9f706d2 commit f73e5ad
Showing 9 changed files with 496 additions and 380 deletions.
13 changes: 9 additions & 4 deletions .github/workflows/execute-test-script.yml
@@ -82,9 +82,9 @@ jobs:
       shell: bash -el {0}
       run: |
         case "${{ inputs.compiler }}" in
-          torch_mlir)
+          torch_mlir*)
             echo conda_env=mlir-dev >> $GITHUB_OUTPUT;;
-          ipex)
+          ipex*)
             echo conda_env=ipex >> $GITHUB_OUTPUT;;
           *)
             if [[ ${{ inputs.device }} = "cuda" ]]; then
@@ -113,9 +113,14 @@ jobs:
           URL="--url ${{ secrets.DB_URL }}"
         fi
-        export DL_BENCH_ARGS="--host ${{ inputs.runner_type }} --compiler ${{ inputs.compiler }} --device ${{ inputs.device }} --tag ${{ inputs.tag }} -v ${URL}"
+        export DL_BENCH_ARGS="--host ${{ inputs.runner_type }} --compiler ${{ inputs.compiler }} --device ${{ inputs.device }} --tag ${{ inputs.tag }} -v ${URL} --skip_verification"
         export KMP_AFFINITY=respect,noreset,granularity=fine,balanced
-        numactl -N 1 ${{ inputs.test_script }}
+        export OMP_NUM_THREADS=32
+        if [[ ${LABELS} = *glados* ]]; then
+          numactl -N 1 ${{ inputs.test_script }}
+        else
+          source ${{ inputs.test_script }}
+        fi
     - name: Upload results.db to artifacts when running in AWS
       if: contains(fromJSON(needs.get_runner_labels.outputs.runner_labels), 'aws')
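The widened torch_mlir*) and ipex*) patterns let one case arm serve the new ipex_onednn_graph compiler (and any future variants) without enumerating every name. A minimal Python sketch of the same selection logic, for illustration only; the fallback branch is truncated in this excerpt, so the env names in the final return are hypothetical:

    from fnmatch import fnmatch

    def conda_env_for(compiler: str, device: str) -> str:
        # Shell-style globs, mirroring the `case` patterns above.
        if fnmatch(compiler, "torch_mlir*"):
            return "mlir-dev"
        if fnmatch(compiler, "ipex*"):
            return "ipex"  # now also matches ipex_onednn_graph
        # Fallback arm is not shown in the diff; these names are assumptions.
        return "cuda-env" if device == "cuda" else "default-env"

    assert conda_env_for("ipex_onednn_graph", "cpu") == "ipex"
    assert conda_env_for("torch_mlir", "cpu") == "mlir-dev"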
3 changes: 2 additions & 1 deletion .github/workflows/test-single-config.yml
@@ -20,10 +20,11 @@ on:
         options:
           - torch
           - dynamo
-          - torch_mlir
           - torchscript
           - torchscript_onednn
           - ipex
+          - torch_mlir
+          - ipex_onednn_graph
     tag:
       description: tag to label this run in DB
       required: true
1 change: 1 addition & 0 deletions .github/workflows/test.yml
@@ -51,6 +51,7 @@ jobs:
           {device: 'cpu', compiler: 'torchscript'},
           {device: 'cpu', compiler: 'torchscript_onednn'},
           {device: 'cpu', compiler: 'ipex'},
+          # {device: 'cpu', compiler: 'ipex_onednn_graph'},
           # {device: 'xpu', compiler: 'ipex'},
           {device: 'cpu', compiler: 'torch_mlir'}
         ]
51 changes: 44 additions & 7 deletions dl_bench/cli/launcher.py
@@ -8,6 +8,7 @@
 from dl_bench.mlp_basic import MlpBasicBenchmark
 from dl_bench.report.report import BenchmarkDb
 from dl_bench.utils import Backend
+from dl_bench.tools.compare_tensors import compare
 
 benchmarks_table = {
     "mlp_oneiter": MlpBasicBenchmark,
@@ -16,6 +17,19 @@
 }
 
 
+def fix_lengths(outputs, ref_outputs):
+    """To speed up benchmarking we pass a different number of batches to different
+    backends, so the lengths need to be matched before comparison."""
+    min_lengths = min(len(outputs), len(ref_outputs))
+    if len(outputs) != len(ref_outputs):
+        print(
+            f"Slicing passed batches to smallest size {len(outputs)}->{min_lengths}; {len(ref_outputs)}->{min_lengths}"
+        )
+        return outputs[:min_lengths], ref_outputs[:min_lengths]
+    else:
+        return outputs, ref_outputs
+
+
 def parse_args():
     parser = argparse.ArgumentParser()
     # Benchmark
@@ -67,6 +81,7 @@ def parse_args():
             "torchscript",
             "torchscript_onednn",
             "ipex",
+            "ipex_onednn_graph",
             "torch_mlir",
         ],
         help="Compilation mode to use. No compilation by default.",
@@ -95,6 +110,12 @@ def parse_args():
     parser.add_argument(
         "-v", "--verbose", required=False, action="store_true", help="Verbose mode."
     )
+    parser.add_argument(
+        "--skip_verification",
+        required=False,
+        action="store_true",
+        help="Skip output verification.",
+    )
     return parser.parse_args()


@@ -123,16 +144,23 @@ def main():
         compiler = "torch"
     dtype = args.dtype
     backend_desc = args.backend_desc or f"{host}_{device}_{compiler}"
-    if dtype != 'float32':
-        backend_desc += '_' + str(dtype)
+    if dtype != "float32":
+        backend_desc += "_" + str(dtype)
 
     backend = Backend(device=device, compiler=compiler, dtype=dtype)
-    benchmark = benchmarks_table[benchmark_name]()
-    results = benchmark.run(backend=backend, params=benchmark_params)
+    benchmark = benchmarks_table[benchmark_name](benchmark_params)
+    if args.skip_verification:
+        results, _ = benchmark.inference(backend)
+    else:
+        ref_device = "cpu" if device != "cuda" else device
+        reference_backend = Backend(device=ref_device, compiler="torch", dtype=dtype)
+        _, ref_outputs = benchmark.inference(reference_backend)
+        results, outputs = benchmark.inference(backend)
+        outputs, ref_outputs = fix_lengths(outputs, ref_outputs)
+        cmp_res = compare(outputs, ref_outputs)
 
     print(f"Benchmark {benchmark_name} completed")
 
report = {
"tag": args.tag,
"benchmark": benchmark_name,
@@ -143,14 +171,23 @@ def main():
         "device": device,
         "compiler": compiler,
         "dtype": dtype,
-        **{c: results.get(c, 0) for c in ["warmup_s", "duration_s", "samples_per_s", "flops_per_sample"]},
+        **{
+            c: results.get(c, 0)
+            for c in ["warmup_s", "duration_s", "samples_per_s", "flops_per_sample"]
+        },
     }
 
     db = BenchmarkDb(args.url)
 
     if args.verbose:
         print("Report:")
-        print("TFLOPS: {:.3}".format(results.get("flops_per_sample", 0) * results.get('samples_per_s', 0) / (10**12)))
+        print(
+            "TFLOPS: {:.3}".format(
+                results.get("flops_per_sample", 0)
+                * results.get("samples_per_s", 0)
+                / (10**12)
+            )
+        )
         pprint.pprint(report)
 
     if args.output is not None:
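The new verification path hinges on compare from dl_bench.tools.compare_tensors, which this diff only imports. As a rough sketch of what such a checker could look like — an assumption, since the module body is not shown here — it would take the two length-matched batch lists returned by fix_lengths and test numerical closeness per batch:

    import torch

    def compare(outputs, ref_outputs, rtol=1e-3, atol=1e-3):
        # Hypothetical stand-in for dl_bench.tools.compare_tensors.compare:
        # flag each batch whose outputs drift from the torch/CPU reference.
        assert len(outputs) == len(ref_outputs), "run fix_lengths first"
        return [
            torch.allclose(out.float(), ref.float(), rtol=rtol, atol=atol)
            for out, ref in zip(outputs, ref_outputs)
        ]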
178 changes: 26 additions & 152 deletions dl_bench/cnn.py
@@ -1,172 +1,46 @@
-import time
-from typing import List
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from torch.utils.data import DataLoader, Dataset
-
-from dl_bench.utils import Backend, Benchmark, TimerManager
-
-from dl_bench.mlp import get_macs
-
-
-def get_time():
-    return time.perf_counter()
-
-
-# PARAMS
-class RandomInfDataset(Dataset):
-    def __init__(self, n, in_shape):
-        super().__init__()
-
-        self.values = np.random.randn(n, *in_shape).astype(np.float32)
-
-    def __len__(self):
-        return len(self.values)
-
-    def __getitem__(self, index):
-        return self.values[index]
-
-
-def get_inf_loaders(n, in_shape, batch_size, device: str):
-    # This speeds up data copy for cuda devices
-    pin_memory = device == "cuda"
-
-    ds = RandomInfDataset(n, in_shape)
-    train_loader = DataLoader(
-        ds, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=pin_memory
-    )
-    test_loader = DataLoader(
-        ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=pin_memory
-    )
-    return train_loader, test_loader
+from dl_bench.utils import Benchmark, RandomInfDataset
 
 
 def get_cnn(name):
-    from torchvision.models import vgg16, resnet18, resnet50, resnext50_32x4d, resnext101_32x8d, densenet121, efficientnet_v2_m, mobilenet_v3_large
+    from torchvision.models import (
+        vgg16,
+        resnet18,
+        resnet50,
+        resnext50_32x4d,
+        resnext101_32x8d,
+        densenet121,
+        efficientnet_v2_m,
+        mobilenet_v3_large,
+    )
 
     name2model = {
-        'vgg16': vgg16,
-        'resnet18': resnet18,
-        'resnet50': resnet50,
-        'resnext50': resnext50_32x4d,
-        'resnext101': resnext101_32x8d,
-        'densenet121': densenet121,
-        'efficientnet_v2m': efficientnet_v2_m,
-        'mobilenet_v3l': mobilenet_v3_large,
+        "vgg16": vgg16,
+        "resnet18": resnet18,
+        "resnet50": resnet50,
+        "resnext50": resnext50_32x4d,
+        "resnext101": resnext101_32x8d,
+        "densenet121": densenet121,
+        "efficientnet_v2m": efficientnet_v2_m,
+        "mobilenet_v3_large": mobilenet_v3_large,
     }
     if name in name2model:
         return name2model[name]()
     else:
         raise ValueError(f"Unknown name {name}")
 
 
-def build_mlp(
-    n_chans_in: int,
-    n_chans_out: int,
-    struct: List[int],
-    norm_layer=None,
-    activ_layer=nn.ReLU,
-    inplace=None,
-    dropout: float = None,
-):
-    params = {} if inplace is None else {"inplace": inplace}
-    bias = True if dropout is None else False
-
-    layers = []
-    in_dim = n_chans_in
-    for hidden_dim in struct:
-        layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias))
-        if norm_layer is not None:
-            layers.append(norm_layer(hidden_dim))
-        if activ_layer is not None:
-            layers.append(activ_layer(**params))
-        if dropout is not None:
-            layers.append(torch.nn.Dropout(dropout, **params))
-        in_dim = hidden_dim
-
-    layers.append(torch.nn.Linear(in_dim, n_chans_out, bias=True))
-
-    return nn.Sequential(*layers)
-
-
-def get_mlp(n_chans_in, n_chans_out, name):
-    params = name2params[name]
-
-    # net = nn.Sequential(nn.Flatten(), build_mlp(n_chans_in, **params))
-    net = build_mlp(n_chans_in, **params, n_chans_out=n_chans_out)
-    return net
-
-
-IN_SHAPE = (3, 224, 224)
-
 class CnnBenchmark(Benchmark):
-    def run(self, backend: Backend, params):
-        tm = TimerManager()
-
-        # PARAMS
-        name = params.get("name", "resnet50")
+    def __init__(self, params) -> None:
         batch_size = int(params.get("batch_size", 1024))
 
         # Do early stopping once we hit min_batches & min_seconds to accelerate measurement
+        in_shape = (3, 224, 224)
         min_batches = 10
         min_seconds = 10
         DATASET_SIZE = max(10_240, batch_size * min_batches)
+        dataset = RandomInfDataset(DATASET_SIZE, in_shape)
 
-        trainloader, testloader = get_inf_loaders(
-            DATASET_SIZE,
-            in_shape=IN_SHAPE,
-            batch_size=batch_size,
-            device=backend.device_name,
-        )
+        name = params.get("name", "resnet50")
         net = get_cnn(name=name)
-        # flops_per_sample = get_macs(net, IN_SHAPE, backend) * 2
-        flops_per_sample = 2 * get_macs(net, IN_SHAPE, backend)
-
-        sample = backend.to_device(torch.rand(batch_size, *IN_SHAPE))
-        net = backend.prepare_eval_model(net, sample_input=sample)
-        print("Warmup started")
-        with torch.no_grad():
-            net.eval()
-            with tm.timeit("warmup_s"):
-                net(sample)
-        print("Warmup done")
-
-        correct = 0
-        total = 0
-        n_items = 0
-
-        net.eval()
-        with torch.no_grad():
-            start = time.perf_counter()
-            with tm.timeit("duration_s"):
-                for x in testloader:
-                    x = backend.to_device(x)
-                    if backend.dtype == torch.float32:
-                        output = net(x)
-                        assert output.dtype is backend.dtype, f"{output.dtype}!={backend.dtype}"
-                        _, predicted = torch.max(output.data, 1)
-
-                    else:
-                        with torch.autocast(device_type=backend.device_name, dtype=backend.dtype):
-                            output = net(x)
-                            assert output.dtype is backend.dtype, f"{output.dtype}!={backend.dtype}"
-                            _, predicted = torch.max(output.data, 1)
-
-                    n_items += len(x)
-
-                    # early stopping
-                    if (time.perf_counter() - start) > min_seconds and n_items > batch_size * min_batches:
-                        break
-
-        print(f"{n_items} were processed in {tm.name2time['duration_s']}s")
-
-        results = tm.get_results()
-        results["samples_per_s"] = n_items / results["duration_s"]
-        results["flops_per_sample"] = flops_per_sample
-
-        return results
-
-        # print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
+        super().__init__(
+            net=net, in_shape=in_shape, dataset=dataset, batch_size=batch_size
+        )
