diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 9ef012d66..8d4c12a95 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -80,6 +80,11 @@ Manipulating Instructions .. autofunction:: add_barrier +Loop Distribution +----------------- + +.. automodule:: loopy.transform.loop_distribution + Registering Library Routines ---------------------------- diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 13a597e5b..709f97c11 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -610,7 +610,7 @@ commonly called 'loop tiling': ... assumptions="n mod 16 = 0 and n >= 1") >>> knl = lp.split_iname(knl, "i", 16) >>> knl = lp.split_iname(knl, "j", 16) - >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner") + >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner") >>> knl = lp.set_options(knl, write_code=True) >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) @@ -1029,8 +1029,8 @@ transformation exists in :func:`loopy.add_prefetch`: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - acc_k = 0.0f; a_fetch = a[16 * gid(0) + lid(0)]; + acc_k = 0.0f; for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_fetch; out[16 * gid(0) + lid(0)] = acc_k; @@ -1053,10 +1053,10 @@ earlier: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) - acc_k = 0.0f; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) a_fetch[lid(0)] = a[16 * gid(0) + lid(0)]; + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) + acc_k = 0.0f; barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { @@ -1908,18 +1908,16 @@ Now to make things more interesting, we'll create a kernel with barriers: { __local int c[50 * 10 * 99]; - { - int const k_outer = 0; - + for (int i = 0; i <= 49; ++i) for (int j = 0; j <= 9; ++j) - for (int i = 0; i <= 49; ++i) - { - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; - c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; - e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; - } - } + { + int const k_outer = 0; + + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; + c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; + e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; + } } In this kernel, when a work-item performs the second instruction it uses data diff --git a/loopy/__init__.py b/loopy/__init__.py index ce3ba1439..6b53f6214 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -125,6 +125,8 @@ from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call from loopy.transform.realize_reduction import realize_reduction +from loopy.transform.loop_distribution import (distribute_loops, + IllegalLoopDistributionError) # }}} @@ -254,6 +256,8 @@ "pack_and_unpack_args_for_call", + "distribute_loops", "IllegalLoopDistributionError", + # }}} "get_dot_dependency_graph", diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 99f5f3503..43ed554d9 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -2115,4 +2115,64 @@ def get_outer_params(domains): # }}} +# {{{ get access map from an instruction + +class _IndexCollector(CombineMapper): + def __init__(self, var): + self.var = var + super().__init__() + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_subscript(self, expr): + if expr.aggregate.name == self.var: + return (super().map_subscript(expr) | frozenset([expr.index_tuple])) + else: + return super().map_subscript(expr) + + def map_algebraic_leaf(self, expr): + return frozenset() + + map_constant = map_algebraic_leaf + + +def _project_out_inames_from_maps(amaps, inames_to_project_out): + new_amaps = [] + for amap in amaps: + for iname in inames_to_project_out: + dt, pos = amap.get_var_dict()[iname] + amap = amap.project_out(dt, pos, 1) + + new_amaps.append(amap) + + return new_amaps + + +def _union_amaps(amaps): + import islpy as isl + return reduce(isl.Map.union, amaps[1:], amaps[0]) + + +def get_insn_access_map(kernel, insn_id, var): + from loopy.transform.subst import expand_subst + from loopy.match import Id + from loopy.symbolic import get_access_map + + insn = kernel.id_to_insn[insn_id] + + kernel = expand_subst(kernel, within=Id(insn_id)) + indices = list(_IndexCollector(var)((insn.expression, + insn.assignees, + tuple(insn.predicates)))) + + amaps = [get_access_map(kernel.get_inames_domain(insn.within_inames), + idx, kernel.assumptions) + for idx in indices] + + return _union_amaps(amaps) + +# }}} + # vim: foldmethod=marker diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 9dc3bdc36..a39ca8f8b 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -812,8 +812,161 @@ def is_similar_to_template(insn): # {{{ scheduling algorithm -def generate_loop_schedules_internal( - sched_state, debug=None): +def _get_outermost_diverging_inames(tree, within1, within2): + """ + For loop nestings *within1* and *within2*, returns the first inames at which + the loops nests diverge in the loop nesting tree *tree*. + + :arg tree: A :class:`loopy.tools.Tree` of inames, denoting a loop nesting. + :arg within1: A :class:`frozenset` of inames. + :arg within2: A :class:`frozenset` of inames. + """ + common_ancestors = (within1 & within2) | {""} + + innermost_parent = max(common_ancestors, + key=lambda k: tree.depth(k)) + iname1, = tree.children(innermost_parent) & within1 + iname2, = tree.children(innermost_parent) & within2 + + return iname1, iname2 + + +class V2SchedulerNotImplementedException(RuntimeError): + pass + + +def generate_loop_schedules_v2(kernel): + from loopy.schedule.tools import get_loop_nest_tree + from functools import reduce + from pytools.graph import compute_topological_order + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + + concurrent_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, ConcurrentTag)} + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + vec_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, VectorizeTag)} + parallel_inames = (concurrent_inames - ilp_inames - vec_inames) + + # {{{ can v2 scheduler handle?? + + if any(len(insn.conflicts_with_groups) != 0 for insn in kernel.instructions): + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " kernels with instruction having conflicts with groups.") + + if any(insn.priority != 0 for insn in kernel.instructions): + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " kernels with instruction priorities set.") + + if kernel.schedule is not None: + # cannnot handle preschedule yet + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " prescheduled kernels.") + + if ilp_inames or vec_inames: + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " loops tagged with 'ilp'/'vec' as they are not guaranteed to" + " be single entry loops.") + + # }}} + + loop_nest_tree = get_loop_nest_tree(kernel) + + # loop_inames: inames that are realized as loops. Concurrent inames aren't + # realized as a loop in the generated code for a loopy.TargetBase. + loop_inames = (reduce(frozenset.union, (insn.within_inames + for insn in kernel.instructions), + frozenset()) + - parallel_inames) + + # The idea here is to build a DAG, where nodes are schedule items and if + # there exists an edge from schedule item A to schedule item B in the DAG => + # B *must* come after A in the linearized result. + + dag = {} + + # LeaveLoop(i) *must* follow EnterLoop(i) + dag.update({EnterLoop(iname=iname): frozenset({LeaveLoop(iname=iname)}) + for iname in loop_inames}) + dag.update({LeaveLoop(iname=iname): frozenset() + for iname in loop_inames}) + dag.update({RunInstruction(insn_id=insn.id): frozenset() + for insn in kernel.instructions}) + + # {{{ add constraints imposed by the loop nesting + + for outer_loop in loop_nest_tree.nodes(): + if outer_loop == "": + continue + + for child in loop_nest_tree.children(outer_loop): + inner_loop = child + dag[EnterLoop(iname=outer_loop)] |= {EnterLoop(iname=inner_loop)} + dag[LeaveLoop(iname=inner_loop)] |= {LeaveLoop(iname=outer_loop)} + + # }}} + + # {{{ add deps. b/w schedule items coming from insn. depepdencies + + for insn in kernel.instructions: + insn_loop_inames = insn.within_inames & loop_inames + for dep_id in insn.depends_on: + dep = kernel.id_to_insn[dep_id] + dep_loop_inames = dep.within_inames & loop_inames + # Enforce instruction dep: + dag[RunInstruction(insn_id=dep_id)] |= {RunInstruction(insn_id=insn.id)} + + # {{{ register deps on loop entry/leave because of insn. deps + + if dep_loop_inames < insn_loop_inames: + for iname in insn_loop_inames - dep_loop_inames: + dag[RunInstruction(insn_id=dep.id)] |= {EnterLoop(iname=iname)} + elif insn_loop_inames < dep_loop_inames: + for iname in dep_loop_inames - insn_loop_inames: + dag[LeaveLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)} + elif dep_loop_inames != insn_loop_inames: + insn_iname, dep_iname = _get_outermost_diverging_inames( + loop_nest_tree, insn_loop_inames, dep_loop_inames) + dag[LeaveLoop(iname=dep_iname)] |= {EnterLoop(iname=insn_iname)} + else: + pass + + # }}} + + for iname in insn_loop_inames: + # For an insn within a loop nest 'i' + # for i + # insn + # end i + # 'insn' *must* come b/w 'for i' and 'end i' + dag[EnterLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)} + dag[RunInstruction(insn_id=insn.id)] |= {LeaveLoop(iname=iname)} + + # }}} + + def iname_key(iname): + all_ancestors = sorted(loop_nest_tree.ancestors(iname), + key=lambda x: loop_nest_tree.depth(x)) + return ",".join(all_ancestors+[iname]) + + def key(x): + if isinstance(x, RunInstruction): + iname = max((kernel.id_to_insn[x.insn_id].within_inames & loop_inames), + key=lambda k: loop_nest_tree.depth(k), + default="") + result = (iname_key(iname), x.insn_id) + elif isinstance(x, (EnterLoop, LeaveLoop)): + result = (iname_key(x.iname),) + else: + raise NotImplementedError + + return result + + return compute_topological_order(dag, key=key) + + +def generate_loop_schedules_internal(sched_state, debug=None): # allow_insn is set to False initially and after entering each loop # to give loops containing high-priority instructions a chance. kernel = sched_state.kernel @@ -1958,6 +2111,41 @@ def generate_loop_schedules(kernel, callables_table, debug_args=None): callables_table, debug_args=debug_args) +def postprocess_schedule(kernel, callables_table, gen_sched): + from loopy.kernel import KernelState + + gen_sched = convert_barrier_instructions_to_barriers( + kernel, gen_sched) + + gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table, + return_dict=True) + + if (gsize or lsize): + if not kernel.options.disable_global_barriers: + logger.debug("%s: barrier insertion: global" % kernel.name) + gen_sched = insert_barriers(kernel, callables_table, gen_sched, + synchronization_kind="global", + verify_only=(not + kernel.options.insert_gbarriers)) + + logger.debug("%s: barrier insertion: local" % kernel.name) + gen_sched = insert_barriers(kernel, callables_table, gen_sched, + synchronization_kind="local", verify_only=False) + logger.debug("%s: barrier insertion: done" % kernel.name) + + new_kernel = kernel.copy( + linearization=gen_sched, + state=KernelState.LINEARIZED) + + from loopy.schedule.device_mapping import \ + map_schedule_onto_host_or_device + if kernel.state != KernelState.LINEARIZED: + # Device mapper only gets run once. + new_kernel = map_schedule_onto_host_or_device(new_kernel) + + return new_kernel + + def generate_loop_schedules_inner(kernel, callables_table, debug_args=None): if debug_args is None: debug_args = {} @@ -1967,6 +2155,14 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args=None): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") + try: + gen_sched = generate_loop_schedules_v2(kernel) + yield postprocess_schedule(kernel, callables_table, gen_sched) + return + except V2SchedulerNotImplementedException as e: + from warnings import warn + warn(f"Falling back to a slow scheduler implementation due to: {e}") + schedule_count = 0 debug = ScheduleDebugger(**debug_args) @@ -2076,35 +2272,7 @@ def print_longest_dead_end(): sched_state, debug=debug, **schedule_gen_kwargs): debug.stop() - gen_sched = convert_barrier_instructions_to_barriers( - kernel, gen_sched) - - gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table, - return_dict=True) - - if (gsize or lsize): - if not kernel.options.disable_global_barriers: - logger.debug("%s: barrier insertion: global" % kernel.name) - gen_sched = insert_barriers(kernel, callables_table, gen_sched, - synchronization_kind="global", - verify_only=(not - kernel.options.insert_gbarriers)) - - logger.debug("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, callables_table, gen_sched, - synchronization_kind="local", verify_only=False) - logger.debug("%s: barrier insertion: done" % kernel.name) - - new_kernel = kernel.copy( - linearization=gen_sched, - state=KernelState.LINEARIZED) - - from loopy.schedule.device_mapping import \ - map_schedule_onto_host_or_device - if kernel.state != KernelState.LINEARIZED: - # Device mapper only gets run once. - new_kernel = map_schedule_onto_host_or_device(new_kernel) - + new_kernel = postprocess_schedule(kernel, callables_table, gen_sched) yield new_kernel debug.start() diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index f2164b6dd..f48c57eae 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -20,16 +20,19 @@ THE SOFTWARE. """ -from functools import cached_property +from functools import cached_property, reduce import enum from typing import Sequence, FrozenSet, Tuple, List, Set, Dict from dataclasses import dataclass -from pytools import memoize_method +from pytools import memoize_method, memoize_on_first_arg import islpy as isl from loopy.kernel.data import AddressSpace, TemporaryVariable, ArrayArg from loopy.kernel import LoopKernel +from loopy.tools import Tree +from loopy.diagnostic import LoopyError +from pyrsistent import pmap # {{{ block boundary finder @@ -554,4 +557,412 @@ def do_accesses_result_in_races(self, insn1, insn1_dir, insn2, insn2_dir, # }}} -# vim: foldmethod=marker + +def _pull_out_loop_nest(tree, loop_nests, inames_to_pull_out): + """ + Returns a copy of *tree* that realizes *inames_to_pull_out* as loop + nesting. + + :arg tree: A :class:`loopy.tools.Tree`, where each node is + :class:`frozenset` of inames representing a loop nest. For example a + tree might look like: + + :arg loop_nests: A collection of nodes in *tree* that cover + *inames_to_pull_out*. + + :returns: a :class:`tuple` ``(new_tree, outer_loop_nest, inner_loop_nest)``, + where outer_loop_nest is the identifier for the new outer and inner + loop nests so that *inames_to_pull_out* is a valid nesting. + + .. note:: + + We could compute *loop_nests* within this routine's implementation, but + computing would be expensive and hence we ask the caller for this info. + + Example:: + *tree*: frozenset() + └── frozenset({'j', 'i'}) + └── frozenset({'k', 'l'}) + + *inames_to_pull_out*: frozenset({'k', 'i', 'j'}) + *loop_nests*: {frozenset({'j', 'i'}), frozenset({'k', 'l'})} + + Returns: + + *new_tree*: frozenset() + └── frozenset({'j', 'i'}) + └── frozenset({'k'}) + └── frozenset({'l'}) + + *outer_loop_nest*: frozenset({'k'}) + *inner_loop_nest*: frozenset({'l'}) + """ + assert all(isinstance(loop_nest, frozenset) for loop_nest in loop_nests) + assert inames_to_pull_out <= reduce(frozenset.union, loop_nests, frozenset()) + + # {{{ sanity check to ensure the loop nest *inames_to_pull_out* is possible + + loop_nests = sorted(loop_nests, key=lambda nest: tree.depth(nest)) + + for outer, inner in zip(loop_nests[:-1], loop_nests[1:]): + if outer != tree.parent(inner): + raise LoopyError(f"Cannot schedule loop nest {inames_to_pull_out} " + f" in the nesting tree:\n{tree}") + + assert tree.depth(loop_nests[0]) == 0 + + # }}} + + innermost_loop_nest = loop_nests[-1] + new_outer_loop_nest = inames_to_pull_out - reduce(frozenset.union, + loop_nests[:-1], + frozenset()) + new_inner_loop_nest = innermost_loop_nest - inames_to_pull_out + + if new_outer_loop_nest == innermost_loop_nest: + # such a loop nesting already exists => do nothing + return tree, new_outer_loop_nest, None + + # add the outer loop to our loop nest tree + tree = tree.add_node(new_outer_loop_nest, + parent=tree.parent(innermost_loop_nest)) + + # rename the old loop to the inner loop + tree = tree.rename_node(innermost_loop_nest, + new_id=new_inner_loop_nest) + + # set the parent of inner loop to be the outer loop + tree = tree.move_node(new_inner_loop_nest, new_parent=new_outer_loop_nest) + + return tree, new_outer_loop_nest, new_inner_loop_nest + + +def _add_inner_loops(tree, outer_loop_nest, inner_loop_nest): + """ + Returns a copy of *tree* that nests *inner_loop_nest* inside *outer_loop_nest*. + """ + # add the outer loop to our loop nest tree + return tree.add_node(inner_loop_nest, parent=outer_loop_nest) + + +def _order_loop_nests(loop_nest_tree, + strict_priorities, + relaxed_priorities, + iname_to_tree_node_id): + """ + Returns a loop nest where all nodes in the tree are instances of + :class:`str` denoting inames. Unlike *loop_nest_tree* which corresponds to + multiple loop nesting, this routine returns a unique loop nest that is + obtained after constraining *loop_nest_tree* with the constraints enforced + by *priorities*. + + :arg strict_priorities: Expresses strict nesting constraints similar to + :attr:`loopy.LoopKernel.loop_priorities`. These priorities are imposed + strictly i.e. if these conditions cannot be met a + :class:`loopy.diagnostic.LoopyError` is raised. + + :arg relaxed_priorities: Expresses strict nesting constraints similar to + :attr:`loopy.LoopKernel.loop_priorities`. These nesting constraints are + treated as options. + + :arg iname_to_tree_node_id: A mapping from iname to the loop nesting its a + part of. + """ + from pytools.graph import compute_topological_order as toposort + from warnings import warn + + loop_nests = set(iname_to_tree_node_id.values()) + + # flow_requirements: A mapping from the loop nest level to the nesting + # constraints applicable to it. + # Each nesting constraint is represented as a DAG. In the DAG, if there + # exists an edge from from iname 'i' -> iname 'j' => 'j' should be nested + # inside 'i'. + flow_requirements = {loop_nest: {iname: frozenset() + for iname in loop_nest} + for loop_nest in loop_nests} + + # The plan here is populate DAGs in *flow_requirements* and then perform a + # toposort for each loop nest. + + def _update_flow_requirements(priorities, cannot_satisfy_callback): + """ + Records *priorities* in *flow_requirements* and calls + *cannot_satisfy_callback* with an appropriate error message if the + priorities cannot be met. + """ + for priority in priorities: + for outer_iname, inner_iname in zip(priority[:-1], priority[1:]): + if inner_iname not in iname_to_tree_node_id: + cannot_satisfy_callback(f"Cannot enforce the constraint:" + f" {inner_iname} to be nested within" + f" {outer_iname}, as {inner_iname}" + f" is either a parallel loop or" + f" not an iname.") + continue + + if outer_iname not in iname_to_tree_node_id: + cannot_satisfy_callback(f"Cannot enforce the constraint:" + f" {inner_iname} to be nested within" + f" {outer_iname}, as {outer_iname}" + f" is either a parallel loop or" + f" not an iname.") + continue + + inner_iname_nest = iname_to_tree_node_id[inner_iname] + outer_iname_nest = iname_to_tree_node_id[outer_iname] + + if inner_iname_nest == outer_iname_nest: + flow_requirements[inner_iname_nest][outer_iname] |= {inner_iname} + else: + ancestors_of_inner_iname = (loop_nest_tree + .ancestors(inner_iname_nest)) + ancestors_of_outer_iname = (loop_nest_tree + .ancestors(outer_iname_nest)) + if outer_iname in ancestors_of_inner_iname: + # nesting constraint already satisfied => do nothing + pass + elif inner_iname in ancestors_of_outer_iname: + cannot_satisfy_callback("Cannot satisfy constraint that" + f" iname '{inner_iname}' must be" + f" nested within '{outer_iname}''.") + else: + # inner iname and outer iname are indirect family members + # => must be realized via dependencies in the linearization + # phase, not implemented in v2-scheduler yet. + from loopy.schedule import V2SchedulerNotImplementedException + raise V2SchedulerNotImplementedException("cannot" + " schedule kernels with priority dependencies" + " between sibling loop nests") + + def _raise_loopy_err(x): + raise LoopyError(x) + + # record strict priorities + _update_flow_requirements(strict_priorities, _raise_loopy_err) + # record relaxed priorities + _update_flow_requirements(relaxed_priorities, warn) + + # ordered_loop_nests: A mapping from the unordered loop nests to their + # ordered couterparts. For example. If we had only one loop nest + # `frozenset({"i", "j", "k"})`, and the prioirities said added the + # constraint that "i" must be nested within "k", then `ordered_loop_nests` + # would be: `{frozenset({"i", "j", "k"}): ["j", "k", "i"]}` i.e. the loop + # nests would now have an order. + ordered_loop_nests = {unordered_nest: toposort(flow, + key=lambda x: x) + for unordered_nest, flow in flow_requirements.items()} + + # {{{ combine 'loop_nest_tree' along with 'ordered_loop_nest_tree' + + assert loop_nest_tree.root == frozenset() + + new_tree = Tree.from_root("") + + old_to_new_parent = {} + + old_to_new_parent[loop_nest_tree.root] = "" + + # traversing 'tree' in an BFS fashion to create 'new_tree' + queue = list(loop_nest_tree.children(loop_nest_tree.root)) + + while queue: + current_nest = queue.pop(0) + + ordered_nest = ordered_loop_nests[current_nest] + new_tree = new_tree.add_node(ordered_nest[0], + parent=old_to_new_parent[loop_nest_tree + .parent(current_nest)]) + for new_parent, new_child in zip(ordered_nest[:-1], ordered_nest[1:]): + new_tree = new_tree.add_node(node=new_child, parent=new_parent) + + old_to_new_parent[current_nest] = ordered_nest[-1] + + queue.extend(list(loop_nest_tree.children(current_nest))) + + # }}} + + return new_tree + + +@memoize_on_first_arg +def _get_parallel_inames(kernel): + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + + concurrent_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, ConcurrentTag)} + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + vec_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, VectorizeTag)} + return (concurrent_inames - ilp_inames - vec_inames) + + +def _get_partial_loop_nest_tree(kernel): + """ + Returns :class:`loopy.Tree` representing the *kernel*'s loop-nests. + + Each node of the returned tree has a :class:`frozenset` of inames. + All the inames in the identifier of a parent node of a loop nest in the + tree must be nested outside all the iname in identifier of the loop nest. + + .. note:: + + This routine only takes into account the nesting dependency + constraints of :attr:`loopy.InstructionBase.within_inames` of all the + *kernel*'s instructions and the iname tags. This routine does *NOT* + include the nesting constraints imposed by the dependencies between the + instructions and the dependencies imposed by the kernel's domain tree. + """ + from loopy.kernel.data import IlpBaseTag + + # figuring the possible loop nestings minus the concurrent_inames as they + # are never realized as actual loops + iname_chains = {insn.within_inames - _get_parallel_inames(kernel) + for insn in kernel.instructions} + + root = frozenset() + tree = Tree.from_root(root) + + # mapping from iname to the innermost loop nest they are part of in *tree*. + iname_to_tree_node_id = {} + + # if there were any loop with no inames, those have been already account + # for as the root. + iname_chains = iname_chains - {root} + + for iname_chain in iname_chains: + not_seen_inames = frozenset(iname for iname in iname_chain + if iname not in iname_to_tree_node_id) + seen_inames = iname_chain - not_seen_inames + + all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames} + + tree, outer_loop, inner_loop = _pull_out_loop_nest(tree, + (all_nests + | {frozenset()}), + seen_inames) + if not_seen_inames: + # make '_not_seen_inames' nest inside the seen ones. + # example: if there is already a loop nesting "i,j,k" + # and the current iname chain is "i,j,l". Only way this is possible + # is if "l" is nested within "i,j"-loops. + tree = _add_inner_loops(tree, outer_loop, not_seen_inames) + + # {{{ update iname to node id + + for iname in outer_loop: + iname_to_tree_node_id[iname] = outer_loop + + if inner_loop is not None: + for iname in inner_loop: + iname_to_tree_node_id[iname] = inner_loop + + for iname in not_seen_inames: + iname_to_tree_node_id[iname] = not_seen_inames + + # }}} + + # {{{ make ILP tagged inames innermost + + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + + for iname_chain in iname_chains: + for ilp_iname in (ilp_inames & iname_chains): + # pull out other loops so that ilp_iname is the innermost + all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames} + tree, outer_loop, inner_loop = _pull_out_loop_nest(tree, + (all_nests + | {frozenset()}), + (iname_chain + - {ilp_iname})) + + for iname in outer_loop: + iname_to_tree_node_id[iname] = outer_loop + + if inner_loop is not None: + for iname in inner_loop: + iname_to_tree_node_id[iname] = inner_loop + + # }}} + + return tree + + +def _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree): + """ + Returns the mapping from the iname to the *tree*'s node that it was a part + of. + + :arg tree: A partial loop nest tree. + """ + iname_to_tree_node_id = {} + for node in tree.nodes(): + assert isinstance(node, frozenset) + for iname in node: + iname_to_tree_node_id[iname] = node + + return pmap(iname_to_tree_node_id) + + +def get_loop_nest_tree(kernel): + """ + Returns ```tree``` (an instance of :class:`Tree`) representing the loop + nesting for *kernel*. Each node of ``tree`` is an instance of :class:`str` + corresponding to the inames of *kernel* that are realized as concrete + ``for-loops``. A parent node in `tree` is always nested outside all its + children. + + .. note:: + + Multiple loop nestings might exist for *kernel*, but this routine returns + one valid loop nesting. + """ + from islpy import dim_type + + tree = _get_partial_loop_nest_tree(kernel) + iname_to_tree_node_id = ( + _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree)) + + strict_loop_priorities = frozenset() + + # {{{ impose constraints by the domain tree + + loop_inames = (reduce(frozenset.union, + (insn.within_inames + for insn in kernel.instructions), + frozenset()) + - _get_parallel_inames(kernel)) + + for dom in kernel.domains: + for outer_iname in set(dom.get_var_names(dim_type.param)): + if outer_iname not in loop_inames: + continue + + for inner_iname in dom.get_var_names(dim_type.set): + if inner_iname not in loop_inames: + continue + + # either outer_iname and inner_iname should belong to the same + # loop nest level or outer should be strictly outside inner + # iname + inner_iname_nest = iname_to_tree_node_id[inner_iname] + outer_iname_nest = iname_to_tree_node_id[outer_iname] + + if inner_iname_nest == outer_iname_nest: + strict_loop_priorities |= {(outer_iname, inner_iname)} + else: + ancestors_of_inner_iname = tree.ancestors(inner_iname_nest) + if outer_iname_nest not in ancestors_of_inner_iname: + raise LoopyError(f"Loop '{outer_iname}' cannot be nested" + f" outside '{inner_iname}'.") + + # }}} + + return _order_loop_nests(tree, + strict_loop_priorities, + kernel.loop_priority, + iname_to_tree_node_id) + +# vim: fdm=marker diff --git a/loopy/tools.py b/loopy/tools.py index 5ae620bba..91fa846fe 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -33,6 +33,8 @@ from pymbolic.mapper.persistent_hash import ( PersistentHashWalkMapper as PersistentHashWalkMapperBase) from sys import intern +from typing import FrozenSet, Generic, TypeVar, Iterator, Optional as OptionalT +from dataclasses import dataclass import logging logger = logging.getLogger(__name__) @@ -942,6 +944,239 @@ def _get_persistent_hashable_arg(arg): return wrapper + +# {{{ tree data structure + +T = TypeVar("T") + + +@dataclass(frozen=True) +class Tree(Generic[T]): + """ + An immutable tree implementation. + + .. automethod:: ancestors + .. automethod:: parent + .. automethod:: children + .. automethod:: create_node + .. automethod:: depth + .. automethod:: rename_node + .. automethod:: move_node + + .. note:: + + Almost all the operations are implemented recursively. NOT suitable for + deep trees. At the very least if the Python implementation is CPython + this allocates a new stack frame for each iteration of the operation. + """ + _parent_to_children: Map[T, FrozenSet[T]] + _child_to_parent: Map[T, OptionalT[T]] + + @staticmethod + def from_root(root: T): + return Tree(Map({root: frozenset()}), + Map({root: None})) + + @property + def root(self) -> T: + guess = set(self._child_to_parent).pop() + parent_of_guess = self.parent(guess) + while parent_of_guess is not None: + guess = parent_of_guess + parent_of_guess = self.parent(guess) + + return guess + + def ancestors(self, node: T) -> FrozenSet[T]: + """ + Returns a :class:`frozenset` of nodes that are ancestors of *node*. + """ + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + if self.is_root(node): + # => root + return frozenset() + + parent = self._child_to_parent[node] + assert parent is not None + + return frozenset([parent]) | self.ancestors(parent) + + def parent(self, node: T) -> OptionalT[T]: + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return self._child_to_parent[node] + + def children(self, node: T) -> FrozenSet[T]: + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return self._parent_to_children[node] + + def depth(self, node: T) -> int: + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + if self.is_root(node): + # => None + return 0 + + parent_of_node = self.parent(node) + assert parent_of_node is not None + + return 1 + self.depth(parent_of_node) + + def is_root(self, node: T) -> bool: + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return self.parent(node) is None + + def is_leaf(self, node: T) -> bool: + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return len(self.children(node)) == 0 + + def is_a_node(self, node: T) -> bool: + return node in self._child_to_parent + + def add_node(self, node: T, parent: T) -> "Tree[T]": + """ + Returns a :class:`Tree` with added node *node* having a parent + *parent*. + """ + if self.is_a_node(node): + raise ValueError(f"'{node}' already present in tree.") + + siblings = self._parent_to_children[parent] + + return Tree((self._parent_to_children + .set(parent, siblings | frozenset([node])) + .set(node, frozenset())), + self._child_to_parent.set(node, parent)) + + def rename_node(self, node: T, new_id: T) -> "Tree[T]": + """ + Returns a copy of *self* with *node* renamed to *new_id*. + """ + if not self.is_a_node(node): + raise ValueError(f"'{node}' not present in tree.") + + if self.is_a_node(new_id): + raise ValueError(f"cannot rename to '{new_id}', as its already a part" + " of the tree.") + + parent = self.parent(node) + children = self.children(node) + + # {{{ update child to parent + + new_child_to_parent = (self._child_to_parent.delete(node) + .set(new_id, parent)) + + for child in children: + new_child_to_parent = (new_child_to_parent + .set(child, new_id)) + + # }}} + + # {{{ update parent_to_children + + new_parent_to_children = (self._parent_to_children + .delete(node) + .set(new_id, self.children(node))) + + if parent is not None: + # update the child's name in the parent's children + new_parent_to_children = (new_parent_to_children + .delete(parent) + .set(parent, ((self.children(parent) + - frozenset([node])) + | frozenset([new_id])))) + + # }}} + + return Tree(new_parent_to_children, + new_child_to_parent) + + def move_node(self, node: T, new_parent: OptionalT[T]) -> "Tree[T]": + """ + Returns a copy of *self* with node *node* as a child of *new_parent*. + """ + if not self.is_a_node(node): + raise ValueError(f"'{node}' not a part of the tree => cannot move.") + + if self.is_root(node): + if new_parent is None: + return self + else: + raise ValueError("Moving root not allowed.") + + if new_parent is None: + raise ValueError("Making multiple roots not allowed") + + if not self.is_a_node(new_parent): + raise ValueError(f"Cannot move to '{new_parent}' as it's not in tree.") + + parent = self.parent(node) + assert parent is not None # parent=root handled as a special case + siblings = self.children(parent) + parents_new_children = siblings - frozenset([node]) + new_parents_children = self.children(new_parent) | frozenset([node]) + + new_child_to_parent = self._child_to_parent.set(node, new_parent) + new_parent_to_children = (self._parent_to_children + .set(parent, parents_new_children) + .set(new_parent, new_parents_children)) + + return Tree(new_parent_to_children, + new_child_to_parent) + + def __str__(self) -> str: + """ + Stringifies the tree by using the box-drawing unicode characters. + + :: + + >>> from loopy.tools import Tree + >>> tree = (Tree.from_root("Root") + ... .add_node("A", "Root") + ... .add_node("B", "Root") + ... .add_node("D", "B") + ... .add_node("E", "B") + ... .add_node("C", "A")) + + >>> print(tree) + Root + ├── A + │ └── C + └── B + ├── D + └── E + """ + def rec(node): + children_result = [rec(c) for c in self.children(node)] + + def post_process_non_last_child(child): + return ["├── " + child[0]] + [f"│ {c}" for c in child[1:]] + + def post_process_last_child(child): + return ["└── " + child[0]] + [f" {c}" for c in child[1:]] + + children_result = ([post_process_non_last_child(c) + for c in children_result[:-1]] + + [post_process_last_child(c) + for c in children_result[-1:]]) + return [str(node)] + sum(children_result, start=[]) + + return "\n".join(rec(self.root)) + + def nodes(self) -> Iterator[T]: + return iter(self._child_to_parent.keys()) + # }}} # vim: fdm=marker diff --git a/loopy/transform/loop_distribution.py b/loopy/transform/loop_distribution.py new file mode 100644 index 000000000..4d0725071 --- /dev/null +++ b/loopy/transform/loop_distribution.py @@ -0,0 +1,432 @@ +""" +.. currentmodule:: loopy + +.. autofunction:: distribute_loops + +.. autoclass:: IllegalLoopDistributionError +""" + +__copyright__ = """ +Copyright (C) 2022 Kaushik Kulkarni +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import islpy as isl + +from dataclasses import dataclass +from functools import reduce +import loopy.match as match +from loopy.diagnostic import LoopyError +from itertools import combinations +from more_itertools import split_when +from loopy.transform.iname import remove_any_newly_unused_inames + + +class IllegalLoopDistributionError(LoopyError): + """ + Raised if the call to :func:`distribute_loops` wouldn't + preserve dependencies in the transformed kernel. + """ + + +def _get_rev_dep_dag(kernel, insn_ids): + rev_dep_graph = {insn_id: set() for insn_id in insn_ids} + + for insn_id in insn_ids: + for dep in (kernel.id_to_insn[insn_id].depends_on + & insn_ids): + rev_dep_graph[dep].add(insn_id) + + return rev_dep_graph + + +def _union_amaps(isl_maps): + def union_map1_map2(map1, map2): + if map1.space != map2.space: + map1, map2 = isl.align_two(map1, map2) + return map1 | map2 + + return reduce(union_map1_map2, + isl_maps[1:], + isl_maps[0]) + + +def _is_loop_distribution_sound(kernel, + iname, + insns_before, + insns_after, + outer_inames): + from loopy.kernel.tools import get_insn_access_map + id_to_insn = kernel.id_to_insn + + insns_before = frozenset({insn_id + for insn_id in insns_before + if iname in id_to_insn[insn_id].dependency_names()}) + insns_after = frozenset({insn_id + for insn_id in insns_after + if iname in id_to_insn[insn_id].dependency_names()}) + + vars_written_in_insns_before = reduce(frozenset.union, + (id_to_insn[insn_id].assignee_var_names() + for insn_id in insns_before), + frozenset()) + vars_written_in_insns_after = reduce(frozenset.union, + (id_to_insn[insn_id].assignee_var_names() + for insn_id in insns_after), + frozenset()) + vars_read_in_insns_before = reduce(frozenset.union, + (id_to_insn[insn_id].read_dependency_names() + for insn_id in insns_before), + frozenset()) + vars_read_in_insns_after = reduce(frozenset.union, + (id_to_insn[insn_id].read_dependency_names() + for insn_id in insns_after), + frozenset()) + + # dep_inducing_vars: dependency inducing variables + dep_inducing_vars = ( + # WAW + (vars_written_in_insns_before & vars_written_in_insns_after) + # RAW + | (vars_written_in_insns_before & vars_read_in_insns_after) + # WAR + | (vars_read_in_insns_before & vars_written_in_insns_after)) + + # {{{ process_amap + + def process_amap(amap): + for i, outer_iname in enumerate(sorted(outer_inames)): + dt, pos = amap.get_var_dict()[outer_iname] + assert dt == isl.dim_type.in_ + amap = amap.move_dims(isl.dim_type.param, i, + dt, pos, 1) + + amap = amap.project_out_except([iname], [isl.dim_type.in_]) + assert amap.dim(isl.dim_type.in_) == 1 + assert amap.get_var_names(isl.dim_type.in_) == [iname] + return amap + + # }}} + + for var in dep_inducing_vars: + # There is some issue regarding the alignment of the polyhedra spaces + # here. + amaps_pred = [ + process_amap(get_insn_access_map(kernel, insn_id, var)) + for insn_id in insns_before + if var in id_to_insn[insn_id].dependency_names()] + + amaps_succ = [ + process_amap(get_insn_access_map(kernel, insn_id, var)) + for insn_id in insns_after + if var in id_to_insn[insn_id].dependency_names()] + + amap_pred = _union_amaps(amaps_pred) + amap_succ = _union_amaps(amaps_succ) + + assert amap_pred.dim(isl.dim_type.in_) == 1 + assert amap_succ.dim(isl.dim_type.in_) == 1 + assert amap_pred.dim(isl.dim_type.out) == amap_succ.dim(isl.dim_type.out) + + amap_pred = amap_pred.set_dim_name(isl.dim_type.in_, 0, f"{iname}_pred") + amap_succ = amap_succ.set_dim_name(isl.dim_type.in_, 0, f"{iname}_succ") + + for i in range(amap_pred.dim(isl.dim_type.out)): + amap_pred = amap_pred.set_dim_name(isl.dim_type.out, i, f"out_{i}") + amap_succ = amap_succ.set_dim_name(isl.dim_type.out, i, f"out_{i}") + + amap_pred, amap_succ = isl.align_two(amap_pred, amap_succ) + ipred_gt_isucc = (isl.BasicMap + .universe(amap_pred.space) + .add_constraint(isl.Constraint + .ineq_from_names(amap_pred.space, + {f"{iname}_pred": 1, + f"{iname}_succ": -1, + 1: -1, + }))) + + if not (amap_pred & amap_succ & ipred_gt_isucc).range().is_empty(): + return False + + return True + + +# {{{ _get_loop_nest_aware_statement_toposort + +@dataclass(frozen=True) +class ScheduleItem: + pass + + +@dataclass(frozen=True) +class BeginLoop(ScheduleItem): + iname: str + + +@dataclass(frozen=True) +class EndLoop(ScheduleItem): + iname: str + + +@dataclass(frozen=True) +class Statement(ScheduleItem): + insn_id: str + + +def _get_loop_nest_aware_statement_toposort(kernel, insn_ids, + insns_to_distribute, + outer_inames): + from loopy.schedule.tools import get_loop_nest_tree + from pytools.graph import compute_topological_order_with_dynamic_key + + id_to_insn = kernel.id_to_insn + all_inames = reduce(frozenset.union, + [id_to_insn[insn_id].within_inames + for insn_id in insn_ids], + frozenset() + ) - outer_inames + + dag = {**{BeginLoop(iname): set() + for iname in all_inames}, + **{EndLoop(iname): set() + for iname in all_inames}, + **{Statement(insn): set() + for insn in insn_ids} + } + + for insn_id in insn_ids: + insn = id_to_insn[insn_id] + for dep in (insn.depends_on & insn_ids): + + inames_dep = id_to_insn[dep].within_inames + inames_insn = id_to_insn[insn_id].within_inames + + # {{{ register deps on loop entry/leave because of insn. deps + + if inames_dep < inames_insn: + for iname in inames_insn - inames_dep: + dag[Statement(dep)].add(BeginLoop(iname)) + elif inames_insn < inames_dep: + for iname in inames_dep - inames_insn: + dag[EndLoop(iname)].add(Statement(insn_id)) + elif inames_dep != inames_insn: + for iname_dep in inames_dep - (inames_dep & inames_insn): + for iname_insn in inames_insn - (inames_dep & inames_insn): + dag[EndLoop(iname_dep)].add(BeginLoop(iname_insn)) + else: + dag[Statement(dep)].add(Statement(insn_id)) + + # }}} + + for iname in insn.within_inames - outer_inames: + dag[BeginLoop(iname)].add(Statement(insn_id)) + dag[Statement(insn_id)].add(EndLoop(iname)) + + # TODO: [perf] it would be better to only look at the + # statements inner to 'outer_inames'. + loop_nest_tree = get_loop_nest_tree(kernel) + + def trigger_key_update(state): + recent_instructions = (x.insn_id + for x in reversed(state.scheduled_nodes) + if isinstance(x, Statement)) + + try: + last_insn = next(recent_instructions) + except StopIteration: + # no instructions scheduled yet. + return False + else: + try: + second_to_last_insn = next(recent_instructions) + except StopIteration: + return last_insn in insns_to_distribute + else: + return ((second_to_last_insn in insns_to_distribute) + ^ (last_insn in insns_to_distribute)) + + def key(x, prev_node_in_distributed_insns): + if isinstance(x, Statement): + iname = max(id_to_insn[x.insn_id].within_inames, + key=loop_nest_tree.depth, + default="") + loop_nest = tuple( + sorted(loop_nest_tree.ancestors(iname), + key=loop_nest_tree.depth)) + (iname,) + + return (loop_nest, + ((x.insn_id in insns_to_distribute) + ^ prev_node_in_distributed_insns), + x.insn_id) + elif isinstance(x, (BeginLoop, EndLoop)): + loop_nest = tuple( + sorted(loop_nest_tree.ancestors(x.iname), + key=loop_nest_tree.depth)) + (x.iname,) + return (loop_nest,) + else: + raise NotImplementedError(x) + + def get_key(state): + if not any(isinstance(sched_node, Statement) + for sched_node in state.scheduled_nodes): + return lambda x: key(x, False) + else: + last_insn = next(x.insn_id + for x in reversed(state.scheduled_nodes) + if isinstance(x, Statement)) + return lambda x: key(x, + last_insn in insns_to_distribute) + + toposorted_dag = compute_topological_order_with_dynamic_key( + dag, trigger_key_update=trigger_key_update, get_key=get_key) + + return [x.insn_id for x in toposorted_dag if isinstance(x, Statement)] + +# }}} + + +@remove_any_newly_unused_inames +def distribute_loops(kernel, + insn_match, + outer_inames, + check_soundness: bool = __debug__, + ): + r""" + Returns a copy of *kernel* with instructions in *insn_match* having all + inames expect *outer_inames* duplicated. This is a dependency preserving + transformation which might require duplicating inames in instructions not + matching *insn_match*. + + :arg insn_match: A match expression as understood by + :func:`loopy.match.parse_match`. + :arg outer_loops: A set of inames under which the loop distribution is to + be performed. + :arg check_soundness: If *True*, raises an error if any duplication of the + inames would violate dependencies. If *False*, skips these checks and + could potentially return a kernel transformed with a + dependency-violating transformation. + + .. note:: + + - This assumes that the generated code would iterate over a loop in an + ascending order of the value taken by the loop-induction variable. + - This transformation is undefined for *kernel* that cannot be + linearized, as they don't represent a computable expression. + """ + from loopy.transform.iname import duplicate_inames + + within = match.parse_match(insn_match) + id_to_insn = kernel.id_to_insn + + insns_to_distribute = {insn.id + for insn in kernel.instructions + if within(kernel, insn)} + + insns_in_outer_inames = frozenset(insn.id + for insn in kernel.instructions + if insn.within_inames >= outer_inames) + + if not (insns_to_distribute <= insns_in_outer_inames): + raise LoopyError(f"Instructions {insns_to_distribute-insns_in_outer_inames} " + " to be distributed do not nest under outer_inames.") + + toposorted_insns = _get_loop_nest_aware_statement_toposort(kernel, + insns_in_outer_inames, + insns_to_distribute, + outer_inames) + + insn_partitions = tuple(split_when( + toposorted_insns, + lambda x, y: ((x in insns_to_distribute) ^ (y in insns_to_distribute)))) + + # {{{ compute which inames must be duplicate to perform the distribution. + + partition_idx_to_inner_inames = [ + reduce(frozenset.union, + (((id_to_insn[insn_id].within_inames + | id_to_insn[insn_id].reduction_inames()) + - outer_inames) + for insn_id in insn_partition), + frozenset()) + for insn_partition in insn_partitions + ] + + inner_iname_to_partition_indices = {} + + for ipartition, inner_inames in enumerate(partition_idx_to_inner_inames): + for inner_iname in inner_inames: + (inner_iname_to_partition_indices + .setdefault(inner_iname, set()) + .add(ipartition)) + + inames_to_duplicate = { + inner_iname + for inner_iname, ipartitions in inner_iname_to_partition_indices.items() + if len(ipartitions) > 1 + } + iname_to_duplicate_tags = {iname: kernel.inames[iname].tags + for iname in inames_to_duplicate} + + del partition_idx_to_inner_inames + + # }}} + + # {{{ check soundness + + if check_soundness: + for inner_iname in inames_to_duplicate: + ipartitions = sorted(inner_iname_to_partition_indices[inner_iname]) + + for ipart_pred, ipart_succ in combinations(ipartitions, 2): + if not _is_loop_distribution_sound(kernel, + inner_iname, + insn_partitions[ipart_pred], + insn_partitions[ipart_succ], + outer_inames): + raise IllegalLoopDistributionError( + f"Distributing the loops '{inames_to_duplicate}'" + " would violate memory dependencies between the instructions" + f" '{insn_partitions[ipart_pred]}' and " + f" '{insn_partitions[ipart_succ]}'.") + + # }}} + + for insn_partition in insn_partitions: + within_partition = match.Or(tuple(match.Id(insn_id) + for insn_id in insn_partition)) + inames_in_partition = reduce(frozenset.union, + ((id_to_insn[insn_id].within_inames + | id_to_insn[insn_id].reduction_inames()) + for insn_id in insn_partition), + frozenset()) + inames_to_duplicate_in_partition = inames_to_duplicate & inames_in_partition + if not inames_to_duplicate_in_partition: + continue + + kernel = duplicate_inames(kernel, + inames_to_duplicate_in_partition, + within=within_partition, + tags=iname_to_duplicate_tags) + + return kernel + +# vim: fdm=marker diff --git a/requirements.txt b/requirements.txt index c44f010c3..c9ee45715 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -git+https://github.com/inducer/pytools.git#egg=pytools >= 2021.1 +git+https://github.com/kaushikcfd/pytools.git@compute_topological_sort_with_dynamic_key#egg=pytools >= 2022.2 git+https://github.com/inducer/islpy.git#egg=islpy git+https://github.com/inducer/cgen.git#egg=cgen git+https://github.com/inducer/pyopencl.git#egg=pyopencl diff --git a/setup.py b/setup.py index f265326e7..92393a0e1 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,7 @@ def write_git_revision(package_name): "Mako", "pyrsistent", "immutables", + "more-itertools", ], extras_require={ diff --git a/test/test_loop_distribution.py b/test/test_loop_distribution.py new file mode 100644 index 000000000..4ec908193 --- /dev/null +++ b/test/test_loop_distribution.py @@ -0,0 +1,256 @@ +__copyright__ = "Copyright (C) 2021 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import pyopencl as cl +import loopy as lp +from loopy.transform.loop_distribution import IllegalLoopDistributionError +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa + +__all__ = [ + "pytest_generate_tests", + "cl" # "cl.create_some_context" + ] + + +def test_hello_loop_distribution(ctx_factory): + ctx = ctx_factory() + + t_unit = lp.make_kernel( + "{[i,j]: 0<=i, j<10}", + """ + for i + a[i] = 10 {id=w_a} + for j + b[i, j] = j*a[i] {id=w_b} + end + c[i] = 2*b[i, 5] {id=w_c} + end + """, + seq_dependencies=True) + + ref_t_unit = t_unit + + knl = lp.distribute_loops(t_unit.default_entrypoint, + insn_match="id:w_b", + outer_inames=frozenset()) + t_unit = t_unit.with_kernel(knl) + assert not (knl.id_to_insn["w_a"].within_inames + & knl.id_to_insn["w_b"].within_inames) + assert not (knl.id_to_insn["w_c"].within_inames + & knl.id_to_insn["w_b"].within_inames) + assert not (knl.id_to_insn["w_a"].within_inames + & knl.id_to_insn["w_c"].within_inames) + + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + +def test_soundness_check(ctx_factory): + ctx = ctx_factory() + + # {{{ WAW deps + + tunit = lp.make_kernel( + "{[i]: 1<=i<10}", + """ + a[i] = i {id=first_w_a} + a[i-1] = i**2 {id=second_w_a, dep=first_w_a} + """ + ) + ref_tunit = tunit + + knl = lp.distribute_loops(tunit.default_entrypoint, + "id:second_w_a", + outer_inames=frozenset()) + tunit = tunit.with_kernel(knl) + assert not (knl.id_to_insn["first_w_a"].within_inames + & knl.id_to_insn["second_w_a"].within_inames) + lp.auto_test_vs_ref(ref_tunit, ctx, tunit) + + tunit = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + a[i] = i {id=first_w_a} + a[i+1] = i**2 {id=second_w_a, dep=first_w_a} + """ + ) + + with pytest.raises(IllegalLoopDistributionError): + lp.distribute_loops(tunit.default_entrypoint, + "id:second_w_a", + outer_inames=frozenset()) + + # }}} + + # {{{ RAW deps + + tunit = lp.make_kernel( + "{[i]: 1<=i<10}", + """ + b[0] = 0 {id=first_w_b} + a[i] = i {id=first_w_a} + b[i] = 2*a[i-1] {id=second_w_b} + """, + seq_dependencies=True, + ) + ref_tunit = tunit + + knl = lp.distribute_loops(tunit.default_entrypoint, + "id:second_w_b", + outer_inames=frozenset()) + tunit = tunit.with_kernel(knl) + assert not (knl.id_to_insn["first_w_a"].within_inames + & knl.id_to_insn["second_w_b"].within_inames) + lp.auto_test_vs_ref(ref_tunit, ctx, tunit) + + tunit = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + a[i] = i {id=first_w_a} + b[i] = 2*a[i+1] {id=first_w_b} + """, + seq_dependencies=True + ) + + with pytest.raises(IllegalLoopDistributionError): + lp.distribute_loops(tunit.default_entrypoint, + "id:first_w_b", + outer_inames=frozenset()) + + # }}} + + # {{{ WAR deps + + tunit = lp.make_kernel( + "{[i, j]: 0<=i<10 and 0<=j<11}", + """ + b[j] = j**2 + a[i] = b[i+1] {id=first_w_a} + b[i] = 2*a[i] {id=first_w_b} + """, + seq_dependencies=True + ) + ref_tunit = tunit + + knl = lp.distribute_loops(tunit.default_entrypoint, + "id:first_w_b", + outer_inames=frozenset()) + tunit = tunit.with_kernel(knl) + assert not (knl.id_to_insn["first_w_a"].within_inames + & knl.id_to_insn["first_w_b"].within_inames) + lp.auto_test_vs_ref(ref_tunit, ctx, tunit) + + tunit = lp.make_kernel( + "{[i]: 1<=i<10}", + """ + b[0] = 0 {id=first_w_b} + a[i] = b[i-1] {id=first_w_a} + b[i] = 2*a[i] {id=second_w_b} + """, + seq_dependencies=True, + ) + + with pytest.raises(IllegalLoopDistributionError): + lp.distribute_loops(tunit.default_entrypoint, + "id:second_w_b", + outer_inames=frozenset()) + + # }}} + + +def test_reduction_inames_get_duplicated(ctx_factory): + ctx = ctx_factory() + + tunit = lp.make_kernel( + "{[i, j]: 0<=i<100 and 0<=j<10}", + """ + out1[i] = sum(j, mat1[j] * x1[i, j]) {id=w_out1} + out2[i] = sum(j, mat2[j] * x2[i, j]) {id=w_out2} + """, + ) + tunit = lp.add_dtypes(tunit, {"mat1": np.float64, + "mat2": np.float64, + "x1": np.float64, + "x2": np.float64, + }) + + ref_tunit = tunit + + knl = lp.distribute_loops(tunit.default_entrypoint, + "id:w_out2", + outer_inames=frozenset()) + tunit = tunit.with_kernel(knl) + + assert not (knl.id_to_insn["w_out1"].within_inames + & knl.id_to_insn["w_out2"].within_inames) + assert not (knl.id_to_insn["w_out1"].reduction_inames() + & knl.id_to_insn["w_out2"].reduction_inames()) + lp.auto_test_vs_ref(ref_tunit, ctx, tunit) + + +def test_avoids_unnecessary_loop_distribution(ctx_factory): + ctx = ctx_factory() + tunit = lp.make_kernel( + "{[i]: 0 <= i < 10}", + """ + y0[i] = i {id=w_y0} + y1[i] = i**2 {id=w_y1} + y2[i] = y0[i] + i**3 {id=w_y2} + y3[i] = 2*y2[i] {id=w_y3} + y4[i] = i**4 + y1[i] {id=w_y4} + """) + ref_tunit = tunit + + knl = lp.distribute_loops(tunit.default_entrypoint, + insn_match="writes:y2 or writes:y4", + outer_inames=frozenset()) + tunit = tunit.with_kernel(knl) + + assert (knl.id_to_insn["w_y2"].within_inames + == knl.id_to_insn["w_y4"].within_inames) + lp.auto_test_vs_ref(ref_tunit, ctx, tunit) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: fdm=marker