From 3d9dee45a55893c3ffb0436d2f903685ccaf57a3 Mon Sep 17 00:00:00 2001
From: Colin Unger
Date: Sat, 15 Feb 2025 18:44:37 -0800
Subject: [PATCH] Format

---
 .../include/compiler/graph_optimize_state.h   |   2 +-
 .../src/compiler/graph_optimize_state.cc      |   6 +-
 .../get_pcg_series_parallel_decomposition.cc  |  24 +-
 .../src/compiler/graph_optimize_result.cc     |   4 +-
 ...racted_tensor_set_movement_across_split.cc | 121 ++++---
 .../get_optimal_machine_mapping.cc            |  12 +-
 .../get_tensor_set_movement_across_split.cc   |  96 +++---
 .../get_machine_mapping_problem_tree.cc       |  37 +--
 ...get_optimal_machine_mapping_with_memory.cc |  13 +-
 ...ion_graph_series_parallel_decomposition.cc | 114 +++----
 .../get_pcg_series_parallel_decomposition.cc  | 296 ++++++++++--------
 .../task_graph_simulator/task_simulator.cc    |  24 +-
 lib/compiler/test/src/graph_optimize_state.cc |  42 +--
 .../src/local_cost_estimator.cc               |  52 +--
 lib/models/src/models/bert/bert.cc            |   2 +-
 .../initializers/kaiming_initializer_mode.h   |   8 +-
 lib/op-attrs/include/op-attrs/ops/attention.h |  19 +-
 .../include/op-attrs/ops/batch_norm.h         |  15 +-
 lib/op-attrs/include/op-attrs/ops/conv_2d.h   |  16 +-
 lib/op-attrs/include/op-attrs/ops/embedding.h |   7 +-
 .../include/op-attrs/ops/layer_norm.h         |   9 +-
 lib/op-attrs/include/op-attrs/ops/linear.h    |  16 +-
 .../include/op-attrs/shape_inference.h        |   2 +-
 lib/op-attrs/include/op-attrs/tensor_dims.h   |   4 +-
 lib/op-attrs/include/op-attrs/tensor_shape.h  |   4 +-
 .../src/op-attrs/initializer_attrs.cc         |  44 +--
 .../initializers/kaiming_initializer_mode.cc  |   8 +-
 lib/op-attrs/src/op-attrs/ops/attention.cc    |  73 +++--
 lib/op-attrs/src/op-attrs/ops/batch_norm.cc   |  49 +--
 lib/op-attrs/src/op-attrs/ops/conv_2d.cc      |  63 ++--
 .../src/op-attrs/ops/element_unary.cc         |   4 +-
 lib/op-attrs/src/op-attrs/ops/embedding.cc    |  19 +-
 lib/op-attrs/src/op-attrs/ops/layer_norm.cc   |  28 +-
 lib/op-attrs/src/op-attrs/ops/linear.cc       |  66 ++--
 lib/op-attrs/src/op-attrs/ops/repartition.cc  |   3 +-
 .../src/op-attrs/pcg_operator_attrs.cc        |   9 +-
 lib/op-attrs/src/op-attrs/shape_inference.cc  | 158 +++++-----
 lib/op-attrs/src/op-attrs/tensor_dims.cc      |   8 +-
 lib/op-attrs/src/op-attrs/tensor_shape.cc     |   8 +-
 lib/pcg/include/pcg/computation_graph.h       |  11 +-
 .../include/pcg/computation_graph_builder.h   |  18 +-
 .../parallel_computation_graph.h              |  19 +-
 .../parallel_computation_graph_builder.h      |   1 +
 lib/pcg/src/pcg/computation_graph.cc          |  61 ++--
 lib/pcg/src/pcg/computation_graph_builder.cc  | 104 +++---
 .../parallel_computation_graph.cc             |  86 ++---
 .../parallel_computation_graph_builder.cc     |  86 +++--
 lib/pcg/test/src/pcg/computation_graph.cc     |  42 ++-
 .../v1/v1_parallel_computation_graph.cc       |  15 +-
 .../parallel_computation_graph.cc             | 157 ++++----
 .../parallel_computation_graph_builder.cc     | 151 ++++-----
 .../src/pcg/pcg_from_computation_graph.cc     |  83 +++--
 .../perform_shape_inference.cc                |  34 +-
 .../apply_substitution/apply_substitution.cc  |  13 +-
 .../evaluate_substitution_output.cc           |  11 +-
 .../perform_shape_inference.cc                |   4 +-
 .../test/src/substitutions/pcg_pattern.cc     |  17 +-
 .../utils/containers/transform_until.h        |   6 +-
 .../include/utils/containers/vector_of.h      |   2 +-
 lib/utils/include/utils/expected.h            |  16 +-
 .../src/utils/containers/transform_until.cc   |   3 +-
 lib/utils/src/utils/containers/vector_of.cc   |  14 +-
 .../src/utils/containers/transform_until.cc   |  12 +-
 63 files changed, 1346 insertions(+), 1105 deletions(-)

diff --git a/lib/compiler/include/compiler/graph_optimize_state.h b/lib/compiler/include/compiler/graph_optimize_state.h
index 8dd10f2227..404111ff8b 100644
---
a/lib/compiler/include/compiler/graph_optimize_state.h +++ b/lib/compiler/include/compiler/graph_optimize_state.h @@ -7,7 +7,7 @@ namespace FlexFlow { struct GraphOptimizeState { explicit GraphOptimizeState(GraphOptimizeResult const &graph_optimize_result, - float runtime); + float runtime); GraphOptimizeResult graph_optimize_result; float runtime; diff --git a/lib/compiler/src/compiler/graph_optimize_state.cc b/lib/compiler/src/compiler/graph_optimize_state.cc index 34a9a120e1..1091b92866 100644 --- a/lib/compiler/src/compiler/graph_optimize_state.cc +++ b/lib/compiler/src/compiler/graph_optimize_state.cc @@ -1,6 +1,6 @@ #include "compiler/graph_optimize_state.h" -#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" #include "compiler/graph_optimize_result.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" namespace FlexFlow { @@ -56,7 +56,9 @@ bool GraphOptimizeState::operator<(GraphOptimizeState const &other) const { } std::string format_as(GraphOptimizeState const &st) { - return fmt::format("", st.graph_optimize_result, st.runtime); + return fmt::format("", + st.graph_optimize_result, + st.runtime); } std::ostream &operator<<(std::ostream &s, GraphOptimizeState const &st) { diff --git a/lib/compiler/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc b/lib/compiler/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc index 8d447631ae..30a0655b2d 100644 --- a/lib/compiler/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc +++ b/lib/compiler/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc @@ -1,10 +1,10 @@ #include "compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.h" +#include "op-attrs/pcg_operator_attrs.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "utils/containers/get_only.h" #include "utils/graph/digraph/algorithms/materialize_digraph_view.h" #include "utils/graph/instances/adjacency_digraph.h" #include "utils/graph/series_parallel/get_series_parallel_decomposition.h" -#include "op-attrs/pcg_operator_attrs.h" namespace FlexFlow { @@ -19,7 +19,7 @@ std::optional } } - auto layer_is_weight_or_input = [&](parallel_layer_guid_t const &l) { + auto layer_is_weight_or_input = [&](parallel_layer_guid_t const &l) { PCGOperatorAttrs op_attrs = get_parallel_layer_attrs(pcg, l).op_attrs; return op_attrs.has() || op_attrs.has(); }; @@ -29,20 +29,24 @@ std::optional return is_parallel_op(op_attrs); }; - std::function follow_to_last_parallel_op - = [&](parallel_layer_guid_t const &starting_point) -> parallel_layer_guid_t { + std::function + follow_to_last_parallel_op = + [&](parallel_layer_guid_t const &starting_point) + -> parallel_layer_guid_t { + assert(layer_is_weight_or_input(starting_point) || + layer_is_parallel_op(starting_point)); - assert (layer_is_weight_or_input(starting_point) || layer_is_parallel_op(starting_point)); - - std::unordered_set successors = get_successors(pcg, starting_point); + std::unordered_set successors = + get_successors(pcg, starting_point); if (successors.size() != 1) { return starting_point; } - parallel_layer_guid_t successor = get_only(get_successors(pcg, starting_point)); + parallel_layer_guid_t successor = + get_only(get_successors(pcg, starting_point)); - assert (!layer_is_weight_or_input(successor)); + assert(!layer_is_weight_or_input(successor)); if (layer_is_parallel_op(successor)) { return follow_to_last_parallel_op(successor); } else { @@ -54,7 +58,7 @@ std::optional 
std::unordered_set weight_and_input_layers = filter(get_parallel_layers(pcg), layer_is_weight_or_input); - std::unordered_set par_chain_endpoints = + std::unordered_set par_chain_endpoints = transform(weight_and_input_layers, follow_to_last_parallel_op); std::unordered_set par_chain_endpoint_successors = diff --git a/lib/compiler/test/src/compiler/graph_optimize_result.cc b/lib/compiler/test/src/compiler/graph_optimize_result.cc index 8739709b98..f48c119603 100644 --- a/lib/compiler/test/src/compiler/graph_optimize_result.cc +++ b/lib/compiler/test/src/compiler/graph_optimize_result.cc @@ -3,7 +3,9 @@ namespace FlexFlow { std::string format_as(GraphOptimizeResult const &r) { - return fmt::format("", as_dot(r.pcg), r.machine_mapping); + return fmt::format("", + as_dot(r.pcg), + r.machine_mapping); } std::ostream &operator<<(std::ostream &s, GraphOptimizeResult const &r) { diff --git a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc index 7afb21a9fc..13067f5d02 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc @@ -27,24 +27,25 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, 12_n, + TensorDims{ + FFOrdered{ + 10_n, + 12_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; ParallelTensorShape par_input_shape = lift_to_parallel(input_shape); ParallelLayerAttrs partition_attrs = ParallelLayerAttrs{ - /*op_attrs=*/PCGOperatorAttrs{ - RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*op_attrs=*/PCGOperatorAttrs{ + RepartitionAttrs{ + /*repartition_dim=*/ff_dim_t{0_n}, + /*repartition_degree=*/2_n, + }, }, - }, - /*name=*/std::nullopt, + /*name=*/std::nullopt, }; ParallelLayerAttrs relu_attrs = ParallelLayerAttrs{ @@ -72,19 +73,19 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("no edges across split") { ParallelLayerAddedResult input1 = pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input1 = get_only(input1.outputs); - ParallelLayerAddedResult partition_input1 = add_parallel_layer(pcg, partition_attrs, {t_input1}, {}); + ParallelLayerAddedResult partition_input1 = + add_parallel_layer(pcg, partition_attrs, {t_input1}, {}); ParallelLayerAddedResult input2 = pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input2 = get_only(input2.outputs); - ParallelLayerAddedResult partition_input2 = add_parallel_layer(pcg, partition_attrs, {t_input2}, {}); + ParallelLayerAddedResult partition_input2 = + add_parallel_layer(pcg, partition_attrs, {t_input2}, {}); PCGBinarySeriesSplit split = PCGBinarySeriesSplit{ - make_series_split( - make_leaf(input1.parallel_layer), - make_leaf(partition_input1.parallel_layer)), - make_series_split( - make_leaf(input2.parallel_layer), - make_leaf(partition_input2.parallel_layer)), + make_series_split(make_leaf(input1.parallel_layer), + make_leaf(partition_input1.parallel_layer)), + make_series_split(make_leaf(input2.parallel_layer), + make_leaf(partition_input2.parallel_layer)), }; AbstractedTensorSetMovement result = @@ -101,21 +102,21 @@ 
TEST_SUITE(FF_TEST_SUITE) { SUBCASE("single edge across split") { ParallelLayerAddedResult input = pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input = get_only(input.outputs); - ParallelLayerAddedResult partition_input = add_parallel_layer(pcg, partition_attrs, {t_input}, {}); + ParallelLayerAddedResult partition_input = + add_parallel_layer(pcg, partition_attrs, {t_input}, {}); parallel_tensor_guid_t t_partition_input = get_only(input.outputs); - ParallelLayerAddedResult layer_1 = add_parallel_layer( - pcg, relu_attrs, {t_partition_input}, {}); + ParallelLayerAddedResult layer_1 = + add_parallel_layer(pcg, relu_attrs, {t_partition_input}, {}); parallel_tensor_guid_t t_layer_1 = get_only(layer_1.outputs); - ParallelLayerAddedResult layer_2 = add_parallel_layer( - pcg, relu_attrs, {t_layer_1}, {}); + ParallelLayerAddedResult layer_2 = + add_parallel_layer(pcg, relu_attrs, {t_layer_1}, {}); PCGBinarySeriesSplit split = PCGBinarySeriesSplit{ make_series_split( - make_series_split( - make_leaf(input.parallel_layer), - make_leaf(partition_input.parallel_layer)), - make_leaf(layer_1.parallel_layer)), + make_series_split(make_leaf(input.parallel_layer), + make_leaf(partition_input.parallel_layer)), + make_leaf(layer_1.parallel_layer)), make_leaf(layer_2.parallel_layer), }; @@ -147,28 +148,25 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("does not include edges removed by transitive reduction") { ParallelLayerAddedResult input = pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input = get_only(input.outputs); - ParallelLayerAddedResult partition_input = add_parallel_layer(pcg, partition_attrs, {t_input}, {}); + ParallelLayerAddedResult partition_input = + add_parallel_layer(pcg, partition_attrs, {t_input}, {}); parallel_tensor_guid_t t_partition_input = get_only(input.outputs); - ParallelLayerAddedResult layer_1 = add_parallel_layer( - pcg, relu_attrs, {t_partition_input}, {}); + ParallelLayerAddedResult layer_1 = + add_parallel_layer(pcg, relu_attrs, {t_partition_input}, {}); parallel_tensor_guid_t t_layer_1 = get_only(layer_1.outputs); - ParallelLayerAddedResult layer_2 = add_parallel_layer( - pcg, relu_attrs, {t_layer_1}, {}); + ParallelLayerAddedResult layer_2 = + add_parallel_layer(pcg, relu_attrs, {t_layer_1}, {}); parallel_tensor_guid_t t_layer_2 = get_only(layer_2.outputs); - ParallelLayerAddedResult layer_3 = add_parallel_layer( - pcg, - ew_add_attrs, - {t_layer_1, t_layer_2}, - {}); + ParallelLayerAddedResult layer_3 = + add_parallel_layer(pcg, ew_add_attrs, {t_layer_1, t_layer_2}, {}); PCGBinarySeriesSplit split = PCGBinarySeriesSplit{ make_series_split( - make_series_split( - make_leaf(input.parallel_layer), - make_leaf(partition_input.parallel_layer)), + make_series_split(make_leaf(input.parallel_layer), + make_leaf(partition_input.parallel_layer)), make_series_split(make_leaf(layer_1.parallel_layer), make_leaf(layer_2.parallel_layer))), make_leaf(layer_3.parallel_layer), @@ -203,24 +201,25 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("single tensor, multiple consumers across split") { ParallelLayerAddedResult input = pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input = get_only(input.outputs); - ParallelLayerAddedResult partition_input = add_parallel_layer(pcg, partition_attrs, {t_input}, {}); + ParallelLayerAddedResult partition_input = + add_parallel_layer(pcg, partition_attrs, {t_input}, {}); parallel_tensor_guid_t t_partition_input = get_only(input.outputs); - ParallelLayerAddedResult layer_1 = add_parallel_layer( - pcg, relu_attrs, 
{t_partition_input}, {}); + ParallelLayerAddedResult layer_1 = + add_parallel_layer(pcg, relu_attrs, {t_partition_input}, {}); parallel_tensor_guid_t t_layer_1 = get_only(layer_1.outputs); - ParallelLayerAddedResult layer_2 = add_parallel_layer( - pcg, relu_attrs, {t_layer_1}, {}); + ParallelLayerAddedResult layer_2 = + add_parallel_layer(pcg, relu_attrs, {t_layer_1}, {}); - ParallelLayerAddedResult layer_3 = add_parallel_layer( - pcg, relu_attrs, {t_layer_1}, {}); + ParallelLayerAddedResult layer_3 = + add_parallel_layer(pcg, relu_attrs, {t_layer_1}, {}); PCGBinarySeriesSplit split = PCGBinarySeriesSplit{ make_series_split( - make_series_split(make_leaf(input.parallel_layer), - make_leaf(partition_input.parallel_layer)), - make_leaf(layer_1.parallel_layer)), + make_series_split(make_leaf(input.parallel_layer), + make_leaf(partition_input.parallel_layer)), + make_leaf(layer_1.parallel_layer)), make_parallel_split(make_leaf(layer_2.parallel_layer), make_leaf(layer_3.parallel_layer)), }; @@ -258,17 +257,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("multiple tensors, multiple consumers across split") { ParallelLayerAddedResult input = pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input = get_only(input.outputs); - ParallelLayerAddedResult partition_input = add_parallel_layer(pcg, partition_attrs, {t_input}, {}); + ParallelLayerAddedResult partition_input = + add_parallel_layer(pcg, partition_attrs, {t_input}, {}); parallel_tensor_guid_t t_partition_input = get_only(input.outputs); - ParallelLayerAddedResult layer_1 = add_parallel_layer( - pcg, relu_attrs, {t_partition_input}, {}); + ParallelLayerAddedResult layer_1 = + add_parallel_layer(pcg, relu_attrs, {t_partition_input}, {}); - ParallelLayerAddedResult layer_2 = add_parallel_layer( - pcg, relu_attrs, {t_partition_input}, {}); + ParallelLayerAddedResult layer_2 = + add_parallel_layer(pcg, relu_attrs, {t_partition_input}, {}); - ParallelLayerAddedResult layer_3 = add_parallel_layer( - pcg, relu_attrs, {get_only(layer_1.outputs)}, {}); + ParallelLayerAddedResult layer_3 = + add_parallel_layer(pcg, relu_attrs, {get_only(layer_1.outputs)}, {}); ParallelLayerAddedResult layer_4 = add_parallel_layer( pcg, @@ -278,9 +278,8 @@ TEST_SUITE(FF_TEST_SUITE) { PCGBinarySeriesSplit split = PCGBinarySeriesSplit{ make_series_split( - make_series_split( - make_leaf(input.parallel_layer), - make_leaf(partition_input.parallel_layer)), + make_series_split(make_leaf(input.parallel_layer), + make_leaf(partition_input.parallel_layer)), make_parallel_split(make_leaf(layer_1.parallel_layer), make_leaf(layer_2.parallel_layer))), make_parallel_split(make_leaf(layer_3.parallel_layer), diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index f707bd216c..e506dea1d7 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -100,13 +100,13 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape tensor_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, - 8_n, + TensorDims{ + FFOrdered{ + 10_n, + 8_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc index 14ed917cd3..51e6074bf2 
100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc @@ -11,17 +11,16 @@ using namespace ::FlexFlow; bool isDebuggerActive() { - std::ifstream in("/proc/self/status"); - for(std::string line; std::getline(in, line);) { - static const int PREFIX_LEN = 11; - if(line.compare(0, PREFIX_LEN, "TracerPid:\t") == 0) { - return line.length() > PREFIX_LEN && line[PREFIX_LEN] != '0'; - } + std::ifstream in("/proc/self/status"); + for (std::string line; std::getline(in, line);) { + static int const PREFIX_LEN = 11; + if (line.compare(0, PREFIX_LEN, "TracerPid:\t") == 0) { + return line.length() > PREFIX_LEN && line[PREFIX_LEN] != '0'; } - return false; + } + return false; } - TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_tensor_set_movement_across_split") { auto make_pcg_series_split = [](PCGBinarySPDecomposition const &lhs, @@ -41,31 +40,34 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, - 12_n, + TensorDims{ + FFOrdered{ + 10_n, + 12_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; ParallelLayerAddedResult input = pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input = get_only(input.outputs); ParallelLayerAttrs partition_attrs = ParallelLayerAttrs{ - /*op_attrs=*/PCGOperatorAttrs{ - RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*op_attrs=*/PCGOperatorAttrs{ + RepartitionAttrs{ + /*repartition_dim=*/ff_dim_t{0_n}, + /*repartition_degree=*/2_n, + }, }, - }, - /*name=*/std::nullopt, + /*name=*/std::nullopt, }; - ParallelLayerAddedResult partition_input = add_parallel_layer(pcg, partition_attrs, {t_input}, {}); - parallel_tensor_guid_t t_partition_input = get_only(partition_input.outputs); + ParallelLayerAddedResult partition_input = + add_parallel_layer(pcg, partition_attrs, {t_input}, {}); + parallel_tensor_guid_t t_partition_input = + get_only(partition_input.outputs); - ParallelTensorShape partitioned_input_shape = get_parallel_tensor_shape(pcg, t_partition_input); + ParallelTensorShape partitioned_input_shape = + get_parallel_tensor_shape(pcg, t_partition_input); ParallelLayerAttrs relu_attrs = ParallelLayerAttrs{ /*op_attrs=*/PCGOperatorAttrs{ @@ -80,20 +82,20 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelLayerAttrs ew_add_attrs = ParallelLayerAttrs{ /*op_attrs=*/PCGOperatorAttrs{ ElementBinaryAttrs{ - /*type=*/OperatorType::EW_ADD, - /*compute_type=*/DataType::FLOAT, - /*should_broadcast_lhs=*/false, - /*should_broadcast_rhs=*/false, + /*type=*/OperatorType::EW_ADD, + /*compute_type=*/DataType::FLOAT, + /*should_broadcast_lhs=*/false, + /*should_broadcast_rhs=*/false, }, }, /*name=*/std::nullopt, }; - ParallelLayerAddedResult relu_1 = add_parallel_layer( - pcg, relu_attrs, {t_partition_input}, {}); + ParallelLayerAddedResult relu_1 = + add_parallel_layer(pcg, relu_attrs, {t_partition_input}, {}); parallel_tensor_guid_t t_relu_1 = get_only(relu_1.outputs); - ParallelLayerAddedResult relu_2 = add_parallel_layer( - pcg, relu_attrs, {t_relu_1}, {}); + ParallelLayerAddedResult relu_2 = + add_parallel_layer(pcg, relu_attrs, {t_relu_1}, {}); MachineView pre_mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ @@ -158,10 +160,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("single edge across split") { PCGBinarySeriesSplit split = PCGBinarySeriesSplit{ make_pcg_series_split( - 
make_pcg_series_split( - make_pcg_leaf_node(input.parallel_layer), - make_pcg_leaf_node(partition_input.parallel_layer)), - make_pcg_leaf_node(relu_1.parallel_layer)), + make_pcg_series_split( + make_pcg_leaf_node(input.parallel_layer), + make_pcg_leaf_node(partition_input.parallel_layer)), + make_pcg_leaf_node(relu_1.parallel_layer)), make_pcg_leaf_node(relu_2.parallel_layer), }; @@ -197,14 +199,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("does not include edges removed by transitive reduction") {} SUBCASE("single tensor, multiple consumers across split") { - ParallelLayerAddedResult relu_3 = add_parallel_layer( - pcg, relu_attrs, {get_only(relu_1.outputs)}, {}); + ParallelLayerAddedResult relu_3 = + add_parallel_layer(pcg, relu_attrs, {get_only(relu_1.outputs)}, {}); PCGBinarySeriesSplit split = PCGBinarySeriesSplit{ make_pcg_series_split( - make_pcg_series_split(make_pcg_leaf_node(input.parallel_layer), - make_pcg_leaf_node(partition_input.parallel_layer)), - make_pcg_leaf_node(relu_1.parallel_layer)), + make_pcg_series_split( + make_pcg_leaf_node(input.parallel_layer), + make_pcg_leaf_node(partition_input.parallel_layer)), + make_pcg_leaf_node(relu_1.parallel_layer)), make_pcg_parallel_split(make_pcg_leaf_node(relu_2.parallel_layer), make_pcg_leaf_node(relu_3.parallel_layer)), }; @@ -309,12 +312,13 @@ TEST_SUITE(FF_TEST_SUITE) { {}); PCGBinarySeriesSplit split = PCGBinarySeriesSplit{ - make_pcg_series_split(make_pcg_series_split( - make_pcg_leaf_node(input.parallel_layer), - make_pcg_leaf_node(partition_input.parallel_layer)), - make_pcg_parallel_split( - make_pcg_leaf_node(relu_1.parallel_layer), - make_pcg_leaf_node(relu_3.parallel_layer))), + make_pcg_series_split( + make_pcg_series_split( + make_pcg_leaf_node(input.parallel_layer), + make_pcg_leaf_node(partition_input.parallel_layer)), + make_pcg_parallel_split( + make_pcg_leaf_node(relu_1.parallel_layer), + make_pcg_leaf_node(relu_3.parallel_layer))), make_pcg_parallel_split(make_pcg_leaf_node(relu_2.parallel_layer), make_pcg_leaf_node(relu_4.parallel_layer)), }; diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index c67e136f52..048f1ddcac 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -64,13 +64,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, - 1_n, + TensorDims{ + FFOrdered{ + 10_n, + 1_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; ParallelTensorShape par_input_shape = lift_to_parallel(input_shape); @@ -101,11 +101,11 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("single layer") { - ParallelLayerAddedResult input_added = add_parallel_layer( - pcg, - /*layer_attrs=*/make_layer_attrs(input_attrs), - /*inputs=*/{}, - /*output_labels=*/{}); + ParallelLayerAddedResult input_added = + add_parallel_layer(pcg, + /*layer_attrs=*/make_layer_attrs(input_attrs), + /*inputs=*/{}, + /*output_labels=*/{}); parallel_layer_guid_t input_layer = input_added.parallel_layer; UnmappedOpCostEstimateKey input_key = make_input_key(par_input_shape); @@ -121,11 +121,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("two layers in series") { - 
ParallelLayerAddedResult input_added = add_parallel_layer( - pcg, - /*layer_attrs=*/make_layer_attrs(input_attrs), - /*inputs=*/{}, - /*output_labels=*/{}); + ParallelLayerAddedResult input_added = + add_parallel_layer(pcg, + /*layer_attrs=*/make_layer_attrs(input_attrs), + /*inputs=*/{}, + /*output_labels=*/{}); parallel_layer_guid_t input_layer = input_added.parallel_layer; parallel_tensor_guid_t input = get_only(input_added.outputs); @@ -139,10 +139,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; ParallelTensorShape relu_output_shape = par_input_shape; ParallelLayerAddedResult relu_added = - add_parallel_layer(pcg, - make_layer_attrs(relu_attrs), - {input}, - {}); + add_parallel_layer(pcg, make_layer_attrs(relu_attrs), {input}, {}); parallel_layer_guid_t relu_layer = relu_added.parallel_layer; parallel_tensor_guid_t relu_output = get_only(relu_added.outputs); diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index b67bef8743..8ae1ebe753 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -100,18 +100,17 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape tensor_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 12_n, - 8_n, + TensorDims{ + FFOrdered{ + 12_n, + 8_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; ParallelTensorShape par_tensor_shape = lift_to_parallel(tensor_shape); - UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}}, /*input_shapes=*/{}, diff --git a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc index cac41ec347..fcd508828c 100644 --- a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc @@ -18,8 +18,8 @@ TEST_SUITE(FF_TEST_SUITE) { "get_computation_graph_series_parallel_decomposition(ComputationGraph)") { auto make_layer_attrs = [](auto const &op_attrs) { return LayerAttrs{ - /*op_attrs=*/ComputationGraphOpAttrs{op_attrs}, - /*name=*/std::nullopt, + /*op_attrs=*/ComputationGraphOpAttrs{op_attrs}, + /*name=*/std::nullopt, }; }; @@ -36,21 +36,21 @@ TEST_SUITE(FF_TEST_SUITE) { InitializerAttrs zero_init = InitializerAttrs{ZeroInitializerAttrs{}}; - TensorShape input_shape = - TensorShape{TensorDims{FFOrdered{ - 10_n, - 12_n, - },}, - DataType::FLOAT}; - - InputAttrs input_attrs = - InputAttrs{ - /*shape=*/input_shape, - }; + TensorShape input_shape = TensorShape{TensorDims{ + FFOrdered{ + 10_n, + 12_n, + }, + }, + DataType::FLOAT}; + InputAttrs input_attrs = InputAttrs{ + /*shape=*/input_shape, + }; SUBCASE("just a single input") { - LayerAddedResult input_added = add_layer(cg, make_layer_attrs(input_attrs), {}, {}); + LayerAddedResult input_added = + add_layer(cg, make_layer_attrs(input_attrs), {}, {}); std::optional result = get_computation_graph_series_parallel_decomposition(cg); @@ -62,42 +62,45 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("single operator plus inputs and 
weights") { LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, - /*use_bias=*/true, - /*data_type=*/DataType::FLOAT, - /*activation=*/std::nullopt, - /*regularizer=*/std::nullopt, + /*out_channels=*/14_n, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, }; - TensorShape projection_weight_shape = - throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)); + TensorShape projection_weight_shape = + throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)); - TensorShape bias_weight_shape = throw_if_unexpected(get_bias_shape(linear_attrs, input_shape)); + TensorShape bias_weight_shape = + throw_if_unexpected(get_bias_shape(linear_attrs, input_shape)); WeightAttrs projection_weight_attrs = WeightAttrs{ - /*tensor_shape=*/projection_weight_shape, - /*initializer=*/zero_init, + /*tensor_shape=*/projection_weight_shape, + /*initializer=*/zero_init, }; WeightAttrs bias_weight_attrs = WeightAttrs{ - /*tensor_shape=*/bias_weight_shape, - /*initializer=*/zero_init, + /*tensor_shape=*/bias_weight_shape, + /*initializer=*/zero_init, }; LayerAddedResult input_added = add_layer(cg, make_layer_attrs(input_attrs), {}, {}); tensor_guid_t t_input = get_only(input_added.outputs); - LayerAddedResult projection_weight_added = + LayerAddedResult projection_weight_added = add_layer(cg, make_layer_attrs(projection_weight_attrs), {}, {}); tensor_guid_t t_projection = get_only(projection_weight_added.outputs); - LayerAddedResult bias_weight_added = + LayerAddedResult bias_weight_added = add_layer(cg, make_layer_attrs(bias_weight_attrs), {}, {}); tensor_guid_t t_bias = get_only(bias_weight_added.outputs); - LayerAddedResult linear_added = - add_layer(cg, make_layer_attrs(linear_attrs), {t_input}, {t_projection, t_bias}); + LayerAddedResult linear_added = add_layer(cg, + make_layer_attrs(linear_attrs), + {t_input}, + {t_projection, t_bias}); std::optional result = get_computation_graph_series_parallel_decomposition(cg); @@ -123,37 +126,37 @@ TEST_SUITE(FF_TEST_SUITE) { // op1 op2 LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, - /*use_bias=*/false, - /*data_type=*/DataType::FLOAT, - /*activation=*/std::nullopt, - /*regularizer=*/std::nullopt, + /*out_channels=*/14_n, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, }; - TensorShape projection_weight_shape = - throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)); + TensorShape projection_weight_shape = + throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)); WeightAttrs projection_weight_attrs = WeightAttrs{ - /*tensor_shape=*/projection_weight_shape, - /*initializer=*/zero_init, + /*tensor_shape=*/projection_weight_shape, + /*initializer=*/zero_init, }; LayerAddedResult input_added = add_layer(cg, make_layer_attrs(input_attrs), {}, {}); tensor_guid_t t_input = get_only(input_added.outputs); - LayerAddedResult w1_added = + LayerAddedResult w1_added = add_layer(cg, make_layer_attrs(projection_weight_attrs), {}, {}); tensor_guid_t t_w1 = get_only(w1_added.outputs); - LayerAddedResult w2_added = + LayerAddedResult w2_added = add_layer(cg, make_layer_attrs(projection_weight_attrs), {}, {}); tensor_guid_t t_w2 = get_only(w2_added.outputs); - LayerAddedResult op1_added = + LayerAddedResult op1_added = add_layer(cg, make_layer_attrs(linear_attrs), {t_input}, {t_w1}); - LayerAddedResult op2_added = + LayerAddedResult op2_added = add_layer(cg, 
make_layer_attrs(linear_attrs), {t_input}, {t_w2}); std::optional result = @@ -175,11 +178,12 @@ TEST_SUITE(FF_TEST_SUITE) { } ElementUnaryAttrs relu_attrs = ElementUnaryAttrs{ - /*op_type=*/OperatorType::RELU, - /*scalar=*/std::nullopt, + /*op_type=*/OperatorType::RELU, + /*scalar=*/std::nullopt, }; - SUBCASE("SP with or without preprocessing, but preprocessing would change resulting SP " + SUBCASE("SP with or without preprocessing, but preprocessing would change " + "resulting SP " "decomposition") { // computation graph: // @@ -195,10 +199,10 @@ TEST_SUITE(FF_TEST_SUITE) { add_layer(cg, make_layer_attrs(input_attrs), {}, {}); tensor_guid_t t_input2 = get_only(input2_added.outputs); - LayerAddedResult op1_added = + LayerAddedResult op1_added = add_layer(cg, make_layer_attrs(relu_attrs), {t_input1}, {}); - LayerAddedResult op2_added = + LayerAddedResult op2_added = add_layer(cg, make_layer_attrs(relu_attrs), {t_input2}, {}); std::optional result = @@ -233,21 +237,21 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t t_input1 = get_only(input1_added.outputs); ElementBinaryAttrs ew_add_attrs = ElementBinaryAttrs{ - /*type=*/OperatorType::EW_ADD, - /*compute_type=*/DataType::FLOAT, - /*should_broadcast_lhs=*/false, - /*should_broadcast_rhs=*/false, + /*type=*/OperatorType::EW_ADD, + /*compute_type=*/DataType::FLOAT, + /*should_broadcast_lhs=*/false, + /*should_broadcast_rhs=*/false, }; - LayerAddedResult op1_added = + LayerAddedResult op1_added = add_layer(cg, make_layer_attrs(relu_attrs), {t_input1}, {}); tensor_guid_t t_op1 = get_only(op1_added.outputs); - LayerAddedResult op2_added = + LayerAddedResult op2_added = add_layer(cg, make_layer_attrs(relu_attrs), {t_input1}, {}); tensor_guid_t t_op2 = get_only(op2_added.outputs); - LayerAddedResult op3_added = + LayerAddedResult op3_added = add_layer(cg, make_layer_attrs(relu_attrs), {t_op1}, {}); LayerAddedResult op4_added = diff --git a/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc index b0bceda869..06664b38fa 100644 --- a/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc @@ -1,15 +1,16 @@ #include "compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.h" #include "op-attrs/ops/embedding.h" #include "op-attrs/ops/linear.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" -#include #include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "utils/containers/get_only.h" +#include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("get_pcg_series_parallel_decomposition(ParallelComputationGraph const &)") { + TEST_CASE("get_pcg_series_parallel_decomposition(ParallelComputationGraph " + "const &)") { SUBCASE("empty pcg") { ParallelComputationGraph pcg = empty_parallel_computation_graph(); @@ -21,22 +22,20 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - TensorShape input_shape = - TensorShape{TensorDims{FFOrdered{ - 10_n, - 12_n, - }}, - DataType::FLOAT}; + TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ + 10_n, + 12_n, + }}, + DataType::FLOAT}; InitializerAttrs zero_init = InitializerAttrs{ZeroInitializerAttrs{}}; auto make_layer_attrs = [](auto const &op_attrs) -> ParallelLayerAttrs { return 
ParallelLayerAttrs{ - /*op_attrs=*/PCGOperatorAttrs{op_attrs}, - /*name=*/std::nullopt, + /*op_attrs=*/PCGOperatorAttrs{op_attrs}, + /*name=*/std::nullopt, }; }; - SUBCASE("just a single input") { ParallelComputationGraph pcg = empty_parallel_computation_graph(); @@ -54,44 +53,52 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("single operator plus inputs and weights") { ParallelComputationGraph pcg = empty_parallel_computation_graph(); - ParallelLayerAddedResult input_added = pcg_add_input_layer(pcg, input_shape); + ParallelLayerAddedResult input_added = + pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input = get_only(input_added.outputs); LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, - /*use_bias=*/true, - /*data_type=*/DataType::FLOAT, - /*activation=*/Activation::RELU, - /*regularizer=*/std::nullopt, + /*out_channels=*/14_n, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/Activation::RELU, + /*regularizer=*/std::nullopt, }; - TensorShape projection_weights_shape = throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)); - TensorShape bias_weights_shape = throw_if_unexpected(get_bias_shape(linear_attrs, input_shape)); + TensorShape projection_weights_shape = + throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)); + TensorShape bias_weights_shape = + throw_if_unexpected(get_bias_shape(linear_attrs, input_shape)); WeightAttrs projection_weight_attrs = WeightAttrs{ - /*shape=*/projection_weights_shape, - /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, + /*shape=*/projection_weights_shape, + /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, }; - ParallelLayerAddedResult projection_weights_added = add_parallel_layer(pcg, - /*layer_attrs=*/make_layer_attrs(projection_weight_attrs), - /*inputs=*/{}, - /*weights=*/{}); - parallel_tensor_guid_t t_projection_weights = get_only(projection_weights_added.outputs); + ParallelLayerAddedResult projection_weights_added = add_parallel_layer( + pcg, + /*layer_attrs=*/make_layer_attrs(projection_weight_attrs), + /*inputs=*/{}, + /*weights=*/{}); + parallel_tensor_guid_t t_projection_weights = + get_only(projection_weights_added.outputs); WeightAttrs bias_weight_attrs = WeightAttrs{ - /*shape=*/bias_weights_shape, - /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, + /*shape=*/bias_weights_shape, + /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, }; - ParallelLayerAddedResult bias_weights_added = add_parallel_layer(pcg, - /*layer_attrs=*/make_layer_attrs(bias_weight_attrs), - /*inputs=*/{}, - /*weights=*/{}); - parallel_tensor_guid_t t_bias_weights = get_only(bias_weights_added.outputs); - - ParallelLayerAddedResult linear_added = add_parallel_layer(pcg, - /*layer_attrs=*/make_layer_attrs(linear_attrs), - /*inputs=*/{t_input}, - /*weights=*/{t_projection_weights, t_bias_weights}); + ParallelLayerAddedResult bias_weights_added = add_parallel_layer( + pcg, + /*layer_attrs=*/make_layer_attrs(bias_weight_attrs), + /*inputs=*/{}, + /*weights=*/{}); + parallel_tensor_guid_t t_bias_weights = + get_only(bias_weights_added.outputs); + + ParallelLayerAddedResult linear_added = add_parallel_layer( + pcg, + /*layer_attrs=*/make_layer_attrs(linear_attrs), + /*inputs=*/{t_input}, + /*weights=*/{t_projection_weights, t_bias_weights}); std::optional result = get_pcg_series_parallel_decomposition(pcg); @@ -108,10 +115,11 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("SP without weight nodes but non-SP with weight nodes (parallel op chain 
following is not necessary)") { + SUBCASE("SP without weight nodes but non-SP with weight nodes (parallel op " + "chain following is not necessary)") { // A minimal computation graph where without weights (w1 and w2) the // computation graph is series-parallel, but with weight nodes it is not, - // but parallel op chain following is not necessary + // but parallel op chain following is not necessary // (in this case because there are no parallel ops involved) // // w1 input w2 @@ -121,42 +129,42 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); InputAttrs input_attrs = InputAttrs{ - /*tensor_shape=*/input_shape, + /*tensor_shape=*/input_shape, }; LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, - /*use_bias=*/false, - /*data_type=*/DataType::FLOAT, - /*activation=*/std::nullopt, - /*regularizer=*/std::nullopt, + /*out_channels=*/14_n, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, }; - TensorShape projection_weight_shape = - throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)); + TensorShape projection_weight_shape = + throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)); WeightAttrs projection_weight_attrs = WeightAttrs{ - /*tensor_shape=*/projection_weight_shape, - /*initializer=*/zero_init, + /*tensor_shape=*/projection_weight_shape, + /*initializer=*/zero_init, }; ParallelLayerAddedResult input_added = add_parallel_layer(pcg, make_layer_attrs(input_attrs), {}, {}); parallel_tensor_guid_t t_input = get_only(input_added.outputs); - ParallelLayerAddedResult w1_added = - add_parallel_layer(pcg, make_layer_attrs(projection_weight_attrs), {}, {}); + ParallelLayerAddedResult w1_added = add_parallel_layer( + pcg, make_layer_attrs(projection_weight_attrs), {}, {}); parallel_tensor_guid_t t_w1 = get_only(w1_added.outputs); - ParallelLayerAddedResult w2_added = - add_parallel_layer(pcg, make_layer_attrs(projection_weight_attrs), {}, {}); + ParallelLayerAddedResult w2_added = add_parallel_layer( + pcg, make_layer_attrs(projection_weight_attrs), {}, {}); parallel_tensor_guid_t t_w2 = get_only(w2_added.outputs); - ParallelLayerAddedResult op1_added = - add_parallel_layer(pcg, make_layer_attrs(linear_attrs), {t_input}, {t_w1}); + ParallelLayerAddedResult op1_added = add_parallel_layer( + pcg, make_layer_attrs(linear_attrs), {t_input}, {t_w1}); - ParallelLayerAddedResult op2_added = - add_parallel_layer(pcg, make_layer_attrs(linear_attrs), {t_input}, {t_w2}); + ParallelLayerAddedResult op2_added = add_parallel_layer( + pcg, make_layer_attrs(linear_attrs), {t_input}, {t_w2}); std::optional result = get_pcg_series_parallel_decomposition(pcg); @@ -176,7 +184,8 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("SP without weight nodes but non-SP with weight node (parallel op chain following necessary)") { + SUBCASE("SP without weight nodes but non-SP with weight node (parallel op " + "chain following necessary)") { // A minimal computation graph where without weights (w1 and w2) the // computation graph is series-parallel, but with weight nodes it is not // and parallel op chain following is necessary @@ -195,120 +204,134 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 10_n, - }}, - DataType::FLOAT, + TensorDims{FFOrdered{ + 12_n, + 10_n, + }}, + DataType::FLOAT, }; - ParallelLayerAddedResult 
input_added = pcg_add_input_layer(pcg, input_shape); + ParallelLayerAddedResult input_added = + pcg_add_input_layer(pcg, input_shape); parallel_layer_guid_t layer_input = input_added.parallel_layer; parallel_tensor_guid_t t_input = get_only(input_added.outputs); RepartitionAttrs p2_attrs = RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/3_n, + /*repartition_dim=*/ff_dim_t{0_n}, + /*repartition_degree=*/3_n, }; - ParallelLayerAddedResult p2_added = add_parallel_layer(pcg, make_layer_attrs(p2_attrs), {t_input}, {}); + ParallelLayerAddedResult p2_added = + add_parallel_layer(pcg, make_layer_attrs(p2_attrs), {t_input}, {}); parallel_tensor_guid_t t_p2 = get_only(p2_added.outputs); ParallelLayerAttrs p3_attrs = ParallelLayerAttrs{ - PCGOperatorAttrs{RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{1_n}, - /*repartition_degree=*/2_n, - }}, - /*name=*/std::nullopt, + PCGOperatorAttrs{RepartitionAttrs{ + /*repartition_dim=*/ff_dim_t{1_n}, + /*repartition_degree=*/2_n, + }}, + /*name=*/std::nullopt, }; - ParallelLayerAddedResult p3_added = add_parallel_layer(pcg, p3_attrs, {t_p2}, {}); + ParallelLayerAddedResult p3_added = + add_parallel_layer(pcg, p3_attrs, {t_p2}, {}); parallel_tensor_guid_t t_p3 = get_only(p3_added.outputs); CastAttrs op0_attrs = CastAttrs{ - /*dtype=*/DataType::INT32, + /*dtype=*/DataType::INT32, }; - ParallelLayerAddedResult op0_added = add_parallel_layer(pcg, make_layer_attrs(op0_attrs), {t_p3}, {}); + ParallelLayerAddedResult op0_added = + add_parallel_layer(pcg, make_layer_attrs(op0_attrs), {t_p3}, {}); parallel_tensor_guid_t t_op0 = get_only(op0_added.outputs); EmbeddingAttrs op1_attrs = EmbeddingAttrs{ - /*num_entires=*/100_n, - /*out_channels=*/22_n, - /*aggr=*/AggregateOp::SUM, - /*data_type=*/DataType::FLOAT, + /*num_entires=*/100_n, + /*out_channels=*/22_n, + /*aggr=*/AggregateOp::SUM, + /*data_type=*/DataType::FLOAT, }; - TensorShape casted_input_shape = get_reduced_shape(get_parallel_tensor_shape(pcg, t_op0)); + TensorShape casted_input_shape = + get_reduced_shape(get_parallel_tensor_shape(pcg, t_op0)); WeightAttrs w1_attrs = WeightAttrs{ - /*tensor_shape=*/throw_if_unexpected(get_weights_shape(op1_attrs, casted_input_shape)), - /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, + /*tensor_shape=*/throw_if_unexpected( + get_weights_shape(op1_attrs, casted_input_shape)), + /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, }; - ParallelLayerAddedResult w1_added = add_parallel_layer(pcg, make_layer_attrs(w1_attrs), {}, {}); + ParallelLayerAddedResult w1_added = + add_parallel_layer(pcg, make_layer_attrs(w1_attrs), {}, {}); parallel_tensor_guid_t t_w1 = get_only(w1_added.outputs); ReplicateAttrs p1_attrs = ReplicateAttrs{ - /*replicate_degree=*/6_n, + /*replicate_degree=*/6_n, }; - ParallelLayerAddedResult p1_added = add_parallel_layer(pcg, make_layer_attrs(p1_attrs), {t_w1}, {}); + ParallelLayerAddedResult p1_added = + add_parallel_layer(pcg, make_layer_attrs(p1_attrs), {t_w1}, {}); parallel_tensor_guid_t t_p1 = get_only(p1_added.outputs); - ParallelLayerAddedResult op1_added = add_parallel_layer(pcg, make_layer_attrs(op1_attrs), {t_op0}, {t_p1}); + ParallelLayerAddedResult op1_added = + add_parallel_layer(pcg, make_layer_attrs(op1_attrs), {t_op0}, {t_p1}); LinearAttrs op2_attrs = LinearAttrs{ - /*out_channels=*/14_n, - /*use_bias=*/false, - /*data_type=*/DataType::FLOAT, - /*activation=*/std::nullopt, - /*regularizer=*/std::nullopt, + /*out_channels=*/14_n, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + 
/*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, }; WeightAttrs w2_attrs = WeightAttrs{ - /*tensor_shape=*/throw_if_unexpected(get_projection_shape(op2_attrs, input_shape)), - /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, + /*tensor_shape=*/throw_if_unexpected( + get_projection_shape(op2_attrs, input_shape)), + /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, }; - ParallelLayerAddedResult w2_added = add_parallel_layer(pcg, make_layer_attrs(w2_attrs), {}, {}); + ParallelLayerAddedResult w2_added = + add_parallel_layer(pcg, make_layer_attrs(w2_attrs), {}, {}); parallel_tensor_guid_t t_w2 = get_only(w2_added.outputs); ReplicateAttrs p4_attrs = ReplicateAttrs{ - /*replicate_degree=*/3_n, + /*replicate_degree=*/3_n, }; - ParallelLayerAddedResult p4_added = add_parallel_layer(pcg, make_layer_attrs(p4_attrs), {t_w2}, {}); + ParallelLayerAddedResult p4_added = + add_parallel_layer(pcg, make_layer_attrs(p4_attrs), {t_w2}, {}); parallel_tensor_guid_t t_p4 = get_only(p4_added.outputs); RepartitionAttrs p5_attrs = RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*repartition_dim=*/ff_dim_t{0_n}, + /*repartition_degree=*/2_n, }; - ParallelLayerAddedResult p5_added = add_parallel_layer(pcg, make_layer_attrs(p5_attrs), {t_p4}, {}); + ParallelLayerAddedResult p5_added = + add_parallel_layer(pcg, make_layer_attrs(p5_attrs), {t_p4}, {}); parallel_tensor_guid_t t_p5 = get_only(p5_added.outputs); - ParallelLayerAddedResult op2_added = add_parallel_layer(pcg, make_layer_attrs(op2_attrs), {t_p3}, {t_p5}); + ParallelLayerAddedResult op2_added = + add_parallel_layer(pcg, make_layer_attrs(op2_attrs), {t_p3}, {t_p5}); std::optional result = get_pcg_series_parallel_decomposition(pcg); std::optional correct = SeriesParallelDecomposition{SeriesSplit{{ ParallelSplit{{ - SeriesSplit{{ - w1_added.parallel_layer.raw_graph_node, - p1_added.parallel_layer.raw_graph_node, - }}, - SeriesSplit{{ - input_added.parallel_layer.raw_graph_node, - p2_added.parallel_layer.raw_graph_node, - p3_added.parallel_layer.raw_graph_node, - }}, - SeriesSplit{{ - w2_added.parallel_layer.raw_graph_node, - p4_added.parallel_layer.raw_graph_node, - p5_added.parallel_layer.raw_graph_node, - }}, + SeriesSplit{{ + w1_added.parallel_layer.raw_graph_node, + p1_added.parallel_layer.raw_graph_node, + }}, + SeriesSplit{{ + input_added.parallel_layer.raw_graph_node, + p2_added.parallel_layer.raw_graph_node, + p3_added.parallel_layer.raw_graph_node, + }}, + SeriesSplit{{ + w2_added.parallel_layer.raw_graph_node, + p4_added.parallel_layer.raw_graph_node, + p5_added.parallel_layer.raw_graph_node, + }}, }}, ParallelSplit{{ - SeriesSplit{{ - op0_added.parallel_layer.raw_graph_node, - op1_added.parallel_layer.raw_graph_node, - }}, - op2_added.parallel_layer.raw_graph_node, + SeriesSplit{{ + op0_added.parallel_layer.raw_graph_node, + op1_added.parallel_layer.raw_graph_node, + }}, + op2_added.parallel_layer.raw_graph_node, }}, }}}; @@ -317,14 +340,15 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); InputAttrs input_attrs = InputAttrs{ - /*tensor_shape=*/input_shape, + /*tensor_shape=*/input_shape, }; ElementUnaryAttrs relu_attrs = ElementUnaryAttrs{ - /*op_type=*/OperatorType::RELU, - /*scalar=*/std::nullopt, + /*op_type=*/OperatorType::RELU, + /*scalar=*/std::nullopt, }; - SUBCASE("SP with or without preprocessing, but preprocessing would change resulting SP " + SUBCASE("SP with or without preprocessing, but preprocessing would change " + 
"resulting SP " "decomposition") { // parallel computation graph: // @@ -340,10 +364,10 @@ TEST_SUITE(FF_TEST_SUITE) { add_parallel_layer(pcg, make_layer_attrs(input_attrs), {}, {}); parallel_tensor_guid_t t_input2 = get_only(input2_added.outputs); - ParallelLayerAddedResult op1_added = + ParallelLayerAddedResult op1_added = add_parallel_layer(pcg, make_layer_attrs(relu_attrs), {t_input1}, {}); - ParallelLayerAddedResult op2_added = + ParallelLayerAddedResult op2_added = add_parallel_layer(pcg, make_layer_attrs(relu_attrs), {t_input2}, {}); std::optional result = @@ -378,25 +402,25 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t_input1 = get_only(input1_added.outputs); ElementBinaryAttrs ew_add_attrs = ElementBinaryAttrs{ - /*type=*/OperatorType::EW_ADD, - /*compute_type=*/DataType::FLOAT, - /*should_broadcast_lhs=*/false, - /*should_broadcast_rhs=*/false, + /*type=*/OperatorType::EW_ADD, + /*compute_type=*/DataType::FLOAT, + /*should_broadcast_lhs=*/false, + /*should_broadcast_rhs=*/false, }; - ParallelLayerAddedResult op1_added = + ParallelLayerAddedResult op1_added = add_parallel_layer(pcg, make_layer_attrs(relu_attrs), {t_input1}, {}); parallel_tensor_guid_t t_op1 = get_only(op1_added.outputs); - ParallelLayerAddedResult op2_added = + ParallelLayerAddedResult op2_added = add_parallel_layer(pcg, make_layer_attrs(relu_attrs), {t_input1}, {}); parallel_tensor_guid_t t_op2 = get_only(op2_added.outputs); - ParallelLayerAddedResult op3_added = + ParallelLayerAddedResult op3_added = add_parallel_layer(pcg, make_layer_attrs(relu_attrs), {t_op1}, {}); - ParallelLayerAddedResult op4_added = - add_parallel_layer(pcg, make_layer_attrs(ew_add_attrs), {t_op1, t_op2}, {}); + ParallelLayerAddedResult op4_added = add_parallel_layer( + pcg, make_layer_attrs(ew_add_attrs), {t_op1, t_op2}, {}); std::optional result = get_pcg_series_parallel_decomposition(pcg); diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc index 6a8bedccaf..f320e45d06 100644 --- a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc +++ b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc @@ -47,13 +47,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("linear graph") { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, - 7_n, + TensorDims{ + FFOrdered{ + 10_n, + 7_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; parallel_tensor_guid_t tensor0 = b.create_input_tensor(input_shape); parallel_tensor_guid_t tensor1 = b.relu(tensor0); @@ -126,13 +126,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, - 1_n, + TensorDims{ + FFOrdered{ + 10_n, + 1_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; parallel_tensor_guid_t tensor0 = b.create_input_tensor(input_shape); diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc index 8a1a4bb495..5c00ce1558 100644 --- a/lib/compiler/test/src/graph_optimize_state.cc +++ b/lib/compiler/test/src/graph_optimize_state.cc @@ -7,13 +7,13 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("GraphOptimizeState::operator==") { TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 32_n, - 16_n, + TensorDims{ + FFOrdered{ + 32_n, + 16_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; // ParallelTensorShape input_shape = // 
ParallelTensorShape{ParallelTensorDims{ @@ -70,13 +70,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg2 = create_pcg(); GraphOptimizeState state1 = GraphOptimizeState{ - GraphOptimizeResult{pcg1, empty_machine_mapping}, - 0, + GraphOptimizeResult{pcg1, empty_machine_mapping}, + 0, }; GraphOptimizeState state2 = GraphOptimizeState{ - GraphOptimizeResult{pcg2, empty_machine_mapping}, - 0, + GraphOptimizeResult{pcg2, empty_machine_mapping}, + 0, }; CHECK(state1 == state2); @@ -89,24 +89,24 @@ TEST_SUITE(FF_TEST_SUITE) { builder_.create_input_tensor(input_shape, "input0"); parallel_tensor_guid_t dense0_ = builder_.dense(/*input=*/input0_, - /*outDim=*/8_n, - /*activation=*/Activation::RELU, - /*use_bias=*/true, - /*data_type=*/DataType::FLOAT, - /*projection_initializer=*/zero_init, - /*bias_initializer=*/zero_init, - /*name=*/"dense0"); + /*outDim=*/8_n, + /*activation=*/Activation::RELU, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*projection_initializer=*/zero_init, + /*bias_initializer=*/zero_init, + /*name=*/"dense0"); ParallelComputationGraph pcg_ = builder_.pcg; GraphOptimizeState state1 = GraphOptimizeState{ - GraphOptimizeResult{pcg1, empty_machine_mapping}, - 0, + GraphOptimizeResult{pcg1, empty_machine_mapping}, + 0, }; GraphOptimizeState state_ = GraphOptimizeState{ - GraphOptimizeResult{pcg_, empty_machine_mapping}, - 0, + GraphOptimizeResult{pcg_, empty_machine_mapping}, + 0, }; CHECK_FALSE(state1 == state_); diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 8ff1e8c216..fe32f41561 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -54,7 +54,8 @@ CostDetails LocalCostEstimator::estimate_cost( ComputationGraph cg = make_empty_computation_graph(); for (ParallelTensorShape const &input : inputs) { TensorShape tensor_shape = get_piece_shape(input); - tensor_guid_t tensor_id = get_only(add_input_layer(cg, tensor_shape).outputs); + tensor_guid_t tensor_id = + get_only(add_input_layer(cg, tensor_shape).outputs); GenericTensorAccessorW tensor_backing = allocator.allocate_tensor(tensor_shape); tensor_backing_map.insert({tensor_id, tensor_backing}); @@ -69,31 +70,34 @@ CostDetails LocalCostEstimator::estimate_cost( }; // add operator to graph - std::vector weight_shapes = get_weight_shapes(layer_attrs.op_attrs, - transform(inputs, get_piece_shape)); - - std::vector weight_tensor_ids = transform(weight_shapes, - [&](TensorShape const &tensor_shape) { - LayerAttrs attrs = LayerAttrs{ - ComputationGraphOpAttrs{ - WeightAttrs{ - /*tensor_shape=*/tensor_shape, - /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, - }, - }, - /*name=*/std::nullopt, - }; - - return get_only(add_layer(cg, attrs, /*inputs=*/{}, /*weights=*/{}).outputs); - }); + std::vector weight_shapes = get_weight_shapes( + layer_attrs.op_attrs, transform(inputs, get_piece_shape)); + + std::vector weight_tensor_ids = + transform(weight_shapes, [&](TensorShape const &tensor_shape) { + LayerAttrs attrs = LayerAttrs{ + ComputationGraphOpAttrs{ + WeightAttrs{ + /*tensor_shape=*/tensor_shape, + /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, + }, + }, + /*name=*/std::nullopt, + }; + + return get_only( + add_layer(cg, attrs, /*inputs=*/{}, /*weights=*/{}).outputs); + }); std::vector output_tensor_ids = - add_layer(cg, layer_attrs, /*inputs=*/input_tensor_ids, /*weights=*/weight_tensor_ids).outputs; - - LocalTrainingBacking local_backing(allocator, - cg, - 
tensor_backing_map, - this->runtime_arg_config); + add_layer(cg, + layer_attrs, + /*inputs=*/input_tensor_ids, + /*weights=*/weight_tensor_ids) + .outputs; + + LocalTrainingBacking local_backing( + allocator, cg, tensor_backing_map, this->runtime_arg_config); local_backing.execute_init(); PerLayerElapsedTime fwd = local_backing.execute_forward(); diff --git a/lib/models/src/models/bert/bert.cc b/lib/models/src/models/bert/bert.cc index bbe9964692..535e03e413 100644 --- a/lib/models/src/models/bert/bert.cc +++ b/lib/models/src/models/bert/bert.cc @@ -1,7 +1,7 @@ #include "models/bert/bert.h" +#include "op-attrs/initializers/truncated_normal_initializer_attrs.dtg.h" #include "op-attrs/tensor_shape.h" #include "pcg/computation_graph.h" -#include "op-attrs/initializers/truncated_normal_initializer_attrs.dtg.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h b/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h index 147541f16b..bd95ff677c 100644 --- a/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h +++ b/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h @@ -1,18 +1,20 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_INITIALIZERS_KAIMING_INITIALIZER_MODE_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_INITIALIZERS_KAIMING_INITIALIZER_MODE_H +#include "op-attrs/initializers/kaiming_initializer_mode.dtg.h" #include "op-attrs/tensor_dims.dtg.h" #include "utils/nonnegative_int/nonnegative_int.h" -#include "op-attrs/initializers/kaiming_initializer_mode.dtg.h" namespace FlexFlow { /** * @brief `fan_in` and `fan_out` calculation from pytorch * - * see https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L345-L363 + * see + * https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L345-L363 */ -nonnegative_int calculate_fan_for_mode(TensorDims const &dims, KaimingInitializerMode mode); +nonnegative_int calculate_fan_for_mode(TensorDims const &dims, + KaimingInitializerMode mode); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h index 5268daca8e..fa57a717e2 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention.h +++ b/lib/op-attrs/include/op-attrs/ops/attention.h @@ -112,15 +112,16 @@ tl::expected, std::string> ParallelTensorShape const &input_k, ParallelTensorShape const &input_v); -tl::expected, std::string> - get_initializers(MultiHeadAttentionAttrs const &, - TensorShape const &input_q, - TensorShape const &input_k, - TensorShape const &input_v, - std::optional const &weights_initializer = std::nullopt, - std::optional const &input_bias_initializer = std::nullopt, - std::optional const &output_bias_initializer = std::nullopt); - +tl::expected, std::string> get_initializers( + MultiHeadAttentionAttrs const &, + TensorShape const &input_q, + TensorShape const &input_k, + TensorShape const &input_v, + std::optional const &weights_initializer = std::nullopt, + std::optional const &input_bias_initializer = + std::nullopt, + std::optional const &output_bias_initializer = + std::nullopt); CHECK_VALID_OP_ATTR(MultiHeadAttentionAttrs); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/ops/batch_norm.h b/lib/op-attrs/include/op-attrs/ops/batch_norm.h index 08c3ff03bd..bcf6794f38 100644 --- a/lib/op-attrs/include/op-attrs/ops/batch_norm.h +++ b/lib/op-attrs/include/op-attrs/ops/batch_norm.h @@ -22,7 +22,8 
@@ tl::expected get_beta_weights_shape(BatchNormAttrs const &, TensorShape const &); tl::expected, std::string> - get_weight_shapes(BatchNormAttrs const &attrs, TensorShape const &input_shape); + get_weight_shapes(BatchNormAttrs const &attrs, + TensorShape const &input_shape); tl::expected get_output_parallel_dim_degrees(BatchNormAttrs const &, @@ -35,7 +36,9 @@ tl::expected ParallelTensorDimDegrees const &); tl::expected, std::string> - get_weight_parallel_dim_degrees(BatchNormAttrs const &attrs, ParallelTensorDimDegrees const &input_degrees); + get_weight_parallel_dim_degrees( + BatchNormAttrs const &attrs, + ParallelTensorDimDegrees const &input_degrees); tl::expected get_output_shape(BatchNormAttrs const &, ParallelTensorShape const &); @@ -46,15 +49,17 @@ tl::expected get_beta_weights_shape(BatchNormAttrs const &, ParallelTensorShape const &); tl::expected, std::string> - get_weight_shapes(BatchNormAttrs const &attrs, ParallelTensorShape const &input_shape); + get_weight_shapes(BatchNormAttrs const &attrs, + ParallelTensorShape const &input_shape); /** * @brief Chosen to match pytorch * - * see https://github.com/pytorch/pytorch/blob/1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340/torch/nn/modules/batchnorm.py#L93-L97 + * see + * https://github.com/pytorch/pytorch/blob/1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340/torch/nn/modules/batchnorm.py#L93-L97 */ tl::expected, std::string> - get_initializers(BatchNormAttrs const &attrs); + get_initializers(BatchNormAttrs const &attrs); CHECK_VALID_OP_ATTR(BatchNormAttrs); diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d.h b/lib/op-attrs/include/op-attrs/ops/conv_2d.h index 5797fa7420..e4c7467de2 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d.h +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d.h @@ -31,13 +31,15 @@ ParallelTensorShape get_bias_shape(Conv2DAttrs const &attrs, ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs, ParallelTensorShape const &input_shape); -std::vector get_weight_shapes(Conv2DAttrs const &attrs, - ParallelTensorShape const &input_shape); - -std::vector get_initializers(Conv2DAttrs const &attrs, - TensorShape const &input_shape, - std::optional kernel_initializer = std::nullopt, - std::optional bias_initializer = std::nullopt); +std::vector + get_weight_shapes(Conv2DAttrs const &attrs, + ParallelTensorShape const &input_shape); + +std::vector get_initializers( + Conv2DAttrs const &attrs, + TensorShape const &input_shape, + std::optional kernel_initializer = std::nullopt, + std::optional bias_initializer = std::nullopt); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/ops/embedding.h b/lib/op-attrs/include/op-attrs/ops/embedding.h index 3e5ed05170..d44adf5f54 100644 --- a/lib/op-attrs/include/op-attrs/ops/embedding.h +++ b/lib/op-attrs/include/op-attrs/ops/embedding.h @@ -28,9 +28,12 @@ tl::expected /** * @brief Chosen to match pytorch * - * see https://github.com/pytorch/pytorch/blob/1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340/torch/nn/modules/sparse.py#L180-L182 + * see + * https://github.com/pytorch/pytorch/blob/1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340/torch/nn/modules/sparse.py#L180-L182 */ -std::vector get_initializers(EmbeddingAttrs const &, std::optional const &initializer_attrs = std::nullopt); +std::vector get_initializers( + EmbeddingAttrs const &, + std::optional const &initializer_attrs = std::nullopt); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/ops/layer_norm.h b/lib/op-attrs/include/op-attrs/ops/layer_norm.h index f3001a9bd4..4dcbeb665e 100644 --- 
a/lib/op-attrs/include/op-attrs/ops/layer_norm.h +++ b/lib/op-attrs/include/op-attrs/ops/layer_norm.h @@ -21,7 +21,8 @@ tl::expected get_beta_weights_shape(LayerNormAttrs const &, TensorShape const &); tl::expected, std::string> - get_weight_shapes(LayerNormAttrs const &attrs, TensorShape const &input_shape); + get_weight_shapes(LayerNormAttrs const &attrs, + TensorShape const &input_shape); tl::expected get_output_shape(LayerNormAttrs const &, ParallelTensorShape const &); @@ -32,12 +33,14 @@ tl::expected get_beta_weights_shape(LayerNormAttrs const &, ParallelTensorShape const &); tl::expected, std::string> - get_weight_shapes(LayerNormAttrs const &attrs, ParallelTensorShape const &input_shape); + get_weight_shapes(LayerNormAttrs const &attrs, + ParallelTensorShape const &input_shape); /** * @brief Chosen to match pytorch * - * see https://github.com/pytorch/pytorch/blob/1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340/torch/nn/modules/normalization.py#L210-L214 + * see + * https://github.com/pytorch/pytorch/blob/1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340/torch/nn/modules/normalization.py#L210-L214 */ std::vector get_initializers(LayerNormAttrs const &attrs); diff --git a/lib/op-attrs/include/op-attrs/ops/linear.h b/lib/op-attrs/include/op-attrs/ops/linear.h index 545c6f70c9..107f772e03 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear.h +++ b/lib/op-attrs/include/op-attrs/ops/linear.h @@ -39,13 +39,15 @@ tl::expected ParallelTensorShape const &input); tl::expected, std::string> - get_weight_shapes(LinearAttrs const &attrs, ParallelTensorShape const &input_shape); - -tl::expected, std::string> - get_initializers(LinearAttrs const &, - TensorShape const &input_shape, - std::optional const &projection_initializer = std::nullopt, - std::optional const &kernel_initializer = std::nullopt); + get_weight_shapes(LinearAttrs const &attrs, + ParallelTensorShape const &input_shape); + +tl::expected, std::string> get_initializers( + LinearAttrs const &, + TensorShape const &input_shape, + std::optional const &projection_initializer = + std::nullopt, + std::optional const &kernel_initializer = std::nullopt); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/shape_inference.h b/lib/op-attrs/include/op-attrs/shape_inference.h index 476bc4f39e..8c679f442a 100644 --- a/lib/op-attrs/include/op-attrs/shape_inference.h +++ b/lib/op-attrs/include/op-attrs/shape_inference.h @@ -20,7 +20,7 @@ std::vector get_output_shapes(PCGOperatorAttrs const &, std::vector const &input_shapes); -std::vector +std::vector get_weight_shapes(PCGOperatorAttrs const &, std::vector const &input_shapes); diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index 98217d4252..97f3432c2f 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h @@ -18,7 +18,9 @@ bool tensor_dims_is_broadcastable_to(TensorDims const &curr, std::optional get_broadcast_target_dims(std::unordered_set const &); -TensorDims slice_tensor_dims(TensorDims const &, std::optional const &start, std::optional const &stop); +TensorDims slice_tensor_dims(TensorDims const &, + std::optional const &start, + std::optional const &stop); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index 95b3453ecf..a3cd8bfd9a 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -11,7 +11,9 @@ nonnegative_int &dim_at_idx(TensorShape &, 
relative_ff_dim_t); nonnegative_int get_num_elements(TensorShape const &); nonnegative_int get_size_in_bytes(TensorShape const &); -TensorShape slice_tensor_shape(TensorShape const &, std::optional const &start, std::optional const &stop); +TensorShape slice_tensor_shape(TensorShape const &, + std::optional const &start, + std::optional const &stop); } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/initializer_attrs.cc b/lib/op-attrs/src/op-attrs/initializer_attrs.cc index 986dde446e..7635f170a0 100644 --- a/lib/op-attrs/src/op-attrs/initializer_attrs.cc +++ b/lib/op-attrs/src/op-attrs/initializer_attrs.cc @@ -8,53 +8,59 @@ InitializerAttrs make_zero_initializer() { } // fan_in and fan_out calculation from pytorch -// see https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L345-L363 -static nonnegative_int calculate_fan_for_mode(TensorDims const &dims, KaimingInitializerMode mode) { +// see +// https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L345-L363 +static nonnegative_int calculate_fan_for_mode(TensorDims const &dims, + KaimingInitializerMode mode) { nonnegative_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0}); nonnegative_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1}); - nonnegative_int receptive_field_size = get_num_elements(slice_tensor_dims(dims, relative_ff_dim_t{2}, std::nullopt)); + nonnegative_int receptive_field_size = get_num_elements( + slice_tensor_dims(dims, relative_ff_dim_t{2}, std::nullopt)); if (mode == KaimingInitializerMode::FAN_IN) { return num_input_fmaps * receptive_field_size; } else { - assert (mode == KaimingInitializerMode::FAN_OUT); + assert(mode == KaimingInitializerMode::FAN_OUT); return num_output_fmaps * receptive_field_size; } } // from pytorch: -// see https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L72-L139 -static float gain_for_nonlinearity(KaimingInitializerNonlinearity nonlinearity, std::optional negative_slope = std::nullopt) { +// see +// https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L72-L139 +static float + gain_for_nonlinearity(KaimingInitializerNonlinearity nonlinearity, + std::optional negative_slope = std::nullopt) { if (nonlinearity == KaimingInitializerNonlinearity::RELU) { - assert (!negative_slope.has_value()); + assert(!negative_slope.has_value()); return sqrtf(2.0); } else { - assert (nonlinearity == KaimingInitializerNonlinearity::LEAKY_RELU); + assert(nonlinearity == KaimingInitializerNonlinearity::LEAKY_RELU); return sqrtf(2.0 / (1 + negative_slope.value() * negative_slope.value())); } } // from pytorch: -// see https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L456-L518 -InitializerAttrs kaiming_uniform( - TensorDims const &dims, - float a, - KaimingInitializerMode mode, - KaimingInitializerNonlinearity nonlinearity, - int seed) { - +// see +// https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L456-L518 +InitializerAttrs kaiming_uniform(TensorDims const &dims, + float a, + KaimingInitializerMode mode, + KaimingInitializerNonlinearity nonlinearity, + int seed) { + nonnegative_int fan = calculate_fan_for_mode(dims, mode); float gain = gain_for_nonlinearity(nonlinearity, a); float std = gain / sqrtf(static_cast(fan.unwrap_nonnegative())); float bound = sqrtf(3.0) * std; return 
InitializerAttrs{UniformInitializerAttrs{ - /*seed=*/seed, - /*min_val=*/-bound, - /*max_val=*/bound, + /*seed=*/seed, + /*min_val=*/-bound, + /*max_val=*/bound, }}; } diff --git a/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc b/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc index 0582b17441..b3d6e93c25 100644 --- a/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc +++ b/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc @@ -3,16 +3,18 @@ namespace FlexFlow { -nonnegative_int calculate_fan_for_mode(TensorDims const &dims, KaimingInitializerMode mode) { +nonnegative_int calculate_fan_for_mode(TensorDims const &dims, + KaimingInitializerMode mode) { nonnegative_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0}); nonnegative_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1}); - nonnegative_int receptive_field_size = get_num_elements(slice_tensor_dims(dims, relative_ff_dim_t{2}, std::nullopt)); + nonnegative_int receptive_field_size = get_num_elements( + slice_tensor_dims(dims, relative_ff_dim_t{2}, std::nullopt)); if (mode == KaimingInitializerMode::FAN_IN) { return num_input_fmaps * receptive_field_size; } else { - assert (mode == KaimingInitializerMode::FAN_OUT); + assert(mode == KaimingInitializerMode::FAN_OUT); return num_output_fmaps * receptive_field_size; } diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 8400d71164..07d4f3e287 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -98,7 +98,8 @@ nonnegative_int get_num_samples(MultiHeadAttentionInputs const &inputs) { static void check_attrs(MultiHeadAttentionAttrs const &attrs) { if (attrs.add_bias_kv) { - throw mk_runtime_error("add_bias_kv is not yet supported. If you need this functionality, please create an issue."); + throw mk_runtime_error("add_bias_kv is not yet supported. 
If you need this " + "functionality, please create an issue."); } } @@ -240,12 +241,14 @@ tl::expected, std::string> TensorShape const &input_v) { std::vector weight_shapes = { - PROPAGATE_ERR(get_weights_shape(attrs, input_q, input_k, input_v)), + PROPAGATE_ERR(get_weights_shape(attrs, input_q, input_k, input_v)), }; if (attrs.bias) { - weight_shapes.push_back(PROPAGATE_ERR(get_input_bias_shape(attrs, input_q, input_k, input_v))); - weight_shapes.push_back(PROPAGATE_ERR(get_output_bias_shape(attrs, input_q, input_k, input_v))); + weight_shapes.push_back( + PROPAGATE_ERR(get_input_bias_shape(attrs, input_q, input_k, input_v))); + weight_shapes.push_back( + PROPAGATE_ERR(get_output_bias_shape(attrs, input_q, input_k, input_v))); } return weight_shapes; @@ -329,7 +332,7 @@ tl::expected ParallelTensorShape const &input_k, ParallelTensorShape const &input_v) { check_attrs(attrs); - + MultiHeadAttentionParallelInputs parsed = ({ tl::expected parse_result = parse_attention_parallel_input_shape(input_q, input_k, input_v); @@ -413,69 +416,75 @@ tl::expected, std::string> ParallelTensorShape const &input_v) { std::vector weight_shapes = { - PROPAGATE_ERR(get_weights_shape(attrs, input_q, input_k, input_v)), + PROPAGATE_ERR(get_weights_shape(attrs, input_q, input_k, input_v)), }; if (attrs.bias) { - weight_shapes.push_back(PROPAGATE_ERR(get_input_bias_shape(attrs, input_q, input_k, input_v))); - weight_shapes.push_back(PROPAGATE_ERR(get_output_bias_shape(attrs, input_q, input_k, input_v))); + weight_shapes.push_back( + PROPAGATE_ERR(get_input_bias_shape(attrs, input_q, input_k, input_v))); + weight_shapes.push_back( + PROPAGATE_ERR(get_output_bias_shape(attrs, input_q, input_k, input_v))); } return weight_shapes; } - -tl::expected, std::string> - get_initializers(MultiHeadAttentionAttrs const &attrs, - TensorShape const &input_q, - TensorShape const &input_k, - TensorShape const &input_v, - std::optional const &maybe_weights_initializer, - std::optional const &maybe_input_bias_initializer, - std::optional const &maybe_output_bias_initializer) { +tl::expected, std::string> get_initializers( + MultiHeadAttentionAttrs const &attrs, + TensorShape const &input_q, + TensorShape const &input_k, + TensorShape const &input_v, + std::optional const &maybe_weights_initializer, + std::optional const &maybe_input_bias_initializer, + std::optional const &maybe_output_bias_initializer) { check_attrs(attrs); if (!attrs.bias && maybe_input_bias_initializer.has_value()) { - return tl::unexpected(fmt::format("Expected input_bias_initializer=std::nullopt since bias=false, but received input_bias_initializer: {}", maybe_input_bias_initializer.value())); + return tl::unexpected( + fmt::format("Expected input_bias_initializer=std::nullopt since " + "bias=false, but received input_bias_initializer: {}", + maybe_input_bias_initializer.value())); } if (!attrs.bias && maybe_output_bias_initializer.has_value()) { - return tl::unexpected(fmt::format("Expected output_bias_initializer=std::nullopt since bias=false, but received output_bias_initializer: {}", maybe_output_bias_initializer.value())); + return tl::unexpected( + fmt::format("Expected output_bias_initializer=std::nullopt since " + "bias=false, but received output_bias_initializer: {}", + maybe_output_bias_initializer.value())); } InitializerAttrs default_weights_initializer = InitializerAttrs{ - GlorotUniformAttrs{ - /*seed=*/0, - }, + GlorotUniformAttrs{ + /*seed=*/0, + }, }; InitializerAttrs default_input_bias_initializer = InitializerAttrs{ - ZeroInitializerAttrs{}, 
+ ZeroInitializerAttrs{}, }; InitializerAttrs default_output_bias_initializer = InitializerAttrs{ - ZeroInitializerAttrs{}, + ZeroInitializerAttrs{}, }; - InitializerAttrs weights_initializer = + InitializerAttrs weights_initializer = maybe_weights_initializer.value_or(default_weights_initializer); - InitializerAttrs input_bias_initializer = + InitializerAttrs input_bias_initializer = maybe_input_bias_initializer.value_or(default_input_bias_initializer); - InitializerAttrs output_bias_initializer = + InitializerAttrs output_bias_initializer = maybe_output_bias_initializer.value_or(default_output_bias_initializer); if (attrs.bias) { return std::vector{ - weights_initializer, - input_bias_initializer, - output_bias_initializer, + weights_initializer, + input_bias_initializer, + output_bias_initializer, }; } else { return std::vector{ - weights_initializer, + weights_initializer, }; } } - } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index bf3d1fbd8f..d4763ef004 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -90,14 +90,17 @@ tl::expected } tl::expected, std::string> - get_weight_shapes(BatchNormAttrs const &attrs, TensorShape const &input_shape) { - - TensorShape gamma_shape = PROPAGATE_ERR(get_gamma_weights_shape(attrs, input_shape)); - TensorShape beta_shape = PROPAGATE_ERR(get_beta_weights_shape(attrs, input_shape)); + get_weight_shapes(BatchNormAttrs const &attrs, + TensorShape const &input_shape) { + + TensorShape gamma_shape = + PROPAGATE_ERR(get_gamma_weights_shape(attrs, input_shape)); + TensorShape beta_shape = + PROPAGATE_ERR(get_beta_weights_shape(attrs, input_shape)); return std::vector{ - gamma_shape, - beta_shape, + gamma_shape, + beta_shape, }; } @@ -195,18 +198,21 @@ tl::expected } tl::expected, std::string> - get_weight_parallel_dim_degrees(BatchNormAttrs const &attrs, ParallelTensorDimDegrees const &input_degrees) { - - ParallelTensorDimDegrees gamma_degrees = PROPAGATE_ERR(get_gamma_weights_parallel_dim_degrees(attrs, input_degrees)); - ParallelTensorDimDegrees beta_degrees = PROPAGATE_ERR(get_beta_weights_parallel_dim_degrees(attrs, input_degrees)); + get_weight_parallel_dim_degrees( + BatchNormAttrs const &attrs, + ParallelTensorDimDegrees const &input_degrees) { + + ParallelTensorDimDegrees gamma_degrees = PROPAGATE_ERR( + get_gamma_weights_parallel_dim_degrees(attrs, input_degrees)); + ParallelTensorDimDegrees beta_degrees = PROPAGATE_ERR( + get_beta_weights_parallel_dim_degrees(attrs, input_degrees)); return std::vector{ - gamma_degrees, - beta_degrees, + gamma_degrees, + beta_degrees, }; } - tl::expected get_output_shape(BatchNormAttrs const &attrs, ParallelTensorShape const &input_shape) { @@ -285,19 +291,22 @@ tl::expected } tl::expected, std::string> - get_weight_shapes(BatchNormAttrs const &attrs, ParallelTensorShape const &input_shape) { - - ParallelTensorShape gamma_shape = PROPAGATE_ERR(get_gamma_weights_shape(attrs, input_shape)); - ParallelTensorShape beta_shape = PROPAGATE_ERR(get_beta_weights_shape(attrs, input_shape)); + get_weight_shapes(BatchNormAttrs const &attrs, + ParallelTensorShape const &input_shape) { + + ParallelTensorShape gamma_shape = + PROPAGATE_ERR(get_gamma_weights_shape(attrs, input_shape)); + ParallelTensorShape beta_shape = + PROPAGATE_ERR(get_beta_weights_shape(attrs, input_shape)); return std::vector{ - gamma_shape, - beta_shape, + gamma_shape, + beta_shape, }; } tl::expected, std::string> - 
get_initializers(BatchNormAttrs const &attrs) { + get_initializers(BatchNormAttrs const &attrs) { if (attrs.affine) { InitializerAttrs gamma_initializer = InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc index d11ea6ed02..902417d050 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc @@ -2,8 +2,8 @@ #include "op-attrs/initializers/kaiming_initializer_mode.h" #include "op-attrs/ops/conv_2d/conv_2d_input_shape.h" #include "op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.h" -#include "utils/integer_conversions.h" #include "utils/fmt/optional.h" +#include "utils/integer_conversions.h" namespace FlexFlow { @@ -91,7 +91,7 @@ TensorShape get_output_shape(Conv2DAttrs const &attrs, std::vector get_weight_shapes(Conv2DAttrs const &attrs, TensorShape const &input_shape) { std::vector weight_shapes = { - get_kernel_shape(attrs, input_shape), + get_kernel_shape(attrs, input_shape), }; if (attrs.use_bias) { @@ -172,10 +172,11 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs, unpar, sum_degree, discard_copy_degree, shard_degrees); } -std::vector get_weight_shapes(Conv2DAttrs const &attrs, - ParallelTensorShape const &input_shape) { +std::vector + get_weight_shapes(Conv2DAttrs const &attrs, + ParallelTensorShape const &input_shape) { std::vector weight_shapes = { - get_kernel_shape(attrs, input_shape), + get_kernel_shape(attrs, input_shape), }; if (attrs.use_bias) { @@ -188,41 +189,49 @@ std::vector get_weight_shapes(Conv2DAttrs const &attrs, /** * @brief Chosen to match pytorch implementation * - * see https://github.com/pytorch/pytorch/blob/1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340/torch/nn/modules/conv.py#L178-L187 + * see + * https://github.com/pytorch/pytorch/blob/1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340/torch/nn/modules/conv.py#L178-L187 */ -std::vector get_initializers( - Conv2DAttrs const &attrs, - TensorShape const &input_shape, - std::optional maybe_kernel_initializer, - std::optional maybe_bias_initializer) { +std::vector + get_initializers(Conv2DAttrs const &attrs, + TensorShape const &input_shape, + std::optional maybe_kernel_initializer, + std::optional maybe_bias_initializer) { if (!attrs.use_bias && maybe_bias_initializer.has_value()) { - throw mk_runtime_error(fmt::format("Unexpectedly received bias initializer while use_bias=false: {}", maybe_bias_initializer)); + throw mk_runtime_error(fmt::format( + "Unexpectedly received bias initializer while use_bias=false: {}", + maybe_bias_initializer)); } TensorShape kernel_shape = get_kernel_shape(attrs, input_shape); - InitializerAttrs kernel_default_initializer = InitializerAttrs{KaimingNormalAttrs{ - /*a=*/sqrtf(5.0), - /*mode=*/KaimingInitializerMode::FAN_IN, - /*nonlinearity=*/KaimingInitializerNonlinearity::LEAKY_RELU, - /*seed=*/0, - }}; + InitializerAttrs kernel_default_initializer = + InitializerAttrs{KaimingNormalAttrs{ + /*a=*/sqrtf(5.0), + /*mode=*/KaimingInitializerMode::FAN_IN, + /*nonlinearity=*/KaimingInitializerNonlinearity::LEAKY_RELU, + /*seed=*/0, + }}; - InitializerAttrs kernel_initializer = maybe_kernel_initializer.value_or(kernel_default_initializer); + InitializerAttrs kernel_initializer = + maybe_kernel_initializer.value_or(kernel_default_initializer); - nonnegative_int fan_in = calculate_fan_for_mode(kernel_shape.dims, KaimingInitializerMode::FAN_IN); - assert (fan_in != 0_n); + nonnegative_int fan_in = + 
calculate_fan_for_mode(kernel_shape.dims, KaimingInitializerMode::FAN_IN); + assert(fan_in != 0_n); float bound = 1 / sqrtf(static_cast(fan_in.unwrap_nonnegative())); - InitializerAttrs bias_default_initializer = InitializerAttrs{UniformInitializerAttrs{ - /*seed=*/0, - /*min_val=*/-bound, - /*max_val=*/bound, - }}; + InitializerAttrs bias_default_initializer = + InitializerAttrs{UniformInitializerAttrs{ + /*seed=*/0, + /*min_val=*/-bound, + /*max_val=*/bound, + }}; - InitializerAttrs bias_initializer = maybe_bias_initializer.value_or(bias_default_initializer); + InitializerAttrs bias_initializer = + maybe_bias_initializer.value_or(bias_default_initializer); if (attrs.use_bias) { return {kernel_initializer, bias_initializer}; diff --git a/lib/op-attrs/src/op-attrs/ops/element_unary.cc b/lib/op-attrs/src/op-attrs/ops/element_unary.cc index 5aebb0e939..fd65e1f5c9 100644 --- a/lib/op-attrs/src/op-attrs/ops/element_unary.cc +++ b/lib/op-attrs/src/op-attrs/ops/element_unary.cc @@ -5,8 +5,8 @@ namespace FlexFlow { ElementUnaryAttrs make_relu_attrs() { return ElementUnaryAttrs{ - /*op_type=*/OperatorType::RELU, - /*scalar=*/std::nullopt, + /*op_type=*/OperatorType::RELU, + /*scalar=*/std::nullopt, }; } diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index 39802c0c8f..4dc602646b 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -127,19 +127,18 @@ tl::expected unpar, sum_degree, discard_copy_degree, shard_degrees); } -std::vector get_initializers(EmbeddingAttrs const &, - std::optional const &maybe_initializer_attrs) { +std::vector get_initializers( + EmbeddingAttrs const &, + std::optional const &maybe_initializer_attrs) { InitializerAttrs default_initializer_attrs = InitializerAttrs{ - NormInitializerAttrs{ - /*seed=*/0, - /*mean=*/0.0, - /*stddev=*/1.0, - }, + NormInitializerAttrs{ + /*seed=*/0, + /*mean=*/0.0, + /*stddev=*/1.0, + }, }; - return { - maybe_initializer_attrs.value_or(default_initializer_attrs) - }; + return {maybe_initializer_attrs.value_or(default_initializer_attrs)}; } } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 4d29f4cc41..00c6bb5e9b 100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -96,14 +96,17 @@ tl::expected } tl::expected, std::string> - get_weight_shapes(LayerNormAttrs const &attrs, TensorShape const &input_shape) { - - TensorShape gamma_shape = PROPAGATE_ERR(get_gamma_weights_shape(attrs, input_shape)); - TensorShape beta_shape = PROPAGATE_ERR(get_beta_weights_shape(attrs, input_shape)); + get_weight_shapes(LayerNormAttrs const &attrs, + TensorShape const &input_shape) { + + TensorShape gamma_shape = + PROPAGATE_ERR(get_gamma_weights_shape(attrs, input_shape)); + TensorShape beta_shape = + PROPAGATE_ERR(get_beta_weights_shape(attrs, input_shape)); return std::vector{ - gamma_shape, - beta_shape, + gamma_shape, + beta_shape, }; } @@ -207,14 +210,17 @@ tl::expected } tl::expected, std::string> - get_weight_shapes(LayerNormAttrs const &attrs, ParallelTensorShape const &input_shape) { + get_weight_shapes(LayerNormAttrs const &attrs, + ParallelTensorShape const &input_shape) { - ParallelTensorShape gamma_shape = PROPAGATE_ERR(get_gamma_weights_shape(attrs, input_shape)); - ParallelTensorShape beta_shape = PROPAGATE_ERR(get_beta_weights_shape(attrs, input_shape)); + ParallelTensorShape gamma_shape = + 
PROPAGATE_ERR(get_gamma_weights_shape(attrs, input_shape)); + ParallelTensorShape beta_shape = + PROPAGATE_ERR(get_beta_weights_shape(attrs, input_shape)); return std::vector{ - gamma_shape, - beta_shape, + gamma_shape, + beta_shape, }; } diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index 910ab7e382..fb26113613 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -1,12 +1,12 @@ #include "op-attrs/ops/linear.h" #include "op-attrs/dim_ordered/slice.h" #include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/initializers/kaiming_initializer_mode.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/product.h" #include "utils/expected.h" #include "utils/integer_conversions.h" -#include "op-attrs/initializers/kaiming_initializer_mode.h" namespace FlexFlow { @@ -72,10 +72,11 @@ tl::expected } tl::expected, std::string> - get_weight_shapes(LinearAttrs const &attrs, TensorShape const &input_shape) { + get_weight_shapes(LinearAttrs const &attrs, + TensorShape const &input_shape) { std::vector weight_shapes = { - PROPAGATE_ERR(get_projection_shape(attrs, input_shape)), + PROPAGATE_ERR(get_projection_shape(attrs, input_shape)), }; if (attrs.use_bias) { @@ -158,10 +159,11 @@ tl::expected } tl::expected, std::string> - get_weight_shapes(LinearAttrs const &attrs, ParallelTensorShape const &input_shape) { + get_weight_shapes(LinearAttrs const &attrs, + ParallelTensorShape const &input_shape) { std::vector weight_shapes = { - PROPAGATE_ERR(get_projection_shape(attrs, input_shape)), + PROPAGATE_ERR(get_projection_shape(attrs, input_shape)), }; if (attrs.use_bias) { @@ -174,40 +176,50 @@ tl::expected, std::string> /** * @brief Chosen to match pytorch implementation * - * see https://github.com/pytorch/pytorch/blob/1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340/torch/nn/modules/linear.py#L114-L122 + * see + * https://github.com/pytorch/pytorch/blob/1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340/torch/nn/modules/linear.py#L114-L122 */ tl::expected, std::string> get_initializers( - LinearAttrs const &attrs, - TensorShape const &input_shape, - std::optional const &maybe_projection_initializer, - std::optional const &maybe_bias_initializer) { + LinearAttrs const &attrs, + TensorShape const &input_shape, + std::optional const &maybe_projection_initializer, + std::optional const &maybe_bias_initializer) { if (!attrs.use_bias && maybe_bias_initializer.has_value()) { - return tl::unexpected(fmt::format("Expected bias_initializer=std::nullopt since use_bias=false, but received bias_initializer: {}", maybe_bias_initializer.value())); + return tl::unexpected( + fmt::format("Expected bias_initializer=std::nullopt since " + "use_bias=false, but received bias_initializer: {}", + maybe_bias_initializer.value())); } - TensorShape projection_shape = PROPAGATE_ERR(get_projection_shape(attrs, input_shape)); - - InitializerAttrs projection_default_initializer = InitializerAttrs{KaimingNormalAttrs{ - /*a=*/sqrtf(5.0), - /*mode=*/KaimingInitializerMode::FAN_IN, - /*nonlinearity=*/KaimingInitializerNonlinearity::LEAKY_RELU, - /*seed=*/0, - }}; + TensorShape projection_shape = + PROPAGATE_ERR(get_projection_shape(attrs, input_shape)); + + InitializerAttrs projection_default_initializer = + InitializerAttrs{KaimingNormalAttrs{ + /*a=*/sqrtf(5.0), + /*mode=*/KaimingInitializerMode::FAN_IN, + /*nonlinearity=*/KaimingInitializerNonlinearity::LEAKY_RELU, + /*seed=*/0, + }}; - 
InitializerAttrs projection_initializer = maybe_projection_initializer.value_or(projection_default_initializer); + InitializerAttrs projection_initializer = + maybe_projection_initializer.value_or(projection_default_initializer); - nonnegative_int fan_in = calculate_fan_for_mode(projection_shape.dims, KaimingInitializerMode::FAN_IN); + nonnegative_int fan_in = calculate_fan_for_mode( + projection_shape.dims, KaimingInitializerMode::FAN_IN); float bound = 1 / sqrtf(static_cast(fan_in.unwrap_nonnegative())); - InitializerAttrs bias_default_initializer = InitializerAttrs{UniformInitializerAttrs{ - /*seed=*/0, - /*min_val=*/-bound, - /*max_val=*/bound, - }}; + InitializerAttrs bias_default_initializer = + InitializerAttrs{UniformInitializerAttrs{ + /*seed=*/0, + /*min_val=*/-bound, + /*max_val=*/bound, + }}; - InitializerAttrs bias_initializer = maybe_bias_initializer.value_or(bias_default_initializer); + InitializerAttrs bias_initializer = + maybe_bias_initializer.value_or(bias_default_initializer); if (attrs.use_bias) { return std::vector{projection_initializer, bias_initializer}; diff --git a/lib/op-attrs/src/op-attrs/ops/repartition.cc b/lib/op-attrs/src/op-attrs/ops/repartition.cc index 43451da12c..d57a198585 100644 --- a/lib/op-attrs/src/op-attrs/ops/repartition.cc +++ b/lib/op-attrs/src/op-attrs/ops/repartition.cc @@ -11,7 +11,8 @@ RecordFormatter as_dot(RepartitionAttrs const &attrs) { return rr; }; - r << kv("dim", attrs.repartition_dim) << kv("degree", attrs.repartition_degree); + r << kv("dim", attrs.repartition_dim) + << kv("degree", attrs.repartition_degree); return r; } diff --git a/lib/op-attrs/src/op-attrs/pcg_operator_attrs.cc b/lib/op-attrs/src/op-attrs/pcg_operator_attrs.cc index 168ef2260b..b2e4ae5a58 100644 --- a/lib/op-attrs/src/op-attrs/pcg_operator_attrs.cc +++ b/lib/op-attrs/src/op-attrs/pcg_operator_attrs.cc @@ -1,15 +1,14 @@ #include "op-attrs/pcg_operator_attrs.h" #include "op-attrs/get_op_type.h" -#include "op-attrs/ops/linear.h" +#include "op-attrs/ops/broadcast.h" #include "op-attrs/ops/cast.h" +#include "op-attrs/ops/combine.h" #include "op-attrs/ops/embedding.h" #include "op-attrs/ops/linear.h" -#include "op-attrs/ops/weight.h" -#include "op-attrs/ops/broadcast.h" -#include "op-attrs/ops/repartition.h" -#include "op-attrs/ops/combine.h" #include "op-attrs/ops/reduction.h" +#include "op-attrs/ops/repartition.h" #include "op-attrs/ops/replicate.h" +#include "op-attrs/ops/weight.h" #include "utils/overload.h" namespace FlexFlow { diff --git a/lib/op-attrs/src/op-attrs/shape_inference.cc b/lib/op-attrs/src/op-attrs/shape_inference.cc index 1c32749a92..4a0ff72fb4 100644 --- a/lib/op-attrs/src/op-attrs/shape_inference.cc +++ b/lib/op-attrs/src/op-attrs/shape_inference.cc @@ -16,9 +16,9 @@ #include "op-attrs/ops/layer_norm.h" #include "op-attrs/ops/linear.h" #include "op-attrs/ops/pool_2d.h" -#include "op-attrs/ops/replicate.h" -#include "op-attrs/ops/repartition.h" #include "op-attrs/ops/reduction.h" +#include "op-attrs/ops/repartition.h" +#include "op-attrs/ops/replicate.h" #include "op-attrs/ops/softmax.h" #include "op-attrs/ops/weight.h" #include "utils/containers/get_only.h" @@ -28,14 +28,14 @@ namespace FlexFlow { template static std::pair require_2(std::vector const &v) { - assert (v.size() == 2); + assert(v.size() == 2); return {v.at(0), v.at(1)}; } template static std::tuple require_3(std::vector const &v) { - assert (v.size() == 3); + assert(v.size() == 3); return {v.at(0), v.at(1), v.at(2)}; } @@ -47,14 +47,15 @@ std::vector [&](BatchMatmulAttrs const 
&attrs) -> std::vector { auto [i1, i2] = require_2(input_shapes); - return {throw_if_unexpected( - get_output_shape(attrs, i1, i2))}; + return {throw_if_unexpected(get_output_shape(attrs, i1, i2))}; }, [&](BatchNormAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](CastAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](ConcatAttrs const &attrs) -> std::vector { return {throw_if_unexpected(get_output_shape(attrs, input_shapes))}; @@ -68,29 +69,33 @@ std::vector [&](ElementBinaryAttrs const &attrs) -> std::vector { auto [i1, i2] = require_2(input_shapes); - return {throw_if_unexpected( - get_output_shape(attrs, i1, i2))}; + return {throw_if_unexpected(get_output_shape(attrs, i1, i2))}; }, [&](ElementUnaryAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](EmbeddingAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](FlatAttrs const &attrs) -> std::vector { return {get_output_shape(attrs, get_only(input_shapes))}; }, [&](GatherAttrs const &attrs) -> std::vector { - return {get_output_shape(attrs, input_shapes.at(0), input_shapes.at(1))}; + return { + get_output_shape(attrs, input_shapes.at(0), input_shapes.at(1))}; }, [&](InputAttrs const &attrs) -> std::vector { return {get_output_shape(attrs)}; }, [&](LayerNormAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](LinearAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](MultiHeadAttentionAttrs const &attrs) -> std::vector { auto [i1, i2, i3] = require_3(input_shapes); @@ -98,10 +103,12 @@ std::vector return {throw_if_unexpected(get_output_shape(attrs, i1, i2, i3))}; }, [&](Pool2DAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](SoftmaxAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](WeightAttrs const &attrs) -> std::vector { return {get_output_shape(attrs)}; @@ -119,20 +126,15 @@ std::vector return {}; }, [&](BatchNormAttrs const &attrs) -> std::vector { - return throw_if_unexpected(get_weight_shapes(attrs, get_only(input_shapes))); - }, - [&](CastAttrs const &attrs) -> std::vector { - return {}; - }, - [&](ConcatAttrs const &attrs) -> std::vector { - return {}; + return throw_if_unexpected( + get_weight_shapes(attrs, get_only(input_shapes))); }, + [&](CastAttrs const &attrs) -> std::vector { return {}; }, + [&](ConcatAttrs const &attrs) -> std::vector { return {}; }, [&](Conv2DAttrs const &attrs) -> 
std::vector { return get_weight_shapes(attrs, get_only(input_shapes)); }, - [&](DropoutAttrs const &attrs) -> std::vector { - return {}; - }, + [&](DropoutAttrs const &attrs) -> std::vector { return {}; }, [&](ElementBinaryAttrs const &attrs) -> std::vector { return {}; }, @@ -140,37 +142,28 @@ std::vector return {}; }, [&](EmbeddingAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_weights_shape(attrs, get_only(input_shapes)))}; - }, - [&](FlatAttrs const &attrs) -> std::vector { - return {}; - }, - [&](GatherAttrs const &attrs) -> std::vector { - return {}; - }, - [&](InputAttrs const &attrs) -> std::vector { - return {}; + return {throw_if_unexpected( + get_weights_shape(attrs, get_only(input_shapes)))}; }, + [&](FlatAttrs const &attrs) -> std::vector { return {}; }, + [&](GatherAttrs const &attrs) -> std::vector { return {}; }, + [&](InputAttrs const &attrs) -> std::vector { return {}; }, [&](LayerNormAttrs const &attrs) -> std::vector { - return throw_if_unexpected(get_weight_shapes(attrs, get_only(input_shapes))); + return throw_if_unexpected( + get_weight_shapes(attrs, get_only(input_shapes))); }, [&](LinearAttrs const &attrs) -> std::vector { - return throw_if_unexpected(get_weight_shapes(attrs, get_only(input_shapes))); + return throw_if_unexpected( + get_weight_shapes(attrs, get_only(input_shapes))); }, [&](MultiHeadAttentionAttrs const &attrs) -> std::vector { auto [i1, i2, i3] = require_3(input_shapes); return throw_if_unexpected(get_weight_shapes(attrs, i1, i2, i3)); }, - [&](Pool2DAttrs const &attrs) -> std::vector { - return {}; - }, - [&](SoftmaxAttrs const &attrs) -> std::vector { - return {}; - }, - [&](WeightAttrs const &attrs) -> std::vector { - return {}; - }, + [&](Pool2DAttrs const &attrs) -> std::vector { return {}; }, + [&](SoftmaxAttrs const &attrs) -> std::vector { return {}; }, + [&](WeightAttrs const &attrs) -> std::vector { return {}; }, [&](auto const &attrs) -> std::vector { NOT_IMPLEMENTED(); }}); @@ -183,17 +176,19 @@ std::vector [&](BatchMatmulAttrs const &attrs) -> std::vector { auto [i1, i2] = require_2(input_shapes); - return {throw_if_unexpected( - get_output_shape(attrs, i1, i2))}; + return {throw_if_unexpected(get_output_shape(attrs, i1, i2))}; }, [&](BatchNormAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](CastAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](CombineAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](ConcatAttrs const &attrs) -> std::vector { return {throw_if_unexpected(get_output_shape(attrs, input_shapes))}; @@ -202,54 +197,65 @@ std::vector return {get_output_shape(attrs, get_only(input_shapes))}; }, [&](DropoutAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](ElementBinaryAttrs const &attrs) -> std::vector { auto [i1, i2] = require_2(input_shapes); - return {throw_if_unexpected( - get_output_shape(attrs, i1, i2))}; + return {throw_if_unexpected(get_output_shape(attrs, i1, i2))}; }, 
[&](ElementUnaryAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](EmbeddingAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](FlatAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](GatherAttrs const &attrs) -> std::vector { - return {get_output_shape(attrs, input_shapes.at(0), input_shapes.at(1))}; + return { + get_output_shape(attrs, input_shapes.at(0), input_shapes.at(1))}; }, [&](InputAttrs const &attrs) -> std::vector { return {get_output_parallel_tensor_shape(attrs)}; }, [&](LayerNormAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](LinearAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, - [&](MultiHeadAttentionAttrs const &attrs) -> std::vector { + [&](MultiHeadAttentionAttrs const &attrs) + -> std::vector { auto [i1, i2, i3] = require_3(input_shapes); return {throw_if_unexpected(get_output_shape(attrs, i1, i2, i3))}; }, [&](Pool2DAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](ReductionAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](RepartitionAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](ReplicateAttrs const &attrs) -> std::vector { return {get_output_shape(attrs, get_only(input_shapes))}; }, [&](SoftmaxAttrs const &attrs) -> std::vector { - return {throw_if_unexpected(get_output_shape(attrs, get_only(input_shapes)))}; + return {throw_if_unexpected( + get_output_shape(attrs, get_only(input_shapes)))}; }, [&](WeightAttrs const &attrs) -> std::vector { return {get_output_parallel_tensor_shape(attrs)}; @@ -259,7 +265,7 @@ std::vector }}); } -std::vector +std::vector get_weight_shapes(PCGOperatorAttrs const &pcg_op_attrs, std::vector const &input_shapes) { return pcg_op_attrs.visit>(overload{ @@ -267,7 +273,8 @@ std::vector return {}; }, [&](BatchNormAttrs const &attrs) -> std::vector { - return throw_if_unexpected(get_weight_shapes(attrs, get_only(input_shapes))); + return throw_if_unexpected( + get_weight_shapes(attrs, get_only(input_shapes))); }, [&](CastAttrs const &attrs) -> std::vector { return {}; @@ -292,7 +299,8 @@ std::vector }, [&](EmbeddingAttrs const &attrs) -> std::vector { return { - throw_if_unexpected(get_weights_shape(attrs, get_only(input_shapes))), + throw_if_unexpected( + get_weights_shape(attrs, get_only(input_shapes))), }; }, [&](FlatAttrs const &attrs) -> std::vector { @@ -305,12 +313,15 @@ std::vector return {}; 
}, [&](LayerNormAttrs const &attrs) -> std::vector { - return throw_if_unexpected(get_weight_shapes(attrs, get_only(input_shapes))); + return throw_if_unexpected( + get_weight_shapes(attrs, get_only(input_shapes))); }, [&](LinearAttrs const &attrs) -> std::vector { - return throw_if_unexpected(get_weight_shapes(attrs, get_only(input_shapes))); + return throw_if_unexpected( + get_weight_shapes(attrs, get_only(input_shapes))); }, - [&](MultiHeadAttentionAttrs const &attrs) -> std::vector { + [&](MultiHeadAttentionAttrs const &attrs) + -> std::vector { auto [i1, i2, i3] = require_3(input_shapes); return throw_if_unexpected(get_weight_shapes(attrs, i1, i2, i3)); @@ -338,5 +349,4 @@ std::vector }}); } - } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index 9c6d979d92..8d0592eab7 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -1,4 +1,5 @@ #include "op-attrs/tensor_dims.h" +#include "op-attrs/dim_ordered/slice.h" #include "op-attrs/dim_ordered/zip.h" #include "op-attrs/replica_parallel_dim_set.h" #include "op-attrs/shard_parallel_dim.dtg.h" @@ -10,7 +11,6 @@ #include "utils/containers/zip.h" #include "utils/integer_conversions.h" #include "utils/nonnegative_int/num_elements.h" -#include "op-attrs/dim_ordered/slice.h" namespace FlexFlow { @@ -66,9 +66,11 @@ std::optional return std::nullopt; } -TensorDims slice_tensor_dims(TensorDims const &dims, std::optional const &start, std::optional const &stop) { +TensorDims slice_tensor_dims(TensorDims const &dims, + std::optional const &start, + std::optional const &stop) { return TensorDims{ - slice(dims.ff_ordered, start, stop), + slice(dims.ff_ordered, start, stop), }; } diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index ef4436cf32..04b18794f1 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -28,10 +28,12 @@ nonnegative_int get_size_in_bytes(TensorShape const &s) { return get_num_elements(s) * size_of_datatype(s.data_type); } -TensorShape slice_tensor_shape(TensorShape const &shape, std::optional const &start, std::optional const &stop) { +TensorShape slice_tensor_shape(TensorShape const &shape, + std::optional const &start, + std::optional const &stop) { return TensorShape{ - slice_tensor_dims(shape.dims, start, stop), - shape.data_type, + slice_tensor_dims(shape.dims, start, stop), + shape.data_type, }; } diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 00864fe771..4c76bf6974 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -15,11 +15,12 @@ ComputationGraph make_empty_computation_graph(); std::unordered_set get_layers(ComputationGraph const &); -LayerAddedResult add_layer(ComputationGraph &computation_graph, - LayerAttrs const &attrs, - std::vector const &inputs, - std::vector const &weights, - std::optional> const &outputs = std::nullopt); +LayerAddedResult add_layer( + ComputationGraph &computation_graph, + LayerAttrs const &attrs, + std::vector const &inputs, + std::vector const &weights, + std::optional> const &outputs = std::nullopt); LayerAddedResult add_input_layer(ComputationGraph &computation_graph, TensorShape const &tensor_shape); diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index 32994118b3..b996026ce7 100644 --- 
a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -241,10 +241,10 @@ struct ComputationGraphBuilder { CreateGrad, std::optional const &name = std::nullopt); - tensor_guid_t create_weight( - TensorShape const &shape, - InitializerAttrs const &initializer, - std::optional const &name = std::nullopt); + tensor_guid_t + create_weight(TensorShape const &shape, + InitializerAttrs const &initializer, + std::optional const &name = std::nullopt); tensor_guid_t create_weight(TensorAttrs const &, std::optional const &name = std::nullopt); @@ -255,11 +255,11 @@ struct ComputationGraphBuilder { TensorShape get_shape(tensor_guid_t const &) const; private: - std::vector - add_layer(LayerAttrs const &layer, - std::vector const &inputs, - std::vector const &weights, - std::optional> const &outputs = std::nullopt); + std::vector add_layer( + LayerAttrs const &layer, + std::vector const &inputs, + std::vector const &weights, + std::optional> const &outputs = std::nullopt); tensor_guid_t broadcast(tensor_guid_t const &, TensorDims const &, std::string const &); diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h index 1eaa8456bf..3542e73dea 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h @@ -17,16 +17,15 @@ std::unordered_set std::unordered_set get_parallel_tensors(ParallelComputationGraph const &); -ParallelLayerAddedResult - add_parallel_layer(ParallelComputationGraph &pcg, - ParallelLayerAttrs const &layer_attrs, - std::vector const &inputs, - std::vector const &weights, - std::optional> const &outputs = std::nullopt); - -ParallelLayerAddedResult - pcg_add_input_layer(ParallelComputationGraph &pcg, - TensorShape const &tensor_shape); +ParallelLayerAddedResult add_parallel_layer( + ParallelComputationGraph &pcg, + ParallelLayerAttrs const &layer_attrs, + std::vector const &inputs, + std::vector const &weights, + std::optional> const &outputs = std::nullopt); + +ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg, + TensorShape const &tensor_shape); std::unordered_set get_pcg_edges_from_layer_to_layer(ParallelComputationGraph const &, diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h index a3cda69cd8..d4cace4a2a 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h @@ -136,6 +136,7 @@ struct ParallelComputationGraphBuilder { std::optional const &name = std::nullopt); ParallelTensorShape get_shape(parallel_tensor_guid_t const &) const; + private: parallel_tensor_guid_t as_type(parallel_tensor_guid_t const &, DataType, diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index f0af280d70..2e698c013a 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -35,19 +35,24 @@ std::unordered_set get_layers(ComputationGraph const &cg) { [&](Node const &n) { return layer_guid_t{n}; }); } -LayerAddedResult add_layer(ComputationGraph &computation_graph, - LayerAttrs const &layer_attrs, - std::vector const &inputs, - std::vector const &weights, - std::optional> const 
&maybe_output_flags) { - std::vector input_shapes - = transform(inputs, [&](tensor_guid_t const &i) { return get_tensor_attrs(computation_graph, i).shape; }); +LayerAddedResult add_layer( + ComputationGraph &computation_graph, + LayerAttrs const &layer_attrs, + std::vector const &inputs, + std::vector const &weights, + std::optional> const &maybe_output_flags) { + std::vector input_shapes = + transform(inputs, [&](tensor_guid_t const &i) { + return get_tensor_attrs(computation_graph, i).shape; + }); - std::vector provided_weight_shapes - = transform(weights, [&](tensor_guid_t const &w) { return get_tensor_attrs(computation_graph, w).shape; }); + std::vector provided_weight_shapes = + transform(weights, [&](tensor_guid_t const &w) { + return get_tensor_attrs(computation_graph, w).shape; + }); - std::vector expected_weight_shapes - = get_weight_shapes(layer_attrs.op_attrs, input_shapes); + std::vector expected_weight_shapes = + get_weight_shapes(layer_attrs.op_attrs, input_shapes); std::vector raw_inputs = transform( inputs, [](tensor_guid_t const &t) { return t.raw_graph_output; }); @@ -55,22 +60,24 @@ LayerAddedResult add_layer(ComputationGraph &computation_graph, std::vector raw_weights = transform( weights, [](tensor_guid_t const &t) { return t.raw_graph_output; }); - std::vector output_shapes = get_output_shapes(layer_attrs.op_attrs, input_shapes); + std::vector output_shapes = + get_output_shapes(layer_attrs.op_attrs, input_shapes); - std::vector output_flags = maybe_output_flags.value_or(repeat_element(num_elements(output_shapes), CreateGrad::YES)); - - std::vector output_attrs = - zip_with_strict(output_shapes, output_flags, - [](TensorShape const &shape, CreateGrad const &create_grad) { - return TensorAttrs{ - /*shape=*/shape, - /*create_grad=*/create_grad, - }; - }); + std::vector output_flags = maybe_output_flags.value_or( + repeat_element(num_elements(output_shapes), CreateGrad::YES)); + std::vector output_attrs = zip_with_strict( + output_shapes, + output_flags, + [](TensorShape const &shape, CreateGrad const &create_grad) { + return TensorAttrs{ + /*shape=*/shape, + /*create_grad=*/create_grad, + }; + }); - NodeAddedResult added = - computation_graph.raw_graph.add_node(layer_attrs, concat_vectors(raw_inputs, raw_weights), output_attrs); + NodeAddedResult added = computation_graph.raw_graph.add_node( + layer_attrs, concat_vectors(raw_inputs, raw_weights), output_attrs); return LayerAddedResult{ layer_guid_t{added.node}, @@ -82,11 +89,11 @@ LayerAddedResult add_layer(ComputationGraph &computation_graph, LayerAddedResult add_input_layer(ComputationGraph &cg, TensorShape const &tensor_shape) { LayerAttrs layer_attrs = LayerAttrs{ - /*op_attrs=*/ComputationGraphOpAttrs{InputAttrs{tensor_shape}}, - /*name=*/std::nullopt, + /*op_attrs=*/ComputationGraphOpAttrs{InputAttrs{tensor_shape}}, + /*name=*/std::nullopt, }; - return add_layer(cg, + return add_layer(cg, layer_attrs, /*inputs=*/{}, /*weights=*/{}, diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index b53d5668b7..267f05499c 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -2,10 +2,9 @@ #include "op-attrs/computation_graph_op_attrs.h" #include "op-attrs/get_incoming_tensor_roles.h" #include "op-attrs/get_op_type.h" -#include "op-attrs/ops/linear.h" -#include "op-attrs/ops/pool_2d.h" -#include "op-attrs/shape_inference.h" +#include "op-attrs/ops/attention.h" #include "op-attrs/ops/attention_attrs.dtg.h" +#include 
"op-attrs/ops/batch_norm.h" #include "op-attrs/ops/batch_norm_attrs.dtg.h" #include "op-attrs/ops/broadcast_attrs.dtg.h" #include "op-attrs/ops/cast_attrs.dtg.h" @@ -15,15 +14,20 @@ #include "op-attrs/ops/dropout_attrs.dtg.h" #include "op-attrs/ops/element_binary_attrs.dtg.h" #include "op-attrs/ops/element_unary_attrs.dtg.h" +#include "op-attrs/ops/embedding.h" #include "op-attrs/ops/embedding_attrs.dtg.h" #include "op-attrs/ops/flat_attrs.dtg.h" #include "op-attrs/ops/gather_attrs.dtg.h" +#include "op-attrs/ops/layer_norm.h" #include "op-attrs/ops/layer_norm_attrs.dtg.h" +#include "op-attrs/ops/linear.h" #include "op-attrs/ops/linear_attrs.dtg.h" +#include "op-attrs/ops/pool_2d.h" #include "op-attrs/ops/pool_2d_attrs.dtg.h" #include "op-attrs/ops/softmax_attrs.dtg.h" #include "op-attrs/ops/weight_attrs.dtg.h" #include "op-attrs/relative_ff_dim_t.h" +#include "op-attrs/shape_inference.h" #include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" #include "pcg/computation_graph.h" @@ -38,10 +42,6 @@ #include "utils/containers/zip_with_strict.h" #include "utils/expected.h" #include "utils/stack_vector/stack_vector_of.h" -#include "op-attrs/ops/batch_norm.h" -#include "op-attrs/ops/attention.h" -#include "op-attrs/ops/embedding.h" -#include "op-attrs/ops/layer_norm.h" #include namespace FlexFlow { @@ -71,7 +71,8 @@ tensor_guid_t ComputationGraphBuilder::create_input( maybe_name, }; - return get_only(this->add_layer(layer_attrs, {}, {}, std::vector{create_grad})); + return get_only( + this->add_layer(layer_attrs, {}, {}, std::vector{create_grad})); } tensor_guid_t ComputationGraphBuilder::create_weight( @@ -79,11 +80,10 @@ tensor_guid_t ComputationGraphBuilder::create_weight( InitializerAttrs const &initializer, std::optional const &maybe_name) { LayerAttrs layer_attrs = LayerAttrs{ - ComputationGraphOpAttrs{ - WeightAttrs{ + ComputationGraphOpAttrs{WeightAttrs{ /*shape=*/shape, /*initializer=*/initializer, - }}, + }}, maybe_name, }; @@ -115,14 +115,18 @@ std::vector ComputationGraphBuilder::add_layer( std::optional> const &outputs) { check_incoming_tensor_roles(layer, inputs.size(), weight_initializers.size()); - std::vector input_shapes = transform(inputs, [&](tensor_guid_t const &t) { return this->get_shape(t); }); + std::vector input_shapes = transform( + inputs, [&](tensor_guid_t const &t) { return this->get_shape(t); }); - std::vector weight_shapes = get_weight_shapes(layer.op_attrs, input_shapes); + std::vector weight_shapes = + get_weight_shapes(layer.op_attrs, input_shapes); - std::vector weights = zip_with_strict(weight_shapes, weight_initializers, - [&](TensorShape const &shape, InitializerAttrs const &initializer) { - return this->create_weight(shape, initializer); - }); + std::vector weights = zip_with_strict( + weight_shapes, + weight_initializers, + [&](TensorShape const &shape, InitializerAttrs const &initializer) { + return this->create_weight(shape, initializer); + }); LayerAddedResult added = ::FlexFlow::add_layer( this->computation_graph, layer, inputs, weights, outputs); @@ -409,15 +413,13 @@ tensor_guid_t ComputationGraphBuilder::conv2d( LayerAttrs layer = LayerAttrs{ComputationGraphOpAttrs{attrs}, name}; - std::vector initializers = get_initializers(attrs, - this->get_shape(input), - maybe_kernel_initializer, - maybe_bias_initializer); + std::vector initializers = + get_initializers(attrs, + this->get_shape(input), + maybe_kernel_initializer, + maybe_bias_initializer); - return get_only(this->add_layer( - layer, - {input}, - initializers)); + return 
get_only(this->add_layer(layer, {input}, initializers)); } tensor_guid_t ComputationGraphBuilder::dropout( @@ -457,7 +459,8 @@ tensor_guid_t ComputationGraphBuilder::embedding( TensorShape input_shape = this->get_shape(input); - std::vector initializers = get_initializers(attrs, initializer); + std::vector initializers = + get_initializers(attrs, initializer); return get_only(this->add_layer(layer, {input}, initializers)); } @@ -484,8 +487,7 @@ tensor_guid_t ComputationGraphBuilder::gather( LayerAttrs layer = LayerAttrs{ComputationGraphOpAttrs{attrs}, name}; - return get_only( - this->add_layer(layer, {input}, {})); + return get_only(this->add_layer(layer, {input}, {})); } tensor_guid_t ComputationGraphBuilder::pool2d( tensor_guid_t const &x, @@ -545,7 +547,7 @@ tensor_guid_t ComputationGraphBuilder::adaptive_pool2d( TensorShape output_shape = throw_if_unexpected( get_output_shape(attrs, this->get_shape(casted_input))); - return get_only(this->add_layer(layer, {casted_input}, {})); + return get_only(this->add_layer(layer, {casted_input}, {})); } tensor_guid_t ComputationGraphBuilder::batch_norm( @@ -579,7 +581,8 @@ tensor_guid_t ComputationGraphBuilder::batch_norm( TensorShape input_shape = this->get_shape(input); - std::vector initializers = throw_if_unexpected(get_initializers(attrs)); + std::vector initializers = + throw_if_unexpected(get_initializers(attrs)); return get_only(this->add_layer(layer, {input}, initializers)); } @@ -629,16 +632,14 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( LayerAttrs layer = LayerAttrs{ComputationGraphOpAttrs{attrs}, name}; - std::vector initializers = throw_if_unexpected(get_initializers(attrs, - this->get_shape(query), - this->get_shape(key), - this->get_shape(value), - initializer)); + std::vector initializers = + throw_if_unexpected(get_initializers(attrs, + this->get_shape(query), + this->get_shape(key), + this->get_shape(value), + initializer)); - return get_only(this->add_layer( - layer, - {query, key, value}, - initializers)); + return get_only(this->add_layer(layer, {query, key, value}, initializers)); } TensorDims ComputationGraphBuilder::get_broadcast_target_dims( @@ -686,13 +687,13 @@ tensor_guid_t ComputationGraphBuilder::dense( LayerAttrs layer = LayerAttrs{ComputationGraphOpAttrs{attrs}, name}; - std::vector initializers = throw_if_unexpected(get_initializers(attrs, - this->get_shape(input), - maybe_projection_initializer, - maybe_bias_initializer)); + std::vector initializers = + throw_if_unexpected(get_initializers(attrs, + this->get_shape(input), + maybe_projection_initializer, + maybe_bias_initializer)); - return get_only(this->add_layer( - layer, {input}, initializers)); + return get_only(this->add_layer(layer, {input}, initializers)); } tensor_guid_t ComputationGraphBuilder::concat( @@ -720,11 +721,12 @@ tensor_guid_t ComputationGraphBuilder::flat( std::optional const &maybe_name) { nonnegative_int input_num_dims = num_dims(this->get_shape(input)); - ff_dim_t abs_start_dim = ff_dim_t_from_relative_ff_dim_t(start_dim, input_num_dims); + ff_dim_t abs_start_dim = + ff_dim_t_from_relative_ff_dim_t(start_dim, input_num_dims); - ff_dim_t abs_end_dim = ff_dim_t_from_relative_ff_dim_t(end_dim.value_or(relative_ff_dim_t{ - input_num_dims.unwrap_nonnegative()}), - input_num_dims); + ff_dim_t abs_end_dim = ff_dim_t_from_relative_ff_dim_t( + end_dim.value_or(relative_ff_dim_t{input_num_dims.unwrap_nonnegative()}), + input_num_dims); FlatAttrs attrs = FlatAttrs{ /*start_dim=*/abs_start_dim, @@ -776,7 +778,6 @@ tensor_guid_t 
ComputationGraphBuilder::layer_norm( LayerAttrs layer = LayerAttrs{ComputationGraphOpAttrs{attrs}, name}; - std::vector initializers = get_initializers(attrs); return get_only(this->add_layer(layer, {input}, initializers)); @@ -808,8 +809,7 @@ tensor_guid_t ComputationGraphBuilder::softmax( LayerAttrs layer = LayerAttrs{ComputationGraphOpAttrs{attrs}, name}; - return get_only( - this->add_layer(layer, {input}, {})); + return get_only(this->add_layer(layer, {input}, {})); } } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index f70299db44..b08c0a575d 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -1,5 +1,6 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "op-attrs/get_incoming_tensor_roles.h" +#include "op-attrs/pcg_operator_attrs.h" #include "op-attrs/shape_inference.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h" @@ -17,18 +18,17 @@ #include "utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h" #include "utils/graph/dataflow_graph/dataflow_edge.dtg.h" #include "utils/graph/digraph/algorithms.h" +#include "utils/graph/digraph/algorithms/get_subgraph_successors.h" +#include "utils/graph/digraph/algorithms/get_successors.h" #include "utils/graph/digraph/algorithms/get_topological_ordering.h" #include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" #include "utils/graph/labelled_dataflow_graph/algorithms/find_isomorphism.h" #include "utils/graph/labelled_dataflow_graph/algorithms/rewrite_node_labels.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/as_dot.h" #include "utils/graph/node/algorithms.h" #include "utils/graph/node/node.dtg.h" -#include -#include "utils/graph/digraph/algorithms/get_subgraph_successors.h" -#include "utils/graph/digraph/algorithms/get_successors.h" #include "utils/record_formatter.h" -#include "op-attrs/pcg_operator_attrs.h" -#include "utils/graph/labelled_open_dataflow_graph/algorithms/as_dot.h" +#include namespace FlexFlow { @@ -45,27 +45,35 @@ std::unordered_set [&](Node const &n) { return parallel_layer_guid_t{n}; }); } -ParallelLayerAddedResult - add_parallel_layer(ParallelComputationGraph &pcg, - ParallelLayerAttrs const &layer_attrs, - std::vector const &inputs, - std::vector const &weights, - std::optional> const &maybe_output_flags) { - std::vector input_shapes = - transform(inputs, [&](parallel_tensor_guid_t const &i) { return get_parallel_tensor_shape(pcg, i); }); +ParallelLayerAddedResult add_parallel_layer( + ParallelComputationGraph &pcg, + ParallelLayerAttrs const &layer_attrs, + std::vector const &inputs, + std::vector const &weights, + std::optional> const &maybe_output_flags) { + std::vector input_shapes = + transform(inputs, [&](parallel_tensor_guid_t const &i) { + return get_parallel_tensor_shape(pcg, i); + }); - std::vector weight_shapes = - transform(weights, [&](parallel_tensor_guid_t const &i) { return get_parallel_tensor_shape(pcg, i); }); + std::vector weight_shapes = + transform(weights, [&](parallel_tensor_guid_t const &i) { + return get_parallel_tensor_shape(pcg, i); + }); - std::vector correct_weight_shapes = - get_weight_shapes(layer_attrs.op_attrs, input_shapes); + std::vector correct_weight_shapes = 
+ get_weight_shapes(layer_attrs.op_attrs, input_shapes); if (weight_shapes != correct_weight_shapes) { - throw mk_runtime_error(fmt::format("add_parallel_layer expected weight shapes {}, but received weights with shapes {}", correct_weight_shapes, weight_shapes)); + throw mk_runtime_error( + fmt::format("add_parallel_layer expected weight shapes {}, but " + "received weights with shapes {}", + correct_weight_shapes, + weight_shapes)); } - std::vector output_shapes = - get_output_shapes(layer_attrs.op_attrs, input_shapes); + std::vector output_shapes = + get_output_shapes(layer_attrs.op_attrs, input_shapes); std::vector unwrapped_inputs = transform(inputs, [](parallel_tensor_guid_t const &t) { @@ -77,16 +85,20 @@ ParallelLayerAddedResult return t.raw_graph_output; }); - std::vector output_flags = maybe_output_flags.value_or(repeat_element(num_elements(output_shapes), CreateGrad::YES)); + std::vector output_flags = maybe_output_flags.value_or( + repeat_element(num_elements(output_shapes), CreateGrad::YES)); - std::vector output_attrs = - zip_with_strict(output_shapes, output_flags, - [](ParallelTensorShape const &shape, CreateGrad const &create_grad) { - return ParallelTensorAttrs{shape, create_grad}; - }); + std::vector output_attrs = zip_with_strict( + output_shapes, + output_flags, + [](ParallelTensorShape const &shape, CreateGrad const &create_grad) { + return ParallelTensorAttrs{shape, create_grad}; + }); - NodeAddedResult op_added = - pcg.raw_graph.add_node(layer_attrs, concat_vectors(unwrapped_inputs, unwrapped_weights), output_attrs); + NodeAddedResult op_added = pcg.raw_graph.add_node( + layer_attrs, + concat_vectors(unwrapped_inputs, unwrapped_weights), + output_attrs); return ParallelLayerAddedResult{ parallel_layer_guid_t{op_added.node}, @@ -96,9 +108,8 @@ ParallelLayerAddedResult }; } -ParallelLayerAddedResult - pcg_add_input_layer(ParallelComputationGraph &pcg, - TensorShape const &tensor_shape) { +ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg, + TensorShape const &tensor_shape) { ParallelLayerAttrs layer_attrs = ParallelLayerAttrs{ /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}}, /*name=*/std::nullopt, @@ -222,12 +233,14 @@ std::unordered_set [](Node const &n) { return parallel_layer_guid_t{n}; }); } -std::unordered_set - get_subgraph_successors(ParallelComputationGraph const &pcg, - std::unordered_set const &subgraph_layers) { +std::unordered_set get_subgraph_successors( + ParallelComputationGraph const &pcg, + std::unordered_set const &subgraph_layers) { - std::unordered_set raw_subgraph_nodes = transform( - subgraph_layers, [](parallel_layer_guid_t const &l) { return l.raw_graph_node; }); + std::unordered_set raw_subgraph_nodes = + transform(subgraph_layers, [](parallel_layer_guid_t const &l) { + return l.raw_graph_node; + }); std::unordered_set raw_successors = get_subgraph_successors(pcg.raw_graph, raw_subgraph_nodes); @@ -338,5 +351,4 @@ void debug_print_dot(ParallelComputationGraph const &cg) { std::cout << as_dot(cg) << std::endl; } - } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index cc5b3fecf4..4e72b2fe0f 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -22,14 +22,14 @@ #include "op-attrs/parallel_op_attrs.h" #include 
"op-attrs/parallel_tensor_shape.h" #include "op-attrs/pcg_operator_attrs.h" +#include "op-attrs/shape_inference.h" #include "pcg/parallel_computation_graph/generate_weight_transform.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "utils/containers/concat_vectors.h" +#include "utils/containers/count.h" #include "utils/containers/enumerate_vector.h" #include "utils/containers/get_only.h" #include "utils/containers/transform.h" -#include "utils/containers/count.h" -#include "op-attrs/shape_inference.h" #include "utils/containers/zip_with.h" namespace FlexFlow { @@ -46,15 +46,17 @@ ParallelComputationGraphBuilder::ParallelComputationGraphBuilder() : pcg(empty_parallel_computation_graph()) {} parallel_tensor_guid_t ParallelComputationGraphBuilder::create_input_tensor( - TensorShape const &shape, - std::optional const &name) { + TensorShape const &shape, std::optional const &name) { ParallelLayerAttrs layer_attrs = ParallelLayerAttrs{ PCGOperatorAttrs{InputAttrs{shape}}, name, }; - return get_only(add_parallel_layer(this->pcg, layer_attrs, {}, {}, std::vector{CreateGrad::NO}).outputs); + return get_only( + add_parallel_layer( + this->pcg, layer_attrs, {}, {}, std::vector{CreateGrad::NO}) + .outputs); } parallel_tensor_guid_t ParallelComputationGraphBuilder::add( @@ -163,7 +165,11 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::conv2d( ParallelTensorShape input_shape = this->get_shape(input); - std::vector initializers = get_initializers(attrs, get_reduced_shape(input_shape), maybe_kernel_initializer, maybe_bias_initializer); + std::vector initializers = + get_initializers(attrs, + get_reduced_shape(input_shape), + maybe_kernel_initializer, + maybe_bias_initializer); return get_only(this->add_layer(layer, {input}, initializers)); } @@ -192,7 +198,11 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( ParallelTensorShape input_shape = this->get_shape(input); - std::vector initializers = throw_if_unexpected(get_initializers(attrs, get_reduced_shape(input_shape), maybe_projection_initializer, maybe_bias_initializer)); + std::vector initializers = + throw_if_unexpected(get_initializers(attrs, + get_reduced_shape(input_shape), + maybe_projection_initializer, + maybe_bias_initializer)); return get_only(this->add_layer(layer, {input}, initializers)); } @@ -218,7 +228,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::embedding( ParallelLayerAttrs layer = ParallelLayerAttrs{PCGOperatorAttrs{attrs}, name}; - std::vector initializers = get_initializers(attrs, maybe_kernel_initializer); + std::vector initializers = + get_initializers(attrs, maybe_kernel_initializer); return get_only(this->add_layer(layer, {input}, initializers)); } @@ -259,14 +270,14 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::multihead_attention( ParallelLayerAttrs layer = ParallelLayerAttrs{PCGOperatorAttrs{attrs}, name}; - std::vector initializers = throw_if_unexpected(get_initializers(attrs, - get_reduced_shape(this->get_shape(query)), - get_reduced_shape(this->get_shape(key)), - get_reduced_shape(this->get_shape(value)), - maybe_weights_initializer, - maybe_input_bias_initializer, - maybe_output_bias_initializer)); - + std::vector initializers = throw_if_unexpected( + get_initializers(attrs, + get_reduced_shape(this->get_shape(query)), + get_reduced_shape(this->get_shape(key)), + get_reduced_shape(this->get_shape(value)), + maybe_weights_initializer, + maybe_input_bias_initializer, + maybe_output_bias_initializer)); return 
get_only(this->add_layer(layer, {query, key, value}, initializers)); } @@ -304,7 +315,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::batch_norm( std::vector weights; - std::vector initializers = throw_if_unexpected(get_initializers(attrs)); + std::vector initializers = + throw_if_unexpected(get_initializers(attrs)); return get_only(this->add_layer(layer, {input}, initializers)); } @@ -494,8 +506,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::add_weight( ParallelLayerAttrs weight_layer_attrs = ParallelLayerAttrs{ PCGOperatorAttrs{WeightAttrs{ - /*shape=*/unpar_weight_shape, - /*initializer=*/initializer, + /*shape=*/unpar_weight_shape, + /*initializer=*/initializer, }}, weight_name, }; @@ -511,7 +523,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::add_weight( std::nullopt, }; current_weight_tensor = get_only( - add_parallel_layer(this->pcg, layer_attrs, {current_weight_tensor}, {}).outputs); + add_parallel_layer(this->pcg, layer_attrs, {current_weight_tensor}, {}) + .outputs); } return current_weight_tensor; @@ -540,21 +553,30 @@ std::vector ParallelComputationGraphBuilder::add_layer( std::vector const &inputs, std::vector const &weight_initializers) { - int num_weights_provided = count(weight_initializers, - [](std::optional const &i) { return i.has_value(); }); + int num_weights_provided = + count(weight_initializers, [](std::optional const &i) { + return i.has_value(); + }); check_incoming_tensor_roles(layer, inputs.size(), num_weights_provided); - std::vector input_shapes = - transform(inputs, [&](parallel_tensor_guid_t const &i) { return this->get_shape(i); }); - - std::vector weight_shapes = get_weight_shapes(layer.op_attrs, input_shapes); - std::vector weight_tensors = - zip_with(weight_shapes, weight_initializers, [&](ParallelTensorShape const &weight_shape, InitializerAttrs const &initializer) { - return this->add_weight(weight_shape, initializer); - }); - - return add_parallel_layer(this->pcg, layer, inputs, weight_tensors, {}).outputs; + std::vector input_shapes = + transform(inputs, [&](parallel_tensor_guid_t const &i) { + return this->get_shape(i); + }); + + std::vector weight_shapes = + get_weight_shapes(layer.op_attrs, input_shapes); + std::vector weight_tensors = + zip_with(weight_shapes, + weight_initializers, + [&](ParallelTensorShape const &weight_shape, + InitializerAttrs const &initializer) { + return this->add_weight(weight_shape, initializer); + }); + + return add_parallel_layer(this->pcg, layer, inputs, weight_tensors, {}) + .outputs; } } // namespace FlexFlow diff --git a/lib/pcg/test/src/pcg/computation_graph.cc b/lib/pcg/test/src/pcg/computation_graph.cc index e5860877d3..341801d0b0 100644 --- a/lib/pcg/test/src/pcg/computation_graph.cc +++ b/lib/pcg/test/src/pcg/computation_graph.cc @@ -166,43 +166,53 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_layer_attrs = [](auto const &op_attrs) { return LayerAttrs{ - /*op_attrs=*/ComputationGraphOpAttrs{op_attrs}, - /*name=*/std::nullopt, + /*op_attrs=*/ComputationGraphOpAttrs{op_attrs}, + /*name=*/std::nullopt, }; }; LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, - /*use_bias=*/true, - /*data_type=*/DataType::FLOAT, - /*activation=*/Activation::RELU, - /*regularizer=*/std::nullopt, + /*out_channels=*/14_n, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/Activation::RELU, + /*regularizer=*/std::nullopt, }; InitializerAttrs zero_init = InitializerAttrs{ZeroInitializerAttrs{}}; WeightAttrs projection_weight_attrs = WeightAttrs{ - 
/*tensor_shape=*/throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)), - /*initializer=*/zero_init, + /*tensor_shape=*/throw_if_unexpected( + get_projection_shape(linear_attrs, input_shape)), + /*initializer=*/zero_init, }; WeightAttrs bias_weight_attrs = WeightAttrs{ - /*tensor_shape=*/throw_if_unexpected(get_bias_shape(linear_attrs, input_shape)), - /*initializer=*/zero_init, + /*tensor_shape=*/throw_if_unexpected( + get_bias_shape(linear_attrs, input_shape)), + /*initializer=*/zero_init, }; LayerAddedResult input_added = add_input_layer(cg, input_shape); tensor_guid_t t_input = get_only(input_added.outputs); - LayerAddedResult projection_weight_added = add_layer(cg, make_layer_attrs(projection_weight_attrs), {}, {}); - tensor_guid_t t_projection_weight = get_only(projection_weight_added.outputs); + LayerAddedResult projection_weight_added = + add_layer(cg, make_layer_attrs(projection_weight_attrs), {}, {}); + tensor_guid_t t_projection_weight = + get_only(projection_weight_added.outputs); - LayerAddedResult bias_weight_added = add_layer(cg, make_layer_attrs(bias_weight_attrs), {}, {}); + LayerAddedResult bias_weight_added = + add_layer(cg, make_layer_attrs(bias_weight_attrs), {}, {}); tensor_guid_t t_bias_weight = get_only(bias_weight_added.outputs); - LayerAddedResult linear_added = add_layer(cg, make_layer_attrs(linear_attrs), {t_input}, {t_projection_weight, t_bias_weight}); + LayerAddedResult linear_added = + add_layer(cg, + make_layer_attrs(linear_attrs), + {t_input}, + {t_projection_weight, t_bias_weight}); - std::vector result = get_incoming_weights(cg, linear_added.layer); + std::vector result = + get_incoming_weights(cg, linear_added.layer); std::vector correct = { t_projection_weight, t_bias_weight, diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc index 98b9ca2705..9d5dceca18 100644 --- a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc @@ -10,17 +10,18 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 12_n, - 16_n, + TensorDims{ + FFOrdered{ + 12_n, + 16_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t t_partition = b.parallel_partition(input, ff_dim_t{0_n}, 2_n); + parallel_tensor_guid_t t_partition = + b.parallel_partition(input, ff_dim_t{0_n}, 2_n); parallel_tensor_guid_t mm_output = b.dense(input, 8_n); parallel_tensor_guid_t relu_output = b.relu(mm_output); diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index 4c0fd1b013..d68e20bd92 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -12,12 +12,11 @@ using namespace ::FlexFlow; template static ParallelLayerAttrs make_layer_attrs(T const &op_attrs) { return ParallelLayerAttrs{ - /*op_attrs=*/PCGOperatorAttrs{op_attrs}, - /*name=*/std::nullopt, + /*op_attrs=*/PCGOperatorAttrs{op_attrs}, + /*name=*/std::nullopt, }; }; - TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("topological_ordering") { // TODO(@lockshaw) should probably be replaced with a rapidcheck test that @@ -28,12 +27,13 @@ 
TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 12_n, 16_n, + TensorDims{ + FFOrdered{ + 12_n, + 16_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; ElementUnaryAttrs relu_attrs = make_relu_attrs(); @@ -63,16 +63,15 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, 12_n + TensorDims{ + FFOrdered{10_n, 12_n}, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; SUBCASE("layer has no inputs") { - ParallelLayerAddedResult input_added = pcg_add_input_layer(pcg, input_shape); + ParallelLayerAddedResult input_added = + pcg_add_input_layer(pcg, input_shape); std::vector result = get_incoming_inputs(pcg, input_added.parallel_layer); @@ -85,33 +84,43 @@ TEST_SUITE(FF_TEST_SUITE) { std::string my_op_name = "my op"; LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, - /*use_bias=*/true, - /*data_type=*/DataType::FLOAT, - /*activation=*/Activation::RELU, - /*regularizer=*/std::nullopt, + /*out_channels=*/14_n, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/Activation::RELU, + /*regularizer=*/std::nullopt, }; WeightAttrs projection_weight_attrs = WeightAttrs{ - /*tensor_shape=*/throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)), - /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, + /*tensor_shape=*/throw_if_unexpected( + get_projection_shape(linear_attrs, input_shape)), + /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, }; - + WeightAttrs bias_weight_attrs = WeightAttrs{ - /*tensor_shape=*/throw_if_unexpected(get_bias_shape(linear_attrs, input_shape)), - /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, + /*tensor_shape=*/throw_if_unexpected( + get_bias_shape(linear_attrs, input_shape)), + /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, }; - ParallelLayerAddedResult input_added = pcg_add_input_layer(pcg, input_shape); + ParallelLayerAddedResult input_added = + pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input = get_only(input_added.outputs); - ParallelLayerAddedResult projection_weight_added = add_parallel_layer(pcg, make_layer_attrs(projection_weight_attrs), {}, {}); - parallel_tensor_guid_t t_projection = get_only(projection_weight_added.outputs); + ParallelLayerAddedResult projection_weight_added = add_parallel_layer( + pcg, make_layer_attrs(projection_weight_attrs), {}, {}); + parallel_tensor_guid_t t_projection = + get_only(projection_weight_added.outputs); - ParallelLayerAddedResult bias_weight_added = add_parallel_layer(pcg, make_layer_attrs(bias_weight_attrs), {}, {}); + ParallelLayerAddedResult bias_weight_added = + add_parallel_layer(pcg, make_layer_attrs(bias_weight_attrs), {}, {}); parallel_tensor_guid_t t_bias = get_only(bias_weight_added.outputs); - ParallelLayerAddedResult linear_added = add_parallel_layer(pcg, make_layer_attrs(linear_attrs), {t_input}, {t_projection, t_bias}); + ParallelLayerAddedResult linear_added = + add_parallel_layer(pcg, + make_layer_attrs(linear_attrs), + {t_input}, + {t_projection, t_bias}); std::vector result = get_incoming_inputs(pcg, linear_added.parallel_layer); @@ -124,12 +133,13 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE( "get_source_layer(ParallelComputationGraph, parallel_tensor_guid_t)") { TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, 12_n, + TensorDims{ + FFOrdered{ + 10_n, + 12_n, 
+ }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; ParallelComputationGraph pcg = empty_parallel_computation_graph(); @@ -194,18 +204,20 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE( "get_incoming_weights(ParallelComputationGraph, parallel_layer_guid_t)") { TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, 12_n, + TensorDims{ + FFOrdered{ + 10_n, + 12_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; ParallelComputationGraph pcg = empty_parallel_computation_graph(); SUBCASE("layer has no inputs or weights") { - ParallelLayerAddedResult input_added = pcg_add_input_layer(pcg, input_shape); + ParallelLayerAddedResult input_added = + pcg_add_input_layer(pcg, input_shape); std::vector result = get_incoming_weights(pcg, input_added.parallel_layer); @@ -215,10 +227,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("layer has inputs but no weights") { - ParallelLayerAddedResult input_added = pcg_add_input_layer(pcg, input_shape); + ParallelLayerAddedResult input_added = + pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input = get_only(input_added.outputs); - ParallelLayerAddedResult relu_added = add_parallel_layer(pcg, make_layer_attrs(make_relu_attrs()), {t_input}, {}); + ParallelLayerAddedResult relu_added = add_parallel_layer( + pcg, make_layer_attrs(make_relu_attrs()), {t_input}, {}); std::vector result = get_incoming_weights(pcg, relu_added.parallel_layer); @@ -241,36 +255,52 @@ TEST_SUITE(FF_TEST_SUITE) { /*regularizer=*/std::nullopt, }; - ParallelLayerAddedResult input_added = pcg_add_input_layer(pcg, input_shape); + ParallelLayerAddedResult input_added = + pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input = get_only(input_added.outputs); RepartitionAttrs partition_input_attrs = RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*repartition_dim=*/ff_dim_t{0_n}, + /*repartition_degree=*/2_n, }; - ParallelLayerAddedResult partition_input_added = add_parallel_layer(pcg, make_layer_attrs(partition_input_attrs), {t_input}, {}); - parallel_tensor_guid_t t_partitioned_input = get_only(partition_input_added.outputs); + ParallelLayerAddedResult partition_input_added = add_parallel_layer( + pcg, make_layer_attrs(partition_input_attrs), {t_input}, {}); + parallel_tensor_guid_t t_partitioned_input = + get_only(partition_input_added.outputs); WeightAttrs projection_weight_attrs = WeightAttrs{ - /*tensor_shape=*/throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)), - /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, + /*tensor_shape=*/throw_if_unexpected( + get_projection_shape(linear_attrs, input_shape)), + /*initializer=*/InitializerAttrs{ZeroInitializerAttrs{}}, }; - - ParallelLayerAddedResult projection_weight_added = add_parallel_layer(pcg, make_layer_attrs(projection_weight_attrs), {}, {}); - parallel_tensor_guid_t t_projection_weight = get_only(projection_weight_added.outputs); + + ParallelLayerAddedResult projection_weight_added = add_parallel_layer( + pcg, make_layer_attrs(projection_weight_attrs), {}, {}); + parallel_tensor_guid_t t_projection_weight = + get_only(projection_weight_added.outputs); ReplicateAttrs replicate_projection_attrs = ReplicateAttrs{ - /*replicate_degree=*/2_n, + /*replicate_degree=*/2_n, }; - ParallelLayerAddedResult replicate_projection_added = add_parallel_layer(pcg, make_layer_attrs(replicate_projection_attrs), {t_projection_weight}, {}); - parallel_tensor_guid_t t_replicated_projection_weight = get_only(replicate_projection_added.outputs); - - 
ParallelLayerAddedResult linear_added = add_parallel_layer(pcg, make_layer_attrs(linear_attrs), {t_partitioned_input}, {t_replicated_projection_weight}); + ParallelLayerAddedResult replicate_projection_added = + add_parallel_layer(pcg, + make_layer_attrs(replicate_projection_attrs), + {t_projection_weight}, + {}); + parallel_tensor_guid_t t_replicated_projection_weight = + get_only(replicate_projection_added.outputs); + + ParallelLayerAddedResult linear_added = + add_parallel_layer(pcg, + make_layer_attrs(linear_attrs), + {t_partitioned_input}, + {t_replicated_projection_weight}); std::vector result = get_incoming_weights(pcg, linear_added.parallel_layer); - std::vector correct = {t_replicated_projection_weight}; + std::vector correct = { + t_replicated_projection_weight}; CHECK(result == correct); } @@ -278,12 +308,13 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("pcg_add_input_layer") { TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 12_n, 10_n, + TensorDims{ + FFOrdered{ + 12_n, + 10_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; ParallelComputationGraph result = [&] { diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index db6b47ab99..b82cb009a9 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -30,13 +30,13 @@ TEST_SUITE(FF_TEST_SUITE) { ShardParallelDim d2 = ShardParallelDim{15_n, 3_n}; TensorShape lhs_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, - 15_n, + TensorDims{ + FFOrdered{ + 10_n, + 15_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; // ParallelTensorShape lhs_shape = ParallelTensorShape{ @@ -87,25 +87,25 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape a_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 4_n, - 10_n, - 15_n, + TensorDims{ + FFOrdered{ + 4_n, + 10_n, + 15_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; TensorShape b_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 4_n, - 15_n, - 10_n, + TensorDims{ + FFOrdered{ + 4_n, + 15_n, + 10_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; parallel_tensor_guid_t a_tensor = b.create_input_tensor(a_shape); @@ -140,12 +140,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, 12_n, + TensorDims{ + FFOrdered{ + 10_n, + 12_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; DataType output_datatype = DataType::DOUBLE; @@ -182,9 +183,9 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT, }; - parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t par_input = b.parallel_partition(input, ff_dim_t{0_n}, 2_n); + parallel_tensor_guid_t par_input = + b.parallel_partition(input, ff_dim_t{0_n}, 2_n); ParallelTensorShape par_input_shape = b.get_shape(par_input); @@ -296,12 +297,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 10_n, 16_n, + TensorDims{ + FFOrdered{ + 10_n, + 16_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; nonnegative_int outDim = 14_n; @@ -333,13 +335,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 12_n, - 10_n, + TensorDims{ + FFOrdered{ + 12_n, 
+ 10_n, + }, }, - }, - DataType::INT32, + DataType::INT32, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); @@ -370,12 +372,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape query_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 12_n, 16_n, 10_n, + TensorDims{ + FFOrdered{ + 12_n, + 16_n, + 10_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; TensorShape key_shape = query_shape; @@ -412,13 +416,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 18_n, - 32_n, + TensorDims{ + FFOrdered{ + 18_n, + 32_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); @@ -447,12 +451,13 @@ TEST_SUITE(FF_TEST_SUITE) { ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 18_n, 10_n, + TensorDims{ + FFOrdered{ + 18_n, + 10_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); @@ -479,12 +484,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 18_n, 10_n, + TensorDims{ + FFOrdered{ + 18_n, + 10_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); @@ -512,12 +518,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 18_n, 10_n, + TensorDims{ + FFOrdered{ + 18_n, + 10_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); @@ -543,17 +550,21 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - 18_n, 10_n, + TensorDims{ + FFOrdered{ + 18_n, + 10_n, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); input = b.parallel_partition(input, ff_dim_t{1_n}, 2_n); - input = b.dense(input, /*out_dim=*/12_n, /*activation=*/std::nullopt, /*use_bias=*/false); + input = b.dense(input, + /*out_dim=*/12_n, + /*activation=*/std::nullopt, + /*use_bias=*/false); parallel_tensor_guid_t output = b.parallel_reduce(input, 2_n); parallel_layer_guid_t layer = get_source_layer(output); diff --git a/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc b/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc index 57f639cf8f..5a8f5fcd19 100644 --- a/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc +++ b/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc @@ -13,49 +13,60 @@ TEST_SUITE(FF_TEST_SUITE) { InitializerAttrs zero_init = InitializerAttrs{ZeroInitializerAttrs{}}; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 12_n,},}, DataType::FLOAT,}; + TensorDims{ + FFOrdered{ + 10_n, + 12_n, + }, + }, + DataType::FLOAT, + }; LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/8_n, - /*use_bias=*/true, - /*data_type=*/DataType::FLOAT, - /*activation=*/Activation::RELU, - /*regularizer=*/std::nullopt, + /*out_channels=*/8_n, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/Activation::RELU, + /*regularizer=*/std::nullopt, }; WeightAttrs projection_weight_attrs = WeightAttrs{ - /*tensor_shape=*/throw_if_unexpected(get_projection_shape(linear_attrs, input_shape)), - /*initializer=*/zero_init, + 
/*tensor_shape=*/throw_if_unexpected( + get_projection_shape(linear_attrs, input_shape)), + /*initializer=*/zero_init, }; WeightAttrs bias_weight_attrs = WeightAttrs{ - /*tensor_shape=*/throw_if_unexpected(get_bias_shape(linear_attrs, input_shape)), - /*initializer=*/zero_init, + /*tensor_shape=*/throw_if_unexpected( + get_bias_shape(linear_attrs, input_shape)), + /*initializer=*/zero_init, }; ComputationGraph cg = [&] { auto make_layer_attrs = [](auto const &op_attrs) { return LayerAttrs{ - /*op_attrs=*/ComputationGraphOpAttrs{op_attrs}, - /*name=*/std::nullopt, + /*op_attrs=*/ComputationGraphOpAttrs{op_attrs}, + /*name=*/std::nullopt, }; }; - + ComputationGraph cg = make_empty_computation_graph(); LayerAddedResult input_added = add_input_layer(cg, input_shape); tensor_guid_t t_input = get_only(input_added.outputs); - LayerAddedResult projection_weight_added = add_layer(cg, - make_layer_attrs(projection_weight_attrs), - /*inputs=*/{}, - /*weights=*/{}); + LayerAddedResult projection_weight_added = + add_layer(cg, + make_layer_attrs(projection_weight_attrs), + /*inputs=*/{}, + /*weights=*/{}); tensor_guid_t t_projection = get_only(projection_weight_added.outputs); - LayerAddedResult bias_weight_added = add_layer(cg, - make_layer_attrs(bias_weight_attrs), - /*inputs=*/{}, - /*weights=*/{}); + LayerAddedResult bias_weight_added = + add_layer(cg, + make_layer_attrs(bias_weight_attrs), + /*inputs=*/{}, + /*weights=*/{}); tensor_guid_t t_bias = get_only(bias_weight_added.outputs); LayerAddedResult linear_added = @@ -65,7 +76,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*weights=*/{t_projection, t_bias}); tensor_guid_t t_linear = get_only(linear_added.outputs); - add_layer(cg, + add_layer(cg, make_layer_attrs(make_relu_attrs()), /*inputs=*/{t_linear}, /*weights=*/{}); @@ -76,26 +87,30 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph correct = [&] { auto make_layer_attrs = [](auto const &op_attrs) { return ParallelLayerAttrs{ - /*op_attrs=*/PCGOperatorAttrs{op_attrs}, - /*name=*/std::nullopt, + /*op_attrs=*/PCGOperatorAttrs{op_attrs}, + /*name=*/std::nullopt, }; }; ParallelComputationGraph pcg = empty_parallel_computation_graph(); - ParallelLayerAddedResult input_added = pcg_add_input_layer(pcg, input_shape); + ParallelLayerAddedResult input_added = + pcg_add_input_layer(pcg, input_shape); parallel_tensor_guid_t t_input = get_only(input_added.outputs); - ParallelLayerAddedResult projection_weight_added = add_parallel_layer(pcg, - make_layer_attrs(projection_weight_attrs), - /*inputs=*/{}, - /*weights=*/{}); - parallel_tensor_guid_t t_projection = get_only(projection_weight_added.outputs); + ParallelLayerAddedResult projection_weight_added = + add_parallel_layer(pcg, + make_layer_attrs(projection_weight_attrs), + /*inputs=*/{}, + /*weights=*/{}); + parallel_tensor_guid_t t_projection = + get_only(projection_weight_added.outputs); - ParallelLayerAddedResult bias_weight_added = add_parallel_layer(pcg, - make_layer_attrs(bias_weight_attrs), - /*inputs=*/{}, - /*weights=*/{}); + ParallelLayerAddedResult bias_weight_added = + add_parallel_layer(pcg, + make_layer_attrs(bias_weight_attrs), + /*inputs=*/{}, + /*weights=*/{}); parallel_tensor_guid_t t_bias = get_only(bias_weight_added.outputs); ParallelLayerAddedResult linear_added = diff --git a/lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc b/lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc index 360859ba37..87dd5e6cbd 100644 --- 
a/lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc +++ b/lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc @@ -31,26 +31,32 @@ LabelledOpenDataflowGraphView ParallelLayerAttrs n_attrs = g.at(n); - std::vector incoming_tensor_roles = get_incoming_tensor_roles(n_attrs.op_attrs, incoming_shapes.size()); + std::vector incoming_tensor_roles = + get_incoming_tensor_roles(n_attrs.op_attrs, incoming_shapes.size()); - auto incoming_shapes_with_role = [&](IncomingTensorRole role) -> std::vector { - return filtrans(zip(incoming_shapes, incoming_tensor_roles), - [&](std::pair const &t) -> std::optional { - if (t.second == role) { - return t.first; - } else { - return std::nullopt; - } - }); + auto incoming_shapes_with_role = + [&](IncomingTensorRole role) -> std::vector { + return filtrans( + zip(incoming_shapes, incoming_tensor_roles), + [&](std::pair const &t) + -> std::optional { + if (t.second == role) { + return t.first; + } else { + return std::nullopt; + } + }); }; - std::vector input_shapes = incoming_shapes_with_role(IncomingTensorRole::INPUT); - std::vector weight_shapes = incoming_shapes_with_role(IncomingTensorRole::WEIGHT); + std::vector input_shapes = + incoming_shapes_with_role(IncomingTensorRole::INPUT); + std::vector weight_shapes = + incoming_shapes_with_role(IncomingTensorRole::WEIGHT); - std::vector inferred_weight_shapes = + std::vector inferred_weight_shapes = get_weight_shapes(n_attrs.op_attrs, input_shapes); - assert (weight_shapes == inferred_weight_shapes); + assert(weight_shapes == inferred_weight_shapes); std::vector output_shapes = get_output_shapes(n_attrs.op_attrs, input_shapes); diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc index 9aec74e6f2..ad78695fbb 100644 --- a/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc @@ -69,17 +69,18 @@ TEST_SUITE(FF_TEST_SUITE) { std::string relu_match = "relu_match"; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - batch_size, in_channels, + TensorDims{ + FFOrdered{ + batch_size, + in_channels, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; SubParallelComputationGraph pcg = [&] { ParallelComputationGraphBuilder b; - + parallel_tensor_guid_t t = b.create_input_tensor(input_shape); t = b.parallel_partition(t, ff_dim_t{0_n}, batch_degree); t = b.dense(t, diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc index 187bc30e68..75bbbcae9e 100644 --- a/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc @@ -118,12 +118,13 @@ TEST_SUITE(FF_TEST_SUITE) { std::string relu_match = "relu_match"; TensorShape input_shape = TensorShape{ - TensorDims{ - FFOrdered{ - batch_size, in_channels, + TensorDims{ + FFOrdered{ + batch_size, + in_channels, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; SubParallelComputationGraph pcg = [&] { diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc index 
32658018ea..9b8e526c08 100644 --- a/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc @@ -74,8 +74,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelLayerAttrs n1_weight_attrs = ParallelLayerAttrs{ PCGOperatorAttrs{ WeightAttrs{ - get_reduced_shape(n1_weight_shape), - InitializerAttrs{ZeroInitializerAttrs{}}, + get_reduced_shape(n1_weight_shape), + InitializerAttrs{ZeroInitializerAttrs{}}, }, }, std::nullopt, diff --git a/lib/substitutions/test/src/substitutions/pcg_pattern.cc b/lib/substitutions/test/src/substitutions/pcg_pattern.cc index 5ce21d1619..8ba1fee873 100644 --- a/lib/substitutions/test/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/test/src/substitutions/pcg_pattern.cc @@ -20,19 +20,20 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int num_channels = 24_n; TensorShape a_shape = TensorShape{ - TensorDims{ - FFOrdered{ - batch_size, num_channels, + TensorDims{ + FFOrdered{ + batch_size, + num_channels, + }, }, - }, - DataType::FLOAT, + DataType::FLOAT, }; std::string a_name = "a"; - parallel_tensor_guid_t a_tensor = - builder.create_input_tensor(a_shape); - a_tensor = builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); + parallel_tensor_guid_t a_tensor = builder.create_input_tensor(a_shape); + a_tensor = + builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); nonnegative_int outDim = 16_n; std::string x_matmul_name = "x_matmul"; diff --git a/lib/utils/include/utils/containers/transform_until.h b/lib/utils/include/utils/containers/transform_until.h index 859b9706a5..7e4cd72e23 100644 --- a/lib/utils/include/utils/containers/transform_until.h +++ b/lib/utils/include/utils/containers/transform_until.h @@ -1,13 +1,13 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_TRANSFORM_UNTIL_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_TRANSFORM_UNTIL_H -#include -#include #include +#include +#include namespace FlexFlow { -template ::value_type> std::vector transform_until(std::vector const &ts, F &&f) { diff --git a/lib/utils/include/utils/containers/vector_of.h b/lib/utils/include/utils/containers/vector_of.h index df044fe401..47a892ff50 100644 --- a/lib/utils/include/utils/containers/vector_of.h +++ b/lib/utils/include/utils/containers/vector_of.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_VECTOR_OF_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_VECTOR_OF_H -#include #include +#include namespace FlexFlow { diff --git a/lib/utils/include/utils/expected.h b/lib/utils/include/utils/expected.h index 0465d81886..1ca6d0a846 100644 --- a/lib/utils/include/utils/expected.h +++ b/lib/utils/include/utils/expected.h @@ -8,14 +8,14 @@ namespace FlexFlow { -#define PROPAGATE_ERR(...) ({ \ - auto result = __VA_ARGS__; \ - if (!result.has_value()) { \ - return tl::unexpected(result.error()); \ - } \ - result.value(); \ -}) - +#define PROPAGATE_ERR(...) 
\ + ({ \ + auto result = __VA_ARGS__; \ + if (!result.has_value()) { \ + return tl::unexpected(result.error()); \ + } \ + result.value(); \ + }) template tl::unexpected error_msg(Args &&...args) { diff --git a/lib/utils/src/utils/containers/transform_until.cc b/lib/utils/src/utils/containers/transform_until.cc index c3dd92fe0d..a1dc7703a6 100644 --- a/lib/utils/src/utils/containers/transform_until.cc +++ b/lib/utils/src/utils/containers/transform_until.cc @@ -7,7 +7,6 @@ using T = value_type<0>; using T2 = value_type<1>; using F = std::function(T const &)>; -template - std::vector transform_until(std::vector const &, F &&); +template std::vector transform_until(std::vector const &, F &&); } // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/vector_of.cc b/lib/utils/src/utils/containers/vector_of.cc index 0f3ad54c67..451539248c 100644 --- a/lib/utils/src/utils/containers/vector_of.cc +++ b/lib/utils/src/utils/containers/vector_of.cc @@ -1,22 +1,18 @@ #include "utils/containers/vector_of.h" #include "utils/archetypes/value_type.h" -#include #include +#include namespace FlexFlow { using T = value_type<0>; -template - std::vector vector_of(std::vector const &); +template std::vector vector_of(std::vector const &); -template - std::vector vector_of(std::unordered_set const &); +template std::vector vector_of(std::unordered_set const &); -template - std::vector vector_of(std::set const &); +template std::vector vector_of(std::set const &); -template - std::vector vector_of(std::optional const &); +template std::vector vector_of(std::optional const &); } // namespace FlexFlow diff --git a/lib/utils/test/src/utils/containers/transform_until.cc b/lib/utils/test/src/utils/containers/transform_until.cc index 5b2ede725e..fd1095a2ba 100644 --- a/lib/utils/test/src/utils/containers/transform_until.cc +++ b/lib/utils/test/src/utils/containers/transform_until.cc @@ -1,13 +1,13 @@ #include "utils/containers/transform_until.h" +#include "test/utils/doctest/fmt/vector.h" #include "utils/exception.h" #include -#include "test/utils/doctest/fmt/vector.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("transform_until") { - auto f = [](int x) -> std::optional { + auto f = [](int x) -> std::optional { if (x >= 0) { return x + 1; } else { @@ -45,10 +45,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input container is empty") { std::vector input = {}; - std::vector result = transform_until(input, - [](int x) -> std::optional { - throw mk_runtime_error("err"); - }); + std::vector result = + transform_until(input, [](int x) -> std::optional { + throw mk_runtime_error("err"); + }); std::vector correct = {}; CHECK(result == correct);
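
Note on the lib/utils/include/utils/expected.h hunk: the reformatted PROPAGATE_ERR macro relies on the GCC/Clang statement-expression extension ({ ... }) so that, on success, the macro expression evaluates to the unwrapped value, while on failure the enclosing function returns early with the propagated error. The snippet below is a self-contained usage sketch only; the macro body is copied from the hunk above, but parse_positive, double_positive, and main are hypothetical names invented for illustration and are not part of this patch.

    // Usage sketch for a PROPAGATE_ERR-style macro (illustration only).
    // The macro body matches the hunk above and depends on the GCC/Clang
    // statement-expression extension ({ ... }).
    #include <string>
    #include <tl/expected.hpp>

    #define PROPAGATE_ERR(...)                                                \
      ({                                                                      \
        auto result = __VA_ARGS__;                                            \
        if (!result.has_value()) {                                            \
          return tl::unexpected(result.error());                              \
        }                                                                     \
        result.value();                                                       \
      })

    // Hypothetical helper: fails for non-positive inputs.
    tl::expected<int, std::string> parse_positive(int x) {
      if (x <= 0) {
        return tl::unexpected(std::string{"expected a positive value"});
      }
      return x;
    }

    // Hypothetical caller: on error PROPAGATE_ERR returns early with the same
    // error; on success the macro expression yields the contained int.
    tl::expected<int, std::string> double_positive(int x) {
      int value = PROPAGATE_ERR(parse_positive(x));
      return 2 * value;
    }

    int main() {
      return double_positive(3).value() == 6 ? 0 : 1;
    }

Since statement expressions are non-standard, a sketch like this is only portable to compilers that support the extension, which is presumably why the construct is kept behind a macro rather than a function.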
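
Note on the transform_until test hunk above: judging by the declaration in lib/utils/include/utils/containers/transform_until.h and the behaviour the test exercises (the callable returns a std::optional, and an empty input yields an empty result without the callable ever being invoked), transform_until appears to map the callable over the vector from left to right and stop at the first element for which it returns std::nullopt. The following stand-alone sketch illustrates that inferred behaviour; transform_until_sketch is a hypothetical reimplementation written for this note, not FlexFlow's actual utility.

    // Stand-alone sketch of the inferred transform_until behaviour
    // (an assumption based on the test above, not FlexFlow's implementation).
    #include <cassert>
    #include <optional>
    #include <type_traits>
    #include <vector>

    template <typename T, typename F>
    auto transform_until_sketch(std::vector<T> const &ts, F &&f)
        -> std::vector<
            typename std::invoke_result_t<F &, T const &>::value_type> {
      using Out = typename std::invoke_result_t<F &, T const &>::value_type;
      std::vector<Out> result;
      for (T const &t : ts) {
        std::optional<Out> maybe_mapped = f(t);
        if (!maybe_mapped.has_value()) {
          break; // stop at the first std::nullopt; later elements are skipped
        }
        result.push_back(*maybe_mapped);
      }
      return result;
    }

    int main() {
      auto f = [](int x) -> std::optional<int> {
        return (x >= 0) ? std::optional<int>{x + 1} : std::nullopt;
      };
      assert((transform_until_sketch<int>({1, 2, -3, 4}, f) ==
              std::vector<int>{2, 3}));                   // stops at -3
      assert(transform_until_sketch<int>({}, f).empty()); // empty in, empty out
      return 0;
    }

Under that reading, the empty-container subcase in the test never calls the callable at all, which is why it can safely pass a lambda that always throws.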