【Comm】switch c_allreduce_sum #70892

Status: Open. Wants to merge 8 commits into base: develop.
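This PR switches the dedicated c_allreduce_sum operator over to the generic all_reduce operator, carrying the reduction kind in a reduce_type attribute and lower-casing the x/out argument names; the old CPU/GPU/KP kernel registrations for c_allreduce_sum are removed. The sketch below only illustrates the before/after pattern applied throughout the diff; it is not code from this PR, and block, grad_var, and ring_id are placeholder names.

import paddle.distributed as dist


def append_grad_allreduce(block, grad_var, ring_id):
    # Before this PR: a dedicated sum op with upper-case argument names.
    #   block.append_op(type='c_allreduce_sum',
    #                   inputs={'X': [grad_var]},
    #                   outputs={'Out': [grad_var]},
    #                   attrs={'ring_id': ring_id, 'use_calc_stream': True})
    # After this PR: one generic all_reduce op; the reduction kind is an attribute.
    return block.append_op(
        type='all_reduce',
        inputs={'x': [grad_var]},
        outputs={'out': [grad_var]},
        attrs={'ring_id': ring_id, 'reduce_type': dist.ReduceOp.SUM},
    )

Downstream checks (for example the stream switch in hogwild_worker.cc below) then key off the reduce_type attribute rather than the op name.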
9 changes: 5 additions & 4 deletions paddle/fluid/framework/hogwild_worker.cc
@@ -785,10 +785,11 @@ void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) {
op_names_.push_back(op_name);
ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
// Run these collective ops on the calc (device) stream instead of the comm stream.
if (op_name == "c_broadcast" || op_name == "c_allreduce_sum" ||
(op_name == "reduce" &&
op_desc->GetAttrIfExists<int>("reduce_type") ==
static_cast<int>(phi::ReduceType::kRedSum))) {
if (op_name == "c_broadcast" ||
    ((op_name == "reduce" || op_name == "all_reduce") &&
     op_desc->GetAttrIfExists<int>("reduce_type") ==
         static_cast<int>(phi::ReduceType::kRedSum))) {
ops_[op_index]->SetAttr("use_calc_stream", true);
}
op_index++;
10 changes: 0 additions & 10 deletions paddle/fluid/operators/collective/c_allreduce_sum_op.cc
@@ -63,13 +63,3 @@ REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_sum,
ops::CAllReduceOp,
ops::CAllReduceSumOpMaker,
ops::AllreduceSumInplaceInferer)

PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum,
CPU,
ALL_LAYOUT,
ops::CAllReduceSumCPUKernel,
float,
double,
int,
int64_t,
phi::dtype::float16) {}
45 changes: 10 additions & 35 deletions paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
@@ -1,38 +1,13 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// /* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0
// http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/c_allreduce_op.h"

namespace paddle {
namespace operators {
DEFINE_C_ALLREDUCE_CUDA_KERNEL(CAllReduceSum, kRedSum)
} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;

PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum,
GPU,
ALL_LAYOUT,
ops::CAllReduceSumCUDAKernel,
float,
#if (NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000) || \
defined(PADDLE_WITH_HIP)
phi::dtype::bfloat16,
#endif
double,
int,
int64_t,
phi::dtype::float16) {
}
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License. */
30 changes: 0 additions & 30 deletions paddle/fluid/operators/collective/c_allreduce_sum_op.kps
@@ -11,33 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_XPU_KP

// Please do not modify the following code
#if defined(__CUDA_ARCH__)
#undef __CUDA_ARCH__
#endif

#if defined(__CUDACC__)
#undef __CUDACC__
#endif

#if defined(__CUDA__)
#undef __CUDA__
#endif

#if defined(__NVCC__)
#undef __NVCC__
#endif

#include "paddle/fluid/operators/collective/c_allreduce_op.h"

namespace ops = paddle::operators;

REGISTER_OP_KERNEL(c_allreduce_sum,
KP,
phi::XPUPlace,
ops::CAllReduceOpXPUKernel<ops::kRedSum, float>);

#endif
16 changes: 10 additions & 6 deletions python/paddle/distributed/auto_parallel/static/cost/base_cost.py
@@ -29,7 +29,7 @@
"recv_v2",
"broadcast",
"all_gather",
"c_allreduce_sum",
"all_reduce",
"c_identity",
]
NON_COMP_TYPE = ["while", *COMM_OP_TYPE]
@@ -416,19 +416,19 @@ def build_dp_costs(
if not has_found:
return

c_allreduce_sum_descs = build_comm_desc_from_dist_op(
"c_allreduce_sum",
all_reduce_sum_descs = build_comm_desc_from_dist_op(
"all_reduce",
dist_op,
ctx,
var_names,
attrs=attrs,
parallel_axis=parallel_axis,
)
comm_cost_list = build_comm_costs_from_descs(
_g_op_cost_factory["c_allreduce_sum"],
_g_op_cost_factory["all_reduce"],
ctx,
processes,
c_allreduce_sum_descs,
all_reduce_sum_descs,
cluster,
is_dp=True,
)
@@ -789,7 +789,11 @@ def comm_count(self):
try:
var_name = self.op.input("X")[0]
except:
var_name = self.op.output("Out")[0]
try:
var_name = self.op.output("Out")[0]
except:
var_name = self.op.output("out")[0]

var = get_var_with_recursion(
var_name, self.op.block, self.op.block.program
)
@@ -23,7 +23,7 @@

@register_op_cost
class AllreduceSumOpCost(CommOpCost):
OP_TYPE = "c_allreduce_sum"
OP_TYPE = "all_reduce"

def __init__(self, op=None, op_desc=None, comm_context=None):
super().__init__(op=op, op_desc=op_desc, comm_context=comm_context)
9 changes: 6 additions & 3 deletions python/paddle/distributed/auto_parallel/static/mapper.py
@@ -35,13 +35,15 @@ def is_collective_comm_op(op):
"all_reduce",
"broadcast",
]
reduce_tyep = [
reduce_type = [
dist.ReduceOp.SUM,
dist.ReduceOp.MIN,
dist.ReduceOp.MAX,
dist.ReduceOp.PROD,
]
if op.type == "reduce" and op.attr("reduce_tyep") in reduce_tyep:
if (op.type == "reduce" or op.type == "all_reduce") and op.attr(
"reduce_type"
) in reduce_type:
return True
if op.type in comm_list:
return True
@@ -104,7 +106,8 @@ def get_comm_volume(comm_op, src_rank, tgt_rank):
new_tensor_shape.append(val)
tensor_size = functools.reduce(operator.mul, new_tensor_shape, 1)
tensor_bytes = tensor_size * get_dtype_bytes(tensor.dtype)
if "c_allreduce" in comm_op_type:
print("lzx debug comm_op_type:", comm_op_type)
if "all_reduce" in comm_op_type:
comm_volume = 2 * tensor_bytes
elif "all_gather" in comm_op_type:
comm_volume = tensor_bytes
26 changes: 15 additions & 11 deletions python/paddle/distributed/auto_parallel/static/operators/common.py
@@ -511,7 +511,8 @@ def sync_and_scale_gradients(dist_ctx, op, groups, allreduce_var_names):
dist_op_context = dist_ctx.dist_op_context
main_block = dist_op_context.work_block

allreduce_type = "c_allreduce_sum"
# allreduce_type = "c_allreduce_sum"
reduce_type = dist.ReduceOp.SUM
need_scale = dist_ctx.gradient_scale
scale_using_allreduce_avg = dist_ctx.gradient_scale_using_allreduce_avg

@@ -521,7 +522,8 @@ def sync_and_scale_gradients(dist_ctx, op, groups, allreduce_var_names):
and scale_using_allreduce_avg
and int(paddle.version.nccl()) > 21000
):
allreduce_type = "c_allreduce_avg"
# allreduce_type = "c_allreduce_avg"
reduce_type = dist.ReduceOp.AVG
need_scale = False

for group in groups:
Expand All @@ -531,13 +533,13 @@ def sync_and_scale_gradients(dist_ctx, op, groups, allreduce_var_names):
added_ops = []
grad_var = main_block.var(var_name)
allreduce_op = main_block.append_op(
type=allreduce_type,
inputs={'X': [grad_var]},
outputs={'Out': [grad_var]},
type='all_reduce',
inputs={'x': [grad_var]},
outputs={'out': [grad_var]},
attrs={
'ring_id': group.id,
'use_calc_stream': True,
OP_ROLE_KEY: OpRole.Backward,
'reduce_type': int(reduce_type),
},
)
allreduce_op._set_attr(
@@ -670,16 +672,18 @@ def is_data_parallel_scale_op(op):


def is_data_parallel_reduce_op(op):
is_allreduce_op = op.type in [
"c_allreduce_sum",
"c_allreduce_avg",
is_all_reduce_op = op.type == "all_reduce" and op.desc.attr(
"reduce_type"
) in [
dist.ReduceOp.SUM,
dist.ReduceOp.AVG,
]
is_reduce_op = op.type == "reduce" and op.desc.attr("reduce_type") in [
dist.ReduceOp.SUM,
dist.ReduceOp.AVG,
]
return (
(is_allreduce_op or is_reduce_op)
(is_all_reduce_op or is_reduce_op)
and op.desc.has_attr("op_namescope")
and ParallelMode.DataParallel in op.desc.attr("op_namescope")
)
@@ -695,7 +699,7 @@ def is_amp_flag_sync_op(op):

def is_global_norm_sync_op(op):
return (
op.type == "c_allreduce_sum"
op.type == "all_reduce"
and op.desc.has_attr("op_namescope")
and SyncMode.GlobalNormSync in op.desc.attr("op_namescope")
)
@@ -13,6 +13,7 @@
# limitations under the License


import paddle.distributed as dist
from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole

from ..completion import contains_spmd_rule, get_phi_spmd_rule
Expand Down Expand Up @@ -67,12 +68,12 @@ def prim_operator_data_parallel_functor(ctx, src_op):
sync_group = new_process_group(ctx.data_parallel_group)

allreduce_op = main_block.append_op(
type='c_allreduce_sum',
inputs={'X': [var_name]},
outputs={'Out': [var_name]},
type='all_reduce',
inputs={'x': [var_name]},
outputs={'out': [var_name]},
attrs={
'ring_id': sync_group.id,
'use_calc_stream': True,
'reduce_type': dist.ReduceOp.SUM,
OP_ROLE_KEY: OpRole.Backward,
},
)
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

import paddle.distributed as dist
from paddle.common_ops_import import check_variable_and_dtype
from paddle.distributed.auto_parallel.static.cost.comm_op_cost import (
AllreduceSumOpCost,
Expand Down Expand Up @@ -249,7 +249,7 @@ def calc_fwd_cost(self, dist_op, ctx, cluster):
attrs = {"use_calc_stream": True, "use_model_parallel": True}
var_names = serial_op.output("Out")
c_allreduce_sum_desc_mapping = build_comm_desc_from_dist_op(
"c_allreduce_sum",
"all_reduce",
dist_op,
ctx,
var_names,
Expand Down Expand Up @@ -511,12 +511,12 @@ def forward(ctx, *args, **kwargs):

# use_model_parallel
c_allreduce_sum_op = main_block.append_op(
type='c_allreduce_sum',
inputs={'X': [Out_var]},
outputs={'Out': [Out_var]},
type='all_reduce',
inputs={'x': [Out_var]},
outputs={'out': [Out_var]},
attrs={
'ring_id': group.id,
'use_calc_stream': True,
'reduce_type': dist.ReduceOp.SUM,
'use_model_parallel': True,
OP_ROLE_KEY: src_op.attr('op_role'),
},