Merge branch 'main' into ziyue/fix-double-buffer
yzygitzh authored Dec 25, 2024
2 parents 09d9949 + 3e7801b commit 53c359e
Showing 14 changed files with 48 additions and 67 deletions.
2 changes: 2 additions & 0 deletions .azure-pipelines/integration-test-rocm.yml
@@ -1,10 +1,12 @@
trigger:
- main
- release/*

pr:
branches:
include:
- main
- release/*
drafts: false

jobs:
2 changes: 2 additions & 0 deletions .azure-pipelines/integration-test.yml
@@ -1,10 +1,12 @@
trigger:
- main
- release/*

pr:
branches:
include:
- main
- release/*
drafts: false

jobs:
1 change: 1 addition & 0 deletions .azure-pipelines/multi-nodes-test.yml
@@ -1,5 +1,6 @@
trigger:
- main
- release/*

# Do not run multi-nodes-test for PR, we can trigger it manually
pr: none
2 changes: 2 additions & 0 deletions .azure-pipelines/nccl-api-test.yaml
@@ -1,10 +1,12 @@
trigger:
- main
- release/*

pr:
branches:
include:
- main
- release/*
drafts: false

jobs:
2 changes: 2 additions & 0 deletions .azure-pipelines/ut.yml
@@ -1,10 +1,12 @@
trigger:
- main
- release/*

pr:
branches:
include:
- main
- release/*
drafts: false

jobs:
8 changes: 6 additions & 2 deletions .github/workflows/codeql-analysis.yml
@@ -2,9 +2,13 @@ name: "CodeQL"

on:
push:
branches: [ main ]
branches:
- main
- release/*
pull_request:
branches: [ main ]
branches:
- main
- release/*
schedule:
- cron: "30 1 * * 1"

5 changes: 2 additions & 3 deletions CMakeLists.txt
@@ -9,7 +9,7 @@ set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}")

cmake_minimum_required(VERSION 3.25)
enable_language(CXX)
project(mscclpp LANGUAGES CXX)

list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

@@ -61,7 +61,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
if(MSCCLPP_USE_CUDA)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra")
project(mscclpp LANGUAGES CXX CUDA)
enable_language(CUDA)

# CUDA 11 or higher is required
if(CUDAToolkit_VERSION_MAJOR LESS 11)
@@ -83,7 +83,6 @@ if(MSCCLPP_USE_CUDA)
else()
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")
project(mscclpp LANGUAGES CXX)

set(CMAKE_HIP_ARCHITECTURES gfx90a gfx941 gfx942)

6 changes: 3 additions & 3 deletions README.md
@@ -6,9 +6,9 @@

| Pipelines | Build Status |
|--------------------------|-------------------|
| Unit Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-ut?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=4&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-test?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=3&branchName=main) |
| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=7&branchName=main) |
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=399295&branchName=main) |

A GPU-driven communication stack for scalable AI applications.

5 changes: 0 additions & 5 deletions apps/nccl/src/broadcast.hpp
@@ -17,12 +17,7 @@ __global__ void __launch_bounds__(1024, 1)
broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root,
size_t nRanksPerNode, size_t nelemsPerGPU) {
const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const size_t lid = tid % WARP_SIZE;
const size_t wid = tid / WARP_SIZE;

const size_t nThread = blockDim.x * gridDim.x;
const size_t nWarp = nThread / WARP_SIZE;
const size_t nPeer = nRanksPerNode - 1;
const size_t chanOffset = nPeer * blockIdx.x;

37 changes: 0 additions & 37 deletions apps/nccl/src/nccl.cu
@@ -118,43 +118,6 @@ static size_t ncclTypeSize(ncclDataType_t type) {
return 0;
}

static double parseSize(const char* value) {
std::string valueStr(value);
std::istringstream iss(valueStr);
long long int units;
double size;
char size_lit = 0;

if (iss >> size) {
iss >> std::ws; // eat whitespace
iss >> size_lit;
} else {
return -1.0;
}

if (size_lit != 0 && !std::isspace(size_lit)) {
switch (size_lit) {
case 'G':
case 'g':
units = 1024 * 1024 * 1024;
break;
case 'M':
case 'm':
units = 1024 * 1024;
break;
case 'K':
case 'k':
units = 1024;
break;
default:
return -1.0;
};
} else {
units = 1;
}
return size * units;
}

static mscclpp::Transport getTransport(int, int) {
// if (rank / nRanksPerNode == peerRank / nRanksPerNode) {
// return mscclpp::Transport::CudaIpc;
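
The removed parseSize helper accepted a numeric string with an optional K/M/G suffix (interpreted as powers of 1024) and returned -1.0 for anything it could not parse. For reference, a self-contained restatement of that rule; the name parseSizeSketch and the demo main are illustrative only and not part of the codebase:

#include <iostream>
#include <sstream>
#include <string>

// Illustrative restatement of the parsing rule in the removed parseSize():
// a number followed by an optional K/M/G suffix, interpreted as powers of 1024.
double parseSizeSketch(const std::string& value) {
  std::istringstream iss(value);
  double size;
  char suffix = 0;
  if (!(iss >> size)) return -1.0;   // not a number at all
  iss >> std::ws >> suffix;          // optional unit suffix
  long long units = 1;
  switch (suffix) {
    case 'G': case 'g': units = 1024LL * 1024 * 1024; break;
    case 'M': case 'm': units = 1024LL * 1024; break;
    case 'K': case 'k': units = 1024LL; break;
    case 0:   units = 1; break;      // plain byte count
    default:  return -1.0;           // unknown suffix
  }
  return size * units;
}

int main() {
  std::cout << static_cast<long long>(parseSizeSketch("4M")) << "\n";   // 4194304
  std::cout << static_cast<long long>(parseSizeSketch("512")) << "\n";  // 512
  std::cout << static_cast<long long>(parseSizeSketch("2x")) << "\n";   // -1
}
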
2 changes: 1 addition & 1 deletion cmake/AddFormatTargets.cmake
@@ -9,7 +9,7 @@ add_custom_target(format)
find_program(CLANG_FORMAT clang-format)
if(CLANG_FORMAT)
message(STATUS "Found clang-format: ${CLANG_FORMAT}")
set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps)
set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps/nccl/src)
add_custom_target(check-format-cpp ALL
COMMAND ${CLANG_FORMAT} -style=file --dry-run `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu`
)
14 changes: 8 additions & 6 deletions include/mscclpp/proxy_channel_device.hpp
@@ -92,10 +92,11 @@ struct BaseProxyChannelDeviceHandle {
// can produce for and the sole proxy thread consumes it.
FifoDeviceHandle fifo_;

BaseProxyChannelDeviceHandle() {}
MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle() {}

BaseProxyChannelDeviceHandle(SemaphoreId semaphoreId, Host2DeviceSemaphoreDeviceHandle semaphore,
FifoDeviceHandle fifo)
MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle(SemaphoreId semaphoreId,
Host2DeviceSemaphoreDeviceHandle semaphore,
FifoDeviceHandle fifo)
: semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo) {}

#if defined(MSCCLPP_DEVICE_COMPILE)
@@ -185,10 +186,11 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle {
MemoryId dst_;
MemoryId src_;

ProxyChannelDeviceHandle(){};
MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(){};

ProxyChannelDeviceHandle(SemaphoreId semaphoreId, Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo,
MemoryId dst, MemoryId src)
MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(SemaphoreId semaphoreId,
Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo,
MemoryId dst, MemoryId src)
: BaseProxyChannelDeviceHandle(semaphoreId, semaphore, fifo), dst_(dst), src_(src) {}

#if defined(MSCCLPP_DEVICE_COMPILE)
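
The constructors in the hunks above gain the MSCCLPP_HOST_DEVICE_INLINE qualifier, presumably so the channel handles can be constructed from device code as well as host code. A minimal sketch of that pattern, assuming an illustrative macro and struct name rather than the real mscclpp definitions:

#include <cstdio>

// Illustrative macro: expand to host+device when compiled by nvcc/hipcc,
// and to plain inline for host-only translation units.
#if defined(__CUDACC__) || defined(__HIPCC__)
#define SKETCH_HOST_DEVICE_INLINE __forceinline__ __host__ __device__
#else
#define SKETCH_HOST_DEVICE_INLINE inline
#endif

struct HandleSketch {
  // Without the host/device annotations, these user-provided constructors are
  // host-only, so a kernel that default-constructs a HandleSketch fails to compile.
  SKETCH_HOST_DEVICE_INLINE HandleSketch() {}
  SKETCH_HOST_DEVICE_INLINE explicit HandleSketch(int semaphoreId) : semaphoreId_(semaphoreId) {}

  int semaphoreId_ = 0;
};

int main() {
  HandleSketch h(42);  // still constructible from host code as before
  std::printf("%d\n", h.semaphoreId_);
  return 0;
}
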
27 changes: 18 additions & 9 deletions src/executor/execution_plan.cc
@@ -525,8 +525,9 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
}
}

std::pair<size_t, u_int32_t> ExecutionPlan::Impl::calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const {
std::pair<size_t, u_int32_t> sizePerRank;
std::pair<size_t, uint32_t> ExecutionPlan::Impl::getSizeAndChunksForRank(int rank, size_t inputSize,
size_t outputSize) const {
std::pair<size_t, uint32_t> sizePerRank;
if (this->inputChunks.at(rank) == 0 && this->outputChunks.at(rank) == 0) {
throw mscclpp::Error("Output or Input chunks must be greater than 0", mscclpp::ErrorCode::ExecutorError);
} else if (this->inputChunks.at(rank) != 0 && this->outputChunks.at(rank) != 0) {
@@ -549,15 +550,15 @@ size_t ExecutionPlan::Impl::getOffset(int rank, size_t inputSize, size_t outputS
}

const int nGroups = this->chunkGroups.at(rank);
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
uint32_t nInputChunks = sizePerRank.second;
uint32_t nelems = sizePerRank.first / (alignment * sizeof(uint8_t));
auto rankSizeAndChunks = getSizeAndChunksForRank(rank, inputSize, outputSize);
uint32_t nChunks = rankSizeAndChunks.second;
uint32_t nelems = rankSizeAndChunks.first / (alignment * sizeof(uint8_t));
if (nelems % nGroups != 0) {
throw Error("Input size must be a multiple of nGroups", ErrorCode::ExecutorError);
}

int nelemsPerGroup = nelems / nGroups;
int nChunksPerGroup = nInputChunks / nGroups;
int nChunksPerGroup = nChunks / nGroups;
uint32_t minNelems = nelemsPerGroup / nChunksPerGroup;
uint32_t remainder = nelemsPerGroup % nChunksPerGroup;
uint32_t groupIdx = chunkIndex / nChunksPerGroup;
@@ -583,9 +584,17 @@ size_t ExecutionPlan::Impl::getNChunkSize(int rank, size_t inputSize, size_t out
}

size_t ExecutionPlan::Impl::getUpperBoundChunkSize(int rank, size_t inputSize, size_t outputSize) const {
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
uint32_t nChunks = sizePerRank.second;
return (sizePerRank.first + nChunks - 1) / nChunks;
size_t nInputChunks = this->inputChunks.at(rank);
size_t nOutputChunks = this->outputChunks.at(rank);
size_t inputChunkSize = 0;
size_t outputChunkSize = 0;
if (nInputChunks != 0) {
inputChunkSize = inputSize / nInputChunks;
}
if (nOutputChunks != 0) {
outputChunkSize = outputSize / nOutputChunks;
}
return std::max(inputChunkSize, outputChunkSize);
}

void ExecutionPlan::Impl::reset() {
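
getUpperBoundChunkSize now takes the larger of the per-chunk sizes on the input and output sides, guarding against a zero chunk count on either side, instead of dividing the per-rank size by the total chunk count. A standalone restatement of the new rule with a worked example; the free function upperBoundChunkSize is hypothetical, whereas the real code is a member of ExecutionPlan::Impl that reads the chunk counts for the given rank:

#include <algorithm>
#include <cstddef>
#include <iostream>

// Hypothetical free-function form of the rule above; the real code reads
// this->inputChunks / this->outputChunks for the given rank.
size_t upperBoundChunkSize(size_t inputSize, size_t nInputChunks,
                           size_t outputSize, size_t nOutputChunks) {
  size_t inputChunkSize = (nInputChunks != 0) ? inputSize / nInputChunks : 0;
  size_t outputChunkSize = (nOutputChunks != 0) ? outputSize / nOutputChunks : 0;
  return std::max(inputChunkSize, outputChunkSize);
}

int main() {
  // 1 MiB input split into 4 chunks, 4 MiB output split into 8 chunks:
  // max(256 KiB, 512 KiB) = 512 KiB.
  std::cout << upperBoundChunkSize(1 << 20, 4, 4 << 20, 8) << "\n";  // 524288
  return 0;
}
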
2 changes: 1 addition & 1 deletion src/include/execution_plan.hpp
@@ -117,7 +117,7 @@ struct ExecutionPlan::Impl {
bool isInPlace;

private:
std::pair<size_t, u_int32_t> calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const;
std::pair<size_t, uint32_t> getSizeAndChunksForRank(int rank, size_t inputSize, size_t outputSize) const;
size_t getOffset(int rank, size_t inputSize, size_t outputSize, uint32_t chunkIndex, uint32_t alignment = 16) const;
size_t getNChunkSize(int rank, size_t inputSize, size_t outputSize, uint32_t nChunks,
const std::vector<uint32_t> offsets) const;
