Merge branch 'main' into ziyue/fix-double-buffer
yzygitzh authored Dec 25, 2024
2 parents 09d9949 + 3e7801b commit 53c359e
Showing 14 changed files with 48 additions and 67 deletions.
2 changes: 2 additions & 0 deletions .azure-pipelines/integration-test-rocm.yml
@@ -1,10 +1,12 @@
trigger:
- main
- release/*

pr:
branches:
include:
- main
- release/*
drafts: false

jobs:
2 changes: 2 additions & 0 deletions .azure-pipelines/integration-test.yml
@@ -1,10 +1,12 @@
trigger:
- main
- release/*

pr:
branches:
include:
- main
- release/*
drafts: false

jobs:
1 change: 1 addition & 0 deletions .azure-pipelines/multi-nodes-test.yml
@@ -1,5 +1,6 @@
trigger:
- main
- release/*

# Do not run multi-nodes-test for PR, we can trigger it manually
pr: none
2 changes: 2 additions & 0 deletions .azure-pipelines/nccl-api-test.yaml
@@ -1,10 +1,12 @@
trigger:
- main
- release/*

pr:
branches:
include:
- main
- release/*
drafts: false

jobs:
2 changes: 2 additions & 0 deletions .azure-pipelines/ut.yml
@@ -1,10 +1,12 @@
trigger:
- main
- release/*

pr:
branches:
include:
- main
- release/*
drafts: false

jobs:
8 changes: 6 additions & 2 deletions .github/workflows/codeql-analysis.yml
@@ -2,9 +2,13 @@ name: "CodeQL"

on:
push:
branches: [ main ]
branches:
- main
- release/*
pull_request:
branches: [ main ]
branches:
- main
- release/*
schedule:
- cron: "30 1 * * 1"

5 changes: 2 additions & 3 deletions CMakeLists.txt
@@ -9,7 +9,7 @@ set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}")

cmake_minimum_required(VERSION 3.25)
enable_language(CXX)
project(mscclpp LANGUAGES CXX)

list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

@@ -61,7 +61,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
if(MSCCLPP_USE_CUDA)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra")
project(mscclpp LANGUAGES CXX CUDA)
enable_language(CUDA)

# CUDA 11 or higher is required
if(CUDAToolkit_VERSION_MAJOR LESS 11)
@@ -83,7 +83,6 @@ if(MSCCLPP_USE_CUDA)
else()
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")
project(mscclpp LANGUAGES CXX)

set(CMAKE_HIP_ARCHITECTURES gfx90a gfx941 gfx942)

6 changes: 3 additions & 3 deletions README.md
@@ -6,9 +6,9 @@

| Pipelines | Build Status |
|--------------------------|-------------------|
| Unit Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-ut?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=4&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-test?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=3&branchName=main) |
| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=7&branchName=main) |
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=399295&branchName=main) |

A GPU-driven communication stack for scalable AI applications.

5 changes: 0 additions & 5 deletions apps/nccl/src/broadcast.hpp
@@ -17,12 +17,7 @@ __global__ void __launch_bounds__(1024, 1)
broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root,
size_t nRanksPerNode, size_t nelemsPerGPU) {
const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const size_t lid = tid % WARP_SIZE;
const size_t wid = tid / WARP_SIZE;

const size_t nThread = blockDim.x * gridDim.x;
const size_t nWarp = nThread / WARP_SIZE;
const size_t nPeer = nRanksPerNode - 1;
const size_t chanOffset = nPeer * blockIdx.x;

37 changes: 0 additions & 37 deletions apps/nccl/src/nccl.cu
@@ -118,43 +118,6 @@ static size_t ncclTypeSize(ncclDataType_t type) {
return 0;
}

static double parseSize(const char* value) {
std::string valueStr(value);
std::istringstream iss(valueStr);
long long int units;
double size;
char size_lit = 0;

if (iss >> size) {
iss >> std::ws; // eat whitespace
iss >> size_lit;
} else {
return -1.0;
}

if (size_lit != 0 && !std::isspace(size_lit)) {
switch (size_lit) {
case 'G':
case 'g':
units = 1024 * 1024 * 1024;
break;
case 'M':
case 'm':
units = 1024 * 1024;
break;
case 'K':
case 'k':
units = 1024;
break;
default:
return -1.0;
};
} else {
units = 1;
}
return size * units;
}

static mscclpp::Transport getTransport(int, int) {
// if (rank / nRanksPerNode == peerRank / nRanksPerNode) {
// return mscclpp::Transport::CudaIpc;
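
The removed parseSize helper accepted a numeric string with an optional K/M/G suffix (interpreted as powers of 1024) and returned -1.0 for anything it could not parse. For reference, a self-contained restatement of that rule; the name parseSizeSketch and the demo main are illustrative only and not part of the codebase:

#include <iostream>
#include <sstream>
#include <string>

// Illustrative restatement of the parsing rule in the removed parseSize():
// a number followed by an optional K/M/G suffix, interpreted as powers of 1024.
double parseSizeSketch(const std::string& value) {
  std::istringstream iss(value);
  double size;
  char suffix = 0;
  if (!(iss >> size)) return -1.0;   // not a number at all
  iss >> std::ws >> suffix;          // optional unit suffix
  long long units = 1;
  switch (suffix) {
    case 'G': case 'g': units = 1024LL * 1024 * 1024; break;
    case 'M': case 'm': units = 1024LL * 1024; break;
    case 'K': case 'k': units = 1024LL; break;
    case 0:   units = 1; break;      // plain byte count
    default:  return -1.0;           // unknown suffix
  }
  return size * units;
}

int main() {
  std::cout << static_cast<long long>(parseSizeSketch("4M")) << "\n";   // 4194304
  std::cout << static_cast<long long>(parseSizeSketch("512")) << "\n";  // 512
  std::cout << static_cast<long long>(parseSizeSketch("2x")) << "\n";   // -1
}
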
2 changes: 1 addition & 1 deletion cmake/AddFormatTargets.cmake
@@ -9,7 +9,7 @@ add_custom_target(format)
find_program(CLANG_FORMAT clang-format)
if(CLANG_FORMAT)
message(STATUS "Found clang-format: ${CLANG_FORMAT}")
set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps)
set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps/nccl/src)
add_custom_target(check-format-cpp ALL
COMMAND ${CLANG_FORMAT} -style=file --dry-run `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu`
)
14 changes: 8 additions & 6 deletions include/mscclpp/proxy_channel_device.hpp
@@ -92,10 +92,11 @@ struct BaseProxyChannelDeviceHandle {
// can produce for and the sole proxy thread consumes it.
FifoDeviceHandle fifo_;

BaseProxyChannelDeviceHandle() {}
MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle() {}

BaseProxyChannelDeviceHandle(SemaphoreId semaphoreId, Host2DeviceSemaphoreDeviceHandle semaphore,
FifoDeviceHandle fifo)
MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle(SemaphoreId semaphoreId,
Host2DeviceSemaphoreDeviceHandle semaphore,
FifoDeviceHandle fifo)
: semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo) {}

#if defined(MSCCLPP_DEVICE_COMPILE)
@@ -185,10 +186,11 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle {
MemoryId dst_;
MemoryId src_;

ProxyChannelDeviceHandle(){};
MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(){};

ProxyChannelDeviceHandle(SemaphoreId semaphoreId, Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo,
MemoryId dst, MemoryId src)
MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(SemaphoreId semaphoreId,
Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo,
MemoryId dst, MemoryId src)
: BaseProxyChannelDeviceHandle(semaphoreId, semaphore, fifo), dst_(dst), src_(src) {}

#if defined(MSCCLPP_DEVICE_COMPILE)
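
The constructors in the hunks above gain the MSCCLPP_HOST_DEVICE_INLINE qualifier, presumably so the channel handles can be constructed from device code as well as host code. A minimal sketch of that pattern, assuming an illustrative macro and struct name rather than the real mscclpp definitions:

#include <cstdio>

// Illustrative macro: expand to host+device when compiled by nvcc/hipcc,
// and to plain inline for host-only translation units.
#if defined(__CUDACC__) || defined(__HIPCC__)
#define SKETCH_HOST_DEVICE_INLINE __forceinline__ __host__ __device__
#else
#define SKETCH_HOST_DEVICE_INLINE inline
#endif

struct HandleSketch {
  // Without the host/device annotations, these user-provided constructors are
  // host-only, so a kernel that default-constructs a HandleSketch fails to compile.
  SKETCH_HOST_DEVICE_INLINE HandleSketch() {}
  SKETCH_HOST_DEVICE_INLINE explicit HandleSketch(int semaphoreId) : semaphoreId_(semaphoreId) {}

  int semaphoreId_ = 0;
};

int main() {
  HandleSketch h(42);  // still constructible from host code as before
  std::printf("%d\n", h.semaphoreId_);
  return 0;
}
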
27 changes: 18 additions & 9 deletions src/executor/execution_plan.cc
@@ -525,8 +525,9 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
}
}

std::pair<size_t, u_int32_t> ExecutionPlan::Impl::calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const {
std::pair<size_t, u_int32_t> sizePerRank;
std::pair<size_t, uint32_t> ExecutionPlan::Impl::getSizeAndChunksForRank(int rank, size_t inputSize,
size_t outputSize) const {
std::pair<size_t, uint32_t> sizePerRank;
if (this->inputChunks.at(rank) == 0 && this->outputChunks.at(rank) == 0) {
throw mscclpp::Error("Output or Input chunks must be greater than 0", mscclpp::ErrorCode::ExecutorError);
} else if (this->inputChunks.at(rank) != 0 && this->outputChunks.at(rank) != 0) {
@@ -549,15 +550,15 @@ size_t ExecutionPlan::Impl::getOffset(int rank, size_t inputSize, size_t outputS
}

const int nGroups = this->chunkGroups.at(rank);
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
uint32_t nInputChunks = sizePerRank.second;
uint32_t nelems = sizePerRank.first / (alignment * sizeof(uint8_t));
auto rankSizeAndChunks = getSizeAndChunksForRank(rank, inputSize, outputSize);
uint32_t nChunks = rankSizeAndChunks.second;
uint32_t nelems = rankSizeAndChunks.first / (alignment * sizeof(uint8_t));
if (nelems % nGroups != 0) {
throw Error("Input size must be a multiple of nGroups", ErrorCode::ExecutorError);
}

int nelemsPerGroup = nelems / nGroups;
int nChunksPerGroup = nInputChunks / nGroups;
int nChunksPerGroup = nChunks / nGroups;
uint32_t minNelems = nelemsPerGroup / nChunksPerGroup;
uint32_t remainder = nelemsPerGroup % nChunksPerGroup;
uint32_t groupIdx = chunkIndex / nChunksPerGroup;
@@ -583,9 +584,17 @@ size_t ExecutionPlan::Impl::getNChunkSize(int rank, size_t inputSize, size_t out
}

size_t ExecutionPlan::Impl::getUpperBoundChunkSize(int rank, size_t inputSize, size_t outputSize) const {
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
uint32_t nChunks = sizePerRank.second;
return (sizePerRank.first + nChunks - 1) / nChunks;
size_t nInputChunks = this->inputChunks.at(rank);
size_t nOutputChunks = this->outputChunks.at(rank);
size_t inputChunkSize = 0;
size_t outputChunkSize = 0;
if (nInputChunks != 0) {
inputChunkSize = inputSize / nInputChunks;
}
if (nOutputChunks != 0) {
outputChunkSize = outputSize / nOutputChunks;
}
return std::max(inputChunkSize, outputChunkSize);
}

void ExecutionPlan::Impl::reset() {
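
getUpperBoundChunkSize now takes the larger of the per-chunk sizes on the input and output sides, guarding against a zero chunk count on either side, instead of dividing the per-rank size by the total chunk count. A standalone restatement of the new rule with a worked example; the free function upperBoundChunkSize is hypothetical, whereas the real code is a member of ExecutionPlan::Impl that reads the chunk counts for the given rank:

#include <algorithm>
#include <cstddef>
#include <iostream>

// Hypothetical free-function form of the rule above; the real code reads
// this->inputChunks / this->outputChunks for the given rank.
size_t upperBoundChunkSize(size_t inputSize, size_t nInputChunks,
                           size_t outputSize, size_t nOutputChunks) {
  size_t inputChunkSize = (nInputChunks != 0) ? inputSize / nInputChunks : 0;
  size_t outputChunkSize = (nOutputChunks != 0) ? outputSize / nOutputChunks : 0;
  return std::max(inputChunkSize, outputChunkSize);
}

int main() {
  // 1 MiB input split into 4 chunks, 4 MiB output split into 8 chunks:
  // max(256 KiB, 512 KiB) = 512 KiB.
  std::cout << upperBoundChunkSize(1 << 20, 4, 4 << 20, 8) << "\n";  // 524288
  return 0;
}
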
2 changes: 1 addition & 1 deletion src/include/execution_plan.hpp
@@ -117,7 +117,7 @@ struct ExecutionPlan::Impl {
bool isInPlace;

private:
std::pair<size_t, u_int32_t> calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const;
std::pair<size_t, uint32_t> getSizeAndChunksForRank(int rank, size_t inputSize, size_t outputSize) const;
size_t getOffset(int rank, size_t inputSize, size_t outputSize, uint32_t chunkIndex, uint32_t alignment = 16) const;
size_t getNChunkSize(int rank, size_t inputSize, size_t outputSize, uint32_t nChunks,
const std::vector<uint32_t> offsets) const;
