Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[Cherry-pick] Fix nccl-test failure issue (#421) #429

Merged
merged 1 commit into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 18 additions & 9 deletions src/executor/execution_plan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -510,8 +510,9 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
}
}

std::pair<size_t, u_int32_t> ExecutionPlan::Impl::calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const {
std::pair<size_t, u_int32_t> sizePerRank;
std::pair<size_t, uint32_t> ExecutionPlan::Impl::getSizeAndChunksForRank(int rank, size_t inputSize,
size_t outputSize) const {
std::pair<size_t, uint32_t> sizePerRank;
if (this->inputChunks.at(rank) == 0 && this->outputChunks.at(rank) == 0) {
throw mscclpp::Error("Output or Input chunks must be greater than 0", mscclpp::ErrorCode::ExecutorError);
} else if (this->inputChunks.at(rank) != 0 && this->outputChunks.at(rank) != 0) {
Expand All @@ -534,15 +535,15 @@ size_t ExecutionPlan::Impl::getOffset(int rank, size_t inputSize, size_t outputS
}

const int nGroups = this->chunkGroups.at(rank);
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
uint32_t nInputChunks = sizePerRank.second;
uint32_t nelems = sizePerRank.first / (alignment * sizeof(uint8_t));
auto rankSizeAndChunks = getSizeAndChunksForRank(rank, inputSize, outputSize);
uint32_t nChunks = rankSizeAndChunks.second;
uint32_t nelems = rankSizeAndChunks.first / (alignment * sizeof(uint8_t));
if (nelems % nGroups != 0) {
throw Error("Input size must be a multiple of nGroups", ErrorCode::ExecutorError);
}

int nelemsPerGroup = nelems / nGroups;
int nChunksPerGroup = nInputChunks / nGroups;
int nChunksPerGroup = nChunks / nGroups;
uint32_t minNelems = nelemsPerGroup / nChunksPerGroup;
uint32_t remainder = nelemsPerGroup % nChunksPerGroup;
uint32_t groupIdx = chunkIndex / nChunksPerGroup;
Expand All @@ -568,9 +569,17 @@ size_t ExecutionPlan::Impl::getNChunkSize(int rank, size_t inputSize, size_t out
}

// NOTE(review): this is a rendered diff hunk with the +/- markers stripped, so the body below
// holds BOTH versions of getUpperBoundChunkSize back to back: the first three statements are
// the removed implementation, and everything after them is the replacement.
size_t ExecutionPlan::Impl::getUpperBoundChunkSize(int rank, size_t inputSize, size_t outputSize) const {
// --- removed implementation ---
// Took the (size, chunk count) pair from calcSizePerRank and returned the size divided by the
// chunk count, rounded up.
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
uint32_t nChunks = sizePerRank.second;
return (sizePerRank.first + nChunks - 1) / nChunks;
// --- replacement implementation ---
// Reads this rank's input and output chunk counts directly; at() throws if rank is not present.
size_t nInputChunks = this->inputChunks.at(rank);
size_t nOutputChunks = this->outputChunks.at(rank);
// A chunk size stays 0 when its chunk count is 0, which also avoids dividing by zero.
size_t inputChunkSize = 0;
size_t outputChunkSize = 0;
if (nInputChunks != 0) {
// Integer division: truncates rather than rounding up as the removed code did.
inputChunkSize = inputSize / nInputChunks;
}
if (nOutputChunks != 0) {
outputChunkSize = outputSize / nOutputChunks;
}
// The result is the larger of the two per-chunk sizes.
return std::max(inputChunkSize, outputChunkSize);
}

void ExecutionPlan::Impl::reset() {
Expand Down
2 changes: 1 addition & 1 deletion src/include/execution_plan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ struct ExecutionPlan::Impl {
bool isInPlace;

private:
std::pair<size_t, u_int32_t> calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const;
std::pair<size_t, uint32_t> getSizeAndChunksForRank(int rank, size_t inputSize, size_t outputSize) const;
size_t getOffset(int rank, size_t inputSize, size_t outputSize, uint32_t chunkIndex, uint32_t alignment = 16) const;
size_t getNChunkSize(int rank, size_t inputSize, size_t outputSize, uint32_t nChunks,
const std::vector<uint32_t> offsets) const;
Expand Down
Loading