Skip to content

Commit

Permalink
allow randomized removal of categories during boosting
Browse files: browse the repository at this point in the history
  • Loading branch information
paulbkoch committed Jan 2, 2025
1 parent 5098d64 commit ea6f96b
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 10 deletions.
33 changes: 24 additions & 9 deletions shared/libebm/PartitionOneDimensionalBoosting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1002,12 +1002,12 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
->Specialize<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>();
auto* pBinsEnd = IndexBin(aBins, cBytesPerBin * cBins);

const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>** const apBins =
const auto** const apBins =
reinterpret_cast<const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>**>(
pBinsEnd);

const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>** ppBin = apBins;
Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>* pBin = aBins;
const auto** ppBin = apBins;
auto* pBin = aBins;

const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>* pMissingBin = nullptr;
bool bMissingIsolated = false;
Expand Down Expand Up @@ -1125,7 +1125,7 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
size_t cRemaining;
if(bNominal) {
cRemaining = ppBin - apBins;
if(0 == cRemaining) {
if(cRemaining <= 1) {
// all categories are dregs, so pretend there's just one bin and everything is inside it
one_bin_return:;

Expand Down Expand Up @@ -1166,19 +1166,34 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
return error;
}

size_t cKeep = static_cast<size_t>(std::round(categoricalInclusionPercent * cRemaining));
if(cRemaining <= cKeep && categoricalInclusionPercent < 1.0) {
cKeep = cRemaining - 1;
}
if(cKeep <= 1) {
cKeep = 2;
}
if(cRemaining < cKeep) {
cKeep = cRemaining;
}

const bool bShuffle = 1 != cCompilerScores || std::isnan(categoricalSmoothing);
const bool bSort = 1 == cCompilerScores && !std::isnan(categoricalSmoothing);

EBM_ASSERT(bShuffle || bSort);

if(bShuffle) {
while(size_t{1} != cRemaining) {
const auto** ppBinShuffle = apBins;
const auto* const* const ppBinShuffleEnd = apBins + cKeep;
do {
EBM_ASSERT(1 <= cRemaining);
const size_t iSwap = pRng->NextFast(cRemaining);
auto* const pTemp = apBins[iSwap];
auto* const pTemp = ppBinShuffle[iSwap];
--cRemaining;
apBins[iSwap] = apBins[cRemaining];
apBins[cRemaining] = pTemp;
}
ppBinShuffle[iSwap] = *ppBinShuffle;
*ppBinShuffle = pTemp;
++ppBinShuffle;
} while(ppBinShuffleEnd != ppBinShuffle);
}

if(bSort) {
Expand Down
2 changes: 1 addition & 1 deletion shared/libebm/tests/boosting_unusual_inputs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2380,7 +2380,7 @@ static double RandomizedTesting(const AccelerationFlags acceleration) {
}

TEST_CASE("stress test, boosting") {
const double expected = 15111161995602.100;
const double expected = 15453628422513.807;

double validationMetricExact = RandomizedTesting(AccelerationFlags_NONE);
CHECK(validationMetricExact == expected);
Expand Down

0 comments on commit ea6f96b

Please sign in to comment.