Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prepare for n_init=auto in KMeans #6142

Open
wants to merge 14 commits into
base: branch-25.02
Choose a base branch
from
30 changes: 27 additions & 3 deletions python/cuml/cuml/cluster/kmeans.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

# distutils: language = c++

import warnings

from cuml.internals.safe_imports import cpu_only_import
np = cpu_only_import('numpy')
from cuml.internals.safe_imports import gpu_only_import
Expand Down Expand Up @@ -141,10 +143,17 @@ class KMeans(UniversalBase,
- If an ndarray is passed, it should be of
shape (`n_clusters`, `n_features`) and gives the initial centers.

n_init: int (default = 1)
n_init: 'auto' or int (default = 1)
Number of instances the k-means algorithm will be called with
different seeds. The final results will be from the instance
that produces lowest inertia out of n_init instances.

.. versionadded:: 24.12
Added 'auto' option for `n_init`.

.. versionchanged:: 25.05
    The default value for `n_init` will change from `1` to `'auto'` in version 25.05.

oversampling_factor : float64 (default = 2.0)
The amount of points to sample
in scalable k-means++ initialization for potential centroids.
Expand Down Expand Up @@ -211,15 +220,30 @@ class KMeans(UniversalBase,
params.metric = CuvsDistanceType.L2Expanded # distance metric as squared L2: @todo - support other metrics # noqa: E501
params.batch_samples = <int>self.max_samples_per_batch
params.oversampling_factor = <double>self.oversampling_factor
params.n_init = <int>self.n_init
n_init = self.n_init
if n_init == "warn":
warnings.warn(
"The default value of `n_init` will change from"
" 1 to 'auto' in 25.05. Set the value of `n_init`"
betatim marked this conversation as resolved.
Show resolved Hide resolved
" explicitly to suppress this warning.",
FutureWarning,
)
n_init = 1
if n_init == "auto":
if self.init in ("k-means||", "scalable-k-means++"):
params.n_init = 1
else:
params.n_init = 10
else:
params.n_init = <int>n_init
return <size_t>params
ELSE:
return None

@device_interop_preparation
def __init__(self, *, handle=None, n_clusters=8, max_iter=300, tol=1e-4,
verbose=False, random_state=1,
init='scalable-k-means++', n_init=1, oversampling_factor=2.0,
init='scalable-k-means++', n_init="warn", oversampling_factor=2.0,
max_samples_per_batch=1<<15, convert_dtype=True,
output_type=None):
super().__init__(handle=handle,
Expand Down
29 changes: 27 additions & 2 deletions python/cuml/cuml/tests/test_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,24 @@ def random_state():
return random_state


def test_n_init_deprecation():
    """Check the `n_init` default-change deprecation cycle.

    The default ``n_init="warn"`` sentinel must emit a ``FutureWarning``
    on fit, while explicitly passing an integer or ``'auto'`` must be
    completely silent.
    """
    # Local import: keeps this self-contained without touching the
    # file-level import block (not visible in this hunk).
    import warnings

    X, y = make_blobs(
        random_state=0,
    )

    # Default (sentinel "warn") must warn about the upcoming change.
    kmeans = cuml.KMeans()
    with pytest.warns(
        FutureWarning, match="The default value of `n_init` will change from"
    ):
        kmeans.fit(X)

    # Explicit values ('auto' or an int) must NOT warn. Escalate any
    # FutureWarning to an error so a stray warning fails the test —
    # previously this section never actually asserted silence.
    for n_init in ("auto", 2):
        kmeans = cuml.KMeans(n_init=n_init)
        with warnings.catch_warnings():
            warnings.simplefilter("error", FutureWarning)
            kmeans.fit(X)


@pytest.mark.xfail
def test_n_init_cluster_consistency(random_state):

Expand Down Expand Up @@ -127,7 +145,9 @@ def test_traditional_kmeans_plus_plus_init(
cuml_kmeans.fit(X)
cu_score = cuml_kmeans.score(X)

kmeans = cluster.KMeans(random_state=random_state, n_clusters=nclusters)
kmeans = cluster.KMeans(
random_state=random_state, n_clusters=nclusters, n_init=1
)
kmeans.fit(cp.asnumpy(X))
sk_score = kmeans.score(cp.asnumpy(X))

Expand Down Expand Up @@ -167,7 +187,9 @@ def test_weighted_kmeans(nrows, ncols, nclusters, max_weight, random_state):
cuml_kmeans.fit(X, sample_weight=wt)
cu_score = cuml_kmeans.score(X)

sk_kmeans = cluster.KMeans(random_state=random_state, n_clusters=nclusters)
sk_kmeans = cluster.KMeans(
random_state=random_state, n_clusters=nclusters, n_init=1
)
sk_kmeans.fit(cp.asnumpy(X), sample_weight=wt)
sk_score = sk_kmeans.score(cp.asnumpy(X))

Expand Down Expand Up @@ -196,6 +218,7 @@ def test_kmeans_clusters_blobs(
n_clusters=nclusters,
random_state=random_state,
output_type="numpy",
n_init=1,
)

preds = cuml_kmeans.fit_predict(X)
Expand Down Expand Up @@ -327,6 +350,7 @@ def test_all_kmeans_params(
oversampling_factor=oversampling_factor,
max_samples_per_batch=max_samples_per_batch,
output_type="cupy",
n_init=1,
)

cuml_kmeans.fit_predict(X)
Expand Down Expand Up @@ -355,6 +379,7 @@ def test_score(nrows, ncols, nclusters, random_state):
n_clusters=nclusters,
random_state=random_state,
output_type="numpy",
n_init=1,
)

cuml_kmeans.fit(X)
Expand Down
Loading