From de119aba734d0a0b2e22d622fab8dd06b6ee99ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89mile?= <73942755+e10e3@users.noreply.github.com> Date: Thu, 14 Nov 2024 14:54:33 +0100 Subject: [PATCH] Make mini-batch TF-IDF raise an exception (#1631) * Make the mini-batch methods unavailable for TF-IDF There is currently no mini-batch implementation of TF-IDF. To prevent Python from using the methods from the parent class BagOfWords (which would give incorrect results), we add the methods to TF-IDF and raise an error. * Add missing parameters from VectorizerMixin The parameters were documented in the docstring but were not in the constructor. * Changelog entry --- docs/releases/unreleased.md | 4 ++++ river/feature_extraction/vectorize.py | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md index 4ed6dd48f1..5ed9315b52 100644 --- a/docs/releases/unreleased.md +++ b/docs/releases/unreleased.md @@ -14,6 +14,10 @@ - Make `drift.ADWIN` comply with the reference MOA implementation. +## feature extraction + +- The mini-batch methods for `feature_extraction.TFIDF` now systematically raise an exception, as they are not implemented. + ## stats - Removed the unexported class `stats.CentralMoments`. 
diff --git a/river/feature_extraction/vectorize.py b/river/feature_extraction/vectorize.py index 5f68450208..e0a9496730 100644 --- a/river/feature_extraction/vectorize.py +++ b/river/feature_extraction/vectorize.py @@ -451,6 +451,8 @@ def __init__( strip_accents=True, lowercase=True, preprocessor: typing.Callable | None = None, + stop_words: set[str] | None = None, + tokenizer_pattern=r"(?u)\b\w[\w\-]+\b", tokenizer: typing.Callable | None = None, ngram_range=(1, 1), ): @@ -459,6 +461,8 @@ def __init__( strip_accents=strip_accents, lowercase=lowercase, preprocessor=preprocessor, + stop_words=stop_words, + tokenizer_pattern=tokenizer_pattern, tokenizer=tokenizer, ngram_range=ngram_range, ) @@ -489,3 +493,12 @@ def transform_one(self, x): norm = math.sqrt(sum(tfidf**2 for tfidf in tfidfs.values())) return {term: tfidf / norm for term, tfidf in tfidfs.items()} return tfidfs + + # Mini-batch methods should be done well™ and not just be a loop over the *_one equivalent. + def learn_many(self, X): + "Not available, will raise an exception." + raise NotImplementedError + + def transform_many(self, X): + "Not available, will raise an exception." + raise NotImplementedError