diff --git a/CHANGES.txt b/CHANGES.txt index 856b86070..b104c5879 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -108,6 +108,10 @@ v<0.8.2>, <07/04/2020> -- Add a set of utility functions. v<0.8.2>, <08/30/2020> -- Add COPOD and MAD algorithm. v<0.8.3>, <09/01/2020> -- Make decision score consistent. v<0.8.3>, <09/19/2020> -- Add model persistence documentation (save and load). +v<0.8.4>, <10/13/2020> -- Fix COPOD code inconsistency (issue #239). +v<0.8.4>, <10/24/2020> -- Fix LSCP minor bug (issue #180). +v<0.8.4>, <11/02/2020> -- Add support for Tensorflow 2. +v<0.8.4>, <11/12/2020> -- Merge PR #102 for categorical data generation. diff --git a/examples/generate_data_categorical_example.py b/examples/generate_data_categorical_example.py new file mode 100644 index 000000000..7926eee78 --- /dev/null +++ b/examples/generate_data_categorical_example.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +"""Example of using and visualizing ``generate_data_categorical`` function. +""" +# Author: Yahya Almardeny +# License: BSD 2 clause + +from __future__ import division +from __future__ import print_function + +import os +import sys +import numpy as np +import matplotlib.pyplot as plt + +# temporary solution for relative imports in case pyod is not installed +# if pyod is installed, no need to use the following line + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname("__file__"), '..'))) + +from pyod.utils.data import generate_data_categorical + +if __name__ == "__main__": + contamination = 0.1 # percentage of outliers + + # Generate sample data in clusters + X_train, X_test, y_train, y_test = generate_data_categorical \ + (n_train=200, n_test=50, + n_category_in=8, n_category_out=5, + n_informative=1, n_features=1, + contamination=contamination, + shuffle=True, random_state=42) + + # note that visualizing it can only be in 1 dimension! 
+ cats = list(np.ravel(X_train)) + labels = list(y_train) + fig, axs = plt.subplots(1, 2) + axs[0].bar(cats, labels) + axs[1].plot(cats, labels) + plt.title('Synthetic Categorical Train Data') + plt.show() + + cats = list(np.ravel(X_test)) + labels = list(y_test) + fig, axs = plt.subplots(1, 2) + axs[0].bar(cats, labels) + axs[1].plot(cats, labels) + plt.title('Synthetic Categorical Test Data') + plt.show() diff --git a/pyod/models/auto_encoder.py b/pyod/models/auto_encoder.py index 23ab1cfdf..1ed264dd9 100644 --- a/pyod/models/auto_encoder.py +++ b/pyod/models/auto_encoder.py @@ -8,10 +8,6 @@ from __future__ import print_function import numpy as np -from keras.models import Sequential -from keras.layers import Dense, Dropout -from keras.regularizers import l2 -from keras.losses import mean_squared_error from sklearn.preprocessing import StandardScaler from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted @@ -20,6 +16,19 @@ from ..utils.stat_models import pairwise_distances_no_broadcast from .base import BaseDetector +from .base_dl import _get_tensorflow_version + +# if tensorflow 2, import from tf directly +if _get_tensorflow_version() == 1: + from keras.models import Sequential + from keras.layers import Dense, Dropout + from keras.regularizers import l2 + from keras.losses import mean_squared_error +else: + from tensorflow.keras.models import Sequential + from tensorflow.keras.layers import Dense, Dropout + from tensorflow.keras.regularizers import l2 + from tensorflow.keras.losses import mean_squared_error # noinspection PyUnresolvedReferences,PyPep8Naming,PyTypeChecker @@ -78,7 +87,7 @@ class AutoEncoder(BaseDetector): - 1 = progress bar - 2 = one line per epoch. - For verbosity >= 1, model summary may be printed. + For verbose >= 1, model summary may be printed. 
random_state : random_state: int, RandomState instance or None, optional (default=None) diff --git a/pyod/models/base_dl.py b/pyod/models/base_dl.py new file mode 100644 index 000000000..d6fcdd8a9 --- /dev/null +++ b/pyod/models/base_dl.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +"""Base class for deep learning models +""" +# Author: Yue Zhao +# License: BSD 2 clause + +from __future__ import division +from __future__ import print_function + +import tensorflow + +def _get_tensorflow_version(): # pragma: no cover + """ Utility function to decide the version of tensorflow, which will + affect how to import keras models. + + Returns + ------- + tensorflow version : int + + """ + + tf_version = str(tensorflow.__version__) + if int(tf_version.split(".")[0]) != 1 and int( + tf_version.split(".")[0]) != 2: + raise ValueError("tensorflow version error") + + return int(tf_version.split(".")[0]) \ No newline at end of file diff --git a/pyod/models/copod.py b/pyod/models/copod.py index bc59881df..235515cdb 100644 --- a/pyod/models/copod.py +++ b/pyod/models/copod.py @@ -79,9 +79,10 @@ def fit(self, X, y=None): Fitted estimator. """ X = check_array(X) - self._set_n_classes(y=None) + self._set_n_classes(y) self.X_train = X self.decision_function(X) + return self def decision_function(self, X): """Predict raw anomaly score of X using the fitted detector. 
diff --git a/pyod/models/gaal_base.py b/pyod/models/gaal_base.py index 12e179b2f..763d4c7d3 100644 --- a/pyod/models/gaal_base.py +++ b/pyod/models/gaal_base.py @@ -11,9 +11,17 @@ import math -import keras -from keras.layers import Input, Dense -from keras.models import Sequential, Model +from .base_dl import _get_tensorflow_version + +# if tensorflow 2, import from tf directly +if _get_tensorflow_version() == 1: + import keras + from keras.layers import Input, Dense + from keras.models import Sequential, Model +else: + import tensorflow.keras as keras + from tensorflow.keras.layers import Input, Dense + from tensorflow.keras.models import Sequential, Model # TODO: create a base class for so_gaal and mo_gaal diff --git a/pyod/models/lscp.py b/pyod/models/lscp.py index bfd31ea34..64bf4522d 100644 --- a/pyod/models/lscp.py +++ b/pyod/models/lscp.py @@ -341,10 +341,18 @@ def _get_local_region(self, X_test_norm): # keep nearby points which occur at least local_region_threshold times final_local_region_list = [[]] * X_test_norm.shape[0] for j in range(X_test_norm.shape[0]): - final_local_region_list[j] = [item for item, count in - collections.Counter( - local_region_list[j]).items() if - count > self.local_region_threshold] + tmp = [item for item, count in collections.Counter( + local_region_list[j]).items() if + count > self.local_region_threshold] + decrease_value = 0 + while len(tmp) < 2: + decrease_value = decrease_value + 1 + assert decrease_value < self.local_region_threshold + tmp = [item for item, count in + collections.Counter(local_region_list[j]).items() if + count > (self.local_region_threshold - decrease_value)] + + final_local_region_list[j] = tmp return final_local_region_list diff --git a/pyod/models/mo_gaal.py b/pyod/models/mo_gaal.py index d8f17e94b..25cf4ca0a 100644 --- a/pyod/models/mo_gaal.py +++ b/pyod/models/mo_gaal.py @@ -13,16 +13,23 @@ import numpy as np -from keras.layers import Input -from keras.models import Model -from keras.optimizers 
import SGD - from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted from .base import BaseDetector from .gaal_base import create_discriminator from .gaal_base import create_generator +from .base_dl import _get_tensorflow_version + +# if tensorflow 2, import from tf directly +if _get_tensorflow_version() == 1: + from keras.layers import Input + from keras.models import Model + from keras.optimizers import SGD +else: + from tensorflow.keras.layers import Input + from tensorflow.keras.models import Model + from tensorflow.keras.optimizers import SGD class MO_GAAL(BaseDetector): diff --git a/pyod/models/so_gaal.py b/pyod/models/so_gaal.py index 9dd3c8151..18b43f1ed 100644 --- a/pyod/models/so_gaal.py +++ b/pyod/models/so_gaal.py @@ -13,16 +13,23 @@ import numpy as np -from keras.layers import Input -from keras.models import Model -from keras.optimizers import SGD - from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted from .base import BaseDetector from .gaal_base import create_discriminator from .gaal_base import create_generator +from .base_dl import _get_tensorflow_version + +# if tensorflow 2, import from tf directly +if _get_tensorflow_version() == 1: + from keras.layers import Input + from keras.models import Model + from keras.optimizers import SGD +else: + from tensorflow.keras.layers import Input + from tensorflow.keras.models import Model + from tensorflow.keras.optimizers import SGD class SO_GAAL(BaseDetector): diff --git a/pyod/models/vae.py b/pyod/models/vae.py index fa612e8e2..eb8bbaf4c 100644 --- a/pyod/models/vae.py +++ b/pyod/models/vae.py @@ -22,13 +22,6 @@ import numpy as np -from keras.models import Model -from keras.layers import Lambda, Input, Dense, Dropout -from keras.regularizers import l2 -from keras.losses import mse, binary_crossentropy -from keras.utils import plot_model -from keras import backend as K - from sklearn.preprocessing import StandardScaler from 
sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted @@ -37,6 +30,21 @@ from ..utils.stat_models import pairwise_distances_no_broadcast from .base import BaseDetector +from .base_dl import _get_tensorflow_version + +# if tensorflow 2, import from tf directly +if _get_tensorflow_version() == 1: + from keras.models import Model + from keras.layers import Lambda, Input, Dense, Dropout + from keras.regularizers import l2 + from keras.losses import mse, binary_crossentropy + from keras import backend as K +else: + from tensorflow.keras.models import Model + from tensorflow.keras.layers import Lambda, Input, Dense, Dropout + from tensorflow.keras.regularizers import l2 + from tensorflow.keras.losses import mse, binary_crossentropy + from tensorflow.keras import backend as K class VAE(BaseDetector): @@ -114,13 +122,13 @@ class VAE(BaseDetector): If True, apply standardization on the data. verbose : int, optional (default=1) - Verbosity mode. + verbose mode. - 0 = silent - 1 = progress bar - 2 = one line per epoch. - For verbosity >= 1, model summary may be printed. + For verbose >= 1, model summary may be printed. 
random_state : random_state: int, RandomState instance or None, opti (default=None) @@ -172,7 +180,7 @@ def __init__(self, encoder_neurons=None, decoder_neurons=None, output_activation='sigmoid', loss=mse, optimizer='adam', epochs=100, batch_size=32, dropout_rate=0.2, l2_regularizer=0.1, validation_size=0.1, preprocessing=True, - verbosity=1, random_state=None, contamination=0.1, + verbose=1, random_state=None, contamination=0.1, gamma=1.0, capacity=0.0): super(VAE, self).__init__(contamination=contamination) self.encoder_neurons = encoder_neurons @@ -187,7 +195,7 @@ def __init__(self, encoder_neurons=None, decoder_neurons=None, self.l2_regularizer = l2_regularizer self.validation_size = validation_size self.preprocessing = preprocessing - self.verbosity = verbosity + self.verbose = verbose self.random_state = random_state self.latent_dim = latent_dim self.gamma = gamma @@ -264,7 +272,7 @@ def _build_model(self): [z_mean, z_log]) # Instantiate encoder encoder = Model(inputs, [z_mean, z_log, z]) - if self.verbosity >= 1: + if self.verbose >= 1: encoder.summary() # Build Decoder @@ -281,7 +289,7 @@ def _build_model(self): layer) # Instatiate decoder decoder = Model(latent_inputs, outputs) - if self.verbosity >= 1: + if self.verbose >= 1: decoder.summary() # Generate outputs outputs = decoder(encoder(inputs)[2]) @@ -290,7 +298,7 @@ def _build_model(self): vae = Model(inputs, outputs) vae.add_loss(self.vae_loss(inputs, outputs, z_mean, z_log)) vae.compile(optimizer=self.optimizer) - if self.verbosity >= 1: + if self.verbose >= 1: vae.summary() return vae @@ -335,7 +343,7 @@ def fit(self, X, y=None): batch_size=self.batch_size, shuffle=True, validation_split=self.validation_size, - verbose=self.verbosity).history + verbose=self.verbose).history # Predict on X itself and calculate the reconstruction error as # the outlier scores. 
Noted X_norm was shuffled has to recreate if self.preprocessing: diff --git a/pyod/test/test_data.py b/pyod/test/test_data.py index dd95d9c0f..13b81dd86 100644 --- a/pyod/test/test_data.py +++ b/pyod/test/test_data.py @@ -17,6 +17,7 @@ # temporary solution for relative imports in case pyod is not installed # if pyod is installed, no need to use the following line +from pyod.utils.data import generate_data_categorical sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) @@ -141,8 +142,7 @@ def test_data_generate_cluster3(self): def test_data_generate_cluster5(self): with assert_raises(ValueError): - X_train, y_train, X_test, y_test = \ - generate_data_clusters(n_train=self.n_train, + generate_data_clusters(n_train=self.n_train, n_test=self.n_test, n_features=3, n_clusters='e', @@ -150,24 +150,21 @@ def test_data_generate_cluster5(self): random_state=self.random_state) with assert_raises(ValueError): - X_train, y_train, X_test, y_test = \ - generate_data_clusters(n_train=self.n_train, + generate_data_clusters(n_train=self.n_train, n_test=self.n_test, n_features='e', contamination=self.contamination, random_state=self.random_state) with assert_raises(ValueError): - X_train, y_train, X_test, y_test = \ - generate_data_clusters(n_train=self.n_train, + generate_data_clusters(n_train=self.n_train, n_test=self.n_test, n_features=3, contamination='e', random_state=self.random_state) with assert_raises(ValueError): - X_train, y_train, X_test, y_test = \ - generate_data_clusters(n_train=self.n_train, + generate_data_clusters(n_train=self.n_train, n_test=self.n_test, n_features=3, contamination=self.contamination, @@ -197,6 +194,181 @@ def test_data_generate_cluster6(self): self.n_train + self.n_test) assert_allclose(self.contamination, out_perc, atol=0.01) + def test_data_generate_categorical(self): + X_train, X_test, y_train, y_test = \ + generate_data_categorical(n_train=self.n_train, + n_test=self.n_test, + n_features=2, + 
contamination=self.contamination, + random_state=self.random_state) + + assert_equal(y_train.shape[0], X_train.shape[0]) + assert_equal(y_test.shape[0], X_test.shape[0]) + + assert_less_equal(self.n_train - X_train.shape[0], 1) + assert_equal(X_train.shape[1], 2) + + assert_less_equal(self.n_test - X_test.shape[0], 1) + assert_equal(X_test.shape[1], 2) + + out_perc = (np.sum(y_train) + np.sum(y_test)) / ( + self.n_train + self.n_test) + assert_allclose(self.contamination, out_perc, atol=0.01) + + def test_data_generate_categorical2(self): + X_train, X_test, y_train, y_test = \ + generate_data_categorical(n_train=self.n_train, + n_test=self.n_test, + n_features=4, + contamination=self.contamination, + random_state=self.random_state) + + assert_allclose(X_train.shape, (self.n_train, 4)) + assert_allclose(X_test.shape, (self.n_test, 4)) + + def test_data_generate_categorical3(self): + X_train, y_train, X_test, y_test = \ + generate_data_categorical(n_train=self.n_train, + n_test=self.n_test, + n_features=3, + contamination=self.contamination, + random_state=self.random_state) + + X_train2, y_train2, X_test2, y_test2 = \ + generate_data_categorical(n_train=self.n_train, + n_test=self.n_test, + n_features=3, + contamination=self.contamination, + random_state=self.random_state) + + assert np.array_equal(X_train, X_train2) + assert np.array_equal(X_train, X_train2) + assert np.array_equal(X_test, X_test2) + assert np.array_equal(y_train, y_train2) + assert np.array_equal(y_test, y_test2) + + def test_data_generate_categorical5(self): + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=5, n_category_out=3, + n_informative=1, n_features=1, + contamination=self.contamination, + random_state=-1) + + with assert_raises(ValueError): + generate_data_categorical(n_train=0, n_test=self.n_test, + n_category_in=5, n_category_out=3, + n_informative=1, n_features=1, + contamination=self.contamination, + 
random_state=self.random_state) + + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=-1, + n_category_in=5, n_category_out=3, + n_informative=1, n_features=1, + contamination=self.contamination, + random_state=self.random_state) + + with assert_raises(ValueError): + generate_data_categorical(n_train='not int', n_test=self.n_test, + n_category_in=5, n_category_out=3, + n_informative=1, n_features=1, + contamination=self.contamination, + random_state=self.random_state) + + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test='not int', + n_category_in=5, n_category_out=3, + n_informative=1, n_features=1, + contamination=self.contamination, + random_state=self.random_state) + + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=5, n_category_out=3, + n_informative=1, n_features= 0, + contamination=self.contamination, + random_state=self.random_state) + + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=5, n_category_out=3, + n_informative=1, n_features='not int', + contamination=self.contamination, + random_state=self.random_state) + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=5, n_category_out=3, + n_informative=-1, n_features=1, + contamination=self.contamination, + random_state=self.random_state) + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=5, n_category_out=3, + n_informative='not int', n_features=1, + contamination=self.contamination, + random_state=self.random_state) + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=5, n_category_out=3, + n_informative=1, n_features=1, + contamination=0.6, + random_state=self.random_state) + with 
assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=5, n_category_out=3, + n_informative=1, n_features=1, + contamination='not float', + random_state=self.random_state) + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=-1, n_category_out=3, + n_informative=1, n_features=1, + contamination=self.contamination, + random_state=self.random_state) + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in='not int', n_category_out=3, + n_informative=1, n_features=1, + contamination=self.contamination, + random_state=self.random_state) + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=self.n_train+self.n_test+1, + n_category_out=3, + n_informative=1, n_features=1, + contamination=self.contamination, + random_state=self.random_state) + + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=5, n_category_out=-1, + n_informative=1, n_features=1, + contamination=self.contamination, + random_state=self.random_state) + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=5, n_category_out='not int', + n_informative=1, n_features=1, + contamination=self.contamination, + random_state=self.random_state) + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=5, + n_category_out=self.n_train+self.n_test+1, + n_informative=1, n_features=1, + contamination=self.contamination, + random_state=self.random_state) + + with assert_raises(ValueError): + generate_data_categorical(n_train=self.n_train, n_test=self.n_test, + n_category_in=5, + n_category_out=5, + n_informative=2, n_features=2, + contamination=self.contamination, + shuffle='not 
bool', + random_state=self.random_state) + def test_evaluate_print(self): X_train, y_train, X_test, y_test = generate_data( n_train=self.n_train, diff --git a/pyod/utils/data.py b/pyod/utils/data.py index af0c9f46e..5588c1879 100644 --- a/pyod/utils/data.py +++ b/pyod/utils/data.py @@ -8,9 +8,8 @@ from __future__ import division from __future__ import print_function -import numpy as np from warnings import warn - +import numpy as np from sklearn.datasets import make_blobs from sklearn.model_selection import train_test_split from sklearn.utils import column_or_1d @@ -489,3 +488,147 @@ def generate_data_clusters(n_train=1000, n_test=500, n_clusters=2, else: return train_test_split(X, y, test_size=n_test, random_state=random_state) + + +def generate_data_categorical(n_train=1000, n_test=500, n_features=2, + n_informative=2, n_category_in=2, + n_category_out=2, contamination=0.1, + shuffle=True, random_state=None): + """Utility function to generate synthesized categorical data. + + Parameters + ---------- + n_train : int, (default=1000) + The number of training points to generate. + + n_test : int, (default=500) + The number of test points to generate. + + n_features : int, optional (default=2) + The number of features for each sample. + + n_informative : int in (1, n_features), optional (default=2) + The number of informative features in the outlier points. + The higher the easier the outlier detection should be. + Note that n_informative should be less than or + equal to n_features. + + n_category_in : int in (1, n_inliers), optional (default=2) + The number of categories in the inlier points. + + n_category_out : int in (1, n_outliers), optional (default=2) + The number of categories in the outlier points. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. 
+ + shuffle: bool, optional(default=True) + If True, inliers will be shuffled which makes more noisy distribution. + + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + + Returns + ------- + X_train : numpy array of shape (n_train, n_features) + Training data. + + X_test : numpy array of shape (n_test, n_features) + Test data. + + y_train : numpy array of shape (n_train,) + Training ground truth. + + y_test : numpy array of shape (n_test,) + Test ground truth. + """ + + # initialize a random state and seeds for the instance + random_state = check_random_state(random_state) + + if isinstance(n_train, int): + check_parameter(n_train, low=1, param_name='n_train') + else: + raise ValueError("n_train should be int, got %s" % n_train) + + if isinstance(n_test, int): + check_parameter(n_test, low=0, param_name='n_test') + else: + raise ValueError("n_test should be int, got %s" % n_test) + + if isinstance(n_features, int): + check_parameter(n_features, low=0, param_name='n_features') + else: + raise ValueError("n_features should be int, got %s" % n_features) + + if isinstance(n_informative, int): + check_parameter(n_informative, low=0, high=n_features+1, param_name='n_informative') + else: + raise ValueError("n_informative should be int, got %s" % n_informative) + + if isinstance(contamination, float): + check_parameter(contamination, low=0, high=0.5, + param_name='contamination') + else: + raise ValueError("contamination should be float, got %s" % contamination) + + if not isinstance(shuffle, bool): + raise ValueError("shuffle should be bool, got %s" % shuffle) + + + # find the required number of outliers and inliers + n_samples = n_train + n_test + n_outliers = int(n_samples * contamination) + n_inliers = 
n_samples - n_outliers + + if isinstance(n_category_in, int): + check_parameter(n_category_in, low=0, high=n_inliers+1, param_name='n_category_in') + else: + raise ValueError("n_category_in should be int, got %s" % n_category_in) + + if isinstance(n_category_out, int): + check_parameter(n_category_out, low=0, high=n_outliers+1, param_name='n_category_out') + else: + raise ValueError("n_category_out should be int, got %s" % n_category_out) + + # Encapsulated functions to generate features + def __f(f): + quot, rem = divmod(f - 1, 26) + return __f(quot) + chr(rem + ord('A')) if f != 0 else '' + + # generate pool of features to be the base for naming the data points + features = [] + for i in range(1, n_features + 1): + features.append(__f(i)) + + # find the required distributions of categories over inliers and outliers + temp_ = [int(n_inliers / n_category_in)] * (n_category_in - 1) + dist_in = temp_ + [int(n_inliers - sum(temp_))] + temp_ = [int(n_outliers / n_category_out)] * (n_category_out - 1) + dist_out = temp_ + [int(n_outliers - sum(temp_))] + + # generate categorical data + X = [] + count = 0 + for f in features: + inliers = np.hstack([[f + str(i)] * dist_in[i] for i in range(n_category_in)]) + if shuffle: + random_state.shuffle(inliers) + if count < n_informative: + outliers = list(np.hstack( + [[f + str((n_category_in * 2) + i)] * dist_out[i] for i in range(n_category_out)])) + else: + outliers = list(inliers[random_state.randint(0, len(inliers), size=n_outliers)]) + count += 1 + + X.append(list(inliers) + outliers) + + return train_test_split(np.array(X).T, + np.array(([0]*n_inliers) + ([1]*n_outliers)), + test_size=n_test, + random_state=random_state) diff --git a/pyod/utils/utility.py b/pyod/utils/utility.py index cc837f2ab..15009f3a7 100644 --- a/pyod/utils/utility.py +++ b/pyod/utils/utility.py @@ -279,6 +279,7 @@ def get_label_n(y, y_pred, n=None): return y_pred + def get_intersection(lst1, lst2): """get the overlapping between two lists @@ -321,6 
+322,7 @@ def get_list_diff(li1, li2): return (list(set(li1) - set(li2))) + def get_diff_elements(li1, li2): """get the elements in li1 but not li2, and vice versa @@ -344,6 +346,7 @@ def get_diff_elements(li1, li2): return (list(set(li1) - set(li2)) + list(set(li2) - set(li1))) + def argmaxn(value_list, n, order='desc'): """Return the index of top n elements in the list if order is set to 'desc', otherwise return the index of n smallest ones. diff --git a/pyod/version.py b/pyod/version.py index 6aa4a51c5..b5e5b5354 100644 --- a/pyod/version.py +++ b/pyod/version.py @@ -20,4 +20,4 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.8.3' # pragma: no cover +__version__ = '0.8.4' # pragma: no cover