From caa12dd15144900ecfcb04908e9834669cb12304 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B5=85=E6=A2=A6?= Date: Thu, 3 Oct 2019 15:47:14 +0800 Subject: [PATCH] simplify input logic --- deepctr_torch/__init__.py | 2 +- deepctr_torch/inputs.py | 73 +++++++-------------------- deepctr_torch/models/basemodel.py | 19 +++++-- docs/source/Examples.md | 27 +++++----- docs/source/FAQ.md | 7 ++- docs/source/History.md | 1 + docs/source/Quick-Start.md | 14 ++--- docs/source/conf.py | 2 +- docs/source/index.rst | 4 +- examples/run_classification_criteo.py | 11 ++-- examples/run_multivalue_movielens.py | 11 ++-- examples/run_regression_movielens.py | 8 +-- setup.py | 4 +- 13 files changed, 73 insertions(+), 110 deletions(-) diff --git a/deepctr_torch/__init__.py b/deepctr_torch/__init__.py index 0eee0cdd..0c223331 100644 --- a/deepctr_torch/__init__.py +++ b/deepctr_torch/__init__.py @@ -2,5 +2,5 @@ from . import models from .utils import check_version -__version__ = '0.1.2' +__version__ = '0.1.3' check_version(__version__) \ No newline at end of file diff --git a/deepctr_torch/inputs.py b/deepctr_torch/inputs.py index 14772f1e..8058a6fc 100644 --- a/deepctr_torch/inputs.py +++ b/deepctr_torch/inputs.py @@ -41,70 +41,33 @@ def __new__(cls, name, dimension, maxlen, combiner="mean", use_hash=False, dtype embedding_name, embedding) -def get_fixlen_feature_names(feature_columns): - features = build_input_features( - feature_columns, include_varlen=False, include_fixlen=True) +def get_feature_names(feature_columns): + features = build_input_features(feature_columns) return list(features.keys()) - -def get_varlen_feature_names(feature_columns): - features = build_input_features( - feature_columns, include_varlen=True, include_fixlen=False) - return list(features.keys()) - - def get_inputs_list(inputs): return list(chain(*list(map(lambda x: x.values(), filter(lambda x: x is not None, inputs))))) -def build_input_features(feature_columns, include_varlen=True, mask_zero=True, 
prefix='', include_fixlen=True): - input_features = OrderedDict() +def build_input_features(feature_columns): features = OrderedDict() start = 0 - - if include_fixlen: - for feat in feature_columns: - feat_name = feat.name - if feat_name in features: - continue - if isinstance(feat, SparseFeat): - features[feat_name] = (start, start + 1) - start += 1 - elif isinstance(feat, DenseFeat): - features[feat_name] = (start, start + feat.dimension) - start += feat.dimension - if include_varlen: - for feat in feature_columns: - feat_name = feat.name - if feat_name in features: - continue - if isinstance(feat, VarLenSparseFeat): - features[feat_name] = (start, start + feat.maxlen) - start += feat.maxlen - - # if include_fixlen: - # for fc in feature_columns: - # if isinstance(fc, SparseFeat): - # input_features[fc.name] = 1 - # # Input( shape=(1,), name=prefix+fc.name, dtype=fc.dtype) - # elif isinstance(fc, DenseFeat): - # input_features[fc.name] = 1 - # # Input( - # # shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype) - # if include_varlen: - # for fc in feature_columns: - # if isinstance(fc, VarLenSparseFeat): - # input_features[fc.name] = 1 - # # Input(shape=(fc.maxlen,), name=prefix + 'seq_' + fc.name, - # # dtype=fc.dtype) - # if not mask_zero: - # for fc in feature_columns: - # input_features[fc.name + "_seq_length"] = 1 - # # Input(shape=( - # # 1,), name=prefix + 'seq_length_' + fc.name) - # input_features[fc.name + "_seq_max_length"] = 1 # fc.maxlen - + for feat in feature_columns: + feat_name = feat.name + if feat_name in features: + continue + if isinstance(feat, SparseFeat): + features[feat_name] = (start, start + 1) + start += 1 + elif isinstance(feat, DenseFeat): + features[feat_name] = (start, start + feat.dimension) + start += feat.dimension + elif isinstance(feat,VarLenSparseFeat): + features[feat_name] = (start, start + feat.maxlen) + start += feat.maxlen + else: + raise TypeError("Invalid feature column type,got",type(feat)) return features 
diff --git a/deepctr_torch/models/basemodel.py b/deepctr_torch/models/basemodel.py index cd7f0987..19a6eee6 100644 --- a/deepctr_torch/models/basemodel.py +++ b/deepctr_torch/models/basemodel.py @@ -135,7 +135,8 @@ def fit(self, x=None, shuffle=True, ): """ - :param x: Numpy array of training data (if the model has a single input), or list of Numpy arrays (if the model has multiple inputs). + :param x: Numpy array of training data (if the model has a single input), or list of Numpy arrays (if the model has multiple inputs).If input layers in the model are named, you can also pass a + dictionary mapping input names to Numpy arrays. :param y: Numpy array of target (label) data (if the model has a single output), or list of Numpy arrays (if the model has multiple outputs). :param batch_size: Integer or `None`. Number of samples per gradient update. If unspecified, `batch_size` will default to 256. :param epochs: Integer. Number of epochs to train the model. An epoch is an iteration over the entire `x` and `y` data provided. Note that in conjunction with `initial_epoch`, `epochs` is to be understood as "final epoch". The model is not trained for a number of iterations given by `epochs`, but merely until the epoch of index `epochs` is reached. @@ -146,6 +147,8 @@ def fit(self, x=None, :param shuffle: Boolean. Whether to shuffle the order of the batches at the beginning of each epoch. """ + if isinstance(x,dict): + x = [x[feature] for feature in self.feature_index] if validation_data: if len(validation_data) == 2: val_x, val_y = validation_data @@ -160,6 +163,8 @@ def fit(self, x=None, 'or alternatively it could be a dataset or a ' 'dataset or a dataset iterator. ' 'However we received `validation_data=%s`' % validation_data) + if isinstance(val_x, dict): + val_x = [val_x[feature] for feature in self.feature_index] elif validation_split and 0. 
< validation_split < 1.: if hasattr(x[0], 'shape'): @@ -191,16 +196,18 @@ def fit(self, x=None, model = self.train() loss_func = self.loss_func optim = self.optim - print("Train on {0} samples, validate on {1} samples".format( - len(train_tensor_data), len(val_y))) + + sample_num = len(train_tensor_data) + steps_per_epoch = (sample_num - 1) // batch_size + 1 + + print("Train on {0} samples, validate on {1} samples, {2} steps per epoch".format( + len(train_tensor_data), len(val_y),steps_per_epoch)) for epoch in range(initial_epoch, epochs): start_time = time.time() loss_epoch = 0 total_loss_epoch = 0 # if abs(loss_last - loss_now) < 0.0 - sample_num = len(train_tensor_data) train_result = {} - steps_per_epoch = (sample_num - 1) // batch_size + 1 try: with tqdm(enumerate(train_loader), disable=verbose != 1) as t: for index, (x_train, y_train) in t: @@ -272,6 +279,8 @@ def predict(self, x, batch_size=256): :return: Numpy array(s) of predictions. """ model = self.eval() + if isinstance(x, dict): + x = [x[feature] for feature in self.feature_index] for i in range(len(x)): if len(x[i].shape) == 1: x[i] = np.expand_dims(x[i], axis=1) diff --git a/docs/source/Examples.md b/docs/source/Examples.md index c7bd0107..2c3fbe51 100644 --- a/docs/source/Examples.md +++ b/docs/source/Examples.md @@ -30,7 +30,7 @@ from sklearn.metrics import log_loss, roc_auc_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, MinMaxScaler from deepctr_torch.models import * -from deepctr_torch.inputs import SparseFeat, DenseFeat, get_fixlen_feature_names +from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names import torch if __name__ == "__main__": @@ -59,14 +59,14 @@ if __name__ == "__main__": dnn_feature_columns = fixlen_feature_columns linear_feature_columns = fixlen_feature_columns - fixlen_feature_names = get_fixlen_feature_names( + feature_names = get_feature_names( linear_feature_columns + dnn_feature_columns) # 
3.generate input data for model train, test = train_test_split(data, test_size=0.2) - train_model_input = [train[name] for name in fixlen_feature_names] - test_model_input = [test[name] for name in fixlen_feature_names] + train_model_input = {name:train[name] for name in feature_names} + test_model_input = {name:test[name] for name in feature_names} # 4.Define Model,train,predict and evaluate @@ -111,7 +111,7 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from deepctr_torch.models import DeepFM -from deepctr_torch.inputs import SparseFeat,get_fixlen_feature_names +from deepctr_torch.inputs import SparseFeat,get_feature_names if __name__ == "__main__": @@ -129,12 +129,12 @@ if __name__ == "__main__": for feat in sparse_features] linear_feature_columns = fixlen_feature_columns dnn_feature_columns = fixlen_feature_columns - fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns) + feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 3.generate input data for model train, test = train_test_split(data, test_size=0.2) - train_model_input = [train[name].values for name in fixlen_feature_names] - test_model_input = [test[name].values for name in fixlen_feature_names] + train_model_input = {name:train[name] for name in feature_names} + test_model_input = {name:test[name] for name in feature_names} # 4.Define Model,train,predict and evaluate device = 'cpu' @@ -190,7 +190,7 @@ from sklearn.preprocessing import LabelEncoder from tensorflow.python.keras.preprocessing.sequence import pad_sequences from deepctr_torch.models import DeepFM -from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat,get_fixlen_feature_names,get_varlen_feature_names +from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat,get_feature_names def split(x): @@ -229,14 +229,13 @@ varlen_feature_columns = [VarLenSparseFeat('genres', len(
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns -fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns) -varlen_feature_names = get_varlen_feature_names(linear_feature_columns+dnn_feature_columns) +feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 3.generate input data for model -fixlen_input = [data[name].values for name in fixlen_feature_names] -varlen_input = [genres_list]#varlen_feature_names[0] -model_input = fixlen_input + varlen_input # make sure the order is right +model_input = {name:data[name] for name in feature_names} +model_input['genres'] = genres_list + # 4.Define Model,compile and train model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression') diff --git a/docs/source/FAQ.md b/docs/source/FAQ.md index aa8f6a02..db86acc1 100644 --- a/docs/source/FAQ.md +++ b/docs/source/FAQ.md @@ -23,19 +23,18 @@ model = torch.load('DeepFM.h5') ## 2. How to add a long dense feature vector as a input to the model? 
```python from deepctr_torch.models import DeepFM -from deepctr_torch.inputs import DenseFeat,SparseFeat,get_fixlen_feature_names +from deepctr_torch.inputs import DenseFeat,SparseFeat,get_feature_names import numpy as np feature_columns = [SparseFeat('user_id',120,),SparseFeat('item_id',60,),DenseFeat("pic_vec",5)] -fixlen_feature_names = get_fixlen_feature_names(feature_columns) +fixlen_feature_names = get_feature_names(feature_columns) user_id = np.array([[1],[0],[1]]) item_id = np.array([[30],[20],[10]]) pic_vec = np.array([[0.1,0.5,0.4,0.3,0.2],[0.1,0.5,0.4,0.3,0.2],[0.1,0.5,0.4,0.3,0.2]]) label = np.array([1,0,1]) -input_dict = {'user_id':user_id,'item_id':item_id,'pic_vec':pic_vec} -model_input = [input_dict[name] for name in fixlen_feature_names] +model_input = {'user_id':user_id,'item_id':item_id,'pic_vec':pic_vec} model = DeepFM(feature_columns,feature_columns) model.compile('adagrad','binary_crossentropy') diff --git a/docs/source/History.md b/docs/source/History.md index 8bc6bd41..985975ee 100644 --- a/docs/source/History.md +++ b/docs/source/History.md @@ -1,4 +1,5 @@ # History +- 10/03/2019 : [v0.1.3](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.3) released.Simplify the input logic. - 09/28/2019 : [v0.1.2](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.2) released.Add [sequence(multi-value) input support](./Examples.html#multi-value-input-movielens). - 09/24/2019 : [v0.1.1](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.1) released. Add [CCPM](./Features.html#ccpm-convolutional-click-prediction-model). 
- 09/22/2019 : DeepCTR-Torch first version v0.1.0 is released on [PyPi](https://pypi.org/project/deepctr-torch/) \ No newline at end of file diff --git a/docs/source/Quick-Start.md b/docs/source/Quick-Start.md index 5557efab..d59ae493 100644 --- a/docs/source/Quick-Start.md +++ b/docs/source/Quick-Start.md @@ -17,7 +17,7 @@ import pandas as pd from sklearn.preprocessing import LabelEncoder, MinMaxScaler from sklearn.model_selection import train_test_split from deepctr_torch.models import DeepFM -from deepctr_torch.inputs import SparseFeat, DenseFeat,get_fixlen_feature_names +from deepctr_torch.inputs import SparseFeat, DenseFeat,get_feature_names data = pd.read_csv('./criteo_sample.txt') @@ -75,22 +75,16 @@ dense_feature_columns = [DenseFeat(feat, 1) dnn_feature_columns = sparse_feature_columns + dense_feature_columns linear_feature_columns = sparse_feature_columns + dense_feature_columns -feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns) +feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) ``` ### Step 4: Generate the training samples and train the model -There are two rules here that we must follow - - - The `SparseFeat` and `DenseFeat` are placed in front of the `VarlenSparseFeat`. - - The order of the feature we fit into the model must be consistent with the order of the feature config list. 
- - ```python train, test = train_test_split(data, test_size=0.2) -train_model_input = [train[name] for name in feature_names] +train_model_input = {name:train[name] for name in feature_names} -test_model_input = [test[name] for name in feature_names] +test_model_input = {name:test[name] for name in feature_names} device = 'cpu' diff --git a/docs/source/conf.py b/docs/source/conf.py index 6679313e..27595820 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -26,7 +26,7 @@ # The short X.Y version version = '' # The full version, including alpha/beta/rc tags -release = '0.1.2' +release = '0.1.3' # -- General configuration --------------------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 4f3e2f5e..3e77837b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -34,12 +34,12 @@ You can read the latest code at https://github.com/shenweichen/DeepCTR-Torch and News ----- +10/03/2019 : Simplify the input logic(`examples <./Examples.html#classification-criteo>`_). `Changelog `_ + 09/28/2019 : Add `sequence(multi-value) input support <./Examples.html#multi-value-input-movielens>`_ . `Changelog `_ 09/24/2019 : Add `CCPM <./Features.html#ccpm-convolutional-click-prediction-model>`_ . `Changelog `_ -09/22/2019 : DeepCTR-Torch first version v0.1.0 is released on `PyPi `_ ! - .. 
toctree:: :maxdepth: 2 diff --git a/examples/run_classification_criteo.py b/examples/run_classification_criteo.py index 84736dfc..606cfcc1 100644 --- a/examples/run_classification_criteo.py +++ b/examples/run_classification_criteo.py @@ -4,7 +4,7 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, MinMaxScaler from deepctr_torch.models import * -from deepctr_torch.inputs import SparseFeat, DenseFeat, get_fixlen_feature_names +from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names import torch @@ -34,14 +34,15 @@ dnn_feature_columns = fixlen_feature_columns linear_feature_columns = fixlen_feature_columns - fixlen_feature_names = get_fixlen_feature_names( + feature_names = get_feature_names( linear_feature_columns + dnn_feature_columns) # 3.generate input data for model train, test = train_test_split(data, test_size=0.2) - train_model_input = [train[name] for name in fixlen_feature_names] - test_model_input = [test[name] for name in fixlen_feature_names] + + train_model_input = {name:train[name] for name in feature_names} + test_model_input = {name:test[name] for name in feature_names} # 4.Define Model,train,predict and evaluate @@ -57,7 +58,7 @@ model.compile("adagrad", "binary_crossentropy", metrics=["binary_crossentropy", "auc"],) model.fit(train_model_input, train[target].values, - batch_size=32, epochs=10, validation_split=0.2, verbose=2) + batch_size=32, epochs=10, validation_split=0.0, verbose=2) pred_ans = model.predict(test_model_input, 256) print("") diff --git a/examples/run_multivalue_movielens.py b/examples/run_multivalue_movielens.py index 7c00eea1..07f621a3 100644 --- a/examples/run_multivalue_movielens.py +++ b/examples/run_multivalue_movielens.py @@ -4,7 +4,7 @@ from tensorflow.python.keras.preprocessing.sequence import pad_sequences from deepctr_torch.models import DeepFM -from deepctr_torch.inputs import SparseFeat, 
VarLenSparseFeat,get_fixlen_feature_names,get_varlen_feature_names +from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat,get_feature_names def split(x): @@ -43,15 +43,12 @@ def split(x): linear_feature_columns = fixlen_feature_columns + varlen_feature_columns dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns -fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns) -varlen_feature_names = get_varlen_feature_names(linear_feature_columns+dnn_feature_columns) +feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 3.generate input data for model -fixlen_input = [data[name].values for name in fixlen_feature_names] -varlen_input = [genres_list]#varlen_feature_names[0] - -model_input = fixlen_input + varlen_input # make sure the order is right +model_input = {name:data[name] for name in feature_names} +model_input['genres'] = genres_list # 4.Define Model,compile and train model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression') diff --git a/examples/run_regression_movielens.py b/examples/run_regression_movielens.py index 85fe62cc..f88a144d 100644 --- a/examples/run_regression_movielens.py +++ b/examples/run_regression_movielens.py @@ -5,7 +5,7 @@ from sklearn.preprocessing import LabelEncoder from deepctr_torch.models import DeepFM -from deepctr_torch.inputs import SparseFeat,get_fixlen_feature_names +from deepctr_torch.inputs import SparseFeat,get_feature_names if __name__ == "__main__": @@ -23,12 +23,12 @@ for feat in sparse_features] linear_feature_columns = fixlen_feature_columns dnn_feature_columns = fixlen_feature_columns - fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns) + feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 3.generate input data for model train, test = train_test_split(data, test_size=0.2) - train_model_input = [train[name].values for name in 
fixlen_feature_names] - test_model_input = [test[name].values for name in fixlen_feature_names] + train_model_input = {name:train[name] for name in feature_names} + test_model_input = {name:test[name] for name in feature_names} # 4.Define Model,train,predict and evaluate device = 'cpu' diff --git a/setup.py b/setup.py index 0b5cd94b..b33e1fb0 100644 --- a/setup.py +++ b/setup.py @@ -4,12 +4,12 @@ long_description = fh.read() REQUIRED_PACKAGES = [ - 'torch>=1.1.0','deepctr','tqdm','sklearn' + 'torch>=1.1.0','tqdm','sklearn' ] setuptools.setup( name="deepctr-torch", - version="0.1.2", + version="0.1.3", author="Weichen Shen", author_email="wcshen1994@163.com", description="Easy-to-use,Modular and Extendible package of deep learning based CTR(Click Through Rate) prediction models with PyTorch",