diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d7b40818..bd0e66cd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -18,7 +18,7 @@ jobs:
     strategy:
       matrix:
         python-version: [3.6,3.7]
-        torch-version: [1.1.0,1.2.0,1.3.0,1.4.0]
+        torch-version: [1.1.0,1.2.0,1.3.0,1.4.0,1.5.0,1.6.0]
 
 #        exclude:
 #          - python-version: 3.5
diff --git a/deepctr_torch/__init__.py b/deepctr_torch/__init__.py
index cbd90cdf..be085a40 100644
--- a/deepctr_torch/__init__.py
+++ b/deepctr_torch/__init__.py
@@ -2,5 +2,5 @@
 from . import models
 from .utils import check_version
 
-__version__ = '0.2.1'
+__version__ = '0.2.2'
 check_version(__version__)
\ No newline at end of file
diff --git a/deepctr_torch/layers/interaction.py b/deepctr_torch/layers/interaction.py
index 19140be7..5693c447 100644
--- a/deepctr_torch/layers/interaction.py
+++ b/deepctr_torch/layers/interaction.py
@@ -288,6 +288,9 @@ def __init__(self, in_features, attention_factor=4, l2_reg_w=0, dropout_rate=0,
         for tensor in [self.attention_W, self.projection_h, self.projection_p]:
             nn.init.xavier_normal_(tensor, )
 
+        for tensor in [self.attention_b]:
+            nn.init.zeros_(tensor, )
+
         self.dropout = nn.Dropout(dropout_rate)
 
         self.to(device)
diff --git a/deepctr_torch/layers/sequence.py b/deepctr_torch/layers/sequence.py
index 414d9980..64736b5b 100644
--- a/deepctr_torch/layers/sequence.py
+++ b/deepctr_torch/layers/sequence.py
@@ -1,8 +1,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-
 from torch.nn.utils.rnn import PackedSequence
+
 from ..layers.core import LocalActivationUnit
 
 
@@ -117,33 +117,34 @@ def forward(self, query, keys, keys_length, mask=None):
         - 3D tensor with shape: ``(batch_size, 1, embedding_size)``.
         """
         batch_size, max_length, dim = keys.size()
-
+
         # Mask
         if self.supports_masking:
             if mask is None:
                 raise ValueError("When supports_masking=True,input must support masking")
             keys_masks = mask.unsqueeze(1)
         else:
-            keys_masks = torch.arange(max_length, device=keys_length.device, dtype=keys_length.dtype).repeat(batch_size, 1)  # [B, T]
+            keys_masks = torch.arange(max_length, device=keys_length.device, dtype=keys_length.dtype).repeat(batch_size,
+                                                                                                              1)  # [B, T]
             keys_masks = keys_masks < keys_length.view(-1, 1)  # 0, 1 mask
-            keys_masks = keys_masks.unsqueeze(1) # [B, 1, T]
-
-        attention_score = self.local_att(query, keys) # [B, T, 1]
+            keys_masks = keys_masks.unsqueeze(1)  # [B, 1, T]
 
-        outputs = torch.transpose(attention_score, 1, 2) # [B, 1, T]
+        attention_score = self.local_att(query, keys)  # [B, T, 1]
+
+        outputs = torch.transpose(attention_score, 1, 2)  # [B, 1, T]
 
         if self.weight_normalization:
             paddings = torch.ones_like(outputs) * (-2 ** 32 + 1)
         else:
             paddings = torch.zeros_like(outputs)
 
-        outputs = torch.where(keys_masks, outputs, paddings) # [B, 1, T]
-
+        outputs = torch.where(keys_masks, outputs, paddings)  # [B, 1, T]
+
         # Scale
-        #outputs = outputs / (keys.shape[-1] ** 0.05)
-
+        # outputs = outputs / (keys.shape[-1] ** 0.05)
+
         if self.weight_normalization:
-            outputs = F.softmax(outputs,dim=-1)  # [B, 1, T]
+            outputs = F.softmax(outputs, dim=-1)  # [B, 1, T]
 
         if not self.return_score:
             # Weighted sum
@@ -212,6 +213,8 @@ def __init__(self, input_size, hidden_size, bias=True):
             # (b_hr|b_hz|b_hh)
             self.bias_hh = nn.Parameter(torch.Tensor(3 * hidden_size))
             self.register_parameter('bias_hh', self.bias_hh)
+            for tensor in [self.bias_ih, self.bias_hh]:
+                nn.init.zeros_(tensor, )
         else:
             self.register_parameter('bias_ih', None)
             self.register_parameter('bias_hh', None)
@@ -256,6 +259,8 @@ def __init__(self, input_size, hidden_size, bias=True):
             # (b_hr|b_hz|b_hh)
             self.bias_hh = nn.Parameter(torch.Tensor(3 * hidden_size))
             self.register_parameter('bias_ih', self.bias_hh)
+            for tensor in [self.bias_ih, self.bias_hh]:
+                nn.init.zeros_(tensor, )
         else:
             self.register_parameter('bias_ih', None)
             self.register_parameter('bias_hh', None)
diff --git a/deepctr_torch/models/afm.py b/deepctr_torch/models/afm.py
index 2cf57076..1ba0c8e3 100644
--- a/deepctr_torch/models/afm.py
+++ b/deepctr_torch/models/afm.py
@@ -47,7 +47,7 @@ def __init__(self, linear_feature_columns, dnn_feature_columns, use_attention=Tr
         if use_attention:
             self.fm = AFMLayer(self.embedding_size, attention_factor, l2_reg_att, afm_dropout,
                                seed, device)
-            self.add_regularization_loss(self.fm.attention_W, l2_reg_att)
+            self.add_regularization_weight(self.fm.attention_W, l2_reg_att)
         else:
             self.fm = FM()
 
diff --git a/deepctr_torch/models/autoint.py b/deepctr_torch/models/autoint.py
index 1abfe317..69630016 100644
--- a/deepctr_torch/models/autoint.py
+++ b/deepctr_torch/models/autoint.py
@@ -72,7 +72,7 @@ def __init__(self, linear_feature_columns, dnn_feature_columns, att_layer_num=3,
             self.dnn = DNN(self.compute_input_dim(dnn_feature_columns), dnn_hidden_units,
                            activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout, use_bn=dnn_use_bn,
                            init_std=init_std, device=device)
-            self.add_regularization_loss(
+            self.add_regularization_weight(
                 filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
         self.int_layers = nn.ModuleList(
             [InteractingLayer(self.embedding_size if i == 0 else att_embedding_size * att_head_num,
diff --git a/deepctr_torch/models/basemodel.py b/deepctr_torch/models/basemodel.py
index 50145d6c..74f5c1b5 100644
--- a/deepctr_torch/models/basemodel.py
+++ b/deepctr_torch/models/basemodel.py
@@ -112,9 +112,11 @@ def __init__(self,
         self.linear_model = Linear(
             linear_feature_columns, self.feature_index, device=device)
 
-        self.add_regularization_loss(
+        self.regularization_weight = []
+
+        self.add_regularization_weight(
             self.embedding_dict.parameters(), l2_reg_embedding)
-        self.add_regularization_loss(
+        self.add_regularization_weight(
             self.linear_model.parameters(), l2_reg_linear)
 
         self.out = PredictionLayer(task, )
@@ -216,8 +218,9 @@ def fit(self, x=None,
                         optim.zero_grad()
                         loss = loss_func(y_pred, y.squeeze(), reduction='sum')
+                        reg_loss = self.get_regularization_loss()
 
-                        total_loss = loss + self.reg_loss + self.aux_loss
+                        total_loss = loss + reg_loss + self.aux_loss
 
                         loss_epoch += loss.item()
                         total_loss_epoch += total_loss.item()
@@ -353,16 +356,22 @@ def compute_input_dim(self, feature_columns, include_sparse=True, include_dense=
             input_dim += dense_input_dim
         return input_dim
 
-    def add_regularization_loss(self, weight_list, weight_decay, p=2):
-        reg_loss = torch.zeros((1,), device=self.device)
-        for w in weight_list:
-            if isinstance(w, tuple):
-                l2_reg = torch.norm(w[1], p=p, )
-            else:
-                l2_reg = torch.norm(w, p=p, )
-            reg_loss = reg_loss + l2_reg
-        reg_loss = weight_decay * reg_loss
-        self.reg_loss = self.reg_loss + reg_loss
+    def add_regularization_weight(self, weight_list, weight_decay, p=2):
+        self.regularization_weight.append((list(weight_list), weight_decay, p))
+
+    def get_regularization_loss(self, ):
+        total_reg_loss = torch.zeros((1,), device=self.device)
+        for weight_list, weight_decay, p in self.regularization_weight:
+            weight_reg_loss = torch.zeros((1,), device=self.device)
+            for w in weight_list:
+                if isinstance(w, tuple):
+                    l2_reg = torch.norm(w[1], p=p, )
+                else:
+                    l2_reg = torch.norm(w, p=p, )
+                weight_reg_loss = weight_reg_loss + l2_reg
+            reg_loss = weight_decay * weight_reg_loss
+            total_reg_loss += reg_loss
+        return total_reg_loss
 
     def add_auxiliary_loss(self, aux_loss, alpha):
         self.aux_loss = aux_loss * alpha
diff --git a/deepctr_torch/models/ccpm.py b/deepctr_torch/models/ccpm.py
index a583acd8..9993d028 100644
--- a/deepctr_torch/models/ccpm.py
+++ b/deepctr_torch/models/ccpm.py
@@ -63,9 +63,9 @@ def __init__(self, linear_feature_columns, dnn_feature_columns, conv_kernel_widt
                        activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout,
                        use_bn=dnn_use_bn, init_std=init_std, device=device)
         self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False).to(device)
-        self.add_regularization_loss(
+        self.add_regularization_weight(
             filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
-        self.add_regularization_loss(self.dnn_linear.weight, l2_reg_dnn)
+        self.add_regularization_weight(self.dnn_linear.weight, l2_reg_dnn)
 
         self.to(device)
 
diff --git a/deepctr_torch/models/dcn.py b/deepctr_torch/models/dcn.py
index 8573f206..e94f5b33 100644
--- a/deepctr_torch/models/dcn.py
+++ b/deepctr_torch/models/dcn.py
@@ -64,10 +64,10 @@ def __init__(self,linear_feature_columns,
                        device)
         self.crossnet = CrossNet(in_features=self.compute_input_dim(dnn_feature_columns),
                                  layer_num=cross_num, seed=1024, device=device)
-        self.add_regularization_loss(
+        self.add_regularization_weight(
             filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
-        self.add_regularization_loss(self.dnn_linear.weight, l2_reg_linear)
-        self.add_regularization_loss(self.crossnet.kernels, l2_reg_cross)
+        self.add_regularization_weight(self.dnn_linear.weight, l2_reg_linear)
+        self.add_regularization_weight(self.crossnet.kernels, l2_reg_cross)
         self.to(device)
 
     def forward(self, X):
diff --git a/deepctr_torch/models/deepfm.py b/deepctr_torch/models/deepfm.py
index edf4b561..539f8fd9 100644
--- a/deepctr_torch/models/deepfm.py
+++ b/deepctr_torch/models/deepfm.py
@@ -62,9 +62,9 @@ def __init__(self,
             self.dnn_linear = nn.Linear(
                 dnn_hidden_units[-1], 1, bias=False).to(device)
-            self.add_regularization_loss(
+            self.add_regularization_weight(
                 filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
-            self.add_regularization_loss(self.dnn_linear.weight, l2_reg_dnn)
+            self.add_regularization_weight(self.dnn_linear.weight, l2_reg_dnn)
         self.to(device)
 
     def forward(self, X):
diff --git a/deepctr_torch/models/nfm.py b/deepctr_torch/models/nfm.py
index c206c789..7ec56403 100644
--- a/deepctr_torch/models/nfm.py
+++ b/deepctr_torch/models/nfm.py
@@ -51,9 +51,9 @@ def __init__(self,
                        init_std=init_std, device=device)
         self.dnn_linear = nn.Linear(
             dnn_hidden_units[-1], 1, bias=False).to(device)
-        self.add_regularization_loss(
+        self.add_regularization_weight(
             filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
-        self.add_regularization_loss(self.dnn_linear.weight, l2_reg_dnn)
+        self.add_regularization_weight(self.dnn_linear.weight, l2_reg_dnn)
         self.bi_pooling = BiInteractionPooling()
         self.bi_dropout = bi_dropout
         if self.bi_dropout > 0:
diff --git a/deepctr_torch/models/onn.py b/deepctr_torch/models/onn.py
index df7fb8d5..b4837dc2 100644
--- a/deepctr_torch/models/onn.py
+++ b/deepctr_torch/models/onn.py
@@ -73,7 +73,7 @@ def __init__(self, linear_feature_columns, dnn_feature_columns,
             dnn_feature_columns, embedding_size=embedding_size, sparse=False).to(device)
 
         # add regularization for second_order_embedding
-        self.add_regularization_loss(
+        self.add_regularization_weight(
             self.second_order_embedding_dict.parameters(), l2_reg_embedding)
 
         dim = self.__compute_nffm_dnn_dim(
@@ -85,9 +85,9 @@ def __init__(self, linear_feature_columns, dnn_feature_columns,
                        init_std=init_std, device=device)
         self.dnn_linear = nn.Linear(
             dnn_hidden_units[-1], 1, bias=False).to(device)
-        self.add_regularization_loss(
+        self.add_regularization_weight(
             filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
-        self.add_regularization_loss(self.dnn_linear.weight, l2_reg_dnn)
+        self.add_regularization_weight(self.dnn_linear.weight, l2_reg_dnn)
         self.to(device)
 
     def __compute_nffm_dnn_dim(self, feature_columns, embedding_size):
diff --git a/deepctr_torch/models/pnn.py b/deepctr_torch/models/pnn.py
index 2efc04c1..f528d212 100644
--- a/deepctr_torch/models/pnn.py
+++ b/deepctr_torch/models/pnn.py
@@ -72,9 +72,9 @@ def __init__(self, dnn_feature_columns, dnn_hidden_units=(128, 128), l2_reg_embe
         self.dnn_linear = nn.Linear(
             dnn_hidden_units[-1], 1, bias=False).to(device)
-        self.add_regularization_loss(
+        self.add_regularization_weight(
             filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
-        self.add_regularization_loss(self.dnn_linear.weight, l2_reg_dnn)
+        self.add_regularization_weight(self.dnn_linear.weight, l2_reg_dnn)
 
         self.to(device)
 
diff --git a/deepctr_torch/models/wdl.py b/deepctr_torch/models/wdl.py
index 4d2f840a..59843f1b 100644
--- a/deepctr_torch/models/wdl.py
+++ b/deepctr_torch/models/wdl.py
@@ -54,9 +54,9 @@ def __init__(self,
                            activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout, use_bn=dnn_use_bn,
                            init_std=init_std, device=device)
             self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False).to(device)
-            self.add_regularization_loss(
+            self.add_regularization_weight(
                 filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
-            self.add_regularization_loss(self.dnn_linear.weight, l2_reg_dnn)
+            self.add_regularization_weight(self.dnn_linear.weight, l2_reg_dnn)
 
         self.to(device)
 
diff --git a/deepctr_torch/models/xdeepfm.py b/deepctr_torch/models/xdeepfm.py
index de7f7230..851c9080 100644
--- a/deepctr_torch/models/xdeepfm.py
+++ b/deepctr_torch/models/xdeepfm.py
@@ -57,10 +57,10 @@ def __init__(self, linear_feature_columns, dnn_feature_columns, dnn_hidden_units
                            activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout, use_bn=dnn_use_bn,
                            init_std=init_std, device=device)
             self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False).to(device)
-            self.add_regularization_loss(
+            self.add_regularization_weight(
                 filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
-            self.add_regularization_loss(self.dnn_linear.weight, l2_reg_dnn)
+            self.add_regularization_weight(self.dnn_linear.weight, l2_reg_dnn)
 
         self.cin_layer_size = cin_layer_size
         self.use_cin = len(self.cin_layer_size) > 0 and len(dnn_feature_columns) > 0
@@ -74,7 +74,7 @@ def __init__(self, linear_feature_columns, dnn_feature_columns, dnn_hidden_units
             self.cin = CIN(field_num, cin_layer_size,
                            cin_activation, cin_split_half, l2_reg_cin, seed, device=device)
             self.cin_linear = nn.Linear(self.featuremap_num, 1, bias=False).to(device)
-            self.add_regularization_loss(
+            self.add_regularization_weight(
                 filter(lambda x: 'weight' in x[0], self.cin.named_parameters()), l2_reg_cin)
 
         self.to(device)
diff --git a/docs/pics/weichennote.png b/docs/pics/weichennote.png
index fec7b11b..0b60a2f3 100644
Binary files a/docs/pics/weichennote.png and b/docs/pics/weichennote.png differ
diff --git a/docs/source/History.md b/docs/source/History.md
index 5d06a97b..4f96f576 100644
--- a/docs/source/History.md
+++ b/docs/source/History.md
@@ -1,4 +1,5 @@
 # History
+- 10/09/2020 : [v0.2.2](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.2) released.Improve the reproducibility & fix some bugs.
 - 03/27/2020 : [v0.2.1](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.1) released.Add [DIN](./Features.html#din-deep-interest-network) and [DIEN](./Features.html#dien-deep-interest-evolution-network) .
 - 01/31/2020 : [v0.2.0](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.0) released.Refactor [feature columns](./Features.html#feature-columns).Support to use double precision in metric calculation.
 - 10/03/2019 : [v0.1.3](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.3) released.Simplify the input logic.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 3098ee95..5a54c9cd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@
 # The short X.Y version
 version = ''
 # The full version, including alpha/beta/rc tags
-release = '0.2.1'
+release = '0.2.2'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 52b6aac0..2f473b76 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -34,12 +34,12 @@ You can read the latest code at https://github.com/shenweichen/DeepCTR-Torch and
 News
 -----
+10/09/2020 : Improve the reproducibility & fix some bugs. `Changelog <https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.2>`_
+
 03/27/2020 : Add `DIN <./Features.html#din-deep-interest-network>`_ and `DIEN <./Features.html#dien-deep-interest-evolution-network>`_ . `Changelog <https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.1>`_
 
 01/31/2020 : Refactor `feature columns <./Features.html#feature-columns>`_ . Support double precision in metric calculation . `Changelog <https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.0>`_
 
-10/03/2019 : Simplify the input logic(`examples <./Examples.html#classification-criteo>`_). `Changelog <https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.3>`_
-
 DisscussionGroup
 -----------------------
diff --git a/setup.py b/setup.py
index a57727e4..415844fe 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 setuptools.setup(
     name="deepctr-torch",
-    version="0.2.1",
+    version="0.2.2",
     author="Weichen Shen",
     author_email="wcshen1994@163.com",
     description="Easy-to-use,Modular and Extendible package of deep learning based CTR(Click Through Rate) prediction models with PyTorch",
diff --git a/tests/models/DIEN_test.py b/tests/models/DIEN_test.py
index a2cdb6ae..ef5882b2 100644
--- a/tests/models/DIEN_test.py
+++ b/tests/models/DIEN_test.py
@@ -90,8 +90,8 @@ def test_DIEN(gru_type, use_neg):
     x, y, feature_columns, behavior_feature_list = get_xy_fd(use_neg=use_neg)
 
-    model = DIEN(feature_columns, behavior_feature_list,
-                 dnn_hidden_units=[4, 4, 4], dnn_dropout=0.5, gru_type=gru_type, device=get_device())
+    model = DIEN(feature_columns, behavior_feature_list, gru_type=gru_type, use_negsampling=use_neg,
+                 dnn_hidden_units=[4, 4, 4], dnn_dropout=0.5, device=get_device())
 
     check_model(model, model_name, x, y)
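
The central change in this diff is the regularization refactor in basemodel.py: add_regularization_loss, which folded an L2 penalty into a single self.reg_loss tensor as soon as weights were registered, becomes add_regularization_weight, which only records (weight_list, weight_decay, p) tuples; the new get_regularization_loss then recomputes the penalty from the live parameters on every training step, so it tracks the weights as they are updated rather than a value fixed at construction time. Below is a minimal self-contained sketch of that pattern; TinyModel, its layer sizes, and the MSE loss are illustrative stand-ins, not part of the diff.

import torch
import torch.nn as nn


class TinyModel(nn.Module):
    """Toy stand-in illustrating the register-then-recompute pattern."""

    def __init__(self, device='cpu'):
        super().__init__()
        self.device = device
        self.linear = nn.Linear(4, 1)
        self.regularization_weight = []
        # Register weights once, with their decay; no loss is computed here.
        self.add_regularization_weight(self.linear.parameters(), weight_decay=1e-4)

    def add_regularization_weight(self, weight_list, weight_decay, p=2):
        # Materialize generators/filters so they can be re-iterated every step.
        self.regularization_weight.append((list(weight_list), weight_decay, p))

    def get_regularization_loss(self):
        # Recompute the penalty from the current parameter values.
        total_reg_loss = torch.zeros((1,), device=self.device)
        for weight_list, weight_decay, p in self.regularization_weight:
            for w in weight_list:
                # named_parameters() yields (name, tensor) tuples.
                param = w[1] if isinstance(w, tuple) else w
                total_reg_loss = total_reg_loss + weight_decay * torch.norm(param, p=p)
        return total_reg_loss


model = TinyModel()
x, y = torch.randn(8, 4), torch.randn(8, 1)
# Mirrors the change in fit(): the regularization loss is fetched fresh per batch.
total_loss = nn.functional.mse_loss(model.linear(x), y) + model.get_regularization_loss()
total_loss.backward()  # the penalty is part of the graph, so it contributes gradients

Note the list(weight_list) call in add_regularization_weight: several models pass filter(...) objects or .parameters() generators, which would be exhausted after a single pass if stored as-is, so materializing them is what allows the loss to be recomputed on every step.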