Commit
This reverts commit dc32b26.
浅梦 committed Mar 27, 2020 (1 parent dc32b26, commit baa240b)
Showing 26 changed files with 239 additions and 1,374 deletions.
@@ -4,12 +4,10 @@
Weichen Shen,[email protected]
"""

from collections import OrderedDict, namedtuple, defaultdict
from itertools import chain
from collections import OrderedDict, namedtuple

import torch
import torch.nn as nn
import numpy as np

from .layers.sequence import SequencePoolingLayer
from .layers.utils import concat_fun
@@ -29,8 +27,7 @@ def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, dtype="
        if embedding_dim == "auto":
            embedding_dim = 6 * int(pow(vocabulary_size, 0.25))
        if use_hash:
            print(
                "Notice! Feature Hashing on the fly currently is not supported in torch version,you can use tensorflow version!")
            print("Notice! Feature Hashing on the fly currently is not supported in torch version,you can use tensorflow version!")
        return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, dtype,
                                              embedding_name, group_name)
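For reference, a quick worked example of the embedding_dim="auto" rule shown in this hunk; the vocabulary size below is made up for illustration.

# Worked example (hypothetical vocabulary size): the "auto" rule picks
# 6 * int(vocabulary_size ** 0.25) as the embedding width.
vocabulary_size = 10000
embedding_dim = 6 * int(pow(vocabulary_size, 0.25))  # 6 * 10 = 60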
@@ -111,14 +108,23 @@ def build_input_features(feature_columns):
        elif isinstance(feat, VarLenSparseFeat):
            features[feat_name] = (start, start + feat.maxlen)
            start += feat.maxlen
            if feat.length_name is not None and feat.length_name not in features:
            if feat.length_name is not None:
                features[feat.length_name] = (start, start + 1)
                start += 1
        else:
            raise TypeError("Invalid feature column type,got", type(feat))
    return features


# def get_dense_input(features, feature_columns):
#     dense_feature_columns = list(filter(lambda x: isinstance(
#         x, DenseFeat), feature_columns)) if feature_columns else []
#     dense_input_list = []
#     for fc in dense_feature_columns:
#         dense_input_list.append(features[fc.name])
#     return dense_input_list


def combined_dnn_input(sparse_embedding_list, dense_value_list):
    if len(sparse_embedding_list) > 0 and len(dense_value_list) > 0:
        sparse_dnn_input = torch.flatten(
@@ -133,6 +139,72 @@ def combined_dnn_input(sparse_embedding_list, dense_value_list):
    else:
        raise NotImplementedError

#
# def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(),
#                      mask_feat_list=(), to_list=False):
#     """
#     Args:
#         sparse_embedding_dict: nn.ModuleDict, {embedding_name: nn.Embedding}
#         sparse_input_dict: OrderedDict, {feature_name:(start, start+dimension)}
#         sparse_feature_columns: list, sparse features
#         return_feat_list: list, names of feature to be returned, defualt () -> return all features
#         mask_feat_list, list, names of feature to be masked in hash transform
#     Return:
#         group_embedding_dict: defaultdict(list)
#     """
#     group_embedding_dict = defaultdict(list)
#     for fc in sparse_feature_columns:
#         feature_name = fc.name
#         embedding_name = fc.embedding_name
#         if (len(return_feat_list) == 0 or feature_name in return_feat_list):
#             if fc.use_hash:
#                 # lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list))(
#                 #     sparse_input_dict[feature_name])
#                 # TODO: add hash function
#                 lookup_idx = sparse_input_dict[feature_name]
#             else:
#                 lookup_idx = sparse_input_dict[feature_name]
#
#             group_embedding_dict[fc.group_name].append(sparse_embedding_dict[embedding_name](lookup_idx))
#     if to_list:
#         return list(chain.from_iterable(group_embedding_dict.values()))
#     return group_embedding_dict
#
#
# def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns):
#     varlen_embedding_vec_dict = {}
#     for fc in varlen_sparse_feature_columns:
#         feature_name = fc.name
#         embedding_name = fc.embedding_name
#         if fc.use_hash:
#             # lookup_idx = Hash(fc.vocabulary_size, mask_zero=True)(sequence_input_dict[feature_name])
#             # TODO: add hash function
#             lookup_idx = sequence_input_dict[feature_name]
#         else:
#             lookup_idx = sequence_input_dict[feature_name]
#         varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx)
#     return varlen_embedding_vec_dict
#
#
# def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns, to_list=False):
#     pooling_vec_list = defaultdict(list)
#     for fc in varlen_sparse_feature_columns:
#         feature_name = fc.name
#         combiner = fc.combiner
#         feature_length_name = fc.length_name
#         if feature_length_name is not None:
#             seq_input = embedding_dict[feature_name]
#             vec = SequencePoolingLayer(combiner)([seq_input, features[feature_length_name]])
#         else:
#             seq_input = embedding_dict[feature_name]
#             vec = SequencePoolingLayer(combiner)(seq_input)
#         pooling_vec_list[fc.group_name].append(vec)
#
#     if to_list:
#         return chain.from_iterable(pooling_vec_list.values())
#
#     return pooling_vec_list


def get_varlen_pooling_list(embedding_dict, features, feature_index, varlen_sparse_feature_columns, device):
    varlen_sparse_embedding_list = []
@@ -177,95 +249,3 @@ def create_embedding_matrix(feature_columns, init_std=0.0001, linear=False, spar
        nn.init.normal_(tensor.weight, mean=0, std=init_std)

    return embedding_dict.to(device)


def input_from_feature_columns(self, X, feature_columns, embedding_dict, support_dense=True):
    sparse_feature_columns = list(
        filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if len(feature_columns) else []
    dense_feature_columns = list(
        filter(lambda x: isinstance(x, DenseFeat), feature_columns)) if len(feature_columns) else []

    varlen_sparse_feature_columns = list(
        filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else []

    if not support_dense and len(dense_feature_columns) > 0:
        raise ValueError(
            "DenseFeat is not supported in dnn_feature_columns")

    sparse_embedding_list = [embedding_dict[feat.embedding_name](
        X[:, self.feature_index[feat.name][0]:self.feature_index[feat.name][1]].long()) for
        feat in sparse_feature_columns]

    varlen_sparse_embedding_list = get_varlen_pooling_list(self.embedding_dict, X, self.feature_index,
                                                           varlen_sparse_feature_columns, self.device)

    dense_value_list = [X[:, self.feature_index[feat.name][0]:self.feature_index[feat.name][1]] for feat in
                        dense_feature_columns]

    return sparse_embedding_list + varlen_sparse_embedding_list, dense_value_list


def embedding_lookup(X, sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(),
                     mask_feat_list=(), to_list=False):
    """
    Args:
        X: input Tensor [batch_size x hidden_dim]
        sparse_embedding_dict: nn.ModuleDict, {embedding_name: nn.Embedding}
        sparse_input_dict: OrderedDict, {feature_name:(start, start+dimension)}
        sparse_feature_columns: list, sparse features
        return_feat_list: list, names of feature to be returned, defualt () -> return all features
        mask_feat_list, list, names of feature to be masked in hash transform
    Return:
        group_embedding_dict: defaultdict(list)
    """
    group_embedding_dict = defaultdict(list)
    for fc in sparse_feature_columns:
        feature_name = fc.name
        embedding_name = fc.embedding_name
        if (len(return_feat_list) == 0 or feature_name in return_feat_list):
            # TODO: add hash function
            # if fc.use_hash:
            #     raise NotImplementedError("hash function is not implemented in this version!")
            lookup_idx = np.array(sparse_input_dict[feature_name])
            input_tensor = X[:, lookup_idx[0]:lookup_idx[1]].long()
            emb = sparse_embedding_dict[embedding_name](input_tensor)
            group_embedding_dict[fc.group_name].append(emb)
    if to_list:
        return list(chain.from_iterable(group_embedding_dict.values()))
    return group_embedding_dict


def varlen_embedding_lookup(X, embedding_dict, sequence_input_dict, varlen_sparse_feature_columns):
    varlen_embedding_vec_dict = {}
    for fc in varlen_sparse_feature_columns:
        feature_name = fc.name
        embedding_name = fc.embedding_name
        if fc.use_hash:
            # lookup_idx = Hash(fc.vocabulary_size, mask_zero=True)(sequence_input_dict[feature_name])
            # TODO: add hash function
            lookup_idx = sequence_input_dict[feature_name]
        else:
            lookup_idx = sequence_input_dict[feature_name]
        varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](
            X[:, lookup_idx[0]:lookup_idx[1]].long())  # (lookup_idx)

    return varlen_embedding_vec_dict


def get_dense_input(X, features, feature_columns):
    dense_feature_columns = list(filter(lambda x: isinstance(
        x, DenseFeat), feature_columns)) if feature_columns else []
    dense_input_list = []
    for fc in dense_feature_columns:
        lookup_idx = np.array(features[fc.name])
        input_tensor = X[:, lookup_idx[0]:lookup_idx[1]].float()
        dense_input_list.append(input_tensor)
    return dense_input_list


def maxlen_lookup(X, sparse_input_dict, maxlen_column):
    if maxlen_column is None or len(maxlen_column)==0:
        raise ValueError('please add max length column for VarLenSparseFeat of DIEN input')
    lookup_idx = np.array(sparse_input_dict[maxlen_column[0]])
    return X[:, lookup_idx[0]:lookup_idx[1]].long()
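For orientation, here is a minimal sketch (not part of the commit) of how the input utilities that remain after this diff are typically wired together. The signatures follow the code above; the import path deepctr_torch.inputs, the feature names, and the batch layout are assumptions made for illustration.

# Sketch only: signatures follow the diff above, everything else is assumed.
import torch
from deepctr_torch.inputs import (SparseFeat, DenseFeat, build_input_features,
                                  create_embedding_matrix, combined_dnn_input)

feature_columns = [SparseFeat('user_id', vocabulary_size=100, embedding_dim=4),
                   DenseFeat('price', 1)]

# Map each feature name to its (start, end) slice in the flat input tensor,
# e.g. {'user_id': (0, 1), 'price': (1, 2)}.
feature_index = build_input_features(feature_columns)

# One nn.Embedding per sparse feature, keyed by its embedding_name
# (which defaults to the feature name).
embedding_dict = create_embedding_matrix(feature_columns, device='cpu')

# A toy batch of 8 rows laid out according to feature_index.
X = torch.zeros(8, 2)
X[:, 0] = torch.randint(0, 100, (8,)).float()  # user_id indices
X[:, 1] = torch.rand(8)                        # price values

s, e = feature_index['user_id']
sparse_embedding_list = [embedding_dict['user_id'](X[:, s:e].long())]  # [8, 1, 4]
s, e = feature_index['price']
dense_value_list = [X[:, s:e]]                                         # [8, 1]

dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)  # [8, 5]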
@@ -1,4 +1,4 @@
from .interaction import *
from .core import *
from .utils import concat_fun
from .sequence import *
from .sequence import KMaxPooling, SequencePoolingLayer
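For context, a short sketch of the practical difference between the two re-export styles in this hunk; it assumes the file is the layers package __init__ and that the package is importable as deepctr_torch.layers.

# With the wildcard form, every public name defined in .sequence is re-exported
# by the package; with the explicit form, only the listed classes are.
from deepctr_torch.layers import KMaxPooling, SequencePoolingLayer  # works in both cases
# Any other layer defined in .sequence would then need a direct module import:
# from deepctr_torch.layers.sequence import SomeOtherLayer  # hypothetical name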