-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature_hash.cpp
64 lines (44 loc) · 1.58 KB
/
feature_hash.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/*
The function feature_hash implements the feature hashing trick described in reference [1] below.
References
----------
[1] Kilian Weinberger; Anirban Dasgupta; John Langford; Alex Smola; Josh Attenberg (2009). Feature Hashing
for Large Scale Multitask Learning. Proc. ICML.
Caveat: Use the code at your own risk
Contact: Chandresh Maurya to report bugs or suggest improvements
Email: [email protected]
*/
#include<functional>//for hash function
#include<iostream>
#include<stdexcept>//for std::invalid_argument
#include<armadillo>
#include "hash.h"
/***************************Namespaces***********************************************/
using namespace std;
using namespace arma;
/*
 * feature_hash: signed feature hashing of a numeric matrix.
 *
 * train_cat  : OUTPUT. Resized to cat_data.n_rows x output_dim, zero-filled,
 *              then accumulated into; any previous contents are discarded.
 * cat_data   : INPUT. Every non-zero entry contributes +1 or -1 to one bucket
 *              of its own row; entries equal to 0 are skipped.
 * output_dim : number of hash buckets (columns of train_cat); must be > 0.
 *
 * Throws std::invalid_argument when output_dim is not positive. Without this
 * check, output_dim == 0 would reach `value % 0` (undefined behaviour) and a
 * negative value would be converted to a huge unsigned column count.
 *
 * NOTE(review): only the entry's value is hashed, not its column index j, so
 * equal values in different columns of a row land in the same bucket. Confirm
 * that the caller's encoding makes values unique across columns.
 */
void feature_hash(mat& train_cat,mat& cat_data, int output_dim)
{
    if(output_dim <= 0)
    {
        throw std::invalid_argument("feature_hash: output_dim must be positive");
    }

    // Convert once; the bucket count is loop-invariant.
    const uint64_t n_buckets = static_cast<uint64_t>(output_dim);
    train_cat.zeros(cat_data.n_rows, static_cast<uword>(output_dim));

    // Qualified explicitly: with both std and arma pulled in by using-directives,
    // an unqualified `hash` could become ambiguous.
    std::hash<double> sign_hash;

    for(uword i = 0; i < cat_data.n_rows; ++i)
    {
        for(uword j = 0; j < cat_data.n_cols; ++j)
        {
            if(cat_data(i,j) == 0)
            {
                continue; // zero entries are skipped
            }

            // Bucket index comes from uniform_hash over the entry's storage
            // (pointer + sizeof(double)); the trailing 1 is presumably a seed
            // -- TODO confirm against hash.h.
            const uint64_t value = uniform_hash(&cat_data(i,j), sizeof(double),1);
            const uword bucket = static_cast<uword>(value % n_buckets);

            // Sign comes from a second, independent hash (parity of std::hash).
            if(sign_hash(cat_data(i,j)) % 2 == 0)
            {
                train_cat(i,bucket) += 1;
            }
            else
            {
                train_cat(i,bucket) -= 1;
            }
        }
    }
}
int main()
{
mat train_cat(4,5,fill::ones); //Assumption:: train_cat stores categorical string data, you should do ordinal encoding to convert strings to uniques numbers before trying feature_hash
mat cat_data; //output hashed data
int output_dim = cat_data.n_cols; // output feature dimensions you want to have or project
feature_hash(train_cat,cat_data, output_dim);
}