forked from zotroneneis/machine_learning_basics
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathk_nearest_neighbors.rs
91 lines (87 loc) · 3.27 KB
/
k_nearest_neighbors.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
use super::RngSeed;
use csv;
use ndarray::*;
use std::collections::HashMap;
use std::hash::Hash;
use std::ops::AddAssign;
use utils::{argsort_floats_1d, shuffle2, train_test_split};
/// Computes the euclidean distance between each example in the training data
/// and a new input example
fn euclid_distance(data_x: &Array2<f64>, example: &ArrayView1<f64>) -> Array1<f64> {
let mut x1: Array2<f64> = data_x - example;
x1.mapv_inplace(|e| e.powf(2.0));
let mut x3: Array1<f64> = x1.map_axis(Axis(0), |row| row.scalar_sum());
x3.mapv_inplace(f64::sqrt);
x3
}
/// Given the data set 'data_x' and 'data_y', predict the label of 'example'
/// using the 'k' nearest neighbors
fn predict<Y: Clone + Hash + Eq>(
data_x: &Array2<f64>,
data_y: &Array1<Y>,
example: &ArrayView1<f64>,
k: usize,
) -> Y {
let dists = euclid_distance(data_x, example);
let mut dists_ix = argsort_floats_1d(&dists);
// Indecies of the 'k' smallest distance data entries
dists_ix.slice_collapse(s![..k]);
// Labels of 'k' nearest entries
let k_ys: Array1<Y> = dists_ix.map(|ix| data_y[[*ix]].clone());
// Group targets by their count
let mut y_counts: HashMap<Y, usize> = HashMap::new();
for y in k_ys.iter() {
if y_counts.contains_key(&y) {
y_counts.get_mut(&y).unwrap().add_assign(1);
} else {
// TODO should this not be possible without cloning, since its into_iter
y_counts.insert(y.clone(), 1);
}
}
// Return the most common label among 'k' nearest neighbors
y_counts
.into_iter()
.max_by(|(_, cnt1), (_, cnt2)| cnt1.cmp(&cnt2))
.unwrap()
.0
}
pub fn run(k: usize, train_test_split_ratio: f64, rng_seed: Option<RngSeed>) {
// Load data from csv file into arrays
let file_path = "datasets/digits.csv";
let mut rdr = csv::Reader::from_path(file_path).unwrap();
let n_samples = 1796;
// let n_samples = rdr.records().count();
let n_features = rdr.headers().unwrap().len() - 1;
let mut data_x: Array2<f64> = Array::zeros((n_samples, n_features + 1));
let mut data_y: Array1<u32> = Array::zeros(n_samples);
for (i, result) in rdr.records().enumerate() {
let r = result.unwrap();
// The last entry of each row in the csv file is the target/label
let y = r.get(n_features).expect("idx").parse().expect("parse");
data_y[[i]] = y;
let x: Array1<f64> = r
.into_iter()
.take(n_features)
.map(|e| e.parse().unwrap())
.collect();
data_x.slice_mut(s![i, ..-1]).assign(&x);
}
let (data_x, data_y) = shuffle2(data_x, data_y, rng_seed);
let dataset = train_test_split(data_x, data_y, train_test_split_ratio);
// Compute accuracy on test set
let y_preds: Array1<u32> = dataset
.x_test
.outer_iter()
.map(|example| predict(&dataset.x_train, &dataset.y_train, &example, k))
.collect();
let n_eq =
y_preds
.into_iter()
.zip(dataset.y_test.into_iter())
.fold(
0,
|acc, (y_pred, y_targ)| if y_pred == y_targ { acc + 1 } else { acc },
);
let test_acc: f64 = (n_eq as f64) / (y_preds.len() as f64) * 100.00;
println!("{}", test_acc);
}