Reinitialize .git due to large packed file of 76 MB
jean-pierreBoth committed Apr 18, 2020
Initial commit (0 parents): 3f93e91
Showing 17 changed files with 4,886 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
target/**
Runs
Cargo.lock
rls*
dumpreloadtest*
81 changes: 81 additions & 0 deletions Cargo.toml
@@ -0,0 +1,81 @@
[package]
name = "hnsw_rs"
version = "0.1.3"
authors = ["[email protected]"]
description = "Ann based on Hierarchical Navigable Small World Graphs from Yu. A. Malkov and D. A. Yashunin"
license = "MIT/Apache-2.0"
readme = "README.md"
keywords = ["algorithms", "ann", "hnsw"]
repository = "https://github.com/jean-pierreBoth/hnswlib-rs"
edition= "2018"

[features]

# declare a feature with no dependency to get some optional debug printing
# to be run with cargo build --features verbose_1
#verbose_1 = [ ]

[profile.release]
lto = true
opt-level = 3

[lib]
# cargo rustc --lib -- --crate-type dylib [or staticlib] or rlib (default)
# if we want to avoid specifying the crate-type in advance
name = "hnswlib"
path = "src/lib.rs"
#crate-type = ["dylib"]


[[example]]
name = "random"
path = "examples/random.rs"

[[example]]
name = "ann-glove"
path = "examples/ann-glove25-angular.rs"



[[example]]
name = "ann-mnist"
path = "examples/ann-mnist-784-euclidean.rs"

#[[example]]

[dependencies]
# the default version requirement is ^ : cargo may update up to the next incompatible version
# run cargo doc --no-deps to avoid generating documentation for the dependencies
#

serde = "1.0"
serde_derive = "1.0"


# for parallelism
crossbeam-utils = "0.7"
crossbeam-channel = "0.4"
parking_lot = "0.9"
rayon = "0.9.0"
num_cpus = "1.8.0"
simdeez = "0.6"

cpu-time = "0.1"
time = "0.1.39"

ndarray = "0.12"

clap = "2.29"
# for hashing. hashbrown is still needed because of get_key_value(&key)
hashbrown = "0.3"
skiplist = "0.2.10"

rand = "0.7"
lazy_static = "1.3"
typename = "0.1"
# for benchmark reading
hdf5 = "0.5"
# decreasing order of log for debug build : (max_level_)trace debug info warn error off
# decreasing order of log for release build (release_max_level_) .. idem
log = { version = "0.4", features = ["max_level_debug", "release_max_level_info"] }
simple_logger = { version = "1.0"}
13 changes: 13 additions & 0 deletions LICENSE-APACHE
@@ -0,0 +1,13 @@
Copyright 2020 jean-pierre.both

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
25 changes: 25 additions & 0 deletions LICENSE-MIT
@@ -0,0 +1,25 @@
Copyright (c) 2020 jean-pierre.both

Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
95 changes: 95 additions & 0 deletions README.md
@@ -0,0 +1,95 @@
# hnsw-rs

This crate provides a Rust implementation of the paper by Yu. A. Malkov and D. A. Yashunin:

"Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs" (2016, 2018)
[https://arxiv.org/abs/1603.09320]

## License

Licensed under either of

* Apache License, Version 2.0, [LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>
* MIT license [LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>

at your option.

This software was written on my own while working at [CEA](http://www.cea.fr/), [CEA-LIST](http://www-list.cea.fr/en/).

## Functionalities

The crate provides:

* The usual distances such as L1, L2, Cosine, Jaccard, and Hamming, for vectors of standard numeric types.

* Hellinger and Jeffreys distances between probability distributions (f32 and f64). It must be noted that the Jeffreys distance
(a symmetrized Kullback-Leibler divergence) does not satisfy the triangle inequality (neither does the Cosine distance).

* A structure that enables users to implement their own distances. It takes as data vectors of types `T: Copy + Clone + Send + Sync` (see the sketch after this list).

* An interface towards C and, more specifically, towards the [Julia](https://julialang.org/) language.
See the companion Julia package [HnswAnn.jl](https://gitlab.com/jpboth/HnswAnn.jl) and the Building paragraph below for some help for Julia users.

* Dump and reload functions (cf. the hnswio module) to store the graph once it is built. As the time needed to compute the graph can be significant, it can be useful to store it for future use.

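As an illustration of the custom-distance point above, here is a minimal sketch of a user-defined distance. It assumes the `Distance` trait of the crate's prelude exposes an `eval(&self, &[T], &[T]) -> f32` method; the weighted-L1 struct itself is purely hypothetical.

```rust
use hnsw_rs::prelude::*;

// Hypothetical user-defined distance: a weighted L1 distance over f32 vectors.
struct DistL1Weighted {
    weights: Vec<f32>,
}

impl Distance<f32> for DistL1Weighted {
    fn eval(&self, va: &[f32], vb: &[f32]) -> f32 {
        va.iter()
            .zip(vb.iter())
            .zip(self.weights.iter())
            .map(|((a, b), w)| w * (a - b).abs())
            .sum()
    }
}
```
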
## Implementation

The graph construction and the searches are multithreaded with the **parking_lot** crate (see the **parallel_insert_data** and **parallel_search_neighbours** functions and the example files).
For the heavily used case (f32) we provide a SIMD AVX2 implementation of the distance computations,
currently based on the **simdeez** crate.
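
A minimal sketch of the parallel entry points, modelled on the calls made in `examples/ann-glove25-angular.rs` (the `data` and `queries` arguments and the L2 distance are placeholders):

```rust
use hnsw_rs::prelude::*;

// Sketch: insert (vector, id) pairs in parallel, then run batched parallel searches.
fn parallel_demo(hnsw: &mut Hnsw<f32, DistL2>,
                 data: &Vec<(&Vec<f32>, usize)>,
                 queries: &Vec<Vec<f32>>) {
    hnsw.parallel_insert(data);           // multithreaded insertion
    let (knbn, ef_search) = (10, 48);     // neighbours asked for, search width
    let answers: Vec<Vec<Neighbour>> = hnsw.parallel_search(queries, knbn, ef_search);
    println!("got {} answer sets", answers.len());
}
```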

## Building

By default the crate is a standalone project; it builds a static library and the example executables.
To use it with the companion Julia package it is necessary to build a dynamic library.
This can be done by just uncommenting (i.e. removing the #) the following line in Cargo.toml:

*#crate-type = ["dylib"]*

and rerunning the command: `cargo build --release`.

This will generate a `.so` file in the `target/release` directory.

## Algorithm and Input Parameters

The algorithm stores the points in layers (at most 16), and a graph is constructed to enable searching from the less densely populated upper layers down to the most densely populated layer, by building links from the sparser layers towards the densest one (level 0).

Roughly, the algorithm runs as follows:

Upon insertion, the level ***l*** of a new point is sampled from an exponential law, limiting the number of levels to 16,
so that level 0 is the most densely populated layer, the upper layers being exponentially less populated as the level increases.
The nearest neighbour of the point is first searched in lookup tables from the upper level down to the level just above its own level (***l***), so we should arrive near the new point at its level at a relatively low cost. Then the ***max_nb_connection*** nearest neighbours are searched in neighbours-of-neighbours tables (with a reverse update of the tables) recursively from its level ***l*** down to the most populated level 0. A sketch of this descent phase follows.
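
Illustrative pseudocode for the descent (not the crate's implementation; `nearest_in_layer` is a hypothetical, stubbed helper standing in for a greedy walk inside one layer):

```rust
// Hypothetical helper: a greedy walk inside one layer; stubbed for illustration.
fn nearest_in_layer(start: usize, _query: &[f32], _layer: usize) -> usize {
    start // would follow links to the closest neighbour of the query in this layer
}

// From the top layer down to the layer just above l, greedily move towards the
// query; the result is the entry point for the insertion work at layer l.
fn descend(entry: usize, query: &[f32], top_layer: usize, l: usize) -> usize {
    let mut current = entry;
    for layer in ((l + 1)..=top_layer).rev() {
        current = nearest_in_layer(current, query, layer);
    }
    current
}
```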

The scale parameter of the exponential law depends on the maximum number of connections allowed from a point to the others (the parameter ***max_nb_connection***).
Explicitly, the scale parameter is chosen as: `scale = 1/ln(max_nb_connection)`.
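
As an illustration, a sketch of this textbook sampling rule (not the crate's actual code) could look as follows:

```rust
use rand::Rng;

// Sample the insertion level: scale = 1/ln(max_nb_connection),
// level = floor(-ln(u) * scale) for u uniform in (0,1), capped at max_layer - 1.
fn sample_level(max_nb_connection: usize, max_layer: usize) -> usize {
    let scale = 1.0 / (max_nb_connection as f64).ln();
    let u: f64 = rand::thread_rng().gen::<f64>().max(f64::MIN_POSITIVE); // avoid ln(0)
    ((-u.ln() * scale).floor() as usize).min(max_layer - 1)
}
```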

The main parameters occurring in the construction of the graph and in searching are:

* max_nb_connection (in hnsw initialization)
The maximum number of links from one point to others. Values ranging from 16 to 64 are standard initialisation values; the higher the value, the more time-consuming the construction.

* ef_construction (in hnsw initialization)
This parameter controls the width of the search for neighbours during insertion. Values from 200 to 800 are standard initialisation values; the higher the value, the more time-consuming the construction.

* max_layer (in hnsw initialization)
The maximum number of layers in the graph. It must be less than or equal to 16.

* ef_arg (in search methods)
This parameter controls the width of the search at the lowest level. It must be greater than the number of neighbours asked for, but can be less than ***ef_construction***.
As a rule of thumb it could lie between the number of neighbours requested (the knbn argument of the search methods) and ***max_nb_connection***.

* keep_pruned and extend_candidates
These parameters, described in the paper by Malkov and Yashunin, can be used to modify the search strategy. The interested user should check the paper to see their impact. By default
the values are those recommended in the paper. A short sketch showing where all these parameters appear follows.
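
For concreteness, here is a minimal sketch of where these parameters appear at allocation and search time, modelled on the examples shipped with the crate (the data point and the L2 distance are placeholders):

```rust
use hnsw_rs::prelude::*;

fn parameters_demo() {
    let nb_elem = 10_000;        // number of points we intend to insert
    let max_nb_connection = 24;  // max links from one point to others
    let ef_construction = 400;   // width of the neighbour search during insertion
    let max_layer = 16;          // at most 16 layers
    let mut hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, max_layer,
                                            ef_construction, DistL2{});
    // placeholder data point, inserted with id 0
    let point: Vec<f32> = vec![0.; 25];
    hnsw.insert((&point, 0));
    // ef_arg: wider than the number of neighbours asked for (knbn),
    // possibly smaller than ef_construction
    let (knbn, ef_arg) = (10, 24);
    let neighbours: Vec<Neighbour> = hnsw.search(&point, knbn, ef_arg);
    println!("found {} neighbours", neighbours.len());
}
```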

## Examples and Benchmarks

Some examples are taken from the [ann-benchmarks site](https://github.com/erikbern/ann-benchmarks),
and recall rates and requests/s are given in comments in the example files for some input parameters.
The annhdf5 module implements reading the standardized data files
of the [ann-benchmarks site](https://github.com/erikbern/ann-benchmarks);
just download the necessary benchmark data files and modify the paths in the sources accordingly.
Then run: `cargo build --examples --release`.
In these examples it is possible to switch from parallel searches to serial searches to compare speeds,
or to modify parameters to see their impact on performance.
147 changes: 147 additions & 0 deletions examples/ann-glove25-angular.rs
@@ -0,0 +1,147 @@
use std::time::{Duration, SystemTime};
use cpu_time::ProcessTime;


use typename::TypeName;


// glove 25 // 2.7 GHz, 4 cores, 8 MB L3, k = 10
//
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
// 24 800 64 1. 1 0 0.928 4090 1.003
// 24 800 64 1. 1 1 0.927 4594 1.003
// 24 400, 48 1. 1 0 0.919 6349 1.0044
// 24 800 48 1 1 1 0.918 5785 1.005
// 24 400 32 1. 0 0 0.898 8662
// 24 400 64 1. 1 0 0.930 4711 1.0027
// 24 400 64 1. 1 1 0.921 4550 1.0039
// 24 400 64 0.5 0 0 0.916 4896 1.0046
// 24 1600 48 1 1 0 0.924 5380 1.0034

// 32 400 48 1 1 0 0.93 4706 1.0026
// 32 800 64 1 1 0 0.94 3780. 1.0015
// 32 1600 48 1 1 0 0.934 4455 1.0023
// 48 1600 48 1 1 0 0.945 3253 1.00098

// 24 400 48 1 1 0 0.92 6036. 1.0038
// 48 800 48 1 1 0 0.935 4018 1.002
// 48 800 64 1 1 0 0.942 3091 1.0014
// 48 800 64 0.5 1 0 0.9395 3234 1.00167
// 48 800 64 1 1 1 0.9435 2640 1.00126


// k = 100

// 24 800 48 1 1 0 0.96 2432 1.004
// 48 800 128 1 1 0 0.979 1626 1.001

extern crate hnsw_rs;

use hnsw_rs::prelude::*;


pub fn main() {
    let _res = simple_logger::init();
    let parallel = true;
    //
    let fname = String::from("/home.2/Data/ANN/glove-25-angular.hdf5");
    println!("\n\n test_load_hdf5 {:?}", fname);
    // recall that the data are stored in row order.
    let mut anndata = AnnBenchmarkData::new(fname).unwrap();
    // pre-normalisation to use Dot computations instead of Cosine
    anndata.do_l2_normalization();
    // run bench
    let nb_elem = anndata.train_data.len();
    let max_nb_connection = 48;
    let ef_c = 800;
    println!(" max_nb_conn : {:?}, ef_construction : {:?} ", max_nb_connection, ef_c);
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    println!(" number of elements to insert {:?} , setting max nb layer to {:?} ef_construction {:?}", nb_elem, nb_layer, ef_c);
    let nb_search = anndata.test_data.len();
    println!(" number of searches {:?}", nb_search);
    // Hnsw allocation
    let mut hnsw = Hnsw::<f32, DistDot>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistDot{});
    hnsw.set_extend_candidates(true);
    // insertion, parallel or serial
    let start = ProcessTime::now();
    let now = SystemTime::now();
    let data_for_par_insertion = anndata.train_data.iter().map(|x| (&x.0, x.1)).collect();
    if parallel {
        println!(" \n parallel insertion");
        hnsw.parallel_insert(&data_for_par_insertion);
    } else {
        println!(" \n serial insertion");
        for d in data_for_par_insertion {
            hnsw.insert(d);
        }
    }
    let cpu_time: Duration = start.elapsed();
    //
    println!("\n hnsw data insertion cpu time {:?} system time {:?} ", cpu_time, now.elapsed());
    hnsw.dump_layer_info();
    println!(" hnsw data nb points inserted {:?}", hnsw.get_nb_point());
    //
    // Now the bench with 10 neighbours
    //
    let knbn = 10;
    let ef_search = 48;
    search(&mut hnsw, knbn, ef_search, &anndata);
    // and the bench with 100 neighbours
    let knbn = 100;
    let ef_search = 128;
    search(&mut hnsw, knbn, ef_search, &anndata);
}


pub fn search<Dist>(hnsw: &mut Hnsw<f32, Dist>, knbn: usize, ef_search: usize, anndata: &AnnBenchmarkData)
    where Dist: Distance<f32> + Send + Sync + TypeName {
    println!("\n\n ef_search : {:?} knbn : {:?} ", ef_search, knbn);
    let parallel = true;
    //
    let nb_elem = anndata.train_data.len();
    let nb_search = anndata.test_data.len();
    //
    let mut recalls = Vec::<usize>::with_capacity(nb_elem);
    let mut nb_returned = Vec::<usize>::with_capacity(nb_elem);
    let mut last_distances_ratio = Vec::<f32>::with_capacity(nb_elem);
    let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_elem);
    hnsw.set_searching_mode(true);
    println!("searching with ef : {:?}", ef_search);
    let start = ProcessTime::now();
    let now = SystemTime::now();
    // search, parallel or serial
    if parallel {
        println!(" \n parallel search");
        knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_search);
    } else {
        println!(" \n serial search");
        for i in 0..anndata.test_data.len() {
            let knn_neighbours: Vec<Neighbour> = hnsw.search(&anndata.test_data[i], knbn, ef_search);
            knn_neighbours_for_tests.push(knn_neighbours);
        }
    }
    let cpu_time = start.elapsed();
    let search_cpu_time = cpu_time.as_micros() as f32;
    let search_sys_time = now.elapsed().unwrap().as_micros() as f32;
    println!("total cpu time for search requests {:?} , system time {:?} ", search_cpu_time, now.elapsed());
    // now compute the recall rate
    for i in 0..anndata.test_data.len() {
        let max_dist = anndata.test_distances.row(i)[knbn - 1];
        let knn_neighbours_d: Vec<f32> = knn_neighbours_for_tests[i].iter().map(|p| p.distance).collect();
        nb_returned.push(knn_neighbours_d.len());
        let recall = knn_neighbours_d.iter().filter(|d| *d <= &max_dist).count();
        recalls.push(recall);
        let mut ratio = 0.;
        if knn_neighbours_d.len() >= 1 {
            ratio = knn_neighbours_d[knn_neighbours_d.len() - 1] / max_dist;
        }
        last_distances_ratio.push(ratio);
    }
    let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
    println!("\n mean fraction nb returned by search {:?} ", (nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32));
    println!("\n last distances ratio {:?} ", last_distances_ratio.iter().sum::<f32>() / last_distances_ratio.len() as f32);
    println!("\n recall rate for {:?} is {:?} , nb req /s {:?}", anndata.fname, mean_recall, (nb_search as f32) * 1.0e+6_f32 / search_sys_time);
}