From c27799c4fb8feab08c0f2462bdef37a617c1bba9 Mon Sep 17 00:00:00 2001 From: Harrison Burt <57491488+ChillFish8@users.noreply.github.com> Date: Sat, 18 Dec 2021 15:21:50 +0000 Subject: [PATCH] [ Performance ] update symspell to use custom fork (#48) * update to forked symspell * reformat code * resolve clippy lints * reformat Co-authored-by: Harrison Burt --- lnx-server/src/main.rs | 10 +++--- search-engine/search-index/Cargo.toml | 2 +- search-engine/search-index/src/corrections.rs | 34 ++++++------------- search-engine/search-index/src/helpers.rs | 8 ++--- search-engine/search-index/src/query.rs | 3 +- search-engine/search-index/src/stop_words.rs | 13 +++---- search-engine/search-index/src/storage.rs | 9 +++-- search-engine/search-index/src/structures.rs | 10 +++--- 8 files changed, 39 insertions(+), 50 deletions(-) diff --git a/lnx-server/src/main.rs b/lnx-server/src/main.rs index a88d0912..27bdb94c 100644 --- a/lnx-server/src/main.rs +++ b/lnx-server/src/main.rs @@ -189,13 +189,13 @@ async fn create_state(settings: &Settings) -> Result { let storage = StorageBackend::connect(Some(STORAGE_PATH.to_string()))?; let engine = { info!("loading existing indexes..."); - let existing_indexes: Vec; - if let Some(buff) = storage.load_structure(INDEX_KEYSPACE)? { + let raw_structure = storage.load_structure(INDEX_KEYSPACE)?; + let existing_indexes: Vec = if let Some(buff) = raw_structure { let buffer: Vec = bincode::deserialize(&buff)?; - existing_indexes = serde_json::from_slice(&buffer)?; + serde_json::from_slice(&buffer)? } else { - existing_indexes = vec![]; - } + vec![] + }; info!( " {} existing indexes discovered, recreating state...", diff --git a/search-engine/search-index/Cargo.toml b/search-engine/search-index/Cargo.toml index c24354fc..2086e995 100644 --- a/search-engine/search-index/Cargo.toml +++ b/search-engine/search-index/Cargo.toml @@ -9,7 +9,7 @@ edition = "2018" [dependencies] serde = { version = "1", features = ["derive"] } hashbrown = { version = "0.11", features = ["serde"] } -symspell = { git = "https://github.com/ChillFish8/symspell", branch = "0.6.0" } +symspell = { git = "https://github.com/lnx-search/symspell", branch = "master" } chrono = { version = "0.4", features = ["serde"] } tokio = { version = "1.12", features = ["sync", "fs", "rt"] } compress = { version = "0.2.1", default-features=false, features = ["lz4"] } diff --git a/search-engine/search-index/src/corrections.rs b/search-engine/search-index/src/corrections.rs index 13291019..9de0c308 100644 --- a/search-engine/search-index/src/corrections.rs +++ b/search-engine/search-index/src/corrections.rs @@ -2,7 +2,7 @@ use std::fmt::{Debug, Formatter}; use std::sync::Arc; use arc_swap::ArcSwap; -use symspell::{SymSpell, UnicodeiStringStrategy}; +use symspell::{AsciiStringStrategy, SymSpell}; use crate::helpers::FrequencyCounter; @@ -10,7 +10,7 @@ pub(crate) type SymSpellCorrectionManager = Arc; /// The manager around the sym spell fuzzy searching system. pub(crate) struct SymSpellManager { - sym: Arc>>, + sym: Arc>>, } impl SymSpellManager { @@ -24,25 +24,7 @@ impl SymSpellManager { /// /// If the index does not have a set of frequencies this returns the original string. pub(crate) fn correct(&self, sentence: &str) -> String { - let mut results = { self.sym.load().lookup_compound(sentence, 1) }; - - if results.is_empty() { - sentence.to_string() - } else { - let v = results.remove(0); - v.term - } - } - - /// Gets all predicted corrections for a given sentence. - pub(crate) fn get_corrections(&self, sentence: &str) -> Vec { - let mut results = { self.sym.load().lookup_compound(sentence, 1) }; - - if results.is_empty() { - vec![sentence.to_string()] - } else { - results.drain(..).map(|s| s.term).collect() - } + self.sym.load().lookup_compound(sentence, 2) } /// Sets a custom symspell handler for the given index. @@ -50,8 +32,14 @@ impl SymSpellManager { /// This means when something is next set to be corrected for the index, the /// custom frequencies will be used instead of the default. pub(crate) fn adjust_index_frequencies(&self, frequencies: &impl FrequencyCounter) { - let mut symspell: SymSpell = SymSpell::default(); - symspell.load_dictionary_from_map(frequencies.counts().clone()); + let mut symspell: SymSpell = SymSpell::default(); + symspell.using_dictionary_frequencies( + frequencies + .counts() + .into_iter() + .map(|(k, v)| (k.clone(), *v as i64)) + .collect(), + ); self.sym.store(Arc::from(symspell)) } diff --git a/search-engine/search-index/src/helpers.rs b/search-engine/search-index/src/helpers.rs index 77fd342f..b26bc4e0 100644 --- a/search-engine/search-index/src/helpers.rs +++ b/search-engine/search-index/src/helpers.rs @@ -136,11 +136,11 @@ impl PersistentFrequencySet { fn load_frequencies_from_store(&mut self) -> Result<()> { info!("[ FREQUENCY-COUNTER ] loading frequencies from persistent backend."); - let frequencies: HashMap; - if let Some(buff) = self.conn.load_structure(Self::KEYSPACE)? { - frequencies = deserialize(&buff)?; + let raw_structure = self.conn.load_structure(Self::KEYSPACE)?; + let frequencies: HashMap = if let Some(buff) = raw_structure { + deserialize(&buff)? } else { - frequencies = HashMap::new(); + HashMap::new() }; for (word, count) in frequencies { diff --git a/search-engine/search-index/src/query.rs b/search-engine/search-index/src/query.rs index 3d8f4486..bd33b368 100644 --- a/search-engine/search-index/src/query.rs +++ b/search-engine/search-index/src/query.rs @@ -296,7 +296,8 @@ impl QueryBuilder { /// Gets a list of suggested corrections based off of the index corpus. pub(crate) fn get_corrections(&self, query: &str) -> Vec { - self.corrections.get_corrections(query) + // TODO: reflect single output changes + vec![self.corrections.correct(query)] } /// Gets the unique document id field. diff --git a/search-engine/search-index/src/stop_words.rs b/search-engine/search-index/src/stop_words.rs index fda5c691..5e1529e2 100644 --- a/search-engine/search-index/src/stop_words.rs +++ b/search-engine/search-index/src/stop_words.rs @@ -99,7 +99,7 @@ impl StopWordManager { /// Removes a set of stop words from the index's specific set if it exists. pub fn remove_stop_words(&self, mut words: Vec) { - words = words.drain(..).map(|v| v.to_lowercase()).collect(); + words = words.into_iter().map(|v| v.to_lowercase()).collect(); let new_words: Vec = { let guard = self.index_stop_words.load(); @@ -140,12 +140,13 @@ impl PersistentStopWordManager { /// Creates a new `PersistentStopWordManager`. pub(crate) fn new(conn: StorageBackend, manager: StopWordManager) -> Result { debug!("[ STOP-WORDS ] loading stop words from persistent store"); - let words: Vec; - if let Some(buff) = conn.load_structure(Self::KEYSPACE)? { - words = deserialize(&buff)?; + + let raw_structure = conn.load_structure(Self::KEYSPACE)?; + let words: Vec = if let Some(buff) = raw_structure { + deserialize(&buff)? } else { - words = vec![]; - } + vec![] + }; let count = words.len(); manager.add_stop_words(words); diff --git a/search-engine/search-index/src/storage.rs b/search-engine/search-index/src/storage.rs index a8b5c965..a29b0c74 100644 --- a/search-engine/search-index/src/storage.rs +++ b/search-engine/search-index/src/storage.rs @@ -19,13 +19,12 @@ pub struct StorageBackend { impl StorageBackend { /// Connects to the sqlite DB. pub fn connect(fp: Option) -> Result { - let conn: Arc; - if let Some(ref fp) = fp { + let conn: Arc = if let Some(ref fp) = fp { std::fs::create_dir_all(fp)?; - conn = Arc::new(MmapDirectory::open(fp)?) + Arc::new(MmapDirectory::open(fp)?) } else { - conn = Arc::new(RamDirectory::create()); - } + Arc::new(RamDirectory::create()) + }; Ok(Self { fp, conn }) } diff --git a/search-engine/search-index/src/structures.rs b/search-engine/search-index/src/structures.rs index 0b797a04..efcffb52 100644 --- a/search-engine/search-index/src/structures.rs +++ b/search-engine/search-index/src/structures.rs @@ -254,12 +254,12 @@ impl IndexDeclaration { let corrections = Arc::new(SymSpellManager::new()); - let fp; - if let StorageType::FileSystem = self.storage_type { - fp = Some(format!("{}/{}", INDEX_METADATA_PATH, &self.name)) + let fp = if let StorageType::FileSystem = self.storage_type { + Some(format!("{}/{}", INDEX_METADATA_PATH, &self.name)) } else { - fp = None; - } + None + }; + let storage = StorageBackend::connect(fp)?; Ok(IndexContext {