From d968c8f615a87123f5847bc11d15713ee711100e Mon Sep 17 00:00:00 2001
From: Kreeben
Date: Thu, 20 Oct 2022 10:49:04 +0200
Subject: [PATCH] revert back to using non optimized indexing strategy

---
 src/Sir.Cmd/Program.cs                      |  4 +++-
 src/Sir.Cmd/ValidateCommand.cs              |  2 +-
 src/Sir.CommonCrawl/CCHelper.cs             |  2 +-
 src/Sir.CommonCrawl/IndexWetFilesCommand.cs |  2 ++
 src/Sir.Core/Field.cs                       |  4 ++--
 src/Sir.Core/IField.cs                      |  2 +-
 src/Sir.Core/IModel.cs                      |  2 +-
 .../OptimizedPageIndexingStrategy.cs        | 20 +++++++++++++++++++
 src/Sir.Crawl/CrawlUserDirectoryCommand.cs  |  7 ++++---
 .../Controllers/WriteController.cs          |  7 +++++--
 src/Sir.HttpServer/HttpReader.cs            |  2 +-
 src/Sir.HttpServer/HttpWriter.cs            |  5 +++--
 src/Sir.HttpServer/IHttpWriter.cs           |  2 +-
 src/Sir.ImageTests/ImageModelTests.cs       |  4 ++--
 .../OptimizedPageIndexStrategyTests.cs      |  2 +-
 src/Sir.ImageTests/UpdateSessionTests.cs    |  6 +++---
 src/Sir.Mnist/IndexMnistCommand.cs          |  2 +-
 src/Sir.Mnist/ValidateMnistCommand.cs       |  2 +-
 src/Sir.Search/SearchSession.cs             |  5 ++++-
 src/Sir.Search/SessionFactory.cs            | 17 ++++++++--------
 src/Sir.StringTests/BagOfCharsModelTests.cs |  4 ++--
 src/Sir.StringTests/NGramModelTests.cs      |  4 ++--
 .../OptimizedPageIndexStrategyTests.cs      |  2 +-
 src/Sir.StringTests/UpdateSessionTests.cs   |  6 +++---
 src/Sir.Strings/BagOfCharsModel.cs          | 10 ----------
 src/Sir.WebFront/Features/CrawlJob.cs       |  7 +++++--
 src/Sir.WebFront/Features/SaveAsJob.cs      |  8 ++++++--
 src/Sir.WebFront/SearchClient.cs            |  2 +-
 src/Sir.WebFront/WriteClient.cs             |  5 +++--
 src/Sir.Wikipedia/IndexWikipediaCommand.cs  |  2 +-
 30 files changed, 90 insertions(+), 59 deletions(-)

diff --git a/src/Sir.Cmd/Program.cs b/src/Sir.Cmd/Program.cs
index 76974549..e407e499 100644
--- a/src/Sir.Cmd/Program.cs
+++ b/src/Sir.Cmd/Program.cs
@@ -121,6 +121,7 @@ private static void Optimize(IDictionary<string, string> args, ILogger logger)
             var reportFrequency = int.Parse(args["reportFrequency"]);
             var pageSize = int.Parse(args["pageSize"]);
             var fields = new HashSet<string>(args["fields"].Split(','));
+            var model = new BagOfCharsModel();
 
             using (var sessionFactory = new SessionFactory(logger))
             {
@@ -128,7 +129,8 @@ private static void Optimize(IDictionary<string, string> args, ILogger logger)
                     dataDirectory,
                     collection,
                     fields,
-                    new BagOfCharsModel(),
+                    model,
+                    new NonOptimizedPageIndexingStrategy(model),
                     skip,
                     take,
                     reportFrequency,
diff --git a/src/Sir.Cmd/ValidateCommand.cs b/src/Sir.Cmd/ValidateCommand.cs
index 4e13a6ba..c5d9e95c 100644
--- a/src/Sir.Cmd/ValidateCommand.cs
+++ b/src/Sir.Cmd/ValidateCommand.cs
@@ -26,7 +26,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
         {
             using (var validateSession = new ValidateSession<string>(
                 collectionId,
-                new SearchSession(dir, sessionFactory, model, logger),
+                new SearchSession(dir, sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), logger),
                 new QueryParser<string>(dir, sessionFactory, model, logger)))
             {
                 using (var documents = new DocumentStreamSession(dir, sessionFactory))
diff --git a/src/Sir.CommonCrawl/CCHelper.cs b/src/Sir.CommonCrawl/CCHelper.cs
index acb95e7e..23eb6f2a 100644
--- a/src/Sir.CommonCrawl/CCHelper.cs
+++ b/src/Sir.CommonCrawl/CCHelper.cs
@@ -36,7 +36,7 @@ public static void WriteWatSegment(
 
         using (var sessionFactory = new SessionFactory(logger))
         using (var writeSession = new WriteSession(new DocumentWriter(dataDirectory, collectionId, sessionFactory)))
-        using (var indexSession = new InMemoryIndexSession<string>(model, model, sessionFactory, dataDirectory, collectionId))
+        using (var indexSession = new InMemoryIndexSession<string>(model, new NonOptimizedPageIndexingStrategy(model), sessionFactory, dataDirectory, collectionId))
         {
             using (var queue = new ProducerConsumerQueue<Document>(document =>
             {
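The change repeated throughout this patch is easier to read once seen in isolation: wherever a session used to receive the model twice (once as embedder, once as indexing strategy), it now receives the model plus an explicitly constructed strategy. A minimal sketch of the new call-site shape, assuming a BagOfCharsModel pipeline; the data directory, collection name, and null logger are placeholder values:

    var model = new BagOfCharsModel();
    var strategy = new NonOptimizedPageIndexingStrategy(model);
    var dataDirectory = @"c:\data";             // placeholder
    var collectionId = "mycollection".ToHash(); // placeholder

    using (var sessionFactory = new SessionFactory(logger: null))
    using (var writeSession = new WriteSession(new DocumentWriter(dataDirectory, collectionId, sessionFactory)))
    using (var indexSession = new InMemoryIndexSession<string>(model, strategy, sessionFactory, dataDirectory, collectionId))
    {
        // write and index documents exactly as before; only the second
        // InMemoryIndexSession argument (the strategy) is new
    }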
diff --git a/src/Sir.CommonCrawl/IndexWetFilesCommand.cs b/src/Sir.CommonCrawl/IndexWetFilesCommand.cs
index 4d970a6f..8fc93de4 100644
--- a/src/Sir.CommonCrawl/IndexWetFilesCommand.cs
+++ b/src/Sir.CommonCrawl/IndexWetFilesCommand.cs
@@ -15,6 +15,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
         var dataDirectory = args["dataDirectory"];
         var fileName = args["fileName"];
         var model = new BagOfCharsModel();
+        var indexStrategy = new NonOptimizedPageIndexingStrategy(model);
         var collectionId = "cc_wet".ToHash();
         var storeFields = new HashSet<string> { "url" };
         var indexFields = new HashSet<string> { "description" };
@@ -33,6 +34,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
                         kvp.Key,
                         kvp.Value)).ToList())),
                 model,
+                indexStrategy,
                 reportSize: 1000);
     }
 }
diff --git a/src/Sir.Core/Field.cs b/src/Sir.Core/Field.cs
index 58128706..a44166e4 100644
--- a/src/Sir.Core/Field.cs
+++ b/src/Sir.Core/Field.cs
@@ -34,7 +34,7 @@ private IEnumerable<ISerializableVector> GetTokens()
                 yield return node.Vector;
         }
 
-        public void Analyze<T>(IModel<T> model, bool label, IStreamDispatcher streamDispatcher)
+        public void Analyze<T>(IModel<T> model, IIndexReadWriteStrategy indexStrategy, bool label, IStreamDispatcher streamDispatcher)
         {
             var tokens = model.CreateEmbedding((T)Value, label);
 
@@ -44,7 +44,7 @@ public void Analyze<T>(IModel<T> model, bool label, IStreamDispatcher streamDispatcher)
             {
                 foreach (var token in tokens)
                 {
-                    model.Put(Tree, new VectorNode(token, keyId: KeyId), reader);
+                    indexStrategy.Put(Tree, new VectorNode(token, keyId: KeyId), reader);
                 }
 
                 _tokens = GetTokens();
diff --git a/src/Sir.Core/IField.cs b/src/Sir.Core/IField.cs
index f79368b5..1580d694 100644
--- a/src/Sir.Core/IField.cs
+++ b/src/Sir.Core/IField.cs
@@ -10,6 +10,6 @@ public interface IField
         IEnumerable<ISerializableVector> Tokens { get; }
         VectorNode Tree { get; }
         object Value { get; set; }
-        void Analyze<T>(IModel<T> model, bool label, IStreamDispatcher streamDispatcher);
+        void Analyze<T>(IModel<T> model, IIndexReadWriteStrategy indexStrategy, bool label, IStreamDispatcher streamDispatcher);
     }
 }
\ No newline at end of file
diff --git a/src/Sir.Core/IModel.cs b/src/Sir.Core/IModel.cs
index f3b34608..49fa64fb 100644
--- a/src/Sir.Core/IModel.cs
+++ b/src/Sir.Core/IModel.cs
@@ -15,7 +15,7 @@ public interface IModel<T> : IModel
     /// <summary>
     /// Vector space model.
     /// </summary>
-    public interface IModel : IVectorSpaceConfig, IDistanceCalculator, IIndexReadWriteStrategy
+    public interface IModel : IVectorSpaceConfig, IDistanceCalculator
     {
     }
 
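Taken together, the Field, IField, and IModel edits return the model to being only an embedding and distance provider, while page read/write behavior lives behind IIndexReadWriteStrategy. The interface declaration itself is not part of this diff, so the following reconstruction from the signatures used throughout the patch is an assumption:

    // Reconstructed sketch; the actual declaration is not shown in this patch.
    public interface IIndexReadWriteStrategy
    {
        // read path: resolve a vector against a persisted column index
        Hit GetClosestMatchOrNull(ISerializableVector vector, IModel model, IColumnReader reader);

        // write path: merge a node into the in-memory column tree
        void Put(VectorNode column, VectorNode node, IColumnReader reader);
    }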
diff --git a/src/Sir.Core/IndexingStrategies/OptimizedPageIndexingStrategy.cs b/src/Sir.Core/IndexingStrategies/OptimizedPageIndexingStrategy.cs
index b92f9893..c05cdfea 100644
--- a/src/Sir.Core/IndexingStrategies/OptimizedPageIndexingStrategy.cs
+++ b/src/Sir.Core/IndexingStrategies/OptimizedPageIndexingStrategy.cs
@@ -28,4 +28,24 @@ public void Put(VectorNode column, VectorNode node, IColumnReader reader)
             column.AddOrAppend(node, _model);
         }
     }
+
+    public class NonOptimizedPageIndexingStrategy : IIndexReadWriteStrategy
+    {
+        private readonly IModel _model;
+
+        public NonOptimizedPageIndexingStrategy(IModel model)
+        {
+            _model = model;
+        }
+
+        public Hit GetClosestMatchOrNull(ISerializableVector vector, IModel model, IColumnReader reader)
+        {
+            return reader.ClosestMatchOrNullScanningAllPages(vector, model);
+        }
+
+        public void Put(VectorNode column, VectorNode node, IColumnReader reader)
+        {
+            column.AddOrAppend(node, _model);
+        }
+    }
 }
\ No newline at end of file
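Both strategies write identically (column.AddOrAppend), so what the revert actually changes is the read path: the non-optimized strategy resolves matches by scanning every page of a column, as its ClosestMatchOrNullScanningAllPages call spells out, while the optimized variant's read method lies outside this hunk. A sketch of how the two halves are exercised; the helper and its parameters are hypothetical, with names mirroring Field.Analyze and SearchSession.Scan below:

    // Hypothetical helper showing both halves of IIndexReadWriteStrategy in use.
    static void IndexAndProbe(
        IIndexReadWriteStrategy strategy,
        IModel model,
        VectorNode column,
        ISerializableVector token,
        long keyId,
        IColumnReader reader)
    {
        // write path: merge the token into the in-memory column tree
        strategy.Put(column, new VectorNode(token, keyId: keyId), reader);

        // read path: with NonOptimizedPageIndexingStrategy this scans all pages
        var hit = strategy.GetClosestMatchOrNull(token, model, reader);

        if (hit != null)
        {
            // consume the hit the way SearchSession.Scan does below
        }
    }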
diff --git a/src/Sir.Crawl/CrawlUserDirectoryCommand.cs b/src/Sir.Crawl/CrawlUserDirectoryCommand.cs
index c5b5131f..87d7f6d8 100644
--- a/src/Sir.Crawl/CrawlUserDirectoryCommand.cs
+++ b/src/Sir.Crawl/CrawlUserDirectoryCommand.cs
@@ -30,6 +30,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
         var urlCollectionId = "url".ToHash();
         var htmlClient = new HtmlWeb();
         var model = new BagOfCharsModel();
+        var indexStrategy = new NonOptimizedPageIndexingStrategy(model);
 
         htmlClient.UserAgent = "Crawlcrawler (+https://crawlcrawler.com)";
 
@@ -37,7 +38,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
         htmlClient.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36";
 #endif
         using (var database = new SessionFactory(logger))
-        using (var dataSearchSession = new SearchSession(dataDirectory, database, model, logger))
+        using (var dataSearchSession = new SearchSession(dataDirectory, database, model, new NonOptimizedPageIndexingStrategy(model), logger))
         {
             foreach (var userDirectory in Directory.EnumerateDirectories(rootUserDirectory))
             {
@@ -85,7 +86,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
 
                 if (result != null)
                 {
-                    database.StoreDataAndPersistIndex(dataDirectory, collectionId, result.Document, _model);
+                    database.StoreDataAndPersistIndex(dataDirectory, collectionId, result.Document, _model, indexStrategy);
 
                     int crawlCount = 1;
 
@@ -110,7 +111,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
 
                         if (r != null)
                         {
-                            database.StoreDataAndPersistIndex(dataDirectory, collectionId, r.Document, _model);
+                            database.StoreDataAndPersistIndex(dataDirectory, collectionId, r.Document, _model, indexStrategy);
                         }
 
                         crawlCount++;
diff --git a/src/Sir.HttpServer/Controllers/WriteController.cs b/src/Sir.HttpServer/Controllers/WriteController.cs
index d489dc1e..f87ba6df 100644
--- a/src/Sir.HttpServer/Controllers/WriteController.cs
+++ b/src/Sir.HttpServer/Controllers/WriteController.cs
@@ -12,10 +12,12 @@ public class WriteController : Controller
         private readonly IModel<string> _model;
         private readonly ILogger _logger;
         private readonly IConfigurationProvider _config;
+        private readonly IIndexReadWriteStrategy _indexStrategy;
 
         public WriteController(
             IHttpWriter writer,
-            IModel<string> tokenizer,
+            IModel<string> tokenizer,
+            IIndexReadWriteStrategy indexStrategy,
             ILogger logger,
             IConfigurationProvider config)
         {
@@ -23,6 +25,7 @@ public WriteController(
             _model = tokenizer;
             _logger = logger;
             _config = config;
+            _indexStrategy = indexStrategy;
         }
 
         [HttpPost]
@@ -41,7 +44,7 @@ public IActionResult Post(string accessToken)
 
             try
             {
-                _writer.Write(Request, _model);
+                _writer.Write(Request, _model, _indexStrategy);
 
                 return Ok();
             }
diff --git a/src/Sir.HttpServer/HttpReader.cs b/src/Sir.HttpServer/HttpReader.cs
index 3fd40d31..ec152dd1 100644
--- a/src/Sir.HttpServer/HttpReader.cs
+++ b/src/Sir.HttpServer/HttpReader.cs
@@ -58,7 +58,7 @@ public async Task<SearchResult> Read(HttpRequest request, IModel<string> model)
             _logger.LogDebug($"parsed query: {queryLog}");
 #endif
 
-            using (var readSession = new SearchSession(_config.Get("data_dir"), _sessionFactory, model, _logger))
+            using (var readSession = new SearchSession(_config.Get("data_dir"), _sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), _logger))
             {
                 return readSession.Search(query, skip, take);
             }
diff --git a/src/Sir.HttpServer/HttpWriter.cs b/src/Sir.HttpServer/HttpWriter.cs
index 2fc492a8..8a5601b7 100644
--- a/src/Sir.HttpServer/HttpWriter.cs
+++ b/src/Sir.HttpServer/HttpWriter.cs
@@ -20,7 +20,7 @@ public HttpWriter(SessionFactory sessionFactory, IConfigurationProvider config)
             _config = config;
         }
 
-        public void Write(HttpRequest request, IModel<string> model)
+        public void Write(HttpRequest request, IModel<string> model, IIndexReadWriteStrategy indexStrategy)
         {
             var documents = Deserialize<IEnumerable<Document>>(request.Body);
             var collectionId = request.Query["collection"].First().ToHash();
 
@@ -29,7 +29,8 @@ public void Write(HttpRequest request, IModel<string> model)
                 _config.Get("data_dir"),
                 collectionId,
                 documents,
-                model);
+                model,
+                indexStrategy);
         }
 
         private static T Deserialize<T>(Stream stream)
diff --git a/src/Sir.HttpServer/IHttpWriter.cs b/src/Sir.HttpServer/IHttpWriter.cs
index 744584e0..c1493c75 100644
--- a/src/Sir.HttpServer/IHttpWriter.cs
+++ b/src/Sir.HttpServer/IHttpWriter.cs
@@ -7,6 +7,6 @@ namespace Sir.HttpServer
     /// </summary>
     public interface IHttpWriter
     {
-        void Write(HttpRequest request, IModel<string> model);
+        void Write(HttpRequest request, IModel<string> model, IIndexReadWriteStrategy indexStrategy);
     }
 }
diff --git a/src/Sir.ImageTests/ImageModelTests.cs b/src/Sir.ImageTests/ImageModelTests.cs
index b4ab6a7f..ae4206e3 100644
--- a/src/Sir.ImageTests/ImageModelTests.cs
+++ b/src/Sir.ImageTests/ImageModelTests.cs
@@ -28,7 +28,7 @@ public void Can_traverse_index_in_memory()
 
             using (var reader = _sessionFactory.CreateColumnReader("", 0, 0))
             {
-                var index = model.CreateTree(model, reader, _data);
+                var index = model.CreateTree(new NonOptimizedPageIndexingStrategy(model), reader, _data);
 
                 Print(index);
 
@@ -80,7 +80,7 @@ public void Can_traverse_streamed()
 
             using (var reader = _sessionFactory.CreateColumnReader("", 0, 0))
             {
-                var index = model.CreateTree(model, reader, _data);
+                var index = model.CreateTree(new NonOptimizedPageIndexingStrategy(model), reader, _data);
 
                 using (var indexStream = new MemoryStream())
                 using (var vectorStream = new MemoryStream())
diff --git a/src/Sir.ImageTests/OptimizedPageIndexStrategyTests.cs b/src/Sir.ImageTests/OptimizedPageIndexStrategyTests.cs
index c1d7a4a5..d1ae0ba2 100644
--- a/src/Sir.ImageTests/OptimizedPageIndexStrategyTests.cs
+++ b/src/Sir.ImageTests/OptimizedPageIndexStrategyTests.cs
@@ -26,7 +26,7 @@ public void Can_traverse_index_in_memory()
 
             using (var reader = _sessionFactory.CreateColumnReader("", 0, 0))
             {
-                var index = model.CreateTree(model, reader, _data);
+                var index = model.CreateTree(new OptimizedPageIndexingStrategy(model), reader, _data);
 
                 Debug.WriteLine(PathFinder.Visualize(index));
 
diff --git a/src/Sir.ImageTests/UpdateSessionTests.cs b/src/Sir.ImageTests/UpdateSessionTests.cs
index c3abb653..05416175 100644
--- a/src/Sir.ImageTests/UpdateSessionTests.cs
+++ b/src/Sir.ImageTests/UpdateSessionTests.cs
@@ -42,7 +42,7 @@ public void Can_update_image_field()
             {
                 var d = data[i];
 
-                using (var indexSession = new InMemoryIndexSession<IImage>(model, model, _sessionFactory, _directory, collectionId))
+                using (var indexSession = new InMemoryIndexSession<IImage>(model, new NonOptimizedPageIndexingStrategy(model), _sessionFactory, _directory, collectionId))
                 {
                     var doc = new Document(new Field[] { new Field(fieldName, d) });
 
@@ -55,7 +55,7 @@ public void Can_update_image_field()
 
             var queryParser = new QueryParser<IImage>(_directory, _sessionFactory, model);
 
-            using (var searchSession = new SearchSession(_directory, _sessionFactory, model, _loggerFactory.CreateLogger<SearchSession>()))
+            using (var searchSession = new SearchSession(_directory, _sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), _loggerFactory.CreateLogger<SearchSession>()))
             {
                 Assert.DoesNotThrow(() =>
                 {
@@ -85,7 +85,7 @@ public void Can_update_image_field()
                 updateSession.Update(documentIdToUpdate, 0, updatedWord);
             }
 
-            using (var searchSession = new SearchSession(_directory, _sessionFactory, model, _loggerFactory.CreateLogger<SearchSession>()))
+            using (var searchSession = new SearchSession(_directory, _sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), _loggerFactory.CreateLogger<SearchSession>()))
             {
                 Assert.DoesNotThrow(() =>
                 {
diff --git a/src/Sir.Mnist/IndexMnistCommand.cs b/src/Sir.Mnist/IndexMnistCommand.cs
index 39d25c03..8557c0fb 100644
--- a/src/Sir.Mnist/IndexMnistCommand.cs
+++ b/src/Sir.Mnist/IndexMnistCommand.cs
@@ -33,7 +33,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
             sessionFactory.Truncate(dataDirectory, collectionId);
 
             using (var writeSession = new WriteSession(new DocumentWriter(dataDirectory, collectionId, sessionFactory)))
-            using (var indexSession = new InMemoryIndexSession<IImage>(model, model, sessionFactory, dataDirectory, collectionId))
+            using (var indexSession = new InMemoryIndexSession<IImage>(model, new NonOptimizedPageIndexingStrategy(model), sessionFactory, dataDirectory, collectionId))
             {
                 var imageIndexId = writeSession.EnsureKeyExists("image");
 
diff --git a/src/Sir.Mnist/ValidateMnistCommand.cs b/src/Sir.Mnist/ValidateMnistCommand.cs
index cac3bf8c..d90b3fc6 100644
--- a/src/Sir.Mnist/ValidateMnistCommand.cs
+++ b/src/Sir.Mnist/ValidateMnistCommand.cs
@@ -25,7 +25,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
             var model = new LinearClassifierImageModel();
 
             using (var sessionFactory = new SessionFactory(logger: logger))
-            using (var querySession = new SearchSession(dataDirectory, sessionFactory, model, logger))
+            using (var querySession = new SearchSession(dataDirectory, sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), logger))
             {
                 var queryParser = new QueryParser<IImage>(dataDirectory, sessionFactory, model, logger);
 
diff --git a/src/Sir.Search/SearchSession.cs b/src/Sir.Search/SearchSession.cs
index 336eea29..f480171b 100644
--- a/src/Sir.Search/SearchSession.cs
+++ b/src/Sir.Search/SearchSession.cs
@@ -13,6 +13,7 @@ public class SearchSession : DocumentStreamSession, IDisposable, ISearchSession
     {
         private readonly IStreamDispatcher _sessionFactory;
         private readonly IModel _model;
+        private readonly IIndexReadWriteStrategy _indexStrategy;
         private readonly PostingsResolver _postingsResolver;
         private readonly Scorer _scorer;
         private readonly ILogger _logger;
@@ -21,12 +22,14 @@ public SearchSession(
             string directory,
             IStreamDispatcher sessionFactory,
             IModel model,
+            IIndexReadWriteStrategy indexStrategy,
             ILogger logger = null,
             PostingsResolver postingsResolver = null,
             Scorer scorer = null) : base(directory, sessionFactory)
         {
             _sessionFactory = sessionFactory;
             _model = model;
+            _indexStrategy = indexStrategy;
             _postingsResolver = postingsResolver ?? new PostingsResolver();
             _scorer = scorer ?? new Scorer();
             _logger = logger;
@@ -120,7 +123,7 @@ private void Scan(IQuery query)
 
                 if (reader != null)
                 {
-                    var hit =_model.GetClosestMatchOrNull(term.Vector, _model, reader);
+                    var hit =_indexStrategy.GetClosestMatchOrNull(term.Vector, _model, reader);
 
                     if (hit != null)
                     {
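On the read side, SearchSession now defers to the injected strategy rather than the model when resolving each query term (the Scan hunk above), so every search call site gains the same extra argument. A minimal sketch; directory, logger, query, skip, and take are placeholders, and query construction via QueryParser is unchanged and elided:

    var model = new BagOfCharsModel();

    using (var sessionFactory = new SessionFactory(logger))
    using (var searchSession = new SearchSession(
        directory,
        sessionFactory,
        model,
        new NonOptimizedPageIndexingStrategy(model),
        logger))
    {
        // query comes from QueryParser exactly as in ValidateCommand above
        var result = searchSession.Search(query, skip, take);

        foreach (var document in result.Documents)
        {
            // consume documents and scores as before
        }
    }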
diff --git a/src/Sir.Search/SessionFactory.cs b/src/Sir.Search/SessionFactory.cs
index 5f1b07b2..509e0280 100644
--- a/src/Sir.Search/SessionFactory.cs
+++ b/src/Sir.Search/SessionFactory.cs
@@ -171,6 +171,7 @@ public void Optimize(
             string collection,
             HashSet<string> selectFields,
             IModel<string> model,
+            IIndexReadWriteStrategy indexStrategy,
             int skipDocuments = 0,
             int takeDocuments = 0,
             int reportFrequency = 1000,
@@ -206,7 +207,7 @@ public void Optimize(
 
                 var count = 0;
 
-                using (var indexSession = new InMemoryIndexSession<string>(model, model, this, directory, collectionId))
+                using (var indexSession = new InMemoryIndexSession<string>(model, indexStrategy, this, directory, collectionId))
                 {
                     foreach (var document in payload)
                     {
@@ -272,7 +273,7 @@ public void StoreDataAndBuildInMemoryIndex(
             }
         }
 
-        public void BuildIndex<T>(ulong collectionId, IEnumerable<Document> job, IModel<T> model, InMemoryIndexSession<T> indexSession, bool label = true)
+        public void BuildIndex<T>(ulong collectionId, IEnumerable<Document> job, IModel<T> model, IIndexReadWriteStrategy indexStrategy, InMemoryIndexSession<T> indexSession, bool label = true)
         {
             LogDebug($"building index for collection {collectionId}");
 
@@ -295,7 +296,7 @@ public void BuildIndex<T>(ulong collectionId, IEnumerable<Document> job, IModel<
                 {
                     if (field.Value != null)
                     {
-                        field.Analyze(model, label, this);
+                        field.Analyze(model, indexStrategy, label, this);
                     }
                 }
 
@@ -306,10 +307,10 @@ public void BuildIndex<T>(ulong collectionId, IEnumerable<Document> job, IModel<
 
             LogDebug($"built index (collection {collectionId}) in {time.Elapsed}");
         }
 
-        public void StoreDataAndPersistIndex(string directory, ulong collectionId, IEnumerable<Document> job, IModel<string> model, int reportSize = 1000)
+        public void StoreDataAndPersistIndex(string directory, ulong collectionId, IEnumerable<Document> job, IModel<string> model, IIndexReadWriteStrategy indexStrategy, int reportSize = 1000)
         {
             using (var writeSession = new WriteSession(new DocumentWriter(directory, collectionId, this)))
-            using (var indexSession = new InMemoryIndexSession<string>(model, model, this, directory, collectionId))
+            using (var indexSession = new InMemoryIndexSession<string>(model, indexStrategy, this, directory, collectionId))
             {
                 StoreDataAndBuildInMemoryIndex(job, writeSession, indexSession, reportSize);
 
@@ -320,10 +321,10 @@ public void StoreDataAndPersistIndex(string directory, ulong collectionId, IE
             }
         }
 
-        public void StoreDataAndPersistIndex(string directory, ulong collectionId, Document document, IModel<string> model)
+        public void StoreDataAndPersistIndex(string directory, ulong collectionId, Document document, IModel<string> model, IIndexReadWriteStrategy indexStrategy)
        {
             using (var writeSession = new WriteSession(new DocumentWriter(directory, collectionId, this)))
-            using (var indexSession = new InMemoryIndexSession<string>(model, model, this, directory, collectionId))
+            using (var indexSession = new InMemoryIndexSession<string>(model, indexStrategy, this, directory, collectionId))
             {
                 StoreDataAndBuildInMemoryIndex(document, writeSession, indexSession);
 
@@ -366,7 +367,7 @@ public bool DocumentExists<T>(string directory, string collection, string key, T
 
             if (query != null)
             {
-                using (var searchSession = new SearchSession(directory, this, model, _logger))
+                using (var searchSession = new SearchSession(directory, this, model, new NonOptimizedPageIndexingStrategy(model), _logger))
                 {
                     var document = searchSession.SearchScalar(query);
 
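Because SessionFactory no longer falls back to the model's own Put, every persisting overload takes the strategy explicitly. A sketch of the single-document path using only calls visible in this patch; the data directory, collection name, field values, and null logger are placeholders:

    var model = new BagOfCharsModel();
    var strategy = new NonOptimizedPageIndexingStrategy(model);
    var dataDirectory = @"c:\data";             // placeholder
    var collectionId = "mycollection".ToHash(); // placeholder
    var document = new Document(new Field[] { new Field("title", "Rambo") });

    using (var sessionFactory = new SessionFactory(logger: null))
    {
        // writes the document, builds the column index in memory, then persists it
        sessionFactory.StoreDataAndPersistIndex(dataDirectory, collectionId, document, model, strategy);
    }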
diff --git a/src/Sir.StringTests/BagOfCharsModelTests.cs b/src/Sir.StringTests/BagOfCharsModelTests.cs
index dcb8d33c..95f7680c 100644
--- a/src/Sir.StringTests/BagOfCharsModelTests.cs
+++ b/src/Sir.StringTests/BagOfCharsModelTests.cs
@@ -23,7 +23,7 @@ public void Can_traverse_index_in_memory()
 
             using (var reader = _sessionFactory.CreateColumnReader("", 0, 0))
             {
-                var index = model.CreateTree(model, reader, _data);
+                var index = model.CreateTree(new NonOptimizedPageIndexingStrategy(model), reader, _data);
 
                 Debug.WriteLine(PathFinder.Visualize(index));
 
@@ -59,7 +59,7 @@ public void Can_traverse_streamed()
 
             using (var reader = _sessionFactory.CreateColumnReader("", 0, 0))
             {
-                var index = model.CreateTree(model, reader, _data);
+                var index = model.CreateTree(new NonOptimizedPageIndexingStrategy(model), reader, _data);
 
                 using (var indexStream = new MemoryStream())
                 using (var vectorStream = new MemoryStream())
diff --git a/src/Sir.StringTests/NGramModelTests.cs b/src/Sir.StringTests/NGramModelTests.cs
index 846f7f64..be12c98a 100644
--- a/src/Sir.StringTests/NGramModelTests.cs
+++ b/src/Sir.StringTests/NGramModelTests.cs
@@ -23,7 +23,7 @@ public void Can_traverse_index_in_memory()
 
             using (var reader = _sessionFactory.CreateColumnReader("", 0, 0))
             {
-                var index = model.CreateTree(model, reader, _data);
+                var index = model.CreateTree(new NonOptimizedPageIndexingStrategy(model), reader, _data);
 
                 Debug.WriteLine(PathFinder.Visualize(index));
 
@@ -59,7 +59,7 @@ public void Can_traverse_streamed()
 
             using (var reader = _sessionFactory.CreateColumnReader("", 0, 0))
             {
-                var index = model.CreateTree(model, reader, _data);
+                var index = model.CreateTree(new NonOptimizedPageIndexingStrategy(model), reader, _data);
 
                 using (var indexStream = new MemoryStream())
                 using (var vectorStream = new MemoryStream())
diff --git a/src/Sir.StringTests/OptimizedPageIndexStrategyTests.cs b/src/Sir.StringTests/OptimizedPageIndexStrategyTests.cs
index 02260135..fe5c9946 100644
--- a/src/Sir.StringTests/OptimizedPageIndexStrategyTests.cs
+++ b/src/Sir.StringTests/OptimizedPageIndexStrategyTests.cs
@@ -23,7 +23,7 @@ public void Can_traverse_index_in_memory()
 
             using (var reader = _sessionFactory.CreateColumnReader("", 0, 0))
             {
-                var index = model.CreateTree(model, reader, _data);
+                var index = model.CreateTree(new NonOptimizedPageIndexingStrategy(model), reader, _data);
 
                 Debug.WriteLine(PathFinder.Visualize(index));
 
diff --git a/src/Sir.StringTests/UpdateSessionTests.cs b/src/Sir.StringTests/UpdateSessionTests.cs
index 15ea57c7..cd06b08e 100644
--- a/src/Sir.StringTests/UpdateSessionTests.cs
+++ b/src/Sir.StringTests/UpdateSessionTests.cs
@@ -38,7 +38,7 @@ public void Can_update_string_field()
             {
                 var data = _data[i];
 
-                using (var indexSession = new InMemoryIndexSession<string>(model, model, _sessionFactory, _directory, collectionId))
+                using (var indexSession = new InMemoryIndexSession<string>(model, new NonOptimizedPageIndexingStrategy(model), _sessionFactory, _directory, collectionId))
                 {
                     var doc = new Document(new Field[] { new Field(fieldName, data) });
 
@@ -51,7 +51,7 @@ public void Can_update_string_field()
 
             var queryParser = new QueryParser<string>(_directory, _sessionFactory, model);
 
-            using (var searchSession = new SearchSession(_directory, _sessionFactory, model, _loggerFactory.CreateLogger<SearchSession>()))
+            using (var searchSession = new SearchSession(_directory, _sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), _loggerFactory.CreateLogger<SearchSession>()))
            {
                 Assert.DoesNotThrow(() =>
                 {
@@ -81,7 +81,7 @@ public void Can_update_string_field()
                 updateSession.Update(documentIdToUpdate, 0, updatedWord);
             }
 
-            using (var searchSession = new SearchSession(_directory, _sessionFactory, model, _loggerFactory.CreateLogger<SearchSession>()))
+            using (var searchSession = new SearchSession(_directory, _sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), _loggerFactory.CreateLogger<SearchSession>()))
             {
                 Assert.DoesNotThrow(() =>
                 {
diff --git a/src/Sir.Strings/BagOfCharsModel.cs b/src/Sir.Strings/BagOfCharsModel.cs
index d840d253..71640ece 100644
--- a/src/Sir.Strings/BagOfCharsModel.cs
+++ b/src/Sir.Strings/BagOfCharsModel.cs
@@ -9,11 +9,6 @@ public class BagOfCharsModel : DistanceCalculator, IModel<string>
         public double FoldAngle => 0.55d;
         public override int NumOfDimensions => System.Text.Unicode.UnicodeRanges.All.Length;
 
-        public void Put(VectorNode column, VectorNode node, IColumnReader reader)
-        {
-            column.AddOrAppend(node, this);
-        }
-
         public IEnumerable<ISerializableVector> CreateEmbedding(string data, bool label)
         {
             var source = data.ToCharArray();
@@ -64,11 +59,6 @@ public IEnumerable<ISerializableVector> CreateEmbedding(string data, bool label)
                 }
             }
         }
-
-        public Hit GetClosestMatchOrNull(ISerializableVector vector, IModel model, IColumnReader reader)
-        {
-            return reader.ClosestMatchOrNullScanningAllPages(vector, model);
-        }
     }
 
     public static class TokenizeOperations
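With Put and GetClosestMatchOrNull deleted (their bodies moved verbatim into NonOptimizedPageIndexingStrategy earlier in this patch), the model's remaining surface is embedding plus vector-space configuration, which is what the IModel interface change above formalizes. A sketch of what callers can still ask of it; printing the vectors is illustrative only:

    var model = new BagOfCharsModel();

    // embedding: one sparse vector per token of the input
    foreach (var vector in model.CreateEmbedding("rambo", label: true))
    {
        Console.WriteLine(vector);
    }

    // vector-space configuration consumed by strategies and traversal
    double foldAngle = model.FoldAngle;      // 0.55d, per the hunk above
    int dimensions = model.NumOfDimensions;  // the full Unicode range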
diff --git a/src/Sir.WebFront/Features/CrawlJob.cs b/src/Sir.WebFront/Features/CrawlJob.cs
index d5bdf383..1ce524fd 100644
--- a/src/Sir.WebFront/Features/CrawlJob.cs
+++ b/src/Sir.WebFront/Features/CrawlJob.cs
@@ -20,12 +20,14 @@ public class CrawlJob : AsyncJob
         private readonly IModel<string> _model;
         private readonly int _skip;
         private readonly int _take;
+        private readonly IIndexReadWriteStrategy _indexStrategy;
 
         public CrawlJob(
             string directory,
             SessionFactory sessionFactory,
             QueryParser<string> queryParser,
             IModel<string> model,
+            IIndexReadWriteStrategy indexStrategy,
             ILogger logger,
             string id,
             string[] collection,
@@ -45,6 +47,7 @@ public CrawlJob(
             _model = model;
             _skip = skip;
             _take = take;
+            _indexStrategy = indexStrategy;
 
             Status["download"] = 0;
             Status["index"] = 0;
@@ -78,7 +81,7 @@ private void DownloadAndIndexWetFile()
                 or: Or,
                 label: false);
 
-            using (var readSession = new SearchSession(_directory, _sessionFactory, _model, _logger))
+            using (var readSession = new SearchSession(_directory, _sessionFactory, _model, new NonOptimizedPageIndexingStrategy(_model), _logger))
             {
                 var originalResult = readSession.Search(originalQuery, _skip, _take)
                     .Documents
@@ -203,7 +206,7 @@ private void DownloadAndIndexWetFile()
                 {
                     var time = Stopwatch.StartNew();
 
-                    _sessionFactory.StoreDataAndPersistIndex(_directory, wetCollectionId, writePayload, _model, reportSize: 1000);
+                    _sessionFactory.StoreDataAndPersistIndex(_directory, wetCollectionId, writePayload, _model, _indexStrategy, reportSize: 1000);
 
                     Status["index"] = 100;
 
diff --git a/src/Sir.WebFront/Features/SaveAsJob.cs b/src/Sir.WebFront/Features/SaveAsJob.cs
index 91e0085a..4408fca2 100644
--- a/src/Sir.WebFront/Features/SaveAsJob.cs
+++ b/src/Sir.WebFront/Features/SaveAsJob.cs
@@ -18,12 +18,14 @@ public class SaveAsJob : BaseJob
         private readonly int _take;
         private readonly string[] _select;
         private readonly bool _truncate;
+        private readonly IIndexReadWriteStrategy _indexStrategy;
 
         public SaveAsJob(
             string directory,
             SessionFactory sessionFactory,
             QueryParser<string> queryParser,
             IModel<string> model,
+            IIndexReadWriteStrategy indexStrategy,
             ILogger logger,
             string target,
             string[] collections,
@@ -48,6 +50,7 @@ public SaveAsJob(
             _take = take;
             _select = select;
             _truncate = truncate;
+            _indexStrategy = indexStrategy;
         }
 
         public override void Execute()
@@ -66,7 +69,7 @@ public override void Execute()
             var targetCollectionId = _target.ToHash();
             IEnumerable<Document> documents;
 
-            using (var readSession = new SearchSession(_directory, _sessionFactory, _model, _logger))
+            using (var readSession = new SearchSession(_directory, _sessionFactory, _model, new NonOptimizedPageIndexingStrategy(_model), _logger))
             {
                 documents = readSession.Search(query, _skip, _take).Documents;
             }
@@ -88,7 +91,8 @@ public override void Execute()
                     _directory,
                     targetCollectionId,
                     documents,
-                    _model);
+                    _model,
+                    _indexStrategy);
             }
             catch (Exception ex)
             {
diff --git a/src/Sir.WebFront/SearchClient.cs b/src/Sir.WebFront/SearchClient.cs
index 348c0f83..e367e9e1 100644
--- a/src/Sir.WebFront/SearchClient.cs
+++ b/src/Sir.WebFront/SearchClient.cs
@@ -76,7 +76,7 @@ public async Task<SearchResult> Read(HttpRequest request, IModel<string> model)
             _logger.LogDebug($"parsed query: {queryLog}");
 #endif
 
-            using (var readSession = new SearchSession(_config.Get("data_dir"), _sessionFactory, model, _logger))
+            using (var readSession = new SearchSession(_config.Get("data_dir"), _sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), _logger))
             {
                 return readSession.Search(query, skip, take);
             }
diff --git a/src/Sir.WebFront/WriteClient.cs b/src/Sir.WebFront/WriteClient.cs
index 64ceb382..0058d88a 100644
--- a/src/Sir.WebFront/WriteClient.cs
+++ b/src/Sir.WebFront/WriteClient.cs
@@ -20,7 +20,7 @@ public WriteClient(SessionFactory sessionFactory, IConfigurationProvider config)
             _config = config;
         }
 
-        public void Write(HttpRequest request, IModel<string> model)
+        public void Write(HttpRequest request, IModel<string> model, IIndexReadWriteStrategy indexStrategy)
         {
             var documents = Deserialize<IEnumerable<Document>>(request.Body);
             var collectionId = request.Query["collection"].First().ToHash();
 
@@ -29,7 +29,8 @@ public void Write(HttpRequest request, IModel<string> model)
                 _config.Get("data_dir"),
                 collectionId,
                 documents,
-                model);
+                model,
+                indexStrategy);
         }
 
         private static T Deserialize<T>(Stream stream)
diff --git a/src/Sir.Wikipedia/IndexWikipediaCommand.cs b/src/Sir.Wikipedia/IndexWikipediaCommand.cs
index d64cf5d2..227039d8 100644
--- a/src/Sir.Wikipedia/IndexWikipediaCommand.cs
+++ b/src/Sir.Wikipedia/IndexWikipediaCommand.cs
@@ -47,7 +47,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
             foreach (var page in payload.Batch(pageSize))
             {
                 using (var indexStream = new IndexWriter(dataDirectory, collectionId, sessionFactory, logger: logger))
-                using (var indexSession = new InMemoryIndexSession<string>(model, new OptimizedPageIndexingStrategy(model), sessionFactory, dataDirectory, collectionId))
+                using (var indexSession = new InMemoryIndexSession<string>(model, new NonOptimizedPageIndexingStrategy(model), sessionFactory, dataDirectory, collectionId))
                 {
                     foreach (var document in page)
                     {
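IndexWikipediaCommand is the one call site that had opted into the optimized strategy explicitly, so here the revert is a strategy swap rather than a new parameter. Since both implementations sit behind IIndexReadWriteStrategy, switching back later stays equally local; a sketch with a hypothetical flag, the other names taken from the hunk above:

    IIndexReadWriteStrategy strategy;

    if (useOptimizedPaging) // hypothetical flag
        strategy = new OptimizedPageIndexingStrategy(model);    // pre-revert behavior
    else
        strategy = new NonOptimizedPageIndexingStrategy(model); // this patch

    using (var indexSession = new InMemoryIndexSession<string>(model, strategy, sessionFactory, dataDirectory, collectionId))
    {
        // index a batch of pages, then flush via IndexWriter as above
    }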