diff --git a/index.bat b/index.bat index 07d4531c..f5bf71b9 100644 --- a/index.bat +++ b/index.bat @@ -1 +1 @@ -sir.bat indexwikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --collection wikipedia --skip 0 --take 1000000 --pageSize 10000 --sampleSize 1000 %* \ No newline at end of file +sir.bat indexwikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --collection wikipedia --skip 0 --take 1000000 --pageSize 1000 --sampleSize 1000 %* \ No newline at end of file diff --git a/src/Sir.Cmd/AnalyzeDocumentCommand.cs b/src/Sir.Cmd/AnalyzeDocumentCommand.cs index 29caa739..d3445c1b 100644 --- a/src/Sir.Cmd/AnalyzeDocumentCommand.cs +++ b/src/Sir.Cmd/AnalyzeDocumentCommand.cs @@ -18,7 +18,6 @@ public void Run(IDictionary args, ILogger logger) var select = new HashSet(args["select"].Split(new char[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries)); var collectionId = collection.ToHash(); var model = new BagOfCharsModel(); - var embedding = new SortedList(); using (var documentReader = new DocumentRegistryReader(dataDirectory, collectionId)) { @@ -26,7 +25,7 @@ public void Run(IDictionary args, ILogger logger) foreach (var field in doc.Fields) { - var tokens = model.CreateEmbedding(field.Value.ToString(), true, embedding); + var tokens = model.CreateEmbedding(field.Value.ToString(), true); var tree = new VectorNode(); foreach (var token in tokens) diff --git a/src/Sir.Cmd/BenchmarkCommand.cs b/src/Sir.Cmd/BenchmarkCommand.cs index 6578324a..4a0f2201 100644 --- a/src/Sir.Cmd/BenchmarkCommand.cs +++ b/src/Sir.Cmd/BenchmarkCommand.cs @@ -30,13 +30,12 @@ public void RunTokenizeBenchmark(IDictionary args, ILogger logge var model = new BagOfCharsModel(); var documents = new List(WikipediaHelper.Read(fileName, skip, take, new HashSet { "text" })); var timer = Stopwatch.StartNew(); - var embedding = new SortedList(); for (int i = 0; i < numOfRuns; i++) { foreach (var document in documents) { - var embeddings = new List(model.CreateEmbedding((string)document.Fields[0].Value, false, embedding)); + var embeddings = new List(model.CreateEmbedding((string)document.Fields[0].Value, false)); } } diff --git a/src/Sir.Cmd/ValidateCommand.cs b/src/Sir.Cmd/ValidateCommand.cs index 99898183..870388d5 100644 --- a/src/Sir.Cmd/ValidateCommand.cs +++ b/src/Sir.Cmd/ValidateCommand.cs @@ -22,13 +22,12 @@ public void Run(IDictionary args, ILogger logger) var selectFields = new HashSet { "title" }; var time = Stopwatch.StartNew(); var count = 0; - var embedding = new SortedList(); using (var kvReader = new KeyValue.KeyValueReader(dir, collectionId)) using (var validateSession = new ValidateSession( collectionId, new SearchSession(dir, model, new LogStructuredIndexingStrategy(model), logger), - new QueryParser(kvReader, model, embedding: embedding, logger: logger))) + new QueryParser(kvReader, model, logger: logger))) using (var documents = new DocumentStreamSession(dir)) { foreach (var doc in documents.ReadDocuments(collectionId, selectFields, skip, take)) diff --git a/src/Sir.ImageTests/ImageModelTests.cs b/src/Sir.ImageTests/ImageModelTests.cs index 6fdb96c3..763bda88 100644 --- a/src/Sir.ImageTests/ImageModelTests.cs +++ b/src/Sir.ImageTests/ImageModelTests.cs @@ -103,7 +103,7 @@ public void Can_traverse_streamed() throw new Exception($"unable to find {word} in tree."); } - if (hit.Score < model.IdenticalAngle) + if (hit.Score.Approximates(model.IdenticalAngle)) { throw new Exception($"unable to score {word}."); } diff --git a/src/Sir.Images/LinearClassifierImageModel.cs b/src/Sir.Images/LinearClassifierImageModel.cs index c1b60205..a4e024ab 100644 --- a/src/Sir.Images/LinearClassifierImageModel.cs +++ b/src/Sir.Images/LinearClassifierImageModel.cs @@ -11,7 +11,7 @@ public class LinearClassifierImageModel : Sir.DistanceCalculator, IModel public double FoldAngle => 0.75d; public override int NumOfDimensions => 784; - public IEnumerable CreateEmbedding(IImage data, bool label, SortedList embedding = null) + public IEnumerable CreateEmbedding(IImage data, bool label) { var pixels = data.Pixels.Select(x => Convert.ToSingle(x)); diff --git a/src/Sir.InformationRetreival/IModel.cs b/src/Sir.InformationRetreival/IModel.cs index 20a7c4b6..49fa64fb 100644 --- a/src/Sir.InformationRetreival/IModel.cs +++ b/src/Sir.InformationRetreival/IModel.cs @@ -9,7 +9,7 @@ namespace Sir /// The type of data the model should consist of. public interface IModel : IModel { - IEnumerable CreateEmbedding(T data, bool label, SortedList embedding = null); + IEnumerable CreateEmbedding(T data, bool label); } /// diff --git a/src/Sir.InformationRetreival/IO/GraphBuilder.cs b/src/Sir.InformationRetreival/IO/GraphBuilder.cs index 90a4b2cd..fd9c7bf4 100644 --- a/src/Sir.InformationRetreival/IO/GraphBuilder.cs +++ b/src/Sir.InformationRetreival/IO/GraphBuilder.cs @@ -9,11 +9,10 @@ public static class GraphBuilder public static VectorNode CreateTree(this IModel model, IIndexReadWriteStrategy indexingStrategy, params T[] data) { var root = new VectorNode(); - var embedding = new SortedList(); foreach (var item in data) { - foreach (var vector in model.CreateEmbedding(item, true, embedding)) + foreach (var vector in model.CreateEmbedding(item, true)) { indexingStrategy.Put(root, new VectorNode(vector)); } diff --git a/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs b/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs index 95bc4416..a1094efd 100644 --- a/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs +++ b/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs @@ -29,7 +29,7 @@ public void SerializePage(string directory, ulong collectionId, long keyId, Vect var time = Stopwatch.StartNew(); using (var vectorStream = StreamFactory.CreateAppendStream(directory, collectionId, keyId, "vec")) - using (var postingsWriter = new PostingsWriter(StreamFactory.CreateSeekableWriteStream(directory, collectionId, keyId, "pos"), indexCache:null)) + using (var postingsWriter = new PostingsWriter(StreamFactory.CreateSeekableWriteStream(directory, collectionId, keyId, "pos"), indexCache: indexCache)) using (var columnWriter = new ColumnWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "ix"))) using (var pageIndexWriter = new PageIndexWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "ixtp"))) { diff --git a/src/Sir.InformationRetreival/Parsers/QueryParser.cs b/src/Sir.InformationRetreival/Parsers/QueryParser.cs index 081f4705..d5765d2c 100644 --- a/src/Sir.InformationRetreival/Parsers/QueryParser.cs +++ b/src/Sir.InformationRetreival/Parsers/QueryParser.cs @@ -10,14 +10,12 @@ public class QueryParser private readonly KeyValueReader _kvReader; private readonly IModel _model; private readonly ILogger _logger; - private readonly SortedList _embedding; - public QueryParser(KeyValueReader kvReader, IModel model, SortedList embedding = null, ILogger logger = null) + public QueryParser(KeyValueReader kvReader, IModel model, ILogger logger = null) { _kvReader = kvReader; _model = model; _logger = logger; - _embedding = embedding ?? new SortedList(); } public Query Parse( @@ -243,7 +241,7 @@ private IList CreateTerms(ulong collectionId, string key, T value, bool an if (_kvReader.TryGetKeyId(key.ToHash(), out keyId)) { - var tokens = _model.CreateEmbedding(value, label, _embedding); + var tokens = _model.CreateEmbedding(value, label); foreach (var term in tokens) { diff --git a/src/Sir.InformationRetreival/SerializableVector.cs b/src/Sir.InformationRetreival/SerializableVector.cs index 5334bb2a..d1ed6028 100644 --- a/src/Sir.InformationRetreival/SerializableVector.cs +++ b/src/Sir.InformationRetreival/SerializableVector.cs @@ -17,6 +17,10 @@ public class SerializableVector : ISerializableVector public int[] Indices { get { return ((SparseVectorStorage)Value.Storage).Indices; } } public float[] Values { get { return ((SparseVectorStorage)Value.Storage).Values; } } + public SerializableVector() + { + } + public SerializableVector(int numOfDimensions, object label = null) { Value = CreateVector.Sparse(numOfDimensions); @@ -48,6 +52,11 @@ public SerializableVector(SortedList dictionary, int numOfDimensions Label = label; } + public bool IsEmptyVector() + { + return Value == null; + } + public SerializableVector(int[] index, float[] values, int numOfDimensions, object label = null) { var tuples = new Tuple[Math.Min(index.Length, numOfDimensions)]; diff --git a/src/Sir.InformationRetreival/Session/DocumentDatabase.cs b/src/Sir.InformationRetreival/Session/DocumentDatabase.cs index 373cb568..63e354d1 100644 --- a/src/Sir.InformationRetreival/Session/DocumentDatabase.cs +++ b/src/Sir.InformationRetreival/Session/DocumentDatabase.cs @@ -39,7 +39,7 @@ public DocumentDatabase(string directory, ulong collectionId, IModel model = public QueryParser CreateQueryParser() { - return new QueryParser(SearchSession.GetKeyValueReader(_collectionId), _model, IndexSession.EmptyEmbedding, _logger); + return new QueryParser(SearchSession.GetKeyValueReader(_collectionId), _model, _logger); } public IEnumerable StreamDocuments(HashSet fieldsOfInterest, int skip, int take) diff --git a/src/Sir.InformationRetreival/Session/DocumentStreamSession.cs b/src/Sir.InformationRetreival/Session/DocumentStreamSession.cs index 40466a99..448e62a8 100644 --- a/src/Sir.InformationRetreival/Session/DocumentStreamSession.cs +++ b/src/Sir.InformationRetreival/Session/DocumentStreamSession.cs @@ -201,12 +201,8 @@ public IEnumerable ReadDocumentValuesAsVectors( HashSet select, DocumentRegistryReader documentReader, IModel model, - bool label, - SortedList embedding = null) + bool label) { - if (embedding == null) - embedding = new SortedList(); - var docInfo = documentReader.GetDocumentAddress(doc.docId); var docMap = documentReader.GetDocumentMap(docInfo.offset, docInfo.length); @@ -222,7 +218,7 @@ public IEnumerable ReadDocumentValuesAsVectors( { var vInfo = documentReader.GetAddressOfValue(kvp.valId); - foreach (var vector in documentReader.GetValueConvertedToVectors(vInfo.offset, vInfo.len, vInfo.dataType, value => model.CreateEmbedding(value, label, embedding))) + foreach (var vector in documentReader.GetValueConvertedToVectors(vInfo.offset, vInfo.len, vInfo.dataType, value => model.CreateEmbedding(value, label))) { tree.AddIfUnique(new VectorNode(vector, docId:doc.docId, keyId:kvp.keyId), model); } diff --git a/src/Sir.InformationRetreival/Session/IIndexSession.cs b/src/Sir.InformationRetreival/Session/IIndexSession.cs deleted file mode 100644 index 7d2f385b..00000000 --- a/src/Sir.InformationRetreival/Session/IIndexSession.cs +++ /dev/null @@ -1,19 +0,0 @@ -using System.Collections.Generic; - -namespace Sir -{ - public interface IIndexSession : IIndexSession - { - void Put(long docId, long keyId, T value, bool label); - } - - public interface IIndexSession - { - IndexInfo GetIndexInfo(); - void Put(long docId, long keyId, IEnumerable tokens); - void Put(VectorNode documentTree); - void Commit(); - void Commit(long keyId); - IDictionary GetInMemoryIndices(); - } -} \ No newline at end of file diff --git a/src/Sir.InformationRetreival/Session/IndexCache.cs b/src/Sir.InformationRetreival/Session/IndexCache.cs index 2adb4a7d..c27672bd 100644 --- a/src/Sir.InformationRetreival/Session/IndexCache.cs +++ b/src/Sir.InformationRetreival/Session/IndexCache.cs @@ -35,7 +35,7 @@ public void Put(VectorNode node) { var hit = PathFinder.ClosestMatch(tree, vector, _model); - if (hit.Score >= _model.IdenticalAngle) + if (hit.Score.Approximates(_model.IdenticalAngle)) { return hit.Node.PostingsOffset == -1 ? null : hit.Node.PostingsOffset; } @@ -49,7 +49,7 @@ public void UpdatePostingsOffset(long keyId, ISerializableVector vector, long po { var hit = PathFinder.ClosestMatch(tree, vector, _model); - if (hit.Score >= _model.IdenticalAngle) + if (hit.Score.Approximates(_model.IdenticalAngle)) { hit.Node.PostingsOffset = postingsOffset; } diff --git a/src/Sir.InformationRetreival/Session/IndexDebugger.cs b/src/Sir.InformationRetreival/Session/IndexDebugger.cs index 50bafab8..f1e01bbb 100644 --- a/src/Sir.InformationRetreival/Session/IndexDebugger.cs +++ b/src/Sir.InformationRetreival/Session/IndexDebugger.cs @@ -24,7 +24,7 @@ public IndexDebugger(ILogger logger, int sampleSize = 1000) _logger = logger; } - public void Step(IIndexSession indexSession, string message = null) + public void Step(IndexSession indexSession, string message = null) { _steps++; diff --git a/src/Sir.InformationRetreival/Session/IndexSession.cs b/src/Sir.InformationRetreival/Session/IndexSession.cs index 77eb1834..a53b519d 100644 --- a/src/Sir.InformationRetreival/Session/IndexSession.cs +++ b/src/Sir.InformationRetreival/Session/IndexSession.cs @@ -1,5 +1,4 @@ using Microsoft.Extensions.Logging; -using Sir.IO; using System; using System.Collections.Generic; using System.Diagnostics; @@ -10,7 +9,7 @@ namespace Sir /// Write a paged index. /// /// - public class IndexSession : IIndexSession, IDisposable + public class IndexSession : IDisposable { private readonly IModel _model; private readonly IIndexReadWriteStrategy _indexingStrategy; @@ -20,8 +19,6 @@ public class IndexSession : IIndexSession, IDisposable private readonly ILogger _logger; private readonly IndexCache _indexCache; - public SortedList EmptyEmbedding = new SortedList(); - public IndexSession( string directory, ulong collectionId, @@ -41,12 +38,12 @@ public IndexSession( public void Put(long docId, long keyId, T value, bool label) { - var tokens = _model.CreateEmbedding(value, label, EmptyEmbedding); + var tokens = _model.CreateEmbedding(value, label); Put(docId, keyId, tokens); } - public void Put(long docId, long keyId, IEnumerable tokens) + private void Put(long docId, long keyId, IEnumerable tokens) { VectorNode column; @@ -58,27 +55,10 @@ public void Put(long docId, long keyId, IEnumerable tokens) foreach (var token in tokens) { - _indexingStrategy.Put( - column, - new VectorNode(vector:token, docId:docId, keyId:keyId)); - } - } - - public void Put(VectorNode token) - { - VectorNode column; - - if (!_index.TryGetValue(token.KeyId.Value, out column)) - { - column = new VectorNode(); - _index.Add(token.KeyId.Value, column); - } - - foreach (var node in PathFinder.All(token)) - { - _indexingStrategy.Put( - column, - new VectorNode(node.Vector, docIds: node.DocIds)); + if (!token.IsEmptyVector()) + _indexingStrategy.Put( + column, + new VectorNode(vector:token, docId:docId, keyId:keyId)); } } diff --git a/src/Sir.InformationRetreival/Session/SearchSession.cs b/src/Sir.InformationRetreival/Session/SearchSession.cs index caa32729..318a06c6 100644 --- a/src/Sir.InformationRetreival/Session/SearchSession.cs +++ b/src/Sir.InformationRetreival/Session/SearchSession.cs @@ -156,7 +156,7 @@ private void Scan(Query query, bool identicalMatchesOnly) if (hit != null) { - if (!identicalMatchesOnly || (hit.Score >= _model.IdenticalAngle)) + if (!identicalMatchesOnly || hit.Score.Approximates(_model.IdenticalAngle)) { term.Score = hit.Score; term.PostingsOffsets = hit.PostingsOffsets; diff --git a/src/Sir.KeyValue/ISerializableVector.cs b/src/Sir.KeyValue/ISerializableVector.cs index bf4f55fe..cf33e3c3 100644 --- a/src/Sir.KeyValue/ISerializableVector.cs +++ b/src/Sir.KeyValue/ISerializableVector.cs @@ -20,5 +20,6 @@ public interface ISerializableVector void AverageInPlace(ISerializableVector vector); ISerializableVector Append(ISerializableVector vector); ISerializableVector Shift(int numOfPositionsToShift, int numOfDimensions); + bool IsEmptyVector(); } } \ No newline at end of file diff --git a/src/Sir.Strings/BagOfCharsModel.cs b/src/Sir.Strings/BagOfCharsModel.cs index 0f212874..cc641f96 100644 --- a/src/Sir.Strings/BagOfCharsModel.cs +++ b/src/Sir.Strings/BagOfCharsModel.cs @@ -7,17 +7,15 @@ public class BagOfCharsModel : DistanceCalculator, IModel public double IdenticalAngle => 0.998d; public double FoldAngle => 0.55d; public override int NumOfDimensions => System.Text.Unicode.UnicodeRanges.All.Length; + private readonly SortedList _embedding = new SortedList(); - public IEnumerable CreateEmbedding(string data, bool label, SortedList embedding = null) + public IEnumerable CreateEmbedding(string data, bool label) { var source = data.ToCharArray(); if (source.Length > 0) { - if (embedding == null) - embedding = new SortedList(); - else - embedding.Clear(); + _embedding.Clear(); var offset = 0; int index = 0; @@ -28,20 +26,20 @@ public IEnumerable CreateEmbedding(string data, bool label, if (char.IsLetterOrDigit(c) || char.GetUnicodeCategory(c) == System.Globalization.UnicodeCategory.MathSymbol) { - embedding.AddOrAppendToComponent(c, 1); + _embedding.AddOrAppendToComponent(c, 1); } else { - if (embedding.Count > 0) + if (_embedding.Count > 0) { var len = index - offset; var vector = new SerializableVector( - embedding, + _embedding, NumOfDimensions, label ? new string(source, offset, len) : null); - embedding.Clear(); + _embedding.Clear(); yield return vector; } @@ -49,12 +47,12 @@ public IEnumerable CreateEmbedding(string data, bool label, } } - if (embedding.Count > 0) + if (_embedding.Count > 0) { var len = index - offset; var vector = new SerializableVector( - embedding, + _embedding, NumOfDimensions, label ? new string(source, offset, len) : null); diff --git a/src/Sir.Strings/BagOfWordsModel.cs b/src/Sir.Strings/BagOfWordsModel.cs new file mode 100644 index 00000000..de4ae1f2 --- /dev/null +++ b/src/Sir.Strings/BagOfWordsModel.cs @@ -0,0 +1,68 @@ +using System.Collections.Generic; + +namespace Sir.Strings +{ + public class BagOfWordsModel : DistanceCalculator, IModel + { + public double IdenticalAngle => 0.998d; + public double FoldAngle => 0.55d; + public override int NumOfDimensions => System.Text.Unicode.UnicodeRanges.All.Length; + private readonly SortedList _embedding = new SortedList(); + + public IEnumerable CreateEmbedding(string data, bool label) + { + var source = data.ToCharArray(); + + if (source.Length > 0) + { + _embedding.Clear(); + + var offset = 0; + int index = 0; + + for (; index < source.Length; index++) + { + char c = char.ToLower(source[index]); + + if (char.IsPunctuation(c)) + { + yield return new SerializableVector(); + } + else if (char.IsLetterOrDigit(c) || char.GetUnicodeCategory(c) == System.Globalization.UnicodeCategory.MathSymbol) + { + _embedding.AddOrAppendToComponent(c, 1); + } + else + { + if (_embedding.Count > 0) + { + var len = index - offset; + + var vector = new SerializableVector( + _embedding, + NumOfDimensions, + label ? new string(source, offset, len) : null); + + _embedding.Clear(); + yield return vector; + } + + offset = index + 1; + } + } + + if (_embedding.Count > 0) + { + var len = index - offset; + + var vector = new SerializableVector( + _embedding, + NumOfDimensions, + label ? new string(source, offset, len) : null); + + yield return vector; + } + } + } + } +} \ No newline at end of file diff --git a/src/Sir.Strings/NGramModel.cs b/src/Sir.Strings/NGramModel.cs index 699fa9f2..34e467fa 100644 --- a/src/Sir.Strings/NGramModel.cs +++ b/src/Sir.Strings/NGramModel.cs @@ -17,18 +17,13 @@ public NGramModel(BagOfCharsModel wordTokenizer) NumOfDimensions = wordTokenizer.NumOfDimensions * 2; } - public IEnumerable CreateEmbedding(string data, bool label, SortedList embedding = null) + public IEnumerable CreateEmbedding(string data, bool label) { - if (embedding == null) - embedding = new SortedList(); - else - embedding.Clear(); - ISerializableVector vec0 = null; var i = 0; - foreach (var token in _wordTokenizer.CreateEmbedding(data, label, embedding)) + foreach (var token in _wordTokenizer.CreateEmbedding(data, label)) { if (vec0 == null) { diff --git a/write.bat b/write.bat index 4568d69f..5db8472a 100644 --- a/write.bat +++ b/write.bat @@ -1 +1 @@ -sir.bat writewikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --file d:\enwiki-20211122-cirrussearch-content.json.gz --collection wikipedia --skip 0 --take 1000000 --sampleSize 1000 %* \ No newline at end of file +sir.bat writewikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --file d:\enwiki-20211122-cirrussearch-content.json.gz --collection wikipedia --skip 0 --take 10000 --sampleSize 1000 %* \ No newline at end of file