From c2c104c2e4f45a6e05e70d2b875f475c4f37052d Mon Sep 17 00:00:00 2001 From: Kreeben Date: Mon, 13 May 2024 15:52:04 +0200 Subject: [PATCH] implement paged postings writer --- src/AssemblyInfo.cs | 4 +- src/Sir.Cmd/Sir.Cmd.csproj | 8 +- .../Sir.DataProvider.CommonCrawl.csproj} | 2 +- src/Sir.Document/Field.cs | 7 +- src/Sir.HttpServer/HttpQueryParser.cs | 10 +- src/Sir.HttpServer/Sir.HttpServer.csproj | 2 +- src/Sir.ImageTests/Sir.ImageTests.csproj | 5 +- ....Images.csproj => Sir.Model.Images.csproj} | 0 src/Sir.InformationRetreival/Hit.cs | 8 ++ .../IIndexReadWriteStrategy.cs | 2 +- .../IO/ColumnReader.cs | 21 ++-- .../IO/GraphBuilder.cs | 17 +-- src/Sir.InformationRetreival/IO/PathFinder.cs | 2 +- .../IO/PostingsReader.cs | 17 +-- .../IO/PostingsWriter.cs | 63 +++++++++-- ...dOrchestrator.cs => TermPostingsMapper.cs} | 16 ++- src/Sir.InformationRetreival/IQuery.cs | 21 ---- src/Sir.InformationRetreival/IReducer.cs | 2 +- .../LogStructuredIndexingStrategy.cs | 4 +- ...SupervisedLogStructuredIndexingStrategy.cs | 4 +- .../Parsers/QueryParser.cs | 8 +- src/Sir.InformationRetreival/Query.cs | 10 +- src/Sir.InformationRetreival/Scorer.cs | 2 +- src/Sir.InformationRetreival/SearchResult.cs | 4 +- .../Session/DocumentDatabase.cs | 13 +-- .../Session/ISearchSession.cs | 4 +- .../Session/IndexCache.cs | 59 ++++++++++ .../Session/IndexSession.cs | 7 +- .../Session/IndexWriter.cs | 38 ------- .../Session/SearchSession.cs | 24 +++-- src/Sir.InformationRetreival/Term.cs | 2 +- src/Sir.KeyValue/StreamFactory.cs | 17 +++ .../Sir.DataProvider.Mnist.csproj} | 2 +- .../Sir.StringCompare.csproj | 2 +- .../BagOfCharsDatabaseTests.cs | 101 ++++++++++++++++-- src/Sir.StringTests/Sir.StringTests.csproj | 5 +- ...trings.csproj => Sir.Model.Strings.csproj} | 0 ...proj => Sir.DataProvider.Wikipedia.csproj} | 2 +- src/Sir.sln | 21 ++-- 39 files changed, 341 insertions(+), 195 deletions(-) rename src/{Sir.Mnist/Sir.Mnist.csproj => Sir.CommonCrawl/Sir.DataProvider.CommonCrawl.csproj} (85%) rename src/Sir.Images/{Sir.Images.csproj => Sir.Model.Images.csproj} (100%) rename src/Sir.InformationRetreival/IO/{PostingsReadOrchestrator.cs => TermPostingsMapper.cs} (65%) delete mode 100644 src/Sir.InformationRetreival/IQuery.cs create mode 100644 src/Sir.InformationRetreival/Session/IndexCache.cs delete mode 100644 src/Sir.InformationRetreival/Session/IndexWriter.cs rename src/{Sir.CommonCrawl/Sir.CommonCrawl.csproj => Sir.Mnist/Sir.DataProvider.Mnist.csproj} (86%) rename src/Sir.Strings/{Sir.Strings.csproj => Sir.Model.Strings.csproj} (100%) rename src/Sir.Wikipedia/{Sir.Wikipedia.csproj => Sir.DataProvider.Wikipedia.csproj} (85%) diff --git a/src/AssemblyInfo.cs b/src/AssemblyInfo.cs index 1965ba97..33a7bc68 100644 --- a/src/AssemblyInfo.cs +++ b/src/AssemblyInfo.cs @@ -1,4 +1,4 @@ using System.Reflection; -[assembly: AssemblyVersion("0.4.0.7")] -[assembly: AssemblyFileVersion("0.4.0.7")] \ No newline at end of file +[assembly: AssemblyVersion("0.5.0.2")] +[assembly: AssemblyFileVersion("0.5.0.2")] \ No newline at end of file diff --git a/src/Sir.Cmd/Sir.Cmd.csproj b/src/Sir.Cmd/Sir.Cmd.csproj index d0b374ba..a9c09911 100644 --- a/src/Sir.Cmd/Sir.Cmd.csproj +++ b/src/Sir.Cmd/Sir.Cmd.csproj @@ -12,10 +12,12 @@ - + + - - + + + diff --git a/src/Sir.Mnist/Sir.Mnist.csproj b/src/Sir.CommonCrawl/Sir.DataProvider.CommonCrawl.csproj similarity index 85% rename from src/Sir.Mnist/Sir.Mnist.csproj rename to src/Sir.CommonCrawl/Sir.DataProvider.CommonCrawl.csproj index d495b3e4..240b2dd6 100644 --- a/src/Sir.Mnist/Sir.Mnist.csproj +++ b/src/Sir.CommonCrawl/Sir.DataProvider.CommonCrawl.csproj @@ -13,8 +13,8 @@ - + diff --git a/src/Sir.Document/Field.cs b/src/Sir.Document/Field.cs index c68987d5..bb7d839e 100644 --- a/src/Sir.Document/Field.cs +++ b/src/Sir.Document/Field.cs @@ -7,15 +7,13 @@ namespace Sir.Documents [DebuggerDisplay("{Name}")] public class Field { - private IEnumerable _tokens; - public long KeyId { get; set; } public long DocumentId { get; set; } public string Name { get; } public object Value { get; set; } - public IEnumerable Tokens { get { return _tokens; } } + public IEnumerable Tokens { get; } - public Field(string name, object value, long keyId = -1, long documentId = -1) + public Field(string name, object value, long keyId = -1, long documentId = -1, IEnumerable tokens = null) { if (name is null) throw new ArgumentNullException(nameof(name)); if (value == null) throw new ArgumentNullException(nameof(value)); @@ -24,6 +22,7 @@ public Field(string name, object value, long keyId = -1, long documentId = -1) Value = value; KeyId = keyId; DocumentId = documentId; + Tokens = tokens; } } } \ No newline at end of file diff --git a/src/Sir.HttpServer/HttpQueryParser.cs b/src/Sir.HttpServer/HttpQueryParser.cs index a89a93ce..c7ebc517 100644 --- a/src/Sir.HttpServer/HttpQueryParser.cs +++ b/src/Sir.HttpServer/HttpQueryParser.cs @@ -20,7 +20,7 @@ public HttpQueryParser(QueryParser parser) _parser = parser; } - public async Task ParseRequest(HttpRequest request, IEnumerable collections = null) + public async Task ParseRequest(HttpRequest request, IEnumerable collections = null) { var select = request.Query["select"].ToArray(); @@ -56,7 +56,7 @@ public static async Task DeserializeFromStream(Stream stream) } } - public IQuery ParseFormattedString(string formattedQuery, string[] select) + public Query ParseFormattedString(string formattedQuery, string[] select) { var document = JsonConvert.DeserializeObject>( formattedQuery, new JsonConverter[] { new DictionaryConverter() }); @@ -64,12 +64,12 @@ public IQuery ParseFormattedString(string formattedQuery, string[] select) return ParseDictionary(document, select); } - public IQuery ParseDictionary(IDictionary document, string[] select) + public Query ParseDictionary(IDictionary document, string[] select) { return _parser.Parse(document, select, true); } - private void DoParseQuery(IQuery query, IDictionary result) + private void DoParseQuery(Query query, IDictionary result) { if (result == null) return; @@ -114,7 +114,7 @@ private void DoParseQuery(IQuery query, IDictionary result) } } - public void ParseQuery(IQuery query, IDictionary result) + public void ParseQuery(Query query, IDictionary result) { DoParseQuery(query, result); } diff --git a/src/Sir.HttpServer/Sir.HttpServer.csproj b/src/Sir.HttpServer/Sir.HttpServer.csproj index e8222b5b..cbd7650f 100644 --- a/src/Sir.HttpServer/Sir.HttpServer.csproj +++ b/src/Sir.HttpServer/Sir.HttpServer.csproj @@ -46,7 +46,7 @@ - + diff --git a/src/Sir.ImageTests/Sir.ImageTests.csproj b/src/Sir.ImageTests/Sir.ImageTests.csproj index 802d011a..f0a6c946 100644 --- a/src/Sir.ImageTests/Sir.ImageTests.csproj +++ b/src/Sir.ImageTests/Sir.ImageTests.csproj @@ -15,10 +15,9 @@ - - + - + diff --git a/src/Sir.Images/Sir.Images.csproj b/src/Sir.Images/Sir.Model.Images.csproj similarity index 100% rename from src/Sir.Images/Sir.Images.csproj rename to src/Sir.Images/Sir.Model.Images.csproj diff --git a/src/Sir.InformationRetreival/Hit.cs b/src/Sir.InformationRetreival/Hit.cs index ed1c3d89..0f8259d1 100644 --- a/src/Sir.InformationRetreival/Hit.cs +++ b/src/Sir.InformationRetreival/Hit.cs @@ -11,9 +11,17 @@ public class Hit public List PostingsOffsets { get; set; } public Hit (VectorNode node, double score) + { + Node = node ?? throw new System.ArgumentNullException(nameof(node)); + Score = score; + PostingsOffsets = new List { node.PostingsOffset }; + } + + public Hit(VectorNode node, double score, long postingsOffset) { Score = score; Node = node; + PostingsOffsets = new List { postingsOffset }; } public override string ToString() diff --git a/src/Sir.InformationRetreival/IIndexReadWriteStrategy.cs b/src/Sir.InformationRetreival/IIndexReadWriteStrategy.cs index d3ae6a30..d5fa434a 100644 --- a/src/Sir.InformationRetreival/IIndexReadWriteStrategy.cs +++ b/src/Sir.InformationRetreival/IIndexReadWriteStrategy.cs @@ -7,6 +7,6 @@ public interface IIndexReadWriteStrategy { void Put(VectorNode column, VectorNode node); Hit GetMatchOrNull(ISerializableVector vector, IModel model, ColumnReader reader); - void Commit(string directory, ulong collectionId, long keyId, VectorNode tree, ILogger logger = null); + void SerializePage(string directory, ulong collectionId, long keyId, VectorNode tree, IndexCache indexCache, ILogger logger = null); } } diff --git a/src/Sir.InformationRetreival/IO/ColumnReader.cs b/src/Sir.InformationRetreival/IO/ColumnReader.cs index e806aa60..9d77a6be 100644 --- a/src/Sir.InformationRetreival/IO/ColumnReader.cs +++ b/src/Sir.InformationRetreival/IO/ColumnReader.cs @@ -1,5 +1,4 @@ -using Microsoft.Extensions.Logging; -using System; +using System; using System.Buffers; using System.Collections.Generic; using System.IO; @@ -36,7 +35,7 @@ public Hit ClosestMatchOrNullScanningAllPages(ISerializableVector vector, IModel { var hit = ClosestMatchInPage(vector, model, page.offset); - if (hit.Score > 0) + if (hit != null && hit.Score > 0) { hits.Add(hit); } @@ -71,7 +70,7 @@ public Hit ClosestMatchOrNullStoppingAtFirstIdenticalPage(ISerializableVector ve { var hit = ClosestMatchInPage(vector, model, page.offset); - if (hit.Score > 0) + if (hit != null && hit.Score > 0) { if (best == null || hit.Score > best.Score) { @@ -82,10 +81,10 @@ public Hit ClosestMatchOrNullStoppingAtFirstIdenticalPage(ISerializableVector ve { best.PostingsOffsets.Add(hit.Node.PostingsOffset); } - } - if (hit.Score.Approximates(model.IdenticalAngle)) - break; + if (hit.Score.Approximates(model.IdenticalAngle)) + break; + } } return best; @@ -110,7 +109,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo var angle = model.CosAngle(queryVector, vecOffset, (int)componentCount, _vectorFile); - if (angle >= model.IdenticalAngle) + if (angle >= model.IdenticalAngle || angle.Approximates(model.IdenticalAngle)) { bestScore = angle; bestNode = new VectorNode(postingsOffset: postingsOffset); @@ -124,7 +123,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo bestScore = angle; bestNode = new VectorNode(postingsOffset: postingsOffset); } - else if (angle == bestScore) + else if (angle.Approximates(bestScore)) { bestNode.PostingsOffset = postingsOffset; } @@ -153,7 +152,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo bestScore = angle; bestNode = new VectorNode(postingsOffset: postingsOffset); } - else if (angle > 0 && angle == bestScore) + else if (angle > 0 && angle.Approximates(bestScore)) { bestNode.PostingsOffset = postingsOffset; } @@ -188,7 +187,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo ArrayPool.Shared.Return(block); - return new Hit(bestNode, bestScore); + return bestNode == null ? null : new Hit(bestNode, bestScore); } private void SkipTree() diff --git a/src/Sir.InformationRetreival/IO/GraphBuilder.cs b/src/Sir.InformationRetreival/IO/GraphBuilder.cs index 093ca565..90a4b2cd 100644 --- a/src/Sir.InformationRetreival/IO/GraphBuilder.cs +++ b/src/Sir.InformationRetreival/IO/GraphBuilder.cs @@ -33,7 +33,7 @@ public static void AddOrAppendSupervised( { var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector); - if (angle >= model.IdenticalAngle) + if (angle >= model.IdenticalAngle || angle.Approximates(model.IdenticalAngle)) { if (!cursor.Vector.Label.Equals(node.Vector.Label)) throw new InvalidOperationException($"IdenticalAngle {model.IdenticalAngle} is too low. Angle was {angle}"); @@ -79,7 +79,7 @@ public static void AddOrAppend( { var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector); - if (angle >= model.IdenticalAngle) + if (angle.Approximates(model.IdenticalAngle)) { AppendDocIds(cursor, node); @@ -123,7 +123,7 @@ public static void AddIfUnique( { var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector); - if (angle >= model.IdenticalAngle) + if (angle.Approximates(model.IdenticalAngle)) { break; } @@ -165,7 +165,7 @@ public static bool TryAdd( { var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector); - if (angle >= model.IdenticalAngle) + if (angle.Approximates(model.IdenticalAngle)) { return false; } @@ -209,7 +209,7 @@ public static void Build( { var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector); - if (angle >= model.IdenticalAngle) + if (angle.Approximates(model.IdenticalAngle)) { break; } @@ -240,8 +240,11 @@ public static void Build( } } - public static void AppendDocIds(this VectorNode target, VectorNode source) + private static void AppendDocIds(this VectorNode target, VectorNode source) { + if (target.DocIds == null || source.DocIds == null) + return; + foreach (var d in source.DocIds) target.DocIds.Add(d); } @@ -297,7 +300,7 @@ public static (long offset, long length) SerializeTree(this VectorNode node, Str { if (node.PostingsOffset == -1 && postingsWriter != null) { - postingsWriter.SerializePostings(node); + node.PostingsOffset = postingsWriter.SerializePostings(node); } if (vectorStream != null) diff --git a/src/Sir.InformationRetreival/IO/PathFinder.cs b/src/Sir.InformationRetreival/IO/PathFinder.cs index e098aebc..79836c5c 100644 --- a/src/Sir.InformationRetreival/IO/PathFinder.cs +++ b/src/Sir.InformationRetreival/IO/PathFinder.cs @@ -25,7 +25,7 @@ public static Hit ClosestMatch(VectorNode root, ISerializableVector vector, IMod best = cursor; } - if (angle >= model.IdenticalAngle) + if (angle >= model.IdenticalAngle || angle.Approximates(model.IdenticalAngle)) { break; } diff --git a/src/Sir.InformationRetreival/IO/PostingsReader.cs b/src/Sir.InformationRetreival/IO/PostingsReader.cs index cb9cd7b2..7e07e3fe 100644 --- a/src/Sir.InformationRetreival/IO/PostingsReader.cs +++ b/src/Sir.InformationRetreival/IO/PostingsReader.cs @@ -25,10 +25,10 @@ public PostingsReader(string directory, ulong collectionId, long keyId, ILogger _collectionId = collectionId; } - public IList<(ulong, long)> Read(long keyId, IList offsets) + public HashSet<(ulong, long)> Read(long keyId, IList offsets) { var time = Stopwatch.StartNew(); - var documents = new List<(ulong, long)>(); + var documents = new HashSet<(ulong, long)>(); // collection ID, document ID foreach (var offset in offsets) GetPostingsFromStream(keyId, offset, documents); @@ -39,20 +39,21 @@ public PostingsReader(string directory, ulong collectionId, long keyId, ILogger return documents; } - private void GetPostingsFromStream(long keyId, long postingsOffset, List<(ulong collectionId, long docId)> documents) + private void GetPostingsFromStream(long keyId, long postingsOffset, HashSet<(ulong collectionId, long docId)> postings) { + // seek to page _stream.Seek(postingsOffset, SeekOrigin.Begin); var headerLen = sizeof(long) * 2; - var headerBuf = ArrayPool.Shared.Rent(headerLen); + // read header + var headerBuf = ArrayPool.Shared.Rent(headerLen); _stream.Read(headerBuf, 0, headerLen); - var numOfPostings = BitConverter.ToInt64(headerBuf); var addressOfNextPage = BitConverter.ToInt64(headerBuf, sizeof(long)); - ArrayPool.Shared.Return(headerBuf); + // read postings var listLen = sizeof(long) * numOfPostings; var listBuf = new byte[listLen]; var read = _stream.Read(listBuf); @@ -62,12 +63,12 @@ private void GetPostingsFromStream(long keyId, long postingsOffset, List<(ulong foreach (var docId in MemoryMarshal.Cast(listBuf)) { - documents.Add((_collectionId, docId)); + postings.Add((_collectionId, docId)); } if (addressOfNextPage > 0) { - GetPostingsFromStream(keyId, addressOfNextPage, documents); + GetPostingsFromStream(keyId, addressOfNextPage, postings); } } diff --git a/src/Sir.InformationRetreival/IO/PostingsWriter.cs b/src/Sir.InformationRetreival/IO/PostingsWriter.cs index 1ba2f9d3..780e1628 100644 --- a/src/Sir.InformationRetreival/IO/PostingsWriter.cs +++ b/src/Sir.InformationRetreival/IO/PostingsWriter.cs @@ -5,36 +5,77 @@ namespace Sir.IO { public class PostingsWriter : IDisposable { - private readonly Stream _postingsStream; + private readonly Stream _stream; + private readonly IndexCache _indexCache; - public PostingsWriter(Stream postingsStream) + public PostingsWriter(Stream postingsStream, IndexCache indexCache = null) { - _postingsStream = postingsStream; + _stream = postingsStream; + _indexCache = indexCache; + + _stream.Seek(0, SeekOrigin.End); } - public void SerializePostings(VectorNode node) + public long SerializePostings(VectorNode node) { if (node.DocIds.Count == 0) throw new ArgumentException("can't be empty", nameof(node.DocIds)); - node.PostingsOffset = _postingsStream.Position; + /* --------------- */ + /* write new page */ + /* ------------- */ + + // store stream position + var postingsOffset = _stream.Position; - // serialize item count - _postingsStream.Write(BitConverter.GetBytes((long)node.DocIds.Count)); + // serialize postings count + _stream.Write(BitConverter.GetBytes((long)node.DocIds.Count)); // serialize address of next page (unknown at this time) - _postingsStream.Write(BitConverter.GetBytes((long)0)); + _stream.Write(BitConverter.GetBytes((long)0)); + // serialize document IDs foreach (var docId in node.DocIds) { - _postingsStream.Write(BitConverter.GetBytes(docId)); + _stream.Write(BitConverter.GetBytes(docId)); + } + + long? existingPostingsOffset = null; + + if (_indexCache != null) + { + existingPostingsOffset = _indexCache.GetPostingsOffset(node.KeyId.Value, node.Vector); + } + + if (existingPostingsOffset.HasValue && existingPostingsOffset.Value > 0) + { + /* ------------------------------------ */ + /* reference new page in existing page */ + /* ---------------------------------- */ + + // rewind stream to existing postings page header + _stream.Seek(existingPostingsOffset.Value+sizeof(long), SeekOrigin.Begin); + + // set this as next page of existing postings page + _stream.Write(BitConverter.GetBytes(postingsOffset)); + + // go back to end of stream + _stream.Seek(0, SeekOrigin.End); + + // set this as offset of existing postings page + _indexCache.UpdatePostingsOffset(node.KeyId.Value, node.Vector, postingsOffset); + } + else if (_indexCache != null) + { + _indexCache.Put(new VectorNode(vector: node.Vector, postingsOffset: postingsOffset, keyId: node.KeyId)); } + return postingsOffset; } public void Dispose() { - if (_postingsStream != null ) + if (_stream != null ) { - _postingsStream.Dispose(); + _stream.Dispose(); } } } diff --git a/src/Sir.InformationRetreival/IO/PostingsReadOrchestrator.cs b/src/Sir.InformationRetreival/IO/TermPostingsMapper.cs similarity index 65% rename from src/Sir.InformationRetreival/IO/PostingsReadOrchestrator.cs rename to src/Sir.InformationRetreival/IO/TermPostingsMapper.cs index 7e4fe2dc..7e325207 100644 --- a/src/Sir.InformationRetreival/IO/PostingsReadOrchestrator.cs +++ b/src/Sir.InformationRetreival/IO/TermPostingsMapper.cs @@ -5,19 +5,19 @@ namespace Sir.IO { /// - /// Read postings lists from storage and map them to query terms + /// Read postings from storage and map them to query terms. /// - public class PostingsReadOrchestrator : IDisposable + public class TermPostingsMapper : IDisposable { - private readonly Dictionary<(string, ulong, long), PostingsReader> _readers = new Dictionary<(string, ulong, long), PostingsReader>(); + private readonly Dictionary<(string directory, ulong collectionId, long keyId), PostingsReader> _readers = new Dictionary<(string, ulong, long), PostingsReader>(); private readonly ILogger _logger; - public PostingsReadOrchestrator(ILogger logger = null) + public TermPostingsMapper(ILogger logger = null) { _logger = logger; } - public void ReadAndMapPostings(IQuery query) + public void ReadAndMap(Query query) { foreach (var term in query.AllTerms()) { @@ -30,11 +30,7 @@ public void ReadAndMapPostings(IQuery query) if (!_readers.TryGetValue(key, out reader)) { reader = new PostingsReader(term.Directory, term.CollectionId, term.KeyId, _logger); - - if (reader != null) - { - _readers.Add(key, reader); - } + _readers.Add(key, reader); } if (reader != null) diff --git a/src/Sir.InformationRetreival/IQuery.cs b/src/Sir.InformationRetreival/IQuery.cs deleted file mode 100644 index f5ed91af..00000000 --- a/src/Sir.InformationRetreival/IQuery.cs +++ /dev/null @@ -1,21 +0,0 @@ -using System.Collections.Generic; - -namespace Sir -{ - public interface IQuery - { - IQuery AndQuery { get; set; } - IQuery NotQuery { get; set; } - IQuery OrQuery { get; set; } - HashSet Select { get; } - IList Terms { get; } - bool IsUnion { get; set; } - bool IsIntersection { get; set; } - - IEnumerable All(); - IEnumerable AllTerms(); - int GetCollectionCount(); - void GetNumOfCollections(HashSet dic); - int TotalNumberOfTerms(); - } -} \ No newline at end of file diff --git a/src/Sir.InformationRetreival/IReducer.cs b/src/Sir.InformationRetreival/IReducer.cs index 4592e1d9..411b8714 100644 --- a/src/Sir.InformationRetreival/IReducer.cs +++ b/src/Sir.InformationRetreival/IReducer.cs @@ -4,6 +4,6 @@ namespace Sir.Strings { public interface IReducer { - void Reduce(IQuery mappedQuery, ref IDictionary<(ulong, long), double> result); + void Reduce(Query mappedQuery, ref IDictionary<(ulong, long), double> result); } } \ No newline at end of file diff --git a/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs b/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs index fd9e341e..95bc4416 100644 --- a/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs +++ b/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs @@ -24,12 +24,12 @@ public void Put(VectorNode column, VectorNode node) column.AddOrAppend(node, _model); } - public void Commit(string directory, ulong collectionId, long keyId, VectorNode tree, ILogger logger = null) + public void SerializePage(string directory, ulong collectionId, long keyId, VectorNode tree, IndexCache indexCache, ILogger logger = null) { var time = Stopwatch.StartNew(); using (var vectorStream = StreamFactory.CreateAppendStream(directory, collectionId, keyId, "vec")) - using (var postingsWriter = new PostingsWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "pos"))) + using (var postingsWriter = new PostingsWriter(StreamFactory.CreateSeekableWriteStream(directory, collectionId, keyId, "pos"), indexCache:null)) using (var columnWriter = new ColumnWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "ix"))) using (var pageIndexWriter = new PageIndexWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "ixtp"))) { diff --git a/src/Sir.InformationRetreival/IndexingStrategies/SupervisedLogStructuredIndexingStrategy.cs b/src/Sir.InformationRetreival/IndexingStrategies/SupervisedLogStructuredIndexingStrategy.cs index 9eda1c4e..29178aad 100644 --- a/src/Sir.InformationRetreival/IndexingStrategies/SupervisedLogStructuredIndexingStrategy.cs +++ b/src/Sir.InformationRetreival/IndexingStrategies/SupervisedLogStructuredIndexingStrategy.cs @@ -24,12 +24,12 @@ public void Put(VectorNode column, VectorNode node) column.AddOrAppendSupervised(node, _model); } - public void Commit(string directory, ulong collectionId, long keyId, VectorNode tree, ILogger logger = null) + public void SerializePage(string directory, ulong collectionId, long keyId, VectorNode tree, IndexCache indexCache, ILogger logger = null) { var time = Stopwatch.StartNew(); using (var vectorStream = StreamFactory.CreateAppendStream(directory, collectionId, keyId, "vec")) - using (var postingsWriter = new PostingsWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "pos"))) + using (var postingsWriter = new PostingsWriter(StreamFactory.CreateSeekableWriteStream(directory, collectionId, keyId, "pos"), indexCache)) using (var columnWriter = new ColumnWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "ix"))) using (var pageIndexWriter = new PageIndexWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "ixtp"))) { diff --git a/src/Sir.InformationRetreival/Parsers/QueryParser.cs b/src/Sir.InformationRetreival/Parsers/QueryParser.cs index 444c5b3e..081f4705 100644 --- a/src/Sir.InformationRetreival/Parsers/QueryParser.cs +++ b/src/Sir.InformationRetreival/Parsers/QueryParser.cs @@ -20,7 +20,7 @@ public QueryParser(KeyValueReader kvReader, IModel model, SortedList(); } - public IQuery Parse( + public Query Parse( ulong collectionId, T query, string field, @@ -41,7 +41,7 @@ public IQuery Parse( return new Query(terms, new string[] { select }, and, or, !and && !or); } - public IQuery Parse( + public Query Parse( string collection, T query, string field, @@ -62,7 +62,7 @@ public IQuery Parse( return new Query(terms, new string[] { select }, and, or, !and && !or); } - public IQuery Parse( + public Query Parse( IEnumerable collections, T q, string[] fields, @@ -141,7 +141,7 @@ public IQuery Parse( return Parse(root, select, label); } - public IQuery Parse(dynamic document, IEnumerable select, bool label) + public Query Parse(dynamic document, IEnumerable select, bool label) { Query root = null; Query cursor = null; diff --git a/src/Sir.InformationRetreival/Query.cs b/src/Sir.InformationRetreival/Query.cs index 86f980c8..399e533d 100644 --- a/src/Sir.InformationRetreival/Query.cs +++ b/src/Sir.InformationRetreival/Query.cs @@ -14,13 +14,13 @@ namespace Sir /// } /// } /// - public class Query : BooleanStatement, IQuery + public class Query : BooleanStatement { public IList Terms { get; } public HashSet Select { get; } - public IQuery AndQuery { get; set; } - public IQuery OrQuery { get; set; } - public IQuery NotQuery { get; set; } + public Query AndQuery { get; set; } + public Query OrQuery { get; set; } + public Query NotQuery { get; set; } public Query( IList terms, @@ -90,7 +90,7 @@ public IEnumerable AllTerms() yield return term; } - public IEnumerable All() + public IEnumerable All() { yield return this; diff --git a/src/Sir.InformationRetreival/Scorer.cs b/src/Sir.InformationRetreival/Scorer.cs index 68445715..ef3bd1d0 100644 --- a/src/Sir.InformationRetreival/Scorer.cs +++ b/src/Sir.InformationRetreival/Scorer.cs @@ -7,7 +7,7 @@ public class Scorer /// /// Reduce query to a list of scored document IDs. /// - public void Score(IQuery query, ref IDictionary<(ulong CollectionId, long DocumentId), double> result) + public void Score(Query query, ref IDictionary<(ulong CollectionId, long DocumentId), double> result) { IDictionary<(ulong, long), double> queryResult = new Dictionary<(ulong, long), double>(); diff --git a/src/Sir.InformationRetreival/SearchResult.cs b/src/Sir.InformationRetreival/SearchResult.cs index 4fc023d0..b8791246 100644 --- a/src/Sir.InformationRetreival/SearchResult.cs +++ b/src/Sir.InformationRetreival/SearchResult.cs @@ -5,12 +5,12 @@ namespace Sir { public class SearchResult { - public IQuery Query { get; } + public Query Query { get; } public long Total { get; } public IEnumerable Documents { get; } public int Count { get; } - public SearchResult(IQuery query, long total, int count, IEnumerable documents) + public SearchResult(Query query, long total, int count, IEnumerable documents) { Query = query; Total = total; diff --git a/src/Sir.InformationRetreival/Session/DocumentDatabase.cs b/src/Sir.InformationRetreival/Session/DocumentDatabase.cs index f266bd8a..373cb568 100644 --- a/src/Sir.InformationRetreival/Session/DocumentDatabase.cs +++ b/src/Sir.InformationRetreival/Session/DocumentDatabase.cs @@ -18,9 +18,9 @@ public class DocumentDatabase : IDisposable private WriteSession _writeSession; private IndexSession _indexSession; private SearchSession _searchSession; + private IndexCache _indexCache; private readonly IModel _model; private readonly ILogger _logger; - public IndexSession IndexSession { get { return _indexSession; } } public SearchSession SearchSession { get { return _searchSession; } } @@ -30,8 +30,9 @@ public DocumentDatabase(string directory, ulong collectionId, IModel model = _collectionId = collectionId; _model = model; _indexStrategy = indexStrategy; + _indexCache = new IndexCache(_model); _writeSession = new WriteSession(new DocumentRegistryWriter(directory, collectionId)); - _indexSession = new IndexSession(directory, collectionId, model, indexStrategy, logger); + _indexSession = new IndexSession(directory, collectionId, model, indexStrategy, _indexCache, logger); _searchSession = new SearchSession(directory, _model, _indexStrategy, logger); _logger = logger; } @@ -46,7 +47,7 @@ public IEnumerable StreamDocuments(HashSet fieldsOfInterest, i return _searchSession.ReadDocuments(_collectionId, fieldsOfInterest, skip, take); } - public SearchResult Read(IQuery query, int skip, int take) + public SearchResult Read(Query query, int skip, int take) { return _searchSession.Search(query, skip, take); } @@ -104,7 +105,7 @@ public void Truncate() LogInformation($"truncated collection {_collectionId} ({count} files affected)"); _writeSession = new WriteSession(new DocumentRegistryWriter(_directory, _collectionId)); - _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _logger); + _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _indexCache, _logger); _searchSession = new SearchSession(_directory, _model, _indexStrategy, _logger); } @@ -143,7 +144,7 @@ public void TruncateIndexOnly() LogInformation($"truncated index {_collectionId} ({count} files affected)"); _writeSession = new WriteSession(new DocumentRegistryWriter(_directory, _collectionId)); - _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _logger); + _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _indexCache, _logger); _searchSession = new SearchSession(_directory, _model, _indexStrategy, _logger); } @@ -165,7 +166,7 @@ public void Rename(ulong newCollectionId) _collectionId = newCollectionId; _writeSession = new WriteSession(new DocumentRegistryWriter(_directory, _collectionId)); - _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _logger); + _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _indexCache, _logger); _searchSession = new SearchSession(_directory, _model, _indexStrategy, _logger); } diff --git a/src/Sir.InformationRetreival/Session/ISearchSession.cs b/src/Sir.InformationRetreival/Session/ISearchSession.cs index cbe013a2..eb44d549 100644 --- a/src/Sir.InformationRetreival/Session/ISearchSession.cs +++ b/src/Sir.InformationRetreival/Session/ISearchSession.cs @@ -5,7 +5,7 @@ namespace Sir { public interface ISearchSession : IDisposable { - SearchResult Search(IQuery query, int skip, int take); - Document SearchScalar(IQuery query); + SearchResult Search(Query query, int skip, int take); + Document SearchScalar(Query query); } } \ No newline at end of file diff --git a/src/Sir.InformationRetreival/Session/IndexCache.cs b/src/Sir.InformationRetreival/Session/IndexCache.cs new file mode 100644 index 00000000..2adb4a7d --- /dev/null +++ b/src/Sir.InformationRetreival/Session/IndexCache.cs @@ -0,0 +1,59 @@ +using Sir.IO; +using System; +using System.Collections.Generic; + +namespace Sir +{ + public class IndexCache + { + private readonly IModel _model; + private readonly IDictionary _cache; // indices by key ID + + public IndexCache(IModel model) + { + _model = model; + _cache = new Dictionary(); + } + + public void Put(VectorNode node) + { + if (!node.KeyId.HasValue) + throw new ArgumentException(message:"VectorNode does not have a key ID.", paramName:nameof(node)); + + if (!_cache.TryGetValue(node.KeyId.Value, out var tree)) + { + tree = new VectorNode(); + _cache.Add(node.KeyId.Value, tree); + } + + GraphBuilder.AddOrAppend(tree, node, _model); + } + + public long? GetPostingsOffset(long keyId, ISerializableVector vector) + { + if (_cache.TryGetValue(keyId, out var tree)) + { + var hit = PathFinder.ClosestMatch(tree, vector, _model); + + if (hit.Score >= _model.IdenticalAngle) + { + return hit.Node.PostingsOffset == -1 ? null : hit.Node.PostingsOffset; + } + } + return null; + } + + public void UpdatePostingsOffset(long keyId, ISerializableVector vector, long postingsOffset) + { + if (_cache.TryGetValue(keyId, out var tree)) + { + var hit = PathFinder.ClosestMatch(tree, vector, _model); + + if (hit.Score >= _model.IdenticalAngle) + { + hit.Node.PostingsOffset = postingsOffset; + } + } + } + } +} \ No newline at end of file diff --git a/src/Sir.InformationRetreival/Session/IndexSession.cs b/src/Sir.InformationRetreival/Session/IndexSession.cs index 452252df..77eb1834 100644 --- a/src/Sir.InformationRetreival/Session/IndexSession.cs +++ b/src/Sir.InformationRetreival/Session/IndexSession.cs @@ -18,6 +18,7 @@ public class IndexSession : IIndexSession, IDisposable private readonly string _directory; private readonly ulong _collectionId; private readonly ILogger _logger; + private readonly IndexCache _indexCache; public SortedList EmptyEmbedding = new SortedList(); @@ -26,6 +27,7 @@ public IndexSession( ulong collectionId, IModel model, IIndexReadWriteStrategy indexingStrategy, + IndexCache indexCache, ILogger logger = null) { _model = model; @@ -34,6 +36,7 @@ public IndexSession( _directory = directory; _collectionId = collectionId; _logger = logger; + _indexCache = indexCache; } public void Put(long docId, long keyId, T value, bool label) @@ -57,7 +60,7 @@ public void Put(long docId, long keyId, IEnumerable tokens) { _indexingStrategy.Put( column, - new VectorNode(vector: token, docId: docId)); + new VectorNode(vector:token, docId:docId, keyId:keyId)); } } @@ -93,7 +96,7 @@ public void Commit(long keyId) var column = _index[keyId]; - _indexingStrategy.Commit(_directory, _collectionId, keyId, column, _logger); + _indexingStrategy.SerializePage(_directory, _collectionId, keyId, column, _indexCache, _logger); if (_logger != null) _logger.LogInformation($"committing index to disk for key {keyId} took {time.Elapsed}"); diff --git a/src/Sir.InformationRetreival/Session/IndexWriter.cs b/src/Sir.InformationRetreival/Session/IndexWriter.cs deleted file mode 100644 index b6d61420..00000000 --- a/src/Sir.InformationRetreival/Session/IndexWriter.cs +++ /dev/null @@ -1,38 +0,0 @@ -using Microsoft.Extensions.Logging; -using System; -using System.Collections.Generic; - -namespace Sir -{ - /// - /// Write a collection of indices to disk. - /// - public class IndexWriter : IDisposable - { - private readonly string _directory; - private readonly ulong _collectionId; - private readonly ILogger _logger; - - public IndexWriter( - string directory, - ulong collectionId, - ILogger logger = null) - { - _directory = directory; - _collectionId = collectionId; - _logger = logger; - } - - public void Dispose() - { - } - - public void Commit(IDictionary index, IIndexReadWriteStrategy indexingStrategy) - { - foreach (var column in index) - { - indexingStrategy.Commit(_directory, _collectionId, column.Key, column.Value); - } - } - } -} \ No newline at end of file diff --git a/src/Sir.InformationRetreival/Session/SearchSession.cs b/src/Sir.InformationRetreival/Session/SearchSession.cs index 996f6450..caa32729 100644 --- a/src/Sir.InformationRetreival/Session/SearchSession.cs +++ b/src/Sir.InformationRetreival/Session/SearchSession.cs @@ -15,7 +15,7 @@ public class SearchSession : DocumentStreamSession, IDisposable, ISearchSessi { private readonly IModel _model; private readonly IIndexReadWriteStrategy _indexStrategy; - private readonly PostingsReadOrchestrator _postingsReadOrchestrator; + private readonly TermPostingsMapper _termPostingsMapper; private readonly Scorer _scorer; private readonly ILogger _logger; private readonly Dictionary<(string, ulong, long), ColumnReader> _readers; @@ -25,12 +25,12 @@ public SearchSession( IModel model, IIndexReadWriteStrategy indexStrategy, ILogger logger = null, - PostingsReadOrchestrator postingsResolver = null, + TermPostingsMapper termPostingsMapper = null, Scorer scorer = null) : base(directory) { _model = model; _indexStrategy = indexStrategy; - _postingsReadOrchestrator = postingsResolver ?? new PostingsReadOrchestrator(logger); + _termPostingsMapper = termPostingsMapper ?? new TermPostingsMapper(logger); _scorer = scorer ?? new Scorer(); _logger = logger; _readers = new Dictionary<(string, ulong, long), ColumnReader>(); @@ -48,7 +48,7 @@ public override void ClearCachedReaders() base.ClearCachedReaders(); } - public SearchResult Search(IQuery query, int skip, int take) + public SearchResult Search(Query query, int skip, int take) { var result = OrchestrateSearch(query, skip, take, false); @@ -64,7 +64,7 @@ public SearchResult Search(IQuery query, int skip, int take) return new SearchResult(query, 0, 0, System.Linq.Enumerable.Empty()); } - public Document SearchScalar(IQuery query) + public Document SearchScalar(Query query) { var result = OrchestrateSearch(query, 0, 1, true); @@ -80,7 +80,7 @@ public Document SearchScalar(IQuery query) return null; } - public SearchResult SearchIdentical(IQuery query, int take) + public SearchResult SearchIdentical(Query query, int take) { var result = OrchestrateSearch(query, 0, take, true); @@ -96,7 +96,7 @@ public SearchResult SearchIdentical(IQuery query, int take) return new SearchResult(query, 0, 0, System.Linq.Enumerable.Empty()); } - private ScoredResult OrchestrateSearch(IQuery query, int skip, int take, bool identicalMatchesOnly) + private ScoredResult OrchestrateSearch(Query query, int skip, int take, bool identicalMatchesOnly) { var timer = Stopwatch.StartNew(); @@ -107,7 +107,7 @@ private ScoredResult OrchestrateSearch(IQuery query, int skip, int take, bool id timer.Restart(); // Read postings. - _postingsReadOrchestrator.ReadAndMapPostings(query); + _termPostingsMapper.ReadAndMap(query); LogDebug($"reading postings took {timer.Elapsed}"); timer.Restart(); @@ -130,7 +130,7 @@ private ScoredResult OrchestrateSearch(IQuery query, int skip, int take, bool id /// /// Scans the index to find the query's closest matching nodes and records their posting list addresses. /// - private void Scan(IQuery query, bool identicalMatchesOnly) + private void Scan(Query query, bool identicalMatchesOnly) { if (query == null) return; @@ -229,7 +229,9 @@ private IList ReadDocs( var doc = ReadDocument(d.Key, select, d.Value * scoreMultiplier); if (doc != null) + { result.Add(doc); + } } LogDebug($"reading documents took {timer.Elapsed}"); @@ -257,8 +259,8 @@ private void LogError(Exception ex, string message) public override void Dispose() { - if (_postingsReadOrchestrator!= null) - _postingsReadOrchestrator.Dispose(); + if (_termPostingsMapper!= null) + _termPostingsMapper.Dispose(); foreach (var reader in _readers.Values) { diff --git a/src/Sir.InformationRetreival/Term.cs b/src/Sir.InformationRetreival/Term.cs index 1518d054..8a9977a3 100644 --- a/src/Sir.InformationRetreival/Term.cs +++ b/src/Sir.InformationRetreival/Term.cs @@ -12,7 +12,7 @@ public class Term : BooleanStatement public ulong CollectionId { get; } public IList PostingsOffsets { get; set; } public double Score { get; set; } - public IList<(ulong collectionId, long documentId)> DocumentIds { get; set; } + public HashSet<(ulong collectionId, long documentId)> DocumentIds { get; set; } public object Label => Vector.Label; public Term( diff --git a/src/Sir.KeyValue/StreamFactory.cs b/src/Sir.KeyValue/StreamFactory.cs index a81baeba..b84caf56 100644 --- a/src/Sir.KeyValue/StreamFactory.cs +++ b/src/Sir.KeyValue/StreamFactory.cs @@ -37,5 +37,22 @@ public static Stream CreateAppendStream(string directory, ulong collectionId, lo return new FileStream(fileName, FileMode.Append, FileAccess.Write, FileShare.ReadWrite); } + + public static Stream CreateSeekableWriteStream(string directory, ulong collectionId, long keyId, string fileExtension) + { + if (!System.IO.Directory.Exists(directory)) + { + System.IO.Directory.CreateDirectory(directory); + } + + var fileName = Path.Combine(directory, $"{collectionId}.{keyId}.{fileExtension}"); + + if (!File.Exists(fileName)) + { + using (var fs = new FileStream(fileName, FileMode.Append, FileAccess.Write, FileShare.ReadWrite)) { } + } + + return new FileStream(fileName, FileMode.Open, FileAccess.Write, FileShare.ReadWrite); + } } } diff --git a/src/Sir.CommonCrawl/Sir.CommonCrawl.csproj b/src/Sir.Mnist/Sir.DataProvider.Mnist.csproj similarity index 86% rename from src/Sir.CommonCrawl/Sir.CommonCrawl.csproj rename to src/Sir.Mnist/Sir.DataProvider.Mnist.csproj index 9da76051..f97eb21d 100644 --- a/src/Sir.CommonCrawl/Sir.CommonCrawl.csproj +++ b/src/Sir.Mnist/Sir.DataProvider.Mnist.csproj @@ -13,8 +13,8 @@ + - diff --git a/src/Sir.StringCompare/Sir.StringCompare.csproj b/src/Sir.StringCompare/Sir.StringCompare.csproj index 418818e4..008bde84 100644 --- a/src/Sir.StringCompare/Sir.StringCompare.csproj +++ b/src/Sir.StringCompare/Sir.StringCompare.csproj @@ -16,7 +16,7 @@ - + diff --git a/src/Sir.StringTests/BagOfCharsDatabaseTests.cs b/src/Sir.StringTests/BagOfCharsDatabaseTests.cs index c196e69f..035227bb 100644 --- a/src/Sir.StringTests/BagOfCharsDatabaseTests.cs +++ b/src/Sir.StringTests/BagOfCharsDatabaseTests.cs @@ -13,7 +13,8 @@ public class BagOfCharsDatabaseTests { private ILoggerFactory _loggerFactory; private string _directory = Path.Combine(Environment.CurrentDirectory, "testdata"); - private readonly string[] _data = ["Ferriman–Gallwey score", "apples", "apricote", "apricots", "avocado", "avocados", "banana", "bananas", "blueberry", "blueberries", "cantalope"]; + private readonly string[] _dataPage0 = ["Ferriman–Gallwey score", "apples", "apricote", "apricots", "avocado", "avocados", "banana", "bananas", "blueberry", "blueberries", "cantalope"]; + private readonly string[] _dataPage1 = ["score", "apples and teddybears", "apricote sauce", "hey baberibba", "avocado sundae", "avocados are nice", "banana split", "I'm going bananas", "blueberry pie", "blueberries and sauce", "cantalope"]; [Test] public void Can_stream() @@ -21,7 +22,7 @@ public void Can_stream() var model = new BagOfCharsModel(); var strategy = new LogStructuredIndexingStrategy(model); var collectionId = "BagOfCharsDatabaseTests.Can_stream".ToHash(); - var documents = _data.Select(x => new Document(new Field[] {new Field("title", x)})).ToList(); + var documents = _dataPage0.Select(x => new Document(new Field[] {new Field("title", x)})).ToList(); using (var database = new DocumentDatabase(_directory, collectionId, model, strategy, _loggerFactory.CreateLogger("Debug"))) { @@ -41,7 +42,7 @@ public void Can_stream() Assert.DoesNotThrow(() => { var documentWas = document.Get("title").Value; - var documentShouldBe = _data[i++]; + var documentShouldBe = _dataPage0[i++]; if (!documentShouldBe.Equals(documentWas)) { @@ -58,7 +59,7 @@ public void Can_read_and_write() var model = new BagOfCharsModel(); var strategy = new LogStructuredIndexingStrategy(model); var collectionId = "BagOfCharsDatabaseTests.Can_read_and_write".ToHash(); - var documents = _data.Select(x => new Document(new Field[] { new Field("title", x) })).ToList(); + var documents = _dataPage0.Select(x => new Document(new Field[] { new Field("title", x) })).ToList(); using (var database = new DocumentDatabase(_directory, collectionId, model, strategy, _loggerFactory.CreateLogger("Debug"))) { @@ -73,7 +74,7 @@ public void Can_read_and_write() var queryParser = database.CreateQueryParser(); - foreach (var word in _data) + foreach (var word in _dataPage0) { Assert.DoesNotThrow(() => { @@ -92,13 +93,97 @@ public void Can_read_and_write() } } + [Test] + public void Can_read_and_write_paged() + { + // This test fails because BagOfCharsModel is also a bag of words model thus it scores 1 if there's a hit in a phrase that's not identical to the term. + var model = new BagOfCharsModel(); + var strategy = new LogStructuredIndexingStrategy(model); + var collectionId = "BagOfCharsDatabaseTests.Can_read_and_write_paged".ToHash(); + var page0 = _dataPage0.Select(x => new Document(new Field[] { new Field("title", x) })).ToList(); + var page1 = _dataPage1.Select(x => new Document(new Field[] { new Field("title", x) })).ToList(); + + using (var database = new DocumentDatabase(_directory, collectionId, model, strategy, _loggerFactory.CreateLogger("Debug"))) + { + database.Truncate(); + + foreach (var document in page0) + { + database.Write(document, label: true); + } + + database.Commit(); // create page + + foreach (var document in page1) + { + database.Write(document, label:true); + } + + database.Commit(); // create another page + + var queryParser = database.CreateQueryParser(); + + foreach (var word in _dataPage0) + { + Assert.DoesNotThrow(() => + { + var query = queryParser.Parse(collectionId, word, "title", "title", and: true, or: false, label: true); + var result = database.Read(query, skip: 0, take: 1); + + var documentWas = result.Documents.First().Get("title").Value; + var documentShouldBe = word; + + if (!documentShouldBe.Equals(documentWas)) + { + throw new Exception($"documentShouldBe: {documentShouldBe} documentWas: {documentWas} "); + } + }); + } + + foreach (var word in _dataPage1) + { + Assert.DoesNotThrow(() => + { + var query = queryParser.Parse(collectionId, word, "title", "title", and: true, or: false, label: true); + var result = database.Read(query, skip: 0, take: 2); + + var documentWas = result.Documents.First().Get("title").Value; + var documentShouldBe = word; + + if (!documentShouldBe.Equals(documentWas)) + { + throw new Exception($"documentShouldBe: {documentShouldBe} documentWas: {documentWas} "); + } + + //var found = false; + + //foreach(var document in result.Documents) + //{ + // var documentWas = result.Documents.First().Get("title").Value; + // var documentShouldBe = word; + + // if (documentShouldBe.Equals(documentWas)) + // { + // found = true; + // break; + // } + //} + + //if (!found) + // throw new Exception($"document not found. documentShouldBe: {word} "); + + }); + } + } + } + [Test] public void Can_optimize_index() { var model = new BagOfCharsModel(); var strategy = new LogStructuredIndexingStrategy(model); var collectionId = "BagOfCharsDatabaseTests.Can_optimize_index".ToHash(); - var documents = _data.Select(x => new Document(new Field[] { new Field("title", x) })).ToList(); + var documents = _dataPage0.Select(x => new Document(new Field[] { new Field("title", x) })).ToList(); using (var database = new DocumentDatabase(_directory, collectionId, model, strategy, _loggerFactory.CreateLogger("Debug"))) { @@ -113,7 +198,7 @@ public void Can_optimize_index() var queryParser = database.CreateQueryParser(); - foreach (var word in _data) + foreach (var word in _dataPage0) { Assert.DoesNotThrow(() => { @@ -127,7 +212,7 @@ public void Can_optimize_index() database.OptimizeAllIndices(); - foreach (var word in _data) + foreach (var word in _dataPage0) { Assert.DoesNotThrow(() => { diff --git a/src/Sir.StringTests/Sir.StringTests.csproj b/src/Sir.StringTests/Sir.StringTests.csproj index bf2f61d2..813a116b 100644 --- a/src/Sir.StringTests/Sir.StringTests.csproj +++ b/src/Sir.StringTests/Sir.StringTests.csproj @@ -15,10 +15,9 @@ - - - + + diff --git a/src/Sir.Strings/Sir.Strings.csproj b/src/Sir.Strings/Sir.Model.Strings.csproj similarity index 100% rename from src/Sir.Strings/Sir.Strings.csproj rename to src/Sir.Strings/Sir.Model.Strings.csproj diff --git a/src/Sir.Wikipedia/Sir.Wikipedia.csproj b/src/Sir.Wikipedia/Sir.DataProvider.Wikipedia.csproj similarity index 85% rename from src/Sir.Wikipedia/Sir.Wikipedia.csproj rename to src/Sir.Wikipedia/Sir.DataProvider.Wikipedia.csproj index 9da76051..240b2dd6 100644 --- a/src/Sir.Wikipedia/Sir.Wikipedia.csproj +++ b/src/Sir.Wikipedia/Sir.DataProvider.Wikipedia.csproj @@ -14,7 +14,7 @@ - + diff --git a/src/Sir.sln b/src/Sir.sln index 4abeef0d..1b051b8c 100644 --- a/src/Sir.sln +++ b/src/Sir.sln @@ -28,29 +28,25 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.HttpServer", "Sir.HttpS EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.KeyValue", "Sir.KeyValue\Sir.KeyValue.csproj", "{53B25A13-5C26-4344-9AB6-7E998282EF94}" EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "apps", "apps", "{B97268C5-0BFA-4022-BA3F-C07C1F239C8D}" +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "applications", "applications", "{B97268C5-0BFA-4022-BA3F-C07C1F239C8D}" EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{35E2693A-1A42-4690-81A8-D424C3D24AD1}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Documents", "Sir.Document\Sir.Documents.csproj", "{F6AFA2E5-1FEA-4DC6-9386-20315126C7C4}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.CommonCrawl", "Sir.CommonCrawl\Sir.CommonCrawl.csproj", "{304CAC1B-825A-4D89-AE4C-9C0FC5206607}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.DataProvider.CommonCrawl", "Sir.CommonCrawl\Sir.DataProvider.CommonCrawl.csproj", "{304CAC1B-825A-4D89-AE4C-9C0FC5206607}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Mnist", "Sir.Mnist\Sir.Mnist.csproj", "{436647C9-EDFF-44AD-AF2F-ABC4EBA70ED7}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.DataProvider.Mnist", "Sir.Mnist\Sir.DataProvider.Mnist.csproj", "{436647C9-EDFF-44AD-AF2F-ABC4EBA70ED7}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Wikipedia", "Sir.Wikipedia\Sir.Wikipedia.csproj", "{6F3C960C-7652-430C-A253-081E1506BA81}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.DataProvider.Wikipedia", "Sir.Wikipedia\Sir.DataProvider.Wikipedia.csproj", "{6F3C960C-7652-430C-A253-081E1506BA81}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Strings", "Sir.Strings\Sir.Strings.csproj", "{AB275A5B-E72E-475A-8E1A-FFA7A5F9C932}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Model.Strings", "Sir.Strings\Sir.Model.Strings.csproj", "{AB275A5B-E72E-475A-8E1A-FFA7A5F9C932}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.StringCompare", "Sir.StringCompare\Sir.StringCompare.csproj", "{C6050E65-9411-41E3-A6EE-0A45E6FFB4FC}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Cmd", "Sir.Cmd\Sir.Cmd.csproj", "{CEDD3CA9-D38D-43BF-9013-212AE6332CE0}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Images", "Sir.Images\Sir.Images.csproj", "{1DC66643-0C0A-48AC-9019-5C64C002BA32}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "core", "core", "{644A2FAF-6617-41F6-88B7-92F21493B048}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "models", "models", "{0C62844F-36C0-4029-BB36-3C27C0F29272}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Model.Images", "Sir.Images\Sir.Model.Images.csproj", "{1DC66643-0C0A-48AC-9019-5C64C002BA32}" EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "exe", "exe", "{23D1F5F4-6D57-4995-98F6-38EED88C2260}" EndProject @@ -121,17 +117,12 @@ Global HideSolutionNode = FALSE EndGlobalSection GlobalSection(NestedProjects) = preSolution - {2002DD08-0083-4184-BB1A-2469B608DE95} = {644A2FAF-6617-41F6-88B7-92F21493B048} {C94C2F5D-AE55-4157-A74A-26D49EE73E96} = {23D1F5F4-6D57-4995-98F6-38EED88C2260} - {53B25A13-5C26-4344-9AB6-7E998282EF94} = {644A2FAF-6617-41F6-88B7-92F21493B048} - {F6AFA2E5-1FEA-4DC6-9386-20315126C7C4} = {644A2FAF-6617-41F6-88B7-92F21493B048} {304CAC1B-825A-4D89-AE4C-9C0FC5206607} = {B97268C5-0BFA-4022-BA3F-C07C1F239C8D} {436647C9-EDFF-44AD-AF2F-ABC4EBA70ED7} = {B97268C5-0BFA-4022-BA3F-C07C1F239C8D} {6F3C960C-7652-430C-A253-081E1506BA81} = {B97268C5-0BFA-4022-BA3F-C07C1F239C8D} - {AB275A5B-E72E-475A-8E1A-FFA7A5F9C932} = {0C62844F-36C0-4029-BB36-3C27C0F29272} {C6050E65-9411-41E3-A6EE-0A45E6FFB4FC} = {B97268C5-0BFA-4022-BA3F-C07C1F239C8D} {CEDD3CA9-D38D-43BF-9013-212AE6332CE0} = {23D1F5F4-6D57-4995-98F6-38EED88C2260} - {1DC66643-0C0A-48AC-9019-5C64C002BA32} = {0C62844F-36C0-4029-BB36-3C27C0F29272} {23D1F5F4-6D57-4995-98F6-38EED88C2260} = {B97268C5-0BFA-4022-BA3F-C07C1F239C8D} {20F14A4E-99AE-42FB-B447-6B78F1398406} = {35E2693A-1A42-4690-81A8-D424C3D24AD1} {BD85D84A-0F4E-4880-A0CB-128BA9F34EDF} = {35E2693A-1A42-4690-81A8-D424C3D24AD1}