diff --git a/src/AssemblyInfo.cs b/src/AssemblyInfo.cs
index 1965ba97..33a7bc68 100644
--- a/src/AssemblyInfo.cs
+++ b/src/AssemblyInfo.cs
@@ -1,4 +1,4 @@
using System.Reflection;
-[assembly: AssemblyVersion("0.4.0.7")]
-[assembly: AssemblyFileVersion("0.4.0.7")]
\ No newline at end of file
+[assembly: AssemblyVersion("0.5.0.2")]
+[assembly: AssemblyFileVersion("0.5.0.2")]
\ No newline at end of file
diff --git a/src/Sir.Cmd/Sir.Cmd.csproj b/src/Sir.Cmd/Sir.Cmd.csproj
index d0b374ba..a9c09911 100644
--- a/src/Sir.Cmd/Sir.Cmd.csproj
+++ b/src/Sir.Cmd/Sir.Cmd.csproj
@@ -12,10 +12,12 @@
-
+
+
-
-
+
+
+
diff --git a/src/Sir.Mnist/Sir.Mnist.csproj b/src/Sir.CommonCrawl/Sir.DataProvider.CommonCrawl.csproj
similarity index 85%
rename from src/Sir.Mnist/Sir.Mnist.csproj
rename to src/Sir.CommonCrawl/Sir.DataProvider.CommonCrawl.csproj
index d495b3e4..240b2dd6 100644
--- a/src/Sir.Mnist/Sir.Mnist.csproj
+++ b/src/Sir.CommonCrawl/Sir.DataProvider.CommonCrawl.csproj
@@ -13,8 +13,8 @@
-
+
diff --git a/src/Sir.Document/Field.cs b/src/Sir.Document/Field.cs
index c68987d5..bb7d839e 100644
--- a/src/Sir.Document/Field.cs
+++ b/src/Sir.Document/Field.cs
@@ -7,15 +7,13 @@ namespace Sir.Documents
[DebuggerDisplay("{Name}")]
public class Field
{
- private IEnumerable _tokens;
-
public long KeyId { get; set; }
public long DocumentId { get; set; }
public string Name { get; }
public object Value { get; set; }
- public IEnumerable Tokens { get { return _tokens; } }
+ public IEnumerable Tokens { get; }
- public Field(string name, object value, long keyId = -1, long documentId = -1)
+ public Field(string name, object value, long keyId = -1, long documentId = -1, IEnumerable tokens = null)
{
if (name is null) throw new ArgumentNullException(nameof(name));
if (value == null) throw new ArgumentNullException(nameof(value));
@@ -24,6 +22,7 @@ public Field(string name, object value, long keyId = -1, long documentId = -1)
Value = value;
KeyId = keyId;
DocumentId = documentId;
+ Tokens = tokens;
}
}
}
\ No newline at end of file
diff --git a/src/Sir.HttpServer/HttpQueryParser.cs b/src/Sir.HttpServer/HttpQueryParser.cs
index a89a93ce..c7ebc517 100644
--- a/src/Sir.HttpServer/HttpQueryParser.cs
+++ b/src/Sir.HttpServer/HttpQueryParser.cs
@@ -20,7 +20,7 @@ public HttpQueryParser(QueryParser parser)
_parser = parser;
}
- public async Task ParseRequest(HttpRequest request, IEnumerable collections = null)
+ public async Task ParseRequest(HttpRequest request, IEnumerable collections = null)
{
var select = request.Query["select"].ToArray();
@@ -56,7 +56,7 @@ public static async Task DeserializeFromStream(Stream stream)
}
}
- public IQuery ParseFormattedString(string formattedQuery, string[] select)
+ public Query ParseFormattedString(string formattedQuery, string[] select)
{
var document = JsonConvert.DeserializeObject>(
formattedQuery, new JsonConverter[] { new DictionaryConverter() });
@@ -64,12 +64,12 @@ public IQuery ParseFormattedString(string formattedQuery, string[] select)
return ParseDictionary(document, select);
}
- public IQuery ParseDictionary(IDictionary document, string[] select)
+ public Query ParseDictionary(IDictionary document, string[] select)
{
return _parser.Parse(document, select, true);
}
- private void DoParseQuery(IQuery query, IDictionary result)
+ private void DoParseQuery(Query query, IDictionary result)
{
if (result == null)
return;
@@ -114,7 +114,7 @@ private void DoParseQuery(IQuery query, IDictionary result)
}
}
- public void ParseQuery(IQuery query, IDictionary result)
+ public void ParseQuery(Query query, IDictionary result)
{
DoParseQuery(query, result);
}
diff --git a/src/Sir.HttpServer/Sir.HttpServer.csproj b/src/Sir.HttpServer/Sir.HttpServer.csproj
index e8222b5b..cbd7650f 100644
--- a/src/Sir.HttpServer/Sir.HttpServer.csproj
+++ b/src/Sir.HttpServer/Sir.HttpServer.csproj
@@ -46,7 +46,7 @@
-
+
diff --git a/src/Sir.ImageTests/Sir.ImageTests.csproj b/src/Sir.ImageTests/Sir.ImageTests.csproj
index 802d011a..f0a6c946 100644
--- a/src/Sir.ImageTests/Sir.ImageTests.csproj
+++ b/src/Sir.ImageTests/Sir.ImageTests.csproj
@@ -15,10 +15,9 @@
-
-
+
-
+
diff --git a/src/Sir.Images/Sir.Images.csproj b/src/Sir.Images/Sir.Model.Images.csproj
similarity index 100%
rename from src/Sir.Images/Sir.Images.csproj
rename to src/Sir.Images/Sir.Model.Images.csproj
diff --git a/src/Sir.InformationRetreival/Hit.cs b/src/Sir.InformationRetreival/Hit.cs
index ed1c3d89..0f8259d1 100644
--- a/src/Sir.InformationRetreival/Hit.cs
+++ b/src/Sir.InformationRetreival/Hit.cs
@@ -11,9 +11,17 @@ public class Hit
public List PostingsOffsets { get; set; }
public Hit (VectorNode node, double score)
+ {
+ Node = node ?? throw new System.ArgumentNullException(nameof(node));
+ Score = score;
+ PostingsOffsets = new List { node.PostingsOffset };
+ }
+
+ public Hit(VectorNode node, double score, long postingsOffset)
{
Score = score;
Node = node;
+ PostingsOffsets = new List { postingsOffset };
}
public override string ToString()
diff --git a/src/Sir.InformationRetreival/IIndexReadWriteStrategy.cs b/src/Sir.InformationRetreival/IIndexReadWriteStrategy.cs
index d3ae6a30..d5fa434a 100644
--- a/src/Sir.InformationRetreival/IIndexReadWriteStrategy.cs
+++ b/src/Sir.InformationRetreival/IIndexReadWriteStrategy.cs
@@ -7,6 +7,6 @@ public interface IIndexReadWriteStrategy
{
void Put(VectorNode column, VectorNode node);
Hit GetMatchOrNull(ISerializableVector vector, IModel model, ColumnReader reader);
- void Commit(string directory, ulong collectionId, long keyId, VectorNode tree, ILogger logger = null);
+ void SerializePage(string directory, ulong collectionId, long keyId, VectorNode tree, IndexCache indexCache, ILogger logger = null);
}
}
diff --git a/src/Sir.InformationRetreival/IO/ColumnReader.cs b/src/Sir.InformationRetreival/IO/ColumnReader.cs
index e806aa60..9d77a6be 100644
--- a/src/Sir.InformationRetreival/IO/ColumnReader.cs
+++ b/src/Sir.InformationRetreival/IO/ColumnReader.cs
@@ -1,5 +1,4 @@
-using Microsoft.Extensions.Logging;
-using System;
+using System;
using System.Buffers;
using System.Collections.Generic;
using System.IO;
@@ -36,7 +35,7 @@ public Hit ClosestMatchOrNullScanningAllPages(ISerializableVector vector, IModel
{
var hit = ClosestMatchInPage(vector, model, page.offset);
- if (hit.Score > 0)
+ if (hit != null && hit.Score > 0)
{
hits.Add(hit);
}
@@ -71,7 +70,7 @@ public Hit ClosestMatchOrNullStoppingAtFirstIdenticalPage(ISerializableVector ve
{
var hit = ClosestMatchInPage(vector, model, page.offset);
- if (hit.Score > 0)
+ if (hit != null && hit.Score > 0)
{
if (best == null || hit.Score > best.Score)
{
@@ -82,10 +81,10 @@ public Hit ClosestMatchOrNullStoppingAtFirstIdenticalPage(ISerializableVector ve
{
best.PostingsOffsets.Add(hit.Node.PostingsOffset);
}
- }
- if (hit.Score.Approximates(model.IdenticalAngle))
- break;
+ if (hit.Score.Approximates(model.IdenticalAngle))
+ break;
+ }
}
return best;
@@ -110,7 +109,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo
var angle = model.CosAngle(queryVector, vecOffset, (int)componentCount, _vectorFile);
- if (angle >= model.IdenticalAngle)
+ if (angle >= model.IdenticalAngle || angle.Approximates(model.IdenticalAngle))
{
bestScore = angle;
bestNode = new VectorNode(postingsOffset: postingsOffset);
@@ -124,7 +123,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo
bestScore = angle;
bestNode = new VectorNode(postingsOffset: postingsOffset);
}
- else if (angle == bestScore)
+ else if (angle.Approximates(bestScore))
{
bestNode.PostingsOffset = postingsOffset;
}
@@ -153,7 +152,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo
bestScore = angle;
bestNode = new VectorNode(postingsOffset: postingsOffset);
}
- else if (angle > 0 && angle == bestScore)
+ else if (angle > 0 && angle.Approximates(bestScore))
{
bestNode.PostingsOffset = postingsOffset;
}
@@ -188,7 +187,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo
ArrayPool.Shared.Return(block);
- return new Hit(bestNode, bestScore);
+ return bestNode == null ? null : new Hit(bestNode, bestScore);
}
private void SkipTree()
diff --git a/src/Sir.InformationRetreival/IO/GraphBuilder.cs b/src/Sir.InformationRetreival/IO/GraphBuilder.cs
index 093ca565..90a4b2cd 100644
--- a/src/Sir.InformationRetreival/IO/GraphBuilder.cs
+++ b/src/Sir.InformationRetreival/IO/GraphBuilder.cs
@@ -33,7 +33,7 @@ public static void AddOrAppendSupervised(
{
var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector);
- if (angle >= model.IdenticalAngle)
+ if (angle >= model.IdenticalAngle || angle.Approximates(model.IdenticalAngle))
{
if (!cursor.Vector.Label.Equals(node.Vector.Label))
throw new InvalidOperationException($"IdenticalAngle {model.IdenticalAngle} is too low. Angle was {angle}");
@@ -79,7 +79,7 @@ public static void AddOrAppend(
{
var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector);
- if (angle >= model.IdenticalAngle)
+ if (angle.Approximates(model.IdenticalAngle))
{
AppendDocIds(cursor, node);
@@ -123,7 +123,7 @@ public static void AddIfUnique(
{
var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector);
- if (angle >= model.IdenticalAngle)
+ if (angle.Approximates(model.IdenticalAngle))
{
break;
}
@@ -165,7 +165,7 @@ public static bool TryAdd(
{
var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector);
- if (angle >= model.IdenticalAngle)
+ if (angle.Approximates(model.IdenticalAngle))
{
return false;
}
@@ -209,7 +209,7 @@ public static void Build(
{
var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector);
- if (angle >= model.IdenticalAngle)
+ if (angle.Approximates(model.IdenticalAngle))
{
break;
}
@@ -240,8 +240,11 @@ public static void Build(
}
}
- public static void AppendDocIds(this VectorNode target, VectorNode source)
+ private static void AppendDocIds(this VectorNode target, VectorNode source)
{
+ if (target.DocIds == null || source.DocIds == null)
+ return;
+
foreach (var d in source.DocIds)
target.DocIds.Add(d);
}
@@ -297,7 +300,7 @@ public static (long offset, long length) SerializeTree(this VectorNode node, Str
{
if (node.PostingsOffset == -1 && postingsWriter != null)
{
- postingsWriter.SerializePostings(node);
+ node.PostingsOffset = postingsWriter.SerializePostings(node);
}
if (vectorStream != null)
diff --git a/src/Sir.InformationRetreival/IO/PathFinder.cs b/src/Sir.InformationRetreival/IO/PathFinder.cs
index e098aebc..79836c5c 100644
--- a/src/Sir.InformationRetreival/IO/PathFinder.cs
+++ b/src/Sir.InformationRetreival/IO/PathFinder.cs
@@ -25,7 +25,7 @@ public static Hit ClosestMatch(VectorNode root, ISerializableVector vector, IMod
best = cursor;
}
- if (angle >= model.IdenticalAngle)
+ if (angle >= model.IdenticalAngle || angle.Approximates(model.IdenticalAngle))
{
break;
}
diff --git a/src/Sir.InformationRetreival/IO/PostingsReader.cs b/src/Sir.InformationRetreival/IO/PostingsReader.cs
index cb9cd7b2..7e07e3fe 100644
--- a/src/Sir.InformationRetreival/IO/PostingsReader.cs
+++ b/src/Sir.InformationRetreival/IO/PostingsReader.cs
@@ -25,10 +25,10 @@ public PostingsReader(string directory, ulong collectionId, long keyId, ILogger
_collectionId = collectionId;
}
- public IList<(ulong, long)> Read(long keyId, IList offsets)
+ public HashSet<(ulong, long)> Read(long keyId, IList offsets)
{
var time = Stopwatch.StartNew();
- var documents = new List<(ulong, long)>();
+ var documents = new HashSet<(ulong, long)>(); // collection ID, document ID
foreach (var offset in offsets)
GetPostingsFromStream(keyId, offset, documents);
@@ -39,20 +39,21 @@ public PostingsReader(string directory, ulong collectionId, long keyId, ILogger
return documents;
}
- private void GetPostingsFromStream(long keyId, long postingsOffset, List<(ulong collectionId, long docId)> documents)
+ private void GetPostingsFromStream(long keyId, long postingsOffset, HashSet<(ulong collectionId, long docId)> postings)
{
+ // seek to page
_stream.Seek(postingsOffset, SeekOrigin.Begin);
var headerLen = sizeof(long) * 2;
- var headerBuf = ArrayPool.Shared.Rent(headerLen);
+ // read header
+ var headerBuf = ArrayPool.Shared.Rent(headerLen);
_stream.Read(headerBuf, 0, headerLen);
-
var numOfPostings = BitConverter.ToInt64(headerBuf);
var addressOfNextPage = BitConverter.ToInt64(headerBuf, sizeof(long));
-
ArrayPool.Shared.Return(headerBuf);
+ // read postings
var listLen = sizeof(long) * numOfPostings;
var listBuf = new byte[listLen];
var read = _stream.Read(listBuf);
@@ -62,12 +63,12 @@ private void GetPostingsFromStream(long keyId, long postingsOffset, List<(ulong
foreach (var docId in MemoryMarshal.Cast(listBuf))
{
- documents.Add((_collectionId, docId));
+ postings.Add((_collectionId, docId));
}
if (addressOfNextPage > 0)
{
- GetPostingsFromStream(keyId, addressOfNextPage, documents);
+ GetPostingsFromStream(keyId, addressOfNextPage, postings);
}
}
diff --git a/src/Sir.InformationRetreival/IO/PostingsWriter.cs b/src/Sir.InformationRetreival/IO/PostingsWriter.cs
index 1ba2f9d3..780e1628 100644
--- a/src/Sir.InformationRetreival/IO/PostingsWriter.cs
+++ b/src/Sir.InformationRetreival/IO/PostingsWriter.cs
@@ -5,36 +5,77 @@ namespace Sir.IO
{
public class PostingsWriter : IDisposable
{
- private readonly Stream _postingsStream;
+ private readonly Stream _stream;
+ private readonly IndexCache _indexCache;
- public PostingsWriter(Stream postingsStream)
+ public PostingsWriter(Stream postingsStream, IndexCache indexCache = null)
{
- _postingsStream = postingsStream;
+ _stream = postingsStream;
+ _indexCache = indexCache;
+
+ _stream.Seek(0, SeekOrigin.End);
}
- public void SerializePostings(VectorNode node)
+ public long SerializePostings(VectorNode node)
{
if (node.DocIds.Count == 0) throw new ArgumentException("can't be empty", nameof(node.DocIds));
- node.PostingsOffset = _postingsStream.Position;
+ /* --------------- */
+ /* write new page */
+ /* ------------- */
+
+ // store stream position
+ var postingsOffset = _stream.Position;
- // serialize item count
- _postingsStream.Write(BitConverter.GetBytes((long)node.DocIds.Count));
+ // serialize postings count
+ _stream.Write(BitConverter.GetBytes((long)node.DocIds.Count));
// serialize address of next page (unknown at this time)
- _postingsStream.Write(BitConverter.GetBytes((long)0));
+ _stream.Write(BitConverter.GetBytes((long)0));
+ // serialize document IDs
foreach (var docId in node.DocIds)
{
- _postingsStream.Write(BitConverter.GetBytes(docId));
+ _stream.Write(BitConverter.GetBytes(docId));
+ }
+
+ long? existingPostingsOffset = null;
+
+ if (_indexCache != null)
+ {
+ existingPostingsOffset = _indexCache.GetPostingsOffset(node.KeyId.Value, node.Vector);
+ }
+
+ if (existingPostingsOffset.HasValue && existingPostingsOffset.Value > 0)
+ {
+ /* ------------------------------------ */
+ /* reference new page in existing page */
+ /* ---------------------------------- */
+
+ // rewind stream to existing postings page header
+ _stream.Seek(existingPostingsOffset.Value+sizeof(long), SeekOrigin.Begin);
+
+ // set this as next page of existing postings page
+ _stream.Write(BitConverter.GetBytes(postingsOffset));
+
+ // go back to end of stream
+ _stream.Seek(0, SeekOrigin.End);
+
+ // set this as offset of existing postings page
+ _indexCache.UpdatePostingsOffset(node.KeyId.Value, node.Vector, postingsOffset);
+ }
+ else if (_indexCache != null)
+ {
+ _indexCache.Put(new VectorNode(vector: node.Vector, postingsOffset: postingsOffset, keyId: node.KeyId));
}
+ return postingsOffset;
}
public void Dispose()
{
- if (_postingsStream != null )
+ if (_stream != null )
{
- _postingsStream.Dispose();
+ _stream.Dispose();
}
}
}
diff --git a/src/Sir.InformationRetreival/IO/PostingsReadOrchestrator.cs b/src/Sir.InformationRetreival/IO/TermPostingsMapper.cs
similarity index 65%
rename from src/Sir.InformationRetreival/IO/PostingsReadOrchestrator.cs
rename to src/Sir.InformationRetreival/IO/TermPostingsMapper.cs
index 7e4fe2dc..7e325207 100644
--- a/src/Sir.InformationRetreival/IO/PostingsReadOrchestrator.cs
+++ b/src/Sir.InformationRetreival/IO/TermPostingsMapper.cs
@@ -5,19 +5,19 @@
namespace Sir.IO
{
///
- /// Read postings lists from storage and map them to query terms
+ /// Read postings from storage and map them to query terms.
///
- public class PostingsReadOrchestrator : IDisposable
+ public class TermPostingsMapper : IDisposable
{
- private readonly Dictionary<(string, ulong, long), PostingsReader> _readers = new Dictionary<(string, ulong, long), PostingsReader>();
+ private readonly Dictionary<(string directory, ulong collectionId, long keyId), PostingsReader> _readers = new Dictionary<(string, ulong, long), PostingsReader>();
private readonly ILogger _logger;
- public PostingsReadOrchestrator(ILogger logger = null)
+ public TermPostingsMapper(ILogger logger = null)
{
_logger = logger;
}
- public void ReadAndMapPostings(IQuery query)
+ public void ReadAndMap(Query query)
{
foreach (var term in query.AllTerms())
{
@@ -30,11 +30,7 @@ public void ReadAndMapPostings(IQuery query)
if (!_readers.TryGetValue(key, out reader))
{
reader = new PostingsReader(term.Directory, term.CollectionId, term.KeyId, _logger);
-
- if (reader != null)
- {
- _readers.Add(key, reader);
- }
+ _readers.Add(key, reader);
}
if (reader != null)
diff --git a/src/Sir.InformationRetreival/IQuery.cs b/src/Sir.InformationRetreival/IQuery.cs
deleted file mode 100644
index f5ed91af..00000000
--- a/src/Sir.InformationRetreival/IQuery.cs
+++ /dev/null
@@ -1,21 +0,0 @@
-using System.Collections.Generic;
-
-namespace Sir
-{
- public interface IQuery
- {
- IQuery AndQuery { get; set; }
- IQuery NotQuery { get; set; }
- IQuery OrQuery { get; set; }
- HashSet Select { get; }
- IList Terms { get; }
- bool IsUnion { get; set; }
- bool IsIntersection { get; set; }
-
- IEnumerable All();
- IEnumerable AllTerms();
- int GetCollectionCount();
- void GetNumOfCollections(HashSet dic);
- int TotalNumberOfTerms();
- }
-}
\ No newline at end of file
diff --git a/src/Sir.InformationRetreival/IReducer.cs b/src/Sir.InformationRetreival/IReducer.cs
index 4592e1d9..411b8714 100644
--- a/src/Sir.InformationRetreival/IReducer.cs
+++ b/src/Sir.InformationRetreival/IReducer.cs
@@ -4,6 +4,6 @@ namespace Sir.Strings
{
public interface IReducer
{
- void Reduce(IQuery mappedQuery, ref IDictionary<(ulong, long), double> result);
+ void Reduce(Query mappedQuery, ref IDictionary<(ulong, long), double> result);
}
}
\ No newline at end of file
diff --git a/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs b/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs
index fd9e341e..95bc4416 100644
--- a/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs
+++ b/src/Sir.InformationRetreival/IndexingStrategies/LogStructuredIndexingStrategy.cs
@@ -24,12 +24,12 @@ public void Put(VectorNode column, VectorNode node)
column.AddOrAppend(node, _model);
}
- public void Commit(string directory, ulong collectionId, long keyId, VectorNode tree, ILogger logger = null)
+ public void SerializePage(string directory, ulong collectionId, long keyId, VectorNode tree, IndexCache indexCache, ILogger logger = null)
{
var time = Stopwatch.StartNew();
using (var vectorStream = StreamFactory.CreateAppendStream(directory, collectionId, keyId, "vec"))
- using (var postingsWriter = new PostingsWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "pos")))
+ using (var postingsWriter = new PostingsWriter(StreamFactory.CreateSeekableWriteStream(directory, collectionId, keyId, "pos"), indexCache:null))
using (var columnWriter = new ColumnWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "ix")))
using (var pageIndexWriter = new PageIndexWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "ixtp")))
{
diff --git a/src/Sir.InformationRetreival/IndexingStrategies/SupervisedLogStructuredIndexingStrategy.cs b/src/Sir.InformationRetreival/IndexingStrategies/SupervisedLogStructuredIndexingStrategy.cs
index 9eda1c4e..29178aad 100644
--- a/src/Sir.InformationRetreival/IndexingStrategies/SupervisedLogStructuredIndexingStrategy.cs
+++ b/src/Sir.InformationRetreival/IndexingStrategies/SupervisedLogStructuredIndexingStrategy.cs
@@ -24,12 +24,12 @@ public void Put(VectorNode column, VectorNode node)
column.AddOrAppendSupervised(node, _model);
}
- public void Commit(string directory, ulong collectionId, long keyId, VectorNode tree, ILogger logger = null)
+ public void SerializePage(string directory, ulong collectionId, long keyId, VectorNode tree, IndexCache indexCache, ILogger logger = null)
{
var time = Stopwatch.StartNew();
using (var vectorStream = StreamFactory.CreateAppendStream(directory, collectionId, keyId, "vec"))
- using (var postingsWriter = new PostingsWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "pos")))
+ using (var postingsWriter = new PostingsWriter(StreamFactory.CreateSeekableWriteStream(directory, collectionId, keyId, "pos"), indexCache))
using (var columnWriter = new ColumnWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "ix")))
using (var pageIndexWriter = new PageIndexWriter(StreamFactory.CreateAppendStream(directory, collectionId, keyId, "ixtp")))
{
diff --git a/src/Sir.InformationRetreival/Parsers/QueryParser.cs b/src/Sir.InformationRetreival/Parsers/QueryParser.cs
index 444c5b3e..081f4705 100644
--- a/src/Sir.InformationRetreival/Parsers/QueryParser.cs
+++ b/src/Sir.InformationRetreival/Parsers/QueryParser.cs
@@ -20,7 +20,7 @@ public QueryParser(KeyValueReader kvReader, IModel model, SortedList();
}
- public IQuery Parse(
+ public Query Parse(
ulong collectionId,
T query,
string field,
@@ -41,7 +41,7 @@ public IQuery Parse(
return new Query(terms, new string[] { select }, and, or, !and && !or);
}
- public IQuery Parse(
+ public Query Parse(
string collection,
T query,
string field,
@@ -62,7 +62,7 @@ public IQuery Parse(
return new Query(terms, new string[] { select }, and, or, !and && !or);
}
- public IQuery Parse(
+ public Query Parse(
IEnumerable collections,
T q,
string[] fields,
@@ -141,7 +141,7 @@ public IQuery Parse(
return Parse(root, select, label);
}
- public IQuery Parse(dynamic document, IEnumerable select, bool label)
+ public Query Parse(dynamic document, IEnumerable select, bool label)
{
Query root = null;
Query cursor = null;
diff --git a/src/Sir.InformationRetreival/Query.cs b/src/Sir.InformationRetreival/Query.cs
index 86f980c8..399e533d 100644
--- a/src/Sir.InformationRetreival/Query.cs
+++ b/src/Sir.InformationRetreival/Query.cs
@@ -14,13 +14,13 @@ namespace Sir
/// }
/// }
///
- public class Query : BooleanStatement, IQuery
+ public class Query : BooleanStatement
{
public IList Terms { get; }
public HashSet Select { get; }
- public IQuery AndQuery { get; set; }
- public IQuery OrQuery { get; set; }
- public IQuery NotQuery { get; set; }
+ public Query AndQuery { get; set; }
+ public Query OrQuery { get; set; }
+ public Query NotQuery { get; set; }
public Query(
IList terms,
@@ -90,7 +90,7 @@ public IEnumerable AllTerms()
yield return term;
}
- public IEnumerable All()
+ public IEnumerable All()
{
yield return this;
diff --git a/src/Sir.InformationRetreival/Scorer.cs b/src/Sir.InformationRetreival/Scorer.cs
index 68445715..ef3bd1d0 100644
--- a/src/Sir.InformationRetreival/Scorer.cs
+++ b/src/Sir.InformationRetreival/Scorer.cs
@@ -7,7 +7,7 @@ public class Scorer
///
/// Reduce query to a list of scored document IDs.
///
- public void Score(IQuery query, ref IDictionary<(ulong CollectionId, long DocumentId), double> result)
+ public void Score(Query query, ref IDictionary<(ulong CollectionId, long DocumentId), double> result)
{
IDictionary<(ulong, long), double> queryResult = new Dictionary<(ulong, long), double>();
diff --git a/src/Sir.InformationRetreival/SearchResult.cs b/src/Sir.InformationRetreival/SearchResult.cs
index 4fc023d0..b8791246 100644
--- a/src/Sir.InformationRetreival/SearchResult.cs
+++ b/src/Sir.InformationRetreival/SearchResult.cs
@@ -5,12 +5,12 @@ namespace Sir
{
public class SearchResult
{
- public IQuery Query { get; }
+ public Query Query { get; }
public long Total { get; }
public IEnumerable Documents { get; }
public int Count { get; }
- public SearchResult(IQuery query, long total, int count, IEnumerable documents)
+ public SearchResult(Query query, long total, int count, IEnumerable documents)
{
Query = query;
Total = total;
diff --git a/src/Sir.InformationRetreival/Session/DocumentDatabase.cs b/src/Sir.InformationRetreival/Session/DocumentDatabase.cs
index f266bd8a..373cb568 100644
--- a/src/Sir.InformationRetreival/Session/DocumentDatabase.cs
+++ b/src/Sir.InformationRetreival/Session/DocumentDatabase.cs
@@ -18,9 +18,9 @@ public class DocumentDatabase : IDisposable
private WriteSession _writeSession;
private IndexSession _indexSession;
private SearchSession _searchSession;
+ private IndexCache _indexCache;
private readonly IModel _model;
private readonly ILogger _logger;
-
public IndexSession IndexSession { get { return _indexSession; } }
public SearchSession SearchSession { get { return _searchSession; } }
@@ -30,8 +30,9 @@ public DocumentDatabase(string directory, ulong collectionId, IModel model =
_collectionId = collectionId;
_model = model;
_indexStrategy = indexStrategy;
+ _indexCache = new IndexCache(_model);
_writeSession = new WriteSession(new DocumentRegistryWriter(directory, collectionId));
- _indexSession = new IndexSession(directory, collectionId, model, indexStrategy, logger);
+ _indexSession = new IndexSession(directory, collectionId, model, indexStrategy, _indexCache, logger);
_searchSession = new SearchSession(directory, _model, _indexStrategy, logger);
_logger = logger;
}
@@ -46,7 +47,7 @@ public IEnumerable StreamDocuments(HashSet fieldsOfInterest, i
return _searchSession.ReadDocuments(_collectionId, fieldsOfInterest, skip, take);
}
- public SearchResult Read(IQuery query, int skip, int take)
+ public SearchResult Read(Query query, int skip, int take)
{
return _searchSession.Search(query, skip, take);
}
@@ -104,7 +105,7 @@ public void Truncate()
LogInformation($"truncated collection {_collectionId} ({count} files affected)");
_writeSession = new WriteSession(new DocumentRegistryWriter(_directory, _collectionId));
- _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _logger);
+ _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _indexCache, _logger);
_searchSession = new SearchSession(_directory, _model, _indexStrategy, _logger);
}
@@ -143,7 +144,7 @@ public void TruncateIndexOnly()
LogInformation($"truncated index {_collectionId} ({count} files affected)");
_writeSession = new WriteSession(new DocumentRegistryWriter(_directory, _collectionId));
- _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _logger);
+ _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _indexCache, _logger);
_searchSession = new SearchSession(_directory, _model, _indexStrategy, _logger);
}
@@ -165,7 +166,7 @@ public void Rename(ulong newCollectionId)
_collectionId = newCollectionId;
_writeSession = new WriteSession(new DocumentRegistryWriter(_directory, _collectionId));
- _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _logger);
+ _indexSession = new IndexSession(_directory, _collectionId, _model, _indexStrategy, _indexCache, _logger);
_searchSession = new SearchSession(_directory, _model, _indexStrategy, _logger);
}
diff --git a/src/Sir.InformationRetreival/Session/ISearchSession.cs b/src/Sir.InformationRetreival/Session/ISearchSession.cs
index cbe013a2..eb44d549 100644
--- a/src/Sir.InformationRetreival/Session/ISearchSession.cs
+++ b/src/Sir.InformationRetreival/Session/ISearchSession.cs
@@ -5,7 +5,7 @@ namespace Sir
{
public interface ISearchSession : IDisposable
{
- SearchResult Search(IQuery query, int skip, int take);
- Document SearchScalar(IQuery query);
+ SearchResult Search(Query query, int skip, int take);
+ Document SearchScalar(Query query);
}
}
\ No newline at end of file
diff --git a/src/Sir.InformationRetreival/Session/IndexCache.cs b/src/Sir.InformationRetreival/Session/IndexCache.cs
new file mode 100644
index 00000000..2adb4a7d
--- /dev/null
+++ b/src/Sir.InformationRetreival/Session/IndexCache.cs
@@ -0,0 +1,97 @@
+using Sir.IO;
+using System;
+using System.Collections.Generic;
+
+namespace Sir
+{
+    /// <summary>
+    /// Keeps one in-memory vector tree per key ID so that the postings offset
+    /// recorded for a vector can be looked up and updated.
+    /// </summary>
+    public class IndexCache
+    {
+        private readonly IModel _model;
+        private readonly IDictionary<long, VectorNode> _cache; // indices by key ID
+
+        /// <summary>Creates an empty cache.</summary>
+        /// <param name="model">Model passed to the tree builder and path finder when comparing vectors.</param>
+        public IndexCache(IModel model)
+        {
+            _model = model;
+            _cache = new Dictionary<long, VectorNode>();
+        }
+
+        /// <summary>
+        /// Adds <paramref name="node"/> to the tree of its key ID, creating that tree on first use.
+        /// </summary>
+        /// <param name="node">Node to cache; it must carry a key ID.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="node"/> is null.</exception>
+        /// <exception cref="ArgumentException"><paramref name="node"/> has no key ID.</exception>
+        public void Put(VectorNode node)
+        {
+            if (node is null)
+                throw new ArgumentNullException(nameof(node));
+
+            if (!node.KeyId.HasValue)
+                throw new ArgumentException(message:"VectorNode does not have a key ID.", paramName:nameof(node));
+
+            if (!_cache.TryGetValue(node.KeyId.Value, out var tree))
+            {
+                tree = new VectorNode();
+                _cache.Add(node.KeyId.Value, tree);
+            }
+
+            GraphBuilder.AddOrAppend(tree, node, _model);
+        }
+
+        /// <summary>
+        /// Gets the postings offset cached for the node identical to <paramref name="vector"/>.
+        /// </summary>
+        /// <param name="keyId">Key ID whose tree is searched.</param>
+        /// <param name="vector">Vector to look up.</param>
+        /// <returns>The cached offset, or null when there is no identical node or its offset is -1.</returns>
+        public long? GetPostingsOffset(long keyId, ISerializableVector vector)
+        {
+            var match = FindIdenticalNode(keyId, vector);
+
+            if (match == null || match.PostingsOffset == -1)
+                return null;
+
+            return match.PostingsOffset;
+        }
+
+        /// <summary>
+        /// Overwrites the postings offset of the node identical to <paramref name="vector"/>; does nothing when there is none.
+        /// </summary>
+        /// <param name="keyId">Key ID whose tree is searched.</param>
+        /// <param name="vector">Vector to look up.</param>
+        /// <param name="postingsOffset">Offset to store on the matching node.</param>
+        public void UpdatePostingsOffset(long keyId, ISerializableVector vector, long postingsOffset)
+        {
+            var match = FindIdenticalNode(keyId, vector);
+
+            if (match != null)
+                match.PostingsOffset = postingsOffset;
+        }
+
+        /// <summary>
+        /// Returns the closest cached node when its score counts as identical, otherwise null.
+        /// </summary>
+        private VectorNode FindIdenticalNode(long keyId, ISerializableVector vector)
+        {
+            if (!_cache.TryGetValue(keyId, out var tree))
+                return null;
+
+            var hit = PathFinder.ClosestMatch(tree, vector, _model);
+
+            if (hit == null)
+                return null;
+
+            // Use the same identity test as PathFinder.ClosestMatch so that a node merged by
+            // GraphBuilder.AddOrAppend (which accepts Approximates) is also found again here.
+            var identical = hit.Score >= _model.IdenticalAngle || hit.Score.Approximates(_model.IdenticalAngle);
+
+            return identical ? hit.Node : null;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Sir.InformationRetreival/Session/IndexSession.cs b/src/Sir.InformationRetreival/Session/IndexSession.cs
index 452252df..77eb1834 100644
--- a/src/Sir.InformationRetreival/Session/IndexSession.cs
+++ b/src/Sir.InformationRetreival/Session/IndexSession.cs
@@ -18,6 +18,7 @@ public class IndexSession : IIndexSession, IDisposable
private readonly string _directory;
private readonly ulong _collectionId;
private readonly ILogger _logger;
+ private readonly IndexCache _indexCache;
public SortedList EmptyEmbedding = new SortedList();
@@ -26,6 +27,7 @@ public IndexSession(
ulong collectionId,
IModel model,
IIndexReadWriteStrategy indexingStrategy,
+ IndexCache indexCache,
ILogger logger = null)
{
_model = model;
@@ -34,6 +36,7 @@ public IndexSession(
_directory = directory;
_collectionId = collectionId;
_logger = logger;
+ _indexCache = indexCache;
}
public void Put(long docId, long keyId, T value, bool label)
@@ -57,7 +60,7 @@ public void Put(long docId, long keyId, IEnumerable tokens)
{
_indexingStrategy.Put(
column,
- new VectorNode(vector: token, docId: docId));
+ new VectorNode(vector:token, docId:docId, keyId:keyId));
}
}
@@ -93,7 +96,7 @@ public void Commit(long keyId)
var column = _index[keyId];
- _indexingStrategy.Commit(_directory, _collectionId, keyId, column, _logger);
+ _indexingStrategy.SerializePage(_directory, _collectionId, keyId, column, _indexCache, _logger);
if (_logger != null)
_logger.LogInformation($"committing index to disk for key {keyId} took {time.Elapsed}");
diff --git a/src/Sir.InformationRetreival/Session/IndexWriter.cs b/src/Sir.InformationRetreival/Session/IndexWriter.cs
deleted file mode 100644
index b6d61420..00000000
--- a/src/Sir.InformationRetreival/Session/IndexWriter.cs
+++ /dev/null
@@ -1,38 +0,0 @@
-using Microsoft.Extensions.Logging;
-using System;
-using System.Collections.Generic;
-
-namespace Sir
-{
- ///
- /// Write a collection of indices to disk.
- ///
- public class IndexWriter : IDisposable
- {
- private readonly string _directory;
- private readonly ulong _collectionId;
- private readonly ILogger _logger;
-
- public IndexWriter(
- string directory,
- ulong collectionId,
- ILogger logger = null)
- {
- _directory = directory;
- _collectionId = collectionId;
- _logger = logger;
- }
-
- public void Dispose()
- {
- }
-
- public void Commit(IDictionary index, IIndexReadWriteStrategy indexingStrategy)
- {
- foreach (var column in index)
- {
- indexingStrategy.Commit(_directory, _collectionId, column.Key, column.Value);
- }
- }
- }
-}
\ No newline at end of file
diff --git a/src/Sir.InformationRetreival/Session/SearchSession.cs b/src/Sir.InformationRetreival/Session/SearchSession.cs
index 996f6450..caa32729 100644
--- a/src/Sir.InformationRetreival/Session/SearchSession.cs
+++ b/src/Sir.InformationRetreival/Session/SearchSession.cs
@@ -15,7 +15,7 @@ public class SearchSession : DocumentStreamSession, IDisposable, ISearchSessi
{
private readonly IModel _model;
private readonly IIndexReadWriteStrategy _indexStrategy;
- private readonly PostingsReadOrchestrator _postingsReadOrchestrator;
+ private readonly TermPostingsMapper _termPostingsMapper;
private readonly Scorer _scorer;
private readonly ILogger _logger;
private readonly Dictionary<(string, ulong, long), ColumnReader> _readers;
@@ -25,12 +25,12 @@ public SearchSession(
IModel model,
IIndexReadWriteStrategy indexStrategy,
ILogger logger = null,
- PostingsReadOrchestrator postingsResolver = null,
+ TermPostingsMapper termPostingsMapper = null,
Scorer scorer = null) : base(directory)
{
_model = model;
_indexStrategy = indexStrategy;
- _postingsReadOrchestrator = postingsResolver ?? new PostingsReadOrchestrator(logger);
+ _termPostingsMapper = termPostingsMapper ?? new TermPostingsMapper(logger);
_scorer = scorer ?? new Scorer();
_logger = logger;
_readers = new Dictionary<(string, ulong, long), ColumnReader>();
@@ -48,7 +48,7 @@ public override void ClearCachedReaders()
base.ClearCachedReaders();
}
- public SearchResult Search(IQuery query, int skip, int take)
+ public SearchResult Search(Query query, int skip, int take)
{
var result = OrchestrateSearch(query, skip, take, false);
@@ -64,7 +64,7 @@ public SearchResult Search(IQuery query, int skip, int take)
return new SearchResult(query, 0, 0, System.Linq.Enumerable.Empty());
}
- public Document SearchScalar(IQuery query)
+ public Document SearchScalar(Query query)
{
var result = OrchestrateSearch(query, 0, 1, true);
@@ -80,7 +80,7 @@ public Document SearchScalar(IQuery query)
return null;
}
- public SearchResult SearchIdentical(IQuery query, int take)
+ public SearchResult SearchIdentical(Query query, int take)
{
var result = OrchestrateSearch(query, 0, take, true);
@@ -96,7 +96,7 @@ public SearchResult SearchIdentical(IQuery query, int take)
return new SearchResult(query, 0, 0, System.Linq.Enumerable.Empty());
}
- private ScoredResult OrchestrateSearch(IQuery query, int skip, int take, bool identicalMatchesOnly)
+ private ScoredResult OrchestrateSearch(Query query, int skip, int take, bool identicalMatchesOnly)
{
var timer = Stopwatch.StartNew();
@@ -107,7 +107,7 @@ private ScoredResult OrchestrateSearch(IQuery query, int skip, int take, bool id
timer.Restart();
// Read postings.
- _postingsReadOrchestrator.ReadAndMapPostings(query);
+ _termPostingsMapper.ReadAndMap(query);
LogDebug($"reading postings took {timer.Elapsed}");
timer.Restart();
@@ -130,7 +130,7 @@ private ScoredResult OrchestrateSearch(IQuery query, int skip, int take, bool id
///
/// Scans the index to find the query's closest matching nodes and records their posting list addresses.
///
- private void Scan(IQuery query, bool identicalMatchesOnly)
+ private void Scan(Query query, bool identicalMatchesOnly)
{
if (query == null)
return;
@@ -229,7 +229,9 @@ private IList ReadDocs(
var doc = ReadDocument(d.Key, select, d.Value * scoreMultiplier);
if (doc != null)
+ {
result.Add(doc);
+ }
}
LogDebug($"reading documents took {timer.Elapsed}");
@@ -257,8 +259,8 @@ private void LogError(Exception ex, string message)
public override void Dispose()
{
- if (_postingsReadOrchestrator!= null)
- _postingsReadOrchestrator.Dispose();
+ if (_termPostingsMapper!= null)
+ _termPostingsMapper.Dispose();
foreach (var reader in _readers.Values)
{
diff --git a/src/Sir.InformationRetreival/Term.cs b/src/Sir.InformationRetreival/Term.cs
index 1518d054..8a9977a3 100644
--- a/src/Sir.InformationRetreival/Term.cs
+++ b/src/Sir.InformationRetreival/Term.cs
@@ -12,7 +12,7 @@ public class Term : BooleanStatement
public ulong CollectionId { get; }
public IList PostingsOffsets { get; set; }
public double Score { get; set; }
- public IList<(ulong collectionId, long documentId)> DocumentIds { get; set; }
+ public HashSet<(ulong collectionId, long documentId)> DocumentIds { get; set; }
public object Label => Vector.Label;
public Term(
diff --git a/src/Sir.KeyValue/StreamFactory.cs b/src/Sir.KeyValue/StreamFactory.cs
index a81baeba..b84caf56 100644
--- a/src/Sir.KeyValue/StreamFactory.cs
+++ b/src/Sir.KeyValue/StreamFactory.cs
@@ -37,5 +37,22 @@ public static Stream CreateAppendStream(string directory, ulong collectionId, lo
return new FileStream(fileName, FileMode.Append, FileAccess.Write, FileShare.ReadWrite);
}
+
+ /// <summary>
+ /// Opens the file "{collectionId}.{keyId}.{fileExtension}" in <paramref name="directory"/> for writing, creating the directory and the file when they are missing.
+ /// The file is opened with FileMode.OpenOrCreate rather than FileMode.Append, so existing content is kept and the stream starts at the beginning of the file.
+ /// </summary>
+ public static Stream CreateSeekableWriteStream(string directory, ulong collectionId, long keyId, string fileExtension)
+ {
+ // Directory.CreateDirectory does nothing when the directory already exists, so no Exists check is needed.
+ System.IO.Directory.CreateDirectory(directory);
+
+ var fileName = Path.Combine(directory, $"{collectionId}.{keyId}.{fileExtension}");
+
+ // OpenOrCreate creates a missing file and opens it in a single call. The previous
+ // File.Exists check followed by a separate create-then-reopen could fail with
+ // FileNotFoundException if the file was removed between the two steps.
+ return new FileStream(fileName, FileMode.OpenOrCreate, FileAccess.Write, FileShare.ReadWrite);
+ }
}
}
diff --git a/src/Sir.CommonCrawl/Sir.CommonCrawl.csproj b/src/Sir.Mnist/Sir.DataProvider.Mnist.csproj
similarity index 86%
rename from src/Sir.CommonCrawl/Sir.CommonCrawl.csproj
rename to src/Sir.Mnist/Sir.DataProvider.Mnist.csproj
index 9da76051..f97eb21d 100644
--- a/src/Sir.CommonCrawl/Sir.CommonCrawl.csproj
+++ b/src/Sir.Mnist/Sir.DataProvider.Mnist.csproj
@@ -13,8 +13,8 @@
+
-
diff --git a/src/Sir.StringCompare/Sir.StringCompare.csproj b/src/Sir.StringCompare/Sir.StringCompare.csproj
index 418818e4..008bde84 100644
--- a/src/Sir.StringCompare/Sir.StringCompare.csproj
+++ b/src/Sir.StringCompare/Sir.StringCompare.csproj
@@ -16,7 +16,7 @@
-
+
diff --git a/src/Sir.StringTests/BagOfCharsDatabaseTests.cs b/src/Sir.StringTests/BagOfCharsDatabaseTests.cs
index c196e69f..035227bb 100644
--- a/src/Sir.StringTests/BagOfCharsDatabaseTests.cs
+++ b/src/Sir.StringTests/BagOfCharsDatabaseTests.cs
@@ -13,7 +13,8 @@ public class BagOfCharsDatabaseTests
{
private ILoggerFactory _loggerFactory;
private string _directory = Path.Combine(Environment.CurrentDirectory, "testdata");
- private readonly string[] _data = ["Ferriman–Gallwey score", "apples", "apricote", "apricots", "avocado", "avocados", "banana", "bananas", "blueberry", "blueberries", "cantalope"];
+ private readonly string[] _dataPage0 = ["Ferriman–Gallwey score", "apples", "apricote", "apricots", "avocado", "avocados", "banana", "bananas", "blueberry", "blueberries", "cantalope"];
+ private readonly string[] _dataPage1 = ["score", "apples and teddybears", "apricote sauce", "hey baberibba", "avocado sundae", "avocados are nice", "banana split", "I'm going bananas", "blueberry pie", "blueberries and sauce", "cantalope"];
[Test]
public void Can_stream()
@@ -21,7 +22,7 @@ public void Can_stream()
var model = new BagOfCharsModel();
var strategy = new LogStructuredIndexingStrategy(model);
var collectionId = "BagOfCharsDatabaseTests.Can_stream".ToHash();
- var documents = _data.Select(x => new Document(new Field[] {new Field("title", x)})).ToList();
+ var documents = _dataPage0.Select(x => new Document(new Field[] {new Field("title", x)})).ToList();
using (var database = new DocumentDatabase(_directory, collectionId, model, strategy, _loggerFactory.CreateLogger("Debug")))
{
@@ -41,7 +42,7 @@ public void Can_stream()
Assert.DoesNotThrow(() =>
{
var documentWas = document.Get("title").Value;
- var documentShouldBe = _data[i++];
+ var documentShouldBe = _dataPage0[i++];
if (!documentShouldBe.Equals(documentWas))
{
@@ -58,7 +59,7 @@ public void Can_read_and_write()
var model = new BagOfCharsModel();
var strategy = new LogStructuredIndexingStrategy(model);
var collectionId = "BagOfCharsDatabaseTests.Can_read_and_write".ToHash();
- var documents = _data.Select(x => new Document(new Field[] { new Field("title", x) })).ToList();
+ var documents = _dataPage0.Select(x => new Document(new Field[] { new Field("title", x) })).ToList();
using (var database = new DocumentDatabase(_directory, collectionId, model, strategy, _loggerFactory.CreateLogger("Debug")))
{
@@ -73,7 +74,7 @@ public void Can_read_and_write()
var queryParser = database.CreateQueryParser();
- foreach (var word in _data)
+ foreach (var word in _dataPage0)
{
Assert.DoesNotThrow(() =>
{
@@ -92,13 +93,97 @@ public void Can_read_and_write()
}
}
+ [Test]
+ public void Can_read_and_write_paged()
+ {
+ // Known failure (author's note): BagOfCharsModel also behaves as a bag-of-words model, so a term scores 1 against a phrase that contains it even when the phrase is not identical to the term.
+ var model = new BagOfCharsModel();
+ var strategy = new LogStructuredIndexingStrategy(model);
+ var collectionId = "BagOfCharsDatabaseTests.Can_read_and_write_paged".ToHash();
+ var page0 = _dataPage0.Select(x => new Document(new Field[] { new Field("title", x) })).ToList();
+ var page1 = _dataPage1.Select(x => new Document(new Field[] { new Field("title", x) })).ToList();
+ // Writes _dataPage0 and _dataPage1 with a Commit after each, then queries every title and expects it back as the first hit.
+ using (var database = new DocumentDatabase(_directory, collectionId, model, strategy, _loggerFactory.CreateLogger("Debug")))
+ {
+ database.Truncate();
+
+ foreach (var document in page0)
+ {
+ database.Write(document, label: true);
+ }
+
+ database.Commit(); // create page
+
+ foreach (var document in page1)
+ {
+ database.Write(document, label:true);
+ }
+
+ database.Commit(); // create another page
+
+ var queryParser = database.CreateQueryParser();
+ // Page 0 titles: the single requested document (take: 1) must equal the queried title.
+ foreach (var word in _dataPage0)
+ {
+ Assert.DoesNotThrow(() =>
+ {
+ var query = queryParser.Parse(collectionId, word, "title", "title", and: true, or: false, label: true);
+ var result = database.Read(query, skip: 0, take: 1);
+
+ var documentWas = result.Documents.First().Get("title").Value;
+ var documentShouldBe = word;
+
+ if (!documentShouldBe.Equals(documentWas))
+ {
+ throw new Exception($"documentShouldBe: {documentShouldBe} documentWas: {documentWas} ");
+ }
+ });
+ }
+ // Page 1 titles: two documents are requested (take: 2) but only the first one is compared.
+ foreach (var word in _dataPage1)
+ {
+ Assert.DoesNotThrow(() =>
+ {
+ var query = queryParser.Parse(collectionId, word, "title", "title", and: true, or: false, label: true);
+ var result = database.Read(query, skip: 0, take: 2);
+
+ var documentWas = result.Documents.First().Get("title").Value;
+ var documentShouldBe = word;
+
+ if (!documentShouldBe.Equals(documentWas))
+ {
+ throw new Exception($"documentShouldBe: {documentShouldBe} documentWas: {documentWas} ");
+ }
+ // NOTE(review): the disabled variant below appears intended to accept a match in any returned document, but its loop body reads result.Documents.First() instead of the loop variable; fix that before re-enabling.
+ //var found = false;
+
+ //foreach(var document in result.Documents)
+ //{
+ // var documentWas = result.Documents.First().Get("title").Value;
+ // var documentShouldBe = word;
+
+ // if (documentShouldBe.Equals(documentWas))
+ // {
+ // found = true;
+ // break;
+ // }
+ //}
+
+ //if (!found)
+ // throw new Exception($"document not found. documentShouldBe: {word} ");
+
+ });
+ }
+ }
+ }
+
[Test]
public void Can_optimize_index()
{
var model = new BagOfCharsModel();
var strategy = new LogStructuredIndexingStrategy(model);
var collectionId = "BagOfCharsDatabaseTests.Can_optimize_index".ToHash();
- var documents = _data.Select(x => new Document(new Field[] { new Field("title", x) })).ToList();
+ var documents = _dataPage0.Select(x => new Document(new Field[] { new Field("title", x) })).ToList();
using (var database = new DocumentDatabase(_directory, collectionId, model, strategy, _loggerFactory.CreateLogger("Debug")))
{
@@ -113,7 +198,7 @@ public void Can_optimize_index()
var queryParser = database.CreateQueryParser();
- foreach (var word in _data)
+ foreach (var word in _dataPage0)
{
Assert.DoesNotThrow(() =>
{
@@ -127,7 +212,7 @@ public void Can_optimize_index()
database.OptimizeAllIndices();
- foreach (var word in _data)
+ foreach (var word in _dataPage0)
{
Assert.DoesNotThrow(() =>
{
diff --git a/src/Sir.StringTests/Sir.StringTests.csproj b/src/Sir.StringTests/Sir.StringTests.csproj
index bf2f61d2..813a116b 100644
--- a/src/Sir.StringTests/Sir.StringTests.csproj
+++ b/src/Sir.StringTests/Sir.StringTests.csproj
@@ -15,10 +15,9 @@
-
-
-
+
+
diff --git a/src/Sir.Strings/Sir.Strings.csproj b/src/Sir.Strings/Sir.Model.Strings.csproj
similarity index 100%
rename from src/Sir.Strings/Sir.Strings.csproj
rename to src/Sir.Strings/Sir.Model.Strings.csproj
diff --git a/src/Sir.Wikipedia/Sir.Wikipedia.csproj b/src/Sir.Wikipedia/Sir.DataProvider.Wikipedia.csproj
similarity index 85%
rename from src/Sir.Wikipedia/Sir.Wikipedia.csproj
rename to src/Sir.Wikipedia/Sir.DataProvider.Wikipedia.csproj
index 9da76051..240b2dd6 100644
--- a/src/Sir.Wikipedia/Sir.Wikipedia.csproj
+++ b/src/Sir.Wikipedia/Sir.DataProvider.Wikipedia.csproj
@@ -14,7 +14,7 @@
-
+
diff --git a/src/Sir.sln b/src/Sir.sln
index 4abeef0d..1b051b8c 100644
--- a/src/Sir.sln
+++ b/src/Sir.sln
@@ -28,29 +28,25 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.HttpServer", "Sir.HttpS
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.KeyValue", "Sir.KeyValue\Sir.KeyValue.csproj", "{53B25A13-5C26-4344-9AB6-7E998282EF94}"
EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "apps", "apps", "{B97268C5-0BFA-4022-BA3F-C07C1F239C8D}"
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "applications", "applications", "{B97268C5-0BFA-4022-BA3F-C07C1F239C8D}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{35E2693A-1A42-4690-81A8-D424C3D24AD1}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Documents", "Sir.Document\Sir.Documents.csproj", "{F6AFA2E5-1FEA-4DC6-9386-20315126C7C4}"
EndProject
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.CommonCrawl", "Sir.CommonCrawl\Sir.CommonCrawl.csproj", "{304CAC1B-825A-4D89-AE4C-9C0FC5206607}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.DataProvider.CommonCrawl", "Sir.CommonCrawl\Sir.DataProvider.CommonCrawl.csproj", "{304CAC1B-825A-4D89-AE4C-9C0FC5206607}"
EndProject
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Mnist", "Sir.Mnist\Sir.Mnist.csproj", "{436647C9-EDFF-44AD-AF2F-ABC4EBA70ED7}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.DataProvider.Mnist", "Sir.Mnist\Sir.DataProvider.Mnist.csproj", "{436647C9-EDFF-44AD-AF2F-ABC4EBA70ED7}"
EndProject
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Wikipedia", "Sir.Wikipedia\Sir.Wikipedia.csproj", "{6F3C960C-7652-430C-A253-081E1506BA81}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.DataProvider.Wikipedia", "Sir.Wikipedia\Sir.DataProvider.Wikipedia.csproj", "{6F3C960C-7652-430C-A253-081E1506BA81}"
EndProject
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Strings", "Sir.Strings\Sir.Strings.csproj", "{AB275A5B-E72E-475A-8E1A-FFA7A5F9C932}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Model.Strings", "Sir.Strings\Sir.Model.Strings.csproj", "{AB275A5B-E72E-475A-8E1A-FFA7A5F9C932}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.StringCompare", "Sir.StringCompare\Sir.StringCompare.csproj", "{C6050E65-9411-41E3-A6EE-0A45E6FFB4FC}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Cmd", "Sir.Cmd\Sir.Cmd.csproj", "{CEDD3CA9-D38D-43BF-9013-212AE6332CE0}"
EndProject
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Images", "Sir.Images\Sir.Images.csproj", "{1DC66643-0C0A-48AC-9019-5C64C002BA32}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "core", "core", "{644A2FAF-6617-41F6-88B7-92F21493B048}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "models", "models", "{0C62844F-36C0-4029-BB36-3C27C0F29272}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Sir.Model.Images", "Sir.Images\Sir.Model.Images.csproj", "{1DC66643-0C0A-48AC-9019-5C64C002BA32}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "exe", "exe", "{23D1F5F4-6D57-4995-98F6-38EED88C2260}"
EndProject
@@ -121,17 +117,12 @@ Global
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
- {2002DD08-0083-4184-BB1A-2469B608DE95} = {644A2FAF-6617-41F6-88B7-92F21493B048}
{C94C2F5D-AE55-4157-A74A-26D49EE73E96} = {23D1F5F4-6D57-4995-98F6-38EED88C2260}
- {53B25A13-5C26-4344-9AB6-7E998282EF94} = {644A2FAF-6617-41F6-88B7-92F21493B048}
- {F6AFA2E5-1FEA-4DC6-9386-20315126C7C4} = {644A2FAF-6617-41F6-88B7-92F21493B048}
{304CAC1B-825A-4D89-AE4C-9C0FC5206607} = {B97268C5-0BFA-4022-BA3F-C07C1F239C8D}
{436647C9-EDFF-44AD-AF2F-ABC4EBA70ED7} = {B97268C5-0BFA-4022-BA3F-C07C1F239C8D}
{6F3C960C-7652-430C-A253-081E1506BA81} = {B97268C5-0BFA-4022-BA3F-C07C1F239C8D}
- {AB275A5B-E72E-475A-8E1A-FFA7A5F9C932} = {0C62844F-36C0-4029-BB36-3C27C0F29272}
{C6050E65-9411-41E3-A6EE-0A45E6FFB4FC} = {B97268C5-0BFA-4022-BA3F-C07C1F239C8D}
{CEDD3CA9-D38D-43BF-9013-212AE6332CE0} = {23D1F5F4-6D57-4995-98F6-38EED88C2260}
- {1DC66643-0C0A-48AC-9019-5C64C002BA32} = {0C62844F-36C0-4029-BB36-3C27C0F29272}
{23D1F5F4-6D57-4995-98F6-38EED88C2260} = {B97268C5-0BFA-4022-BA3F-C07C1F239C8D}
{20F14A4E-99AE-42FB-B447-6B78F1398406} = {35E2693A-1A42-4690-81A8-D424C3D24AD1}
{BD85D84A-0F4E-4880-A0CB-128BA9F34EDF} = {35E2693A-1A42-4690-81A8-D424C3D24AD1}