Skip to content

Commit

Permalink
implement paged postings writer
Browse files Browse the repository at this point in the history
  • Loading branch information
kreeben committed May 13, 2024
1 parent 8d8d1a0 commit c2c104c
Show file tree
Hide file tree
Showing 39 changed files with 341 additions and 195 deletions.
4 changes: 2 additions & 2 deletions src/AssemblyInfo.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using System.Reflection;

[assembly: AssemblyVersion("0.4.0.7")]
[assembly: AssemblyFileVersion("0.4.0.7")]
[assembly: AssemblyVersion("0.5.0.2")]
[assembly: AssemblyFileVersion("0.5.0.2")]
8 changes: 5 additions & 3 deletions src/Sir.Cmd/Sir.Cmd.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Sir.CommonCrawl\Sir.CommonCrawl.csproj" />
<ProjectReference Include="..\Sir.CommonCrawl\Sir.DataProvider.CommonCrawl.csproj" />
<ProjectReference Include="..\Sir.Images\Sir.Model.Images.csproj" />
<ProjectReference Include="..\Sir.InformationRetreival\Sir.InformationRetrieval.csproj" />
<ProjectReference Include="..\Sir.Mnist\Sir.Mnist.csproj" />
<ProjectReference Include="..\Sir.Wikipedia\Sir.Wikipedia.csproj" />
<ProjectReference Include="..\Sir.Mnist\Sir.DataProvider.Mnist.csproj" />
<ProjectReference Include="..\Sir.Strings\Sir.Model.Strings.csproj" />
<ProjectReference Include="..\Sir.Wikipedia\Sir.DataProvider.Wikipedia.csproj" />
</ItemGroup>

<PropertyGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Sir.Images\Sir.Images.csproj" />
<ProjectReference Include="..\Sir.InformationRetreival\Sir.InformationRetrieval.csproj" />
<ProjectReference Include="..\Sir.Strings\Sir.Model.Strings.csproj" />
</ItemGroup>

</Project>
7 changes: 3 additions & 4 deletions src/Sir.Document/Field.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@ namespace Sir.Documents
[DebuggerDisplay("{Name}")]
public class Field
{
private IEnumerable<ISerializableVector> _tokens;

public long KeyId { get; set; }
public long DocumentId { get; set; }
public string Name { get; }
public object Value { get; set; }
public IEnumerable<ISerializableVector> Tokens { get { return _tokens; } }
public IEnumerable<ISerializableVector> Tokens { get; }

public Field(string name, object value, long keyId = -1, long documentId = -1)
public Field(string name, object value, long keyId = -1, long documentId = -1, IEnumerable<ISerializableVector> tokens = null)
{
if (name is null) throw new ArgumentNullException(nameof(name));
if (value == null) throw new ArgumentNullException(nameof(value));
Expand All @@ -24,6 +22,7 @@ public Field(string name, object value, long keyId = -1, long documentId = -1)
Value = value;
KeyId = keyId;
DocumentId = documentId;
Tokens = tokens;
}
}
}
10 changes: 5 additions & 5 deletions src/Sir.HttpServer/HttpQueryParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public HttpQueryParser(QueryParser<string> parser)
_parser = parser;
}

public async Task<IQuery> ParseRequest(HttpRequest request, IEnumerable<string> collections = null)
public async Task<Query> ParseRequest(HttpRequest request, IEnumerable<string> collections = null)
{
var select = request.Query["select"].ToArray();

Expand Down Expand Up @@ -56,20 +56,20 @@ public static async Task<dynamic> DeserializeFromStream(Stream stream)
}
}

public IQuery ParseFormattedString(string formattedQuery, string[] select)
/// <summary>
/// Deserializes a JSON-formatted query string into a dictionary and hands it,
/// together with <paramref name="select"/>, to <see cref="ParseDictionary"/>.
/// </summary>
/// <param name="formattedQuery">JSON text describing the query.</param>
/// <param name="select">Passed through unchanged to <see cref="ParseDictionary"/>.</param>
/// <returns>The query produced by <see cref="ParseDictionary"/>.</returns>
public Query ParseFormattedString(string formattedQuery, string[] select)
{
    // The custom converter is supplied to the deserializer so the payload
    // comes back as IDictionary<string, object>.
    var converters = new JsonConverter[] { new DictionaryConverter() };
    var queryDocument = JsonConvert.DeserializeObject<IDictionary<string, object>>(formattedQuery, converters);

    return ParseDictionary(queryDocument, select);
}

public IQuery ParseDictionary(IDictionary<string, object> document, string[] select)
/// <summary>
/// Parses an already-deserialized query document by delegating to the wrapped parser.
/// </summary>
/// <param name="document">The query document to parse.</param>
/// <param name="select">Forwarded to the wrapped parser as its second argument.</param>
/// <returns>The query returned by the wrapped parser.</returns>
// NOTE(review): the literal `true` is the parser's third argument; its meaning is
// not visible from this method - confirm against QueryParser<string>.Parse.
public Query ParseDictionary(IDictionary<string, object> document, string[] select)
    => _parser.Parse(document, select, true);

private void DoParseQuery(IQuery query, IDictionary<string, object> result)
private void DoParseQuery(Query query, IDictionary<string, object> result)
{
if (result == null)
return;
Expand Down Expand Up @@ -114,7 +114,7 @@ private void DoParseQuery(IQuery query, IDictionary<string, object> result)
}
}

public void ParseQuery(IQuery query, IDictionary<string, object> result)
/// <summary>
/// Public entry point that forwards both arguments unchanged to the private
/// <c>DoParseQuery</c> routine.
/// </summary>
/// <param name="query">The query handed to <c>DoParseQuery</c>.</param>
/// <param name="result">The dictionary handed to <c>DoParseQuery</c>.</param>
public void ParseQuery(Query query, IDictionary<string, object> result)
    => DoParseQuery(query, result);
Expand Down
2 changes: 1 addition & 1 deletion src/Sir.HttpServer/Sir.HttpServer.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@

<ItemGroup>
<ProjectReference Include="..\Sir.InformationRetreival\Sir.InformationRetrieval.csproj" />
<ProjectReference Include="..\Sir.Strings\Sir.Strings.csproj" />
<ProjectReference Include="..\Sir.Strings\Sir.Model.Strings.csproj" />
</ItemGroup>

</Project>
5 changes: 2 additions & 3 deletions src/Sir.ImageTests/Sir.ImageTests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,9 @@
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Sir.Core\Sir.InformationRetreival.csproj" />
<ProjectReference Include="..\Sir.Images\Sir.Images.csproj" />
<ProjectReference Include="..\Sir.Images\Sir.Model.Images.csproj" />
<ProjectReference Include="..\Sir.InformationRetreival\Sir.InformationRetrieval.csproj" />
<ProjectReference Include="..\Sir.Mnist\Sir.Mnist.csproj" />
<ProjectReference Include="..\Sir.Mnist\Sir.DataProvider.Mnist.csproj" />
</ItemGroup>

<ItemGroup>
Expand Down
File renamed without changes.
8 changes: 8 additions & 0 deletions src/Sir.InformationRetreival/Hit.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,17 @@ public class Hit
public List<long> PostingsOffsets { get; set; }

/// <summary>
/// Creates a hit for <paramref name="node"/> with the given score and seeds
/// <see cref="PostingsOffsets"/> with the node's own postings offset.
/// </summary>
/// <param name="node">The matched node; must not be null.</param>
/// <param name="score">The score assigned to this hit.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="node"/> is null.</exception>
public Hit (VectorNode node, double score)
{
    // Reject a missing node before any state is assigned.
    if (node is null)
    {
        throw new System.ArgumentNullException(nameof(node));
    }

    Node = node;
    Score = score;
    PostingsOffsets = new List<long> { node.PostingsOffset };
}

/// <summary>
/// Creates a hit whose <see cref="PostingsOffsets"/> list starts with an explicitly
/// supplied offset instead of one read from the node.
/// </summary>
/// <param name="node">The matched node, stored as-is.</param>
/// <param name="score">The score assigned to this hit.</param>
/// <param name="postingsOffset">The single offset the postings list is seeded with.</param>
// NOTE(review): unlike the two-argument constructor, node is not null-checked
// here - confirm whether a null node is intentionally permitted.
public Hit(VectorNode node, double score, long postingsOffset)
{
    var offsets = new List<long>();
    offsets.Add(postingsOffset);

    Node = node;
    Score = score;
    PostingsOffsets = offsets;
}

public override string ToString()
Expand Down
2 changes: 1 addition & 1 deletion src/Sir.InformationRetreival/IIndexReadWriteStrategy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ public interface IIndexReadWriteStrategy
{
void Put<T>(VectorNode column, VectorNode node);
Hit GetMatchOrNull(ISerializableVector vector, IModel model, ColumnReader reader);
void Commit(string directory, ulong collectionId, long keyId, VectorNode tree, ILogger logger = null);
void SerializePage(string directory, ulong collectionId, long keyId, VectorNode tree, IndexCache indexCache, ILogger logger = null);
}
}
21 changes: 10 additions & 11 deletions src/Sir.InformationRetreival/IO/ColumnReader.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using Microsoft.Extensions.Logging;
using System;
using System;
using System.Buffers;
using System.Collections.Generic;
using System.IO;
Expand Down Expand Up @@ -36,7 +35,7 @@ public Hit ClosestMatchOrNullScanningAllPages(ISerializableVector vector, IModel
{
var hit = ClosestMatchInPage(vector, model, page.offset);

if (hit.Score > 0)
if (hit != null && hit.Score > 0)
{
hits.Add(hit);
}
Expand Down Expand Up @@ -71,7 +70,7 @@ public Hit ClosestMatchOrNullStoppingAtFirstIdenticalPage(ISerializableVector ve
{
var hit = ClosestMatchInPage(vector, model, page.offset);

if (hit.Score > 0)
if (hit != null && hit.Score > 0)
{
if (best == null || hit.Score > best.Score)
{
Expand All @@ -82,10 +81,10 @@ public Hit ClosestMatchOrNullStoppingAtFirstIdenticalPage(ISerializableVector ve
{
best.PostingsOffsets.Add(hit.Node.PostingsOffset);
}
}

if (hit.Score.Approximates(model.IdenticalAngle))
break;
if (hit.Score.Approximates(model.IdenticalAngle))
break;
}
}

return best;
Expand All @@ -110,7 +109,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo

var angle = model.CosAngle(queryVector, vecOffset, (int)componentCount, _vectorFile);

if (angle >= model.IdenticalAngle)
if (angle >= model.IdenticalAngle || angle.Approximates(model.IdenticalAngle))
{
bestScore = angle;
bestNode = new VectorNode(postingsOffset: postingsOffset);
Expand All @@ -124,7 +123,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo
bestScore = angle;
bestNode = new VectorNode(postingsOffset: postingsOffset);
}
else if (angle == bestScore)
else if (angle.Approximates(bestScore))
{
bestNode.PostingsOffset = postingsOffset;
}
Expand Down Expand Up @@ -153,7 +152,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo
bestScore = angle;
bestNode = new VectorNode(postingsOffset: postingsOffset);
}
else if (angle > 0 && angle == bestScore)
else if (angle > 0 && angle.Approximates(bestScore))
{
bestNode.PostingsOffset = postingsOffset;
}
Expand Down Expand Up @@ -188,7 +187,7 @@ private Hit ClosestMatchInPage(ISerializableVector queryVector, IModel model, lo

ArrayPool<byte>.Shared.Return(block);

return new Hit(bestNode, bestScore);
return bestNode == null ? null : new Hit(bestNode, bestScore);
}

private void SkipTree()
Expand Down
17 changes: 10 additions & 7 deletions src/Sir.InformationRetreival/IO/GraphBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public static void AddOrAppendSupervised(
{
var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector);

if (angle >= model.IdenticalAngle)
if (angle >= model.IdenticalAngle || angle.Approximates(model.IdenticalAngle))
{
if (!cursor.Vector.Label.Equals(node.Vector.Label))
throw new InvalidOperationException($"IdenticalAngle {model.IdenticalAngle} is too low. Angle was {angle}");
Expand Down Expand Up @@ -79,7 +79,7 @@ public static void AddOrAppend(
{
var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector);

if (angle >= model.IdenticalAngle)
if (angle.Approximates(model.IdenticalAngle))
{
AppendDocIds(cursor, node);

Expand Down Expand Up @@ -123,7 +123,7 @@ public static void AddIfUnique(
{
var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector);

if (angle >= model.IdenticalAngle)
if (angle.Approximates(model.IdenticalAngle))
{
break;
}
Expand Down Expand Up @@ -165,7 +165,7 @@ public static bool TryAdd(
{
var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector);

if (angle >= model.IdenticalAngle)
if (angle.Approximates(model.IdenticalAngle))
{
return false;
}
Expand Down Expand Up @@ -209,7 +209,7 @@ public static void Build(
{
var angle = cursor.Vector == null ? 0 : model.CosAngle(node.Vector, cursor.Vector);

if (angle >= model.IdenticalAngle)
if (angle.Approximates(model.IdenticalAngle))
{
break;
}
Expand Down Expand Up @@ -240,8 +240,11 @@ public static void Build(
}
}

public static void AppendDocIds(this VectorNode target, VectorNode source)
/// <summary>
/// Adds every document id held by <paramref name="source"/> to
/// <paramref name="target"/>. Does nothing when either node has no
/// <c>DocIds</c> collection.
/// </summary>
/// <param name="target">The node that receives the ids.</param>
/// <param name="source">The node whose ids are copied.</param>
private static void AppendDocIds(this VectorNode target, VectorNode source)
{
    // Nothing to append to.
    if (target.DocIds == null)
    {
        return;
    }

    // Nothing to append from.
    if (source.DocIds == null)
    {
        return;
    }

    foreach (var docId in source.DocIds)
    {
        target.DocIds.Add(docId);
    }
}
Expand Down Expand Up @@ -297,7 +300,7 @@ public static (long offset, long length) SerializeTree(this VectorNode node, Str
{
if (node.PostingsOffset == -1 && postingsWriter != null)
{
postingsWriter.SerializePostings(node);
node.PostingsOffset = postingsWriter.SerializePostings(node);
}

if (vectorStream != null)
Expand Down
2 changes: 1 addition & 1 deletion src/Sir.InformationRetreival/IO/PathFinder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public static Hit ClosestMatch(VectorNode root, ISerializableVector vector, IMod
best = cursor;
}

if (angle >= model.IdenticalAngle)
if (angle >= model.IdenticalAngle || angle.Approximates(model.IdenticalAngle))
{
break;
}
Expand Down
17 changes: 9 additions & 8 deletions src/Sir.InformationRetreival/IO/PostingsReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ public PostingsReader(string directory, ulong collectionId, long keyId, ILogger
_collectionId = collectionId;
}

public IList<(ulong, long)> Read(long keyId, IList<long> offsets)
public HashSet<(ulong, long)> Read(long keyId, IList<long> offsets)
{
var time = Stopwatch.StartNew();
var documents = new List<(ulong, long)>();
var documents = new HashSet<(ulong, long)>(); // collection ID, document ID

foreach (var offset in offsets)
GetPostingsFromStream(keyId, offset, documents);
Expand All @@ -39,20 +39,21 @@ public PostingsReader(string directory, ulong collectionId, long keyId, ILogger
return documents;
}

private void GetPostingsFromStream(long keyId, long postingsOffset, List<(ulong collectionId, long docId)> documents)
private void GetPostingsFromStream(long keyId, long postingsOffset, HashSet<(ulong collectionId, long docId)> postings)
{
// seek to page
_stream.Seek(postingsOffset, SeekOrigin.Begin);

var headerLen = sizeof(long) * 2;
var headerBuf = ArrayPool<byte>.Shared.Rent(headerLen);

// read header
var headerBuf = ArrayPool<byte>.Shared.Rent(headerLen);
_stream.Read(headerBuf, 0, headerLen);

var numOfPostings = BitConverter.ToInt64(headerBuf);
var addressOfNextPage = BitConverter.ToInt64(headerBuf, sizeof(long));

ArrayPool<byte>.Shared.Return(headerBuf);

// read postings
var listLen = sizeof(long) * numOfPostings;
var listBuf = new byte[listLen];
var read = _stream.Read(listBuf);
Expand All @@ -62,12 +63,12 @@ private void GetPostingsFromStream(long keyId, long postingsOffset, List<(ulong

foreach (var docId in MemoryMarshal.Cast<byte, long>(listBuf))
{
documents.Add((_collectionId, docId));
postings.Add((_collectionId, docId));
}

if (addressOfNextPage > 0)
{
GetPostingsFromStream(keyId, addressOfNextPage, documents);
GetPostingsFromStream(keyId, addressOfNextPage, postings);
}
}

Expand Down
Loading

0 comments on commit c2c104c

Please sign in to comment.