Skip to content

Commit

Permalink
Revert to using the non-optimized indexing strategy
Browse files Browse the repository at this point in the history
  • Loading branch information
kreeben committed Oct 20, 2022
1 parent e3f1085 commit d968c8f
Show file tree
Hide file tree
Showing 30 changed files with 90 additions and 59 deletions.
4 changes: 3 additions & 1 deletion src/Sir.Cmd/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -121,14 +121,16 @@ private static void Optimize(IDictionary<string, string> args, ILogger logger)
var reportFrequency = int.Parse(args["reportFrequency"]);
var pageSize = int.Parse(args["pageSize"]);
var fields = new HashSet<string>(args["fields"].Split(','));
var model = new BagOfCharsModel();

using (var sessionFactory = new SessionFactory(logger))
{
sessionFactory.Optimize(
dataDirectory,
collection,
fields,
new BagOfCharsModel(),
model,
new NonOptimizedPageIndexingStrategy(model),
skip,
take,
reportFrequency,
Expand Down
2 changes: 1 addition & 1 deletion src/Sir.Cmd/ValidateCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
{
using (var validateSession = new ValidateSession<string>(
collectionId,
new SearchSession(dir, sessionFactory, model, logger),
new SearchSession(dir, sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), logger),
new QueryParser<string>(dir, sessionFactory, model, logger)))
{
using (var documents = new DocumentStreamSession(dir, sessionFactory))
Expand Down
2 changes: 1 addition & 1 deletion src/Sir.CommonCrawl/CCHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public static void WriteWatSegment(

using (var sessionFactory = new SessionFactory(logger))
using (var writeSession = new WriteSession(new DocumentWriter(dataDirectory, collectionId, sessionFactory)))
using (var indexSession = new InMemoryIndexSession<string>(model, model, sessionFactory, dataDirectory, collectionId))
using (var indexSession = new InMemoryIndexSession<string>(model, new NonOptimizedPageIndexingStrategy(model), sessionFactory, dataDirectory, collectionId))
{
using (var queue = new ProducerConsumerQueue<Document>(document =>
{
Expand Down
2 changes: 2 additions & 0 deletions src/Sir.CommonCrawl/IndexWetFilesCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
var dataDirectory = args["dataDirectory"];
var fileName = args["fileName"];
var model = new BagOfCharsModel();
var indexStrategy = new NonOptimizedPageIndexingStrategy(model);
var collectionId = "cc_wet".ToHash();
var storeFields = new HashSet<string> { "url" };
var indexFields = new HashSet<string> { "description" };
Expand All @@ -33,6 +34,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
kvp.Key,
kvp.Value)).ToList())),
model,
indexStrategy,
reportSize: 1000);
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/Sir.Core/Field.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ private IEnumerable<ISerializableVector> GetTokens()
yield return node.Vector;
}

public void Analyze<T>(IModel<T> model, bool label, IStreamDispatcher streamDispatcher)
public void Analyze<T>(IModel<T> model, IIndexReadWriteStrategy indexStrategy, bool label, IStreamDispatcher streamDispatcher)
{
var tokens = model.CreateEmbedding((T)Value, label);

Expand All @@ -44,7 +44,7 @@ public void Analyze<T>(IModel<T> model, bool label, IStreamDispatcher streamDisp
{
foreach (var token in tokens)
{
model.Put<string>(Tree, new VectorNode(token, keyId: KeyId), reader);
indexStrategy.Put<string>(Tree, new VectorNode(token, keyId: KeyId), reader);
}

_tokens = GetTokens();
Expand Down
2 changes: 1 addition & 1 deletion src/Sir.Core/IField.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ public interface IField
IEnumerable<ISerializableVector> Tokens { get; }
VectorNode Tree { get; }
object Value { get; set; }
void Analyze<T>(IModel<T> model, bool label, IStreamDispatcher streamDispatcher);
void Analyze<T>(IModel<T> model, IIndexReadWriteStrategy indexStrategy, bool label, IStreamDispatcher streamDispatcher);
}
}
2 changes: 1 addition & 1 deletion src/Sir.Core/IModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public interface IModel<T> : IModel
/// <summary>
/// Vector space model.
/// </summary>
public interface IModel : IVectorSpaceConfig, IDistanceCalculator, IIndexReadWriteStrategy
public interface IModel : IVectorSpaceConfig, IDistanceCalculator
{
}

Expand Down
20 changes: 20 additions & 0 deletions src/Sir.Core/IndexingStrategies/OptimizedPageIndexingStrategy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,24 @@ public void Put<T>(VectorNode column, VectorNode node, IColumnReader reader)
column.AddOrAppend(node, _model);
}
}

/// <summary>
/// Index read/write strategy that trades speed for recall: lookups scan every
/// page of the column index, and writes append to the in-memory tree via
/// <see cref="VectorNode.AddOrAppend"/> without any page-level optimization.
/// </summary>
public class NonOptimizedPageIndexingStrategy : IIndexReadWriteStrategy
{
    // Model used to compute vector similarity when merging nodes into the tree.
    private readonly IModel _model;

    public NonOptimizedPageIndexingStrategy(IModel model)
    {
        _model = model;
    }

    /// <summary>
    /// Finds the closest match for <paramref name="vector"/> by scanning all
    /// pages of the column index, or returns null when no match exists.
    /// </summary>
    public Hit GetClosestMatchOrNull(ISerializableVector vector, IModel model, IColumnReader reader)
        => reader.ClosestMatchOrNullScanningAllPages(vector, model);

    /// <summary>
    /// Adds <paramref name="node"/> to the in-memory <paramref name="column"/> tree.
    /// The <paramref name="reader"/> is unused here; it exists to satisfy the
    /// <see cref="IIndexReadWriteStrategy"/> contract.
    /// </summary>
    public void Put<T>(VectorNode column, VectorNode node, IColumnReader reader)
        => column.AddOrAppend(node, _model);
}
}
7 changes: 4 additions & 3 deletions src/Sir.Crawl/CrawlUserDirectoryCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,15 @@ public void Run(IDictionary<string, string> args, ILogger logger)
var urlCollectionId = "url".ToHash();
var htmlClient = new HtmlWeb();
var model = new BagOfCharsModel();
var indexStrategy = new NonOptimizedPageIndexingStrategy(model);

htmlClient.UserAgent = "Crawlcrawler (+https://crawlcrawler.com)";

#if DEBUG
htmlClient.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36";
#endif
using (var database = new SessionFactory(logger))
using (var dataSearchSession = new SearchSession(dataDirectory, database, model, logger))
using (var dataSearchSession = new SearchSession(dataDirectory, database, model, new NonOptimizedPageIndexingStrategy(model), logger))
{
foreach (var userDirectory in Directory.EnumerateDirectories(rootUserDirectory))
{
Expand Down Expand Up @@ -85,7 +86,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)

if (result != null)
{
database.StoreDataAndPersistIndex(dataDirectory, collectionId, result.Document, _model);
database.StoreDataAndPersistIndex(dataDirectory, collectionId, result.Document, _model, indexStrategy);

int crawlCount = 1;

Expand All @@ -110,7 +111,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)

if (r != null)
{
database.StoreDataAndPersistIndex(dataDirectory, collectionId, r.Document, _model);
database.StoreDataAndPersistIndex(dataDirectory, collectionId, r.Document, _model, indexStrategy);
}

crawlCount++;
Expand Down
7 changes: 5 additions & 2 deletions src/Sir.HttpServer/Controllers/WriteController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,20 @@ public class WriteController : Controller
private readonly IModel<string> _model;
private readonly ILogger<WriteController> _logger;
private readonly IConfigurationProvider _config;
private readonly IIndexReadWriteStrategy _indexStrategy;

public WriteController(
IHttpWriter writer,
IModel<string> tokenizer,
IModel<string> tokenizer,
IIndexReadWriteStrategy indexStrategy,
ILogger<WriteController> logger,
IConfigurationProvider config)
{
_writer = writer;
_model = tokenizer;
_logger = logger;
_config = config;
_indexStrategy = indexStrategy;
}

[HttpPost]
Expand All @@ -41,7 +44,7 @@ public IActionResult Post(string accessToken)

try
{
_writer.Write(Request, _model);
_writer.Write(Request, _model, _indexStrategy);

return Ok();
}
Expand Down
2 changes: 1 addition & 1 deletion src/Sir.HttpServer/HttpReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public async Task<SearchResult> Read(HttpRequest request, IModel<string> model)
_logger.LogDebug($"parsed query: {queryLog}");
#endif

using (var readSession = new SearchSession(_config.Get("data_dir"), _sessionFactory, model, _logger))
using (var readSession = new SearchSession(_config.Get("data_dir"), _sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), _logger))
{
return readSession.Search(query, skip, take);
}
Expand Down
5 changes: 3 additions & 2 deletions src/Sir.HttpServer/HttpWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public HttpWriter(SessionFactory sessionFactory, IConfigurationProvider config)
_config = config;
}

public void Write(HttpRequest request, IModel<string> model)
public void Write(HttpRequest request, IModel<string> model, IIndexReadWriteStrategy indexStrategy)
{
var documents = Deserialize<IEnumerable<Document>>(request.Body);
var collectionId = request.Query["collection"].First().ToHash();
Expand All @@ -29,7 +29,8 @@ public void Write(HttpRequest request, IModel<string> model)
_config.Get("data_dir"),
collectionId,
documents,
model);
model,
indexStrategy);
}

private static T Deserialize<T>(Stream stream)
Expand Down
2 changes: 1 addition & 1 deletion src/Sir.HttpServer/IHttpWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ namespace Sir.HttpServer
/// </summary>
public interface IHttpWriter
{
void Write(HttpRequest request, IModel<string> model);
void Write(HttpRequest request, IModel<string> model, IIndexReadWriteStrategy indexStrategy);
}
}
4 changes: 2 additions & 2 deletions src/Sir.ImageTests/ImageModelTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public void Can_traverse_index_in_memory()

using (var reader = _sessionFactory.CreateColumnReader("", 0, 0))
{
var index = model.CreateTree(model, reader, _data);
var index = model.CreateTree(new NonOptimizedPageIndexingStrategy(model), reader, _data);

Print(index);

Expand Down Expand Up @@ -80,7 +80,7 @@ public void Can_traverse_streamed()

using (var reader = _sessionFactory.CreateColumnReader("", 0, 0))
{
var index = model.CreateTree(model, reader, _data);
var index = model.CreateTree(new NonOptimizedPageIndexingStrategy(model), reader, _data);

using (var indexStream = new MemoryStream())
using (var vectorStream = new MemoryStream())
Expand Down
2 changes: 1 addition & 1 deletion src/Sir.ImageTests/OptimizedPageIndexStrategyTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public void Can_traverse_index_in_memory()

using (var reader = _sessionFactory.CreateColumnReader("", 0, 0))
{
var index = model.CreateTree(model, reader, _data);
var index = model.CreateTree(new OptimizedPageIndexingStrategy(model), reader, _data);

Debug.WriteLine(PathFinder.Visualize(index));

Expand Down
6 changes: 3 additions & 3 deletions src/Sir.ImageTests/UpdateSessionTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public void Can_update_image_field()
{
var d = data[i];

using (var indexSession = new InMemoryIndexSession<IImage>(model, model, _sessionFactory, _directory, collectionId))
using (var indexSession = new InMemoryIndexSession<IImage>(model, new NonOptimizedPageIndexingStrategy(model), _sessionFactory, _directory, collectionId))
{
var doc = new Document(new Field[] { new Field(fieldName, d) });

Expand All @@ -55,7 +55,7 @@ public void Can_update_image_field()

var queryParser = new QueryParser<IImage>(_directory, _sessionFactory, model);

using (var searchSession = new SearchSession(_directory, _sessionFactory, model, _loggerFactory.CreateLogger<SearchSession>()))
using (var searchSession = new SearchSession(_directory, _sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), _loggerFactory.CreateLogger<SearchSession>()))
{
Assert.DoesNotThrow(() =>
{
Expand Down Expand Up @@ -85,7 +85,7 @@ public void Can_update_image_field()
updateSession.Update(documentIdToUpdate, 0, updatedWord);
}

using (var searchSession = new SearchSession(_directory, _sessionFactory, model, _loggerFactory.CreateLogger<SearchSession>()))
using (var searchSession = new SearchSession(_directory, _sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), _loggerFactory.CreateLogger<SearchSession>()))
{
Assert.DoesNotThrow(() =>
{
Expand Down
2 changes: 1 addition & 1 deletion src/Sir.Mnist/IndexMnistCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
sessionFactory.Truncate(dataDirectory, collectionId);

using (var writeSession = new WriteSession(new DocumentWriter(dataDirectory, collectionId, sessionFactory)))
using (var indexSession = new InMemoryIndexSession<IImage>(model, model, sessionFactory, dataDirectory, collectionId))
using (var indexSession = new InMemoryIndexSession<IImage>(model, new NonOptimizedPageIndexingStrategy(model), sessionFactory, dataDirectory, collectionId))
{
var imageIndexId = writeSession.EnsureKeyExists("image");

Expand Down
2 changes: 1 addition & 1 deletion src/Sir.Mnist/ValidateMnistCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
var model = new LinearClassifierImageModel();

using (var sessionFactory = new SessionFactory(logger: logger))
using (var querySession = new SearchSession(dataDirectory, sessionFactory, model, logger))
using (var querySession = new SearchSession(dataDirectory, sessionFactory, model, new NonOptimizedPageIndexingStrategy(model), logger))
{
var queryParser = new QueryParser<IImage>(dataDirectory, sessionFactory, model, logger);

Expand Down
5 changes: 4 additions & 1 deletion src/Sir.Search/SearchSession.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ public class SearchSession : DocumentStreamSession, IDisposable, ISearchSession
{
private readonly IStreamDispatcher _sessionFactory;
private readonly IModel _model;
private readonly IIndexReadWriteStrategy _indexStrategy;
private readonly PostingsResolver _postingsResolver;
private readonly Scorer _scorer;
private readonly ILogger _logger;
Expand All @@ -21,12 +22,14 @@ public SearchSession(
string directory,
IStreamDispatcher sessionFactory,
IModel model,
IIndexReadWriteStrategy indexStrategy,
ILogger logger = null,
PostingsResolver postingsResolver = null,
Scorer scorer = null) : base(directory, sessionFactory)
{
_sessionFactory = sessionFactory;
_model = model;
_indexStrategy = indexStrategy;
_postingsResolver = postingsResolver ?? new PostingsResolver();
_scorer = scorer ?? new Scorer();
_logger = logger;
Expand Down Expand Up @@ -120,7 +123,7 @@ private void Scan(IQuery query)

if (reader != null)
{
var hit =_model.GetClosestMatchOrNull(term.Vector, _model, reader);
var hit =_indexStrategy.GetClosestMatchOrNull(term.Vector, _model, reader);

if (hit != null)
{
Expand Down
17 changes: 9 additions & 8 deletions src/Sir.Search/SessionFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ public void Optimize<T>(
string collection,
HashSet<string> selectFields,
IModel<T> model,
IIndexReadWriteStrategy indexStrategy,
int skipDocuments = 0,
int takeDocuments = 0,
int reportFrequency = 1000,
Expand Down Expand Up @@ -206,7 +207,7 @@ public void Optimize<T>(

var count = 0;

using (var indexSession = new InMemoryIndexSession<T>(model, model, this, directory, collectionId))
using (var indexSession = new InMemoryIndexSession<T>(model, indexStrategy, this, directory, collectionId))
{
foreach (var document in payload)
{
Expand Down Expand Up @@ -272,7 +273,7 @@ public void StoreDataAndBuildInMemoryIndex<T>(
}
}

public void BuildIndex<T>(ulong collectionId, IEnumerable<Document> job, IModel<T> model, InMemoryIndexSession<T> indexSession, bool label = true)
public void BuildIndex<T>(ulong collectionId, IEnumerable<Document> job, IModel<T> model, IIndexReadWriteStrategy indexStrategy, InMemoryIndexSession<T> indexSession, bool label = true)
{
LogDebug($"building index for collection {collectionId}");

Expand All @@ -295,7 +296,7 @@ public void BuildIndex<T>(ulong collectionId, IEnumerable<Document> job, IModel<
{
if (field.Value != null)
{
field.Analyze(model, label, this);
field.Analyze(model, indexStrategy, label, this);
}
}

Expand All @@ -306,10 +307,10 @@ public void BuildIndex<T>(ulong collectionId, IEnumerable<Document> job, IModel<
LogDebug($"built index (collection {collectionId}) in {time.Elapsed}");
}

public void StoreDataAndPersistIndex<T>(string directory, ulong collectionId, IEnumerable<IDocument> job, IModel<T> model, int reportSize = 1000)
public void StoreDataAndPersistIndex<T>(string directory, ulong collectionId, IEnumerable<IDocument> job, IModel<T> model, IIndexReadWriteStrategy indexStrategy, int reportSize = 1000)
{
using (var writeSession = new WriteSession(new DocumentWriter(directory, collectionId, this)))
using (var indexSession = new InMemoryIndexSession<T>(model, model, this, directory, collectionId))
using (var indexSession = new InMemoryIndexSession<T>(model, indexStrategy, this, directory, collectionId))
{
StoreDataAndBuildInMemoryIndex(job, writeSession, indexSession, reportSize);

Expand All @@ -320,10 +321,10 @@ public void StoreDataAndPersistIndex<T>(string directory, ulong collectionId, IE
}
}

public void StoreDataAndPersistIndex<T>(string directory, ulong collectionId, Document document, IModel<T> model)
public void StoreDataAndPersistIndex<T>(string directory, ulong collectionId, Document document, IModel<T> model, IIndexReadWriteStrategy indexStrategy)
{
using (var writeSession = new WriteSession(new DocumentWriter(directory, collectionId, this)))
using (var indexSession = new InMemoryIndexSession<T>(model, model, this, directory, collectionId))
using (var indexSession = new InMemoryIndexSession<T>(model, indexStrategy, this, directory, collectionId))
{
StoreDataAndBuildInMemoryIndex(document, writeSession, indexSession);

Expand Down Expand Up @@ -366,7 +367,7 @@ public bool DocumentExists<T>(string directory, string collection, string key, T

if (query != null)
{
using (var searchSession = new SearchSession(directory, this, model, _logger))
using (var searchSession = new SearchSession(directory, this, model, new NonOptimizedPageIndexingStrategy(model), _logger))
{
var document = searchSession.SearchScalar(query);

Expand Down
Loading

0 comments on commit d968c8f

Please sign in to comment.