Skip to content

Commit

Permalink
build wikipedia ngram model
Browse files: browse the repository at this point in the history
  • Loading branch information
kreeben committed Mar 30, 2022
1 parent ac510d7 commit b86d2fe
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 9 deletions.
10 changes: 5 additions & 5 deletions src/Sir.Cmd/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,19 @@ static void Main(string[] args)
}
else if (command == "truncate")
{
Truncate(flags["dataDirectory"], flags["collection"], logger);
Truncate(flags["directory"], flags["collection"], logger);
}
else if (command == "truncate-index")
{
TruncateIndex(flags["dataDirectory"], flags["collection"], logger);
TruncateIndex(flags["directory"], flags["collection"], logger);
}
else if (command == "optimize")
{
Optimize(flags, logger);
}
else if (command == "rename")
{
Rename(flags["dataDirectory"], flags["collection"], flags["newCollection"], logger);
Rename(flags["directory"], flags["collection"], flags["newCollection"], logger);
}
else
{
Expand Down Expand Up @@ -114,7 +114,7 @@ private static IDictionary<string, string> ParseArgs(string[] args)

private static void Optimize(IDictionary<string, string> args, ILogger logger)
{
var dataDirectory = args["dataDirectory"];
var dataDirectory = args["directory"];
var collection = args["collection"];
var skip = int.Parse(args["skip"]);
var take = int.Parse(args["take"]);
Expand Down Expand Up @@ -196,7 +196,7 @@ public class AnalyzeDocumentCommand : ICommand
{
public void Run(IDictionary<string, string> args, ILogger logger)
{
var dataDirectory = args["dataDirectory"];
var dataDirectory = args["directory"];
var collection = args["collection"];
var documentId = long.Parse(args["documentId"]);
var select = new HashSet<string>(args["select"].Split(new char[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries));
Expand Down
5 changes: 4 additions & 1 deletion src/Sir.Cmd/ValidateCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,17 @@ namespace Sir.Cmd
{
public class ValidateCommand : ICommand
{
/// <summary>
/// E.g. validate --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --collection wikipedia --skip 0 --take 1000
/// </summary>
public void Run(IDictionary<string, string> args, ILogger logger)
{
var dir = args["directory"];
var collection = args["collection"];
var skip = int.Parse(args["skip"]);
var take = int.Parse(args["take"]);
var collectionId = collection.ToHash();
var model = new BagOfCharsModel();
var model = new NGramModel(new BagOfCharsModel());
var selectFields = new HashSet<string> { "title" };
var time = Stopwatch.StartNew();

Expand Down
2 changes: 1 addition & 1 deletion src/Sir.Search/Models/NGramModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ namespace Sir.Search
public class NGramModel : DistanceCalculator, IModel<string>
{
public double IdenticalAngle => 0.95d;
public double FoldAngle => 0.75d;
public double FoldAngle => 0.45d;
public override int NumOfDimensions { get; }

private readonly BagOfCharsModel _wordTokenizer;
Expand Down
4 changes: 2 additions & 2 deletions src/Sir.Wikipedia/IndexWikipediaCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ namespace Sir.Wikipedia
/// https://dumps.wikimedia.org/other/cirrussearch/current/enwiki-20201026-cirrussearch-content.json.gz
/// </summary>
/// <example>
/// indexwikipedia --dataDirectory c:\data\resin --file d:\enwiki-20201026-cirrussearch-content.json.gz --collection wikipedia
/// indexwikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --file d:\enwiki-20211122-cirrussearch-content.json.gz --collection wikipedia --skip 0 --take 1000
/// </example>
public class IndexWikipediaCommand : ICommand
{
Expand All @@ -35,7 +35,7 @@ public void Run(IDictionary<string, string> args, ILogger logger)
if (take == 0)
take = int.MaxValue;

var model = new BagOfCharsModel();
var model = new NGramModel(new BagOfCharsModel());
var payload = WikipediaHelper.Read(fileName, skip, take, fieldsOfInterest);

using (var sessionFactory = new Database(logger))
Expand Down

0 comments on commit b86d2fe

Please sign in to comment.