Skip to content

Commit

Permalink
Expose searchBindings method on documents
Browse files Browse the repository at this point in the history
This allows HDT documents to return bindings objects instead of triples.
This could result in better performance if the end goal is to create
bindings objects, which often is the case in query engines.
  • Loading branch information
rubensworks committed Feb 11, 2025
1 parent 1e44567 commit e0e92ad
Show file tree
Hide file tree
Showing 10 changed files with 1,024 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .eslintrc
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@
key-spacing: 0,
lines-around-comment: 2,
linebreak-style: 2,
max-nested-callbacks: [2, 2],
max-nested-callbacks: [2, 3],
new-cap: 2,
new-parens: 2,
newline-after-var: 0,
Expand Down
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,37 @@ hdt.fromFile('./test/test.hdt')
});
```

### Searching for bindings matching a pattern
Search for [bindings](https://rdf.js.org/query-spec/#bindings-interface) with `searchBindings`,
which takes [bindingsFactory](https://rdf.js.org/query-spec/#bindingsfactory-interface), subject, predicate, object, and options arguments.
Subject, predicate, and object can be IRIs, literals, or variables,
[represented as RDF/JS terms](https://rdf.js.org/data-model-spec/#term-interface).
If any of these parameters is a variable, it is considered a wildcard.
Optionally, an offset and limit can be passed in an options object,
selecting only the specified subset.

The promise resolves to an object with an array of bindings (`bindings`), the total number of expected bindings for the pattern (`totalCount`),
and whether the total count is exact or an estimate (`hasExactCount`).

If variables are reused across terms, this library will make sure to only return bindings when matches for those variables are equal.

```JavaScript
const DF = new (require('rdf-data-factory').DataFactory)();
const BF = new (require('@comunica/utils-bindings-factory').BindingsFactory)(DF);

var doc;
hdt.fromFile('./test/test.hdt')
.then(function(hdtDocument) {
doc = hdtDocument;
return doc.searchBindings(BF, DF.namedNode('http://example.org/s1'), DF.variable('p'), DF.variable('o'), { offset: 0, limit: 10 })
})
.then(function(result) {
console.log('Approximately ' + result.totalCount + ' bindings match the pattern.');
result.bindings.forEach(function (binding) { console.log(binding.toString()); });
return doc.close();
});
```

### Search terms starting with a prefix
Find terms (literals and IRIs) that start with a given prefix.

Expand Down
140 changes: 140 additions & 0 deletions lib/HdtDocument.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ const Nan::Persistent<Function>& HdtDocument::GetConstructor() {
constructorTemplate->InstanceTemplate()->SetInternalFieldCount(1);
// Create prototype
Nan::SetPrototypeMethod(constructorTemplate, "_searchTriples", SearchTriples);
Nan::SetPrototypeMethod(constructorTemplate, "_searchBindings", SearchBindings);
Nan::SetPrototypeMethod(constructorTemplate, "_searchLiterals", SearchLiterals);
Nan::SetPrototypeMethod(constructorTemplate, "_searchTerms", SearchTerms);
Nan::SetPrototypeMethod(constructorTemplate, "_fetchDistinctTerms", FetchDistinctTerms);
Expand Down Expand Up @@ -234,6 +235,141 @@ NAN_METHOD(HdtDocument::SearchTriples) {
new Nan::Callback(info[5].As<Function>()), info.This()));
}

/******** HdtDocument#_searchBindings ********/

class SearchBindingsWorker : public Nan::AsyncWorker {
HdtDocument* document;
// JavaScript function arguments
string subject, predicate, object;
uint32_t offset, limit;
// Callback return values
vector<TripleID> triples;
map<unsigned int, string> subjects, predicates, objects;
uint32_t totalCount;
bool hasExactCount;
bool varS, varP, varO;

public:
SearchBindingsWorker(HdtDocument* document, char* subject, char* predicate, char* object,
uint32_t offset, uint32_t limit, Nan::Callback* callback, Local<Object> self)
: Nan::AsyncWorker(callback),
document(document), subject(subject), predicate(predicate), object(object),
offset(offset), limit(limit), totalCount(0) {
SaveToPersistent(SELF, self);
};

void Execute() {
IteratorTripleID* it = NULL;
try {
// Determine which terms are variables
varS = isVariable(subject);
varP = isVariable(predicate);
varO = isVariable(object);

// Prepare the triple pattern
Dictionary* dict = document->GetHDT()->getDictionary();
TripleString triple(varS ? "" : subject, varP ? "" : predicate, varO ? "" : toHdtLiteral(object));
TripleID tripleId;
dict->tripleStringtoTripleID(triple, tripleId);
// If any of the components does not exist, there are no matches
if ((!varS && subject[0] && !tripleId.getSubject()) ||
(!varP && predicate[0] && !tripleId.getPredicate()) ||
(!varO && object[0] && !tripleId.getObject())) {
hasExactCount = true;
return;
}

// Estimate the total number of triples
it = document->GetHDT()->getTriples()->search(tripleId);
totalCount = it->estimatedNumResults();
hasExactCount = it->numResultEstimation() == EXACT;

// Go to the right offset
if (it->canGoTo())
try { it->skip(offset), offset = 0; }
catch (const runtime_error error) { /* invalid offset */ }
else
while (offset && it->hasNext()) it->next(), offset--;

// Add matching triples to the result vector
if (!offset) {
while (it->hasNext() && triples.size() < limit) {
TripleID& triple = *it->next();
triples.push_back(triple);
if (varS && !subjects.count(triple.getSubject())) {
subjects[triple.getSubject()] = dict->idToString(triple.getSubject(), SUBJECT);
}
if (varP && !predicates.count(triple.getPredicate())) {
predicates[triple.getPredicate()] = dict->idToString(triple.getPredicate(), PREDICATE);
}
if (varO && !objects.count(triple.getObject())) {
string object(dict->idToString(triple.getObject(), OBJECT));
objects[triple.getObject()] = fromHdtLiteral(object);
}
}
}
}
catch (const runtime_error error) { SetErrorMessage(error.what()); }
if (it)
delete it;
}

void HandleOKCallback() {
Nan::HandleScope scope;
// Convert the triple components into strings
map<unsigned int, string>::const_iterator it;
map<unsigned int, Local<String> > subjectStrings, predicateStrings, objectStrings;
for (it = subjects.begin(); it != subjects.end(); it++)
subjectStrings[it->first] = Nan::New(it->second.c_str()).ToLocalChecked();
for (it = predicates.begin(); it != predicates.end(); it++)
predicateStrings[it->first] = Nan::New(it->second.c_str()).ToLocalChecked();
for (it = objects.begin(); it != objects.end(); it++)
objectStrings[it->first] = Nan::New(it->second.c_str()).ToLocalChecked();

// Convert the triples into a double JavaScript array
uint32_t count = 0;
Local<Array> bindingsArray = Nan::New<Array>(triples.size());
uint32_t variables = (varS ? 1 : 0) + (varP ? 1 : 0) + (varO ? 1 : 0);
for (vector<TripleID>::const_iterator it = triples.begin(); it != triples.end(); it++) {
uint32_t countInner = 0;
Local<Object> bindingsArrayInner = Nan::New<Array>(variables);
if (varS) {
Nan::Set(bindingsArrayInner, countInner++, subjectStrings[it->getSubject()]);
}
if (varP) {
Nan::Set(bindingsArrayInner, countInner++, predicateStrings[it->getPredicate()]);
}
if (varO) {
Nan::Set(bindingsArrayInner, countInner++, objectStrings[it->getObject()]);
}
Nan::Set(bindingsArray, count++, bindingsArrayInner);
}

// Send the JavaScript array through the callback
const unsigned argc = 4;
Local<Value> argv[argc] = { Nan::Null(), bindingsArray,
Nan::New<Integer>((uint32_t)totalCount),
Nan::New<Boolean>((bool)hasExactCount) };
callback->Call(Nan::To<v8::Object>(GetFromPersistent(SELF)).ToLocalChecked(), argc, argv, async_resource);
}

void HandleErrorCallback() {
Nan::HandleScope scope;
Local<Value> argv[] = { Exception::Error(Nan::New(ErrorMessage()).ToLocalChecked()) };
callback->Call(Nan::To<v8::Object>(GetFromPersistent(SELF)).ToLocalChecked(), 1, argv, async_resource);
}
};

// Looks up a triple pattern in the document and reports the matches as bindings.
// JavaScript signature: HdtDocument#_searchBindings(subject, predicate, object, offset, limit, callback)
// The search itself is delegated to a SearchBindingsWorker queued via Nan::AsyncQueueWorker.
NAN_METHOD(HdtDocument::SearchBindings) {
  assert(info.Length() == 6);

  // The worker copies these strings in its constructor,
  // so the UTF-8 buffers only need to live until the end of this function
  Nan::Utf8String subjectArg(info[0]);
  Nan::Utf8String predicateArg(info[1]);
  Nan::Utf8String objectArg(info[2]);
  const uint32_t offsetArg = Nan::To<uint32_t>(info[3]).FromJust();
  const uint32_t limitArg = Nan::To<uint32_t>(info[4]).FromJust();
  Nan::Callback* callbackArg = new Nan::Callback(info[5].As<Function>());

  SearchBindingsWorker* worker = new SearchBindingsWorker(
    Unwrap<HdtDocument>(info.This()),
    *subjectArg, *predicateArg, *objectArg,
    offsetArg, limitArg, callbackArg, info.This());
  Nan::AsyncQueueWorker(worker);
}



/******** HdtDocument#_searchLiterals ********/
Expand Down Expand Up @@ -612,6 +748,10 @@ NAN_PROPERTY_GETTER(HdtDocument::Closed) {
// "literal"^^<http://example.org/datatype>
// The functions below convert when needed.

// Tells whether a term string denotes a variable, i.e. whether it starts with '?'
bool isVariable(string& term) {
  // Comparing the (at most one character long) prefix also covers the empty string
  const bool startsWithQuestionMark = term.compare(0, 1, "?") == 0;
  return startsWithQuestionMark;
}

// Converts a JavaScript literal to an HDT literal
string& toHdtLiteral(string& literal) {
Expand Down
4 changes: 4 additions & 0 deletions lib/HdtDocument.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class HdtDocument : public node::ObjectWrap {

// HdtDocument#_searchTriples(subject, predicate, object, offset, limit, callback, self)
static NAN_METHOD(SearchTriples);
// HdtDocument#_searchBindings(subject, predicate, object, offset, limit, callback)
static NAN_METHOD(SearchBindings);
// HdtDocument#_searchLiterals(substring, offset, limit, callback, self)
static NAN_METHOD(SearchLiterals);
// HdtDocument#_searchTerms(prefix, limit, position, callback)
Expand All @@ -50,6 +52,8 @@ class HdtDocument : public node::ObjectWrap {
static NAN_PROPERTY_GETTER(Closed);
};

// Checks whether a term string denotes a variable (i.e. starts with '?')
bool isVariable(std::string& term);
// Converts a JavaScript literal to an HDT literal
std::string& toHdtLiteral(std::string& literal);
// Converts an HDT literal to a JavaScript literal
Expand Down
79 changes: 79 additions & 0 deletions lib/hdt.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ function isValidHdtTerm(term) {
return term && term.termType && (term.termType === 'Literal' || term.termType === 'BlankNode' || term.termType === 'NamedNode');
}

// Checks whether a term may be used in searchBindings:
// anything isValidHdtTerm accepts, plus RDF/JS terms with termType 'Variable'.
function isValidHdtTermForSearchBindings(term) {
  const acceptedAsTripleTerm = isValidHdtTerm(term);
  if (acceptedAsTripleTerm)
    return acceptedAsTripleTerm;
  return term && term.termType === 'Variable';
}

// Searches the document for triples with the given subject, predicate, and object.
HdtDocumentPrototype.searchTriples = function (subject, predicate, object, options) {
if (this.closed) return closedError;
Expand All @@ -29,6 +33,80 @@ HdtDocumentPrototype.searchTriples = function (subject, predicate, object, optio
});
};

// Searches the document for triples with the given subject, predicate, and object,
// and returns them as bindings.
//
// - bindingsFactory: object whose `bindings(entries)` method creates one bindings object
//   from an array of [variable, term] pairs.
// - subject, predicate, object: RDF/JS terms; terms with termType 'Variable' act as wildcards.
// - options: optional object from which offset and limit are parsed.
//
// Returns a promise for `{ bindings, totalCount, hasExactCount }`.
// Throws synchronously if one of the terms is invalid.
HdtDocumentPrototype.searchBindings = function (bindingsFactory, subject, predicate, object, options) {
  if (this.closed) return closedError;
  // Validate each position separately so the error names the term that is actually invalid
  if (!isValidHdtTermForSearchBindings(subject)) throw new Error('Passed invalid subject term');
  if (!isValidHdtTermForSearchBindings(predicate)) throw new Error('Passed invalid predicate term');
  if (!isValidHdtTermForSearchBindings(object)) throw new Error('Passed invalid object term');
  options = options || {};
  return new Promise((resolve, reject) => {
    // Collect variable terms, in subject, predicate, object order
    // (the same order in which the native binding emits values)
    const variables = [];
    if (subject.termType === 'Variable') variables.push(subject);
    if (predicate.termType === 'Variable') variables.push(predicate);
    if (object.termType === 'Variable') variables.push(object);

    // Check if we need to do post-filtering for overlapping variables:
    // filterIndexes[i] lists the later positions that use the same variable as position i
    let shouldFilterIndexes = false;
    const filterIndexes = variables.map((variable, i) => {
      const equalVariables = [];
      for (let j = i + 1; j < variables.length; j++) {
        if (variable.equals(variables[j])) {
          equalVariables.push(j);
          shouldFilterIndexes = true;
        }
      }
      return equalVariables;
    });

    // If we have offset and limit with overlapping variables, they must be handled in JS-land,
    // because they have to apply after filtering.
    const offset = parseOffset(options);
    const limit = parseLimit(options);

    this._searchBindings(termToString(subject), termToString(predicate), termToString(object),
      shouldFilterIndexes ? 0 : offset, shouldFilterIndexes ? MAX : limit,
      (err, bindingsRaw, totalCount, hasExactCount) => {
        if (err)
          return reject(err);

        // Anything thrown below (e.g. by the bindings factory) must reject the promise
        // rather than escape from the native callback
        try {
          // If we had overlapping variables, potentially filter bindings
          if (shouldFilterIndexes) {
            // Keep only bindings in which every reused variable has one single value
            bindingsRaw = bindingsRaw.filter(binding => {
              for (let i = 0; i < binding.length; i++) {
                const equalIndexes = filterIndexes[i];
                if (equalIndexes) {
                  for (const j of equalIndexes) {
                    if (binding[i] !== binding[j]) {
                      totalCount--;
                      return false;
                    }
                  }
                }
              }
              return true;
            });

            // Apply offsets and limits in JS-land
            if (offset > 0 || limit < MAX) {
              hasExactCount = true;
              totalCount = bindingsRaw.length;
              bindingsRaw = bindingsRaw.slice(offset, offset + limit);
            }
          }

          // Create bindings objects
          const bindings = bindingsRaw.map(b => bindingsFactory.bindings(b.map((value, i) => [variables[i], stringToTerm(value)])));

          return resolve({
            bindings,
            totalCount,
            hasExactCount,
          });
        }
        catch (error) {
          return reject(error);
        }
      });
  });
};

// Gives an approximate number of matches of triples with the given subject, predicate, and object.
HdtDocumentPrototype.countTriples = function (subject, predicate, object) {
return this.search(subject, predicate, object, { offset: 0, limit: 0 });
Expand Down Expand Up @@ -152,6 +230,7 @@ module.exports = {
// Document the features of the HDT file
document.features = Object.freeze({
searchTriples: true, // supported by default
searchBindings: true, // supported by default
countTriples: true, // supported by default
searchLiterals: !!(document._features & 1),
readHeader: true, // supported by default
Expand Down
Loading

0 comments on commit e0e92ad

Please sign in to comment.