Skip to content

Commit

Permalink
Chore: refine reject domainset building
Browse files Browse the repository at this point in the history
  • Loading branch information
SukkaW committed Jan 28, 2024
1 parent f51de78 commit 504cd36
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 57 deletions.
2 changes: 1 addition & 1 deletion Build/build-microsoft-cdn.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ export const getMicrosoftCdnRulesetPromise = createMemoizedPromise(async () => {
trie.add(domain);
}
}
return new Set(PROBE_DOMAINS.flatMap(domain => trie.find(domain, false)));
return new Set(PROBE_DOMAINS.flatMap(domain => trie.find(domain)));
});

// Second trie is to remove blacklisted domains
Expand Down
70 changes: 40 additions & 30 deletions Build/build-reject-domainset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,48 +79,58 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
console.log(`Import ${previousSize} rules from Hosts / AdBlock Filter Rules & reject_sukka.conf!`);

// Dedupe domainSets
await span.traceChild('dedupe from black keywords/suffixes').traceAsyncFn(async () => {
/** Collect DOMAIN-SUFFIX from non_ip/reject.conf for deduplication */
await span.traceChild('dedupe from black keywords/suffixes').traceAsyncFn(async (childSpan) => {
/** Collect DOMAIN-SUFFIX from non_ip/reject.conf for deduplication */
const domainSuffixSet = new Set<string>();
/** Collect DOMAIN-KEYWORD from non_ip/reject.conf for deduplication */
const domainKeywordsSet = new Set<string>();

for await (const line of readFileByLine(path.resolve(import.meta.dir, '../Source/non_ip/reject.conf'))) {
const [type, keyword] = line.split(',');
await childSpan.traceChild('collect keywords/suffixes').traceAsyncFn(async () => {
for await (const line of readFileByLine(path.resolve(import.meta.dir, '../Source/non_ip/reject.conf'))) {
const [type, value] = line.split(',');

if (type === 'DOMAIN-KEYWORD') {
domainKeywordsSet.add(keyword.trim());
} else if (type === 'DOMAIN-SUFFIX') {
domainSuffixSet.add(keyword.trim());
if (type === 'DOMAIN-KEYWORD') {
domainKeywordsSet.add(value.trim());
} else if (type === 'DOMAIN-SUFFIX') {
domainSuffixSet.add(value.trim());
}
}
}

const trie1 = createTrie(domainSets);

domainSuffixSet.forEach(suffix => {
trie1.find(suffix, true).forEach(f => domainSets.delete(f));
});
filterRuleWhitelistDomainSets.forEach(suffix => {
trie1.find(suffix, true).forEach(f => domainSets.delete(f));

if (suffix[0] === '.') {
// handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
domainSets.delete(suffix.slice(1));
} else {
// If `g.msn.com` is whitelisted, then `.g.msn.com` should be removed from domain set
domainSets.delete(`.${suffix}`);
}

// Remove as many domains as possible from domainSets before creating trie
SetHelpers.subtract(domainSets, domainSuffixSet);
SetHelpers.subtract(domainSets, filterRuleWhitelistDomainSets);

childSpan.traceChild('dedupe from white/suffixes').traceSyncFn(() => {
const trie = createTrie(domainSets);

domainSuffixSet.forEach(suffix => {
trie.remove(suffix);
trie.substractSetInPlaceFromFound(suffix, domainSets);
});
filterRuleWhitelistDomainSets.forEach(suffix => {
trie.substractSetInPlaceFromFound(suffix, domainSets);

if (suffix[0] === '.') {
// handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
domainSets.delete(suffix.slice(1));
} else {
// If `g.msn.com` is whitelisted, then `.g.msn.com` should be removed from domain set
domainSets.delete(`.${suffix}`);
}
});
});

// remove pre-defined enforced blacklist from whitelist
const kwfilter = createKeywordFilter(domainKeywordsSet);
childSpan.traceChild('dedupe from black keywords').traceSyncFn(() => {
const kwfilter = createKeywordFilter(domainKeywordsSet);

for (const domain of domainSets) {
for (const domain of domainSets) {
// Remove keyword
if (kwfilter(domain)) {
domainSets.delete(domain);
if (kwfilter(domain)) {
domainSets.delete(domain);
}
}
}
});

console.log(`Deduped ${previousSize} - ${domainSets.size} = ${previousSize - domainSets.size} from black keywords and suffixes!`);
});
Expand Down
10 changes: 5 additions & 5 deletions Build/lib/create-file.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ export async function compareAndWriteFile(span: Span, linesA: string[], filePath
}
if (
lineA[0] === '/'
&& lineA[1] === '/'
&& lineA[3] === '#'
&& lineB[0] === '/'
&& lineB[1] === '/'
&& lineB[3] === '#'
&& lineA[1] === '/'
&& lineB[0] === '/'
&& lineB[1] === '/'
&& lineA[3] === '#'
&& lineB[3] === '#'
) {
continue;
}
Expand Down
2 changes: 1 addition & 1 deletion Build/lib/domain-deduper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ export function domainDeduper(inputDomains: string[], toArray = true): string[]
continue;
}

const found = trie.find(d, false);
const found = trie.find(d);

for (let j = 0, len2 = found.length; j < len2; j++) {
sets.delete(found[j]);
Expand Down
3 changes: 2 additions & 1 deletion Build/lib/get-phishing-domains.ts
Original file line number Diff line number Diff line change
Expand Up @@ -104,11 +104,12 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
return parentSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
const white = WHITELIST_DOMAIN[i];
const found = trieForRemovingWhiteListed.find(`.${white}`, true);
const found = trieForRemovingWhiteListed.find(`.${white}`);
for (let j = 0, len2 = found.length; j < len2; j++) {
domainSet.delete(found[j]);
}
domainSet.delete(white);
domainSet.delete(`.${white}`);
}
});
});
Expand Down
16 changes: 0 additions & 16 deletions Build/lib/random-int.bench.ts

This file was deleted.

81 changes: 78 additions & 3 deletions Build/lib/trie.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
* Suffix Trie based on Mnemonist Trie
*/

import { Trie } from 'mnemonist';

export const SENTINEL = Symbol('SENTINEL');

/**
 * A single node of the suffix trie.
 *
 * The node is a Map from one character to the child node reached through that
 * character, and it additionally carries a boolean flag stored under the
 * SENTINEL symbol. The flag is true when the path from the root to this node
 * spells a complete stored item.
 */
type TrieNode = {
[SENTINEL]: boolean
} & Map<string, TrieNode>;

const createNode = (): TrieNode => {
const map = new Map<string, TrieNode>();
const node = map as TrieNode;
const node = new Map<string, TrieNode>() as TrieNode;
node[SENTINEL] = false;
return node;
};
Expand Down Expand Up @@ -65,7 +66,7 @@ export const createTrie = (from?: string[] | Set<string>) => {
/**
* Method used to retrieve every item in the trie that ends with the given suffix.
*/
const find = (inputSuffix: string, /** @default true */ includeEqualWithSuffix = true): string[] => {
const find = (inputSuffix: string, /** @default false */ includeEqualWithSuffix = false): string[] => {
let node: TrieNode | undefined = root;
let token: string;

Expand Down Expand Up @@ -103,6 +104,43 @@ export const createTrie = (from?: string[] | Set<string>) => {
return matches;
};

/**
 * Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place.
 *
 * An item that is exactly equal to `inputSuffix` is left untouched; only
 * longer items ending with `inputSuffix` are deleted from `set`.
 *
 * @param inputSuffix - the suffix to look up in the trie
 * @param set - the set to delete the matched items from (mutated in place)
 */
const substractSetInPlaceFromFound = (inputSuffix: string, set: Set<string>): void => {
  let node: TrieNode | undefined = root;
  let token: string;

  // The trie is keyed from the last character backwards, so walk the suffix in reverse
  for (let i = inputSuffix.length - 1; i >= 0; i--) {
    token = inputSuffix[i];

    node = node.get(token);
    if (!node) {
      // Nothing stored in the trie ends with this suffix, so there is nothing to remove
      return;
    }
  }

  // Performing DFS from the node that represents the suffix.
  // Both stacks are pushed and popped in lockstep: suffixStack[i] is the
  // string spelled by the path that leads to nodeStack[i].
  const nodeStack: TrieNode[] = [node];
  const suffixStack: string[] = [inputSuffix];

  while (nodeStack.length) {
    const suffix = suffixStack.pop()!;
    node = nodeStack.pop()!;

    // Only complete items count, and the suffix itself is deliberately kept
    if (node[SENTINEL] && suffix !== inputSuffix) {
      set.delete(suffix);
    }

    node.forEach((childNode, k) => {
      nodeStack.push(childNode);
      // Children extend the string to the left
      suffixStack.push(k + suffix);
    });
  }
};

/**
* Method used to delete a prefix from the trie.
*/
Expand Down Expand Up @@ -173,13 +211,50 @@ export const createTrie = (from?: string[] | Set<string>) => {
from.forEach(add);
}

/**
 * Collect every item currently stored in the trie.
 *
 * The trie is walked depth-first from the root with an explicit stack, so the
 * returned order follows the traversal and is not the insertion order.
 *
 * @returns an array containing each stored item once
 */
const dump = (): string[] => {
  // Both stacks are pushed and popped in lockstep: suffixStack[i] is the
  // string spelled by the path that leads to nodeStack[i].
  const nodeStack: TrieNode[] = [root];
  const suffixStack: string[] = [''];

  const results: string[] = [];

  while (nodeStack.length) {
    const currentNode = nodeStack.pop()!;
    const currentSuffix = suffixStack.pop()!;

    // Only nodes flagged with SENTINEL terminate a stored item
    if (currentNode[SENTINEL]) {
      results.push(currentSuffix);
    }

    // Descend from the node that was just popped (not from the root), and
    // extend the string accumulated so far to the left.
    currentNode.forEach((childNode, k) => {
      nodeStack.push(childNode);
      suffixStack.push(k + currentSuffix);
    });
  }

  return results;
};

return {
add,
contains,
find,
substractSetInPlaceFromFound,
remove,
delete: remove,
has,
dump,
get size() {
return size;
},
Expand Down

0 comments on commit 504cd36

Please sign in to comment.