Skip to content

Commit

Permalink
Perf: improve performance of reject suffix/keyword deduping
Browse files Browse the repository at this point in the history
  • Loading branch information
SukkaW committed Jan 21, 2024
1 parent 80deff8 commit 725f26b
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 56 deletions.
4 changes: 2 additions & 2 deletions Build/build-common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ async function transformDomainset(parentSpan: Span, sourcePath: string, relative
)
];

return createRuleset(
return span.traceAsyncFn(() => createRuleset(
span,
title,
description,
Expand All @@ -118,7 +118,7 @@ async function transformDomainset(parentSpan: Span, sourcePath: string, relative
'domainset',
path.resolve(outputSurgeDir, relativePath),
path.resolve(outputClashDir, `${relativePath.slice(0, -path.extname(relativePath).length)}.txt`)
);
));
}

/**
Expand Down
42 changes: 12 additions & 30 deletions Build/build-reject-domainset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ import path from 'path';
import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
import { createTrie } from './lib/trie';

import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST, DOMAIN_LISTS } from './lib/reject-data-source';
import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS } from './lib/reject-data-source';
import { createRuleset, compareAndWriteFile } from './lib/create-file';
import { processLine } from './lib/process-line';
import { domainDeduper } from './lib/domain-deduper';
import createKeywordFilter from './lib/aho-corasick';
import { readFileByLine } from './lib/fetch-text-by-line';
import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
import { sortDomains } from './lib/stable-sort-domain';
import { task } from './trace';
import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
Expand Down Expand Up @@ -63,25 +63,10 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
setAddFromArray(domainSets, purePhishingDomains);
}),
childSpan.traceChild('process reject_sukka.conf').traceAsyncFn(async () => {
for await (const l of readFileByLine(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf'))) {
const line = processLine(l);
if (!line) continue;
domainSets.add(line);
}
setAddFromArray(domainSets, await readFileIntoProcessedArray(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf')));
})
]);

// remove pre-defined enforced blacklist from whitelist
const trie0 = createTrie(filterRuleWhitelistDomainSets);

for (let i = 0, len1 = PREDEFINED_ENFORCED_BACKLIST.length; i < len1; i++) {
const enforcedBlack = PREDEFINED_ENFORCED_BACKLIST[i];
const found = trie0.find(enforcedBlack);
for (let j = 0, len2 = found.length; j < len2; j++) {
filterRuleWhitelistDomainSets.delete(found[j]);
}
}

return shouldStop;
});

Expand Down Expand Up @@ -116,25 +101,22 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
});
filterRuleWhitelistDomainSets.forEach(suffix => {
trie1.find(suffix, true).forEach(f => domainSets.delete(f));

if (suffix[0] === '.') {
// handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
domainSets.delete(suffix.slice(1));
} else {
// If `g.msn.com` is whitelisted, then `.g.msn.com` should be removed from domain set
domainSets.delete(`.${suffix}`);
}
});

// build the keyword matcher used below to drop domains that match any entry in domainKeywordsSet
const kwfilter = createKeywordFilter(domainKeywordsSet);

// handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
for (const domain of domainSets) {
if (domain[0] === '.') {
if (filterRuleWhitelistDomainSets.has(domain)) {
domainSets.delete(domain);
continue;
}
} else if (filterRuleWhitelistDomainSets.has(`.${domain}`)) {
domainSets.delete(domain);
continue;
}

// Remove keyword
if (kwfilter.search(domain)) {
if (kwfilter(domain)) {
domainSets.delete(domain);
}
}
Expand Down
6 changes: 1 addition & 5 deletions Build/lib/aho-corasick.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ const createKeywordFilter = (keys: string[] | Set<string>) => {

build();

const search = (text: string) => {
return (text: string) => {
let node: Node | undefined = root;

for (let i = 0, textLen = text.length; i < textLen; i++) {
Expand All @@ -96,10 +96,6 @@ const createKeywordFilter = (keys: string[] | Set<string>) => {

return false;
};

return {
search
};
};

export default createKeywordFilter;
36 changes: 21 additions & 15 deletions Build/lib/get-phishing-domains.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,17 +99,19 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
SetAdd(domainSet, domainSet2);
}

span.traceChild('whitelisting phishing domains').traceSyncFn(() => {
const trieForRemovingWhiteListed = createTrie(domainSet);

for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
const white = WHITELIST_DOMAIN[i];
const found = trieForRemovingWhiteListed.find(`.${white}`, true);
for (let j = 0, len2 = found.length; j < len2; j++) {
domainSet.delete(found[j]);
span.traceChild('whitelisting phishing domains').traceSyncFn((parentSpan) => {
const trieForRemovingWhiteListed = parentSpan.traceChild('create trie for whitelisting').traceSyncFn(() => createTrie(domainSet));

return parentSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
const white = WHITELIST_DOMAIN[i];
const found = trieForRemovingWhiteListed.find(`.${white}`, true);
for (let j = 0, len2 = found.length; j < len2; j++) {
domainSet.delete(found[j]);
}
domainSet.delete(white);
}
domainSet.delete(white);
}
});
});

const domainCountMap: Record<string, number> = {};
Expand Down Expand Up @@ -177,11 +179,15 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
}
});

const results = span.traceChild('get final phishing results').traceSyncFn(
() => Object.entries(domainCountMap)
.filter(entries => entries[1] >= 5)
.map(entries => entries[0])
);
const results = span.traceChild('get final phishing results').traceSyncFn(() => {
const results: string[] = [];
for (const domain in domainCountMap) {
if (domainCountMap[domain] > 5) {
results.push(domain);
}
}
return results;
});

return [results, domainSet] as const;
});
4 changes: 0 additions & 4 deletions Build/lib/reject-data-source.ts
Original file line number Diff line number Diff line change
Expand Up @@ -211,10 +211,6 @@ export const PREDEFINED_WHITELIST = [
'pstmrk.it'
];

export const PREDEFINED_ENFORCED_BACKLIST = [
'telemetry.mozilla.org'
];

export const PREDEFINED_ENFORCED_WHITELIST = [
'godaddysites.com',
'web.app',
Expand Down
4 changes: 4 additions & 0 deletions Source/non_ip/reject.conf
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ DOMAIN-SUFFIX,pantheonsite.io
DOMAIN-SUFFIX,sitebeat.crazydomains.com
# >> Snowplow Analytics (publicsuffix)
DOMAIN-SUFFIX,try-snowplow.com
# >> Mozilla Telemetry (Enforcing)
DOMAIN-SUFFIX,telemetry-coverage.mozilla.org
DOMAIN-SUFFIX,telemetry.mozilla.org
DOMAIN-SUFFIX,incoming-telemetry.thunderbird.net

# >> Phishing
DOMAIN-SUFFIX,gofenews.com
Expand Down

0 comments on commit 725f26b

Please sign in to comment.