Skip to content

Commit

Permalink
Perf: use filesystem cache
Browse files Browse the repository at this point in the history
  • Loading branch information
SukkaW committed Dec 30, 2023
1 parent 6ed3695 commit 85801b1
Show file tree
Hide file tree
Showing 9 changed files with 144 additions and 73 deletions.
72 changes: 42 additions & 30 deletions Build/build-speedtest-domainset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,57 +12,69 @@ import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
import picocolors from 'picocolors';
import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
import { processLine } from './lib/process-line';
import { TTL, deserializeArray, fsCache, serializeArray } from './lib/cache-filesystem';

const s = new Sema(2);

const latestTopUserAgentsPromise = fetchWithRetry('https://unpkg.com/top-user-agents@latest/index.json')
.then(res => res.json<string[]>());
.then(res => res.json<string[]>()).then(userAgents => userAgents.filter(ua => ua.startsWith('Mozilla/5.0 ')));

const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>> => {
const topUserAgents = (await Promise.all([
latestTopUserAgentsPromise,
s.acquire()
]))[0];
const topUserAgents = await latestTopUserAgentsPromise;

const url = `https://www.speedtest.net/api/js/servers?engine=js&search=${keyword}&limit=100`;

try {
const randomUserAgent = topUserAgents[Math.floor(Math.random() * topUserAgents.length)];
const key = `fetch speedtest endpoints: ${keyword}`;
console.log(key);
console.time(key);

const res = await fetchWithRetry(`https://www.speedtest.net/api/js/servers?engine=js&search=${keyword}&limit=100`, {
headers: {
dnt: '1',
Referer: 'https://www.speedtest.net/',
accept: 'application/json, text/plain, */*',
'User-Agent': randomUserAgent,
'Accept-Language': 'en-US,en;q=0.9',
...(randomUserAgent.includes('Chrome')
? {
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'Sec-Gpc': '1'
const json = await fsCache.apply(
url,
() => s.acquire().then(() => fetchWithRetry(url, {
headers: {
dnt: '1',
Referer: 'https://www.speedtest.net/',
accept: 'application/json, text/plain, */*',
'User-Agent': randomUserAgent,
'Accept-Language': 'en-US,en;q=0.9',
...(randomUserAgent.includes('Chrome')
? {
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'Sec-Gpc': '1'
}
: {})
},
signal: AbortSignal.timeout(1000 * 4),
retry: {
retries: 2
}
})).then(r => r.json<Array<{ url: string }>>()).then(data => data.reduce<string[]>(
(prev, cur) => {
const hn = tldts.getHostname(cur.url, { detectIp: false });
if (hn) {
prev.push(hn);
}
: {})
},
signal: AbortSignal.timeout(1000 * 4),
retry: {
retries: 2
return prev;
}, []
)).finally(() => s.release()),
{
ttl: TTL.ONE_WEEK(),
serializer: serializeArray,
deserializer: deserializeArray
}
});

const json = await res.json<Array<{ url: string }>>();
);

console.timeEnd(key);

return json.map(({ url }) => tldts.getHostname(url, { detectIp: false }));
return json;
} catch (e) {
console.log(e);
return [];
} finally {
s.release();
}
};

Expand Down
8 changes: 4 additions & 4 deletions Build/download-mock-assets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ import path from 'path';
import { fetchWithRetry } from './lib/fetch-retry';

const ASSETS_LIST = {
'www-google-analytics-com_ga.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/google-analytics-ga.js',
'www-googletagservices-com_gpt.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/googletagservices-gpt.js',
'www-google-analytics-com_analytics.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/google-analytics.js',
'www-googlesyndication-com_adsbygoogle.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/googlesyndication-adsbygoogle.js'
'www-google-analytics-com_ga.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/google-analytics-ga.js',
'www-googletagservices-com_gpt.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/googletagservices-gpt.js',
'www-google-analytics-com_analytics.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/google-analytics.js',
'www-googlesyndication-com_adsbygoogle.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/googlesyndication-adsbygoogle.js'
} as const;

const mockDir = path.resolve(import.meta.dir, '../Mock');
Expand Down
17 changes: 16 additions & 1 deletion Build/download-publicsuffixlist.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,20 @@
import { TTL, fsCache } from './lib/cache-filesystem';
import { defaultRequestInit, fetchWithRetry } from './lib/fetch-retry';
import { createMemoizedPromise } from './lib/memo-promise';
import { traceAsync } from './lib/trace-runner';

export const getPublicSuffixListTextPromise = createMemoizedPromise(() => traceAsync('obtain public_suffix_list', () => fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => r.text())));
/**
 * Memoized getter for the raw text of the Public Suffix List.
 *
 * The download is wrapped in `traceAsync` under the label
 * 'obtain public_suffix_list' and goes through the filesystem cache,
 * keyed by the list URL.
 */
export const getPublicSuffixListTextPromise = createMemoizedPromise(() => {
  const publicSuffixListUrl = 'https://publicsuffix.org/list/public_suffix_list.dat';

  // Only invoked by the cache layer when it needs a fresh copy.
  const downloadListText = () => fetchWithRetry(publicSuffixListUrl, defaultRequestInit).then(resp => resp.text());

  return traceAsync(
    'obtain public_suffix_list',
    () => fsCache.apply(
      publicSuffixListUrl,
      downloadListText,
      {
        // https://github.com/publicsuffix/list/blob/master/.github/workflows/tld-update.yml
        // The upstream action runs every 24 hours, but the IANA list itself only
        // changes about every 7 days, so a (randomized) 3 day TTL is sufficient.
        ttl: TTL.THREE_DAYS()
      }
    )
  );
});
20 changes: 18 additions & 2 deletions Build/lib/cache-filesystem.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,28 @@ export class Cache {
}
}

// export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') });
export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') });
// process.on('exit', () => {
// fsCache.destroy();
// });

const separator = String.fromCharCode(0);
/** Returns a pseudo-random integer between `min` and `max`, both inclusive. */
const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min;

const HOUR_IN_MS = 60 * 60 * 1000;
const DAY_IN_MS = 24 * HOUR_IN_MS;

/** Roughly twelve hours: a random whole number of hours in [9, 14], in milliseconds. */
const twelveHoursTtl = () => randomInt(9, 14) * HOUR_IN_MS;

/**
 * Cache TTL factories, each returning a duration in milliseconds.
 *
 * Add some randomness to the cache ttl to avoid thundering herd: every call
 * picks a random whole number of hours/days around the nominal duration, so
 * entries cached together do not all expire at the same moment.
 */
export const TTL = {
  TWELVE_HOURS: twelveHoursTtl,
  /** @deprecated Misspelled name kept for existing callers; use `TWELVE_HOURS`. */
  TWLVE_HOURS: twelveHoursTtl,
  THREE_DAYS: () => randomInt(2, 4) * DAY_IN_MS,
  ONE_WEEK: () => randomInt(5, 8) * DAY_IN_MS,
  TWO_WEEKS: () => randomInt(12, 16) * DAY_IN_MS,
  TEN_DAYS: () => randomInt(9, 11) * DAY_IN_MS
};

// NUL is used as the item delimiter of the serialized form.
const separator = String.fromCharCode(0);
// const textEncoder = new TextEncoder();
// const textDecoder = new TextDecoder();
// export const serializeString = (str: string) => textEncoder.encode(str);
// export const deserializeString = (str: string) => textDecoder.decode(new Uint8Array(str.split(separator).map(Number)));

/** Joins the members of a Set into a single NUL-delimited string. */
export const serializeSet = (set: Set<string>) => Array.from(set).join(separator);
/**
 * Inverse of `serializeSet`.
 *
 * An empty string is read back as an empty Set: `''.split(separator)` would
 * otherwise yield `['']` and produce a Set holding one empty-string member.
 * (A Set containing only `''` also serializes to `''`, so it is likewise read
 * back as empty.)
 */
export const deserializeSet = (str: string): Set<string> => new Set<string>(str === '' ? [] : str.split(separator));
/** Joins the items of an array into a single NUL-delimited string. */
export const serializeArray = (arr: string[]) => arr.join(separator);
/**
 * Inverse of `serializeArray`.
 *
 * An empty string is read back as `[]` rather than `['']`, so an empty array
 * survives a serialize/deserialize round trip.
 */
export const deserializeArray = (str: string): string[] => (str === '' ? [] : str.split(separator));
37 changes: 19 additions & 18 deletions Build/lib/parse-filter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,15 @@ import { traceAsync } from './trace-runner';
import picocolors from 'picocolors';
import { normalizeDomain } from './normalize-domain';
import { fetchAssets } from './fetch-assets';
import { deserializeSet, fsCache, serializeSet } from './cache-filesystem';

const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
let foundDebugDomain = false;

export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, _ttl: number | null = null) {
return traceAsync(`- processDomainLists: ${domainListsUrl}`, /* () => fsCache.apply(
export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
return traceAsync(`- processDomainLists: ${domainListsUrl}`, () => fsCache.apply(
domainListsUrl,
*/async () => {
async () => {
const domainSets = new Set<string>();

for await (const line of await fetchRemoteTextByLine(domainListsUrl)) {
Expand All @@ -32,19 +33,19 @@ export function processDomainLists(domainListsUrl: string, includeAllSubDomain =
}

return domainSets;
});/* ,
},
{
ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: serializeSet,
deserializer: deserializeSet
}
)); */
));
}
export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, _ttl: number | null = null) {
return traceAsync(`- processHosts: ${hostsUrl}`, /* () => fsCache.apply(
export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) {
return traceAsync(`- processHosts: ${hostsUrl}`, () => fsCache.apply(
hostsUrl,
*/async () => {
async () => {
const domainSets = new Set<string>();

for await (const l of await fetchRemoteTextByLine(hostsUrl)) {
Expand Down Expand Up @@ -73,14 +74,14 @@ export function processHosts(hostsUrl: string, includeAllSubDomain = false, skip
console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));

return domainSets;
});
/* {
},
{
ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: serializeSet,
deserializer: deserializeSet
}
) */
));
}

// eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe?
Expand All @@ -95,15 +96,15 @@ const enum ParseType {
export async function processFilterRules(
filterRulesUrl: string,
fallbackUrls?: readonly string[] | undefined | null,
_ttl: number | null = null
ttl: number | null = null
): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, /* () => fsCache.apply<[
const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, () => fsCache.apply<[
white: string[],
black: string[],
warningMessages: string[]
]>(
filterRulesUrl,
*/async () => {
async () => {
const whitelistDomainSets = new Set<string>();
const blacklistDomainSets = new Set<string>();

Expand Down Expand Up @@ -168,7 +169,7 @@ export async function processFilterRules(
// TODO-SUKKA: add cache here
if (!fallbackUrls || fallbackUrls.length === 0) {
for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) {
// don't trim here
// don't trim here
lineCb(line);
}
} else {
Expand All @@ -191,14 +192,14 @@ export async function processFilterRules(
Array.from(blacklistDomainSets),
warningMessages
];
});
/* {
},
{
ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: JSON.stringify,
deserializer: JSON.parse
}
) */
));

warningMessages.forEach(msg => {
console.warn(
Expand Down
16 changes: 16 additions & 0 deletions Build/lib/random-int.bench.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import { bench, group, run } from 'mitata';
import { randomInt as nativeRandomInt } from 'crypto';

/** Math.random-based helper under test: pseudo-random integer in [min, max], both ends inclusive. */
const randomInt = (min: number, max: number) => {
  const span = max - min + 1;
  const offset = Math.floor(Math.random() * span);
  return offset + min;
};

// Micro-benchmark comparing Node's crypto.randomInt against the
// Math.random-based randomInt helper defined in this file.
// NOTE(review): crypto.randomInt(min, max) treats `max` as exclusive, while the
// local helper is inclusive, so the two calls below cover slightly different ranges.
group('random-int', () => {
bench('crypto.randomInt', () => {
nativeRandomInt(3, 7);
});

bench('Math.random', () => {
randomInt(3, 7);
});
});

// Execute the benchmarks registered above.
run();
46 changes: 28 additions & 18 deletions Build/lib/reject-data-source.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import { TTL } from './cache-filesystem';

export const HOSTS = [
['https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true],
['https://someonewhocares.org/hosts/hosts', true],
['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false],
['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true],
// no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl
['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false, false, TTL.THREE_DAYS()],
// has not been updated for more than a year, so we set a 14 days cache ttl
['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true, false, TTL.TWO_WEEKS()],
['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', false],
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false],
['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-Extension.txt', false],
// ad-wars is not actively maintained, so we set a 7 days cache ttl
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false, false, TTL.ONE_WEEK()],
['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true],
// CoinBlockerList
// Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 10 days cache ttl
['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, 10 * 24 * 60 * 60 * 1000],
// Although the hosts file is still actively maintained, the hosts_browser file has not been updated since 2021-07, so we set a 14 days cache ttl
['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, TTL.TWO_WEEKS()],
// Curben's UrlHaus Malicious URL Blocklist
// 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt',
// 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
Expand All @@ -21,23 +27,24 @@ export const HOSTS = [
// Curben's PUP Domains Blocklist
// 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
// 'https://pup-filter.pages.dev/pup-filter-agh.txt'
// The PUP filter has paused the update since 2023-05, so we set a 7 days cache ttl
['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, 7 * 24 * 60 * 60 * 1000]
// The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl
['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, TTL.TWO_WEEKS()]
] as const;

export const DOMAIN_LISTS = [
// BarbBlock
// The barbblock list has never been updated since 2019-05, so we set a 10 days cache ttl
['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, 10 * 24 * 60 * 60 * 1000],
// The barbblock list has not been updated since 2019-05, so we set a 14 days cache ttl
['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, TTL.TWO_WEEKS()],
// DigitalSide Threat-Intel - OSINT Hub
['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true],
// Update once per day
['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, 24 * 60 * 60 * 1000],
// AdGuard CNAME Filter Combined
// Update on a 7 days basis, so we add a 36 hours cache ttl
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000]
// Update on a 7 days basis, so we add a 3 days cache ttl
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, TTL.THREE_DAYS()]
] as const;

export const ADGUARD_FILTERS = [
Expand Down Expand Up @@ -130,14 +137,17 @@ export const ADGUARD_FILTERS = [
// GameConsoleAdblockList
'https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt',
// PiHoleBlocklist
// Update almost once per 3 months, let's set a 10 days cache ttl
[
'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt',
[
'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'
]
],
TTL.TEN_DAYS()
],
// Spam404
'https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt',
// Not actively maintained, let's use a 10 days cache ttl
['https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', null, TTL.TEN_DAYS()],
// Brave First Party & First Party CNAME
'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt'
] as const;
Expand Down
Binary file modified bun.lockb
Binary file not shown.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
"eslint": "^8.56.0",
"eslint-config-sukka": "4.1.10-beta.2",
"eslint-formatter-sukka": "4.1.9",
"mitata": "^0.1.6",
"typescript": "^5.3.3"
},
"resolutions": {
Expand Down

0 comments on commit 85801b1

Please sign in to comment.