From 6895c7ee020127c759c70858b6dec565e706d604 Mon Sep 17 00:00:00 2001 From: SukkaW <isukkaw@gmail.com> Date: Sat, 11 Jan 2025 22:46:13 +0800 Subject: [PATCH] Refactor: fetch directly --- Build/build-reject-ip-list.ts | 29 ++-- Build/lib/cache-filesystem.ts | 256 +--------------------------------- Build/lib/fetch-assets.ts | 11 +- Build/lib/parse-filter.ts | 223 +++++++++++++---------------- 4 files changed, 121 insertions(+), 398 deletions(-) diff --git a/Build/build-reject-ip-list.ts b/Build/build-reject-ip-list.ts index 00c7a3de7..ed659ce2c 100644 --- a/Build/build-reject-ip-list.ts +++ b/Build/build-reject-ip-list.ts @@ -4,11 +4,11 @@ import { createReadlineInterfaceFromResponse, readFileIntoProcessedArray } from import { task } from './trace'; import { SHARED_DESCRIPTION } from './constants/description'; import { isProbablyIpv4, isProbablyIpv6 } from 'foxts/is-probably-ip'; -import { fsFetchCache, getFileContentHash } from './lib/cache-filesystem'; import { processLine } from './lib/process-line'; import { RulesetOutput } from './lib/create-file'; import { SOURCE_DIR } from './constants/dir'; import { $$fetch } from './lib/fetch-retry'; +import { fetchAssetsWithout304 } from './lib/fetch-assets'; const BOGUS_NXDOMAIN_URL = 'https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/bogus-nxdomain.china.conf'; const getBogusNxDomainIPsPromise: Promise<[ipv4: string[], ipv6: string[]]> = $$fetch(BOGUS_NXDOMAIN_URL).then(async (resp) => { @@ -37,26 +37,17 @@ const BOTNET_FILTER_MIRROR_URL = [ // https://curbengh.github.io/malware-filter/botnet-filter-dnscrypt-blocked-ips.txt ]; -const getBotNetFilterIPsPromise = fsFetchCache.applyWithHttp304AndMirrors<[ipv4: string[], ipv6: string[]]>( - BOTNET_FILTER_URL, - BOTNET_FILTER_MIRROR_URL, - getFileContentHash(__filename), - (text) => text.split('\n').reduce<[ipv4: string[], ipv6: string[]]>((acc, cur) => { - const ip = processLine(cur); - if (ip) { - if (isProbablyIpv4(ip)) { - acc[0].push(ip); - } else if (isProbablyIpv6(ip)) { - acc[1].push(ip); - } +const getBotNetFilterIPsPromise: Promise<[ipv4: string[], ipv6: string[]]> = fetchAssetsWithout304(BOTNET_FILTER_URL, BOTNET_FILTER_MIRROR_URL).then(text => text.split('\n').reduce<[ipv4: string[], ipv6: string[]]>((acc, cur) => { + const ip = processLine(cur); + if (ip) { + if (isProbablyIpv4(ip)) { + acc[0].push(ip); + } else if (isProbablyIpv6(ip)) { + acc[1].push(ip); } - return acc; - }, [[], []]), - { - serializer: JSON.stringify, - deserializer: JSON.parse } -); + return acc; +}, [[], []])); export const buildRejectIPList = task(require.main === module, __filename)(async (span) => { const [bogusNxDomainIPs, botNetIPs] = await Promise.all([ diff --git a/Build/lib/cache-filesystem.ts b/Build/lib/cache-filesystem.ts index 87e2d457d..181879e88 100644 --- a/Build/lib/cache-filesystem.ts +++ b/Build/lib/cache-filesystem.ts @@ -4,20 +4,12 @@ import os from 'node:os'; import path from 'node:path'; import { mkdirSync } from 'node:fs'; import picocolors from 'picocolors'; -import { mergeHeaders } from 'foxts/merge-headers'; -import { headersToObject } from 'foxts/headers-to-object'; -import { identity } from 'foxts/identity'; import { fastStringArrayJoin } from 'foxts/fast-string-array-join'; import { performance } from 'node:perf_hooks'; import fs from 'node:fs'; import { simpleStringHash } from 'foxts/simple-string-hash'; -import { defaultRequestInit, requestWithLog, ResponseError } from './fetch-retry'; -import type { UndiciResponseData } from './fetch-retry'; // import type { UndiciResponseData } from './fetch-retry'; -import { Custom304NotModifiedError, CustomAbortError, CustomNoETagFallbackError, fetchAssetsWithout304, sleepWithAbort } from './fetch-assets'; -import type { IncomingHttpHeaders } from 'undici/types/header'; -import { Headers } from 'undici'; import { CACHE_DIR } from '../constants/dir'; export interface CacheOptions<S = string> { @@ -69,20 +61,6 @@ export const TTL = { TWO_WEEKS: () => randomInt(10, 14) * ONE_DAY }; -function ensureETag(headers: IncomingHttpHeaders | Headers) { - if (headers instanceof Headers && headers.has('etag')) { - return headers.get('etag'); - } - - if ('etag' in headers && typeof headers.etag === 'string' && headers.etag.length > 0) { - return headers.etag; - } - if ('ETag' in headers && typeof headers.ETag === 'string' && headers.ETag.length > 0) { - return headers.ETag; - } - return null; -} - export class Cache<S = string> { private db: Database; /** Time before deletion */ @@ -199,238 +177,18 @@ export class Cache<S = string> { this.statement.del.run(key); } - async applyWithHttp304<T>( - url: string, - extraCacheKey: string, - fn: (resp: UndiciResponseData) => Promise<T>, - opt: Omit<CacheApplyOption<T, S>, 'incrementTtlWhenHit'> - // requestInit?: RequestInit - ): Promise<T> { - if (opt.temporaryBypass) { - return fn(await requestWithLog(url)); - } - - const baseKey = url + '$' + extraCacheKey; - const etagKey = baseKey + '$etag'; - const cachedKey = baseKey + '$cached'; - - const etag = this.get(etagKey); - - const onMiss = async (resp: UndiciResponseData) => { - const serializer = 'serializer' in opt ? opt.serializer : identity as any; - - const value = await fn(resp); - - let serverETag = ensureETag(resp.headers); - if (serverETag) { - // FUCK someonewhocares.org - if (url.includes('someonewhocares.org')) { - serverETag = serverETag.replace('-gzip', ''); - } - - console.log(picocolors.yellow('[cache] miss'), url, { status: resp.statusCode, cachedETag: etag, serverETag }); - - this.set(etagKey, serverETag, TTL.ONE_WEEK_STATIC); - this.set(cachedKey, serializer(value), TTL.ONE_WEEK_STATIC); - return value; - } - - this.del(etagKey); - console.log(picocolors.red('[cache] no etag'), picocolors.gray(url)); - if (opt.ttl) { - this.set(cachedKey, serializer(value), opt.ttl); - } - - return value; - }; - - const cached = this.get(cachedKey); - if (cached == null) { - return onMiss(await requestWithLog(url)); - } - - const resp = await requestWithLog( - url, - { - ...defaultRequestInit, - headers: (typeof etag === 'string' && etag.length > 0) - ? headersToObject(mergeHeaders(defaultRequestInit.headers, { 'If-None-Match': etag })) - : defaultRequestInit.headers - } - ); - - // Only miss if previously a ETag was present and the server responded with a 304 - if (!ensureETag(resp.headers) && resp.statusCode !== 304) { - return onMiss(resp); - } - - console.log(picocolors.green(`[cache] ${resp.statusCode === 304 ? 'http 304' : 'cache hit'}`), picocolors.gray(url)); - this.updateTtl(cachedKey, TTL.ONE_WEEK_STATIC); - - const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any; - return deserializer(cached); - } - - async applyWithHttp304AndMirrors<T>( - primaryUrl: string, - mirrorUrls: string[], - extraCacheKey: string, - fn: (resp: string) => Promise<T> | T, - opt: Omit<CacheApplyOption<T, S>, 'incrementTtlWhenHit'> - ): Promise<T> { - if (opt.temporaryBypass) { - return fn(await fetchAssetsWithout304(primaryUrl, mirrorUrls)); - } - - const baseKey = primaryUrl + '$' + extraCacheKey; - const getETagKey = (url: string) => baseKey + '$' + url + '$etag'; - const cachedKey = baseKey + '$cached'; - - const controller = new AbortController(); - - const previouslyCached = this.get(cachedKey); - - const createFetchFallbackPromise = async (url: string, index: number) => { - // Most assets can be downloaded within 250ms. To avoid wasting bandwidth, we will wait for 500ms before downloading from the fallback URL. - if (index >= 0) { - try { - await sleepWithAbort(50 + (index + 1) * 100, controller.signal); - } catch { - console.log(picocolors.gray('[fetch cancelled early]'), picocolors.gray(url)); - throw new CustomAbortError(); - } - if (controller.signal.aborted) { - console.log(picocolors.gray('[fetch cancelled]'), picocolors.gray(url)); - throw new CustomAbortError(); - } - } - - const etag = this.get(getETagKey(url)); - const res = await requestWithLog( - url, - { - signal: controller.signal, - ...defaultRequestInit, - headers: (typeof etag === 'string' && etag.length > 0 && typeof previouslyCached === 'string' && previouslyCached.length > 1) - ? headersToObject(mergeHeaders(defaultRequestInit.headers, { 'If-None-Match': etag })) - : defaultRequestInit.headers - } - ); - - const serverETag = ensureETag(res.headers); - if (serverETag) { - this.set(getETagKey(url), serverETag, TTL.ONE_WEEK_STATIC); - } - // If we do not have a cached value, we ignore 304 - if (res.statusCode === 304 && typeof previouslyCached === 'string' && previouslyCached.length > 1) { - const err = new Custom304NotModifiedError(url, previouslyCached); - controller.abort(err); - throw err; - } - if (!serverETag && !this.get(getETagKey(primaryUrl)) && typeof previouslyCached === 'string') { - const err = new CustomNoETagFallbackError(previouslyCached); - controller.abort(err); - throw err; - } - - // either no etag and not cached - // or has etag but not 304 - const text = await res.body.text(); - - if (text.length < 2) { - throw new ResponseError(res, url, 'empty response'); - } - - controller.abort(); - return text; - }; - - try { - const text = mirrorUrls.length === 0 - ? await createFetchFallbackPromise(primaryUrl, -1) - : await Promise.any([ - createFetchFallbackPromise(primaryUrl, -1), - ...mirrorUrls.map(createFetchFallbackPromise) - ]); - - console.log(picocolors.yellow('[cache] miss'), primaryUrl); - const serializer = 'serializer' in opt ? opt.serializer : identity as any; - - const value = await fn(text); - - this.set(cachedKey, serializer(value), opt.ttl ?? TTL.ONE_WEEK_STATIC); - - return value; - } catch (e) { - const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any; - - const on304 = (error: Custom304NotModifiedError): NonNullable<T> => { - console.log(picocolors.green('[cache] http 304'), picocolors.gray(primaryUrl)); - this.updateTtl(cachedKey, TTL.ONE_WEEK_STATIC); - return deserializer(error.data); - }; - - const onNoETagFallback = (error: CustomNoETagFallbackError): NonNullable<T> => { - console.log(picocolors.green('[cache] hit'), picocolors.gray(primaryUrl)); - return deserializer(error.data); - }; - - const onSingleError = (error: object & {}) => { - if ('name' in error) { - if (error.name === 'Custom304NotModifiedError') { - return on304(error as Custom304NotModifiedError); - } - if (error.name === 'CustomNoETagFallbackError') { - return onNoETagFallback(error as CustomNoETagFallbackError); - } - if (error.name === 'CustomAbortError' || error.name === 'AbortError') { - // noop - } - } - if ('digest' in error) { - if (error.digest === 'Custom304NotModifiedError') { - return on304(error as Custom304NotModifiedError); - } - if (error.digest === 'CustomNoETagFallbackError') { - return onNoETagFallback(error as CustomNoETagFallbackError); - } - } - return null; - }; - - if (e && typeof e === 'object') { - if ('errors' in e && Array.isArray(e.errors)) { - for (let i = 0, len = e.errors.length; i < len; i++) { - const error = e.errors[i]; - - const result = onSingleError(error); - if (result !== null) { - return result; - } - - console.log(picocolors.red('[fetch error 1]'), picocolors.gray(`[${primaryUrl}]`), error); - } - } else { - const result = onSingleError(e); - if (result !== null) { - return result; - } - - console.log(picocolors.red('[fetch error 2]'), picocolors.gray(`[${primaryUrl}]`), e); - } - } - - console.log(`Download Rule for [${primaryUrl}] failed`, { e, name: (e as any).name }); - throw e; - } - } - destroy() { this.db.close(); } + + deleteTable(tableName: string) { + this.db.exec(`DROP TABLE IF EXISTS ${tableName};`); + } } -export const fsFetchCache = new Cache({ cachePath: CACHE_DIR }); +// drop deprecated cache +new Cache({ cachePath: CACHE_DIR }).deleteTable('cache'); + // process.on('exit', () => { // fsFetchCache.destroy(); // }); diff --git a/Build/lib/fetch-assets.ts b/Build/lib/fetch-assets.ts index 085d9707d..558012af4 100644 --- a/Build/lib/fetch-assets.ts +++ b/Build/lib/fetch-assets.ts @@ -1,5 +1,5 @@ import picocolors from 'picocolors'; -import { defaultRequestInit, requestWithLog, ResponseError } from './fetch-retry'; +import { $$fetch, defaultRequestInit, ResponseError } from './fetch-retry'; import { wait } from 'foxts/wait'; // eslint-disable-next-line sukka/unicorn/custom-error-definition -- typescript is better @@ -41,7 +41,7 @@ export function sleepWithAbort(ms: number, signal: AbortSignal) { }); } -export async function fetchAssetsWithout304(url: string, fallbackUrls: string[] | readonly string[]) { +export async function fetchAssetsWithout304(url: string, fallbackUrls: null | undefined | string[] | readonly string[]) { const controller = new AbortController(); const createFetchFallbackPromise = async (url: string, index: number) => { @@ -58,8 +58,8 @@ export async function fetchAssetsWithout304(url: string, fallbackUrls: string[] console.log(picocolors.gray('[fetch cancelled]'), picocolors.gray(url)); throw new CustomAbortError(); } - const res = await requestWithLog(url, { signal: controller.signal, ...defaultRequestInit }); - const text = await res.body.text(); + const res = await $$fetch(url, { signal: controller.signal, ...defaultRequestInit }); + const text = await res.text(); if (text.length < 2) { throw new ResponseError(res, url, 'empty response w/o 304'); @@ -69,6 +69,9 @@ export async function fetchAssetsWithout304(url: string, fallbackUrls: string[] return text; }; + if (!fallbackUrls || fallbackUrls.length === 0) { + return createFetchFallbackPromise(url, -1); + } return Promise.any([ createFetchFallbackPromise(url, -1), ...fallbackUrls.map(createFetchFallbackPromise) diff --git a/Build/lib/parse-filter.ts b/Build/lib/parse-filter.ts index 3c28a946e..4f1f00554 100644 --- a/Build/lib/parse-filter.ts +++ b/Build/lib/parse-filter.ts @@ -4,16 +4,14 @@ import tldts from 'tldts-experimental'; import picocolors from 'picocolors'; import { normalizeDomain } from './normalize-domain'; -import { deserializeArray, fsFetchCache, serializeArray, getFileContentHash } from './cache-filesystem'; import type { Span } from '../trace'; import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie'; import { looseTldtsOpt } from '../constants/loose-tldts-opt'; -import { identity } from 'foxts/identity'; import { DEBUG_DOMAIN_TO_FIND } from '../constants/reject-data-source'; import { noop } from 'foxts/noop'; +import { fetchAssetsWithout304 } from './fetch-assets'; let foundDebugDomain = false; -const temporaryBypass = typeof DEBUG_DOMAIN_TO_FIND === 'string'; const onBlackFound = DEBUG_DOMAIN_TO_FIND ? (line: string, meta: string) => { @@ -58,32 +56,24 @@ function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean export function processDomainLists( span: Span, - domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, - ttl: number | null = null, extraCacheKey: (input: string) => string = identity + domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false ) { - return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.applyWithHttp304AndMirrors<string[]>( - domainListsUrl, - mirrors ?? [], - extraCacheKey(getFileContentHash(__filename)), - (text) => { - const domainSets: string[] = []; - const filterRules = text.split('\n'); - - childSpan.traceChild('parse domain list').traceSyncFn(() => { - for (let i = 0, len = filterRules.length; i < len; i++) { - domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl); - } - }); - - return domainSets; - }, - { - ttl, - temporaryBypass, - serializer: serializeArray, - deserializer: deserializeArray - } - )); + return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => { + const text = await span.traceChildAsync(`process domainlist: ${domainListsUrl}`, () => fetchAssetsWithout304( + domainListsUrl, + mirrors + )); + const domainSets: string[] = []; + const filterRules = text.split('\n'); + + span.traceChildSync('parse domain list', () => { + for (let i = 0, len = filterRules.length; i < len; i++) { + domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl); + } + }); + + return domainSets; + }); } function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) { @@ -108,33 +98,23 @@ function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, met export function processHosts( span: Span, - hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, - ttl: number | null = null, extraCacheKey: (input: string) => string = identity + hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false ) { - return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.applyWithHttp304AndMirrors<string[]>( - hostsUrl, - mirrors ?? [], - extraCacheKey(getFileContentHash(__filename)), - (text) => { - const domainSets: string[] = []; - - const filterRules = text.split('\n'); - - childSpan.traceChild('parse hosts').traceSyncFn(() => { - for (let i = 0, len = filterRules.length; i < len; i++) { - hostsLineCb(filterRules[i], domainSets, includeAllSubDomain, hostsUrl); - } - }); - - return domainSets; - }, - { - ttl, - temporaryBypass, - serializer: serializeArray, - deserializer: deserializeArray - } - )); + return span.traceChildAsync(`process hosts: ${hostsUrl}`, async (span) => { + const text = await span.traceChild('download').traceAsyncFn(() => fetchAssetsWithout304(hostsUrl, mirrors)); + + const domainSets: string[] = []; + + const filterRules = text.split('\n'); + + span.traceChild('parse hosts').traceSyncFn(() => { + for (let i = 0, len = filterRules.length; i < len; i++) { + hostsLineCb(filterRules[i], domainSets, includeAllSubDomain, hostsUrl); + } + }); + + return domainSets; + }); } const enum ParseType { @@ -153,92 +133,83 @@ export async function processFilterRules( parentSpan: Span, filterRulesUrl: string, fallbackUrls?: string[] | null, - ttl: number | null = null, + _ttl: number | null = null, allowThirdParty = false ): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> { - const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn((span) => fsFetchCache.applyWithHttp304AndMirrors<Readonly<[white: string[], black: string[], warningMessages: string[]]>>( - filterRulesUrl, - fallbackUrls ?? [], - getFileContentHash(__filename), - (text) => { - const whitelistDomainSets = new Set<string>(); - const blacklistDomainSets = new Set<string>(); + const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn(async (span) => { + const text = await fetchAssetsWithout304(filterRulesUrl, fallbackUrls); - const warningMessages: string[] = []; + const whitelistDomainSets = new Set<string>(); + const blacklistDomainSets = new Set<string>(); - const MUTABLE_PARSE_LINE_RESULT: [string, ParseType] = ['', ParseType.NotParsed]; - /** + const warningMessages: string[] = []; + + const MUTABLE_PARSE_LINE_RESULT: [string, ParseType] = ['', ParseType.NotParsed]; + /** * @param {string} line */ - const lineCb = (line: string) => { - const result = parse(line, MUTABLE_PARSE_LINE_RESULT, allowThirdParty); - const flag = result[1]; + const lineCb = (line: string) => { + const result = parse(line, MUTABLE_PARSE_LINE_RESULT, allowThirdParty); + const flag = result[1]; - if (flag === ParseType.NotParsed) { - throw new Error(`Didn't parse line: ${line}`); - } - if (flag === ParseType.Null) { - return; - } + if (flag === ParseType.NotParsed) { + throw new Error(`Didn't parse line: ${line}`); + } + if (flag === ParseType.Null) { + return; + } - const hostname = result[0]; + const hostname = result[0]; - if (flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute) { - onWhiteFound(hostname, filterRulesUrl); - } else { - onBlackFound(hostname, filterRulesUrl); - } + if (flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute) { + onWhiteFound(hostname, filterRulesUrl); + } else { + onBlackFound(hostname, filterRulesUrl); + } - switch (flag) { - case ParseType.WhiteIncludeSubdomain: - if (hostname[0] === '.') { - whitelistDomainSets.add(hostname); - } else { - whitelistDomainSets.add(`.${hostname}`); - } - break; - case ParseType.WhiteAbsolute: + switch (flag) { + case ParseType.WhiteIncludeSubdomain: + if (hostname[0] === '.') { whitelistDomainSets.add(hostname); - break; - case ParseType.BlackIncludeSubdomain: - if (hostname[0] === '.') { - blacklistDomainSets.add(hostname); - } else { - blacklistDomainSets.add(`.${hostname}`); - } - break; - case ParseType.BlackAbsolute: + } else { + whitelistDomainSets.add(`.${hostname}`); + } + break; + case ParseType.WhiteAbsolute: + whitelistDomainSets.add(hostname); + break; + case ParseType.BlackIncludeSubdomain: + if (hostname[0] === '.') { blacklistDomainSets.add(hostname); - break; - case ParseType.ErrorMessage: - warningMessages.push(hostname); - break; - default: - break; - } - }; + } else { + blacklistDomainSets.add(`.${hostname}`); + } + break; + case ParseType.BlackAbsolute: + blacklistDomainSets.add(hostname); + break; + case ParseType.ErrorMessage: + warningMessages.push(hostname); + break; + default: + break; + } + }; - const filterRules = text.split('\n'); + const filterRules = text.split('\n'); - span.traceChild('parse adguard filter').traceSyncFn(() => { - for (let i = 0, len = filterRules.length; i < len; i++) { - lineCb(filterRules[i]); - } - }); - - return [ - Array.from(whitelistDomainSets), - Array.from(blacklistDomainSets), - warningMessages - ] as const; - }, - { - ttl, - temporaryBypass, - serializer: JSON.stringify, - deserializer: JSON.parse - } - )); + span.traceChild('parse adguard filter').traceSyncFn(() => { + for (let i = 0, len = filterRules.length; i < len; i++) { + lineCb(filterRules[i]); + } + }); + + return [ + Array.from(whitelistDomainSets), + Array.from(blacklistDomainSets), + warningMessages + ] as const; + }); for (let i = 0, len = warningMessages.length; i < len; i++) { console.warn(