From 779c6b493978151e0193fbd288a4049340eceee1 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 14 Aug 2024 11:41:04 -0400 Subject: [PATCH 1/5] Move CDXJ file handling from bin/cli.js to WACZ class This will allow other applications like Browsertrix Crawler that want to pass in existing CDXJ files to do so simply by setting the right values in the WACZ class initialization rather than having to duplicate code from the js-wacz CLI. --- bin/cli.js | 31 +----------------------- constants.js | 6 +++++ fixtures/cdxj/invalid.txt | 1 + fixtures/cdxj/valid.cdxj | 1 + fixtures/cdxj/valid2.cdx | 2 ++ index.js | 51 +++++++++++++++++++++++++++++++++++++++ index.test.js | 48 +++++++++++++++++++++++++++++++++++- 7 files changed, 109 insertions(+), 31 deletions(-) create mode 100644 fixtures/cdxj/invalid.txt create mode 100644 fixtures/cdxj/valid.cdxj create mode 100644 fixtures/cdxj/valid2.cdx diff --git a/bin/cli.js b/bin/cli.js index 8229846..df96aa2 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -116,6 +116,7 @@ program.command('create') signingUrl: values?.signingUrl, signingToken: values?.signingToken, pages: values?.pages, + cdxj: values?.cdxj, log }) } catch (err) { @@ -123,36 +124,6 @@ program.command('create') process.exit(1) } - // Ingest user-provided CDX files, if any. 
- if (values?.cdxj) { - try { - const dirPath = values?.cdxj - const cdxjFiles = await fs.readdir(dirPath) - const allowedExts = ['cdx', 'cdxj'] - - for (let i = 0; i < cdxjFiles.length; i++) { - const cdxjFile = resolve(dirPath, cdxjFiles[i]) - - const ext = cdxjFile.split('.').pop() - if (!allowedExts.includes(ext)) { - log.warn(`CDXJ: Skipping file ${cdxjFile}, not a CDXJ file`) - continue - } - - log.info(`CDXJ: Reading entries from ${cdxjFile}`) - const rl = readline.createInterface({ input: createReadStream(cdxjFile) }) - - for await (const line of rl) { - archive.addCDXJ(line + '\n') - } - } - } catch (err) { - log.trace(err) - log.error('An error occurred while processing user-provided CDXJ indices.') - process.exit(1) - } - } - // Main process try { await archive.process() diff --git a/constants.js b/constants.js index 539534c..ddd4fff 100644 --- a/constants.js +++ b/constants.js @@ -34,6 +34,12 @@ export const PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}pages.jsonl` */ export const EXTRA_PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}extraPages.jsonl` +/** + * Path to the fixtures folder cdxj sub-directory. + * @constant + */ +export const CDXJ_DIR_FIXTURES_PATH = `${FIXTURES_PATH}cdxj${sep}` + /** * Colors scheme for log level. 
* @constant diff --git a/fixtures/cdxj/invalid.txt b/fixtures/cdxj/invalid.txt new file mode 100644 index 0000000..f49da92 --- /dev/null +++ b/fixtures/cdxj/invalid.txt @@ -0,0 +1 @@ +not cdxj diff --git a/fixtures/cdxj/valid.cdxj b/fixtures/cdxj/valid.cdxj new file mode 100644 index 0000000..69720d5 --- /dev/null +++ b/fixtures/cdxj/valid.cdxj @@ -0,0 +1 @@ +net,webrecorder)/ 20240814070734 {"url":"https://webrecorder.net/","mime":"text/html","status":200,"digest":"16966a2a2909825ad1d9a6f1b2f4833c8fe43428cb9920d0f974bd7b3d73c31d","length":3941,"offset":0,"filename":"rec-8bc4bd095683-20240307070734658-0.warc.gz"} diff --git a/fixtures/cdxj/valid2.cdx b/fixtures/cdxj/valid2.cdx new file mode 100644 index 0000000..4894940 --- /dev/null +++ b/fixtures/cdxj/valid2.cdx @@ -0,0 +1,2 @@ +net,webrecorder)/assets/wr-logo.svg 20240814162441 {"url":"https://webrecorder.net/assets/wr-logo.svg","mime":"image/svg","status":200,"digest":"00c5957f7c97b2e79433fe607bdb47ecb3837a7ee7b603849e7cfb52dcc5f4c7","length":2041,"offset":19176,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"} +net,webrecorder)/assets/favicon.ico 20240814162442 {"url":"https://webrecorder.net/assets/favicon.ico","mime":"image/vnd.microsoft.icon","status":200,"digest":"e39a17af5d611f3a36784bc70128f93c10a7f5db03626d3030edf2ee1772e328","length":15398,"offset":87313,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"} diff --git a/index.js b/index.js index 935d495..86c06ad 100644 --- a/index.js +++ b/index.js @@ -185,6 +185,12 @@ export class WACZ { */ pagesDir = null + /** + * Path to directory of CDXJ files to copy as-is into WACZ. + * @type {?string} + */ + cdxjDir = null + /** * @param {WACZOptions} options - See {@link WACZOptions} for details. 
*/ @@ -289,6 +295,11 @@ export class WACZ { this.pagesDir = String(options?.pages).trim() } + if (options?.cdxj) { + this.detectPages = false + this.cdxjDir = String(options?.cdxj).trim() + } + if (options?.indexFromWARCs === false) { this.indexFromWARCs = false } @@ -360,6 +371,11 @@ export class WACZ { info('Initializing indexer') this.initWorkerPool() + if (this.cdxjDir) { + info('Reading provided CDXJ files') + await this.readFromExistingCDXJ() + } + if (this.indexFromWARCs) { info('Indexing WARCS') await this.indexWARCs() @@ -656,6 +672,41 @@ export class WACZ { } } + /** + * Reads lines from CDXJ files in `this.cdxjDir` into cdxArray. + * @returns {Promise} + */ + readFromExistingCDXJ = async () => { + this.stateCheck() + + const { cdxjDir, log, addFileToZip } = this + + if (!cdxjDir) { + throw new Error('Error copying CDXJ files, no directory specified.') + } + + const allowedExts = ['cdx', 'cdxj'] + + const cdxjFiles = await fs.readdir(cdxjDir) + + for (const cdxjFile of cdxjFiles) { + const cdxjFilepath = resolve(cdxjDir, cdxjFile) + + const ext = cdxjFilepath.toLowerCase().split('.').pop() + if (!allowedExts.includes(ext)) { + log.warn(`CDXJ: Skipping file ${cdxjFile}, not a CDXJ file`) + continue + } + + log.info(`CDXJ: Reading entries from ${cdxjFile}`) + const rl = readline.createInterface({ input: createReadStream(cdxjFilepath) }) + + for await (const line of rl) { + this.addCDXJ(line + '\n') + } + } + } + /** * Streams all the files listed in `this.WARCs` to the output ZIP. 
* @returns {Promise} diff --git a/index.test.js b/index.test.js index 2eaf526..e4806d0 100644 --- a/index.test.js +++ b/index.test.js @@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip' import * as dotenv from 'dotenv' import { WACZ } from './index.js' -import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH } from './constants.js' +import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH, CDXJ_DIR_FIXTURES_PATH } from './constants.js' import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import // Loads env vars from .env if provided @@ -376,3 +376,49 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f // Delete temp file await fs.unlink(options.output) }) + +test('WACZ.process with cdxj option creates valid WACZ with index from provided CDXJ files.', async (_t) => { + const options = { + input: FIXTURE_INPUT, + output: '../tmp.wacz', + url: 'https://lil.law.harvard.edu', + title: 'WACZ Title', + description: 'WACZ Description', + pages: PAGES_DIR_FIXTURES_PATH, + cdxj: CDXJ_DIR_FIXTURES_PATH + } + + const archive = new WACZ(options) + + await archive.process(false) + + const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line + const zipEntries = await zip.entries() + + // + // Indexes should be present + // + // NOTE: A test for the ZipNum Shared Index feature would require additional / larger fixtures. 
+ assert(await zip.entryData('indexes/index.cdx')) + + // Check index contents + const combinedCDX = (await zip.entryData('indexes/index.cdx')).toString('utf-8') + let pageIndex = 0 + + for (const entry of combinedCDX.split('\n')) { + if (pageIndex === 0) { + assert.equal(entry, 'net,webrecorder)/ 20240814070734 {"url":"https://webrecorder.net/","mime":"text/html","status":200,"digest":"16966a2a2909825ad1d9a6f1b2f4833c8fe43428cb9920d0f974bd7b3d73c31d","length":3941,"offset":0,"filename":"rec-8bc4bd095683-20240307070734658-0.warc.gz"}') + } else if (pageIndex === 1) { + assert.equal(entry, 'net,webrecorder)/assets/favicon.ico 20240814162442 {"url":"https://webrecorder.net/assets/favicon.ico","mime":"image/vnd.microsoft.icon","status":200,"digest":"e39a17af5d611f3a36784bc70128f93c10a7f5db03626d3030edf2ee1772e328","length":15398,"offset":87313,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}') + } else if (pageIndex === 2) { + assert.equal(entry, 'net,webrecorder)/assets/wr-logo.svg 20240814162441 {"url":"https://webrecorder.net/assets/wr-logo.svg","mime":"image/svg","status":200,"digest":"00c5957f7c97b2e79433fe607bdb47ecb3837a7ee7b603849e7cfb52dcc5f4c7","length":2041,"offset":19176,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}') + } else { + assert.equal(entry, '') + } + + pageIndex += 1 + } + + // Delete temp file + await fs.unlink(options.output) +}) From 18c5f64eecc6526765742ff8a9b2fb61450d6fb2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 14 Aug 2024 13:01:24 -0400 Subject: [PATCH 2/5] Fix linting issues --- bin/cli.js | 5 ----- index.js | 2 +- index.test.js | 1 - 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/bin/cli.js b/bin/cli.js index df96aa2..b82cd37 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -1,10 +1,5 @@ #! 
/usr/bin/env node -import { createReadStream } from 'fs' -import fs from 'fs/promises' -import { resolve } from 'path' -import * as readline from 'node:readline/promises' - import log from 'loglevel' import logPrefix from 'loglevel-plugin-prefix' import { Command } from 'commander' diff --git a/index.js b/index.js index 86c06ad..5b8c228 100644 --- a/index.js +++ b/index.js @@ -679,7 +679,7 @@ export class WACZ { readFromExistingCDXJ = async () => { this.stateCheck() - const { cdxjDir, log, addFileToZip } = this + const { cdxjDir, log } = this if (!cdxjDir) { throw new Error('Error copying CDXJ files, no directory specified.') diff --git a/index.test.js b/index.test.js index e4806d0..42a5bc3 100644 --- a/index.test.js +++ b/index.test.js @@ -393,7 +393,6 @@ test('WACZ.process with cdxj option creates valid WACZ with index from provided await archive.process(false) const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line - const zipEntries = await zip.entries() // // Indexes should be present From 813e550a27ad5f36b1a441eb684a3a656bb332f1 Mon Sep 17 00:00:00 2001 From: Matteo Cargnelutti Date: Thu, 15 Aug 2024 16:49:11 -0400 Subject: [PATCH 3/5] Docs for options.cdxj --- index.js | 9 +++++---- types.js | 7 ++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/index.js b/index.js index 5b8c228..fbf80fe 100644 --- a/index.js +++ b/index.js @@ -290,6 +290,10 @@ export class WACZ { this.detectPages = false } + if (options?.indexFromWARCs === false) { + this.indexFromWARCs = false + } + if (options?.pages) { this.detectPages = false this.pagesDir = String(options?.pages).trim() @@ -297,13 +301,10 @@ export class WACZ { if (options?.cdxj) { this.detectPages = false + this.indexFromWARCs = false // Added here for clarity, but implied by calls to `this.addCDXJ()` this.cdxjDir = String(options?.cdxj).trim() } - if (options?.indexFromWARCs === false) { - this.indexFromWARCs = false - } - if (options?.url) { try { new URL(options.url) // 
eslint-disable-line diff --git a/types.js b/types.js index b0e5773..bb97d66 100644 --- a/types.js +++ b/types.js @@ -3,9 +3,8 @@ * @typedef {Object} WACZOptions * @property {string|string[]} input - Required. Path(s) to input .warc or .warc.gz file(s). Glob-compatible. * @property {string} output - Required. Path to output .wacz file. Will default to PWD + `archive.wacz` if not provided. - * @property {boolean} [indexFromWARCs=true] - If true, will attempt to generate CDXJ indexes from processed WARCs. Automatically disabled if `addCDXJ()` is called. - * @property {boolean} [detectPages=true] - If true (default), will attempt to detect pages in WARC records. Automatically disabled if `pages` is provided or `addPages()` is called. - * @property {?string} pages - Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...). + * @property {boolean} [indexFromWARCs=true] - If true, will attempt to generate CDXJ indexes from processed WARCs. Automatically disabled if `cdxj` is passed or `addCDXJ()` is called. + * @property {boolean} [detectPages=true] - If true (default), will attempt to detect pages in WARC records to generate a pages.jsonl file. Automatically disabled if: `pages`/ `cdxj` is provided, or `addPages()`/ `addCDXJ()` is called. * @property {?string} url - If set, will be added to datapackage.json as `mainPageUrl`. * @property {?string} ts - If set, will be added to datapackage.json as `mainPageDate`. Can be any value that `Date()` can parse. * @property {?string} title - If set, will be added to datapackage.json as `title`. @@ -13,6 +12,8 @@ * @property {?string} signingUrl - If set, will be used to try and sign the resulting archive. * @property {?string} signingToken - Access token to be used in combination with `signingUrl`. * @property {?Object} datapackageExtras - If set, will be appended to datapackage.json under `extras`. + * @property {?string} cdxj - If set, skips indexing and allows for passing CDXJ files "as is". 
Path to a folder containing CDXJ files. Allows + * @property {?string} pages - If set, allows for passing pre-set pages.jsonl file(s). Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...). * @property {?any} log - Will be used instead of the Console API for logging, if compatible (i.e: loglevel). Defaults to globalThis.console. */ From 4f32104d4d1b55910490dfebaea16dcec267f4d6 Mon Sep 17 00:00:00 2001 From: Matteo Cargnelutti Date: Thu, 15 Aug 2024 16:58:46 -0400 Subject: [PATCH 4/5] Doc --- bin/cli.js | 3 ++- types.js | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/cli.js b/bin/cli.js index b82cd37..1b3caaf 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -59,7 +59,8 @@ program.command('create') .option('--cdxj ', 'Path to a directory containing CDXJ indices to merge into final WACZ CDXJ. ' + 'If not provided, js-wacz will reindex from WARCS. Must be used in combination ' + - 'with --pages, since using this option will skip reading the WARC files.') + 'with --pages, since using this option will skip the step required to generate a ' + + 'pages.jsonl file.') .action(async (name, options, command) => { /** @type {Object} */ const values = options._optionValues diff --git a/types.js b/types.js index bb97d66..5bb8c7e 100644 --- a/types.js +++ b/types.js @@ -13,7 +13,7 @@ * @property {?string} signingToken - Access token to be used in combination with `signingUrl`. * @property {?Object} datapackageExtras - If set, will be appended to datapackage.json under `extras`. * @property {?string} cdxj - If set, skips indexing and allows for passing CDXJ files "as is". Path to a folder containing CDXJ files. Allows - * @property {?string} pages - If set, allows for passing pre-set pages.jsonl file(s). Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...). + * @property {?string} pages - If set, allows for passing pre-set pages.jsonl file(s). 
Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...). Required when `cdxj` is set, since that option skips the step required to generate a pages.jsonl file. * @property {?any} log - Will be used instead of the Console API for logging, if compatible (i.e: loglevel). Defaults to globalThis.console. */ From 8b6a113e92b2c632c62e8bd605ec80d53c003d27 Mon Sep 17 00:00:00 2001 From: Matteo Cargnelutti Date: Thu, 15 Aug 2024 17:21:37 -0400 Subject: [PATCH 5/5] Update types.js --- types.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/types.js b/types.js index 5bb8c7e..7c025ec 100644 --- a/types.js +++ b/types.js @@ -12,7 +12,7 @@ * @property {?string} signingUrl - If set, will be used to try and sign the resulting archive. * @property {?string} signingToken - Access token to be used in combination with `signingUrl`. * @property {?Object} datapackageExtras - If set, will be appended to datapackage.json under `extras`. - * @property {?string} cdxj - If set, skips indexing and allows for passing CDXJ files "as is". Path to a folder containing CDXJ files. Allows + * @property {?string} cdxj - If set, skips indexing and allows for passing CDXJ files "as is". Path to a folder containing CDXJ files. * @property {?string} pages - If set, allows for passing pre-set pages.jsonl file(s). Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...). Required when `cdxj` is set, since that option skips the step required to generate a pages.jsonl file. * @property {?any} log - Will be used instead of the Console API for logging, if compatible (i.e: loglevel). Defaults to globalThis.console. */