diff --git a/bin/cli.js b/bin/cli.js index 8229846..1b3caaf 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -1,10 +1,5 @@ #! /usr/bin/env node -import { createReadStream } from 'fs' -import fs from 'fs/promises' -import { resolve } from 'path' -import * as readline from 'node:readline/promises' - import log from 'loglevel' import logPrefix from 'loglevel-plugin-prefix' import { Command } from 'commander' @@ -64,7 +59,8 @@ program.command('create') .option('--cdxj ', 'Path to a directory containing CDXJ indices to merge into final WACZ CDXJ. ' + 'If not provided, js-wacz will reindex from WARCS. Must be used in combination ' + - 'with --pages, since using this option will skip reading the WARC files.') + 'with --pages, since using this option will skip the step required to generate a ' + + 'pages.jsonl file.') .action(async (name, options, command) => { /** @type {Object} */ const values = options._optionValues @@ -116,6 +112,7 @@ program.command('create') signingUrl: values?.signingUrl, signingToken: values?.signingToken, pages: values?.pages, + cdxj: values?.cdxj, log }) } catch (err) { @@ -123,36 +120,6 @@ program.command('create') process.exit(1) } - // Ingest user-provided CDX files, if any. 
- if (values?.cdxj) { - try { - const dirPath = values?.cdxj - const cdxjFiles = await fs.readdir(dirPath) - const allowedExts = ['cdx', 'cdxj'] - - for (let i = 0; i < cdxjFiles.length; i++) { - const cdxjFile = resolve(dirPath, cdxjFiles[i]) - - const ext = cdxjFile.split('.').pop() - if (!allowedExts.includes(ext)) { - log.warn(`CDXJ: Skipping file ${cdxjFile}, not a CDXJ file`) - continue - } - - log.info(`CDXJ: Reading entries from ${cdxjFile}`) - const rl = readline.createInterface({ input: createReadStream(cdxjFile) }) - - for await (const line of rl) { - archive.addCDXJ(line + '\n') - } - } - } catch (err) { - log.trace(err) - log.error('An error occurred while processing user-provided CDXJ indices.') - process.exit(1) - } - } - // Main process try { await archive.process() diff --git a/constants.js b/constants.js index 539534c..ddd4fff 100644 --- a/constants.js +++ b/constants.js @@ -34,6 +34,12 @@ export const PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}pages.jsonl` */ export const EXTRA_PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}extraPages.jsonl` +/** + * Path to the fixtures folder cdxj sub-directory. + * @constant + */ +export const CDXJ_DIR_FIXTURES_PATH = `${FIXTURES_PATH}cdxj${sep}` + /** * Colors scheme for log level. 
* @constant diff --git a/fixtures/cdxj/invalid.txt b/fixtures/cdxj/invalid.txt new file mode 100644 index 0000000..f49da92 --- /dev/null +++ b/fixtures/cdxj/invalid.txt @@ -0,0 +1 @@ +not cdxj diff --git a/fixtures/cdxj/valid.cdxj b/fixtures/cdxj/valid.cdxj new file mode 100644 index 0000000..69720d5 --- /dev/null +++ b/fixtures/cdxj/valid.cdxj @@ -0,0 +1 @@ +net,webrecorder)/ 20240814070734 {"url":"https://webrecorder.net/","mime":"text/html","status":200,"digest":"16966a2a2909825ad1d9a6f1b2f4833c8fe43428cb9920d0f974bd7b3d73c31d","length":3941,"offset":0,"filename":"rec-8bc4bd095683-20240307070734658-0.warc.gz"} diff --git a/fixtures/cdxj/valid2.cdx b/fixtures/cdxj/valid2.cdx new file mode 100644 index 0000000..4894940 --- /dev/null +++ b/fixtures/cdxj/valid2.cdx @@ -0,0 +1,2 @@ +net,webrecorder)/assets/wr-logo.svg 20240814162441 {"url":"https://webrecorder.net/assets/wr-logo.svg","mime":"image/svg","status":200,"digest":"00c5957f7c97b2e79433fe607bdb47ecb3837a7ee7b603849e7cfb52dcc5f4c7","length":2041,"offset":19176,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"} +net,webrecorder)/assets/favicon.ico 20240814162442 {"url":"https://webrecorder.net/assets/favicon.ico","mime":"image/vnd.microsoft.icon","status":200,"digest":"e39a17af5d611f3a36784bc70128f93c10a7f5db03626d3030edf2ee1772e328","length":15398,"offset":87313,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"} diff --git a/index.js b/index.js index 935d495..fbf80fe 100644 --- a/index.js +++ b/index.js @@ -185,6 +185,12 @@ export class WACZ { */ pagesDir = null + /** + * Path to directory of CDXJ files to copy as-is into WACZ. + * @type {?string} + */ + cdxjDir = null + /** * @param {WACZOptions} options - See {@link WACZOptions} for details. 
*/ @@ -284,13 +290,19 @@ export class WACZ { this.detectPages = false } + if (options?.indexFromWARCs === false) { + this.indexFromWARCs = false + } + if (options?.pages) { this.detectPages = false this.pagesDir = String(options?.pages).trim() } - if (options?.indexFromWARCs === false) { - this.indexFromWARCs = false + if (options?.cdxj) { + this.detectPages = false + this.indexFromWARCs = false // Added here for clarity, but implied by calls to `this.addCDXJ()` + this.cdxjDir = String(options?.cdxj).trim() } if (options?.url) { @@ -360,6 +372,11 @@ export class WACZ { info('Initializing indexer') this.initWorkerPool() + if (this.cdxjDir) { + info('Reading provided CDXJ files') + await this.readFromExistingCDXJ() + } + if (this.indexFromWARCs) { info('Indexing WARCS') await this.indexWARCs() @@ -656,6 +673,41 @@ export class WACZ { } } + /** + * Reads lines from CDXJ files in `this.cdxjDir` into cdxArray. + * @returns {Promise} + */ + readFromExistingCDXJ = async () => { + this.stateCheck() + + const { cdxjDir, log } = this + + if (!cdxjDir) { + throw new Error('Error copying CDXJ files, no directory specified.') + } + + const allowedExts = ['cdx', 'cdxj'] + + const cdxjFiles = await fs.readdir(cdxjDir) + + for (const cdxjFile of cdxjFiles) { + const cdxjFilepath = resolve(cdxjDir, cdxjFile) + + const ext = cdxjFilepath.toLowerCase().split('.').pop() + if (!allowedExts.includes(ext)) { + log.warn(`CDXJ: Skipping file ${cdxjFile}, not a CDXJ file`) + continue + } + + log.info(`CDXJ: Reading entries from ${cdxjFile}`) + const rl = readline.createInterface({ input: createReadStream(cdxjFilepath) }) + + for await (const line of rl) { + this.addCDXJ(line + '\n') + } + } + } + /** * Streams all the files listed in `this.WARCs` to the output ZIP. 
* @returns {Promise} diff --git a/index.test.js b/index.test.js index 2eaf526..42a5bc3 100644 --- a/index.test.js +++ b/index.test.js @@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip' import * as dotenv from 'dotenv' import { WACZ } from './index.js' -import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH } from './constants.js' +import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH, CDXJ_DIR_FIXTURES_PATH } from './constants.js' import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import // Loads env vars from .env if provided @@ -376,3 +376,48 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f // Delete temp file await fs.unlink(options.output) }) + +test('WACZ.process with cdxj option creates valid WACZ with index from provided CDXJ files.', async (_t) => { + const options = { + input: FIXTURE_INPUT, + output: '../tmp.wacz', + url: 'https://lil.law.harvard.edu', + title: 'WACZ Title', + description: 'WACZ Description', + pages: PAGES_DIR_FIXTURES_PATH, + cdxj: CDXJ_DIR_FIXTURES_PATH + } + + const archive = new WACZ(options) + + await archive.process(false) + + const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line + + // + // Indexes should be present + // + // NOTE: A test for the ZipNum Shared Index feature would require additional / larger fixtures. 
+ assert(await zip.entryData('indexes/index.cdx')) + + // Check index contents + const combinedCDX = (await zip.entryData('indexes/index.cdx')).toString('utf-8') + let pageIndex = 0 + + for (const entry of combinedCDX.split('\n')) { + if (pageIndex === 0) { + assert.equal(entry, 'net,webrecorder)/ 20240814070734 {"url":"https://webrecorder.net/","mime":"text/html","status":200,"digest":"16966a2a2909825ad1d9a6f1b2f4833c8fe43428cb9920d0f974bd7b3d73c31d","length":3941,"offset":0,"filename":"rec-8bc4bd095683-20240307070734658-0.warc.gz"}') + } else if (pageIndex === 1) { + assert.equal(entry, 'net,webrecorder)/assets/favicon.ico 20240814162442 {"url":"https://webrecorder.net/assets/favicon.ico","mime":"image/vnd.microsoft.icon","status":200,"digest":"e39a17af5d611f3a36784bc70128f93c10a7f5db03626d3030edf2ee1772e328","length":15398,"offset":87313,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}') + } else if (pageIndex === 2) { + assert.equal(entry, 'net,webrecorder)/assets/wr-logo.svg 20240814162441 {"url":"https://webrecorder.net/assets/wr-logo.svg","mime":"image/svg","status":200,"digest":"00c5957f7c97b2e79433fe607bdb47ecb3837a7ee7b603849e7cfb52dcc5f4c7","length":2041,"offset":19176,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}') + } else { + assert.equal(entry, '') + } + + pageIndex += 1 + } + + // Delete temp file + await fs.unlink(options.output) +}) diff --git a/types.js b/types.js index b0e5773..7c025ec 100644 --- a/types.js +++ b/types.js @@ -3,9 +3,8 @@ * @typedef {Object} WACZOptions * @property {string|string[]} input - Required. Path(s) to input .warc or .warc.gz file(s). Glob-compatible. * @property {string} output - Required. Path to output .wacz file. Will default to PWD + `archive.wacz` if not provided. - * @property {boolean} [indexFromWARCs=true] - If true, will attempt to generate CDXJ indexes from processed WARCs. Automatically disabled if `addCDXJ()` is called. 
- * @property {boolean} [detectPages=true] - If true (default), will attempt to detect pages in WARC records. Automatically disabled if `pages` is provided or `addPages()` is called. - * @property {?string} pages - Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...). + * @property {boolean} [indexFromWARCs=true] - If true, will attempt to generate CDXJ indexes from processed WARCs. Automatically disabled if `cdxj` is passed or `addCDXJ()` is called. + * @property {boolean} [detectPages=true] - If true (default), will attempt to detect pages in WARC records to generate a pages.jsonl file. Automatically disabled if: `pages` / `cdxj` is provided, or `addPages()` / `addCDXJ()` is called. * @property {?string} url - If set, will be added to datapackage.json as `mainPageUrl`. * @property {?string} ts - If set, will be added to datapackage.json as `mainPageDate`. Can be any value that `Date()` can parse. * @property {?string} title - If set, will be added to datapackage.json as `title`. @@ -13,6 +12,8 @@ * @property {?string} signingUrl - If set, will be used to try and sign the resulting archive. * @property {?string} signingToken - Access token to be used in combination with `signingUrl`. * @property {?Object} datapackageExtras - If set, will be appended to datapackage.json under `extras`. + * @property {?string} cdxj - If set, skips indexing and allows for passing CDXJ files "as is". Path to a folder containing CDXJ files. Must be used in combination with `pages`, since using this option will skip the step required to generate a pages.jsonl file. + * @property {?string} pages - If set, allows for passing pre-set pages.jsonl file(s). Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...). * @property {?any} log - Will be used instead of the Console API for logging, if compatible (i.e: loglevel). Defaults to globalThis.console. */