diff --git a/README.md b/README.md index eac0ad9..d16774d 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,14 @@ If not provided, **js-wacz** is going to attempt to detect pages in WARC records js-wacz create -f "collection/*.warc.gz" --pages collection/pages.jsonl ``` +### --cdxj + +Allows passing a directory of existing CDXJ files to use instead of indexing from WARCs. Must be used in combination with `--pages`. + +```bash +js-wacz create -f "collection/*.warc.gz" --pages collection/pages.jsonl --cdxj collection/indexes/ +``` + ### --url If provided, will be used as the [`mainPageUrl` attribute for `datapackage.json`](https://specs.webrecorder.net/wacz/1.1.1/#datapackage-json). diff --git a/bin/cli.js b/bin/cli.js index 37d8eba..e92c136 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -1,6 +1,8 @@ #! /usr/bin/env node import { createReadStream } from 'fs' +import fs from 'fs/promises' +import { resolve } from 'path' import * as readline from 'node:readline/promises' import log from 'loglevel' @@ -59,6 +61,10 @@ program.command('create') .option( '--log-level ', 'Can be "silent", "trace", "debug", "info", "warn", "error"', 'info') + .option('--cdxj ', + 'Path to a directory containing CDXJ indices to merge into final WACZ CDXJ. ' + + 'If not provided, js-wacz will reindex from WARCS. Must be used in combination ' + + 'with --pages, since using this option will skip reading the WARC files.') .action(async (name, options, command) => { /** @type {Object} */ const values = options._optionValues @@ -93,6 +99,11 @@ program.command('create') return } + if (values?.cdxj && !values?.pages) { + console.error('Error: --cdxj option must be used in combination with --pages.') + return + } + // Pass options to WACZ try { archive = new WACZ({ @@ -133,6 +144,35 @@ program.command('create') } } + // Ingest user-provided CDX files, if any. 
+ if (values?.cdxj) { + try { + const dirPath = values?.cdxj + const cdxjFiles = await fs.readdir(dirPath) + const allowedExts = ['cdx', 'cdxj'] + + for (let i = 0; i < cdxjFiles.length; i++) { + const cdxjFile = resolve(dirPath, cdxjFiles[i]) + + const ext = cdxjFile.split('.').pop() + if (!allowedExts.includes(ext)) { + log.info(`CDXJ: Skipping file ${cdxjFile}, not a CDXJ file`) + continue + } + + log.info(`CDXJ: Reading entries from ${cdxjFile}`) + const rl = readline.createInterface({ input: createReadStream(cdxjFile) }) + + for await (const line of rl) { + archive.addCDXJ(line + '\n') + } + } + } catch (err) { + log.trace(err) + log.error('An error occurred while processing user-provided CDXJ indices.') + } + } + // Main process try { await archive.process() diff --git a/index.js b/index.js index 0214a79..6836085 100644 --- a/index.js +++ b/index.js @@ -76,6 +76,12 @@ export class WACZ { */ detectPages = true + /** + * From WACZOptions.indexFromWARCs. + * @type {boolean} + */ + indexFromWARCs = true + /** * From WACZOptions.url. * @type {?string} @@ -270,6 +276,10 @@ export class WACZ { this.detectPages = false } + if (options?.indexFromWARCs === false) { + this.indexFromWARCs = false + } + if (options?.url) { try { new URL(options.url) // eslint-disable-line @@ -337,8 +347,10 @@ export class WACZ { info('Initializing indexer') this.initWorkerPool() - info('Indexing WARCS') - await this.indexWARCs() + if (this.indexFromWARCs) { + info('Indexing WARCS') + await this.indexWARCs() + } info('Harvesting sorted indexes from trees') this.harvestArraysFromTrees() @@ -792,6 +804,19 @@ export class WACZ { return page } + /** + * Allows manually adding a CDXJ entry to `this.cdxTree`. + * Calling this method automatically turns indexing from WARCS off. 
+ * @param {string} cdjx - CDXJ entry as string + * @returns {void} + */ + addCDXJ = (cdjx) => { + this.stateCheck() + this.indexFromWARCs = false + + this.cdxTree.setIfNotPresent(cdjx, true) + } + /** * Adds a file to the output ZIP stream. * Automatically keeps trace of file in `this.resources` so it can be referenced in datapackage.json. diff --git a/index.test.js b/index.test.js index a85878b..7927c67 100644 --- a/index.test.js +++ b/index.test.js @@ -74,6 +74,20 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) = assert.equal(archive.detectPages, false) }) +test('WACZ constructor ignores options.indexFromWARCs if invalid.', async (_t) => { + const scenarios = ['foo', {}, Buffer.alloc(0), 12, () => {}] + + for (const indexFromWARCs of scenarios) { + const archive = new WACZ({ input: FIXTURE_INPUT, indexFromWARCs }) + assert.equal(archive.indexFromWARCs, true) + } +}) + +test('WACZ constructor accounts for options.indexFromWARCs if valid.', async (_t) => { + const archive = new WACZ({ input: FIXTURE_INPUT, indexFromWARCs: false }) + assert.equal(archive.indexFromWARCs, false) +}) + test('WACZ constructor ignores options.url if invalid.', async (_t) => { const scenarios = ['foo', {}, Buffer.alloc(0), 12, () => {}] @@ -178,6 +192,17 @@ test('addPage adds entry to pagesTree and turns detectPages off.', async (_t) => assert.equal(archive.pagesTree.length, 1) }) +test('addCDXJ adds entry to cdxTree and turns indexFromWARCs off.', async (_t) => { + const archive = new WACZ({ input: FIXTURE_INPUT }) + assert.equal(archive.indexFromWARCs, true) + assert.equal(archive.cdxTree.length, 0) + + archive.addCDXJ('net,webrecorder)/ 20240307070734 {"url":"https://webrecorder.net/","mime":"text/html","status":200,"digest":"16966a2a2909825ad1d9a6f1b2f4833c8fe43428cb9920d0f974bd7b3d73c31d","length":3941,"offset":0,"filename":"rec-8bc4bd095683-20240307070734658-0.warc.gz"}') + + assert.equal(archive.indexFromWARCs, false) + 
assert.equal(archive.cdxTree.length, 1) +}) + // Note: if `TEST_SIGNING_URL` / `TEST_SIGNING_TOKEN` are present, this will also test the signing feature. test('WACZ.process runs the entire process and writes a valid .wacz to disk, accounting for options.', async (_t) => { //