Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move CDXJ file handling from bin/cli.js to WACZ class #120

Merged
merged 5 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 3 additions & 36 deletions bin/cli.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
#! /usr/bin/env node

import { createReadStream } from 'fs'
import fs from 'fs/promises'
import { resolve } from 'path'
import * as readline from 'node:readline/promises'

import log from 'loglevel'
import logPrefix from 'loglevel-plugin-prefix'
import { Command } from 'commander'
Expand Down Expand Up @@ -64,7 +59,8 @@ program.command('create')
.option('--cdxj <string>',
'Path to a directory containing CDXJ indices to merge into final WACZ CDXJ. ' +
'If not provided, js-wacz will reindex from WARCS. Must be used in combination ' +
'with --pages, since using this option will skip reading the WARC files.')
'with --pages, since using this option will skip the step required to generate a ' +
'pages.jsonl file.')
.action(async (name, options, command) => {
/** @type {Object} */
const values = options._optionValues
Expand Down Expand Up @@ -116,43 +112,14 @@ program.command('create')
signingUrl: values?.signingUrl,
signingToken: values?.signingToken,
pages: values?.pages,
cdxj: values?.cdxj,
log
})
} catch (err) {
log.error(`${err}`) // Show simplified report
process.exit(1)
}

// Ingest user-provided CDX files, if any.
if (values?.cdxj) {
try {
const dirPath = values?.cdxj
const cdxjFiles = await fs.readdir(dirPath)
const allowedExts = ['cdx', 'cdxj']

for (let i = 0; i < cdxjFiles.length; i++) {
const cdxjFile = resolve(dirPath, cdxjFiles[i])

const ext = cdxjFile.split('.').pop()
if (!allowedExts.includes(ext)) {
log.warn(`CDXJ: Skipping file ${cdxjFile}, not a CDXJ file`)
continue
}

log.info(`CDXJ: Reading entries from ${cdxjFile}`)
const rl = readline.createInterface({ input: createReadStream(cdxjFile) })

for await (const line of rl) {
archive.addCDXJ(line + '\n')
}
}
} catch (err) {
log.trace(err)
log.error('An error occurred while processing user-provided CDXJ indices.')
process.exit(1)
}
}

// Main process
try {
await archive.process()
Expand Down
6 changes: 6 additions & 0 deletions constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ export const PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}pages.jsonl`
*/
export const EXTRA_PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}extraPages.jsonl`

/**
* Path to the fixtures folder cdxj sub-directory.
* Built from `FIXTURES_PATH` followed by `cdxj` and a trailing `sep`.
* @constant
*/
export const CDXJ_DIR_FIXTURES_PATH = `${FIXTURES_PATH}cdxj${sep}`

/**
* Colors scheme for log level.
* @constant
Expand Down
1 change: 1 addition & 0 deletions fixtures/cdxj/invalid.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
not cdxj
1 change: 1 addition & 0 deletions fixtures/cdxj/valid.cdxj
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
net,webrecorder)/ 20240814070734 {"url":"https://webrecorder.net/","mime":"text/html","status":200,"digest":"16966a2a2909825ad1d9a6f1b2f4833c8fe43428cb9920d0f974bd7b3d73c31d","length":3941,"offset":0,"filename":"rec-8bc4bd095683-20240307070734658-0.warc.gz"}
2 changes: 2 additions & 0 deletions fixtures/cdxj/valid2.cdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
net,webrecorder)/assets/wr-logo.svg 20240814162441 {"url":"https://webrecorder.net/assets/wr-logo.svg","mime":"image/svg","status":200,"digest":"00c5957f7c97b2e79433fe607bdb47ecb3837a7ee7b603849e7cfb52dcc5f4c7","length":2041,"offset":19176,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}
net,webrecorder)/assets/favicon.ico 20240814162442 {"url":"https://webrecorder.net/assets/favicon.ico","mime":"image/vnd.microsoft.icon","status":200,"digest":"e39a17af5d611f3a36784bc70128f93c10a7f5db03626d3030edf2ee1772e328","length":15398,"offset":87313,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}
56 changes: 54 additions & 2 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,12 @@ export class WACZ {
*/
pagesDir = null

/**
* Path to directory of CDXJ files to copy as-is into WACZ.
* Defaults to `null`; the constructor sets it to the trimmed `cdxj` option when that option is provided.
* @type {?string}
*/
cdxjDir = null

/**
* @param {WACZOptions} options - See {@link WACZOptions} for details.
*/
Expand Down Expand Up @@ -284,13 +290,19 @@ export class WACZ {
this.detectPages = false
}

if (options?.indexFromWARCs === false) {
this.indexFromWARCs = false
}

if (options?.pages) {
this.detectPages = false
this.pagesDir = String(options?.pages).trim()
}

if (options?.indexFromWARCs === false) {
this.indexFromWARCs = false
if (options?.cdxj) {
this.detectPages = false
this.indexFromWARCs = false // Added here for clarity, but implied by calls to `this.addCDXJ()`
this.cdxjDir = String(options?.cdxj).trim()
}

if (options?.url) {
Expand Down Expand Up @@ -360,6 +372,11 @@ export class WACZ {
info('Initializing indexer')
this.initWorkerPool()

if (this.cdxjDir) {
info('Reading provided CDXJ files')
await this.readFromExistingCDXJ()
}

if (this.indexFromWARCs) {
info('Indexing WARCS')
await this.indexWARCs()
Expand Down Expand Up @@ -656,6 +673,41 @@ export class WACZ {
}
}

/**
 * Reads lines from CDXJ files in `this.cdxjDir` and hands each one to `this.addCDXJ()`.
 *
 * Only directory entries whose extension is `cdx` or `cdxj` (case-insensitive) are read;
 * anything else is skipped with a warning. Blank lines are ignored so that stray empty
 * lines in a source file do not end up as empty entries in the index.
 *
 * @returns {Promise<void>}
 * @throws {Error} If `this.cdxjDir` is not set.
 */
readFromExistingCDXJ = async () => {
  this.stateCheck()

  const { cdxjDir, log } = this

  if (!cdxjDir) {
    throw new Error('Error copying CDXJ files, no directory specified.')
  }

  const allowedExts = ['cdx', 'cdxj']

  const cdxjFiles = await fs.readdir(cdxjDir)

  for (const cdxjFile of cdxjFiles) {
    const cdxjFilepath = resolve(cdxjDir, cdxjFile)

    // Extension is whatever follows the last dot of the resolved path, compared case-insensitively.
    const ext = cdxjFilepath.toLowerCase().split('.').pop()
    if (!allowedExts.includes(ext)) {
      log.warn(`CDXJ: Skipping file ${cdxjFile}, not a CDXJ file`)
      continue
    }

    log.info(`CDXJ: Reading entries from ${cdxjFile}`)
    const input = createReadStream(cdxjFilepath)
    const rl = readline.createInterface({ input })

    try {
      for await (const line of rl) {
        // A blank line carries no index entry; forwarding it would add a bare "\n" to the index.
        if (line.trim() === '') {
          continue
        }

        this.addCDXJ(line + '\n')
      }
    } finally {
      // Release the interface and the underlying file handle even if reading or `addCDXJ()` throws.
      rl.close()
      input.destroy()
    }
  }
}

/**
* Streams all the files listed in `this.WARCs` to the output ZIP.
* @returns {Promise<void>}
Expand Down
47 changes: 46 additions & 1 deletion index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip'
import * as dotenv from 'dotenv'

import { WACZ } from './index.js'
import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH } from './constants.js'
import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH, CDXJ_DIR_FIXTURES_PATH } from './constants.js'
import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import

// Loads env vars from .env if provided
Expand Down Expand Up @@ -376,3 +376,48 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f
// Delete temp file
await fs.unlink(options.output)
})

test('WACZ.process with cdxj option creates valid WACZ with index from provided CDXJ files.', async (_t) => {
  // Lines the combined index is expected to hold, in the order they should appear.
  // Any line past the end of this list must be empty.
  const expectedEntries = [
    'net,webrecorder)/ 20240814070734 {"url":"https://webrecorder.net/","mime":"text/html","status":200,"digest":"16966a2a2909825ad1d9a6f1b2f4833c8fe43428cb9920d0f974bd7b3d73c31d","length":3941,"offset":0,"filename":"rec-8bc4bd095683-20240307070734658-0.warc.gz"}',
    'net,webrecorder)/assets/favicon.ico 20240814162442 {"url":"https://webrecorder.net/assets/favicon.ico","mime":"image/vnd.microsoft.icon","status":200,"digest":"e39a17af5d611f3a36784bc70128f93c10a7f5db03626d3030edf2ee1772e328","length":15398,"offset":87313,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}',
    'net,webrecorder)/assets/wr-logo.svg 20240814162441 {"url":"https://webrecorder.net/assets/wr-logo.svg","mime":"image/svg","status":200,"digest":"00c5957f7c97b2e79433fe607bdb47ecb3837a7ee7b603849e7cfb52dcc5f4c7","length":2041,"offset":19176,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}'
  ]

  const options = {
    input: FIXTURE_INPUT,
    output: '../tmp.wacz',
    url: 'https://lil.law.harvard.edu',
    title: 'WACZ Title',
    description: 'WACZ Description',
    pages: PAGES_DIR_FIXTURES_PATH,
    cdxj: CDXJ_DIR_FIXTURES_PATH
  }

  const archive = new WACZ(options)
  await archive.process(false)

  const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line

  // The combined index must be present in the archive.
  // NOTE: A test for the ZipNum Shared Index feature would require additional / larger fixtures.
  assert(await zip.entryData('indexes/index.cdx'))

  // Check index contents, line by line.
  const indexBuffer = await zip.entryData('indexes/index.cdx')
  const indexLines = indexBuffer.toString('utf-8').split('\n')

  indexLines.forEach((line, position) => {
    assert.equal(line, expectedEntries[position] ?? '')
  })

  // Delete temp file
  await fs.unlink(options.output)
})
7 changes: 4 additions & 3 deletions types.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
* @typedef {Object} WACZOptions
* @property {string|string[]} input - Required. Path(s) to input .warc or .warc.gz file(s). Glob-compatible.
* @property {string} output - Required. Path to output .wacz file. Will default to PWD + `archive.wacz` if not provided.
* @property {boolean} [indexFromWARCs=true] - If true, will attempt to generate CDXJ indexes from processed WARCs. Automatically disabled if `addCDXJ()` is called.
* @property {boolean} [detectPages=true] - If true (default), will attempt to detect pages in WARC records. Automatically disabled if `pages` is provided or `addPages()` is called.
* @property {?string} pages - Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...).
* @property {boolean} [indexFromWARCs=true] - If true, will attempt to generate CDXJ indexes from processed WARCs. Automatically disabled if `cdxj` is passed or `addCDXJ()` is called.
* @property {boolean} [detectPages=true] - If true (default), will attempt to detect pages in WARC records to generate a pages.jsonl file. Automatically disabled if `pages` / `cdxj` is provided, or `addPages()` / `addCDXJ()` is called.
* @property {?string} url - If set, will be added to datapackage.json as `mainPageUrl`.
* @property {?string} ts - If set, will be added to datapackage.json as `mainPageDate`. Can be any value that `Date()` can parse.
* @property {?string} title - If set, will be added to datapackage.json as `title`.
* @property {?string} description - If set, will be added to datapackage.json as `description`.
* @property {?string} signingUrl - If set, will be used to try and sign the resulting archive.
* @property {?string} signingToken - Access token to be used in combination with `signingUrl`.
* @property {?Object} datapackageExtras - If set, will be appended to datapackage.json under `extras`.
* @property {?string} cdxj - If set, skips indexing and allows for passing CDXJ files "as is". Path to a folder containing CDXJ files. Must be used in combination with `pages`, since using this option will skip the step required to generate a pages.jsonl file.
* @property {?string} pages - If set, allows for passing pre-set pages.jsonl file(s). Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...).
* @property {?any} log - Will be used instead of the Console API for logging, if compatible (i.e: loglevel). Defaults to globalThis.console.
*/

Expand Down
Loading