Skip to content

Commit

Permalink
Merge pull request #120 from harvard-lil/cdxj-followups
Browse files Browse the repository at this point in the history
Move CDXJ file handling from bin/cli.js to WACZ class
  • Loading branch information
matteocargnelutti authored Aug 15, 2024
2 parents 08ee202 + 8b6a113 commit 5014fff
Show file tree
Hide file tree
Showing 8 changed files with 117 additions and 42 deletions.
39 changes: 3 additions & 36 deletions bin/cli.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
#! /usr/bin/env node

import { createReadStream } from 'fs'
import fs from 'fs/promises'
import { resolve } from 'path'
import * as readline from 'node:readline/promises'

import log from 'loglevel'
import logPrefix from 'loglevel-plugin-prefix'
import { Command } from 'commander'
Expand Down Expand Up @@ -64,7 +59,8 @@ program.command('create')
.option('--cdxj <string>',
'Path to a directory containing CDXJ indices to merge into final WACZ CDXJ. ' +
'If not provided, js-wacz will reindex from WARCS. Must be used in combination ' +
'with --pages, since using this option will skip reading the WARC files.')
'with --pages, since using this option will skip the step required to generate a ' +
'pages.jsonl file.')
.action(async (name, options, command) => {
/** @type {Object} */
const values = options._optionValues
Expand Down Expand Up @@ -116,43 +112,14 @@ program.command('create')
signingUrl: values?.signingUrl,
signingToken: values?.signingToken,
pages: values?.pages,
cdxj: values?.cdxj,
log
})
} catch (err) {
log.error(`${err}`) // Show simplified report
process.exit(1)
}

// Ingest user-provided CDX files, if any.
if (values?.cdxj) {
try {
const dirPath = values?.cdxj
const cdxjFiles = await fs.readdir(dirPath)
const allowedExts = ['cdx', 'cdxj']

for (let i = 0; i < cdxjFiles.length; i++) {
const cdxjFile = resolve(dirPath, cdxjFiles[i])

const ext = cdxjFile.split('.').pop()
if (!allowedExts.includes(ext)) {
log.warn(`CDXJ: Skipping file ${cdxjFile}, not a CDXJ file`)
continue
}

log.info(`CDXJ: Reading entries from ${cdxjFile}`)
const rl = readline.createInterface({ input: createReadStream(cdxjFile) })

for await (const line of rl) {
archive.addCDXJ(line + '\n')
}
}
} catch (err) {
log.trace(err)
log.error('An error occurred while processing user-provided CDXJ indices.')
process.exit(1)
}
}

// Main process
try {
await archive.process()
Expand Down
6 changes: 6 additions & 0 deletions constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ export const PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}pages.jsonl`
*/
export const EXTRA_PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}extraPages.jsonl`

/**
* Path to the fixtures folder cdxj sub-directory.
* Built by appending `cdxj` and a trailing `sep` to `FIXTURES_PATH`.
* @constant
*/
export const CDXJ_DIR_FIXTURES_PATH = `${FIXTURES_PATH}cdxj${sep}`

/**
* Colors scheme for log level.
* @constant
Expand Down
1 change: 1 addition & 0 deletions fixtures/cdxj/invalid.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
not cdxj
1 change: 1 addition & 0 deletions fixtures/cdxj/valid.cdxj
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
net,webrecorder)/ 20240814070734 {"url":"https://webrecorder.net/","mime":"text/html","status":200,"digest":"16966a2a2909825ad1d9a6f1b2f4833c8fe43428cb9920d0f974bd7b3d73c31d","length":3941,"offset":0,"filename":"rec-8bc4bd095683-20240307070734658-0.warc.gz"}
2 changes: 2 additions & 0 deletions fixtures/cdxj/valid2.cdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
net,webrecorder)/assets/wr-logo.svg 20240814162441 {"url":"https://webrecorder.net/assets/wr-logo.svg","mime":"image/svg","status":200,"digest":"00c5957f7c97b2e79433fe607bdb47ecb3837a7ee7b603849e7cfb52dcc5f4c7","length":2041,"offset":19176,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}
net,webrecorder)/assets/favicon.ico 20240814162442 {"url":"https://webrecorder.net/assets/favicon.ico","mime":"image/vnd.microsoft.icon","status":200,"digest":"e39a17af5d611f3a36784bc70128f93c10a7f5db03626d3030edf2ee1772e328","length":15398,"offset":87313,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}
56 changes: 54 additions & 2 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,12 @@ export class WACZ {
*/
pagesDir = null

/**
* Path to directory of CDXJ files to copy as-is into WACZ.
* Populated from `options.cdxj` (stringified and trimmed) when that option is provided; `null` otherwise.
* @type {?string}
*/
cdxjDir = null

/**
* @param {WACZOptions} options - See {@link WACZOptions} for details.
*/
Expand Down Expand Up @@ -284,13 +290,19 @@ export class WACZ {
this.detectPages = false
}

if (options?.indexFromWARCs === false) {
this.indexFromWARCs = false
}

if (options?.pages) {
this.detectPages = false
this.pagesDir = String(options?.pages).trim()
}

if (options?.indexFromWARCs === false) {
this.indexFromWARCs = false
if (options?.cdxj) {
this.detectPages = false
this.indexFromWARCs = false // Added here for clarity, but implied by calls to `this.addCDXJ()`
this.cdxjDir = String(options?.cdxj).trim()
}

if (options?.url) {
Expand Down Expand Up @@ -360,6 +372,11 @@ export class WACZ {
info('Initializing indexer')
this.initWorkerPool()

if (this.cdxjDir) {
info('Reading provided CDXJ files')
await this.readFromExistingCDXJ()
}

if (this.indexFromWARCs) {
info('Indexing WARCS')
await this.indexWARCs()
Expand Down Expand Up @@ -656,6 +673,41 @@ export class WACZ {
}
}

/**
 * Reads every line of the CDXJ files found in `this.cdxjDir` and passes each one,
 * newline-terminated, to `this.addCDXJ()`.
 *
 * Only entries whose extension is `cdx` or `cdxj` (case-insensitive) are read;
 * anything else in the directory is skipped with a warning.
 *
 * @returns {Promise<void>}
 * @throws {Error} If `this.cdxjDir` is not set.
 */
readFromExistingCDXJ = async () => {
  this.stateCheck()

  const { cdxjDir, log } = this

  if (!cdxjDir) {
    throw new Error('Error copying CDXJ files, no directory specified.')
  }

  const allowedExts = ['cdx', 'cdxj']

  const cdxjFiles = await fs.readdir(cdxjDir)

  for (const cdxjFile of cdxjFiles) {
    const cdxjFilepath = resolve(cdxjDir, cdxjFile)

    const ext = cdxjFilepath.toLowerCase().split('.').pop()
    if (!allowedExts.includes(ext)) {
      log.warn(`CDXJ: Skipping file ${cdxjFile}, not a CDXJ file`)
      continue
    }

    log.info(`CDXJ: Reading entries from ${cdxjFile}`)
    const input = createReadStream(cdxjFilepath)

    // `crlfDelay: Infinity` makes every CR LF pair count as a single line break,
    // so Windows-style files never yield spurious empty lines.
    const rl = readline.createInterface({ input, crlfDelay: Infinity })

    try {
      for await (const line of rl) {
        this.addCDXJ(line + '\n')
      }
    } finally {
      // Release the interface and the underlying file handle even if
      // `addCDXJ()` throws part-way through a file.
      rl.close()
      input.destroy()
    }
  }
}

/**
* Streams all the files listed in `this.WARCs` to the output ZIP.
* @returns {Promise<void>}
Expand Down
47 changes: 46 additions & 1 deletion index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip'
import * as dotenv from 'dotenv'

import { WACZ } from './index.js'
import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH } from './constants.js'
import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH, CDXJ_DIR_FIXTURES_PATH } from './constants.js'
import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import

// Loads env vars from .env if provided
Expand Down Expand Up @@ -376,3 +376,48 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f
// Delete temp file
await fs.unlink(options.output)
})

// Checks that, when the `cdxj` option is given, the resulting WACZ's `indexes/index.cdx`
// is made of exactly the entries from the CDXJ fixtures, followed only by empty lines.
test('WACZ.process with cdxj option creates valid WACZ with index from provided CDXJ files.', async (_t) => {
  const options = {
    input: FIXTURE_INPUT,
    output: '../tmp.wacz',
    url: 'https://lil.law.harvard.edu',
    title: 'WACZ Title',
    description: 'WACZ Description',
    pages: PAGES_DIR_FIXTURES_PATH,
    cdxj: CDXJ_DIR_FIXTURES_PATH
  }

  // Entries expected in the combined index, in order.
  const expectedEntries = [
    'net,webrecorder)/ 20240814070734 {"url":"https://webrecorder.net/","mime":"text/html","status":200,"digest":"16966a2a2909825ad1d9a6f1b2f4833c8fe43428cb9920d0f974bd7b3d73c31d","length":3941,"offset":0,"filename":"rec-8bc4bd095683-20240307070734658-0.warc.gz"}',
    'net,webrecorder)/assets/favicon.ico 20240814162442 {"url":"https://webrecorder.net/assets/favicon.ico","mime":"image/vnd.microsoft.icon","status":200,"digest":"e39a17af5d611f3a36784bc70128f93c10a7f5db03626d3030edf2ee1772e328","length":15398,"offset":87313,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}',
    'net,webrecorder)/assets/wr-logo.svg 20240814162441 {"url":"https://webrecorder.net/assets/wr-logo.svg","mime":"image/svg","status":200,"digest":"00c5957f7c97b2e79433fe607bdb47ecb3837a7ee7b603849e7cfb52dcc5f4c7","length":2041,"offset":19176,"filename":"rec-de8ca7249fc0-20240814162441960-0.warc.gz"}'
  ]

  const archive = new WACZ(options)

  await archive.process(false)

  const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line

  try {
    //
    // Indexes should be present
    //
    // NOTE: A test for the ZipNum Shared Index feature would require additional / larger fixtures.
    const indexData = await zip.entryData('indexes/index.cdx')
    assert(indexData)

    // Check index contents
    const entries = indexData.toString('utf-8').split('\n')

    // Comparing the leading slice as a whole fails on missing entries too,
    // which a per-line loop would silently let through.
    assert.deepEqual(entries.slice(0, expectedEntries.length), expectedEntries)

    // Anything after the expected entries must be an empty line.
    for (const entry of entries.slice(expectedEntries.length)) {
      assert.equal(entry, '')
    }
  } finally {
    // Close the archive handle, then delete temp file -- even if an assertion failed.
    try {
      await zip.close()
    } finally {
      await fs.unlink(options.output)
    }
  }
})
7 changes: 4 additions & 3 deletions types.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
* @typedef {Object} WACZOptions
* @property {string|string[]} input - Required. Path(s) to input .warc or .warc.gz file(s). Glob-compatible.
* @property {string} output - Required. Path to output .wacz file. Will default to PWD + `archive.wacz` if not provided.
* @property {boolean} [indexFromWARCs=true] - If true, will attempt to generate CDXJ indexes from processed WARCs. Automatically disabled if `addCDXJ()` is called.
* @property {boolean} [detectPages=true] - If true (default), will attempt to detect pages in WARC records. Automatically disabled if `pages` is provided or `addPages()` is called.
* @property {?string} pages - Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...).
* @property {boolean} [indexFromWARCs=true] - If true, will attempt to generate CDXJ indexes from processed WARCs. Automatically disabled if `cdxj` is passed or `addCDXJ()` is called.
* @property {boolean} [detectPages=true] - If true (default), will attempt to detect pages in WARC records to generate a pages.jsonl file. Automatically disabled if `pages` or `cdxj` is provided, or if `addPages()` or `addCDXJ()` is called.
* @property {?string} url - If set, will be added to datapackage.json as `mainPageUrl`.
* @property {?string} ts - If set, will be added to datapackage.json as `mainPageDate`. Can be any value that `Date()` can parse.
* @property {?string} title - If set, will be added to datapackage.json as `title`.
* @property {?string} description - If set, will be added to datapackage.json as `description`.
* @property {?string} signingUrl - If set, will be used to try and sign the resulting archive.
* @property {?string} signingToken - Access token to be used in combination with `signingUrl`.
* @property {?Object} datapackageExtras - If set, will be appended to datapackage.json under `extras`.
* @property {?string} cdxj - If set, skips indexing and allows for passing CDXJ files "as is". Path to a folder containing CDXJ files. Must be used in combination with `pages`, since using this option will skip the step required to generate a pages.jsonl file.
* @property {?string} pages - If set, allows for passing pre-set pages.jsonl file(s). Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...).
* @property {?any} log - Will be used instead of the Console API for logging, if compatible (i.e: loglevel). Defaults to globalThis.console.
*/

Expand Down

0 comments on commit 5014fff

Please sign in to comment.