Skip to content

Commit

Permalink
Merge pull request #92 from tw4l/issue-91-copy-pages-files
Browse files Browse the repository at this point in the history
Modify --pages option to copy pages files directly into WACZ
  • Loading branch information
matteocargnelutti authored Mar 22, 2024
2 parents 6fcf020 + 729affa commit ec3bac4
Show file tree
Hide file tree
Showing 8 changed files with 149 additions and 27 deletions.
27 changes: 3 additions & 24 deletions bin/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ program.command('create')
'Path to output .wacz file.', 'archive.wacz')
.option(
'-p --pages <string>',
'Path to a jsonl files to be used to replace pages.jsonl. ' +
'If not provided, js-wacz will attempt to detect pages.')
'Path to a directory of pages JSONL files to copy into WACZ as-is. ' +
'If --pages is not provided, js-wacz will attempt to detect pages.')
.option(
'--url <string>',
'If provided, will be used as the "main page url" in datapackage.json.')
Expand Down Expand Up @@ -115,35 +115,14 @@ program.command('create')
description: values?.desc,
signingUrl: values?.signingUrl,
signingToken: values?.signingToken,
pages: values?.pages,
log
})
} catch (err) {
log.error(`${err}`) // Show simplified report
return
}

// Ingest user-provided pages.jsonl file, if any.
if (values?.pages) {
try {
log.info(`pages.jsonl: Reading entries from ${values?.pages}`)
const rl = readline.createInterface({ input: createReadStream(values.pages) })

for await (const line of rl) {
const page = JSON.parse(line)

if (!page?.url) {
continue
}

log.info(`Adding ${page.url}.`)
archive.addPage(page?.url, page?.title, page?.ts)
}
} catch (err) {
log.trace(err)
log.error('An error occurred while processing user-provided pages.jsonl.')
}
}

// Ingest user-provided CDX files, if any.
if (values?.cdxj) {
try {
Expand Down
18 changes: 18 additions & 0 deletions constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,24 @@ export const BASE_PATH = dirname(fileURLToPath(import.meta.url))
*/
export const FIXTURES_PATH = `${BASE_PATH}${sep}fixtures${sep}`

/**
 * Path to the "pages" sub-directory of the fixtures folder.
 * @constant
 */
export const PAGES_DIR_FIXTURES_PATH = FIXTURES_PATH + 'pages' + sep

/**
 * Path to the pages.jsonl fixture file.
 * @constant
 */
export const PAGES_FIXTURE_PATH = PAGES_DIR_FIXTURES_PATH + 'pages.jsonl'

/**
 * Path to the extraPages.jsonl fixture file.
 * @constant
 */
export const EXTRA_PAGES_FIXTURE_PATH = PAGES_DIR_FIXTURES_PATH + 'extraPages.jsonl'

/**
* Colors scheme for log level.
* @constant
Expand Down
4 changes: 4 additions & 0 deletions fixtures/pages/extraPages.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"}
{"id": "e33b4ca5-ce1d-46b2-83ea-405c43b949c5", "url": "https://webrecorder.net/tools", "title": "Webrecorder | Tools", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:22Z"}
{"id": "d026299c-3e37-4473-bcb4-742bc005b25d", "url": "https://webrecorder.net/blog", "title": "Webrecorder | Blog", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
{"id": "726e4e11-abb5-447d-b0be-61c4de7bb4b1", "url": "https://webrecorder.net/community", "title": "Webrecorder | Community", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
2 changes: 2 additions & 0 deletions fixtures/pages/invalid.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{id": "extra-pages", "title": "Extra Pages"}
{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": null, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
1 change: 1 addition & 0 deletions fixtures/pages/invalid.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Not a JSONL file
2 changes: 2 additions & 0 deletions fixtures/pages/pages.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
{"id": "3e01410a-e0a8-4b6f-8a6a-fca6302d9916", "url": "https://webrecorder.net/", "title": "Webrecorder", "loadState": 4, "status": 200, "seed": true, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:17Z"}
77 changes: 75 additions & 2 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import fs from 'fs/promises'
import { createWriteStream, createReadStream, WriteStream, unlinkSync } from 'fs' // eslint-disable-line
import { createHash } from 'crypto'
import { basename, sep } from 'path'
import { basename, sep, resolve } from 'path'
import * as readline from 'node:readline/promises'
import assert from 'node:assert/strict'

import { Deflate } from 'pako'
import { globSync } from 'glob'
Expand Down Expand Up @@ -177,6 +179,12 @@ export class WACZ {
*/
archiveStream = null

/**
* Path to directory of pages JSONL files to copy as-is into WACZ.
* @type {?string}
*/
pagesDir = null

/**
* @param {WACZOptions} options - See {@link WACZOptions} for details.
*/
Expand Down Expand Up @@ -276,6 +284,11 @@ export class WACZ {
this.detectPages = false
}

if (options?.pages) {
this.detectPages = false
this.pagesDir = String(options?.pages).trim()
}

if (options?.indexFromWARCs === false) {
this.indexFromWARCs = false
}
Expand Down Expand Up @@ -359,7 +372,11 @@ export class WACZ {
await this.writeIndexesToZip()

info('Writing pages.jsonl to WACZ')
await this.writePagesToZip()
if (!this.pagesDir) {
await this.writePagesToZip()
} else {
await this.copyPagesFilesToZip()
}

info('Writing WARCs to WACZ')
await this.writeWARCsToZip()
Expand Down Expand Up @@ -582,6 +599,62 @@ export class WACZ {
}
}

/**
 * Copies the pages JSONL files (e.g. pages.jsonl, extraPages.jsonl) found in
 * `this.pagesDir` into the ZIP under "pages/".
 * Files that do not carry a ".jsonl" extension or that are not valid pages
 * JSONL are skipped with a warning rather than aborting the whole archive.
 * @returns {Promise<void>}
 */
copyPagesFilesToZip = async () => {
  this.stateCheck()

  const { pagesDir, log, addFileToZip } = this

  if (!pagesDir) {
    throw new Error('Error copying pages files, no directory specified.')
  }

  const pagesFiles = await fs.readdir(pagesDir)

  for (let i = 0; i < pagesFiles.length; i++) {
    const filename = pagesFiles[i]
    const filenameLower = filename.toLowerCase()
    // Use the destructured `pagesDir` consistently (was `this.pagesDir`).
    const pagesFile = resolve(pagesDir, filename)

    if (!filenameLower.endsWith('.jsonl')) {
      log.warn(`Pages: Skipping file ${pagesFile}, does not end with jsonl extension`)
      continue
    }

    let isValidJSONL = true

    // Ensure file is valid JSONL before copying it as-is.
    const rl = readline.createInterface({ input: createReadStream(pagesFile) })
    let lineIndex = 0

    for await (const line of rl) {
      try {
        const page = JSON.parse(line)
        // First line is the header record; subsequent lines are page entries.
        if (lineIndex === 0) {
          assert(page.format)
          assert(page.id)
        } else {
          assert(page.url)
          assert(page.ts)
        }
        lineIndex++
      } catch (err) {
        isValidJSONL = false
        log.trace(err)
        log.warn(`Pages: Skipping file ${pagesFile}, not valid JSONL`)
        break
      }
    }

    if (isValidJSONL) {
      // FIX: interpolate the actual file name into the ZIP entry path
      // (source had a garbled/wrong literal). The datapackage test expects
      // entries at "pages/pages.jsonl" and "pages/extraPages.jsonl".
      await addFileToZip(pagesFile, `pages/${filename}`)
    }
  }
}

/**
* Streams all the files listes in `this.WARCs` to the output ZIP.
* @returns {Promise<void>}
Expand Down
45 changes: 44 additions & 1 deletion index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip'
import * as dotenv from 'dotenv'

import { WACZ } from './index.js'
import { FIXTURES_PATH } from './constants.js'
import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH } from './constants.js'
import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import

// Loads env vars from .env if provided
Expand Down Expand Up @@ -74,6 +74,12 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) =
assert.equal(archive.detectPages, false)
})

// Providing options.pages must record the directory and turn page detection off.
test('WACZ constructor accounts for options.pages if provided.', async (_t) => {
  const wacz = new WACZ({ input: FIXTURE_INPUT, pages: PAGES_DIR_FIXTURES_PATH })
  assert.equal(wacz.pagesDir, PAGES_DIR_FIXTURES_PATH)
  assert.equal(wacz.detectPages, false)
})

test('WACZ constructor ignores options.indexFromWARCs if invalid.', async (_t) => {
const scenarios = ['foo', {}, Buffer.alloc(0), 12, () => {}]

Expand Down Expand Up @@ -333,3 +339,40 @@ test('WACZ.process runs the entire process and writes a valid .wacz to disk, acc
// Delete temp file
await fs.unlink(options.output)
})

// End-to-end: WACZ.process with a pages directory copies valid JSONL files
// into the archive verbatim and excludes invalid/mis-named files.
test('WACZ.process with pagesDir option creates valid WACZ with provided pages files.', async (_t) => {
  const options = {
    input: FIXTURE_INPUT,
    output: '../tmp.wacz',
    url: 'https://lil.law.harvard.edu',
    title: 'WACZ Title',
    description: 'WACZ Description',
    pages: PAGES_DIR_FIXTURES_PATH
  }

  const archive = new WACZ(options)

  await archive.process(false)

  const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line

  // Files in the pages fixture directory that are invalid JSONL or have the
  // wrong extension should not be copied into the WACZ.
  // FIX: assert.rejects returns a Promise — it must be awaited, otherwise the
  // test can complete before the assertion settles and failures go unreported.
  await assert.rejects(async () => await zip.entryData('pages/invalid.jsonl'))
  await assert.rejects(async () => await zip.entryData('pages/invalid.txt'))

  // pages/pages.jsonl and pages/extraPages.jsonl should have the same hash as
  // the fixtures they were copied from.
  const datapackage = JSON.parse(await zip.entryData('datapackage.json'))

  // .find() replaces .filter(...)[0] — same result, clearer intent.
  const datapackagePages = datapackage.resources.find(entry => entry.path === 'pages/pages.jsonl')
  const pagesFixtureHash = await archive.sha256(PAGES_FIXTURE_PATH)
  assert.equal(datapackagePages.hash, pagesFixtureHash)

  const datapackageExtraPages = datapackage.resources.find(entry => entry.path === 'pages/extraPages.jsonl')
  const extraPagesFixtureHash = await archive.sha256(EXTRA_PAGES_FIXTURE_PATH)
  assert.equal(datapackageExtraPages.hash, extraPagesFixtureHash)

  // Delete temp file
  await fs.unlink(options.output)
})

0 comments on commit ec3bac4

Please sign in to comment.