Skip to content

Commit

Permalink
Make code review revisions
Browse files Browse the repository at this point in the history
- Replace existing -p/--pages implementation rather than adding
another option
- Rather than hardcoding allowed names, check that JSONL files
passed have correct extension and are well-formed JSON lines
- Modify tests and fixtures to account for new logic
  • Loading branch information
tw4l committed Mar 21, 2024
1 parent d8ba5f2 commit 02ef17a
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 60 deletions.
33 changes: 3 additions & 30 deletions bin/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,8 @@ program.command('create')
'Path to output .wacz file.', 'archive.wacz')
.option(
'-p --pages <string>',
'Path to a jsonl files to be used to replace pages.jsonl. ' +
'If neither --pages nor --pages-dir is provided, js-wacz will attempt to detect pages.')
.option(
'--pages-dir <string>',
'Path to a directory of pages files to copy into WACZ as-is. ' +
'Only files named pages.jsonl or extraPages.jsonl will be copied. ' +
'If neither --pages nor --pages-dir is provided, js-wacz will attempt to detect pages.')
'Path to a directory of pages JSONL files to copy into WACZ as-is. ' +
'If --pages is not provided, js-wacz will attempt to detect pages.')
.option(
'--url <string>',
'If provided, will be used as the "main page url" in datapackage.json.')
Expand Down Expand Up @@ -120,36 +115,14 @@ program.command('create')
description: values?.desc,
signingUrl: values?.signingUrl,
signingToken: values?.signingToken,
pagesDir: values?.pagesDir,
pages: values?.pages,
log
})
} catch (err) {
log.error(`${err}`) // Show simplified report
return
}

// Ingest user-provided pages.jsonl file, if any.
if (values?.pages) {
try {
log.info(`pages.jsonl: Reading entries from ${values?.pages}`)
const rl = readline.createInterface({ input: createReadStream(values.pages) })

for await (const line of rl) {
const page = JSON.parse(line)

if (!page?.url) {
continue
}

log.info(`Adding ${page.url}.`)
archive.addPage(page?.url, page?.title, page?.ts)
}
} catch (err) {
log.trace(err)
log.error('An error occurred while processing user-provided pages.jsonl.')
}
}

// Ingest user-provided CDX files, if any.
if (values?.cdxj) {
try {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"}
{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"
{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": None, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
1 change: 1 addition & 0 deletions fixtures/pages/invalid.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Not a JSONL file
50 changes: 28 additions & 22 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import fs from 'fs/promises'
import { createWriteStream, createReadStream, WriteStream, unlinkSync } from 'fs' // eslint-disable-line
import { createHash } from 'crypto'
import { basename, sep, resolve } from 'path'
import * as readline from 'node:readline/promises'

import { Deflate } from 'pako'
import { globSync } from 'glob'
Expand Down Expand Up @@ -178,7 +179,7 @@ export class WACZ {
archiveStream = null

/**
* Path to directory of pages.jsonl files to copy as-is into WACZ.
* Path to directory of pages JSONL files to copy as-is into WACZ.
* @type {?string}
*/
pagesDir = null
Expand Down Expand Up @@ -282,9 +283,9 @@ export class WACZ {
this.detectPages = false
}

if (options?.pagesDir) {
if (options?.pages) {
this.detectPages = false
this.pagesDir = String(options?.pagesDir).trim()
this.pagesDir = String(options?.pages).trim()
}

if (options?.indexFromWARCs === false) {
Expand Down Expand Up @@ -606,34 +607,39 @@ export class WACZ {

const { pagesDir, log, addFileToZip } = this

const allowedPagesFiles = ['pages.jsonl', 'extraPages.jsonl']

if (!this.pagesDir) {
if (!pagesDir) {
throw new Error('Error copying pages files, no directory specified.')
}

try {
const pagesFiles = await fs.readdir(pagesDir)
const pagesFiles = await fs.readdir(pagesDir)

for (let i = 0; i < pagesFiles.length; i++) {
const filename = pagesFiles[i]
const pagesFile = resolve(this.pagesDir, filename)
for (let i = 0; i < pagesFiles.length; i++) {
const filename = pagesFiles[i]
const filenameLower = filename.toLowerCase()
const pagesFile = resolve(this.pagesDir, filename)

if (!allowedPagesFiles.includes(filename)) {
log.warn(`Pages: Skipping file ${pagesFile}, not pages.jsonl or extraPages.jsonl`)
continue
}
if (!filenameLower.endsWith('.jsonl')) {
log.warn(`Pages: Skipping file ${pagesFile}, does not end with jsonl extension`)
continue
}

let isValidJSONL = true

let destination = 'pages/pages.jsonl'
if (filename === 'extraPages.jsonl') {
destination = 'pages/extraPages.jsonl'
// Ensure file is valid JSONL
const rl = readline.createInterface({ input: createReadStream(pagesFile) })
for await (const line of rl) {
try {
JSON.parse(line)
} catch (err) {
isValidJSONL = false
log.warn(`Pages: Skipping file ${pagesFile}, not valid JSONL`)
break
}
}

await addFileToZip(pagesFile, destination)
if (isValidJSONL) {
await addFileToZip(pagesFile, `pages/${filename}`)
}
} catch (err) {
log.trace(err)
throw new Error('An error occurred while copying pages files into WACZ.')
}
}

Expand Down
13 changes: 7 additions & 6 deletions index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) =
assert.equal(archive.detectPages, false)
})

test('WACZ constructor accounts for options.pagesDir if provided.', async (_t) => {
const archive = new WACZ({ input: FIXTURE_INPUT, pagesDir: PAGES_DIR_FIXTURES_PATH })
test('WACZ constructor accounts for options.pages if provided.', async (_t) => {
const archive = new WACZ({ input: FIXTURE_INPUT, pages: PAGES_DIR_FIXTURES_PATH })
assert.equal(archive.detectPages, false)
assert.equal(archive.pagesDir, PAGES_DIR_FIXTURES_PATH)
})
Expand Down Expand Up @@ -347,7 +347,7 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f
url: 'https://lil.law.harvard.edu',
title: 'WACZ Title',
description: 'WACZ Description',
pagesDir: PAGES_DIR_FIXTURES_PATH
pages: PAGES_DIR_FIXTURES_PATH
}

const archive = new WACZ(options)
Expand All @@ -356,9 +356,10 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f

const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line

// File in pages fixture directory not named pages.jsonl or extraPages.jsonl
// should not exist in the WACZ.
assert.rejects(async () => await zip.entryData('pages/invalidName.jsonl'))
// File in pages fixture directory that are invalid JSONL or have wrong extension
// should not be copied into the WACZ.
assert.rejects(async () => await zip.entryData('pages/invalid.jsonl'))
assert.rejects(async () => await zip.entryData('pages/invalid.txt'))

// pages/pages.jsonl and pages/extraPages.jsonl should have same hash as fixtures
// they were copied from.
Expand Down

0 comments on commit 02ef17a

Please sign in to comment.