Skip to content

Commit

Permalink
Check conformance of pages files against spec
Browse files Browse the repository at this point in the history
  • Loading branch information
tw4l committed Mar 21, 2024
1 parent 02ef17a commit dd847e7
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 3 deletions.
4 changes: 2 additions & 2 deletions fixtures/pages/invalid.jsonl
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"
{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": None, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
{id": "extra-pages", "title": "Extra Pages"}
{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": null, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
13 changes: 12 additions & 1 deletion index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { createWriteStream, createReadStream, WriteStream, unlinkSync } from 'fs
import { createHash } from 'crypto'
import { basename, sep, resolve } from 'path'
import * as readline from 'node:readline/promises'
import assert from 'node:assert/strict'

import { Deflate } from 'pako'
import { globSync } from 'glob'
Expand Down Expand Up @@ -627,9 +628,19 @@ export class WACZ {

// Ensure file is valid JSONL
const rl = readline.createInterface({ input: createReadStream(pagesFile) })
let lineIndex = 0

for await (const line of rl) {
try {
JSON.parse(line)
const page = JSON.parse(line)
if (lineIndex === 0) {
assert(page.format)
assert(page.id)
} else {
assert(page.url)
assert(page.ts)
}
lineIndex++
} catch (err) {
isValidJSONL = false
log.warn(`Pages: Skipping file ${pagesFile}, not valid JSONL`)
Expand Down

0 comments on commit dd847e7

Please sign in to comment.