From 62921efa5273bb0fbd3fb52eec63008c39231914 Mon Sep 17 00:00:00 2001 From: Matteo Cargnelutti Date: Fri, 22 Mar 2024 16:02:45 -0400 Subject: [PATCH] 0.1.0 RC Added comments, fixed docs --- README.md | 5 +++-- index.js | 10 +++++----- types.js | 4 +++- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index a25ce9c..834c8d7 100644 --- a/README.md +++ b/README.md @@ -92,12 +92,13 @@ js-wacz create --file cool-beans.warc --output cool-beans.wacz ### --pages, -p -Pass a specific [pages.jsonl](https://specs.webrecorder.net/wacz/1.1.1/#pages-jsonl) file. +Path to a folder containing [pages.jsonl](https://specs.webrecorder.net/wacz/1.1.1/#pages-jsonl) files (`pages.jsonl`, `extraPages.jsonl` ...). If not provided, **js-wacz** is going to attempt to detect pages in WARC records to build its own `pages.jsonl` index. ```bash -js-wacz create -f "collection/*.warc.gz" --pages collection/pages.jsonl +# Assuming the following file exists: collection/pages/pages.jsonl +js-wacz create -f "collection/*.warc.gz" --pages collection/pages/ ``` ### --cdxj diff --git a/index.js b/index.js index 146b7d7..935d495 100644 --- a/index.js +++ b/index.js @@ -600,7 +600,7 @@ export class WACZ { } /** - * Copies pages.jsonl and extraPages.jsonl files in this.pagesDir into ZIP. + * Copies pages.jsonl and extraPages.jsonl files in `this.pagesDir` into ZIP. 
* @returns {Promise} */ copyPagesFilesToZip = async () => { @@ -619,8 +619,9 @@ export class WACZ { const filenameLower = filename.toLowerCase() const pagesFile = resolve(this.pagesDir, filename) + // Ensure file is JSONL if (!filenameLower.endsWith('.jsonl')) { - log.warn(`Pages: Skipping file ${pagesFile}, does not end with jsonl extension`) + log.warn(`Pages: Skipping file ${basename(pagesFile)}: does not end with jsonl extension.`) continue } @@ -644,7 +645,7 @@ export class WACZ { } catch (err) { isValidJSONL = false log.trace(err) - log.warn(`Pages: Skipping file ${pagesFile}, not valid JSONL`) + log.warn(`Pages: Skipping file ${basename(pagesFile)}: not valid JSONL / page entry.`) break } } @@ -656,7 +657,7 @@ export class WACZ { } /** - * Streams all the files listes in `this.WARCs` to the output ZIP. + * Streams all the files listed in `this.WARCs` to the output ZIP. * @returns {Promise} */ writeWARCsToZip = async () => { @@ -886,7 +887,6 @@ export class WACZ { addCDXJ = (cdjx) => { this.stateCheck() this.indexFromWARCs = false - this.cdxTree.setIfNotPresent(cdjx, true) } diff --git a/types.js b/types.js index 6bab1f7..b0e5773 100644 --- a/types.js +++ b/types.js @@ -3,7 +3,9 @@ * @typedef {Object} WACZOptions * @property {string|string[]} input - Required. Path(s) to input .warc or .warc.gz file(s). Glob-compatible. * @property {string} output - Required. Path to output .wacz file. Will default to PWD + `archive.wacz` if not provided. - * @property {boolean} [detectPages=true] - If true (default), will attempt to detect pages in WARC records. + * @property {boolean} [indexFromWARCs=true] - If true, will attempt to generate CDXJ indexes from processed WARCs. Automatically disabled if `addCDXJ()` is called. + * @property {boolean} [detectPages=true] - If true (default), will attempt to detect pages in WARC records. Automatically disabled if `pages` is provided or `addPages()` is called. 
+ * @property {?string} pages - Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...). * @property {?string} url - If set, will be added to datapackage.json as `mainPageUrl`. * @property {?string} ts - If set, will be added to datapackage.json as `mainPageDate`. Can be any value that `Date()` can parse. * @property {?string} title - If set, will be added to datapackage.json as `title`.