Migration support for older mupdf-js library (#105)

* gitignore IDE files Took 57 minutes * update package-lock.json Took 1 minute * fix paths in existing test suite, gitignore test output file Took 33 seconds * drawPageAsPng task function Took 23 minutes * drawPageAsHtml task function, with corresponding wasm function Took 22 minutes * update test name Took 28 minutes * drawPageAsSvg function, with corresponding wasm function Took 7 minutes * getPageText function, with corresponding wasm function Took 13 minutes * searchPageText function Took 11 minutes * code style tweaks Took 5 minutes * standardise function name Took 1 hour 35 minutes * initial migration guide Took 39 minutes * fix casing Took 5 seconds * other fixes to docs Took 1 minute * refactor: change function name casing for tasks Took 6 minutes * refactor: remove C implementation of drawPageAsSVG task Took 13 minutes * docs: update migration docs with function name change Took 5 minutes
ArtifexSoftware · Aug 21, 2024 · 1082a6e · 1082a6e
1 parent cf9d258
commit 1082a6e
Show file tree

Hide file tree

Showing 11 changed files with 342 additions and 65 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,5 @@ dist
 node_modules
 .next
 build
+.idea
+/docs/venv/
diff --git a/docs/how-to-guide/index.rst b/docs/how-to-guide/index.rst
@@ -6,7 +6,6 @@ How To Guide
 ===================
 
 
-
 |node_js_logo|
 
 .. toctree::
@@ -16,6 +15,13 @@ How To Guide
     node/index.rst
 
 
+.. toctree::
+    :caption: Migrating from mupdf-js
+    :maxdepth: 1
+
+    migration/index.rst
+
+
 .. toctree::
     :caption: Glossary
     :maxdepth: 1

diff --git a/docs/how-to-guide/migration/index.rst b/docs/how-to-guide/migration/index.rst
@@ -0,0 +1,150 @@
+.. include:: ../../header.rst
+
+.. _How_To_Guide_Migration:
+
+Migrating from `mupdf-js`
+===========================
+
+This guide is intended to help you migrate from the https://github.com/andytango/mupdf-js
+library to this one.
+
+Whilst this package offers a more comprehensive API, we also provide functions
+that are similar to those in `mupdf-js` to make the migration easier. These are
+available in the `mupdf/tasks` module.
+
+1. Initialization
+-------------------
+
+Unlike `mupdf-js`, you don't need to initialize the library before using it.
+
+So you can remove code like this:
+
+.. code-block:: javascript
+
+    import { createMuPdf } from "mupdf-js";
+
+    async function handleSomePdf(file) {
+      const mupdf = await createMuPdf(); // this is no longer needed
+    }
+
+----
+
+2. Loading a document
+---------------------
+
+Just like with `mupdf-js`, you can load a document either as a Buffer
+(in Node.js), an ArrayBuffer (in the browser), or a Uint8Array (in both environments).
+
+We provide a `loadPDF` function that is similar to the `load` method in `mupdf-js`.
+So you can replace this:
+
+.. code-block:: javascript
+
+    import { createMuPdf } from "mupdf-js";
+
+    async function handleSomePdf(file) {
+      const mupdf = await createMuPdf();
+      const buf = await file.arrayBuffer();
+      const arrayBuf = new Uint8Array(buf);
+      const doc = mupdf.load(arrayBuf);
+    }
+
+----
+
+With this:
+
+.. code-block:: javascript
+
+    import { loadPDF } from "mupdf/tasks";
+
+    async function handleSomePdf(file) {
+      const buf = await file.arrayBuffer();
+      const arrayBuf = new Uint8Array(buf);
+      const doc = loadPDF(arrayBuf); // Returns a Document instance
+    }
+
+----
+
+3. Converting a page to an image
+--------------------------------
+
+In `mupdf-js`, you would convert a page to an image like this:
+
+.. code-block:: javascript
+
+    import { createMuPdf } from "mupdf-js";
+
+    async function handleSomePdf(file) {
+      const mupdf = await createMuPdf();
+      const buf = await file.arrayBuffer();
+      const arrayBuf = new Uint8Array(buf);
+      const doc = mupdf.load(arrayBuf);
+
+      // Each of these returns a string:
+
+      const png = mupdf.drawPageAsPNG(doc, 1, 300);
+      const svg = mupdf.drawPageAsSVG(doc, 1);
+      const html = mupdf.drawPageAsHTML(doc, 1);
+    }
+
+----
+
+Here's how you would do it with this package:
+
+.. code-block:: javascript
+
+    import {
+      loadPDF,
+      drawPageAsPNG,
+      drawPageAsSVG,
+      drawPageAsHTML
+    } from "mupdf/tasks";
+
+    async function handleSomePdf(file) {
+      const buf = await file.arrayBuffer();
+      const arrayBuf = new Uint8Array(buf);
+      const doc = loadPDF(arrayBuf);
+
+      // Each of these returns a string:
+
+      const png = drawPageAsPNG(doc, 1, 300);
+      const svg = drawPageAsSVG(doc, 1);
+      const html = drawPageAsHTML(doc, 1);
+    }
+
+----
+
+4. Text operations
+-------------------
+
+Finally, we provide two functions to replace the `mupdf-js` `getPageText` and
+`searchPageText` functions:
+
+.. code-block:: javascript
+
+    import {
+      loadPDF,
+      getPageText,
+      searchPageText
+    } from "mupdf/tasks";
+
+    async function handleSomePdf(file) {
+      const buf = await file.arrayBuffer();
+      const arrayBuf = new Uint8Array(buf);
+      const doc = loadPDF(arrayBuf);
+
+      // Returns plain text for the first page (page numbers are zero-based)
+      const pageText = getPageText(doc, 0);
+
+      // Returns an array of objects with the bounding box for each match:
+      const searchResults = searchPageText(doc, 1, "some text");
+
+    }
+
+----
+
+5. Tests
+-------------------
+
+You can also
+`see the tests <https://github.com/ArtifexSoftware/mupdf.js/blob/master/examples/tests/src/tasks.test.ts>`_
+for these functions for more examples of how to use them.
+
+
+.. include:: ../../footer.rst
diff --git a/examples/tests/.gitignore b/examples/tests/.gitignore
@@ -22,3 +22,6 @@ dist-ssr
 *.njsproj
 *.sln
 *.sw?
+
+# Test output files
+/src/resources/output*
diff --git a/examples/tests/src/annotations.test.ts b/examples/tests/src/annotations.test.ts
@@ -4,7 +4,7 @@ import path from 'path';
 import { afterAll, beforeAll, describe, expect, it } from 'vitest';
 
 const scriptdir = path.resolve(__dirname);
-const filename = path.join(scriptdir, "resources", "test.pdf");
+const filename = path.join(scriptdir, "..", "test.pdf");
 const outputFilename = path.join(scriptdir, "resources", "output-annotations.pdf");
 
 describe('mupdfjs annotations tests', () => {

diff --git a/examples/tests/src/tasks.test.ts b/examples/tests/src/tasks.test.ts
@@ -0,0 +1,101 @@
+import {describe, expect, it} from 'vitest'
+import path from "path"
+import * as fs from "node:fs"
+import * as mupdf from "../../../dist/mupdf"
+import {drawPageAsHTML, drawPageAsPNG, drawPageAsSVG, getPageText, loadPDF, searchPageText} from "../../../dist/tasks"
+
+const scriptdir = path.resolve(__dirname)
+const filename = path.join(scriptdir, "..", "test.pdf")
+const outputDir = path.join(scriptdir, "resources")
+
+const file = fs.readFileSync(filename)
+
+describe("loadPDF", () => {
+    it("successfully loads a PDF document", () => {
+        const file = fs.readFileSync(filename)
+        let document: null | mupdf.PDFDocument = null
+
+        expect(() => {
+            document = loadPDF(file)
+        }).not.toThrow()
+
+        expect(document).not.toBeNull()
+    })
+})
+
+describe("drawPageAsPng", () => {
+    it("successfully renders a page as PNG", () => {
+        const result = drawPageAsPNG(loadPDF(file), 0, 150)
+        expect(result).toHaveLength(173738)
+        fs.writeFileSync(
+          path.join(outputDir, "output-tasks.png"),
+          Buffer.from(result)
+        )
+    })
+})
+
+describe("drawPageAsHtml", () => {
+    it("successfully renders a page as HTML", () => {
+        const result = drawPageAsHTML(loadPDF(file), 0, 0)
+        expect(result).toHaveLength(654)
+        fs.writeFileSync(
+          path.join(outputDir, "output-tasks.html"),
+          Buffer.from(result)
+        )
+    })
+})
+
+describe("drawPageAsSvg", () => {
+    it("successfully renders a page as SVG", () => {
+        const result = drawPageAsSVG(loadPDF(file), 0)
+        expect(result).toHaveLength(91467)
+        fs.writeFileSync(
+          path.join(outputDir, "output-tasks.svg"),
+          Buffer.from(result)
+        )
+    })
+})
+
+describe("getPageText", () => {
+    it("successfully extracts the text from page", () => {
+        const result = getPageText(loadPDF(file), 0)
+        expect(result).toMatchInlineSnapshot(`
+          "Welcome to the Node server test.pdf file.
+
+          Sorry there is not much to see here!
+
+          1
+
+          Page 1 footer
+
+          "
+        `)
+    })
+})
+
+describe("searchPageText", () => {
+    it("returns an array of search results as coordinate bounding boxes", () => {
+        const result = searchPageText(loadPDF(file), 0, "Welcome", 1)
+        expect(result).toMatchInlineSnapshot(`
+          [
+            [
+              [
+                30.7637996673584,
+                32.626708984375,
+                80.7696304321289,
+                32.626708984375,
+                30.7637996673584,
+                46.032958984375,
+                80.7696304321289,
+                46.032958984375,
+              ],
+            ],
+          ]
+        `)
+    })
+
+    it("returns an empty array if no matches found", () => {
+        const result = searchPageText(loadPDF(file), 0, "mupdf", 1)
+        expect(result).toMatchInlineSnapshot(`[]`)
+    })
+})
diff --git a/package-lock.json b/package-lock.json
diff --git a/src/mupdf-wasm.d.ts b/src/mupdf-wasm.d.ts
@@ -245,6 +245,8 @@ interface Libmupdf {
 	_wasm_search_stext_page(text: Pointer<"fz_stext_page">, needle: Pointer<"char">, marks: Pointer<"int">, hits: Pointer<"fz_quad">, hit_max: number): number,
 	_wasm_copy_selection(text: Pointer<"fz_stext_page">, a: Pointer<"fz_point">, b: Pointer<"fz_point">): Pointer<"char">,
 	_wasm_highlight_selection(text: Pointer<"fz_stext_page">, a: Pointer<"fz_point">, b: Pointer<"fz_point">, hits: Pointer<"fz_quad">, n: number): number,
+	_wasm_print_stext_page_as_html(page: Pointer<"fz_stext_page">, id: number): Pointer<"char">,
+	_wasm_print_stext_page_as_text(page: Pointer<"fz_stext_page">): Pointer<"char">,
 	_wasm_open_document_with_buffer(magic: Pointer<"char">, buffer: Pointer<"fz_buffer">): Pointer<"any_document">,
 	_wasm_open_document_with_stream(magic: Pointer<"char">, stream: Pointer<"fz_stream">): Pointer<"any_document">,
 	_wasm_format_link_uri(doc: Pointer<"any_document">, ch: number, pg: number, ty: number, x: number, y: number, w: number, h: number, z: number): Pointer<"char">,
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,3 +2,5 @@ dist @@
     node_modules
     .next
     build
+    .idea
+    /docs/venv/