Extract images for adding metadata and put them back into the pdf #1534

jappoman · 2023-10-31T17:00:22Z

jappoman
Oct 31, 2023

Hi, I'm extracting images from a specific page of a pdf file to apply additional exif metadata. Next, I want to put the image buffers back inside the pdf... Except when I extract them again, the exif metadata is completely gone. I'm sure I applied the metadata correctly because if I try to save the image, the metadata is there. The problem therefore arises from re-insertion into the PDF.

This is my code for putting back the image into the pdf:

/**
 * Overwrites the raw contents of image streams in a PDF document.
 *
 * Each entry of `newImages` names its target through `ref`, which may be
 * either the position of the object in
 * `pdfDoc.context.enumerateIndirectObjects()` or the reference object that
 * enumeration pairs with the stream. Positional refs are only meaningful while
 * the set of indirect objects is the same as when the position was recorded.
 *
 * Only objects that are `PDFRawStream` instances whose `Subtype` entry is
 * `Image` are touched; their dictionaries are left as they are.
 *
 * @param {object} pdfDoc - Document exposing `context.enumerateIndirectObjects()`.
 * @param {number|string} currentPage - Page label, used only for log/timer output.
 * @param {Array<{ref: *, data: Uint8Array}>} newImages - Replacement payloads.
 * @returns {Promise<object>} The same `pdfDoc`, mutated in place.
 */
const replaceImagesInPdf = async (pdfDoc, currentPage, newImages) => {
  console.log(`Replacing images in page ${currentPage}...`);
  const timerLabel = "replaceImagesInPdfForPage" + currentPage;
  console.time(timerLabel);

  const subtypeKey = PDFName.of("Subtype");
  const imageSubtype = PDFName.of("Image");

  // Replacing stream contents does not add or remove objects, so one
  // enumeration can serve every image instead of re-enumerating per image.
  const indirectObjects = pdfDoc.context.enumerateIndirectObjects();

  for (const newImage of newImages) {
    const imageData = newImage.data;
    const imageRef = newImage.ref;
    let replaced = false;

    // Synchronous callback: nothing here awaits, and an async callback would
    // turn any thrown error into an unhandled promise rejection.
    indirectObjects.forEach(([pdfRef, pdfObject], index) => {
      if (index !== imageRef && pdfRef !== imageRef) return;
      if (!(pdfObject instanceof PDFRawStream)) return;
      if (pdfObject.dict.get(subtypeKey) !== imageSubtype) return;

      pdfObject.contents = imageData;
      replaced = true;
    });

    if (!replaced) {
      // Surface mismatched refs instead of silently leaving the PDF untouched.
      console.warn(
        `No image stream matched ref ${imageRef} on page ${currentPage}; nothing was replaced.`
      );
    }
  }

  console.log("Replaced images into page " + currentPage + ".");
  console.timeEnd(timerLabel);

  return pdfDoc;
};

This is how I extract the images from the pdf:

/**
 * Collects the image streams of a PDF document together with a few of their
 * dictionary entries.
 *
 * An indirect object is reported when it is a `PDFRawStream`, its `Subtype`
 * is `Image`, and its `Filter` is one of `DCTDecode`, `FlateDecode` or
 * `JPXDecode`. Images with any other filter are logged and skipped.
 *
 * The `type` label is derived from the filter name alone; the stream bytes are
 * returned as stored in the PDF and are not decoded or re-wrapped.
 *
 * @param {object} pdfDoc - Document exposing `context.enumerateIndirectObjects()`.
 * @returns {Promise<Array<object>>} One record per supported image. `ref` is
 *   the position of the object in the enumeration (not a PDF reference
 *   object). `width`, `height` and `bitsPerComponent` are `undefined` when the
 *   dictionary has no such entry, in which case `pxsize` is `NaN`.
 */
const indexPDFImages = async (pdfDoc) => {
  const imageSubtype = PDFName.of("Image");
  const typeByFilter = new Map([
    [PDFName.of("DCTDecode"), "jpg"],
    [PDFName.of("FlateDecode"), "png"],
    [PDFName.of("JPXDecode"), "jpeg2000"], // JPX is typically used for JPEG2000 in PDFs
    // ... Add more filters for other image formats here.
  ]);

  // Entries such as BitsPerComponent may be absent; reading `.numberValue`
  // off `undefined` would throw and lose the image.
  const numberOf = (entry) => (entry ? entry.numberValue : undefined);

  const imagesInDoc = [];
  const indirectObjects = pdfDoc.context.enumerateIndirectObjects();

  // Plain synchronous loop: an async forEach callback would convert any error
  // into an unhandled rejection that the caller of this function never sees.
  for (const [index, [pdfRef, pdfObject]] of indirectObjects.entries()) {
    if (!(pdfObject instanceof PDFRawStream)) continue;

    const { dict } = pdfObject;
    if (dict.get(PDFName.of("Subtype")) !== imageSubtype) continue;

    const filter = dict.get(PDFName.of("Filter"));
    const imageType = typeByFilter.get(filter);
    if (imageType === undefined) {
      console.log(
        `Unsupported image format detected for ref: ${pdfRef}. Filter used: ${filter}`
      );
      continue; // Not one of the supported filters
    }

    // Extract other image information
    const name = dict.get(PDFName.of("Name"));
    const width = numberOf(dict.get(PDFName.of("Width")));
    const height = numberOf(dict.get(PDFName.of("Height")));

    imagesInDoc.push({
      ref: index,
      smaskRef: dict.get(PDFName.of("SMask")),
      colorSpace: dict.get(PDFName.of("ColorSpace")),
      // NOTE(review): relies on the name object exposing `key` — confirm
      // against the pdf-lib version in use.
      name: name ? name.key : `Object${index + 1}`,
      width,
      height,
      pxsize: width * height,
      bitsPerComponent: numberOf(dict.get(PDFName.of("BitsPerComponent"))),
      data: pdfObject.contents,
      type: imageType,
    });
  }

  return imagesInDoc;
};

In between, there is the function that puts the new metadata into the image buffer:

/**
 * Writes a watermark string into the EXIF `ImageDescription` tag ("0th" IFD)
 * of an image and returns the re-encoded bytes.
 *
 * The image is handed to piexifjs as a `data:image/jpeg;base64,` URI, so the
 * input is presumably expected to be JPEG data — other formats are not
 * converted here.
 *
 * @param {{image?: Uint8Array, data?: Uint8Array, ref: *}} imageBufferObj -
 *   Image bytes under `image` (preferred) or `data`, plus the reference that
 *   is passed through unchanged to the result.
 * @param {number|string} currentPage - Page label, used only for log/timer output.
 * @param {string} watermark - Text stored in the ImageDescription tag.
 * @returns {Promise<{watermarkType: string, ref: *, data: Buffer}>} The
 *   modified image.
 * @throws {TypeError} When neither `image` nor `data` is present.
 */
async function generateImageMetadataWatermark(
  imageBufferObj,
  currentPage,
  watermark,
) {
  console.log(`Generating ImageMetadataWatermark for page ${currentPage}...`);
  const timerLabel = "generateImageMetadataWatermarkForPage" + currentPage;
  console.time(timerLabel);
  try {
    // Extracting image data and reference
    const imageRef = imageBufferObj.ref;
    const rawImage =
      imageBufferObj.image !== undefined
        ? imageBufferObj.image
        : imageBufferObj.data;
    if (rawImage === undefined || rawImage === null) {
      throw new TypeError(
        "imageBufferObj must carry the image bytes in `image` or `data`"
      );
    }

    // Only Buffer#toString understands an encoding argument; calling
    // toString("base64") on a plain Uint8Array yields "1,2,3,...".
    const imageBuffer = Buffer.isBuffer(rawImage)
      ? rawImage
      : Buffer.from(rawImage);

    // Convert the full image buffer to a base64 data URI
    const base64Image =
      "data:image/jpeg;base64," + imageBuffer.toString("base64");
    const exifObj = piexifjs.load(base64Image);

    // Add watermark string in the EXIF data. Using "0th" ImageDescription.
    exifObj["0th"][piexifjs.ImageIFD.ImageDescription] = watermark;
    // Create new EXIF binary string
    const exifBytes = piexifjs.dump(exifObj);
    // Insert the new EXIF data into the image
    const newImageBase64 = piexifjs.insert(exifBytes, base64Image);
    // Convert base64 image to buffer (strip the "data:...;base64," prefix)
    const newImageBuffer = Buffer.from(newImageBase64.split(",")[1], "base64");

    console.log(`ImageMetadataWatermark for page ${currentPage} generated.`);
    return {
      watermarkType: "imageMetadata",
      ref: imageRef,
      data: newImageBuffer,
    };
  } finally {
    // End the timer on failure too, so the label is not left running.
    console.timeEnd(timerLabel);
  }
}

Any solution to this?

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Extract images for adding metadata and put them back into the pdf #1534

{{title}}

Replies: 0 comments

Select a reply

Extract images for adding metadata and put them back into the pdf #1534

jappoman Oct 31, 2023

Replies: 0 comments

jappoman
Oct 31, 2023