Fully functional pdf compressor

typo fix
MengWoods · Mar 13, 2023 · ef298f7 · ef298f7
1 parent b0460d2
commit ef298f7
Show file tree

Hide file tree

Showing 6 changed files with 134 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+*/__pycache__
+*/*.pdf
+*/*.txt
diff --git a/README.md b/README.md
@@ -0,0 +1,31 @@
+# pdf-compressor
+Reduce a PDF file's page count and size. This helps you try the free version of [ChatPDF](https://www.chatpdf.com/) with large files.
+
+## Introduction
+
+A lightweight tool that reduces a PDF file's page count and size. Processing discards the original file's formatting; only the text content is kept in the compressed result.
+
+With the default font size of 10, the compression result looks like:
+```bash
+$ python main.py -i input.pdf
+File size (bytes):      15114015----> 45828
+Page count:             23 --------> 15
+```
+With the font size reduced to 5:
+```bash
+$ python main.py -i symmetry.pdf -f 5
+File size (bytes):      15114015----> 31638
+Page count:             23 --------> 4
+```
+
+## Requirements
+
+In a Python 3.x environment, install the requirements with `pip install -r requirements.txt`.
+
+## Usage
+1. Clone the repository and put the PDF file(s) in the `./files` folder.
+Run `python main.py -h` to see what each argument means.
+2. One typical usage command is:
+    ```
+    python main.py -i input.pdf
+    ```
diff --git a/files/.placeholder b/files/.placeholder
diff --git a/main.py b/main.py
@@ -0,0 +1,16 @@
+#!/usr/bin/python3
+import configargparse
+from utils import pdfUtils
+
+p = configargparse.ArgParser()
+p.add('-b', '--base-path', default='./files', type=str, help='Base path to the PDF files for processing')
+p.add('-i', '--input-files', required=True, nargs='+', help="Input PDF files name(s), add space between two files")
+p.add('-f', '--font-size', default=10, help="Font size of the output PDF")
+options = p.parse_args()
+
+def main():
+    pdf_compressor = pdfUtils.pdfCompressor(options.base_path, options.input_files, options.font_size)
+    pdf_compressor.process()
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+PyPDF2==3.0.1
+fpdf==1.7.2
diff --git a/utils/pdfUtils.py b/utils/pdfUtils.py
@@ -0,0 +1,82 @@
+#!/usr/bin/python3
+import os
+from PyPDF2 import PdfReader
+import textwrap
+from fpdf import FPDF
+
+class pdfCompressor:
+    def __init__(self, base_path, pdf_file_list, font_size):
+        self.file_name_list = pdf_file_list
+        self.absolute_path = os.path.abspath(base_path)
+        self.font_size = font_size
+
+    def process(self):
+        for i in range(len(self.file_name_list)):
+            self.getFileInfo(self.file_name_list[i])
+            self.ocrAndSaveTxt()
+            self.txtTrim()
+            self.txtToPdf()
+            self.printSummary()
+
+    def getFileInfo(self, file_name):
+        self.file_name = file_name
+        self.file_name_without_extenstion = os.path.splitext(self.file_name)[0]
+        self.path_to_file = self.absolute_path + '/' + self.file_name
+        self.path_to_result = self.absolute_path + '/' + self.file_name_without_extenstion + '_compressed' + '.pdf'
+        self.path_to_txt = self.absolute_path + '/' + self.file_name_without_extenstion + '.txt'
+
+    def ocrAndSaveTxt(self):
+        with open(self.path_to_file, 'rb') as pdf:
+            pdf_reader = PdfReader(pdf)
+            with open(self.path_to_txt, 'w') as f:
+                for j in range (len(pdf_reader.pages)):
+                    page = pdf_reader.pages[j]
+                    f.write(page.extract_text())
+            pdf.close()
+
+    def txtTrim(self):
+        with open(self.path_to_txt, 'r') as f:
+            text = f.read()
+        text = text.replace('\n', '')
+        with open(self.path_to_txt, 'w') as f:
+            f.write(text)
+
+    def txtToPdf(self):
+        with open(self.path_to_txt, 'r', encoding='latin-1') as f:
+            text = f.read()
+        a4_width_mm = 210
+        pt_to_mm = 0.35
+        fontsize_pt = float(self.font_size)
+        fontsize_mm = fontsize_pt * pt_to_mm
+        margin_bottom_mm = 5
+        character_width_mm = 7 * pt_to_mm
+        width_text = a4_width_mm / character_width_mm * (10 / fontsize_pt)
+        pdf = FPDF(orientation='P', unit='mm', format='A4')
+        pdf.set_auto_page_break(True, margin=margin_bottom_mm)
+        pdf.add_page()
+        pdf.set_font(family='Courier', size=fontsize_pt)
+        splitted = text.split('\n')
+        for line in splitted:
+            lines = textwrap.wrap(line, width_text)
+            if len(lines) == 0:
+                pdf.ln()
+            for wrap in lines:
+                pdf.cell(0, fontsize_mm, wrap, ln=1)
+        pdf.output(self.path_to_result, 'F')
+
+    def printSummary(self):
+        with open(self.path_to_file, 'rb') as f:
+            pdf = PdfReader(f)
+            origin_num_pages = len(pdf.pages)
+        with open(self.path_to_result, 'rb') as f:
+            pdf = PdfReader(f)
+            result_num_pages = len(pdf.pages)
+        origin_file_size = os.path.getsize(self.path_to_file)
+        result_file_size = os.path.getsize(self.path_to_result)
+        print(f'File size (bytes):\t{origin_file_size}----> {result_file_size}')
+        print(f'Page count: \t\t{origin_num_pages} --------> {result_num_pages}')
+        print('=========================================')
+        print('[INFO] Result is saved to:', self.path_to_result)
+        print('[INFO] If need compress further, try lower the font size by tuning argument -f.')
+
+