From ef298f7bfbdd5626ca0984f207e266e679e50f96 Mon Sep 17 00:00:00 2001 From: woods Date: Mon, 13 Mar 2023 23:19:47 +0200 Subject: [PATCH] Fully functional pdf compressor typo fix --- .gitignore | 3 ++ README.md | 31 ++++++++++++++++++ files/.placeholder | 0 main.py | 16 +++++++++ requirements.txt | 2 ++ utils/pdfUtils.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 134 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 files/.placeholder create mode 100644 main.py create mode 100644 requirements.txt create mode 100644 utils/pdfUtils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5df802c --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*/__pycache__ +*/*.pdf +*/*.txt \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..ad0b999 --- /dev/null +++ b/README.md @@ -0,0 +1,31 @@ +# pdf-compressor +Compress PDF file's pages and size. It helps you try the free-version [ChatPDF](https://www.chatpdf.com/) with large-size file. + +## Introduction + +A light-weight tool to compress PDF file's pages and size. The processing loses the original file's format, only the text information is saved in the compressed result. + +With default font size 10, the compressing result looks like: +```bash +$ python main.py -i input.pdf +File size (bytes): 15114015----> 45828 +Page count: 23 --------> 15 +``` +By setting the font size smaller to 5: +```bash +$ python main.py -i symmetry.pdf -f 5 +File size (bytes): 15114015----> 31638 +Page count: 23 --------> 4 +``` + +## Requirements + +In Python 3.X environment, install requirements by `pip install -r requirements.txt` + +## Usage +1. Clone the repository, put PDF file(s) to `./files` folder. +`python main.py -h` to check arguments' meanings. +2. 
One typical usage command is:
+   ```
+   python main.py -i input.pdf
+   ```
\ No newline at end of file
diff --git a/files/.placeholder b/files/.placeholder
new file mode 100644
index 0000000..e69de29
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..aa517c5
--- /dev/null
+++ b/main.py
@@ -0,0 +1,21 @@
+#!/usr/bin/python3
+"""Command-line entry point for the PDF compressor."""
+import configargparse
+from utils import pdfUtils
+
+p = configargparse.ArgParser()
+p.add('-b', '--base-path', default='./files', type=str, help='Base path to the PDF files for processing')
+p.add('-i', '--input-files', required=True, nargs='+', help="Input PDF files name(s), add space between two files")
+# A numeric type makes the parser reject a non-numeric size with a usage error.
+p.add('-f', '--font-size', default=10, type=float, help="Font size of the output PDF")
+options = p.parse_args()
+
+
+def main():
+    """Compress every requested PDF and print a summary for each."""
+    pdf_compressor = pdfUtils.pdfCompressor(options.base_path, options.input_files, options.font_size)
+    pdf_compressor.process()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c6f2bfa
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+PyPDF2==3.0.1
+fpdf==1.7.2
+ConfigArgParse
\ No newline at end of file
diff --git a/utils/pdfUtils.py b/utils/pdfUtils.py
new file mode 100644
index 0000000..72fa830
--- /dev/null
+++ b/utils/pdfUtils.py
@@ -0,0 +1,113 @@
+#!/usr/bin/python3
+"""Utilities that rebuild a PDF as a compact, text-only PDF."""
+import os
+from PyPDF2 import PdfReader
+import textwrap
+from fpdf import FPDF
+
+
+class pdfCompressor:
+    """Shrink PDFs by keeping only their text and re-rendering it in Courier.
+
+    For every input ``name.pdf`` under the base path, ``process`` writes
+    ``name.txt`` (the extracted text) and ``name_compressed.pdf`` (the result)
+    next to it.
+    """
+
+    def __init__(self, base_path, pdf_file_list, font_size):
+        """Store the run configuration.
+
+        Args:
+            base_path: Directory that holds the input PDFs; outputs go there too.
+            pdf_file_list: File names, relative to ``base_path``, to compress.
+            font_size: Output font size in points (number or numeric string).
+        """
+        self.file_name_list = pdf_file_list
+        self.absolute_path = os.path.abspath(base_path)
+        self.font_size = font_size
+
+    def process(self):
+        """Run the extract/trim/render/report pipeline on every input file."""
+        for file_name in self.file_name_list:
+            self.getFileInfo(file_name)
+            self.ocrAndSaveTxt()
+            self.txtTrim()
+            self.txtToPdf()
+            self.printSummary()
+
+    def getFileInfo(self, file_name):
+        """Derive the input, text and result paths for ``file_name``."""
+        self.file_name = file_name
+        self.file_name_without_extenstion = os.path.splitext(self.file_name)[0]
+        self.path_to_file = os.path.join(self.absolute_path, self.file_name)
+        self.path_to_result = os.path.join(
+            self.absolute_path, self.file_name_without_extenstion + '_compressed.pdf')
+        self.path_to_txt = os.path.join(
+            self.absolute_path, self.file_name_without_extenstion + '.txt')
+
+    def ocrAndSaveTxt(self):
+        """Write the text layer of the input PDF to the intermediate .txt file.
+
+        Despite the name, no OCR happens: only text PyPDF2 can extract is kept.
+        """
+        # UTF-8 is explicit so extracted characters never depend on the
+        # platform's default encoding (which can reject them on write).
+        with open(self.path_to_file, 'rb') as pdf, \
+                open(self.path_to_txt, 'w', encoding='utf-8') as f:
+            for page in PdfReader(pdf).pages:
+                f.write(page.extract_text() or '')
+
+    def txtTrim(self):
+        """Remove every newline from the intermediate .txt file, in place."""
+        with open(self.path_to_txt, 'r', encoding='utf-8') as f:
+            text = f.read()
+        text = text.replace('\n', '')
+        with open(self.path_to_txt, 'w', encoding='utf-8') as f:
+            f.write(text)
+
+    def txtToPdf(self):
+        """Render the intermediate .txt file as an A4 Courier PDF.
+
+        Raises:
+            ValueError: If the configured font size is not a positive number.
+        """
+        fontsize_pt = float(self.font_size)
+        if fontsize_pt <= 0:
+            raise ValueError(f'font size must be positive, got {self.font_size!r}')
+        with open(self.path_to_txt, 'r', encoding='utf-8') as f:
+            text = f.read()
+        # fpdf's built-in Courier only covers Latin-1; substitute anything else
+        # with '?' instead of failing when the document is written.
+        text = text.encode('latin-1', 'replace').decode('latin-1')
+        a4_width_mm = 210
+        pt_to_mm = 0.35
+        fontsize_mm = fontsize_pt * pt_to_mm
+        margin_bottom_mm = 5
+        character_width_mm = 7 * pt_to_mm
+        # textwrap uses the width as a slice index when it breaks long words,
+        # so it must be an int of at least 1.
+        width_text = max(1, int(a4_width_mm / character_width_mm * (10 / fontsize_pt)))
+        pdf = FPDF(orientation='P', unit='mm', format='A4')
+        pdf.set_auto_page_break(True, margin=margin_bottom_mm)
+        pdf.add_page()
+        pdf.set_font(family='Courier', size=fontsize_pt)
+        for line in text.split('\n'):
+            lines = textwrap.wrap(line, width_text)
+            if not lines:
+                pdf.ln()
+            for wrap in lines:
+                pdf.cell(0, fontsize_mm, wrap, ln=1)
+        pdf.output(self.path_to_result, 'F')
+
+    def printSummary(self):
+        """Print size and page count before and after, plus the result path."""
+        with open(self.path_to_file, 'rb') as f:
+            origin_num_pages = len(PdfReader(f).pages)
+        with open(self.path_to_result, 'rb') as f:
+            result_num_pages = len(PdfReader(f).pages)
+        origin_file_size = os.path.getsize(self.path_to_file)
+        result_file_size = os.path.getsize(self.path_to_result)
+        print(f'File size (bytes):\t{origin_file_size}----> {result_file_size}')
+        print(f'Page count: \t\t{origin_num_pages} --------> {result_num_pages}')
+        print('=========================================')
+        print('[INFO] Result is saved to:', self.path_to_result)
+        print('[INFO] If need compress further, try lower the font size by tuning argument -f.')