Skip to content

Commit

Permalink
Fully functional pdf compressor
Browse files Browse the repository at this point in the history
typo fix
  • Loading branch information
MengWoods committed Mar 13, 2023
1 parent b0460d2 commit ef298f7
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*/__pycache__
*/*.pdf
*/*.txt
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# pdf-compressor
Reduce a PDF file's page count and size. This lets you try the free version of [ChatPDF](https://www.chatpdf.com/) with large files.

## Introduction

A lightweight tool that reduces a PDF file's page count and size. Processing discards the original file's formatting; only the text content is kept in the compressed result.

With the default font size of 10, the compression result looks like:
```bash
$ python main.py -i input.pdf
File size (bytes): 15114015----> 45828
Page count: 23 --------> 15
```
With a smaller font size of 5:
```bash
$ python main.py -i symmetry.pdf -f 5
File size (bytes): 15114015----> 31638
Page count: 23 --------> 4
```

## Requirements

In a Python 3.x environment, install the requirements with `pip install -r requirements.txt`.

## Usage
1. Clone the repository and put the PDF file(s) in the `./files` folder.
Run `python main.py -h` to see what each argument means.
2. One typical usage command is:
```
python main.py -i input.pdf
```
Empty file added files/.placeholder
Empty file.
16 changes: 16 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/python3
import configargparse
from utils import pdfUtils

# Command-line interface. Parsing happens at module level so that `options`
# is available to main() below.
p = configargparse.ArgParser()
p.add('-b', '--base-path', default='./files', type=str, help='Base path to the PDF files for processing')
p.add('-i', '--input-files', required=True, nargs='+', help="Input PDF files name(s), add space between two files")
# type=float makes the parser reject non-numeric values with a usage error
# and gives the option the same type whether it is defaulted or supplied.
p.add('-f', '--font-size', default=10.0, type=float, help="Font size of the output PDF")
options = p.parse_args()


def main():
    """Build a pdfCompressor from the parsed options and process every input file."""
    pdf_compressor = pdfUtils.pdfCompressor(options.base_path, options.input_files, options.font_size)
    pdf_compressor.process()


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
PyPDF2==3.0.1
fpdf==1.7.2
82 changes: 82 additions & 0 deletions utils/pdfUtils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/python3
import os
from PyPDF2 import PdfReader
import textwrap
from fpdf import FPDF

class pdfCompressor:
    """Rebuild PDF files as compact, text-only PDFs.

    For each input file the text of every page is extracted, written to an
    intermediate ``.txt`` file, stripped of newlines, and typeset again in
    Courier on A4 pages. Only text survives; the original layout is lost.

    Args:
        base_path: Directory that contains the input files; results and the
            intermediate text files are written next to them.
        pdf_file_list: File names (relative to ``base_path``) to process.
        font_size: Font size in points for the output PDF; anything accepted
            by ``float()`` (e.g. ``10``, ``5.0`` or ``"5"``).
    """

    def __init__(self, base_path, pdf_file_list, font_size):
        self.file_name_list = pdf_file_list
        self.absolute_path = os.path.abspath(base_path)
        self.font_size = font_size

    def process(self):
        """Run the extract / trim / typeset / summarize pipeline on every file."""
        for file_name in self.file_name_list:
            self.getFileInfo(file_name)
            self.ocrAndSaveTxt()
            self.txtTrim()
            self.txtToPdf()
            self.printSummary()

    def getFileInfo(self, file_name):
        """Record the input, intermediate-text and result paths for ``file_name``."""
        self.file_name = file_name
        self.file_name_without_extenstion = os.path.splitext(self.file_name)[0]
        self.path_to_file = os.path.join(self.absolute_path, self.file_name)
        self.path_to_result = os.path.join(
            self.absolute_path, self.file_name_without_extenstion + '_compressed' + '.pdf')
        self.path_to_txt = os.path.join(
            self.absolute_path, self.file_name_without_extenstion + '.txt')

    def ocrAndSaveTxt(self):
        """Write the extracted text of every page to the intermediate text file.

        Despite the name, no OCR is performed here: the text comes from
        ``page.extract_text()``.
        """
        with open(self.path_to_file, 'rb') as pdf:
            pdf_reader = PdfReader(pdf)
            # Explicit UTF-8 so the write cannot fail on platforms whose
            # default code page lacks a character found in the PDF.
            with open(self.path_to_txt, 'w', encoding='utf-8', errors='replace') as f:
                for page in pdf_reader.pages:
                    f.write(page.extract_text())

    def txtTrim(self):
        """Remove every newline from the intermediate text file, in place."""
        with open(self.path_to_txt, 'r', encoding='utf-8') as f:
            text = f.read()
        # NOTE(review): newlines are dropped without inserting a space, so the
        # last word of one line is fused with the first word of the next.
        # Kept as-is to preserve existing output; confirm this is intended.
        text = text.replace('\n', '')
        with open(self.path_to_txt, 'w', encoding='utf-8') as f:
            f.write(text)

    def _wrapWidth(self):
        """Return the number of characters per output line as an ``int``.

        Raises:
            ValueError: If the font size is not strictly positive.
        """
        a4_width_mm = 210
        pt_to_mm = 0.35
        fontsize_pt = float(self.font_size)
        if fontsize_pt <= 0:
            raise ValueError(f'font size must be positive, got {self.font_size!r}')
        # 7 pt is the per-character width used for the 10 pt reference size;
        # the (10 / fontsize_pt) factor rescales it for other sizes.
        character_width_mm = 7 * pt_to_mm
        width_text = a4_width_mm / character_width_mm * (10 / fontsize_pt)
        # textwrap needs an integer width: a float raises TypeError as soon as
        # a word longer than the line has to be broken. Clamp to 1 so very
        # large fonts do not produce an invalid width of 0.
        return max(1, int(width_text))

    def txtToPdf(self):
        """Typeset the intermediate text file into the result PDF."""
        with open(self.path_to_txt, 'r', encoding='utf-8') as f:
            text = f.read()
        # The PDF is written with a core font (Courier), so keep the text
        # within Latin-1; anything outside it becomes '?'.
        text = text.encode('latin-1', 'replace').decode('latin-1')

        pt_to_mm = 0.35
        fontsize_pt = float(self.font_size)
        fontsize_mm = fontsize_pt * pt_to_mm
        margin_bottom_mm = 5
        chars_per_line = self._wrapWidth()

        pdf = FPDF(orientation='P', unit='mm', format='A4')
        pdf.set_auto_page_break(True, margin=margin_bottom_mm)
        pdf.add_page()
        pdf.set_font(family='Courier', size=fontsize_pt)
        for line in text.split('\n'):
            wrapped = textwrap.wrap(line, chars_per_line)
            if not wrapped:
                # Blank source line: emit an empty line in the output.
                pdf.ln()
            for piece in wrapped:
                pdf.cell(0, fontsize_mm, piece, ln=1)
        pdf.output(self.path_to_result, 'F')

    def _pageCount(self, path):
        """Return the number of pages in the PDF at ``path``."""
        with open(path, 'rb') as f:
            return len(PdfReader(f).pages)

    def printSummary(self):
        """Print size and page count of the original and the result."""
        origin_num_pages = self._pageCount(self.path_to_file)
        result_num_pages = self._pageCount(self.path_to_result)
        origin_file_size = os.path.getsize(self.path_to_file)
        result_file_size = os.path.getsize(self.path_to_result)
        print(f'File size (bytes):\t{origin_file_size}----> {result_file_size}')
        print(f'Page count: \t\t{origin_num_pages} --------> {result_num_pages}')
        print('=========================================')
        print('[INFO] Result is saved to:', self.path_to_result)
        print('[INFO] If need compress further, try lower the font size by tuning argument -f.')


0 comments on commit ef298f7

Please sign in to comment.