-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
typo fix
- Loading branch information
Showing
6 changed files
with
134 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
*/__pycache__ | ||
*/*.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# pdf-compressor | ||
Compress a PDF file's page count and size. It helps you try the free version of [ChatPDF](https://www.chatpdf.com/) with large files.
|
||
## Introduction | ||
|
||
A lightweight tool to reduce a PDF file's page count and size. Processing discards the original file's formatting; only the text content is kept in the compressed result.
|
||
With the default font size of 10, the compression result looks like:
```bash | ||
$ python main.py -i input.pdf | ||
File size (bytes): 15114015----> 45828 | ||
Page count: 23 --------> 15 | ||
``` | ||
By setting the font size smaller to 5: | ||
```bash | ||
$ python main.py -i symmetry.pdf -f 5 | ||
File size (bytes): 15114015----> 31638 | ||
Page count: 23 --------> 4 | ||
``` | ||
|
||
## Requirements | ||
|
||
In a Python 3.x environment, install the requirements with `pip install -r requirements.txt`.
|
||
## Usage | ||
1. Clone the repository and put your PDF file(s) in the `./files` folder.
Run `python main.py -h` to see what each argument means.
2. One typical usage command is: | ||
``` | ||
python main.py -i input.pdf | ||
``` |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/usr/bin/python3 | ||
import configargparse | ||
from utils import pdfUtils | ||
|
||
# Command-line interface. Arguments are parsed at import time and exposed
# through the module-level ``options`` namespace that main() reads.
p = configargparse.ArgParser()
p.add('-b', '--base-path', default='./files', type=str, help='Base path to the PDF files for processing')
p.add('-i', '--input-files', required=True, nargs='+', help="Input PDF files name(s), add space between two files")
# type=float makes a non-numeric -f value fail right here with a usage error
# instead of surfacing later as a ValueError inside the compressor. The
# non-string default (10) is not passed through ``type`` and stays as is.
p.add('-f', '--font-size', default=10, type=float, help="Font size of the output PDF")
options = p.parse_args()
|
||
def main():
    """Build a compressor from the parsed command-line options and run it."""
    compressor = pdfUtils.pdfCompressor(
        options.base_path,
        options.input_files,
        options.font_size,
    )
    compressor.process()


# Only run the tool when this file is executed as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
PyPDF2==3.0.1 | ||
fpdf==1.7.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
#!/usr/bin/python3 | ||
import os | ||
from PyPDF2 import PdfReader | ||
import textwrap | ||
from fpdf import FPDF | ||
|
||
class pdfCompressor:
    """Rewrite PDF files as compact, text-only A4 documents.

    For each input file the text is extracted with ``PdfReader``, stored in
    an intermediate ``<name>.txt`` file next to the input, stripped of
    newlines and typeset again in Courier with ``FPDF``. The result is saved
    next to the input as ``<name>_compressed.pdf``.

    Args:
        base_path: Directory containing the input files; made absolute.
        pdf_file_list: File names (relative to ``base_path``) to process.
        font_size: Font size in points for the output; anything accepted by
            ``float()``.
    """

    # Layout constants used when typesetting the output document.
    _A4_WIDTH_MM = 210
    _PT_TO_MM = 0.35  # approximate point -> millimetre conversion
    _MARGIN_BOTTOM_MM = 5
    # Horizontal space budgeted per character at the 10 pt reference size.
    _CHAR_WIDTH_MM = 7 * _PT_TO_MM
    # The intermediate text file is always written and read with this
    # encoding so that non-ASCII text survives the round trip unchanged.
    _TXT_ENCODING = 'utf-8'

    def __init__(self, base_path, pdf_file_list, font_size):
        self.file_name_list = pdf_file_list
        self.absolute_path = os.path.abspath(base_path)
        self.font_size = font_size

    def process(self):
        """Run the full extract -> trim -> typeset -> report pipeline per file."""
        for file_name in self.file_name_list:
            self.getFileInfo(file_name)
            self.ocrAndSaveTxt()
            self.txtTrim()
            self.txtToPdf()
            self.printSummary()

    def getFileInfo(self, file_name):
        """Derive the input, intermediate and output paths for ``file_name``."""
        self.file_name = file_name
        self.file_name_without_extenstion = os.path.splitext(self.file_name)[0]
        self.path_to_file = os.path.join(self.absolute_path, self.file_name)
        self.path_to_result = os.path.join(
            self.absolute_path,
            self.file_name_without_extenstion + '_compressed' + '.pdf',
        )
        self.path_to_txt = os.path.join(
            self.absolute_path,
            self.file_name_without_extenstion + '.txt',
        )

    def ocrAndSaveTxt(self):
        """Extract the text of every page and write it to the ``.txt`` file.

        Despite the name, no OCR is performed here: the text comes from
        ``page.extract_text()``.
        """
        # Both files are closed by their ``with`` blocks; no explicit close().
        with open(self.path_to_file, 'rb') as pdf:
            pdf_reader = PdfReader(pdf)
            with open(self.path_to_txt, 'w', encoding=self._TXT_ENCODING) as f:
                for page in pdf_reader.pages:
                    f.write(page.extract_text())

    def txtTrim(self):
        """Remove every newline from the intermediate text file in place.

        NOTE(review): lines are joined with no separator, so a word ending
        one line is glued to the word starting the next - confirm that this
        is intended.
        """
        with open(self.path_to_txt, 'r', encoding=self._TXT_ENCODING) as f:
            text = f.read()
        with open(self.path_to_txt, 'w', encoding=self._TXT_ENCODING) as f:
            f.write(text.replace('\n', ''))

    @classmethod
    def _charsPerLine(cls, fontsize_pt):
        """Return how many characters fit on one A4 line, as an ``int`` >= 1.

        Raises:
            ValueError: if ``fontsize_pt`` is not positive.
        """
        if fontsize_pt <= 0:
            raise ValueError(f'font size must be positive, got {fontsize_pt!r}')
        width = cls._A4_WIDTH_MM / cls._CHAR_WIDTH_MM * (10 / fontsize_pt)
        # textwrap slices long words with this value, so it must be an int;
        # a float width raises TypeError as soon as one chunk exceeds a line.
        return max(1, int(width))

    @staticmethod
    def _toLatin1(text):
        """Replace characters that Latin-1 cannot represent with ``?``.

        NOTE(review): assumes the FPDF core Courier font only accepts
        Latin-1 text (the original read the file as latin-1) - confirm
        against the FPDF documentation.
        """
        return text.encode('latin-1', 'replace').decode('latin-1')

    def txtToPdf(self):
        """Typeset the intermediate text file into the compressed PDF.

        Raises:
            ValueError: if ``font_size`` is not a positive number.
        """
        fontsize_pt = float(self.font_size)
        # Validate before touching any file so a bad size fails fast.
        width_text = self._charsPerLine(fontsize_pt)
        fontsize_mm = fontsize_pt * self._PT_TO_MM

        with open(self.path_to_txt, 'r', encoding=self._TXT_ENCODING) as f:
            text = self._toLatin1(f.read())

        pdf = FPDF(orientation='P', unit='mm', format='A4')
        pdf.set_auto_page_break(True, margin=self._MARGIN_BOTTOM_MM)
        pdf.add_page()
        pdf.set_font(family='Courier', size=fontsize_pt)
        for line in text.split('\n'):
            lines = textwrap.wrap(line, width_text)
            if not lines:
                # An empty source line becomes a plain line break.
                pdf.ln()
            for wrap in lines:
                pdf.cell(0, fontsize_mm, wrap, ln=1)
        pdf.output(self.path_to_result, 'F')

    @staticmethod
    def _countPages(path):
        """Return the number of pages in the PDF stored at ``path``."""
        with open(path, 'rb') as f:
            return len(PdfReader(f).pages)

    def printSummary(self):
        """Print size and page count of the original versus the result."""
        origin_num_pages = self._countPages(self.path_to_file)
        result_num_pages = self._countPages(self.path_to_result)
        origin_file_size = os.path.getsize(self.path_to_file)
        result_file_size = os.path.getsize(self.path_to_result)
        print(f'File size (bytes):\t{origin_file_size}----> {result_file_size}')
        print(f'Page count: \t\t{origin_num_pages} --------> {result_num_pages}')
        print('=========================================')
        print('[INFO] Result is saved to:', self.path_to_result)
        print('[INFO] If need compress further, try lower the font size by tuning argument -f.')
|
||
|