-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathauxiliar_scripts.py
104 lines (82 loc) · 3.08 KB
/
auxiliar_scripts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pdf2image
from PyPDF2 import PdfFileWriter, PdfFileReader
from PIL import Image
import pytesseract
import time
import os
import inspect
IMG_FOLDER = 'C:\\Users\\jacob\\Desktop\\Escritorio\\Proyectos python\\PDF-OCR\\temp_img\\'
#Tesseract para poder utilizar Pytesseract https://github.com/UB-Mannheim/tesseract/wiki
#En mi caso Tesseract está instalado en la ruta:
pytesseract.pytesseract.tesseract_cmd = r'C:\\Users\\jacob\\AppData\\Local\\Tesseract-OCR\\tesseract.exe'
#En caso de estar instalado en otra ruta añadirla en lugar de la arriba escrita.
#POPPLER LATEST VERSION https://github.com/oschwartz10612/poppler-windows/releases/
PDF_PATH = 'TEST.pdf'
DPI = 400
FIRST_PAGE = None
LAST_PAGE = None
FORMAT = 'png'
THREAD_COUNT = 4
USERPWD = None
USE_CROPBOX = False
STRICT = False
def pdf_to_pil(file, folder, output_file):
start_time = time.time()
pil_images = pdf2image.convert_from_path(file, dpi = DPI, output_folder = folder, output_file = output_file,first_page = FIRST_PAGE,
last_page = LAST_PAGE, fmt = FORMAT, thread_count = THREAD_COUNT, userpw = USERPWD,
use_cropbox = USE_CROPBOX, strict = STRICT, grayscale = True)
print('Time taken: ' + str(time.time() - start_time))
def delete_pages(array, file):
#I think this could be done waaay better with regex. Think about it.
clean_array = []
pages = PdfFileReader(file, 'rb').getNumPages()
for num in array:
if 'x' in num and '+' in num:
for x in list(range(0, pages)):
polin = [int(num.split('x')[0]), int(num.split('x')[1].split('+')[1])]
if x % int(polin[0]) == 0:
clean_array.append((x + 1) + polin[1])
if 'x' in num and '+' not in num:
for x in list(range(1, pages + 1)):
if x % int(num.split('x')[0]) == 0:
clean_array.append(x)
if '-' in num:
rang = num.split('-')
if rang[0] != '' and rang[1] != '':
for i in range(int(rang[1]) - int(rang[0])):
clean_array.append(int(rang[0]) + i)
clean_array.append(int(rang[1]))
elif rang[1] == '' and rang[0] != '':
list(range(0, pages))
clean_array = list(range(1, pages + 1))[int(rang[0]):]
else:
clean_array = list(range(0, pages))[:int(rang[1])]
clean_array.append(int(rang[1]))
elif str.isdigit(num) == True:
clean_array.append(int(num))
return clean_array
def create_pdf(file, array):
pages_to_keep = []
pdf = PdfFileReader(file, 'rb')
output = PdfFileWriter()
for i in list(range(1,pdf.getNumPages() + 1)):
if i not in array:
pages_to_keep.append(i)
page = pdf.getPage(i - 1)
output.addPage(page)
with open('temp.pdf', 'wb') as temp:
output.write(temp)
def save_images(pil_images):
index = 1
for image in pil_images:
image.save(IMG_FOLDER + 'page_' + str(index) + '.png')
index += 1
def get_image_w_h(image_path):
im = Image.open(image_path)
width, height = im.size
return width, height
def extract_from_crop(image, *args):
for arg in args:
cropped_img = image.crop(arg)
cropped_img.show()
img_text = pytesseract.image_to_string(cropped_img, lang = 'spa')