-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathTextSegmentation.py
108 lines (75 loc) · 2.73 KB
/
TextSegmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python
# coding: utf-8
# ## Import Necessary libraries
# In[1]:
import textseg as ts
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_path
import cv2
import json
import pandas as pd
import numpy as np
import glob
import os
import pytesseract
import math
# Output location for the page images and segmentation results written below.
# Keep the trailing "/": file names are appended to this value by plain string
# concatenation, not by a path-joining helper.
path_to_write = "TesseractDemo/Output/"
# ## The function below converts each page of a PDF into an image file
# In[2]:
def convert_pdf_to_image(filepath, img_path_to_save):
    """Render every page of a PDF to its own PNG file.

    Each page is written to
    ``<img_path_to_save><pdf name>Page_<n>.png`` where ``n`` starts at 1 and
    ``<pdf name>`` is the last "/"-separated component of ``filepath`` with
    ".pdf" removed.

    Args:
        filepath: Path of the PDF to convert ("/" separated).
        img_path_to_save: Prefix prepended verbatim to every image name;
            include a trailing "/" when it is a directory.

    Returns:
        ``{"status": 200, "response": ..., "fileName": ...}`` on success, or
        ``{"status": 400, "response": <error text>}`` if anything fails.
        The function never raises.
    """
    try:
        fileName = filepath.split("/")[-1].replace(".pdf", "")
        # Second positional argument of convert_from_path is the render DPI.
        pages = convert_from_path(filepath, 350)
        for page_number, page in enumerate(pages, start=1):
            image_name = (
                img_path_to_save + fileName + "Page_" + str(page_number) + ".png"
            )
            # Encode as PNG so the file content matches its ".png" extension;
            # previously JPEG data was written into files named ".png".
            page.save(image_name, "PNG")
        return {
            "status": 200,
            "response": "PDF Converted to image sucessfully",
            "fileName": fileName,
        }
    except Exception as e:
        # Deliberate catch-all: callers receive a status dict instead of an
        # exception, whatever went wrong (bad path, converter failure, ...).
        return {"status": 400, "response": str(e)}
# ## get the list of documents you want to pass as an input
# In[3]:
# Collect the input PDFs and keep at most the first six.
# The matches are sorted because glob returns them in arbitrary
# (file-system dependent) order; without sorting, which six files are
# selected could change from run to run.
documents = sorted(glob.glob("TesseractDemo/*.pdf"))[:6]
# ### The function below is used to get the text present in an image
# In[4]:
def text_from_tesseract(output_img):
    """Return the text that pytesseract recognises in ``output_img`` as a ``str``."""
    recognised = pytesseract.image_to_string(output_img)
    return str(recognised)
# ### This is the core loop: it processes each PDF page by page and stores the resulting output using the EAST text detection model
# In[5]:
# Result accumulators: each list receives one entry per processed PDF.
data = pd.DataFrame()  # NOTE(review): not used in the visible code.
final_name_list = []
final_text_opencv = []
final_text_tessaract = []

for pdf_path in documents:
    fname = os.path.basename(pdf_path)
    # Everything before the first "." of the file name is used as the stem
    # for all files generated from this PDF.
    base_name = fname.split('.')[0]

    # Open the PDF only long enough to report its page count, so the file
    # handle is closed again instead of leaking once per document.
    with open(pdf_path, 'rb') as pdf_file:
        pdf = PdfFileReader(pdf_file)
        print(pdf.getNumPages())

    # Render every page to a temporary JPEG that is read back below.
    images = convert_from_path(pdf_path)
    resumes_img = []
    for page_index, page_image in enumerate(images):
        page_path = path_to_write + base_name + '_' + str(page_index) + '.jpg'
        page_image.save(page_path, 'JPEG')
        resumes_img.append(page_path)

    # Bound outside the page loop so it exists even for a PDF with no pages.
    name_list = base_name + '_' + '.jpg'

    text_opencv = []
    text_tessaract = []
    for page_path in resumes_img:
        frame = cv2.imread(page_path)
        # The JPEG is only an intermediate; drop it once it is in memory.
        os.remove(page_path)
        # basename() instead of a fixed split("/")[2] index, so the lookup
        # does not depend on how many directories path_to_write contains.
        img = os.path.basename(page_path)
        img_stem = img.split('.')[0]

        output_img, label, dilate, c_dict, df1, split_img = ts.get_text_seg(frame, img)

        # NOTE(review): the names below can collide — segment 0 of page 1 is
        # written to "<stem>_10.png", which is also the output name of
        # page 10. Left unchanged in case the names are relied on elsewhere.
        cv2.imwrite(path_to_write + img_stem + ".png", output_img)
        for segment_index, segment in enumerate(split_img):
            cv2.imwrite(
                path_to_write + img_stem + str(segment_index) + ".png", segment
            )

        text_opencv.append(c_dict)
        # Append the page text as one item; "+=" with a str would extend the
        # list one character at a time.
        text_tessaract.append(text_from_tesseract(output_img))

    # Concatenate the OCR text of all pages (empty string when there are none).
    tesseract_str = ''.join(text_tessaract)

    final_name_list.append(name_list)
    final_text_opencv.append(text_opencv)
    final_text_tessaract.append(tesseract_str)
# ### Look at the first index in the list, i.e. the results for the first processed document
# In[6]:
# A bare expression is only displayed inside a notebook cell; print it so the
# result is also visible when this file runs as a script, and skip it when no
# document was processed (indexing an empty list would raise IndexError).
if final_text_opencv:
    print(final_text_opencv[0])