-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdash.py
135 lines (116 loc) · 6.11 KB
/
dash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import streamlit as st
from pdf2image import convert_from_path
import os
from PIL import Image
from main import predict
import numpy as np
import cv2
import pandas as pd
import base64
import pytesseract
# Tell pytesseract where the Tesseract executable lives. This is a hard-coded,
# machine-specific Windows install path and must be edited on hosts where
# Tesseract is installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files (x86)/Tesseract-OCR/tesseract.exe'
# Extra Tesseract command-line option, passed as ``config=`` to the OCR calls in
# this file, naming the directory that holds the language data files.
tessdata_dir_config = r'--tessdata-dir "C:/Program Files (x86)/Tesseract-OCR/tessdata"'
def table_detection(img_path):
    """Locate ruled table cells in an image and OCR the labelled ones.

    Vertical and horizontal rules are isolated with morphological
    erode/dilate passes, merged into a grid mask, and the bounding box of
    every contour of that mask is treated as a candidate cell.  Each cell is
    OCR'd; when the first line of its text contains one of the labels
    ``shipper``, ``consignee`` or ``notify party``, the remaining lines are
    stored as that label's value and the cell is outlined on the returned
    image.

    Args:
        img_path: Path of the image file to analyse.

    Returns:
        A ``(fields, annotated)`` tuple.  ``fields`` is a one-row
        ``pandas.DataFrame`` with one column per label that was found (no
        columns when nothing matched).  ``annotated`` is an RGB copy of the
        input with the matched cells outlined in red.

    Raises:
        ValueError: If ``img_path`` cannot be read as an image.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise ValueError(f"Could not read image: {img_path!r}")
    img_height, img_width = img.shape[:2]
    img_s = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarise, then invert so dark ink (text and rules) becomes foreground.
    _, img_bin = cv2.threshold(img_gray, 180, 255, cv2.THRESH_BINARY)
    img_bin = cv2.bitwise_not(img_bin)

    # Both kernel lengths are derived from the image width.  Clamp to 1 px so
    # narrow images do not produce a zero-length structuring element.
    kernel_length_v = max(1, img_width // 200)
    kernel_length_h = max(1, img_width // 100)

    # Erode-then-dilate with a thin kernel keeps only long strokes running in
    # the kernel's direction, i.e. the table rules.
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length_v))
    vertical_lines_img = cv2.dilate(
        cv2.erode(img_bin, vertical_kernel, iterations=5),
        vertical_kernel,
        iterations=5,
    )
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_length_h, 1))
    horizontal_lines_img = cv2.dilate(
        cv2.erode(img_bin, horizontal_kernel, iterations=5),
        horizontal_kernel,
        iterations=5,
    )

    # Merge both rule images into one grid mask and re-binarise it.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    table_segment = cv2.addWeighted(vertical_lines_img, 0.5, horizontal_lines_img, 0.5, 0.0)
    table_segment = cv2.erode(cv2.bitwise_not(table_segment), kernel, iterations=2)
    _, table_segment = cv2.threshold(table_segment, 0, 255, cv2.THRESH_OTSU)

    # Index [-2] picks the contour list under both return shapes:
    # OpenCV 3 -> (image, contours, hierarchy), OpenCV 4 -> (contours, hierarchy).
    contours = cv2.findContours(table_segment, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]

    labels = ('shipper', 'consignee', 'notify party')
    fields = {}
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w <= 0 or h <= 0:
            continue

        # Pad the cell by 3 px up/left, clamped at the image border: a
        # negative slice start would wrap around and give an empty or wrong
        # crop for cells touching the top or left edge.
        left = max(x - 3, 0)
        top = max(y - 3, 0)
        cropped = img[top:y + h, left:x + w]
        txt = pytesseract.image_to_string(cropped, config=tessdata_dir_config).strip()

        for el in labels:
            if el not in txt.split('\n')[0].lower():
                continue
            if cropped.shape[0] < img_height / 20:
                # Short cell (under 1/20 of the page height): presumably it
                # holds only the label, so extend the crop downwards by 1/14
                # of the page height and OCR again to pick up the value.
                cropped = img[top:y + h + int(img_height / 14), left:x + w]
                txt = pytesseract.image_to_string(
                    Image.fromarray(cropped), config=tessdata_dir_config
                ).strip()
            # Everything after the label line is the field's value.
            fields[el] = ' '.join(txt.split('\n')[1:])
            img_s = cv2.rectangle(img_s, (left, top), (x + w, y + h), (255, 0, 0), 2)

    return pd.DataFrame([fields]), img_s
# Page header, followed by the selector that decides which upload workflow
# (PDF or single image) the rest of the script runs.
st.title("TableNet with OCR Detection")
st.markdown("Hello There")
method = st.selectbox("Image or PDF", ['PDF', 'Image'])
# PDF workflow: save the upload, render every page to a JPEG under
# "extracted_images", preview the pages side by side, then run the chosen
# extraction approach on the page the user selects.
if method == "PDF":
    uploaded_file = st.file_uploader("Choose a file", type=['pdf'])
    if uploaded_file is not None:
        with open("selected.pdf", "wb") as f:
            f.write(uploaded_file.getbuffer())

        # Start from an empty page directory.  Create it first so a fresh
        # checkout (no directory yet) does not crash on the listing below.
        os.makedirs("extracted_images", exist_ok=True)
        for stale_name in os.listdir("extracted_images"):
            os.remove(os.path.join("extracted_images", stale_name))

        images = convert_from_path("selected.pdf")

        # Remember the file names in page order: os.listdir() returns entries
        # in arbitrary order, which would scramble the page selector.
        page_names = []
        for page_number, page_image in enumerate(images, start=1):
            page_name = f"page_{page_number}.jpg"
            page_image.save('extracted_images/' + page_name, 'JPEG')
            page_names.append(page_name)

        # st.beta_columns was renamed to st.columns and later removed; use
        # the current name when available and fall back on old releases.
        make_columns = getattr(st, "columns", None) or st.beta_columns
        img_cols = make_columns(len(images))
        for page_number, column in enumerate(img_cols, start=1):
            column.subheader("page" + str(page_number))
            column.image(
                Image.open("extracted_images/page_" + str(page_number) + ".jpg"),
                use_column_width=True,
            )

        selected_page = st.selectbox("Select the page", page_names)
        page_path = 'extracted_images/' + selected_page
        image = Image.open(page_path)
        st.image(image)

        selected_approach = st.selectbox("select approach",['Image Processing', 'TableNet approach'])
        if selected_approach == 'Image Processing':
            df, img = table_detection(page_path)
            st.image(img, "processed image")
            st.dataframe(df)
            if not df.empty:
                # Offer the extracted fields as a CSV embedded in a data URI.
                csv = df.to_csv().encode()
                b64 = base64.b64encode(csv).decode()
                href = f'<a href="data:file/csv;base64,{b64}" download="out.csv" target="_blank">Download csv file</a>'
                st.markdown(href, unsafe_allow_html=True)
        elif selected_approach == 'TableNet approach':
            out, tb, cl = predict(page_path, 'best_model.ckpt')
            st.image(tb, "Table Mask")
            st.image(cl, "Column Mask")
            # One table and one download link per entry returned by predict.
            for table in out:
                st.dataframe(table)
                csv = table.to_csv().encode()
                b64 = base64.b64encode(csv).decode()
                href = f'<a href="data:file/csv;base64,{b64}" download="out.csv" target="_blank">Download csv file</a>'
                st.markdown(href, unsafe_allow_html=True)
# Image workflow: save the uploaded picture to "selected_img.jpg", show a
# thumbnail, then run the chosen extraction approach on it and offer each
# resulting table as a CSV download link.
if method == "Image":
    st.write(method)
    uploaded_file = st.file_uploader("Choose an Image", type=['jpg','jpeg','png','bmp'])
    if uploaded_file is not None:
        with open("selected_img.jpg", "wb") as target:
            target.write(uploaded_file.getbuffer())
        thumbnail = Image.open('selected_img.jpg')
        st.image(thumbnail, width=200)

        selected_approach = st.selectbox("select approach",['Image Processing', 'TableNet approach'])
        if selected_approach == 'Image Processing':
            fields, annotated = table_detection('selected_img.jpg')
            st.image(annotated, "processed image")
            st.dataframe(fields)
            if not fields.empty:
                # Embed the CSV in a base64 data URI so no server route is needed.
                payload = base64.b64encode(fields.to_csv().encode()).decode()
                link = f'<a href="data:file/csv;base64,{payload}" download="out.csv" target="_blank">Download csv file</a>'
                st.markdown(link, unsafe_allow_html=True)
        elif selected_approach == 'TableNet approach':
            tables, table_mask, column_mask = predict('selected_img.jpg', 'best_model.ckpt')
            st.image(table_mask, "Table Mask")
            st.image(column_mask, "Column Mask")
            # Show every table returned by predict, each with its own link.
            for table in tables:
                st.dataframe(table)
                payload = base64.b64encode(table.to_csv().encode()).decode()
                link = f'<a href="data:file/csv;base64,{payload}" download="out.csv" target="_blank">Download csv file</a>'
                st.markdown(link, unsafe_allow_html=True)