-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathscript.py
452 lines (398 loc) · 16.9 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
#!/usr/bin/env python -W ignore::DeprecationWarning
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import dicom
import glob
from matplotlib import pyplot as plt
import os
import cv2
import mxnet as mx
import pandas as pd
from sklearn import cross_validation
import scipy.ndimage
from skimage import measure, morphology
from sklearn.cluster import KMeans
from skimage.transform import resize
from skimage.segmentation import clear_border
from skimage.measure import label, regionprops
from skimage.morphology import disk, dilation, binary_erosion, binary_closing
from skimage.filters import roberts, sobel
from scipy import ndimage as ndi
# import xgboost as xgb
import tensorflow as tf
import numpy as np
import gc
import db
import argparse
IMG_SIZE_PX = 306
SLICE_COUNT = 20
n_classes = 2
batch_size = 10
x = tf.placeholder('float')
y = tf.placeholder('float')
keep_rate = 0.8
def connect_to_db():
sqlite_file = "lung_cancer_NEW.sqlite" # name of the sqlite database file
conn = db.connect(sqlite_file)
return conn
def get_extractor():
print("Loading Resnet-50 Model ...")
model = mx.model.FeedForward.load('model/resnet-50', 0, ctx=mx.cpu(), numpy_batch_size=1)
fea_symbol = model.symbol.get_internals()["flatten0_output"]
feature_extractor = mx.model.FeedForward(ctx=mx.cpu(), symbol=fea_symbol, numpy_batch_size=64,
arg_params=model.arg_params, aux_params=model.aux_params,
allow_extra_params=True)
return feature_extractor
def read_data_from_db(cursor, table_name_1, col_names):
col_names = ','.join(col_names)
print("Reading data from DB ...")
cursor.execute("SELECT {col} from {tn}".format(col = col_names, tn = table_name_1))
data = cursor.fetchall()
return data
def normalize_hu(image):
MIN_BOUND = -1000.0
MAX_BOUND = 400.0
image = (image - MIN_BOUND) / (MAX_BOUND - MIN_BOUND)
image[image > 1] = 1.
image[image < 0] = 0.
return image
def read_data(path, labels_path, output_path, cursor = 0):
# path = r"F:\Data Science\DS Bowl\stage1_sample"
print("Reading Data ...")
patients = os.listdir(path)
patients.sort()
labels_df = pd.read_csv(labels_path, index_col=0)
labels_df.head()
# net = get_extractor()
count = 0
much_data = []
target_dir = output_path
for folder in glob.glob(path + "/*"):
# count += 1
# if count == 3:
# break
print("Processing patient no.: ", count + 1)
count = count + 1
lstFilesDCM = [] # create an empty list
masked_lung = []
patient_name = folder.split('\\')[-1]
try:
label = labels_df.get_value(patient_name, 'cancer')
for s in os.listdir(folder):
if ".dcm" in s.lower():
lstFilesDCM.append(os.path.join(folder,s))
# print("Loading Images ...")
scans = load_images(lstFilesDCM)
# print("Done ...")
images = get_pixels_hu(scans)
# print(images.shape)
pixel_spacing = scans[0].PixelSpacing
pixel_spacing.append(scans[0].SliceThickness)
# images = rescale_patient_images(images, pixel_spacing, 1.0)
for i in range(images.shape[0]):
patient_dir = target_dir + "/" + patient_name + "/"
if not os.path.exists(patient_dir):
os.mkdir(patient_dir)
img_path = patient_dir + "img_" + str(i).rjust(4, '0') + "_i.png"
org_img = images[i]
# if there exists slope,rotation image with corresponding degree
# if cos_degree > 0.0:
# org_img = cv_flip(org_img,org_img.shape[1],org_img.shape[0],cos_degree)
img, mask = get_segmented_lungs(org_img.copy())
org_img = normalize_hu(org_img)
# print("Image Shape: ", org_img.shape)
# print("Mask Shape: ", mask.shape)
cv2.imwrite(img_path, org_img * 255)
cv2.imwrite(img_path.replace("_i.png", "_m.png"), mask * 255)
# pix_resampled, spacing = resample(images, scans, [1, 1, 1])
# x, y, z = pix_resampled.shape
# pix_resampled = pix_resampled.reshape((x, y, z, 1))
# feats = net.predict(pix_resampled)
# print(feats.shape)
# np.save(output_path + "folder.npy", feats)
# for img in pix_resampled:
# print("learning features ...")
# feats = net.predict(images)
# print(feats.shape)
# np.save(output_path + "folder.npy", feats)
# masked_lung.append(make_lungmask(img))
# feats = net.predict(pix_resampled)
# masked_lung = np.array(masked_lung)
# print(masked_lung.shape)
# exit()
# if label == 1: label = np.array([0,1])
# elif label == 0: label = np.array([1,0])
# print(masked_lung.shape)
# exit()
# cursor.execute("insert into lung_masks values (?, ?, ?)", (patient_name, masked_lung, label))
# print("Data Inserted !")
# much_data.append([masked_lung,label])
except KeyError as e:
print("This is unlabelled data!!")
def load_patient_images(patient_id, base_dir=None, wildcard = "*.*"):
if base_dir == None:
base_dir = r"F:\Data Science\DS Bowl\Extracted_Images\\"
src_dir = base_dir + patient_id + "\\"
src_img_paths = glob.glob(src_dir + wildcard)
src_img_paths = [im for im in src_img_paths]
src_img_paths.sort()
images = [cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) for img_path in src_img_paths]
# print(np.array(images).shape)
images = [im.reshape((1, ) + im.shape) for im in images]
# x, y, z = images.shape
res = np.vstack(images)
print(res.shape)
return res
def get_segmented_lungs(im):
# Step 1: Convert into a binary image.
binary = im < -400
# Step 2: Remove the blobs connected to the border of the image.
cleared = clear_border(binary)
# Step 3: Label the image.
label_image = label(cleared)
# Step 4: Keep the labels with 2 largest areas.
areas = [r.area for r in regionprops(label_image)]
areas.sort()
if len(areas) > 2:
for region in regionprops(label_image):
if region.area < areas[-2]:
for coordinates in region.coords:
label_image[coordinates[0], coordinates[1]] = 0
binary = label_image > 0
# Step 5: Erosion operation with a disk of radius 2. This operation is seperate the lung nodules attached to the blood vessels.
selem = disk(2)
binary = binary_erosion(binary, selem)
# Step 6: Closure operation with a disk of radius 10. This operation is to keep nodules attached to the lung wall.
selem = disk(10)
binary = binary_closing(binary, selem)
# Step 7: Fill in the small holes inside the binary mask of lungs.
edges = roberts(binary)
binary = ndi.binary_fill_holes(edges)
# Step 8: Superimpose the binary mask on the input image.
get_high_vals = binary == 0
im[get_high_vals] = -2000
return im, binary
def load_images(file_path):
# print(len(file_path))
slices = [dicom.read_file(s) for s in file_path]
slices.sort(key = lambda x: int(x.InstanceNumber))
try:
slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
except:
slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
for s in slices:
s.SliceThickness = slice_thickness
return slices
def get_pixels_hu(scans):
image = np.stack([s.pixel_array for s in scans])
image = image.astype(np.int16)
# Set outside-of-scan pixels to 1
# The intercept is usually -1024, so air is approximately 0
image[image == -2000] = 0
# Convert to Hounsfield units (HU)
intercept = scans[0].RescaleIntercept
slope = scans[0].RescaleSlope
if slope != 1:
image = slope * image.astype(np.float64)
image = image.astype(np.int16)
image += np.int16(intercept)
return np.array(image, dtype=np.int16)
def resample(image, scan, new_spacing=[1,1,1]):
# Determine current pixel spacing
spacing = np.array([scan[0].SliceThickness] + scan[0].PixelSpacing, dtype=np.float32)
resize_factor = spacing / new_spacing
# print(image.shape)
# print(resize_factor)
new_real_shape = image.shape * resize_factor
new_shape = np.round(new_real_shape)
real_resize_factor = new_shape / image.shape
new_spacing = spacing / real_resize_factor
image = scipy.ndimage.interpolation.zoom(image, real_resize_factor, mode='nearest')
return image, new_spacing
def make_lungmask(img, display=False):
'''
We have to make sure that we set our threshold between the lung pixel values and the denser tissue pixel values.
To do this, we reset the pixels with the minimum value to the average pixel value near the center of the picture
and perform kmeans clustering with k=2.
This seems to work well for both scenarios.
'''
row_size= img.shape[0]
col_size = img.shape[1]
mean = np.mean(img)
std = np.std(img)
img = img - mean
img = img / std
# Find the average pixel value near the lungs
# to renormalize washed out images
middle = img[int(col_size / 5) : int(col_size / 5 * 4), int(row_size / 5) : int(row_size / 5 * 4)]
mean = np.mean(middle)
max = np.max(img)
min = np.min(img)
# To improve threshold finding, I'm moving the
# underflow and overflow on the pixel spectrum
img[img==max]=mean
img[img==min]=mean
#
# Using Kmeans to separate foreground (soft tissue / bone) and background (lung/air)
#
kmeans = KMeans(n_clusters=2).fit(np.reshape(middle,[np.prod(middle.shape),1]))
centers = sorted(kmeans.cluster_centers_.flatten())
threshold = np.mean(centers)
thresh_img = np.where(img<threshold,1.0,0.0) # threshold the image
# First erode away the finer elements, then dilate to include some of the pixels surrounding the lung.
# We don't want to accidentally clip the lung.
eroded = morphology.erosion(thresh_img,np.ones([3,3]))
dilation = morphology.dilation(eroded,np.ones([8,8]))
labels = measure.label(dilation) # Different labels are displayed in different colors
label_vals = np.unique(labels)
# print(labels[0])
regions = measure.regionprops(labels)
good_labels = []
for prop in regions:
B = prop.bbox
if B[2]-B[0]<row_size/10*9 and B[3]-B[1]<col_size/10*9 and B[0]>row_size/5 and B[2]<col_size/5*4:
good_labels.append(prop.label)
mask = np.ndarray([row_size,col_size],dtype=np.int8)
mask[:] = 0
#
# After just the lungs are left, we do another large dilation
# in order to fill in and out the lung mask
#
for N in good_labels:
mask = mask + np.where(labels==N,1,0)
mask = morphology.dilation(mask,np.ones([10,10])) # one last dilation
if (display):
fig, ax = plt.subplots(3, 2, figsize=[12, 12])
ax[0, 0].set_title("Original")
ax[0, 0].imshow(img, cmap='gray')
ax[0, 0].axis('off')
ax[0, 1].set_title("Threshold")
ax[0, 1].imshow(thresh_img, cmap='gray')
ax[0, 1].axis('off')
ax[1, 0].set_title("After Erosion and Dilation")
ax[1, 0].imshow(dilation, cmap='gray')
ax[1, 0].axis('off')
ax[1, 1].set_title("Color Labels")
ax[1, 1].imshow(labels)
ax[1, 1].axis('off')
ax[2, 0].set_title("Final Mask")
ax[2, 0].imshow(mask, cmap='gray')
ax[2, 0].axis('off')
ax[2, 1].set_title("Apply Mask on Original")
ax[2, 1].imshow(mask*img, cmap='gray')
ax[2, 1].axis('off')
plt.show()
return mask*img
def rescale_patient_images(images_zyx, org_spacing_xyz, target_voxel_mm, is_mask_image=False, verbose=False):
if verbose:
print("Spacing: ", org_spacing_xyz)
print("Shape: ", images_zyx.shape)
# print "Resizing dim z"
resize_x = 1.0
resize_y = float(org_spacing_xyz[2]) / float(target_voxel_mm)
interpolation = cv2.INTER_NEAREST if is_mask_image else cv2.INTER_LINEAR
res = cv2.resize(images_zyx, dsize=None, fx=resize_x, fy=resize_y, interpolation=interpolation) # opencv assumes y, x, channels umpy array, so y = z pfff
# print "Shape is now : ", res.shape
res = res.swapaxes(0, 2)
res = res.swapaxes(0, 1)
# print "Shape: ", res.shape
resize_x = float(org_spacing_xyz[0]) / float(target_voxel_mm)
resize_y = float(org_spacing_xyz[1]) / float(target_voxel_mm)
# cv2 can handle max 512 channels..
if res.shape[2] > 512:
res = res.swapaxes(0, 2)
res1 = res[:256]
res2 = res[256:]
res1 = res1.swapaxes(0, 2)
res2 = res2.swapaxes(0, 2)
res1 = cv2.resize(res1, dsize=None, fx=resize_x, fy=resize_y, interpolation=interpolation)
res2 = cv2.resize(res2, dsize=None, fx=resize_x, fy=resize_y, interpolation=interpolation)
res1 = res1.swapaxes(0, 2)
res2 = res2.swapaxes(0, 2)
res = numpy.vstack([res1, res2])
res = res.swapaxes(0, 2)
else:
res = cv2.resize(res, dsize=None, fx=resize_x, fy=resize_y, interpolation=interpolation)
# channels = cv2.split(res)
# resized_channels = []
# for channel in channels:
# channel = cv2.resize(channel, dsize=None, fx=resize_x, fy=resize_y)
# resized_channels.append(channel)
# res = cv2.merge(resized_channels)
# print "Shape after resize: ", res.shape
res = res.swapaxes(0, 2)
res = res.swapaxes(2, 1)
if verbose:
print("Shape after: ", res.shape)
return res
def train_xgboost():
df = pd.read_csv('data/stage1_labels.csv')
print(df.head())
x = np.array([np.mean(np.load('stage1/%s.npy' % str(id)), axis=0) for id in df['id'].tolist()])
y = df['cancer'].as_matrix()
trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y,
test_size=0.20)
clf = xgb.XGBRegressor(max_depth=10,
n_estimators=1500,
min_child_weight=9,
learning_rate=0.05,
nthread=8,
subsample=0.80,
colsample_bytree=0.80,
seed=4242)
clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='logloss', early_stopping_rounds=50)
return clf
def prepare_image_for_net3D(img):
img = img.astype(np.float32)
img -= 41
img /= 255.
img = img.reshape(1, img.shape[0], img.shape[1], img.shape[2])
return img
if __name__ == '__main__':
path = r"F:\Data Science\DS Bowl\stage1"
labels_path = r"F:\Data Science\DS Bowl\stage1_labels.csv"
output_path = r"E:\Dhaval\Data Science\Projects\Deep Learning\Lung_Cancer\Output\Extracted_Images"
table_name_1 = "lung_masks"
col_names = ["Patient_id TEXT", "Masks array", "Label array"]
data = []
parser = argparse.ArgumentParser()
parser.add_argument("Option", nargs = "?", choices = ["P", "CNN", "B"], default = "P", help = "Select 'P': Pre-process the CT-scan images for each patient 'CNN': Apply Convolutional Neural Network on the pre-processed data from database 'B': Perform pre-processing and CNN both")
args = parser.parse_args()
# conn = connect_to_db()
# cursor = conn.cursor()
# db.create_table(cursor, table_name_1, col_names)
if args.Option == "P":
read_data(path, labels_path, output_path)
# db.disconnect(conn)
elif args.Option == "CNN":
# data = read_data_from_db(cursor, table_name_1, col_names)
# print(data[0][1].shape)
# patient_id = "00cba091fa4ad62cc3200a657aeb957e"
net = get_extractor()
patient_dir = ".\Output"
images_input_path = r"F:\Data Science\DS Bowl\Extracted_Images\\"
for folder in glob.glob(path + '/*'):
patient_id = folder.split('\\')[-1]
patient_dir = patient_dir + "\\" + patient_id
if not os.path.exists(patient_dir):
os.mkdir(patient_dir)
im = load_patient_images(patient_id, base_dir = images_input_path, wildcard = "*_i.png")
im = prepare_image_for_net3D(im)
print(im.shape)
feats = net.predict(im)
print(feats.shape)
np.save(patient_dir, feats)
# train_data = data
# validation_data = data
# train_neural_network(x, validation_data)
# db.disconnect(conn)
elif args.Option == "B":
read_data(path, labels_path, output_path)
# db.disconnect(conn)
# cursor.execute("select * from {tn}".format(tn = table_name_1))
# results = cursor.fetchall()
# print("Number of rows: ",len(results))
# exit()
# calc_features()
# make_submit()