From 3f099ccceb0af7b040223f656de36e9aaea2f479 Mon Sep 17 00:00:00 2001
From: Ilya Kostrikov <ikostrikov@gmail.com>
Date: Sun, 31 May 2015 22:28:59 +0200
Subject: [PATCH] Added a segmentation example and new layers.

---
 beacon8/layers/Dropout.py                 |   1 +
 beacon8/layers/DuringTesting.py           |  14 ++
 beacon8/layers/DuringTraining.py          |  14 ++
 beacon8/layers/Overfeat.py                | 246 ++++++++++++++++++++++
 beacon8/layers/SpatialConvolutionCUDNN.py |   4 +-
 beacon8/layers/SpatialMaxPoolingCUDNN.py  |   4 +-
 beacon8/layers/SpatialSoftMax.py          |  17 ++
 beacon8/layers/SpatialSubSampling.py      |  14 ++
 beacon8/layers/__init__.py                |   5 +
 examples/MNIST/test.py                    |   1 +
 examples/Segmentation/data.py             |  38 ++++
 examples/Segmentation/model.py            |  64 ++++++
 examples/Segmentation/progress_bar.py     |   7 +
 examples/Segmentation/run.py              |  32 +++
 examples/Segmentation/test.py             |  43 ++++
 examples/Segmentation/train.py            |  44 ++++
 16 files changed, 544 insertions(+), 4 deletions(-)
 create mode 100644 beacon8/layers/DuringTesting.py
 create mode 100644 beacon8/layers/DuringTraining.py
 create mode 100644 beacon8/layers/Overfeat.py
 create mode 100644 beacon8/layers/SpatialSoftMax.py
 create mode 100644 beacon8/layers/SpatialSubSampling.py
 create mode 100644 examples/Segmentation/data.py
 create mode 100644 examples/Segmentation/model.py
 create mode 100644 examples/Segmentation/progress_bar.py
 create mode 100644 examples/Segmentation/run.py
 create mode 100644 examples/Segmentation/test.py
 create mode 100644 examples/Segmentation/train.py

diff --git a/beacon8/layers/Dropout.py b/beacon8/layers/Dropout.py
index 15c3a28..194ad21 100644
--- a/beacon8/layers/Dropout.py
+++ b/beacon8/layers/Dropout.py
@@ -4,6 +4,7 @@
 from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 _srng = RandomStreams()
 
+
 class Dropout(Module):
     def __init__(self, dropout):
         Module.__init__(self)
diff --git a/beacon8/layers/DuringTesting.py b/beacon8/layers/DuringTesting.py
new file mode 100644
index 0000000..304e512
--- /dev/null
+++ b/beacon8/layers/DuringTesting.py
@@ -0,0 +1,24 @@
+from .Module import Module
+
+
+class DuringTesting(Module):
+    """Wrap ``module`` so that it only takes effect outside training mode.
+
+    While ``self.training_mode`` is truthy the wrapper is an identity and the
+    input expression is returned untouched; otherwise the wrapped module's
+    ``symb_forward`` is applied to it.
+
+    NOTE(review): this class does not forward anything but ``symb_forward``
+    to the wrapped module -- confirm that is sufficient for modules that own
+    parameters or have their own mode-dependent behaviour.
+    """
+
+    def __init__(self, module):
+        Module.__init__(self)
+        self.module = module
+
+    def symb_forward(self, symb_input):
+        # Identity while training.
+        if self.training_mode:
+            return symb_input
+        return self.module.symb_forward(symb_input)
diff --git a/beacon8/layers/DuringTraining.py b/beacon8/layers/DuringTraining.py
new file mode 100644
index 0000000..c1b7a92
--- /dev/null
+++ b/beacon8/layers/DuringTraining.py
@@ -0,0 +1,24 @@
+from .Module import Module
+
+
+class DuringTraining(Module):
+    """Wrap ``module`` so that it only takes effect in training mode.
+
+    While ``self.training_mode`` is truthy the wrapped module's
+    ``symb_forward`` is applied; otherwise the wrapper is an identity and the
+    input expression is returned untouched.
+
+    NOTE(review): this class does not forward anything but ``symb_forward``
+    to the wrapped module -- confirm that is sufficient for modules that own
+    parameters or have their own mode-dependent behaviour.
+    """
+
+    def __init__(self, module):
+        Module.__init__(self)
+        self.module = module
+
+    def symb_forward(self, symb_input):
+        # Identity outside of training.
+        if not self.training_mode:
+            return symb_input
+        return self.module.symb_forward(symb_input)
diff --git a/beacon8/layers/Overfeat.py b/beacon8/layers/Overfeat.py
new file mode 100644
index 0000000..fd44907
--- /dev/null
+++ b/beacon8/layers/Overfeat.py
@@ -0,0 +1,265 @@
+from .Module import Module
+
+
+import theano as _th
+from theano.sandbox.cuda import CudaNdarrayType, GpuOp
+from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, gpu_contiguous)
+
+
+class PyCudaOp(GpuOp):
+    """Base class for the stateless, single-input GPU ops defined below.
+
+    The ops take no parameters, so two instances of the same class are
+    interchangeable: equality and hashing depend on the type alone.
+    """
+
+    def __eq__(self, other):
+        return type(self) is type(other)
+
+    def __hash__(self):
+        return hash(type(self))
+
+    def __str__(self):
+        return self.__class__.__name__
+
+    def make_node(self, inp):
+        """Build the Apply node for a 4D GPU input.
+
+        Raises TypeError when ``inp`` is not 4-dimensional.
+        """
+        inp = as_cuda_ndarray_variable(inp)
+        # The generated C code reads exactly four host dimensions.
+        if inp.ndim != 4:
+            raise TypeError('%s expects a 4D input, got %dD' % (self, inp.ndim))
+        # The kernels address the raw device buffer with dense row-major
+        # offsets, so a strided view must be made contiguous first.
+        inp = gpu_contiguous(inp)
+        # Batch and spatial sizes change between input and output, so the
+        # input's broadcastable flags must not be inherited.
+        out_type = CudaNdarrayType(broadcastable=(False,) * inp.ndim)
+        return _th.Apply(self, [inp], [out_type()])
+
+
+class RollOpBase(PyCudaOp):  # (B, F, H, W) -> (4B, F, (H+1)/2, (W+1)/2), one output batch per row/column parity
+    def c_support_code(self):  # returns the CUDA kernel source launched by c_code
+        c_support_code = """
+        __global__ void maxpool_roll(float *input, float *output, int batch_size, int feature_size, int height_size, int width_size)
+        {
+            int x = blockIdx.x * blockDim.x + threadIdx.x;  // flat (feature, row, column) index
+            int batch = blockIdx.y * blockDim.y + threadIdx.y;
+            int map_size = height_size * width_size;
+            int feature = x / map_size;
+            int height = (x % map_size) / width_size;
+            int width = x % width_size;
+            int height_out = height / 2;
+            int width_out = width / 2;
+            int batch_out = batch * 4;  // +0: even row, even column
+            if (height % 2 == 0 && width % 2 == 1)
+            {
+                batch_out += 1;
+            }
+            else if (height % 2 == 1 && width % 2 == 0)
+            {
+                batch_out += 2;
+            }
+            else if (height % 2 == 1 && width % 2 == 1)
+            {
+                batch_out += 3;
+            }
+            if (batch < batch_size && feature < feature_size && height_out * 2 < height_size && width_out * 2 < width_size)
+            {
+                output[batch_out * (feature_size * ((height_size + 1) / 2) * ((width_size + 1) / 2)) +
+                       feature * (((height_size + 1) / 2) * ((width_size + 1) / 2)) +
+                       height_out * ((width_size + 1) / 2) +
+                       width_out] = input[batch * (feature_size * height_size * width_size) +
+                                          feature * (height_size * width_size) +
+                                          height * width_size +
+                                          width];
+            }
+        }
+        """
+        return c_support_code
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        fail = sub['fail']
+        # inp, out and fail are substituted into the template below through locals().
+        inp, = inputs
+        out, = outputs
+        # The template is %-formatted: never add a literal percent sign to it.
+        c_code = """
+        {
+            int batch_size = CudaNdarray_HOST_DIMS(%(inp)s)[0];
+            int n_features = CudaNdarray_HOST_DIMS(%(inp)s)[1];
+            int height = CudaNdarray_HOST_DIMS(%(inp)s)[2];
+            int width = CudaNdarray_HOST_DIMS(%(inp)s)[3];
+            // Four output batches per input batch, each with half the (rounded up) resolution.
+            int out_shape[] = {batch_size * 4, n_features, (height + 1) / 2, (width + 1) / 2};
+            if (NULL == %(out)s || CudaNdarray_NDIM(%(inp)s) != CudaNdarray_NDIM(%(out)s) ||
+                                    !(CudaNdarray_HOST_DIMS(%(out)s)[0] == out_shape[0] &&
+                                     CudaNdarray_HOST_DIMS(%(out)s)[1] == out_shape[1] &&
+                                     CudaNdarray_HOST_DIMS(%(out)s)[2] == out_shape[2] &&
+                                     CudaNdarray_HOST_DIMS(%(out)s)[3] == out_shape[3]))
+            {
+                Py_XDECREF(%(out)s);
+                %(out)s = (CudaNdarray*)CudaNdarray_ZEROS(CudaNdarray_NDIM(%(inp)s), out_shape);
+            }
+            // A NULL output at this point means the allocation above failed.
+            if (!%(out)s)
+            {
+                PyErr_SetString(PyExc_MemoryError, "failed to alloc output");
+                %(fail)s;
+            }
+            // Grid x covers the flattened (feature, row, column) positions, grid y the batch.
+            dim3 block(16, 16, 1);
+            dim3 grid((int)(ceil(((float)n_features * height * width) / block.x)),
+                      (int)(ceil(((float)batch_size) / block.y)),
+                       1);
+            // Both buffers are addressed as dense row-major arrays.
+            maxpool_roll<<<grid, block>>>(CudaNdarray_DEV_DATA(%(inp)s),
+                                          CudaNdarray_DEV_DATA(%(out)s),
+                                          batch_size, n_features, height, width);
+            // Synchronise so that launch and execution errors surface here.
+            CNDA_THREAD_SYNC;
+            cudaError_t sts = cudaGetLastError();
+            if (cudaSuccess != sts)
+            {
+                PyErr_SetString(PyExc_RuntimeError, cudaGetErrorString(sts));
+                %(fail)s;
+            }
+        }
+        """
+        return c_code % locals()
+
+
+class RollOp(RollOpBase):
+    def grad(self, inp, grads):  # the output gradient is passed through RollOpGrad
+        (out_grad,) = grads
+        dense_grad = gpu_contiguous(out_grad)  # the kernel indexes a dense buffer
+        return [RollOpGrad()(dense_grad)]
+
+
+class UnRollOpBase(PyCudaOp):  # (B, F, H, W) -> (B/4, F, 2H, 2W), interleaving four consecutive input batches
+    def c_support_code(self):  # returns the CUDA kernel source launched by c_code
+        c_support_code = """
+        __global__ void maxpool_unroll(float *input, float *output, int batch_size, int feature_size, int height_size, int width_size)
+        {
+            int x = blockIdx.x * blockDim.x + threadIdx.x;  // flat (feature, row, column) index
+            int batch = blockIdx.y * blockDim.y + threadIdx.y;
+            int map_size = height_size * width_size;
+            int feature = x / map_size;
+            int height = (x % map_size) / width_size;
+            int width = x % width_size;
+            int height_out = height * 2;
+            int width_out = width * 2;
+            int batch_out = batch / 4;  // batch % 4 selects the row/column parity below
+            if (batch % 4 == 1)
+            {
+                width_out += 1;
+            }
+            else if (batch % 4 == 2)
+            {
+                height_out += 1;
+            }
+            else if (batch % 4 == 3)
+            {
+                height_out += 1;
+                width_out += 1;
+            }
+            if (batch_out < batch_size / 4 && feature < feature_size)  // output only has batch_size / 4 batches
+            {
+                output[batch_out * (feature_size * height_size * 2 * width_size * 2) +
+                       feature * (height_size * 2 * width_size * 2) +
+                       height_out * width_size * 2 +
+                       width_out] = input[batch * (feature_size * height_size * width_size) +
+                                          feature * (height_size * width_size) +
+                                          height * width_size +
+                                          width];
+            }
+        }
+        """
+
+        return c_support_code
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        fail = sub['fail']
+        # inp, out and fail are substituted into the template below through locals().
+        inp, = inputs
+        out, = outputs
+        # The template is %-formatted: never add a literal percent sign to it.
+        c_code = """
+        {
+            int batch_size = CudaNdarray_HOST_DIMS(%(inp)s)[0];
+            int n_features = CudaNdarray_HOST_DIMS(%(inp)s)[1];
+            int height = CudaNdarray_HOST_DIMS(%(inp)s)[2];
+            int width = CudaNdarray_HOST_DIMS(%(inp)s)[3];
+            // Input batches beyond the last complete group of four are dropped by the kernel guard.
+            int out_shape[] = {batch_size / 4, n_features, height * 2, width * 2};
+            if (NULL == %(out)s || CudaNdarray_NDIM(%(inp)s) != CudaNdarray_NDIM(%(out)s) ||
+                                   !(CudaNdarray_HOST_DIMS(%(out)s)[0] == out_shape[0] &&
+                                     CudaNdarray_HOST_DIMS(%(out)s)[1] == out_shape[1] &&
+                                     CudaNdarray_HOST_DIMS(%(out)s)[2] == out_shape[2] &&
+                                     CudaNdarray_HOST_DIMS(%(out)s)[3] == out_shape[3]))
+            {
+                Py_XDECREF(%(out)s);
+                %(out)s = (CudaNdarray*)CudaNdarray_NewDims(CudaNdarray_NDIM(%(inp)s), out_shape);
+            }
+            // A NULL output at this point means the allocation above failed.
+            if (!%(out)s)
+            {
+                PyErr_SetString(PyExc_MemoryError, "failed to alloc output");
+                %(fail)s;
+            }
+            // Grid x covers the flattened (feature, row, column) positions, grid y the batch.
+            dim3 block(16, 16, 1);
+            dim3 grid((int)(ceil(((float)n_features * height * width) / block.x)),
+                      (int)(ceil(((float)batch_size) / block.y)),
+                       1);
+            // Both buffers are addressed as dense row-major arrays.
+            maxpool_unroll<<<grid, block>>>(CudaNdarray_DEV_DATA(%(inp)s),
+                                            CudaNdarray_DEV_DATA(%(out)s),
+                                            batch_size, n_features, height, width);
+            // Synchronise so that launch and execution errors surface here.
+            CNDA_THREAD_SYNC;
+            cudaError_t sts = cudaGetLastError();
+            if (cudaSuccess != sts)
+            {
+                PyErr_SetString(PyExc_RuntimeError, cudaGetErrorString(sts));
+                %(fail)s;
+            }
+        }
+        """
+        return c_code % locals()
+
+
+class UnRollOp(UnRollOpBase):
+    def grad(self, inp, grads):  # the output gradient is passed through UnRollOpGrad
+        (out_grad,) = grads
+        dense_grad = gpu_contiguous(out_grad)  # the kernel indexes a dense buffer
+        return [UnRollOpGrad()(dense_grad)]
+
+
+class RollOpGrad(UnRollOpBase):
+    """Op returned by ``RollOp.grad``; inherits the unroll implementation unchanged and defines no grad."""
+
+
+class UnRollOpGrad(RollOpBase):
+    """Op returned by ``UnRollOp.grad``; inherits the roll implementation unchanged and defines no grad."""
+
+unroll = UnRollOp()  # module-level op instance applied by OverfeatUnroll.symb_forward
+roll = RollOp()  # module-level op instance applied by OverfeatRoll.symb_forward
+
+
+class OverfeatRoll(Module):  # layer wrapper that applies the module-level `roll` op
+    def __init__(self):
+        Module.__init__(self)
+
+    def symb_forward(self, symb_input):
+        return roll(symb_input)  # RollOp: (B, F, H, W) -> (4B, F, (H+1)/2, (W+1)/2)
+
+
+class OverfeatUnroll(Module):  # layer wrapper that applies the module-level `unroll` op
+    def __init__(self):
+        Module.__init__(self)
+
+    def symb_forward(self, symb_input):
+        return unroll(symb_input)  # UnRollOp: (B, F, H, W) -> (B/4, F, 2H, 2W)
diff --git a/beacon8/layers/SpatialConvolutionCUDNN.py b/beacon8/layers/SpatialConvolutionCUDNN.py
index 4c58dfa..fbcd55e 100644
--- a/beacon8/layers/SpatialConvolutionCUDNN.py
+++ b/beacon8/layers/SpatialConvolutionCUDNN.py
@@ -1,9 +1,9 @@
+from .Module import Module
+
 import theano as _th
 import numpy as _np
 import theano.sandbox.cuda.dnn as _dnn
 
-from .Module import Module
-
 
 class SpatialConvolutionCUDNN(Module):
     def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, pad_w=0, pad_h=0, with_bias=True):
diff --git a/beacon8/layers/SpatialMaxPoolingCUDNN.py b/beacon8/layers/SpatialMaxPoolingCUDNN.py
index ca45e45..a5146e5 100644
--- a/beacon8/layers/SpatialMaxPoolingCUDNN.py
+++ b/beacon8/layers/SpatialMaxPoolingCUDNN.py
@@ -1,7 +1,7 @@
-import theano.sandbox.cuda.dnn as _dnn
-
 from .Module import Module
 
+import theano.sandbox.cuda.dnn as _dnn
+
 
 class SpatialMaxPoolingCUDNN(Module):
     def __init__(self, k_w, k_h, d_w=None, d_h=None, pad_w=0, pad_h=0):
diff --git a/beacon8/layers/SpatialSoftMax.py b/beacon8/layers/SpatialSoftMax.py
new file mode 100644
index 0000000..dbf00ac
--- /dev/null
+++ b/beacon8/layers/SpatialSoftMax.py
@@ -0,0 +1,22 @@
+from .Module import Module
+
+import theano.sandbox.cuda.dnn as dnn
+from theano.sandbox.cuda.basic_ops import gpu_contiguous
+
+
+def spatial_softmax(img):
+    """Run cuDNN's softmax op on ``img``.
+
+    The input is made contiguous on the GPU and handed to ``GpuDnnSoftmax``
+    configured for the 'bc01' layout, the 'accurate' algorithm and 'channel'
+    mode.
+    """
+    softmax_op = dnn.GpuDnnSoftmax(tensor_format='bc01', algo='accurate', mode='channel')
+    return softmax_op(gpu_contiguous(img))
+
+
+class SpatialSoftMax(Module):  # layer wrapper around spatial_softmax()
+    def __init__(self):
+        Module.__init__(self)
+
+    def symb_forward(self, symb_input):
+        return spatial_softmax(symb_input)  # cuDNN softmax, 'channel' mode on a 'bc01' tensor
\ No newline at end of file
diff --git a/beacon8/layers/SpatialSubSampling.py b/beacon8/layers/SpatialSubSampling.py
new file mode 100644
index 0000000..b325e1f
--- /dev/null
+++ b/beacon8/layers/SpatialSubSampling.py
@@ -0,0 +1,21 @@
+from .Module import Module
+
+
+class SpatialSubSampling(Module):
+    """Keep every ``scale``-th row and column of a 4D input.
+
+    This is plain strided slicing of the last two axes; no averaging or
+    learned weights are involved.
+
+    note that it behaves very differently from Torch!
+    """
+
+    def __init__(self, scale):
+        # Initialise the base-class state like every other layer does.
+        Module.__init__(self)
+        self.scale = scale
+
+    def symb_forward(self, symb_input):
+        if symb_input.ndim != 4:
+            raise NotImplementedError('SpatialSubSampling only supports 4D inputs')
+        return symb_input[:, :, ::self.scale, ::self.scale]
diff --git a/beacon8/layers/__init__.py b/beacon8/layers/__init__.py
index 9aaab06..a5c2222 100644
--- a/beacon8/layers/__init__.py
+++ b/beacon8/layers/__init__.py
@@ -12,3 +12,8 @@
 from .SpatialMaxPooling import *
 from .SpatialConvolutionCUDNN import *
 from .SpatialMaxPoolingCUDNN import *
+from .Overfeat import OverfeatRoll, OverfeatUnroll
+from .DuringTraining import *
+from .DuringTesting import *
+from .SpatialSoftMax import SpatialSoftMax
+from .SpatialSubSampling import *
diff --git a/examples/MNIST/test.py b/examples/MNIST/test.py
index 5dae16f..ca502c2 100644
--- a/examples/MNIST/test.py
+++ b/examples/MNIST/test.py
@@ -2,6 +2,7 @@
 from progress_bar import *
 import theano as _th
 
+
 def validate(dataset_x, dataset_y, model, epoch, batch_size):
     progress = make_progressbar('Testing', epoch, len(dataset_x))
     progress.start()
diff --git a/examples/Segmentation/data.py b/examples/Segmentation/data.py
new file mode 100644
index 0000000..698f111
--- /dev/null
+++ b/examples/Segmentation/data.py
@@ -0,0 +1,54 @@
+import glob
+import os
+import numpy as np
+import scipy as sp
+import scipy.misc  # `import scipy` alone does not load the `misc` submodule
+import tarfile
+
+# Python 2/3 compatibility.
+try:
+    from urllib.request import urlretrieve
+except ImportError:
+    from urllib import urlretrieve
+
+
+def load_data():
+    """Load the iccv09Data images and their region labels.
+
+    The archive is downloaded and unpacked next to this file the first time
+    the function runs, i.e. when the ``iccv09Data`` folder is missing.
+
+    Returns a pair ``(set_x, set_y)`` of equally long lists: ``set_x[i]`` is
+    the i-th image with its last axis moved to the front and ``set_y[i]`` is
+    the matching array read from ``labels/<id>.regions.txt``.
+    """
+    # Anchor every path at this file so the result does not depend on the
+    # current working directory.
+    base_dir = os.path.dirname(os.path.abspath(__file__))
+    data_folder = os.path.join(base_dir, 'iccv09Data')
+    if not os.path.isdir(data_folder):
+        tar_file = os.path.join(base_dir, 'data.tar.gz')
+        origin = 'http://dags.stanford.edu/data/iccv09Data.tar.gz'
+        print('Downloading data from {}'.format(origin))
+        try:
+            urlretrieve(origin, tar_file)
+            # NOTE: the archive comes over plain HTTP and is extracted
+            # without validating member paths.
+            with tarfile.open(tar_file) as tar:
+                tar.extractall(base_dir)
+        finally:
+            # Do not leave a partial or already unpacked archive behind.
+            if os.path.exists(tar_file):
+                os.remove(tar_file)
+
+    image_files = glob.glob(os.path.join(data_folder, 'images', '*.jpg'))
+
+    set_x = []
+    set_y = []
+    for image_file in image_files:
+        file_id = os.path.splitext(os.path.basename(image_file))[0]
+        labels = np.loadtxt(os.path.join(data_folder, 'labels', file_id + '.regions.txt'))
+        set_x.append(sp.misc.imread(image_file).transpose(2, 0, 1))
+        set_y.append(labels)
+
+    return set_x, set_y
diff --git a/examples/Segmentation/model.py b/examples/Segmentation/model.py
new file mode 100644
index 0000000..cc3a304
--- /dev/null
+++ b/examples/Segmentation/model.py
@@ -0,0 +1,47 @@
+import beacon8 as bb8
+
+
+def _add_conv_bn_relu(model, n_in, n_out):
+    """Append a bias-free 3x3 convolution (stride 1, padding 1), batch norm and ReLU."""
+    model.add(bb8.SpatialConvolutionCUDNN(n_in, n_out, 3, 3, 1, 1, 1, 1, with_bias=False))
+    model.add(bb8.BatchNormalization(n_out))
+    model.add(bb8.ReLU())
+
+
+def lenet():
+    """Build the segmentation network as a ``bb8.Sequential``.
+
+    Four stages of two conv/batch-norm/ReLU groups each are followed by a 4x4
+    convolution, dropout and a 1x1 convolution with 8 output planes.  Layers
+    wrapped in ``DuringTraining`` / ``DuringTesting`` give the two modes
+    different pooling and output heads.
+    """
+    model = bb8.Sequential()
+    model.add(bb8.BatchNormalization(3))
+
+    for n_in, n_out in ((3, 32), (32, 64), (64, 128), (128, 256)):
+        _add_conv_bn_relu(model, n_in, n_out)
+        _add_conv_bn_relu(model, n_out, n_out)
+        # Training mode pools with stride 2.
+        model.add(bb8.DuringTraining(bb8.SpatialMaxPoolingCUDNN(3, 3, 2, 2, 1, 1)))
+        # Testing mode pools with stride 1 and then applies OverfeatRoll.
+        model.add(bb8.DuringTesting(bb8.SpatialMaxPoolingCUDNN(3, 3, 1, 1, 1, 1)))
+        model.add(bb8.DuringTesting(bb8.OverfeatRoll()))
+
+    model.add(bb8.SpatialConvolutionCUDNN(256, 512, 4, 4, with_bias=False))
+    model.add(bb8.BatchNormalization(512))
+    model.add(bb8.ReLU())
+    model.add(bb8.Dropout(0.5))
+
+    model.add(bb8.SpatialConvolutionCUDNN(512, 8, 1, 1))
+
+    # Training head: flatten to rows of 8 scores and apply a plain softmax.
+    model.add(bb8.DuringTraining(bb8.Reshape(-1, 8)))
+    model.add(bb8.DuringTraining(bb8.SoftMax()))
+
+    # Testing head: spatial softmax, then one unroll per roll applied above.
+    model.add(bb8.DuringTesting(bb8.SpatialSoftMax()))
+    for _ in range(4):
+        model.add(bb8.DuringTesting(bb8.OverfeatUnroll()))
+
+    return model
diff --git a/examples/Segmentation/progress_bar.py b/examples/Segmentation/progress_bar.py
new file mode 100644
index 0000000..7481efc
--- /dev/null
+++ b/examples/Segmentation/progress_bar.py
@@ -0,0 +1,14 @@
+from progressbar import ProgressBar, Counter, Percentage, Bar, ETA
+
+
+def make_progressbar(mode, epoch, data_size):
+    """Create a ProgressBar labelled with ``mode`` and ``epoch`` whose maximum is ``data_size``."""
+    label = mode + ' epoch #'
+    widgets = [
+        label, str(epoch),
+        ', processed ', Counter(), ' of ', str(data_size),
+        ' (', Percentage(), ')',
+        ' ', Bar(),
+        ' ', ETA(),
+    ]
+    return ProgressBar(maxval=data_size, widgets=widgets)
diff --git a/examples/Segmentation/run.py b/examples/Segmentation/run.py
new file mode 100644
index 0000000..0971c8f
--- /dev/null
+++ b/examples/Segmentation/run.py
@@ -0,0 +1,37 @@
+import beacon8 as bb8
+import beacon8.optimizers as optim
+from data import load_data
+from train import train
+from test import validate
+from model import lenet
+from sklearn.cross_validation import train_test_split
+
+
+def main(params):
+    """Train the segmentation model and periodically validate it.
+
+    ``params`` must provide ``'lr'`` (passed to SGD) and ``'batch_size'``.
+    Every 100th epoch, starting with epoch 0, an extra pass in ``'stat'``
+    mode is run over the training set before the model is switched to
+    evaluation mode and validated on the held-out part of the data.
+    """
+    set_x, set_y = load_data()
+    train_set_x, test_set_x, train_set_y, test_set_y = train_test_split(set_x, set_y, train_size=0.8)
+    model = lenet()
+
+    criterion = bb8.ClassNLLCriterion()
+    optimiser = optim.SGD(lr=params['lr'])
+    batch_size = params['batch_size']
+
+    for epoch in range(1000):
+        model.training()
+        train(train_set_x, train_set_y, model, optimiser, criterion, epoch, batch_size)
+
+        if epoch % 100 == 0:
+            train(train_set_x, train_set_y, model, optimiser, criterion, epoch, batch_size, 'stat')
+            model.evaluate()
+            validate(test_set_x, test_set_y, model, epoch)
+
+
+if __name__ == "__main__":
+    main({'lr': 0.1, 'batch_size': 64})
diff --git a/examples/Segmentation/test.py b/examples/Segmentation/test.py
new file mode 100644
index 0000000..cd3bd49
--- /dev/null
+++ b/examples/Segmentation/test.py
@@ -0,0 +1,58 @@
+import numpy as np
+from progress_bar import *
+import matplotlib.pyplot as plt
+from matplotlib.pyplot import pcolor
+
+
+def validate(dataset_x, dataset_y, model, epoch, visualize_after=100):
+    """Print the per-pixel classification accuracy of ``model`` on a dataset.
+
+    Every image is zero-padded by 32 pixels on each spatial border and passed
+    through ``model.forward`` as a batch of one; the argmax over axis 0 of
+    the first output, cropped to the label map's size, is the prediction.
+    Pixels with a negative label are left out of the score.  When ``epoch``
+    exceeds ``visualize_after`` prediction, labels and image are plotted.
+    """
+    progress = make_progressbar('Testing', epoch, len(dataset_x))
+    progress.start()
+
+    correct_pixels = 0
+    total_pixels = 0
+
+    crop_size = 64
+    border = crop_size // 2
+
+    for j, (image, labels) in enumerate(zip(dataset_x, dataset_y)):
+        progress.update(j)
+
+        # add padding to have the same size
+        padded = np.pad(image,
+                        ((0, 0), (border, border), (border, border)),
+                        mode='constant',
+                        constant_values=0)
+
+        # a single image mini-batch
+        padded = padded.reshape((1,) + padded.shape)
+        scores = model.forward(padded)[0]
+        prediction = np.argmax(scores, axis=0)[:labels.shape[0], :labels.shape[1]]
+
+        # Count with numpy instead of iterating the arrays with builtin sum().
+        labelled = labels >= 0
+        correct_pixels += np.count_nonzero((labels == prediction)[labelled])
+        total_pixels += np.count_nonzero(labelled)
+
+        if epoch > visualize_after:
+            plt.figure(1)
+            plt.subplot(311)
+            pcolor(prediction[::-1, ]+1, cmap='RdBu', vmin=0, vmax=8)
+            plt.subplot(312)
+            pcolor(labels[::-1, ]+1, cmap='RdBu', vmin=0, vmax=8)
+            plt.subplot(313)
+            plt.imshow(image.transpose((1, 2, 0)))
+            plt.show()
+
+    progress.finish()
+
+    # An empty dataset (or one without labelled pixels) has no defined accuracy.
+    accuracy = float(correct_pixels) / total_pixels if total_pixels else float('nan')
+    print("Classification accuracy: " + str(accuracy))
diff --git a/examples/Segmentation/train.py b/examples/Segmentation/train.py
new file mode 100644
index 0000000..0d120f5
--- /dev/null
+++ b/examples/Segmentation/train.py
@@ -0,0 +1,66 @@
+import numpy as np
+from progress_bar import *
+import theano as _th
+
+
+def train(dataset_x, dataset_y, model, optimiser, criterion, epoch, batch_size, mode=None):
+    """Run one epoch of random-crop mini-batches over the dataset.
+
+    For every sample a pixel with a label greater than -1 is drawn at random
+    and the 64x64 window of the zero-padded image starting at that pixel's
+    coordinates (i.e. centred on it in the unpadded image) becomes the
+    input, the pixel's label the target.  Samples left over after the last
+    full batch are skipped.
+
+    With ``mode=None`` each batch triggers a gradient step through
+    ``optimiser``; any other value only calls
+    ``model.accumulate_statistics`` on the batch inputs.
+
+    NOTE: an image without any label greater than -1 makes the sampling loop
+    spin forever.
+    """
+    progress = make_progressbar('Training', epoch, len(dataset_x))
+    progress.start()
+
+    shuffle = np.random.permutation(len(dataset_x))
+
+    crop_size = 64
+    border = crop_size // 2
+
+    mini_batch_input = np.empty(shape=(batch_size, 3, crop_size, crop_size),
+                                dtype=_th.config.floatX)
+    mini_batch_targets = np.empty(shape=(batch_size, ), dtype=_th.config.floatX)
+
+    for j in range(len(dataset_x) // batch_size):
+        for k in range(batch_size):
+            sample = shuffle[j * batch_size + k]
+            image = dataset_x[sample]
+            labels = dataset_y[sample]
+
+            # The padded image does not depend on the drawn position, so
+            # build it once per sample instead of once per retry.
+            padded = np.pad(image,
+                            ((0, 0), (border, border), (border, border)),
+                            mode='constant',
+                            constant_values=0)
+
+            # Redraw until the pixel carries a usable label.
+            while True:
+                y = np.random.randint(0, image.shape[1])
+                x = np.random.randint(0, image.shape[2])
+                if labels[y, x] > -1:
+                    break
+
+            mini_batch_input[k] = padded[:, y: y + crop_size, x: x + crop_size]
+            mini_batch_targets[k] = labels[y, x]
+
+        if mode is None:
+            model.zero_grad_parameters()
+            model.accumulate_gradients(mini_batch_input, mini_batch_targets, criterion)
+            optimiser.update_parameters(model)
+        else:
+            model.accumulate_statistics(mini_batch_input)
+
+        progress.update(j * batch_size)
+
+    progress.finish()