diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..f459e72
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,50 @@
+cmake_minimum_required (VERSION 2.8)
+project (fusibile)
+
+find_package(OpenCV REQUIRED )
+#find_package(Armadillo REQUIRED)
+find_package(CUDA 6.0 REQUIRED ) # For Cuda Managed Memory and c++11
+
+include_directories(${OpenCV_INCLUDE_DIRS})
+include_directories(.)
+
+#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-O3 --use_fast_math --ptxas-options=-v -std=c++11 --compiler-options -Wall -gencode arch=compute_30,code=sm_30 -gencode arch=compute_52,code=sm_52)
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-O3 --use_fast_math --ptxas-options=-v -std=c++11 --compiler-options -Wall -gencode arch=compute_52,code=sm_52)
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+    add_definitions(-std=c++11)
+    #add_definitions(-march=native)
+endif()
+
+find_package(OpenMP)
+if (OPENMP_FOUND)
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+
+# for YouCompleteMe
+set( CMAKE_EXPORT_COMPILE_COMMANDS 1 )
+
+# For compilation ...
+# Specify target & source files to compile it from
+cuda_add_executable(
+    fusibile
+    cameraGeometryUtils.h
+    vector_operations.h
+    camera.h
+    globalstate.h
+    algorithmparameters.h
+    cameraparameters.h
+    linestate.h
+    groundTruthUtils.h
+    displayUtils.h
+    mathUtils.h
+    fileIoUtils.h
+    fusibile.cu
+    main.cpp
+    )
+
+# For linking ...
+# Specify target & libraries to link it with
+target_link_libraries(fusibile
+    ${OpenCV_LIBS}
+    )
diff --git a/algorithmparameters.h b/algorithmparameters.h
new file mode 100644
index 0000000..073f8d2
--- /dev/null
+++ b/algorithmparameters.h
@@ -0,0 +1,93 @@
+#pragma once
+#include "managed.h"
+
+//cost function
+enum { PM_COST = 0, CENSUS_TRANSFORM = 1, ADAPTIVE_CENSUS = 2, CENSUS_SELFSIMILARITY = 3, PM_SELFSIMILARITY = 4, ADCENSUS = 5, ADCENSUS_SELFSIMILARITY = 6, SPARSE_CENSUS = 7 };
+
+//cost combination
+enum { COMB_ALL = 0, COMB_BEST_N = 1, COMB_ANGLE = 2, COMB_GOOD = 3};
+
+
+class AlgorithmParameters : public Managed{
+public:
+    int algorithm; // algorithm cost type
+    float max_disparity; // maximal disparity value CUDA
+    float min_disparity; // minimum disparity value (default 0) CUDA
+    int box_hsize; // filter kernel width CUDA
+    int box_vsize; // filter kernel height CUDA
+    float tau_color; // PM_COST max. threshold for color CUDA
+    float tau_gradient; // PM_COST max. threshold for gradient CUDA
+    float alpha; // PM_COST weighting between color and gradient CUDA
+    float gamma; // parameter for weight function (used e.g. in PM_COST) CUDA
+    int border_value; // what value should pixel at extended border get (constant or replicate -1)
+    int iterations; // number of iterations
+    bool color_processing; // use color processing or not (otherwise just grayscale processing)
+    float dispTol; //PM Stereo: 1, PM Huber: 0.5
+    float normTol; // 0.1 ... about 5.7 degrees
+    float census_epsilon; //for census transform
+    int self_similarity_n; // number of pixels considered for self similarity
+    float cam_scale; //used to rescale K in case of rescaled image size
+    int num_img_processed; //number of images that are processed as reference images
+    float costThresh; // threshold to decide whether disparity/depth is valid or not
+    float good_factor; // for cost aggregation/combination good: factor for truncation CUDA
+    int n_best; // CUDA
+    int cost_comb; // CUDA
+    bool viewSelection;
+    float depthMin; // CUDA
+    float depthMax; // CUDA
+    // hack XXX
+    int cols;
+    int rows;
+    // fuse
+    bool storePlyFiles;
+    bool gt_compare;
+    bool gt_normal_compare;
+
+    //threshold for consistency check
+    float depthThresh;
+    float normalThresh ;
+
+    //how many views need to be consistent? (for update: numConsistentThresh+1)
+    int numConsistentThresh;
+    bool saveTexture;
+
+    AlgorithmParameters(){
+        algorithm           = 0; // algorithm cost type
+        max_disparity     = 256.0f; // maximal disparity value CUDA
+        min_disparity     = 0.0f; // minimum disparity value (default 0) CUDA
+        box_hsize           = 15; // filter kernel width CUDA
+        box_vsize           = 15; // filter kernel height CUDA
+        tau_color         = 10; // PM_COST max. threshold for color CUDA
+        tau_gradient      = 2.0f; // PM_COST max. threshold for gradient CUDA
+        alpha             = 0.9f; // PM_COST weighting between color and gradient CUDA
+        gamma             = 10.0f; // parameter for weight function (used e.g. in PM_COST) CUDA
+        border_value        = -1; // what value should pixel at extended border get (constant or replicate -1)
+        iterations          = 8; // number of iterations
+        color_processing   = false; // use color processing or not (otherwise just grayscale processing)
+        dispTol           = 1.0; //PM Stereo: 1, PM Huber: 0.5
+        normTol           = 0.1f; // 0.1 ... about 5.7 degrees
+        census_epsilon    = 2.5f; //for census transform
+        self_similarity_n   = 50; // number of pixels considered for self similarity
+        cam_scale         = 1.0f; //used to rescale K in case of rescaled image size
+        num_img_processed   = 1; //number of images that are processed as reference images
+        costThresh        = 40.0f; // threshold to decide whether disparity/depth is valid or not
+         good_factor       = 1.5f; // for cost aggregation/combination good: factor for truncation CUDA
+        n_best              = 2; // CUDA
+        cost_comb           = 1; // CUDA
+        viewSelection      = false;
+        depthMin          = 2.0f; // CUDA
+        depthMax          = 20.0f; // CUDA
+
+        storePlyFiles = true;
+        gt_compare = false;
+        gt_normal_compare = false;
+
+        //threshold for consistency check
+        depthThresh = 0.5f;
+        normalThresh = 0.52f;
+
+        //how many views need to be consistent? (for update: numConsistentThresh+1)
+        numConsistentThresh = 2;
+        saveTexture = true;
+    }
+};
diff --git a/camera.h b/camera.h
new file mode 100644
index 0000000..7760a43
--- /dev/null
+++ b/camera.h
@@ -0,0 +1,52 @@
+#pragma once
+#include "managed.h"
+#include <vector_types.h>
+
+class Camera_cu : public Managed {
+public:
+    float* P;
+    /*float* P_col3;*/
+    float4 P_col34;
+    float* P_inv;
+    float* M_inv;
+    float* R;
+    /*float* t;*/
+    float4 t4;
+    /*float* C;*/
+    float4 C4;
+    float fx;
+    float fy;
+    float f;
+    float alpha;
+    float baseline;
+    bool reference;
+    float depthMin; //this could be figured out from the bounding volume (not done right now, but that's why this parameter is here as well and not only in AlgorithmParameters)
+    float depthMax; //this could be figured out from the bounding volume (not done right now, but that's why this parameter is here as well and not only in AlgorithmParameters)
+    //int id; //corresponds to the image name id (eg. 0-10), independent of order in argument list, just dependent on name
+    char* id;
+	float* K;
+	float* K_inv;
+    Camera_cu()
+    {
+		baseline = 0.54f;
+		reference = false;
+		depthMin = 2.0f; //this could be figured out from the bounding volume (not done right now, but that's why this parameter is here as well and not only in AlgorithmParameters)
+		depthMax = 20.0f;
+        cudaMallocManaged (&P,       sizeof(float) * 4 * 4);
+        cudaMallocManaged (&P_inv,   sizeof(float) * 4 * 4);
+        cudaMallocManaged (&M_inv,   sizeof(float) * 4 * 4);
+        cudaMallocManaged (&K,       sizeof(float) * 4 * 4);
+        cudaMallocManaged (&K_inv,   sizeof(float) * 4 * 4);
+        cudaMallocManaged (&R,       sizeof(float) * 4 * 4);
+    }
+    ~Camera_cu()
+    {
+        cudaFree (P);
+        cudaFree (P_inv);
+        cudaFree (M_inv);
+        cudaFree (K);
+        cudaFree (K_inv);
+        cudaFree (R);
+    }
+
+};
diff --git a/cameraGeometryUtils.h b/cameraGeometryUtils.h
new file mode 100644
index 0000000..ef61bc4
--- /dev/null
+++ b/cameraGeometryUtils.h
@@ -0,0 +1,495 @@
+/*
+ * utility functions for camera geometry related stuff
+ * most of them from: "Multiple View Geometry in computer vision" by Hartley and Zisserman
+ */
+
+#pragma once
+#include "mathUtils.h"
+#include <limits>
+#include <signal.h>
+
+Mat_<float> getColSubMat ( Mat_<float> M, int* indices, int numCols ) {
+    Mat_<float> subMat = Mat::zeros ( M.rows,numCols,CV_32F );
+    for ( int i = 0; i < numCols; i++ ) {
+        M.col ( indices[i] ).copyTo ( subMat.col ( i ) );
+    }
+    return subMat;
+}
+
+// Multi View Geometry, page 163
+Mat_<float> getCameraCenter ( Mat_<float> &P ) {
+    Mat_<float> C = Mat::zeros ( 4,1,CV_32F );
+
+    Mat_<float> M = Mat::zeros ( 3,3,CV_32F );
+
+    int xIndices[] = { 1, 2, 3 };
+    int yIndices[] = { 0, 2, 3 };
+    int zIndices[] = { 0, 1, 3 };
+    int tIndices[] = { 0, 1, 2 };
+
+    // x coordinate
+    M = getColSubMat ( P,xIndices,sizeof ( xIndices )/sizeof ( xIndices[0] ) );
+    C ( 0,0 ) = ( float )determinant ( M );
+
+    // y coordinate
+    M = getColSubMat ( P,yIndices,sizeof ( yIndices )/sizeof ( yIndices[0] ) );
+    C ( 1,0 ) = - ( float )determinant ( M );
+
+    // z coordinate
+    M = getColSubMat ( P,zIndices,sizeof ( zIndices )/sizeof ( zIndices[0] ) );
+    C ( 2,0 ) = ( float )determinant ( M );
+
+    // t coordinate
+    M = getColSubMat ( P,tIndices,sizeof ( tIndices )/sizeof ( tIndices[0] ) );
+    C ( 3,0 ) = - ( float )determinant ( M );
+
+    return C;
+}
+
+inline Vec3f get3Dpoint ( Camera &cam, float x, float y, float depth ) {
+    // in case camera matrix is not normalized: see page 162, then depth might not be the real depth but w and depth needs to be computed from that first
+
+    Mat_<float> pt = Mat::ones ( 3,1,CV_32F );
+    pt ( 0,0 ) = x;
+    pt ( 1,0 ) = y;
+
+    //formula taken from page 162 (alternative expression)
+    Mat_<float> ptX = cam.M_inv * ( depth*pt - cam.P.col ( 3 ) );
+    return Vec3f ( ptX ( 0 ),ptX ( 1 ),ptX ( 2 ) );
+}
+
+inline Vec3f get3Dpoint ( Camera &cam, int x, int y, float depth ){
+    return get3Dpoint(cam,(float)x,(float)y,depth);
+}
+
+// get the viewing ray for a pixel position of the camera
+static inline Vec3f getViewVector ( Camera &cam, int x, int y, bool rectified ) {
+
+    //get some point on the line (the other point on the line is the camera center)
+    Vec3f ptX = get3Dpoint ( cam,x,y,1.0f );
+
+    //get vector between camera center and other point on the line
+    Vec3f v = ptX - cam.C;
+    return normalize ( v );
+}
+
+//get d parameter of plane pi = [nT, d]T, which is the distance of the plane to the camera center
+float inline getPlaneDistance ( Vec3f &normal, Vec3f &X ) {
+    /*return -normal ( 0 )*X ( 0 )-normal ( 1 )*X ( 1 )-normal ( 2 )*X ( 2 );*/
+    return -(normal.dot(X));
+}
+
+static float getD ( Vec3f &normal, int x0, int y0, float depth, bool rectified, Camera &cam ) {
+    Vec3f pt;
+    {
+        pt = get3Dpoint ( cam, (float)x0, (float)y0, depth );
+    }
+    /* XXX WTF ?
+    float d = getPlaneDistance ( normal,pt );
+    if ( d != d ) {
+        d = FLT_MAX;
+    }
+    return d;
+    */
+    return getPlaneDistance ( normal,pt );
+}
+
+Mat_<float> getTransformationMatrix ( Mat_<float> R, Mat_<float> t ) {
+    Mat_<float> transMat = Mat::eye ( 4,4, CV_32F );
+    //Mat_<float> Rt = - R * t;
+    R.copyTo ( transMat ( Range ( 0,3 ),Range ( 0,3 ) ) );
+    t.copyTo ( transMat ( Range ( 0,3 ),Range ( 3,4 ) ) );
+
+    return transMat;
+}
+
+/* compute depth value from disparity or disparity value from depth
+ * Input:  f         - focal length in pixel
+ *         baseline  - baseline between cameras (in meters)
+ *         d - either disparity or depth value
+ * Output: either depth or disparity value
+ */
+float disparityDepthConversion ( float f, float baseline, float d ) {
+    /*if ( d == 0 )*/
+        /*return FLT_MAX;*/
+    return f * baseline / d;
+}
+
+Mat_<float> getTransformationReferenceToOrigin ( Mat_<float> R,Mat_<float> t ) {
+    // create rotation translation matrix
+    Mat_<float> transMat_original = getTransformationMatrix ( R,t );
+
+    // get transformation matrix for [R1|t1] = [I|0]
+    return transMat_original.inv ();
+}
+
+void transformCamera ( Mat_<float> R,Mat_<float> t, Mat_<float> transform, Camera &cam, Mat_<float> K ) {
+    // create rotation translation matrix
+    Mat_<float> transMat_original = getTransformationMatrix ( R,t );
+
+    //transform
+    Mat_<float> transMat_t = transMat_original * transform;
+
+    // compute translated P (only consider upper 3x4 matrix)
+    cam.P = K * transMat_t ( Range ( 0,3 ),Range ( 0,4 ) );
+    // set R and t
+    cam.R = transMat_t ( Range ( 0,3 ),Range ( 0,3 ) );
+    cam.t = transMat_t ( Range ( 0,3 ),Range ( 3,4 ) );
+    // set camera center C
+    Mat_<float> C = getCameraCenter ( cam.P );
+
+    C = C / C ( 3,0 );
+    cam.C = Vec3f ( C ( 0,0 ),C ( 1,0 ),C ( 2,0 ) );
+}
+
+Mat_<float> scaleK ( Mat_<float> K, float scaleFactor ) {
+    //compute focal length in mm (for APS-C sensor)
+    //float imgwidth_original = 3072.f;
+    //float imgheight_original = 2048.f;
+    //float ccdWidth = 22.7f;
+    //float ccdHeight = 15.1f;
+    //float f_mm = alpha_x * ccdWidth / imgwidth_original;
+    //float f_mm2 = alpha_y * ccdHeight / imgheight_original;
+    //alpha_x = f_mm / (ccdWidth / (imgwidth_original/scaleFactor));
+    //alpha_y = f_mm2 / (ccdHeight / (imgheight_original/scaleFactor));
+    //cout << "focal length: " << f_mm << "/" << f_mm2 << " , " << alpha_x << "/" << alpha_y << endl;
+
+    Mat_<float> K_scaled = K.clone();
+    //scale focal length
+    K_scaled ( 0,0 ) = K ( 0,0 ) / scaleFactor;
+    K_scaled ( 1,1 ) = K ( 1,1 ) / scaleFactor;
+    //scale center point
+    K_scaled ( 0,2 ) = K ( 0,2 ) / scaleFactor;
+    K_scaled ( 1,2 ) = K ( 1,2 ) / scaleFactor;
+
+    return K_scaled;
+}
+void copyOpencvVecToFloat4 ( Vec3f &v, float4 *a)
+{
+    a->x = v(0);
+    a->y = v(1);
+    a->z = v(2);
+}
+void copyOpencvVecToFloatArray ( Vec3f &v, float *a)
+{
+    a[0] = v(0);
+    a[1] = v(1);
+    a[2] = v(2);
+}
+void copyOpencvMatToFloatArray ( Mat_<float> &m, float **a)
+{
+
+    for (int pj=0; pj<m.rows ; pj++)
+        for (int pi=0; pi<m.cols ; pi++)
+        {
+            (*a)[pi+pj*m.cols] = m(pj,pi);
+        }
+}
+
+/* get camera parameters (e.g. projection matrices) from file
+ * Input:  inputFiles  - pathes to calibration files
+ *         scaleFactor - if image was rescaled we need to adapt calibration matrix K accordingly
+ * Output: camera parameters
+ */
+CameraParameters getCameraParameters ( CameraParameters_cu &cpc, InputFiles inputFiles, float depthMin, float depthMax, float scaleFactor = 1.0f, bool transformP = true ) {
+
+    CameraParameters params;
+    size_t numCameras = 2;
+    params.cameras.resize ( numCameras );
+    //get projection matrices
+
+    //load projection matrix from file (e.g. for Kitti)
+    if ( !inputFiles.calib_filename.empty () ) {
+        //two view case
+        readCalibFileKitti ( inputFiles.calib_filename,params.cameras[0].P,params.cameras[1].P );
+        params.rectified = false; // for Kitti data is actually rectified, set this to true for computation in disparity space
+
+        /*
+        //four view case
+        numCameras = 4;
+        params.cameras.resize ( numCameras );
+        readCalibFileKitti ( inputFiles.calib_filename,params.cameras[0].P,params.cameras[1].P );
+        params.rectified = false; // for Kitti data is actually rectified, set this to true for computation in disparity space
+
+        Mat_<float> Rt_110 = Mat::eye ( 4, 4, CV_32F );
+        //110
+        // 0.9999    0.0043    0.0118    0.0008
+        //-0.0044    1.0000    0.0027    0.0008
+        //-0.0118   -0.0028    0.9999   -0.6181
+        Rt_110(0,0) = 0.9999f;
+        Rt_110(0,1) = 0.0043f;
+        Rt_110(0,2) = 0.0118f;
+        Rt_110(0,3) = 0.0008f;
+        Rt_110(1,0) = -0.0044f;
+        Rt_110(1,1) = 1.0000f;
+        Rt_110(1,2) = 0.0027f;
+        Rt_110(1,3) = 0.0008f;
+        Rt_110(2,0) = -0.0118f;
+        Rt_110(2,1) = -0.0028f;
+        Rt_110(2,2) = 0.9999f;
+        Rt_110(2,3) =-0.6181f;
+
+        params.cameras[2].P = params.cameras[0].P.clone();
+        params.cameras[3].P = params.cameras[1].P.clone();
+
+        Mat_<float> K,R,T,C,t;
+        //left camera
+        decomposeProjectionMatrix ( params.cameras[2].P,K,R,T);
+        // get 3-dimensional translation vectors and camera center (divide by augmented component)
+        C = T ( Range ( 0,3 ),Range ( 0,1 ) ) / T ( 3,0 );
+        t = -R * C;
+        transformCamera ( R,t,Rt_110,params.cameras[2],K );
+
+        //right camera
+        decomposeProjectionMatrix ( params.cameras[3].P,K,R,T);
+        // get 3-dimensional translation vectors and camera center (divide by augmented component)
+        C = T ( Range ( 0,3 ),Range ( 0,1 ) ) / T ( 3,0 );
+        t = -R * C;
+        transformCamera ( R,t,Rt_110,params.cameras[3],K );
+
+
+        cout << params.cameras[0].P <<endl;
+        cout << params.cameras[1].P <<endl;
+        cout << params.cameras[2].P <<endl;
+        cout << params.cameras[3].P <<endl;
+        */
+    }
+    Mat_<float> KMaros = Mat::eye ( 3, 3, CV_32F );
+    KMaros(0,0) = 8066.0;
+    KMaros(1,1) = 8066.0;
+    KMaros(0,2) = 2807.5;
+    KMaros(1,2) = 1871.5;
+
+    // Load pmvs files
+    if ( !inputFiles.pmvs_folder.empty () ) {
+        numCameras = inputFiles.img_filenames.size ();
+        params.cameras.resize ( numCameras );
+        for ( size_t i = 0; i < numCameras; i++ ) {
+            int lastindex = inputFiles.img_filenames[i].find_last_of(".");
+            string filename_without_extension = inputFiles.img_filenames[i].substr(0, lastindex);
+            readPFileStrechaPmvs ( inputFiles.p_folder + filename_without_extension + ".txt",params.cameras[i].P );
+            unsigned found = inputFiles.img_filenames[i].find_last_of ( "." );
+            //params.cameras[i].id = atoi ( inputFiles.img_filenames[i].substr ( 0,found ).c_str () );
+            params.cameras[i].id = inputFiles.img_filenames[i].substr ( 0,found ).c_str ();
+            // params.cameras[i].P = KMaros * params.cameras[i].P;
+            //cout << params.cameras[i].P << endl;
+
+        }
+    }
+    //load projection matrix from file (e.g. for Strecha)
+    cout << "P folder is " << inputFiles.p_folder << endl;
+    if ( !inputFiles.p_folder.empty () ) {
+        numCameras = inputFiles.img_filenames.size ();
+        params.cameras.resize ( numCameras );
+        for ( size_t i = 0; i < numCameras; i++ ) {
+            readPFileStrechaPmvs ( inputFiles.p_folder + inputFiles.img_filenames[i] + ".P",params.cameras[i].P );
+            unsigned found = inputFiles.img_filenames[i].find_last_of ( "." );
+            //params.cameras[i].id = atoi ( inputFiles.img_filenames[i].substr ( 0,found ).c_str () );
+            params.cameras[i].id = inputFiles.img_filenames[i].substr ( 0,found ).c_str ();
+            /*params.cameras[i].P = KMaros * params.cameras[i].P;*/
+        }
+
+    }
+    //load projection matrix from file (e.g. for Middlebury)
+    if ( !inputFiles.krt_file.empty () ) {
+        numCameras = inputFiles.img_filenames.size ();
+        params.cameras.resize ( numCameras );
+        /*cout << "Num Cameras " << numCameras << endl;*/
+        size_t numImages = inputFiles.img_filenames.size ();
+        readKRtFileMiddlebury ( inputFiles.krt_file, params.cameras, inputFiles);
+        for ( size_t i = 0; i < numCameras; i++ ) {
+            unsigned found = inputFiles.img_filenames[i].find_last_of ( "." );
+            params.cameras[i].id = inputFiles.img_filenames[i].substr ( 0,found ).c_str ();
+        }
+    }
+
+    /*cout << "KMaros is" << endl;*/
+    /*cout << KMaros << endl;*/
+    cout << "numCameras is " << numCameras << endl;
+
+
+    // decompose projection matrices into K, R and t
+    vector<Mat_<float> > K ( numCameras );
+    vector<Mat_<float> > R ( numCameras );
+    vector<Mat_<float> > T ( numCameras );
+
+    vector<Mat_<float> > C ( numCameras );
+    vector<Mat_<float> > t ( numCameras );
+
+    for ( int i = 0; i < numCameras; i++ ) {
+        decomposeProjectionMatrix ( params.cameras[i].P,K[i],R[i],T[i] );
+
+        /*cout << "K: " << K[i] << endl;*/
+        /*cout << "R: " << R[i] << endl;*/
+        /*cout << "T: " << T[i] << endl;*/
+
+        // get 3-dimensional translation vectors and camera center (divide by augmented component)
+        C[i] = T[i] ( Range ( 0,3 ),Range ( 0,1 ) ) / T[i] ( 3,0 );
+        t[i] = -R[i] * C[i];
+
+        /*cout << "C: " << C[i] << endl;*/
+        /*cout << "t: " << t[i] << endl;*/
+    }
+
+    // transform projection matrices (R and t part) so that P1 = K [I | 0]
+    //computeTranslatedProjectionMatrices(R1, R2, t1, t2, params);
+    Mat_<float> transform = Mat::eye ( 4,4,CV_32F );
+
+    if ( transformP )
+        transform = getTransformationReferenceToOrigin ( R[0],t[0] );
+    /*cout << "transform is " << transform << endl;*/
+    params.cameras[0].reference = true;
+    params.idRef = 0;
+    //cout << "K before scale is" << endl;
+    //cout << K[0] << endl;
+    /*K[0](0,1)=0;*/
+    /*K[0](2,2)=1;*/
+
+    //assuming K is the same for all cameras
+    params.K = scaleK ( K[0],scaleFactor );
+    params.K_inv = params.K.inv ();
+    // get focal length from calibration matrix
+    params.f = params.K ( 0,0 );
+
+    for ( int i = 0; i < numCameras; i++ ) {
+        params.cameras[i].K = scaleK(K[i],scaleFactor);
+        params.cameras[i].K_inv = params.cameras[i].K.inv ( );
+        //params.cameras[i].f = params.cameras[i].K(0,0);
+
+
+        params.cameras[i].depthMin = depthMin;
+        params.cameras[i].depthMax = depthMax;
+
+        if ( !inputFiles.bounding_folder.empty () ) {
+            Vec3f ptBL, ptTR;
+            readBoundingVolume ( inputFiles.bounding_folder + inputFiles.img_filenames[i] + ".bounding",ptBL,ptTR );
+
+            //cout << "d1: " << getDepth ( ptBL,params.cameras[i].P ) <<endl;
+            //cout << "d2: " << getDepth ( ptTR,params.cameras[i].P ) <<endl;
+            /*
+            cout << "bounding volume: "<< ptBL << " / " << ptTR << endl;
+            Mat_<float> blMat = Mat::zeros(4,1,CV_32F);
+            blMat(0,0) = ptBL(0);
+            blMat(1,0) = ptBL(1);
+            blMat(2,0) = ptBL(2);
+            cout << "transformed BL: " << transform * blMat << endl;
+
+            Mat_<float> trMat = Mat::zeros(4,1,CV_32F);
+            trMat(0,0) = ptTR(0);
+            trMat(1,0) = ptTR(1);
+            trMat(2,0) = ptTR(2);
+            cout << "transformed TR: " << transform * trMat << endl;
+            */
+        }
+
+        transformCamera ( R[i],t[i], transform,    params.cameras[i],params.K );
+
+        params.cameras[i].P_inv = params.cameras[i].P.inv ( DECOMP_SVD );
+        params.cameras[i].M_inv = params.cameras[i].P.colRange ( 0,3 ).inv ();
+
+        // set camera baseline (if unknown we need to guess something)
+        //float b = (float)norm(t1,t2,NORM_L2);
+        params.cameras[i].baseline = 0.54f; //0.54 = Kitti baseline
+
+        // K
+        Mat_<float> tmpK = params.K.t ();
+        //copyOpencvMatToFloatArray ( params.K, &cpc.K);
+        //copyOpencvMatToFloatArray ( params.K_inv, &cpc.K_inv);
+        copyOpencvMatToFloatArray ( params.cameras[i].K, &cpc.cameras[i].K);
+        copyOpencvMatToFloatArray ( params.cameras[i].K_inv, &cpc.cameras[i].K_inv);
+        cpc.cameras[i].fy = params.K(1,1);
+        cpc.f = params.K(0,0);
+        cpc.cameras[i].f = params.K(0,0);
+        cpc.cameras[i].fx = params.K(0,0);
+        cpc.cameras[i].fy = params.K(1,1);
+        cpc.cameras[i].depthMin = params.cameras[i].depthMin;
+        cpc.cameras[i].depthMax = params.cameras[i].depthMax;
+        cpc.cameras[i].baseline = params.cameras[i].baseline;
+        cpc.cameras[i].reference = params.cameras[i].reference;
+
+        /*printf("MATRIXXX\n");*/
+        /*for (int pj=0; pj<3 ; pj++) {*/
+            /*for (int pi=0; pi<3 ; pi++)*/
+                /*cout << cpc.K[pi+pj*3] << " ";*/
+            /*cout << endl;*/
+        /*}*/
+
+        /*params.cameras[i].alpha = params.K ( 0,0 )/params.K(1,1);*/
+        cpc.cameras[i].alpha = params.K ( 0,0 )/params.K(1,1);
+        // Copy data to cuda structure
+        copyOpencvMatToFloatArray ( params.cameras[i].P,     &cpc.cameras[i].P);
+        copyOpencvMatToFloatArray ( params.cameras[i].P_inv, &cpc.cameras[i].P_inv);
+        copyOpencvMatToFloatArray ( params.cameras[i].M_inv, &cpc.cameras[i].M_inv);
+        //copyOpencvMatToFloatArray ( params.K,                &cpc.cameras[i].K);
+        //copyOpencvMatToFloatArray ( params.K_inv,            &cpc.cameras[i].K_inv);
+        copyOpencvMatToFloatArray ( params.cameras[i].K,                &cpc.cameras[i].K);
+        copyOpencvMatToFloatArray ( params.cameras[i].K_inv,            &cpc.cameras[i].K_inv);
+        copyOpencvMatToFloatArray ( params.cameras[i].R,     &cpc.cameras[i].R);
+        /*copyOpencvMatToFloatArray ( params.cameras[i].t, &cpc.cameras[i].t);*/
+        /*copyOpencvVecToFloatArray ( params.cameras[i].C, cpc.cameras[i].C);*/
+        copyOpencvVecToFloat4 ( params.cameras[i].C,         &cpc.cameras[i].C4);
+        cpc.cameras[i].t4.x = params.cameras[i].t(0);
+        cpc.cameras[i].t4.y = params.cameras[i].t(1);
+        cpc.cameras[i].t4.z = params.cameras[i].t(2);
+        Mat_<float> tmp = params.cameras[i].P.col(3);
+        /*cpc.cameras[i].P_col3[0] = tmp(0,0);*/
+        /*cpc.cameras[i].P_col3[1] = tmp(1,0);*/
+        /*cpc.cameras[i].P_col3[2] = tmp(2,0);*/
+        cpc.cameras[i].P_col34.x = tmp(0,0);
+        cpc.cameras[i].P_col34.y = tmp(1,0);
+        cpc.cameras[i].P_col34.z = tmp(2,0);
+
+        /*cout << "K for " << i << " is " << endl;*/
+        /*for (int pj=0; pj<3 ; pj++) {*/
+            /*for (int pi=0; pi<3 ; pi++)*/
+                /*cout << cpc.K[pi+pj*3] << " ";*/
+            /*cout << endl;*/
+        /*}*/
+        /*cout << "P for " << i << " is " << endl;*/
+        /*for (int pj=0; pj<3 ; pj++) {*/
+            /*for (int pi=0; pi<4 ; pi++)*/
+                /*cout << cpc.cameras[i].P[pi+pj*4] << " ";*/
+            /*cout << endl;*/
+        /*}*/
+        /*cout << "R for " << i << " is " << endl;*/
+        /*for (int pj=0; pj<3 ; pj++) {*/
+            /*for (int pi=0; pi<3 ; pi++)*/
+                /*cout << cpc.cameras[i].R[pi+pj*3] << " ";*/
+            /*cout << endl;*/
+        /*}*/
+        /*cout << "t for " << i << " is " << endl;*/
+        /*printf("%f %f %f\n", */
+               /*cpc.cameras[i].t4.x,*/
+               /*cpc.cameras[i].t4.y,*/
+               /*cpc.cameras[i].t4.z);*/
+        /*cout << endl;*/
+
+
+        Mat_<float> tmpKinv = params.K_inv.t ();
+
+        /*cout << "Camera " << i << endl;*/
+        /*cout << "P " << endl;*/
+        /*cout << params.cameras[i].P_A << endl;*/
+        /*cout << params.cameras[i].P << endl;*/
+
+        /*cout << "K " << endl;*/
+        /*cout << params.K_A << endl;*/
+        /*cout << params.K << endl;*/
+
+        /*cout << "R " << endl;*/
+        /*cout << params.cameras[i].R_A << endl;*/
+        /*cout << params.cameras[i].R << endl;*/
+
+        /*cout << "t " << endl;*/
+        /*cout << params.cameras[i].t_A << endl;*/
+        /*cout << params.cameras[i].t << endl;*/
+
+        /*cout << "C " << endl;*/
+        /*cout << params.cameras[i].C_A << endl;*/
+        /*cout << params.cameras[i].C << endl;*/
+        /*cout << endl;*/
+    }
+        //exit(1);
+
+    return params;
+}
diff --git a/cameraparameters.h b/cameraparameters.h
new file mode 100644
index 0000000..a5b785a
--- /dev/null
+++ b/cameraparameters.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "camera.h"
+#include "managed.h"
+#include "config.h"
+
+class __align__(128) CameraParameters_cu : public Managed {
+public:
+    float f; // used only for dummy depth -> disparity conversion
+    bool rectified;
+    Camera_cu cameras[MAX_IMAGES];
+    int idRef;
+    int cols;
+    int rows;
+    int* viewSelectionSubset;
+    int viewSelectionSubsetNumber;
+    CameraParameters_cu()
+    {
+		rectified = true;
+		idRef = 0;
+        cudaMallocManaged (&viewSelectionSubset, sizeof(int) * MAX_IMAGES);
+    }
+    ~CameraParameters_cu()
+    {
+        cudaFree (viewSelectionSubset);
+    }
+};
diff --git a/config.h b/config.h
new file mode 100644
index 0000000..845f3e6
--- /dev/null
+++ b/config.h
@@ -0,0 +1,280 @@
+#pragma once
+#define MAX_IMAGES 64
+
+#define BLOCKSIZE_W 11 + 1 // +1 for the gradient computation
+#define BLOCKSIZE_H 11 + 1 // +1 for the gradient computation
+#define WIN_RADIUS_W (BLOCKSIZE_W) / (2)
+#define WIN_RADIUS_H (BLOCKSIZE_H) / (2)
+
+#define BLOCK_W 32
+#define BLOCK_H (BLOCK_W/2)
+#define TILE_W BLOCK_W
+#define TILE_H BLOCK_H * 2
+#define TILE_SIZE TILE_W * TILE_H
+#define SHARED_SIZE_W (TILE_W + WIN_RADIUS_W * 2)
+#define SHARED_SIZE_H (TILE_H + WIN_RADIUS_H * 2)
+#define SHARED_SIZE (SHARED_SIZE_W * SHARED_SIZE_H)
+
+#define REFERENCE 0
+#define MAXCOST 1000.0f
+
+#define FORCEINLINE __forceinline__
+//#define FORCEINLINE
+
+#define pow2(x) ((x)*(x))
+#define pow3(x) ((x)*(x)*(x))
+#define pow4(x) ((x)*(x)*(x)*(x))
+#define get_pow2_norm(x,y) (pow2(x)+pow2(y))
+#define get_normf(x,y) (sqrtf((pow2(x)+pow2(y))))
+#define get_normf3(v) (sqrtf((pow2(v.x)+pow2(v.y) + pow2(v.z))))
+#define get_pow2_normf3(v) ((pow2(v.x)+pow2(v.y) + pow2(v.z)))
+
+// Vector operations
+#define dot4(v0,v1) v0.x * v1.x + \
+                    v0.y * v1.y + \
+                    v0.z * v1.z
+#define mul4(v,k) \
+v->x = v->x * k; \
+v->y = v->y * k; \
+v->z = v->z * k;
+
+#define vecdiv4(v,k) \
+v->x = v->x / k; \
+v->y = v->y / k; \
+v->z = v->z / k;
+
+#define vecdiv42(v,k) \
+v.x = v.x / k; \
+v.y = v.y / k; \
+v.z = v.z / k;
+
+
+#define vecdiv(v,k) \
+v[0] = v[0] / k; \
+v[1] = v[1] / k; \
+v[2] = v[2] / k;
+
+#define get_pow2_normv3(x,y) (pow2(x)+pow2(y))
+
+#define negate(v) mul(v,-1.0f)
+#define negate4(v) v->x = -v->x; \
+                   v->y = -v->y; \
+                   v->z = -v->z;
+
+#define sub(v0,v1) v0.x = v0.x - v1.x; \
+                   v0.y = v0.y - v1.y; \
+                   v0.z = v0.z - v1.z;
+#define subout(v0,v1,vout) vout.x = v0.x - v1.x; \
+                   vout.y = v0.y - v1.y; \
+                   vout.z = v0.z - v1.z;
+
+#define outer_product(v0,v1, out) \
+out[0] = v0[0] * v1[0]; \
+out[1] = v0[0] * v1[1]; \
+out[2] = v0[0] * v1[2]; \
+out[3] = v0[1] * v1[0]; \
+out[4] = v0[1] * v1[1]; \
+out[5] = v0[1] * v1[2]; \
+out[6] = v0[2] * v1[0]; \
+out[7] = v0[2] * v1[1]; \
+out[8] = v0[2] * v1[2];
+
+#define outer_product4(v0,v1, out) \
+out[0] = v0.x * v1.x; \
+out[1] = v0.x * v1.y; \
+out[2] = v0.x * v1.z; \
+out[3] = v0.y * v1.x; \
+out[4] = v0.y * v1.y; \
+out[5] = v0.y * v1.z; \
+out[6] = v0.z * v1.x; \
+out[7] = v0.z * v1.y; \
+out[8] = v0.z * v1.z;
+// Matrix operations
+#define matadd(m,k)   m[0] = m[0] + k; \
+                   m[1] = m[1] + k; \
+                   m[2] = m[2] + k; \
+                   m[3] = m[3] + k; \
+                   m[4] = m[4] + k; \
+                   m[5] = m[5] + k; \
+                   m[6] = m[6] + k; \
+                   m[7] = m[7] + k; \
+                   m[8] = m[8] + k;
+
+#define matsub(m,k)   m[0] = m[0] - k; \
+                   m[1] = m[1] - k; \
+                   m[2] = m[2] - k; \
+                   m[3] = m[3] - k; \
+                   m[4] = m[4] - k; \
+                   m[5] = m[5] - k; \
+                   m[6] = m[6] - k; \
+                   m[7] = m[7] - k; \
+                   m[8] = m[8] - k;
+
+#define matmatsub(m0, m1) \
+m0[0] = m0[0] - m1[0]; \
+m0[1] = m0[1] - m1[1]; \
+m0[2] = m0[2] - m1[2]; \
+m0[3] = m0[3] - m1[3]; \
+m0[4] = m0[4] - m1[4]; \
+m0[5] = m0[5] - m1[5]; \
+m0[6] = m0[6] - m1[6]; \
+m0[7] = m0[7] - m1[7]; \
+m0[8] = m0[8] - m1[8];
+
+#define matmatsub2(m0, m1) \
+m1[0] = m0[0] - m1[0]; \
+m1[1] = m0[1] - m1[1]; \
+m1[2] = m0[2] - m1[2]; \
+m1[3] = m0[3] - m1[3]; \
+m1[4] = m0[4] - m1[4]; \
+m1[5] = m0[5] - m1[5]; \
+m1[6] = m0[6] - m1[6]; \
+m1[7] = m0[7] - m1[7]; \
+m1[8] = m0[8] - m1[8];
+
+
+#define matdivide(m,k) \
+m[0] = m[0] / k; \
+m[1] = m[1] / k; \
+m[2] = m[2] / k; \
+m[3] = m[3] / k; \
+m[4] = m[4] / k; \
+m[5] = m[5] / k; \
+m[6] = m[6] / k; \
+m[7] = m[7] / k; \
+m[8] = m[8] / k;
+
+#define matvecmul4noz(m, v, out) \
+out->x = \
+m [0] * v.x + \
+m [1] * v.y + \
+m [2];\
+out->y = \
+m [3] * v.x + \
+m [4] * v.y + \
+m [5]; \
+out->z = \
+m [6] * v.x + \
+m [7] * v.y + \
+m [8];
+#define matvecmul4P(m, v, out) \
+out->x = \
+m [0] * v.x + \
+m [1] * v.y + \
+m [2] * v.z + \
+m [3]; \
+out->y = \
+m [4] * v.x + \
+m [5] * v.y + \
+m [6] * v.z + \
+m [7]; \
+out->z = \
+m [8] * v.x  + \
+m [9] * v.y  + \
+m [10] * v.z + \
+m [11];
+
+#define matvecmul4(m, v, out) \
+out->x = \
+m [0] * v.x + \
+m [1] * v.y + \
+m [2] * v.z; \
+out->y = \
+m [3] * v.x + \
+m [4] * v.y + \
+m [5] * v.z; \
+out->z = \
+m [6] * v.x + \
+m [7] * v.y + \
+m [8] * v.z;
+#define matvecmul4div(m, v, out,d) \
+out->x = \
+(m [0] * v.x + \
+m [1] * v.y + \
+m [2] * v.z)/d; \
+out->y = \
+(m [3] * v.x + \
+m [4] * v.y + \
+m [5] * v.z)/d; \
+out->z = \
+(m [6] * v.x + \
+m [7] * v.y + \
+m [8] * v.z)/d;
+
+#define matvecmul(m, v, out) \
+out[0] = \
+m [0] * v[0] + \
+m [1] * v[1] + \
+m [2] * v[2]; \
+out[1] = \
+m [3] * v[0] + \
+m [4] * v[1] + \
+m [5] * v[2]; \
+out[2] = \
+m [6] * v[0] + \
+m [7] * v[1] + \
+m [8] * v[2];
+
+#define matmul_cu(m0, m1, out) \
+out[0] = \
+m0 [0] * m1[0] + \
+m0 [1] * m1[0+3] + \
+m0 [2] * m1[0+6]; \
+out[1] = \
+m0 [0] * m1[1] + \
+m0 [1] * m1[1+3] + \
+m0 [2] * m1[1+6]; \
+out[2] = \
+m0 [0] * m1[2] + \
+m0 [1] * m1[2+3] + \
+m0 [2] * m1[2+6]; \
+out[3] = \
+m0 [3] * m1[0] + \
+m0 [4] * m1[0+3] + \
+m0 [5] * m1[0+6]; \
+out[4] = \
+m0 [3] * m1[1] + \
+m0 [4] * m1[1+3] + \
+m0 [5] * m1[1+6]; \
+out[5] = \
+m0 [3] * m1[2] + \
+m0 [4] * m1[2+3] + \
+m0 [5] * m1[2+6]; \
+out[6] = \
+m0 [6] * m1[0] + \
+m0 [7] * m1[0+3] + \
+m0 [8] * m1[0+6]; \
+out[7] = \
+m0 [6] * m1[1] + \
+m0 [7] * m1[1+3] + \
+m0 [8] * m1[1+6]; \
+out[8] = \
+m0 [6] * m1[2] + \
+m0 [7] * m1[2+3] + \
+m0 [8] * m1[2+6];
+
+// texture utils
+
+#define texat(tex,x, y) \
+ tex2D<float>(tex, x+0.5f, y+0.5f)
+#define texatf4(tex,x, y) \
+ tex2D<float4>(tex, x+0.5f, y+0.5f)
+
+#define texatpt(tex,pt) \
+texat(tex, pt[0], pt[1])
+#define texatpt4(tex,pt) \
+texat(tex, pt.x, pt.y)
+#define texatptf4(tex,pt) \
+texatf4(tex, pt.x, pt.y)
+
+#define colorDifferenceL1_macro(x,y) \
+fabsf(x-y)
+
+#define print_vector(v,string) \
+printf("%s is \tnp.array([%e, %e ,%e])\n", string, v[0],v[1],v[2]);
+
+#define print_vector4(v,string) \
+printf("%s is \tnp.array([%f, %f ,%f])\n", string, v.x,v.y,v.z);
+
+#define print_matrix(m,string) \
+printf("%s is  [[%.4e, %.4e, %.4e], \n\t [%.4e, %.4e, %.4e], \n\t [%.4e, %.4e, %.4e]]\n", string, m[0],m[1],m[2],m[3],m[4],m[5],m[6],m[7],m[8]);
diff --git a/displayUtils.h b/displayUtils.h
new file mode 100644
index 0000000..e0d00e2
--- /dev/null
+++ b/displayUtils.h
@@ -0,0 +1,441 @@
+/*
+ * utility functions for visualization of results (disparity in color, warped output, ...)
+ */
+
+#pragma once
+#include <sstream>
+#include <fstream>
+
+#include "point_cloud.h"
+#include "point_cloud_list.h"
+
+/* compute gamma correction (just for display purposes to see more details in farther away areas of disparity image)
+ * Input: img   - image
+ *        gamma - gamma value
+ * Output: gamma corrected image
+ */
+Mat correctGamma( Mat& img, double gamma ) {
+ double inverse_gamma = 1.0 / gamma;
+
+ Mat lut_matrix(1, 256, CV_8UC1 );
+ uchar * ptr = lut_matrix.ptr();
+ for( int i = 0; i < 256; i++ )
+   ptr[i] = (int)( pow( (double) i / 255.0, inverse_gamma ) * 255.0 );
+
+ Mat result;
+ LUT( img, lut_matrix, result );
+
+ return result;
+};
+
+
+static void getDisparityForDisplay(const Mat_<float> &disp, Mat &dispGray, Mat &dispColor, float numDisparities, float minDisp = 0.0f){
+	float gamma = 2.0f; // to get higher contrast for lower disparity range (just for color visualization)
+	disp.convertTo(dispGray,CV_16U,65535.f/(numDisparities-minDisp),-minDisp*65535.f/(numDisparities-minDisp));
+	Mat disp8;
+	disp.convertTo(disp8,CV_8U,255.f/(numDisparities-minDisp),-minDisp*255.f/(numDisparities-minDisp));
+	if(minDisp == 0.0f)
+		disp8 = correctGamma(disp8,gamma);
+	applyColorMap(disp8, dispColor, COLORMAP_JET);
+	for(int y = 0; y < dispColor.rows; y++){
+		for(int x = 0; x < dispColor.cols; x++){
+			if(disp(y,x) <= 0.0f)
+				dispColor.at<Vec3b>(y,x) = Vec3b(0,0,0);
+		}
+	}
+};
+
+static void convertDisparityDepthImage(const Mat_<float> &dispL, Mat_<float> &d, float f, float baseline){
+	d = Mat::zeros(dispL.rows, dispL.cols, CV_32F);
+	for(int y = 0; y < dispL.rows; y++){
+		for(int x = 0; x < dispL.cols; x++){
+			d(y,x) = disparityDepthConversion(f,baseline,dispL(y,x));
+		}
+	}
+};
+
+static string getColorString(uint8_t color){
+	stringstream ss;
+	ss << (int)color << " " << (int)color << " " << (int)color;
+	return ss.str();
+}
+
+
+static string getColorString(Vec3b color){
+	stringstream ss;
+	ss << (int)color(2) << " " << (int)color(1) << " " << (int)color(0);
+	return ss.str();
+}
+
+static string getColorString(Vec3i color){
+	stringstream ss;
+	ss << (int)((float)color(2)/256.f) << " " << (int)((float)color(1)/256.f) << " " << (int)((float)color(0)/256.f);
+	return ss.str();
+}
+static void storePlyFileBinaryPointCloud (char* plyFilePath, PointCloudList &pc, Camera cam, Mat_<float> &distImg) {
+    cout << "store 3D points to ply file" << endl;
+
+    FILE *outputPly;
+    outputPly=fopen(plyFilePath,"wb");
+
+    /*write header*/
+    fprintf(outputPly, "ply\n");
+    fprintf(outputPly, "format binary_little_endian 1.0\n");
+    fprintf(outputPly, "element vertex %d\n",pc.size);
+    fprintf(outputPly, "property float x\n");
+    fprintf(outputPly, "property float y\n");
+    fprintf(outputPly, "property float z\n");
+    fprintf(outputPly, "property float nx\n");
+    fprintf(outputPly, "property float ny\n");
+    fprintf(outputPly, "property float nz\n");
+    fprintf(outputPly, "property uchar red\n");
+    fprintf(outputPly, "property uchar green\n");
+    fprintf(outputPly, "property uchar blue\n");
+    fprintf(outputPly, "end_header\n");
+
+    distImg = Mat::zeros(pc.rows,pc.cols,CV_32F);
+
+    //write data
+#pragma omp parallel for
+    for(int i = 0; i < pc.size; i++) {
+        const Point_li &p = pc.points[i];
+        const float4 normal = p.normal;
+        float4 X = p.coord;
+        const char color = (int)p.texture;
+        /*const int color = 127.0f;*/
+        /*printf("Writing point %f %f %f\n", X.x, X.y, X.z);*/
+
+        if(!(X.x < FLT_MAX && X.x > -FLT_MAX) || !(X.y < FLT_MAX && X.y > -FLT_MAX) || !(X.z < FLT_MAX && X.z >= -FLT_MAX)){
+            X.x = 0.0f;
+            X.y = 0.0f;
+            X.z = 0.0f;
+        }
+#pragma omp critical
+        {
+            /*myfile << X.x << " " << X.y << " " << X.z << " " << normal.x << " " << normal.y << " " << normal.z << " " << color << " " << color << " " << color << endl;*/
+            fwrite(&X.x,      sizeof(X.x), 1, outputPly);
+            fwrite(&X.y,      sizeof(X.y), 1, outputPly);
+            fwrite(&X.z,      sizeof(X.z), 1, outputPly);
+            fwrite(&normal.x, sizeof(normal.x), 1, outputPly);
+            fwrite(&normal.y, sizeof(normal.y), 1, outputPly);
+            fwrite(&normal.z, sizeof(normal.z), 1, outputPly);
+            fwrite(&color,  sizeof(char), 1, outputPly);
+            fwrite(&color,  sizeof(char), 1, outputPly);
+            fwrite(&color,  sizeof(char), 1, outputPly);
+        }
+
+    }
+    fclose(outputPly);
+}
+static void storeXYZPointCloud (char* plyFilePath, PointCloudList &pc, Camera cam, Mat_<float> &distImg) {
+    cout << "store 3D points to ply file" << endl;
+
+	ofstream myfile;
+	myfile.open (plyFilePath, ios::out);
+
+    //write data
+#pragma omp parallel for
+    for(int i = 0; i < pc.size; i++) {
+        const Point_li &p = pc.points[i];
+        const float4 normal = p.normal;
+        float4 X = p.coord;
+
+        if(!(X.x < FLT_MAX && X.x > -FLT_MAX) || !(X.y < FLT_MAX && X.y > -FLT_MAX) || !(X.z < FLT_MAX && X.z >= -FLT_MAX)){
+            X.x = 0.0f;
+            X.y = 0.0f;
+            X.z = 0.0f;
+        }
+#pragma omp critical
+        {
+            myfile << X.x << " " << X.y << " " << X.z << " " << normal.x << " " << normal.y << " " << normal.z << endl;
+        }
+
+    }
+	myfile.close();
+}
+static void storePlyFileAsciiPointCloud (char* plyFilePath, PointCloudList &pc, Camera cam, Mat_<float> &distImg) {
+	cout << "store 3D points to ply file" << endl;
+
+    /*FILE *outputPly;*/
+    /*outputPly=fopen(plyFilePath,"wb");*/
+
+    /*write header*/
+    /*fprintf(outputPly, "ply\n");*/
+    /*fprintf(outputPly, "format binary_little_endian 1.0\n");*/
+    /*fprintf(outputPly, "element vertex %d\n",count);*/
+    /*fprintf(outputPly, "property float x\n");*/
+    /*fprintf(outputPly, "property float y\n");*/
+    /*fprintf(outputPly, "property float z\n");*/
+    /*fprintf(outputPly, "property float nx\n");*/
+    /*fprintf(outputPly, "property float ny\n");*/
+    /*fprintf(outputPly, "property float nz\n");*/
+    /*fprintf(outputPly, "property uchar red\n");*/
+    /*fprintf(outputPly, "property uchar green\n");*/
+    /*fprintf(outputPly, "property uchar blue\n");*/
+    /*fprintf(outputPly, "end_header\n");*/
+
+	ofstream myfile;
+	myfile.open (plyFilePath, ios::out);
+
+	//write header
+	myfile << "ply" << endl;
+	myfile << "format ascii 1.0" << endl;
+	myfile << "element vertex " << pc.size << endl;
+	myfile << "property float x" << endl;
+	myfile << "property float y" << endl;
+	myfile << "property float z" << endl;
+	myfile << "property float nx" << endl;
+	myfile << "property float ny" << endl;
+	myfile << "property float nz" << endl;
+	myfile << "property uchar red" << endl;
+	myfile << "property uchar green" << endl;
+	myfile << "property uchar blue" << endl;
+	myfile << "end_header" << endl;
+
+
+    distImg = Mat::zeros(pc.rows,pc.cols,CV_32F);
+
+	//write data
+    #pragma omp parallel for
+    for(int i = 0; i < pc.size; i++) {
+        const Point_li &p = pc.points[i];
+        const float4 normal = p.normal;
+        float4 X = p.coord;
+        const int color = (int)p.texture;
+        /*const int color = 127.0f;*/
+        /*printf("Writing point %f %f %f\n", X.x, X.y, X.z);*/
+
+        if(!(X.x < FLT_MAX && X.x > -FLT_MAX) || !(X.y < FLT_MAX && X.y > -FLT_MAX) || !(X.z < FLT_MAX && X.z >= -FLT_MAX)){
+            X.x = 0.0f;
+            X.y = 0.0f;
+            X.z = 0.0f;
+        }
+#pragma omp critical
+        {
+            myfile << X.x << " " << X.y << " " << X.z << " " << normal.x << " " << normal.y << " " << normal.z << " " << color << " " << color << " " << color << endl;
+            /*fwrite(&X.x,      sizeof(float), 1, outputPly);*/
+            /*fwrite(&X.y,      sizeof(float), 1, outputPly);*/
+            /*fwrite(&X.z,      sizeof(float), 1, outputPly);*/
+            /*fwrite(&normal.x, sizeof(float), 1, outputPly);*/
+            /*fwrite(&normal.y, sizeof(float), 1, outputPly);*/
+            /*fwrite(&normal.z, sizeof(float), 1, outputPly);*/
+            /*fwrite(&color,  sizeof(float), 1, outputPly);*/
+            /*fwrite(&color,  sizeof(float), 1, outputPly);*/
+            /*fwrite(&color,  sizeof(float), 1, outputPly);*/
+        }
+
+    }
+	myfile.close();
+/*fclose(outputPly);*/
+}
+//template <typename ImgType>
+static void storePlyFileBinaryPointCloud(char* plyFilePath, PointCloud &pc, Camera cam, Mat_<float> &distImg) {
+    cout << "store 3D points to ply file" << endl;
+
+    FILE *outputPly;
+    outputPly=fopen(plyFilePath,"wb");
+
+    /*write header*/
+    fprintf(outputPly, "ply\n");
+    fprintf(outputPly, "format binary_little_endian 1.0\n");
+    fprintf(outputPly, "element vertex %d\n",pc.size);
+    fprintf(outputPly, "property float x\n");
+    fprintf(outputPly, "property float y\n");
+    fprintf(outputPly, "property float z\n");
+    /*fprintf(outputPly, "property float nx\n");*/
+    /*fprintf(outputPly, "property float ny\n");*/
+    /*fprintf(outputPly, "property float nz\n");*/
+    /*fprintf(outputPly, "property uchar red\n");*/
+    /*fprintf(outputPly, "property uchar green\n");*/
+    /*fprintf(outputPly, "property uchar blue\n");*/
+    fprintf(outputPly, "end_header\n");
+
+
+    distImg = Mat::zeros(pc.rows,pc.cols,CV_32F);
+
+    //write data
+/*#pragma omp parallel for*/
+    for(int i = 0; i < pc.size; i++){
+        const Point_cu &p = pc.points[i];
+        const float4 normal = p.normal;
+        float4 X = p.coord;
+        /*printf("Writing point %f %f %f\n", X.x, X.y, X.z);*/
+        /*float color = p.texture;*/
+        const char color = 127.0f;
+
+        if(!(X.x < FLT_MAX && X.x > -FLT_MAX) || !(X.y < FLT_MAX && X.y > -FLT_MAX) || !(X.z < FLT_MAX && X.z >= -FLT_MAX)){
+            X.x = 0.0f;
+            X.y = 0.0f;
+            X.z = 0.0f;
+        }
+/*#pragma omp critical*/
+        {
+            /*myfile << X.x << " " << X.y << " " << X.z << " " << normal.x << " " << normal.y << " " << normal.z << " " << color << " " << color << " " << color << endl;*/
+            fwrite(&(X.x),      sizeof(float), 1, outputPly);
+            fwrite(&(X.y),      sizeof(float), 1, outputPly);
+            fwrite(&(X.z),      sizeof(float), 1, outputPly);
+            /*fwrite(&(normal.x), sizeof(float), 1, outputPly);*/
+            /*fwrite(&(normal.y), sizeof(float), 1, outputPly);*/
+            /*fwrite(&(normal.z), sizeof(float), 1, outputPly);*/
+            /*fwrite(&color,  sizeof(char), 1, outputPly);*/
+            /*fwrite(&color,  sizeof(char), 1, outputPly);*/
+            /*fwrite(&color,  sizeof(char), 1, outputPly);*/
+        }
+
+        /*distImg(y,x) = sqrt(pow(X.x-cam.C(0),2)+pow(X.y-cam.C(1),2)+pow(X.z-cam.C(2),2));*/
+    }
+
+    /*myfile.close();*/
+    fclose(outputPly);
+}
+template <typename ImgType>
+static void storePlyFileBinary(char* plyFilePath, const Mat_<float> &depthImg, const Mat_<Vec3f> &normals, const Mat_<ImgType> img, Camera cam, Mat_<float> &distImg){
+	cout << "store 3D points to ply file" << endl;
+
+    FILE *outputPly;
+    outputPly=fopen(plyFilePath,"wb");
+
+	//write header
+    fprintf(outputPly, "ply\n");
+    fprintf(outputPly, "format binary_little_endian 1.0\n");
+    fprintf(outputPly, "element vertex %d\n",depthImg.rows * depthImg.cols);
+    fprintf(outputPly, "property float x\n");
+    fprintf(outputPly, "property float y\n");
+    fprintf(outputPly, "property float z\n");
+    fprintf(outputPly, "property float nx\n");
+    fprintf(outputPly, "property float ny\n");
+    fprintf(outputPly, "property float nz\n");
+    fprintf(outputPly, "property uchar red\n");
+    fprintf(outputPly, "property uchar green\n");
+    fprintf(outputPly, "property uchar blue\n");
+    fprintf(outputPly, "end_header\n");
+
+	distImg = Mat::zeros(depthImg.rows,depthImg.cols,CV_32F);
+
+	//write data
+    #pragma omp parallel for
+	for(int x = 0; x < depthImg.cols; x++){
+		for(int y = 0; y < depthImg.rows; y++){
+			Vec3f n = normals(y,x);
+			ImgType color = img(y,x);
+
+			Vec3f ptX = get3Dpoint(cam,x,y,depthImg(y,x));
+
+			if(!(ptX(0) < FLT_MAX && ptX(0) > -FLT_MAX) || !(ptX(1) < FLT_MAX && ptX(12) > -FLT_MAX) || !(ptX(2) < FLT_MAX && ptX(2) >= -FLT_MAX)){
+				ptX(0) = 0.0f;
+				ptX(1) = 0.0f;
+				ptX(2) = 0.0f;
+			}
+            #pragma omp critical
+            {
+                //myfile << ptX(0) << " " << ptX(1) << " " << ptX(2) << " " << n(0) << " " << n(1) << " " << n(2) << " " << getColorString(color) << endl;
+                fwrite(&(ptX(0)), sizeof(float), 3, outputPly);
+                fwrite(&(n(0))  , sizeof(float), 3, outputPly);
+                fwrite(&color, sizeof(color) , 1,  outputPly);
+                fwrite(&color, sizeof(color) , 1,  outputPly);
+                fwrite(&color, sizeof(color) , 1,  outputPly);
+            }
+
+			distImg(y,x) = sqrt(pow(ptX(0)-cam.C(0),2)+pow(ptX(1)-cam.C(1),2)+pow(ptX(2)-cam.C(2),2));
+
+			//}else{
+			//	cout << ptX(0) << " " << ptX(1) << " " << ptX(2) << endl;
+			//	cout << depthImg(y,x) << endl;
+			//}
+
+
+			//P *
+			//cout << xValue << " " << yValue << " " << zValue << " / " <<
+		}
+	}
+
+fclose(outputPly);
+}
+
+template <typename ImgType>
+static void storePlyFile(char* plyFilePath, const Mat_<float> &depthImg, const Mat_<Vec3f> &normals, const Mat_<ImgType> img, Camera cam, Mat_<float> &distImg){
+	cout << "store 3D points to ply file" << endl;
+	ofstream myfile;
+	myfile.open (plyFilePath, ios::out);
+
+	//write header
+	myfile << "ply" << endl;
+	myfile << "format ascii 1.0" << endl;
+	myfile << "element vertex " << depthImg.rows * depthImg.cols << endl;
+	myfile << "property float x" << endl;
+	myfile << "property float y" << endl;
+	myfile << "property float z" << endl;
+	myfile << "property float nx" << endl;
+	myfile << "property float ny" << endl;
+	myfile << "property float nz" << endl;
+	myfile << "property uchar red" << endl;
+	myfile << "property uchar green" << endl;
+	myfile << "property uchar blue" << endl;
+	myfile << "end_header" << endl;
+
+	distImg = Mat::zeros(depthImg.rows,depthImg.cols,CV_32F);
+
+	//write data
+    //#pragma omp parallel for
+	for(int x = 0; x < depthImg.cols; x++){
+		for(int y = 0; y < depthImg.rows; y++){
+			/*
+			float zValue = depthImg(x,y);
+			float xValue = ((float)x-cx)*zValue/camParams.f;
+			float yValue = ((float)y-cy)*zValue/camParams.f;
+			myfile << xValue << " " << yValue << " " << zValue << endl;
+			*/
+
+			//Mat_<float> pt = Mat::ones(3,1,CV_32F);
+			//pt(0,0) = (float)x;
+			//pt(1,0) = (float)y;
+
+			Vec3f n = normals(y,x);
+			ImgType color = img(y,x);
+
+			//Mat_<float> ptX = P1_inv * depthImg(y,x)*pt;
+
+			//if(depthImg(y,x) <= 0.0001f)
+			//	continue;
+
+
+			Vec3f ptX = get3Dpoint(cam,x,y,depthImg(y,x));
+
+			//Vec3f ptX_v1 = get3dPointFromPlane(cam.P_inv,cam.C,n,planes.d(y,x),x,y);
+			//cout << ptX_v1 << " / " << ptX << endl;
+
+			if(!(ptX(0) < FLT_MAX && ptX(0) > -FLT_MAX) || !(ptX(1) < FLT_MAX && ptX(12) > -FLT_MAX) || !(ptX(2) < FLT_MAX && ptX(2) >= -FLT_MAX)){
+				ptX(0) = 0.0f;
+				ptX(1) = 0.0f;
+				ptX(2) = 0.0f;
+			}
+            //#pragma omp critical
+            {
+			myfile << ptX(0) << " " << ptX(1) << " " << ptX(2) << " " << n(0) << " " << n(1) << " " << n(2) << " " << getColorString(color) << endl;
+            }
+
+			distImg(y,x) = sqrt(pow(ptX(0)-cam.C(0),2)+pow(ptX(1)-cam.C(1),2)+pow(ptX(2)-cam.C(2),2));
+
+			//}else{
+			//	cout << ptX(0) << " " << ptX(1) << " " << ptX(2) << endl;
+			//	cout << depthImg(y,x) << endl;
+			//}
+
+
+			//P *
+			//cout << xValue << " " << yValue << " " << zValue << " / " <<
+		}
+	}
+
+	myfile.close();
+}
+
+
+
+static void getNormalsForDisplay(const Mat &normals, Mat &normals_display, int rtype = CV_16U){
+	if(rtype == CV_8U)
+		normals.convertTo(normals_display,CV_8U,128,128);
+	else
+		normals.convertTo(normals_display,CV_16U,32767,32767);
+	cvtColor(normals_display,normals_display,COLOR_RGB2BGR);
+};
diff --git a/exception.h b/exception.h
new file mode 100644
index 0000000..adda4bc
--- /dev/null
+++ b/exception.h
@@ -0,0 +1,151 @@
+/*
+* Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+/* CUda UTility Library */
+#ifndef _EXCEPTION_H_
+#define _EXCEPTION_H_
+
+// includes, system
+#include <exception>
+#include <stdexcept>
+#include <iostream>
+#include <stdlib.h>
+
+//! Exception wrapper.
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template<class Std_Exception>
+class Exception : public Std_Exception
+{
+    public:
+
+        //! @brief Static construction interface
+        //! @return Alwayss throws ( Located_Exception<Exception>)
+        //! @param file file in which the Exception occurs
+        //! @param line line in which the Exception occurs
+        //! @param detailed details on the code fragment causing the Exception
+        static void throw_it(const char *file,
+                             const int line,
+                             const char *detailed = "-");
+
+        //! Static construction interface
+        //! @return Alwayss throws ( Located_Exception<Exception>)
+        //! @param file file in which the Exception occurs
+        //! @param line line in which the Exception occurs
+        //! @param detailed details on the code fragment causing the Exception
+        static void throw_it(const char *file,
+                             const int line,
+                             const std::string &detailed);
+
+        //! Destructor
+        virtual ~Exception() throw();
+
+    private:
+
+        //! Constructor, default (private)
+        Exception();
+
+        //! Constructor, standard
+        //! @param str string returned by what()
+        Exception(const std::string &str);
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler function for arbitrary exceptions
+//! @param ex exception to handle
+////////////////////////////////////////////////////////////////////////////////
+template<class Exception_Typ>
+inline void
+handleException(const Exception_Typ &ex)
+{
+    std::cerr << ex.what() << std::endl;
+
+    exit(EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION( msg) \
+    Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION( msg) \
+    Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION( msg) \
+    Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template<class Std_Exception>
+void
+Exception<Std_Exception>::
+throw_it(const char *file, const int line, const char *detailed)
+{
+    std::stringstream s;
+
+    // Quiet heavy-weight but exceptions are not for
+    // performance / release versions
+    s << "Exception in file '" << file << "' in line " << line << "\n"
+      << "Detailed description: " << detailed << "\n";
+
+    throw Exception(s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template<class Std_Exception>
+void
+Exception<Std_Exception>::
+throw_it(const char *file, const int line, const std::string &msg)
+{
+    throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::Exception() :
+    Std_Exception("Unknown Exception.\n")
+{ }
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) :
+    Std_Exception(s)
+{ }
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() { }
+
+// functions, exported
+
+#endif // #ifndef _EXCEPTION_H_
+
diff --git a/fileIoUtils.h b/fileIoUtils.h
new file mode 100644
index 0000000..4ca68c6
--- /dev/null
+++ b/fileIoUtils.h
@@ -0,0 +1,441 @@
+/*
+ * utility functions for reading and writing files
+ */
+
+#pragma once
+
+#include <iostream>
+#include <fstream>
+
+static void getProjectionMatrix(char* line, Mat_<float> &P){
+    const char* p;
+    int idx = 0;
+    for (p = strtok( line, " " );  p;  p = strtok( NULL, " " ))
+    {
+        if(p[0]=='P' || p[0]=='p')
+            continue;
+
+        //float val = stof(p);
+        float val = (float)atof(p);
+        /*cout << val << endl;*/
+        P(idx/4,idx%4)=val;
+
+        idx++;
+    }
+};
+
+static int read3Dpoint(char* line, Vec3f &pt){
+    const char* p;
+    int idx = 0;
+    for (p = strtok( line, " " );  p;  p = strtok( NULL, " " ))
+    {
+        if(idx > 2)
+            return -1;
+        float val = (float)atof(p);
+        pt[idx] = val;
+
+        idx++;
+    }
+    if(idx<2)
+        return -1;
+    return 0;
+}
+
+static void readCalibFileKitti(const string calib_filename, Mat_<float> &P1, Mat_<float> &P2){
+    ifstream myfile;
+    myfile.open(calib_filename.c_str(),ifstream::in);
+    //get first line (containing P0)
+    char line[512];
+    myfile.getline(line,512);
+    getProjectionMatrix(line,P1);
+    myfile.getline(line,512);
+    getProjectionMatrix(line,P2);
+    myfile.close();
+};
+
+static void readBoundingVolume(const string filename, Vec3f &ptBL, Vec3f & ptTR){
+    ifstream myfile;
+    myfile.open(filename.c_str(),ifstream::in);
+    char line[512];
+    //bottom left point
+    myfile.getline(line,512);
+    read3Dpoint(line,ptBL);
+    //top right point
+    myfile.getline(line,512);
+    read3Dpoint(line,ptTR);
+
+    myfile.close();
+};
+
+
+static void readCameraFileStrecha(const string camera_filename, float &focalLength){
+    // only interested in focal length, but possible to get also other internal and external camera parameters
+    // focal length is stored in pixel format as alpha_x and alphy_y, only using alpha_x (which is the very first parameter of the internal camera matrix)
+    ifstream myfile;
+    myfile.open(camera_filename.c_str(),ifstream::in);
+    char line[512];
+    myfile.getline(line,512);
+    const char* p = strtok( line, " " );
+    focalLength = (float)atof(p);
+    myfile.close();
+}
+
+static void readPFileStrechaPmvs(const string p_filename, Mat_<float> &P){
+    ifstream myfile;
+    myfile.open(p_filename.c_str(),ifstream::in);
+
+    //cout <<"Opening file " << p_filename << endl;
+    for( int i = 0; i < 4; i++){
+        if (myfile.eof())
+            break;
+        char line[512];
+        myfile.getline(line,512);
+        if (strstr(line,"CONTOUR")!= NULL) {
+            //printf("Skipping CONTOUR\n");
+            i--;
+            continue;
+        }
+
+        //cout << "Line is "<< line << endl;
+        const char* p;
+        int j = 0;
+        for (p = strtok( line, " " );  p;  p = strtok( NULL, " " ))
+        {
+            float val = (float)atof(p);
+            P(i,j)=val;
+            j++;
+        }
+    }
+    myfile.close();
+}
+static void readKRtFileMiddlebury(const string filename, vector<Camera> cameras, InputFiles inputFiles)
+{
+    ifstream myfile;
+    myfile.open( filename, ifstream::in );
+    string line;
+
+    getline (myfile, line); // throw away first line
+
+    int i=0;
+    int truei=-1;
+    while( getline( myfile,line) )
+    {
+        /*cout << "Line is "<< line << endl;*/
+        Mat Rt;
+        Mat_<float> K = Mat::zeros( 3, 3, CV_32F );
+        Mat_<float> R = Mat::zeros( 3, 3, CV_32F );
+        Vec3f vt;
+        stringstream ss(line);
+        string tmp;
+        ss >> tmp
+        >> K(0,0) >> K(0,1) >> K(0,2) >> K(1,0) >> K(1,1) >> K(1,2) >> K(2,0) >> K(2,1) >> K(2,2) //
+        >> R(0,0) >> R(0,1) >> R(0,2) >> R(1,0) >> R(1,1) >> R(1,2) >> R(2,0) >> R(2,1) >> R(2,2) //
+        >> vt(0) >> vt(1) >> vt(2);
+        /*cout << "K is " << K << endl;*/
+        /*cout << "R is " << R << endl;*/
+        /*cout << "t is " << vt << endl;*/
+        //cout << "Filename is " << tmp << endl;
+        //cout << "image Filename is " << inputFiles.img_filenames[i] << endl;
+        for( int j = 0; j < inputFiles.img_filenames.size(); j++) {
+            if( tmp == inputFiles.img_filenames[j]) {
+                truei=j;
+                break;
+            }
+        }
+        Mat t(vt, false);
+        /*Mat t(vt);*/
+        hconcat(R, t, Rt);
+        cameras[truei].P = K*Rt;
+        /*cout << "Rt is " << Rt<< endl;*/
+        /*cout << "P is " << P << endl;*/
+        /*cout << "P is " << cameras[i].P << endl;*/
+        i++;
+    }
+
+
+    /*while (os >> temp)                //the stringstream makes temp a token*/
+        /*std::cout <<temp <<std::endl;   //and deletes that token from itself*/
+    //the token can now be
+    //outputted to console, or put into an array,
+    //or whatever you choose to do ith it .
+    return;
+}
+
+static void readCalibFileDaisy(const string calib_filename, Mat_<float> &P){
+    ifstream myfile;
+    myfile.open(calib_filename.c_str(),ifstream::in);
+
+    char line[512];
+    while (myfile.getline(line, 512)) {
+        if(line[0] == 'p')
+            getProjectionMatrix(line,P);
+    }
+
+
+    myfile.close();
+}
+
+static void writeImageToFile(const char* outputFolder,const char* name,const Mat &img){
+    char outputPath[256];
+    sprintf(outputPath, "%s/%s.png", outputFolder,name);
+    imwrite(outputPath,img);
+};
+
+static void writeParametersToFile(char* resultsFile, InputFiles inputFiles, AlgorithmParameters &algParameters, GTcheckParameters &gtParameters, uint32_t numPixels){
+
+    ofstream myfile;
+    myfile.open (resultsFile, ios::out);
+    myfile << "Number of images: " << inputFiles.img_filenames.size() << endl;
+    myfile << "Image folder: " << inputFiles.images_folder << endl;
+    myfile << "Images: ";
+    for(int i=0; i < inputFiles.img_filenames.size(); i++)
+        myfile << inputFiles.img_filenames[i] << ", " ;
+    myfile << endl;
+    if(numPixels != 0)
+        myfile << "Num. pixels: " << numPixels << endl;
+    myfile << "\nParameters:" << endl;
+    myfile << "  Max. disparity: " << algParameters.max_disparity << endl;
+    myfile << "  Depth min: " << algParameters.depthMin << endl;
+    myfile << "  Depth max: " << algParameters.depthMax << endl;
+    myfile << "  color processing: ";
+    if(algParameters.color_processing)
+        myfile << "yes" << endl;
+    else
+        myfile << "no" << endl;
+    myfile << "  view selection: ";
+    if(algParameters.viewSelection)
+        myfile << "yes" << endl;
+    else
+        myfile << "no" << endl;
+    myfile.close();
+};
+// read ground truth depth map file (dmb) (provided by Tola et al. "DAISY: A Fast Local Descriptor for Dense Matching" http://cvlab.epfl.ch/software/daisy)
+static int readDmbNormal (const char *filename, Mat_<Vec3f> &img)
+{
+    FILE *inimage;
+    inimage = fopen(filename, "rb");
+    if (!inimage){
+        printf("Error opening file %s",filename);
+        return -1;
+    }
+
+    int32_t type, h, w, nb;
+
+    type = -1;
+
+    fread(&type,sizeof(int32_t),1,inimage);
+    fread(&h,sizeof(int32_t),1,inimage);
+    fread(&w,sizeof(int32_t),1,inimage);
+    fread(&nb,sizeof(int32_t),1,inimage);
+
+    //only support float
+    if(type != 1){
+        fclose(inimage);
+        return -1;
+    }
+
+    int32_t dataSize = h*w*nb;
+
+    float* data;
+    data = (float*) malloc (sizeof(float)*dataSize);
+    fread(data,sizeof(float),dataSize,inimage);
+
+    img = Mat(h,w,CV_32FC3,data);
+
+    fclose(inimage);
+    return 0;
+
+}
+// read ground truth depth map file (dmb) (provided by Tola et al. "DAISY: A Fast Local Descriptor for Dense Matching" http://cvlab.epfl.ch/software/daisy)
+static int readDmb(const char *filename, Mat_<float> &img)
+{
+    FILE *inimage;
+    inimage = fopen(filename, "rb");
+    if (!inimage){
+        printf("Error opening file %s",filename);
+        return -1;
+    }
+
+    int32_t type, h, w, nb;
+
+    type = -1;
+
+    fread(&type,sizeof(int32_t),1,inimage);
+    fread(&h,sizeof(int32_t),1,inimage);
+    fread(&w,sizeof(int32_t),1,inimage);
+    fread(&nb,sizeof(int32_t),1,inimage);
+
+    //only support float
+    if(type != 1){
+        fclose(inimage);
+        return -1;
+    }
+
+    int32_t dataSize = h*w*nb;
+
+    float* data;
+    data = (float*) malloc (sizeof(float)*dataSize);
+    fread(data,sizeof(float),dataSize,inimage);
+
+    img = Mat(h,w,CV_32F,data);
+
+    fclose(inimage);
+    return 0;
+
+}
+static int writeDmbNormal(const char *filename, Mat_<Vec3f> &img){
+    FILE *outimage;
+    outimage = fopen(filename, "wb");
+    if (!outimage)
+        printf("Error opening file %s",filename);
+
+    int32_t type = 1; //float
+    int32_t h = img.rows;
+    int32_t w = img.cols;
+    int32_t nb = 3;
+
+    fwrite(&type,sizeof(int32_t),1,outimage);
+    fwrite(&h,sizeof(int32_t),1,outimage);
+    fwrite(&w,sizeof(int32_t),1,outimage);
+    fwrite(&nb,sizeof(int32_t),1,outimage);
+
+    float* data = (float*)img.data;
+
+    int32_t datasize = w*h*nb;
+    fwrite(data,sizeof(float),datasize,outimage);
+
+    fclose(outimage);
+    return 0;
+}
+
+static int writeDmb(const char *filename, Mat_<float> &img){
+    FILE *outimage;
+    outimage = fopen(filename, "wb");
+    if (!outimage)
+        printf("Error opening file %s",filename);
+
+    int32_t type = 1; //float
+    int32_t h = img.rows;
+    int32_t w = img.cols;
+    int32_t nb = 1;
+
+    fwrite(&type,sizeof(int32_t),1,outimage);
+    fwrite(&h,sizeof(int32_t),1,outimage);
+    fwrite(&w,sizeof(int32_t),1,outimage);
+    fwrite(&nb,sizeof(int32_t),1,outimage);
+
+    float* data = (float*)img.data;
+
+    int32_t datasize = w*h*nb;
+    fwrite(data,sizeof(float),datasize,outimage);
+
+    fclose(outimage);
+    return 0;
+}
+
+static int readPfm( const char *filename,
+                    //double ***u, // double matrix image
+                    Mat_<float> &img,
+                    long *nx, /*image size in x direction */
+                    long *ny) /*image size in y direction */
+{
+    FILE *inimage;              /* input image FILE pointer */
+    long   i, j;                 /* loop variable */
+    char   row[4096];              /* for reading data */
+
+    /* open input pgm file and read header */
+    inimage = fopen(filename, "rb");
+    if (!inimage)
+        printf("Error opening file %s",filename);
+
+    /* calling it two times because of the P6 header */
+    fgets (row, 4096, inimage);
+    if (row[0]!='P')
+        abort();
+    printf("Row is %s\n", row);
+    char format = row[1];
+    switch (format)  /* which format to deal with */
+    {
+    case '5': /* P6 format - classic pgm */
+      printf("Opening %s P5 file\n", filename );
+      fgets (row, 4096, inimage);
+
+      while (row[0]=='#'||row[0]=='\n') fgets(row, 4096, inimage);
+      sscanf (row, "%ld %ld", nx, ny);
+      fgets (row, 4096, inimage);
+      //    fgets (row, 4096, inimage);
+
+      /* allocate storage */
+      //alloc_matrix_d (u, *nx, *ny);
+      img = Mat::zeros((int)*ny,(int)*nx,CV_32F);
+
+      /* read image data */
+      for (j=0; j<*ny; j++)
+          for (i=0; i<*nx; i++)
+              img(j,i) = (float) getc (inimage);
+              //(*u)[i][j] = (double) getc (inimage);
+      break;
+      /* PF format - pbm HDR format file */
+    case 'F':
+    case 'f':
+      printf("Opening %s PF file\n", filename );
+      fgets (row, 4096, inimage);
+
+      while (row[0]=='#'||row[0]=='\n')
+          fgets(row, 4096, inimage);
+
+      sscanf (row, "%ld %ld", nx, ny);
+
+//      fgets (row, 4096, inimage);
+      double scale;
+      fscanf(inimage, "%lf\n", &scale);
+//      printf("Scale is %f\n", scale);
+      //    fgets (row, 4096, inimage);
+
+      /* allocate storage */
+      //alloc_matrix_d (u, (*nx)*4*3, (*ny)*4*3);
+      img = Mat::zeros((int)*ny,(int)*nx,CV_32F);
+
+      float tmpfloat;
+//      printf("Float is %lu bytes big\n", sizeof(float));
+      /* read image data */
+      for (j=*ny-1; j>=0; j--) {
+          for (i=0; i<*nx; i++) {
+              //            (*u)[i][j] = (double) getc (inimage);
+              fread((void *)(&tmpfloat), sizeof(float), 1, inimage);
+              /* overwrite other 3 channels when they are available */
+              if (format=='F') {
+                  fread((void *)(&tmpfloat), sizeof(float), 1, inimage);
+                  fread((void *)(&tmpfloat), sizeof(float), 1, inimage);
+              }
+              img(j,i) = tmpfloat;
+              //(*u)[i][j] = (double) tmpfloat;
+
+              /* if positive convert to big endian */
+              if (scale > 0)
+              {
+                  char array[4];
+                  char tmpbyte;
+                  memcpy(array, &tmpfloat, sizeof (float));
+                  /* swap 0 3 */
+                  tmpbyte = array[0];
+                  array[0] = array[3];
+                  array[3] = tmpbyte;
+                  /* swap 1 2 */
+                  tmpbyte = array[1];
+                  array[1]=array[2];
+                  array[2] = tmpbyte;
+                  memcpy(&tmpfloat, array, sizeof (float));
+
+                  img(j,i) = tmpfloat;
+                  //(*u)[i][j] = (double) tmpfloat;
+              }
+          }
+      }
+      break;
+    }
+
+    fclose(inimage);
+    return 0;
+
+};
diff --git a/fusibile.cu b/fusibile.cu
new file mode 100644
index 0000000..8d2476c
--- /dev/null
+++ b/fusibile.cu
@@ -0,0 +1,439 @@
+/* vim: ft=cpp
+ * */
+
+//#include <helper_math.h>
+#ifdef _WIN32
+#include <windows.h>
+#endif
+#include <stdio.h>
+#include "globalstate.h"
+#include "algorithmparameters.h"
+#include "cameraparameters.h"
+#include "linestate.h"
+#include "imageinfo.h"
+#include "config.h"
+
+#include <vector_types.h> // float4
+#include <math.h>
+#include <cuda.h>
+#include <curand_kernel.h>
+#include "vector_operations.h"
+#include "point_cloud_list.h"
+
+#define SAVE_TEXTURE
+//#define SMOOTHNESS
+
+#define FORCEINLINE __forceinline__
+//#define FORCEINLINE
+
+
+__device__ float K[16];
+__device__ float K_inv[16];
+
+/*__device__ FORCEINLINE __constant__ float4 camerasK[32];*/
+
+/* compute depth value from disparity or disparity value from depth
+ * Input:  f         - focal length in pixel
+ *         baseline  - baseline between cameras (in meters)
+ *         d - either disparity or depth value
+ * Output: either depth or disparity value
+ */
+__device__ FORCEINLINE float disparityDepthConversion_cu ( const float &f, const float &baseline, const float &d ) {
+    return f * baseline / d;
+}
+
+/* compute depth value from disparity or disparity value from depth
+ * Input:  f         - focal length in pixel
+ *         baseline  - baseline between cameras (in meters)
+ *         d - either disparity or depth value
+ * Output: either depth or disparity value
+ */
+__device__ FORCEINLINE float disparityDepthConversion_cu2 ( const float &f, const Camera_cu &cam_ref, const Camera_cu &cam, const float &d ) {
+    float baseline = l2_float4(cam_ref.C4 - cam.C4);
+    return f * baseline / d;
+}
+
+__device__ FORCEINLINE void get3Dpoint_cu ( float4 * __restrict__ ptX, const Camera_cu &cam, const int2 &p, const float &depth ) {
+    // in case camera matrix is not normalized: see page 162, then depth might not be the real depth but w and depth needs to be computed from that first
+    const float4 pt = make_float4 (
+                                   depth * (float)p.x     - cam.P_col34.x,
+                                   depth * (float)p.y     - cam.P_col34.y,
+                                   depth                  - cam.P_col34.z,
+                                   0);
+
+    matvecmul4 (cam.M_inv, pt, ptX);
+}
+__device__ FORCEINLINE void get3Dpoint_cu1 ( float4 * __restrict__ ptX, const Camera_cu &cam, const int2 &p) {
+    // in case camera matrix is not normalized: see page 162, then depth might not be the real depth but w and depth needs to be computed from that first
+    float4 pt;
+    pt.x = (float)p.x     - cam.P_col34.x;
+    pt.y = (float)p.y     - cam.P_col34.y;
+    pt.z = 1.0f           - cam.P_col34.z;
+
+    matvecmul4 (cam.M_inv, pt, ptX);
+}
+//get d parameter of plane pi = [nT, d]T, which is the distance of the plane to the camera center
+__device__ FORCEINLINE float getPlaneDistance_cu ( const float4 &normal, const float4 &X ) {
+    return -(dot4(normal,X));
+}
+
+__device__ FORCEINLINE void normalize_cu (float4 * __restrict__ v)
+{
+    const float normSquared = pow2(v->x) + pow2(v->y) + pow2(v->z);
+    const float inverse_sqrt = rsqrtf (normSquared);
+    v->x *= inverse_sqrt;
+    v->y *= inverse_sqrt;
+    v->z *= inverse_sqrt;
+}
+__device__ FORCEINLINE void getViewVector_cu (float4 * __restrict__ v, const Camera_cu &camera, const int2 &p)
+{
+    get3Dpoint_cu1 (v, camera, p);
+    sub((*v), camera.C4);
+    normalize_cu(v);
+    //v->x=0;
+    //v->y=0;
+    //v->z=1;
+}
+
+__device__ FORCEINLINE float l1_norm(float f) {
+    return fabsf(f);
+}
+__device__ FORCEINLINE float l1_norm(float4 f) {
+    return ( fabsf (f.x) +
+             fabsf (f.y) +
+             fabsf (f.z))*0.3333333f;
+
+}
+__device__ FORCEINLINE float l1_norm2(float4 f) {
+    return ( fabsf (f.x) +
+             fabsf (f.y) +
+             fabsf (f.z));
+
+}
+
+/* get angle between two vectors in 3D
+ * Input: v1,v2 - vectors
+ * Output: angle in radian
+ */
+__device__ FORCEINLINE float getAngle_cu ( const float4 &v1, const float4 &v2 ) {
+    float angle = acosf ( dot4(v1, v2));
+    //if angle is not a number the dot product was 1 and thus the two vectors should be identical --> return 0
+    if ( angle != angle )
+        return 0.0f;
+    //if ( acosf ( v1.dot ( v2 ) ) != acosf ( v1.dot ( v2 ) ) )
+    //cout << acosf ( v1.dot ( v2 ) ) << " / " << v1.dot ( v2 )<< " / " << v1<< " / " << v2 << endl;
+    return angle;
+}
+__device__ FORCEINLINE void project_on_camera (const float4 &X, const Camera_cu &cam, float2 *pt, float *depth) {
+    float4 tmp = make_float4 (0, 0, 0, 0);
+    matvecmul4P (cam.P, X, (&tmp));
+    pt->x = tmp.x / tmp.z;
+    pt->y = tmp.y / tmp.z;
+    *depth = tmp.z;
+}
+
+/*
+ * Simple and fast depth math fusion based on depth map and normal consensus
+ */
+__global__ void fusibile (GlobalState &gs, int ref_camera)
+{
+    int2 p = make_int2 ( blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y );
+    //printf("p is %d %d\n", p.x, p.y);
+
+    const int cols = gs.cameras->cols;
+    const int rows = gs.cameras->rows;
+
+    if (p.x>=cols)
+        return;
+    if (p.y>=rows)
+        return;
+
+    const int center = p.y*cols+p.x;
+
+    const CameraParameters_cu &camParams = *(gs.cameras);
+
+    if (gs.lines[ref_camera].used_pixels[center]==1)
+        return;
+
+    //printf("ref_camera is %d\n", ref_camera);
+    const float4 normal = tex2D<float4> (gs.normals_depths[ref_camera], p.x+0.5f, p.y+0.5f);
+    //printf("Normal is %f %f %f\nDepth is %f\n", normal.x, normal.y, normal.z, normal.w);
+    /*
+     * For each point of the reference camera compute the 3d position corresponding to the corresponding depth.
+     * Create a point only if the following conditions are fulfilled:
+     * - Projected depths of other cameras does not differ more than gs.params.depthThresh
+     * - Angle of normal does not differ more than gs.params.normalThresh
+     */
+    float depth = normal.w;
+
+    float4 X;
+    get3Dpoint_cu (&X, camParams.cameras[ref_camera], p, depth);
+    //if (p.x<100 && p.y ==100)
+    //printf("3d Point is %f %f %f\n", X.x, X.y, X.z);
+    float4 consistent_X = X;
+    float4 consistent_normal  = normal;
+    float consistent_texture = tex2D<float> (gs.imgs[ref_camera], p.x+0.5f, p.y+0.5f);
+    int number_consistent = 0;
+    //int2 used_list[camParams.viewSelectionSubsetNumber];
+    int2 used_list[MAX_IMAGES];
+    for ( int i = 0; i < camParams.viewSelectionSubsetNumber; i++ ) {
+
+        int idxCurr = camParams.viewSelectionSubset[i];
+        used_list[idxCurr].x=-1;
+        used_list[idxCurr].y=-1;
+        if (idxCurr == ref_camera)
+            continue;
+
+        // Project 3d point X on camera idxCurr
+        float2 tmp_pt;
+        project_on_camera (X, camParams.cameras[idxCurr], &tmp_pt, &depth);
+        //printf("P for camera %d is \n", i);
+        //print_matrix (camParams.cameras[idxCurr].P, "camera ");
+        //printf("2d point for camera %d is %f %f\n", idxCurr, tmp_pt.x, tmp_pt.y);
+
+        // Boundary check
+        if (tmp_pt.x >=0 &&
+            tmp_pt.x < cols &&
+            tmp_pt.y >=0 &&
+            tmp_pt.y < rows) {
+            //printf("Boundary check passed\n");
+
+            // Compute interpolated depth and normal for tmp_pt w.r.t. camera ref_camera
+            float4 tmp_normal_and_depth; // first 3 components normal, fourth depth
+            tmp_normal_and_depth   = tex2D<float4> (gs.normals_depths[idxCurr], tmp_pt.x+0.5f, tmp_pt.y+0.5f);
+            //printf("New depth is %f vs %f\n", tmp_normal_and_depth.w, depth);
+
+            const float depth_disp = disparityDepthConversion_cu2                ( camParams.cameras[ref_camera].f, camParams.cameras[ref_camera], camParams.cameras[idxCurr], depth );
+            const float tmp_normal_and_depth_disp = disparityDepthConversion_cu2 ( camParams.cameras[ref_camera].f, camParams.cameras[ref_camera], camParams.cameras[idxCurr], tmp_normal_and_depth.w );
+            // First consistency check on depth
+            if (fabsf(depth_disp - tmp_normal_and_depth_disp) < gs.params->depthThresh) {
+                //printf("\tFirst consistency test passed!\n");
+                float angle = getAngle_cu (tmp_normal_and_depth, normal); // extract normal
+                if (angle < gs.params->normalThresh)
+                {
+                    //printf("\tSecond consistency test passed!\n");
+                    /// All conditions met:
+                    //  - average 3d points and normals
+                    //  - save resulting point and normal
+                    //  - (optional) average texture (not done yet)
+                    float4 tmp_X; // 3d point of consistent point on other view
+                    int2 tmp_p = make_int2 ((int) tmp_pt.x, (int) tmp_pt.y);
+
+                    get3Dpoint_cu (&tmp_X, camParams.cameras[idxCurr], tmp_p, tmp_normal_and_depth.w);
+                    consistent_X      = consistent_X      + tmp_X;
+                    //consistent_X      = tmp_X;
+                    consistent_normal = consistent_normal + tmp_normal_and_depth;
+                    if (gs.params->saveTexture)
+                        consistent_texture = consistent_texture + tex2D<float> (gs.imgs[idxCurr], tmp_pt.x+0.5f, tmp_pt.y+0.5f);
+
+
+                    // Save the point for later check
+                    //printf ("Saved point on camera %d is %d %d\n", idxCurr, (int)tmp_pt.x, (int)tmp_pt.y);
+                    used_list[idxCurr].x=(int)tmp_pt.x;
+                    used_list[idxCurr].y=(int)tmp_pt.y;
+
+                    number_consistent++;
+                }
+            }
+        }
+        else
+            continue;
+    }
+
+    // Average normals and points
+    consistent_X       = consistent_X       / ((float) number_consistent + 1.0f);
+    consistent_normal  = consistent_normal  / ((float) number_consistent + 1.0f);
+    consistent_texture = consistent_texture / ((float) number_consistent + 1.0f);
+
+    // If at least numConsistentThresh point agree:
+    // Create point
+    // Save normal
+    // (optional) save texture
+    if (number_consistent >= gs.params->numConsistentThresh) {
+        //printf("\tEnough consistent points!\nSaving point %f %f %f", consistent_X.x, consistent_X.y, consistent_X.z);
+        gs.pc->points[center].coord  = consistent_X;
+        gs.pc->points[center].normal = consistent_normal;
+
+#ifdef SAVE_TEXTURE
+        if (gs.params->saveTexture)
+            gs.pc->points[center].texture = consistent_texture;
+#endif
+
+        //// Mark corresponding point on other views as "used"
+        for ( int i = 0; i < camParams.viewSelectionSubsetNumber; i++ ) {
+            int idxCurr = camParams.viewSelectionSubset[i];
+            if (used_list[idxCurr].x==-1)
+                continue;
+            //printf("Used list point on camera %d is %d %d\n", idxCurr, used_list[idxCurr].x, used_list[idxCurr].y);
+            gs.lines[idxCurr].used_pixels [used_list[idxCurr].x + used_list[idxCurr].y*cols] = 1;
+        }
+    }
+
+    return;
+}
+/* Copy point cloud to global memory */
+//template< typename T >
+void copy_point_cloud_to_host(GlobalState &gs, int cam, PointCloudList &pc_list)
+{
+    printf("Processing camera %d\n", cam);
+    unsigned int count = pc_list.size;
+    for (int y=0; y<gs.pc->rows; y++) {
+        for (int x=0; x<gs.pc->cols; x++) {
+            Point_cu &p = gs.pc->points[x+y*gs.pc->cols];
+            const float4 X      = p.coord;
+            const float4 normal = p.normal;
+            float texture = 127.0f;
+#ifdef SAVE_TEXTURE
+            if (gs.params->saveTexture)
+                texture = p.texture;
+#endif
+            if (count==pc_list.maximum) {
+                printf("Not enough space to save points :'(\n... allocating more! :)");
+                pc_list.increase_size(pc_list.maximum*2);
+
+            }
+            if (X.x != 0 && X.y != 0 && X.z != 0) {
+                pc_list.points[count].coord   = X;
+                pc_list.points[count].normal  = normal;
+#ifdef SAVE_TEXTURE
+                pc_list.points[count].texture = texture;
+#endif
+                count++;
+            }
+            p.coord = make_float4(0,0,0,0);
+        }
+    }
+    printf("Found %.2f million points\n", count/1000000.0f);
+    pc_list.size = count;
+}
+
+template< typename T >
+void fusibile_cu(GlobalState &gs, PointCloudList &pc_list, int num_views)
+{
+#ifdef SHARED
+    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
+#endif
+
+    int rows = gs.cameras->rows;
+    int cols = gs.cameras->cols;
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    printf("Run gipuma\n");
+    /*curandState* devStates;*/
+    //cudaMalloc ( &gs.cs, rows*cols*sizeof( curandState ) );
+
+    int count = 0;
+    int i = 0;
+
+    cudaGetDeviceCount(&count);
+    if(count == 0) {
+        fprintf(stderr, "There is no device.\n");
+        return ;
+    }
+
+    for(i = 0; i < count; i++) {
+        cudaDeviceProp prop;
+        if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
+            if(prop.major >= 1) {
+                break;
+            }
+        }
+    }
+    if(i == count) {
+        fprintf(stderr, "There is no device supporting CUDA.\n");
+        return ;
+    }
+    //float mind = gs.params.min_disparity;
+    //float maxd = gs.params.max_disparity;
+    //srand(0);
+    //for(int x = 0; x < gs.cameras.cols; x++) {
+    //for(int y = 0; y < gs.cameras.rows; y++) {
+    //gs.lines.disp[y*gs.cameras.cols+x] = (float)rand()/(float)RAND_MAX * (maxd-mind) + mind;
+    //[>printf("%f\n", gs.lines.disp[y*256+x]);<]
+    //}
+    //}
+    /*printf("MAX DISP is %f\n", gs.params.max_disparity);*/
+    /*printf("MIN DISP is %f\n", gs.params.min_disparity);*/
+    cudaSetDevice(i);
+    cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 1024*128);
+    dim3 grid_size;
+    grid_size.x=(cols+BLOCK_W-1)/BLOCK_W;
+    grid_size.y=((rows/2)+BLOCK_H-1)/BLOCK_H;
+    dim3 block_size;
+    block_size.x=BLOCK_W;
+    block_size.y=BLOCK_H;
+
+    dim3 grid_size_initrand;
+    grid_size_initrand.x=(cols+32-1)/32;
+    grid_size_initrand.y=(rows+32-1)/32;
+    dim3 block_size_initrand;
+    block_size_initrand.x=32;
+    block_size_initrand.y=32;
+
+/*     printf("Launching kernel with grid of size %d %d and block of size %d %d and shared size %d %d\nBlock %d %d and radius %d %d and tile %d %d\n",
+           grid_size.x,
+           grid_size.y,
+           block_size.x,
+           block_size.y,
+           SHARED_SIZE_W,
+           SHARED_SIZE_H,
+           BLOCK_W,
+           BLOCK_H,
+           WIN_RADIUS_W,
+           WIN_RADIUS_H,
+           TILE_W,
+           TILE_H
+          );
+ */    printf("Grid size initrand is grid: %d-%d block: %d-%d\n", grid_size_initrand.x, grid_size_initrand.y, block_size_initrand.x, block_size_initrand.y);
+
+    size_t avail;
+    size_t total;
+    cudaMemGetInfo( &avail, &total );
+    size_t used = total - avail;
+    printf("Device memory used: %fMB\n", used/1000000.0f);
+    printf("Number of iterations is %d\n", gs.params->iterations);
+    printf("Blocksize is %dx%d\n", gs.params->box_hsize,gs.params->box_vsize);
+    printf("Disparity threshold is \t%f\n", gs.params->depthThresh);
+    printf("Normal threshold is \t%f\n", gs.params->normalThresh);
+    printf("Number of consistent points is \t%d\n", gs.params->numConsistentThresh);
+    printf("Cam scale is \t%f\n", gs.params->cam_scale);
+
+    //int shared_memory_size = sizeof(float)  * SHARED_SIZE ;
+    printf("Fusing points\n");
+    cudaEventRecord(start);
+
+    //printf("Computing final disparity\n");
+    //for (int cam=0; cam<10; cam++) {
+    for (int cam=0; cam<num_views; cam++) {
+        fusibile<<< grid_size_initrand, block_size_initrand, cam>>>(gs, cam);
+        cudaDeviceSynchronize();
+        copy_point_cloud_to_host(gs, cam, pc_list); // slower but saves memory
+        cudaDeviceSynchronize();
+    }
+
+    cudaEventRecord(stop);
+
+    cudaEventSynchronize(stop);
+
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    printf("\t\tELAPSED %f seconds\n", milliseconds/1000.f);
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+        printf("Error: %s\n", cudaGetErrorString(err));
+
+    // print results to file
+}
+
+int runcuda(GlobalState &gs, PointCloudList &pc_list, int num_views)
+{
+    printf("Run cuda\n");
+    /*GlobalState *gs = new GlobalState;*/
+    if(gs.params->color_processing)
+        fusibile_cu<float4>(gs, pc_list, num_views);
+    else
+        fusibile_cu<float>(gs, pc_list, num_views);
+    return 0;
+}
diff --git a/fusibile.h b/fusibile.h
new file mode 100644
index 0000000..63a3b32
--- /dev/null
+++ b/fusibile.h
@@ -0,0 +1,3 @@
+#include "globalstate.h"
+#include "point_cloud_list.h"
+int runcuda(GlobalState &gs, PointCloudList &pc_list, int num_views);
diff --git a/globalstate.h b/globalstate.h
new file mode 100644
index 0000000..aa7023f
--- /dev/null
+++ b/globalstate.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "algorithmparameters.h"
+#include "cameraparameters.h"
+#include "linestate.h"
+#include "imageinfo.h"
+#include "managed.h"
+#include "point_cloud.h"
+
+// Includes CUDA
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <builtin_types.h>
+
+// includes, cuda
+#include <vector_types.h>
+#include <driver_functions.h>
+#include <cuda_runtime.h>
+#include <cuda_texture_types.h>
+#include <cuda_gl_interop.h>
+#include <curand_kernel.h>
+
+class GlobalState : public Managed {
+public:
+    CameraParameters_cu *cameras;
+    //ImageInfo iminfo;
+    LineState *lines;
+    curandState *cs;
+    AlgorithmParameters *params;
+    PointCloud *pc;
+   // PointCloud *pc_list;
+
+    cudaTextureObject_t imgs  [MAX_IMAGES];
+    cudaTextureObject_t normals_depths  [MAX_IMAGES]; // first 3 values normal, fourth depth
+    /*cudaTextureObject_t depths  [MAX_IMAGES];*/
+    //cudaTextureObject_t gradx [MAX_IMAGES];
+    //cudaTextureObject_t grady [MAX_IMAGES];
+    void resize(int n)
+    {
+        printf("Resizing globalstate to %d\n", n);
+        cudaMallocManaged (&lines,     sizeof(LineState) * n);
+    }
+    ~GlobalState()
+    {
+        /*cudaFree (c);*/
+        cudaFree (lines);
+    }
+
+};
diff --git a/helper_cuda.h b/helper_cuda.h
new file mode 100644
index 0000000..89fd782
--- /dev/null
+++ b/helper_cuda.h
@@ -0,0 +1,1182 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef HELPER_CUDA_H
+#define HELPER_CUDA_H
+
+#pragma once
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "helper_string.h"
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note, it is required that your SDK sample to include the proper header files, please
+// refer the CUDA examples for examples of the needed CUDA headers, which may change depending
+// on which CUDA functions are used.
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error)
+{
+    switch (error)
+    {
+        case cudaSuccess:
+            return "cudaSuccess";
+
+        case cudaErrorMissingConfiguration:
+            return "cudaErrorMissingConfiguration";
+
+        case cudaErrorMemoryAllocation:
+            return "cudaErrorMemoryAllocation";
+
+        case cudaErrorInitializationError:
+            return "cudaErrorInitializationError";
+
+        case cudaErrorLaunchFailure:
+            return "cudaErrorLaunchFailure";
+
+        case cudaErrorPriorLaunchFailure:
+            return "cudaErrorPriorLaunchFailure";
+
+        case cudaErrorLaunchTimeout:
+            return "cudaErrorLaunchTimeout";
+
+        case cudaErrorLaunchOutOfResources:
+            return "cudaErrorLaunchOutOfResources";
+
+        case cudaErrorInvalidDeviceFunction:
+            return "cudaErrorInvalidDeviceFunction";
+
+        case cudaErrorInvalidConfiguration:
+            return "cudaErrorInvalidConfiguration";
+
+        case cudaErrorInvalidDevice:
+            return "cudaErrorInvalidDevice";
+
+        case cudaErrorInvalidValue:
+            return "cudaErrorInvalidValue";
+
+        case cudaErrorInvalidPitchValue:
+            return "cudaErrorInvalidPitchValue";
+
+        case cudaErrorInvalidSymbol:
+            return "cudaErrorInvalidSymbol";
+
+        case cudaErrorMapBufferObjectFailed:
+            return "cudaErrorMapBufferObjectFailed";
+
+        case cudaErrorUnmapBufferObjectFailed:
+            return "cudaErrorUnmapBufferObjectFailed";
+
+        case cudaErrorInvalidHostPointer:
+            return "cudaErrorInvalidHostPointer";
+
+        case cudaErrorInvalidDevicePointer:
+            return "cudaErrorInvalidDevicePointer";
+
+        case cudaErrorInvalidTexture:
+            return "cudaErrorInvalidTexture";
+
+        case cudaErrorInvalidTextureBinding:
+            return "cudaErrorInvalidTextureBinding";
+
+        case cudaErrorInvalidChannelDescriptor:
+            return "cudaErrorInvalidChannelDescriptor";
+
+        case cudaErrorInvalidMemcpyDirection:
+            return "cudaErrorInvalidMemcpyDirection";
+
+        case cudaErrorAddressOfConstant:
+            return "cudaErrorAddressOfConstant";
+
+        case cudaErrorTextureFetchFailed:
+            return "cudaErrorTextureFetchFailed";
+
+        case cudaErrorTextureNotBound:
+            return "cudaErrorTextureNotBound";
+
+        case cudaErrorSynchronizationError:
+            return "cudaErrorSynchronizationError";
+
+        case cudaErrorInvalidFilterSetting:
+            return "cudaErrorInvalidFilterSetting";
+
+        case cudaErrorInvalidNormSetting:
+            return "cudaErrorInvalidNormSetting";
+
+        case cudaErrorMixedDeviceExecution:
+            return "cudaErrorMixedDeviceExecution";
+
+        case cudaErrorCudartUnloading:
+            return "cudaErrorCudartUnloading";
+
+        case cudaErrorUnknown:
+            return "cudaErrorUnknown";
+
+        case cudaErrorNotYetImplemented:
+            return "cudaErrorNotYetImplemented";
+
+        case cudaErrorMemoryValueTooLarge:
+            return "cudaErrorMemoryValueTooLarge";
+
+        case cudaErrorInvalidResourceHandle:
+            return "cudaErrorInvalidResourceHandle";
+
+        case cudaErrorNotReady:
+            return "cudaErrorNotReady";
+
+        case cudaErrorInsufficientDriver:
+            return "cudaErrorInsufficientDriver";
+
+        case cudaErrorSetOnActiveProcess:
+            return "cudaErrorSetOnActiveProcess";
+
+        case cudaErrorInvalidSurface:
+            return "cudaErrorInvalidSurface";
+
+        case cudaErrorNoDevice:
+            return "cudaErrorNoDevice";
+
+        case cudaErrorECCUncorrectable:
+            return "cudaErrorECCUncorrectable";
+
+        case cudaErrorSharedObjectSymbolNotFound:
+            return "cudaErrorSharedObjectSymbolNotFound";
+
+        case cudaErrorSharedObjectInitFailed:
+            return "cudaErrorSharedObjectInitFailed";
+
+        case cudaErrorUnsupportedLimit:
+            return "cudaErrorUnsupportedLimit";
+
+        case cudaErrorDuplicateVariableName:
+            return "cudaErrorDuplicateVariableName";
+
+        case cudaErrorDuplicateTextureName:
+            return "cudaErrorDuplicateTextureName";
+
+        case cudaErrorDuplicateSurfaceName:
+            return "cudaErrorDuplicateSurfaceName";
+
+        case cudaErrorDevicesUnavailable:
+            return "cudaErrorDevicesUnavailable";
+
+        case cudaErrorInvalidKernelImage:
+            return "cudaErrorInvalidKernelImage";
+
+        case cudaErrorNoKernelImageForDevice:
+            return "cudaErrorNoKernelImageForDevice";
+
+        case cudaErrorIncompatibleDriverContext:
+            return "cudaErrorIncompatibleDriverContext";
+
+        case cudaErrorPeerAccessAlreadyEnabled:
+            return "cudaErrorPeerAccessAlreadyEnabled";
+
+        case cudaErrorPeerAccessNotEnabled:
+            return "cudaErrorPeerAccessNotEnabled";
+
+        case cudaErrorDeviceAlreadyInUse:
+            return "cudaErrorDeviceAlreadyInUse";
+
+        case cudaErrorProfilerDisabled:
+            return "cudaErrorProfilerDisabled";
+
+        case cudaErrorProfilerNotInitialized:
+            return "cudaErrorProfilerNotInitialized";
+
+        case cudaErrorProfilerAlreadyStarted:
+            return "cudaErrorProfilerAlreadyStarted";
+
+        case cudaErrorProfilerAlreadyStopped:
+            return "cudaErrorProfilerAlreadyStopped";
+
+        /* Since CUDA 4.0*/
+        case cudaErrorAssert:
+            return "cudaErrorAssert";
+
+        case cudaErrorTooManyPeers:
+            return "cudaErrorTooManyPeers";
+
+        case cudaErrorHostMemoryAlreadyRegistered:
+            return "cudaErrorHostMemoryAlreadyRegistered";
+
+        case cudaErrorHostMemoryNotRegistered:
+            return "cudaErrorHostMemoryNotRegistered";
+
+        /* Since CUDA 5.0 */
+        case cudaErrorOperatingSystem:
+            return "cudaErrorOperatingSystem";
+
+        case cudaErrorPeerAccessUnsupported:
+            return "cudaErrorPeerAccessUnsupported";
+
+        case cudaErrorLaunchMaxDepthExceeded:
+            return "cudaErrorLaunchMaxDepthExceeded";
+
+        case cudaErrorLaunchFileScopedTex:
+            return "cudaErrorLaunchFileScopedTex";
+
+        case cudaErrorLaunchFileScopedSurf:
+            return "cudaErrorLaunchFileScopedSurf";
+
+        case cudaErrorSyncDepthExceeded:
+            return "cudaErrorSyncDepthExceeded";
+
+        case cudaErrorLaunchPendingCountExceeded:
+            return "cudaErrorLaunchPendingCountExceeded";
+
+        case cudaErrorNotPermitted:
+            return "cudaErrorNotPermitted";
+
+        case cudaErrorNotSupported:
+            return "cudaErrorNotSupported";
+
+        /* Since CUDA 6.0 */
+        case cudaErrorHardwareStackError:
+            return "cudaErrorHardwareStackError";
+
+        case cudaErrorIllegalInstruction:
+            return "cudaErrorIllegalInstruction";
+
+        case cudaErrorMisalignedAddress:
+            return "cudaErrorMisalignedAddress";
+
+        case cudaErrorInvalidAddressSpace:
+            return "cudaErrorInvalidAddressSpace";
+
+        case cudaErrorInvalidPc:
+            return "cudaErrorInvalidPc";
+
+        case cudaErrorIllegalAddress:
+            return "cudaErrorIllegalAddress";
+
+        /* Since CUDA 6.5*/
+        case cudaErrorInvalidPtx:
+            return "cudaErrorInvalidPtx";
+
+        case cudaErrorInvalidGraphicsContext:
+            return "cudaErrorInvalidGraphicsContext";
+
+        case cudaErrorStartupFailure:
+            return "cudaErrorStartupFailure";
+
+        case cudaErrorApiFailureBase:
+            return "cudaErrorApiFailureBase";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef __cuda_cuda_h__
+// CUDA Driver API errors
+static const char *_cudaGetErrorEnum(CUresult error)
+{
+    switch (error)
+    {
+        case CUDA_SUCCESS:
+            return "CUDA_SUCCESS";
+
+        case CUDA_ERROR_INVALID_VALUE:
+            return "CUDA_ERROR_INVALID_VALUE";
+
+        case CUDA_ERROR_OUT_OF_MEMORY:
+            return "CUDA_ERROR_OUT_OF_MEMORY";
+
+        case CUDA_ERROR_NOT_INITIALIZED:
+            return "CUDA_ERROR_NOT_INITIALIZED";
+
+        case CUDA_ERROR_DEINITIALIZED:
+            return "CUDA_ERROR_DEINITIALIZED";
+
+        case CUDA_ERROR_PROFILER_DISABLED:
+            return "CUDA_ERROR_PROFILER_DISABLED";
+
+        case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
+            return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
+
+        case CUDA_ERROR_PROFILER_ALREADY_STARTED:
+            return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
+
+        case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
+            return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
+
+        case CUDA_ERROR_NO_DEVICE:
+            return "CUDA_ERROR_NO_DEVICE";
+
+        case CUDA_ERROR_INVALID_DEVICE:
+            return "CUDA_ERROR_INVALID_DEVICE";
+
+        case CUDA_ERROR_INVALID_IMAGE:
+            return "CUDA_ERROR_INVALID_IMAGE";
+
+        case CUDA_ERROR_INVALID_CONTEXT:
+            return "CUDA_ERROR_INVALID_CONTEXT";
+
+        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
+            return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
+
+        case CUDA_ERROR_MAP_FAILED:
+            return "CUDA_ERROR_MAP_FAILED";
+
+        case CUDA_ERROR_UNMAP_FAILED:
+            return "CUDA_ERROR_UNMAP_FAILED";
+
+        case CUDA_ERROR_ARRAY_IS_MAPPED:
+            return "CUDA_ERROR_ARRAY_IS_MAPPED";
+
+        case CUDA_ERROR_ALREADY_MAPPED:
+            return "CUDA_ERROR_ALREADY_MAPPED";
+
+        case CUDA_ERROR_NO_BINARY_FOR_GPU:
+            return "CUDA_ERROR_NO_BINARY_FOR_GPU";
+
+        case CUDA_ERROR_ALREADY_ACQUIRED:
+            return "CUDA_ERROR_ALREADY_ACQUIRED";
+
+        case CUDA_ERROR_NOT_MAPPED:
+            return "CUDA_ERROR_NOT_MAPPED";
+
+        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
+            return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
+
+        case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
+            return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
+
+        case CUDA_ERROR_ECC_UNCORRECTABLE:
+            return "CUDA_ERROR_ECC_UNCORRECTABLE";
+
+        case CUDA_ERROR_UNSUPPORTED_LIMIT:
+            return "CUDA_ERROR_UNSUPPORTED_LIMIT";
+
+        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
+            return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
+
+        case CUDA_ERROR_INVALID_SOURCE:
+            return "CUDA_ERROR_INVALID_SOURCE";
+
+        case CUDA_ERROR_FILE_NOT_FOUND:
+            return "CUDA_ERROR_FILE_NOT_FOUND";
+
+        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
+            return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
+
+        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
+            return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
+
+        case CUDA_ERROR_OPERATING_SYSTEM:
+            return "CUDA_ERROR_OPERATING_SYSTEM";
+
+        case CUDA_ERROR_INVALID_HANDLE:
+            return "CUDA_ERROR_INVALID_HANDLE";
+
+        case CUDA_ERROR_NOT_FOUND:
+            return "CUDA_ERROR_NOT_FOUND";
+
+        case CUDA_ERROR_NOT_READY:
+            return "CUDA_ERROR_NOT_READY";
+
+        case CUDA_ERROR_LAUNCH_FAILED:
+            return "CUDA_ERROR_LAUNCH_FAILED";
+
+        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
+            return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
+
+        case CUDA_ERROR_LAUNCH_TIMEOUT:
+            return "CUDA_ERROR_LAUNCH_TIMEOUT";
+
+        case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
+            return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
+
+        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
+            return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
+
+        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
+            return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
+
+        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
+            return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
+
+        case CUDA_ERROR_CONTEXT_IS_DESTROYED:
+            return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
+
+        case CUDA_ERROR_ASSERT:
+            return "CUDA_ERROR_ASSERT";
+
+        case CUDA_ERROR_TOO_MANY_PEERS:
+            return "CUDA_ERROR_TOO_MANY_PEERS";
+
+        case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
+            return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
+
+        case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
+            return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
+
+        case CUDA_ERROR_UNKNOWN:
+            return "CUDA_ERROR_UNKNOWN";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors
+static const char *_cudaGetErrorEnum(cublasStatus_t error)
+{
+    switch (error)
+    {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors
+static const char *_cudaGetErrorEnum(cufftResult error)
+{
+    switch (error)
+    {
+        case CUFFT_SUCCESS:
+            return "CUFFT_SUCCESS";
+
+        case CUFFT_INVALID_PLAN:
+            return "CUFFT_INVALID_PLAN";
+
+        case CUFFT_ALLOC_FAILED:
+            return "CUFFT_ALLOC_FAILED";
+
+        case CUFFT_INVALID_TYPE:
+            return "CUFFT_INVALID_TYPE";
+
+        case CUFFT_INVALID_VALUE:
+            return "CUFFT_INVALID_VALUE";
+
+        case CUFFT_INTERNAL_ERROR:
+            return "CUFFT_INTERNAL_ERROR";
+
+        case CUFFT_EXEC_FAILED:
+            return "CUFFT_EXEC_FAILED";
+
+        case CUFFT_SETUP_FAILED:
+            return "CUFFT_SETUP_FAILED";
+
+        case CUFFT_INVALID_SIZE:
+            return "CUFFT_INVALID_SIZE";
+
+        case CUFFT_UNALIGNED_DATA:
+            return "CUFFT_UNALIGNED_DATA";
+
+        case CUFFT_INCOMPLETE_PARAMETER_LIST:
+            return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+        case CUFFT_INVALID_DEVICE:
+            return "CUFFT_INVALID_DEVICE";
+
+        case CUFFT_PARSE_ERROR:
+            return "CUFFT_PARSE_ERROR";
+
+        case CUFFT_NO_WORKSPACE:
+            return "CUFFT_NO_WORKSPACE";
+
+        case CUFFT_NOT_IMPLEMENTED:
+            return "CUFFT_NOT_IMPLEMENTED";
+
+        case CUFFT_LICENSE_ERROR:
+            return "CUFFT_LICENSE_ERROR";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors
+static const char *_cudaGetErrorEnum(cusparseStatus_t error)
+{
+    switch (error)
+    {
+        case CUSPARSE_STATUS_SUCCESS:
+            return "CUSPARSE_STATUS_SUCCESS";
+
+        case CUSPARSE_STATUS_NOT_INITIALIZED:
+            return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+        case CUSPARSE_STATUS_ALLOC_FAILED:
+            return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+        case CUSPARSE_STATUS_INVALID_VALUE:
+            return "CUSPARSE_STATUS_INVALID_VALUE";
+
+        case CUSPARSE_STATUS_ARCH_MISMATCH:
+            return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+        case CUSPARSE_STATUS_MAPPING_ERROR:
+            return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+        case CUSPARSE_STATUS_EXECUTION_FAILED:
+            return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+        case CUSPARSE_STATUS_INTERNAL_ERROR:
+            return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(curandStatus_t error)
+{
+    switch (error)
+    {
+        case CURAND_STATUS_SUCCESS:
+            return "CURAND_STATUS_SUCCESS";
+
+        case CURAND_STATUS_VERSION_MISMATCH:
+            return "CURAND_STATUS_VERSION_MISMATCH";
+
+        case CURAND_STATUS_NOT_INITIALIZED:
+            return "CURAND_STATUS_NOT_INITIALIZED";
+
+        case CURAND_STATUS_ALLOCATION_FAILED:
+            return "CURAND_STATUS_ALLOCATION_FAILED";
+
+        case CURAND_STATUS_TYPE_ERROR:
+            return "CURAND_STATUS_TYPE_ERROR";
+
+        case CURAND_STATUS_OUT_OF_RANGE:
+            return "CURAND_STATUS_OUT_OF_RANGE";
+
+        case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+            return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+        case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+            return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+        case CURAND_STATUS_LAUNCH_FAILURE:
+            return "CURAND_STATUS_LAUNCH_FAILURE";
+
+        case CURAND_STATUS_PREEXISTING_FAILURE:
+            return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+        case CURAND_STATUS_INITIALIZATION_FAILED:
+            return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+        case CURAND_STATUS_ARCH_MISMATCH:
+            return "CURAND_STATUS_ARCH_MISMATCH";
+
+        case CURAND_STATUS_INTERNAL_ERROR:
+            return "CURAND_STATUS_INTERNAL_ERROR";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char *_cudaGetErrorEnum(NppStatus error)
+{
+    switch (error)
+    {
+        case NPP_NOT_SUPPORTED_MODE_ERROR:
+            return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+        case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+            return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+        case NPP_RESIZE_NO_OPERATION_ERROR:
+            return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+        case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+            return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+        case NPP_BAD_ARG_ERROR:
+            return "NPP_BAD_ARGUMENT_ERROR";
+
+        case NPP_COEFF_ERROR:
+            return "NPP_COEFFICIENT_ERROR";
+
+        case NPP_RECT_ERROR:
+            return "NPP_RECTANGLE_ERROR";
+
+        case NPP_QUAD_ERROR:
+            return "NPP_QUADRANGLE_ERROR";
+
+        case NPP_MEM_ALLOC_ERR:
+            return "NPP_MEMORY_ALLOCATION_ERROR";
+
+        case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+            return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+        case NPP_INVALID_INPUT:
+            return "NPP_INVALID_INPUT";
+
+        case NPP_POINTER_ERROR:
+            return "NPP_POINTER_ERROR";
+
+        case NPP_WARNING:
+            return "NPP_WARNING";
+
+        case NPP_ODD_ROI_WARNING:
+            return "NPP_ODD_ROI_WARNING";
+#else
+
+            // These are for CUDA 5.5 or higher
+        case NPP_BAD_ARGUMENT_ERROR:
+            return "NPP_BAD_ARGUMENT_ERROR";
+
+        case NPP_COEFFICIENT_ERROR:
+            return "NPP_COEFFICIENT_ERROR";
+
+        case NPP_RECTANGLE_ERROR:
+            return "NPP_RECTANGLE_ERROR";
+
+        case NPP_QUADRANGLE_ERROR:
+            return "NPP_QUADRANGLE_ERROR";
+
+        case NPP_MEMORY_ALLOCATION_ERR:
+            return "NPP_MEMORY_ALLOCATION_ERROR";
+
+        case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+            return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+        case NPP_INVALID_HOST_POINTER_ERROR:
+            return "NPP_INVALID_HOST_POINTER_ERROR";
+
+        case NPP_INVALID_DEVICE_POINTER_ERROR:
+            return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+        case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+            return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+        case NPP_TEXTURE_BIND_ERROR:
+            return "NPP_TEXTURE_BIND_ERROR";
+
+        case NPP_WRONG_INTERSECTION_ROI_ERROR:
+            return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+        case NPP_NOT_EVEN_STEP_ERROR:
+            return "NPP_NOT_EVEN_STEP_ERROR";
+
+        case NPP_INTERPOLATION_ERROR:
+            return "NPP_INTERPOLATION_ERROR";
+
+        case NPP_RESIZE_FACTOR_ERROR:
+            return "NPP_RESIZE_FACTOR_ERROR";
+
+        case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+            return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+        case NPP_MEMFREE_ERR:
+            return "NPP_MEMFREE_ERR";
+
+        case NPP_MEMSET_ERR:
+            return "NPP_MEMSET_ERR";
+
+        case NPP_MEMCPY_ERR:
+            return "NPP_MEMCPY_ERROR";
+
+        case NPP_MIRROR_FLIP_ERR:
+            return "NPP_MIRROR_FLIP_ERR";
+#else
+
+        case NPP_MEMFREE_ERROR:
+            return "NPP_MEMFREE_ERROR";
+
+        case NPP_MEMSET_ERROR:
+            return "NPP_MEMSET_ERROR";
+
+        case NPP_MEMCPY_ERROR:
+            return "NPP_MEMCPY_ERROR";
+
+        case NPP_MIRROR_FLIP_ERROR:
+            return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+        case NPP_ALIGNMENT_ERROR:
+            return "NPP_ALIGNMENT_ERROR";
+
+        case NPP_STEP_ERROR:
+            return "NPP_STEP_ERROR";
+
+        case NPP_SIZE_ERROR:
+            return "NPP_SIZE_ERROR";
+
+        case NPP_NULL_POINTER_ERROR:
+            return "NPP_NULL_POINTER_ERROR";
+
+        case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+            return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+        case NPP_NOT_IMPLEMENTED_ERROR:
+            return "NPP_NOT_IMPLEMENTED_ERROR";
+
+        case NPP_ERROR:
+            return "NPP_ERROR";
+
+        case NPP_SUCCESS:
+            return "NPP_SUCCESS";
+
+        case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+            return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+        case NPP_MISALIGNED_DST_ROI_WARNING:
+            return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+        case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+            return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+        case NPP_DOUBLE_SIZE_WARNING:
+            return "NPP_DOUBLE_SIZE_WARNING";
+
+        case NPP_WRONG_INTERSECTION_ROI_WARNING:
+            return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+        /* These are 6.0 or higher */
+        case NPP_LUT_PALETTE_BITSIZE_ERROR:
+            return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+        case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+            return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+        case NPP_QUALITY_INDEX_ERROR:
+            return "NPP_QUALITY_INDEX_ERROR";
+
+        case NPP_CHANNEL_ORDER_ERROR:
+            return "NPP_CHANNEL_ORDER_ERROR";
+
+        case NPP_ZERO_MASK_VALUE_ERROR:
+            return "NPP_ZERO_MASK_VALUE_ERROR";
+
+        case NPP_NUMBER_OF_CHANNELS_ERROR:
+            return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+        case NPP_COI_ERROR:
+            return "NPP_COI_ERROR";
+
+        case NPP_DIVISOR_ERROR:
+            return "NPP_DIVISOR_ERROR";
+
+        case NPP_CHANNEL_ERROR:
+            return "NPP_CHANNEL_ERROR";
+
+        case NPP_STRIDE_ERROR:
+            return "NPP_STRIDE_ERROR";
+
+        case NPP_ANCHOR_ERROR:
+            return "NPP_ANCHOR_ERROR";
+
+        case NPP_MASK_SIZE_ERROR:
+            return "NPP_MASK_SIZE_ERROR";
+
+        case NPP_MOMENT_00_ZERO_ERROR:
+            return "NPP_MOMENT_00_ZERO_ERROR";
+
+        case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+            return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+        case NPP_THRESHOLD_ERROR:
+            return "NPP_THRESHOLD_ERROR";
+
+        case NPP_CONTEXT_MATCH_ERROR:
+            return "NPP_CONTEXT_MATCH_ERROR";
+
+        case NPP_FFT_FLAG_ERROR:
+            return "NPP_FFT_FLAG_ERROR";
+
+        case NPP_FFT_ORDER_ERROR:
+            return "NPP_FFT_ORDER_ERROR";
+
+        case NPP_SCALE_RANGE_ERROR:
+            return "NPP_SCALE_RANGE_ERROR";
+
+        case NPP_DATA_TYPE_ERROR:
+            return "NPP_DATA_TYPE_ERROR";
+
+        case NPP_OUT_OFF_RANGE_ERROR:
+            return "NPP_OUT_OFF_RANGE_ERROR";
+
+        case NPP_DIVIDE_BY_ZERO_ERROR:
+            return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+        case NPP_RANGE_ERROR:
+            return "NPP_RANGE_ERROR";
+
+        case NPP_NO_MEMORY_ERROR:
+            return "NPP_NO_MEMORY_ERROR";
+
+        case NPP_ERROR_RESERVED:
+            return "NPP_ERROR_RESERVED";
+
+        case NPP_NO_OPERATION_WARNING:
+            return "NPP_NO_OPERATION_WARNING";
+
+        case NPP_DIVIDE_BY_ZERO_WARNING:
+            return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef __DRIVER_TYPES_H__
+#ifndef DEVICE_RESET
+#define DEVICE_RESET cudaDeviceReset();
+#endif
+#else
+#ifndef DEVICE_RESET
+#define DEVICE_RESET
+#endif
+#endif
+
+template< typename T >
+void check(T result, char const *const func, const char *const file, int const line)
+{
+    if (result)
+    {
+        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
+                file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+        DEVICE_RESET
+        // Make sure we call CUDA Device Reset before exiting
+        exit(EXIT_FAILURE);
+    }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg)      __getLastCudaError (msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
+{
+    cudaError_t err = cudaGetLastError();
+
+    if (cudaSuccess != err)
+    {
+        fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
+                file, line, errorMessage, (int)err, cudaGetErrorString(err));
+        DEVICE_RESET
+        exit(EXIT_FAILURE);
+    }
+}
+#endif
+
+#ifndef MAX
+#define MAX(a,b) (a > b ? a : b)
+#endif
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
+    typedef struct
+    {
+        int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
+        int Cores;
+    } sSMtoCores;
+
+    sSMtoCores nGpuArchCoresPerSM[] =
+    {
+        { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
+        { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
+        { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
+        { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
+        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+        { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
+        { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
+        { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
+        { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
+        {   -1, -1 }
+    };
+
+    int index = 0;
+
+    while (nGpuArchCoresPerSM[index].SM != -1)
+    {
+        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+        {
+            return nGpuArchCoresPerSM[index].Cores;
+        }
+
+        index++;
+    }
+
+    // If we don't find the values, we default use the previous one to run properly
+    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
+    return nGpuArchCoresPerSM[index-1].Cores;
+}
+// end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInit(int devID)
+{
+    int device_count;
+    checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+    if (device_count == 0)
+    {
+        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    if (devID < 0)
+    {
+        devID = 0;
+    }
+
+    if (devID > device_count-1)
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
+        fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
+        fprintf(stderr, "\n");
+        return -devID;
+    }
+
+    cudaDeviceProp deviceProp;
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+
+    if (deviceProp.computeMode == cudaComputeModeProhibited)
+    {
+        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
+        return -1;
+    }
+
+    if (deviceProp.major < 1)
+    {
+        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    checkCudaErrors(cudaSetDevice(devID));
+    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
+
+    return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS)
+inline int gpuGetMaxGflopsDeviceId()
+{
+    int current_device     = 0, sm_per_multiproc  = 0;
+    int max_perf_device    = 0;
+    int device_count       = 0, best_SM_arch      = 0;
+    int devices_prohibited = 0;
+
+    unsigned long long max_compute_perf = 0;
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceCount(&device_count);
+
+    checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+    if (device_count == 0)
+    {
+        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Find the best major SM Architecture GPU device
+    while (current_device < device_count)
+    {
+        cudaGetDeviceProperties(&deviceProp, current_device);
+
+        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
+        if (deviceProp.computeMode != cudaComputeModeProhibited)
+        {
+            if (deviceProp.major > 0 && deviceProp.major < 9999)
+            {
+                best_SM_arch = MAX(best_SM_arch, deviceProp.major);
+            }
+        }
+        else
+        {
+            devices_prohibited++;
+        }
+
+        current_device++;
+    }
+
+    if (devices_prohibited == device_count)
+    {
+    	fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n");
+    	exit(EXIT_FAILURE);
+    }
+
+    // Find the best CUDA capable GPU device
+    current_device = 0;
+
+    while (current_device < device_count)
+    {
+        cudaGetDeviceProperties(&deviceProp, current_device);
+
+        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
+        if (deviceProp.computeMode != cudaComputeModeProhibited)
+        {
+            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
+            {
+                sm_per_multiproc = 1;
+            }
+            else
+            {
+                sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
+            }
+
+            unsigned long long compute_perf  = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
+
+            if (compute_perf  > max_compute_perf)
+            {
+                // If we find GPU with SM major > 2, search only these
+                if (best_SM_arch > 2)
+                {
+                    // If our device==dest_SM_arch, choose this, or else pass
+                    if (deviceProp.major == best_SM_arch)
+                    {
+                        max_compute_perf  = compute_perf;
+                        max_perf_device   = current_device;
+                    }
+                }
+                else
+                {
+                    max_compute_perf  = compute_perf;
+                    max_perf_device   = current_device;
+                }
+            }
+        }
+
+        ++current_device;
+    }
+
+    return max_perf_device;
+}
+
+
+// Initialization code to find the best CUDA Device
+inline int findCudaDevice(int argc, const char **argv)
+{
+    cudaDeviceProp deviceProp;
+    int devID = 0;
+
+    // If the command-line has a device number specified, use it
+    if (checkCmdLineFlag(argc, argv, "device"))
+    {
+        devID = getCmdLineArgumentInt(argc, argv, "device=");
+
+        if (devID < 0)
+        {
+            printf("Invalid command line parameter\n ");
+            exit(EXIT_FAILURE);
+        }
+        else
+        {
+            devID = gpuDeviceInit(devID);
+
+            if (devID < 0)
+            {
+                printf("exiting...\n");
+                exit(EXIT_FAILURE);
+            }
+        }
+    }
+    else
+    {
+        // Otherwise pick the device with highest Gflops/s
+        devID = gpuGetMaxGflopsDeviceId();
+        checkCudaErrors(cudaSetDevice(devID));
+        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
+    }
+
+    return devID;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilities(int major_version, int minor_version)
+{
+    cudaDeviceProp deviceProp;
+    deviceProp.major = 0;
+    deviceProp.minor = 0;
+    int dev;
+
+    checkCudaErrors(cudaGetDevice(&dev));
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+
+    if ((deviceProp.major > major_version) ||
+        (deviceProp.major == major_version && deviceProp.minor >= minor_version))
+    {
+        printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
+        return true;
+    }
+    else
+    {
+        printf("  No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
+        return false;
+    }
+}
+#endif
+
+// end of CUDA Helper Functions
+
+
+#endif
diff --git a/helper_string.h b/helper_string.h
new file mode 100644
index 0000000..cdf35df
--- /dev/null
+++ b/helper_string.h
@@ -0,0 +1,498 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef STRING_HELPER_H
+#define STRING_HELPER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP  _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP  strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+inline int stringRemoveDelimiter(char delimiter, const char *string)
+{
+    int string_start = 0;
+
+    while (string[string_start] == delimiter)
+    {
+        string_start++;
+    }
+
+    if (string_start >= (int)strlen(string)-1)
+    {
+        return 0;
+    }
+
+    return string_start;
+}
+
+inline int getFileExtension(char *filename, char **extension)
+{
+    int string_length = (int)strlen(filename);
+
+    while (filename[string_length--] != '.')
+    {
+        if (string_length == 0)
+            break;
+    }
+
+    if (string_length > 0) string_length += 2;
+
+    if (string_length == 0)
+        *extension = NULL;
+    else
+        *extension = &filename[string_length];
+
+    return string_length;
+}
+
+
+inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+
+            const char *equal_pos = strchr(string_argv, '=');
+            int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+            int length = (int)strlen(string_ref);
+
+            if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
+            {
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    return bFound;
+}
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value)
+{
+    bool bFound = false;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    *value = (T)atoi(&string_argv[length + auto_inc]);
+                }
+
+                bFound = true;
+                i=argc;
+            }
+        }
+    }
+
+    return bFound;
+}
+
+inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+    int value = -1;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    value = atoi(&string_argv[length + auto_inc]);
+                }
+                else
+                {
+                    value = 0;
+                }
+
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (bFound)
+    {
+        return value;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+    float value = -1;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    value = (float)atof(&string_argv[length + auto_inc]);
+                }
+                else
+                {
+                    value = 0.f;
+                }
+
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (bFound)
+    {
+        return value;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref, char **string_retval)
+{
+    bool bFound = false;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            char *string_argv = (char *)&argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                *string_retval = &string_argv[length+1];
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (!bFound)
+    {
+        *string_retval = NULL;
+    }
+
+    return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename, const char *executable_path)
+{
+    // <executable_name> defines a variable that is replaced with the name of the executable
+
+    // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files)
+    // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc
+    const char *searchPath[] =
+    {
+        "./",                                       // same dir
+        "./common/",                                // "/common/" subdir
+        "./common/data/",                           // "/common/data/" subdir
+        "./data/",                                  // "/data/" subdir
+        "./src/",                                   // "/src/" subdir
+        "./src/<executable_name>/data/",            // "/src/<executable_name>/data/" subdir
+        "./inc/",                                   // "/inc/" subdir
+        "./0_Simple/",                              // "/0_Simple/" subdir
+        "./1_Utilities/",                           // "/1_Utilities/" subdir
+        "./2_Graphics/",                            // "/2_Graphics/" subdir
+        "./3_Imaging/",                             // "/3_Imaging/" subdir
+        "./4_Financial/",                           // "/4_Financial/" subdir
+        "./5_Simulations/",                         // "/5_Simulations/" subdir
+        "./6_Advanced/",                            // "/6_Advanced/" subdir
+        "./7_CUDALibraries/",                       // "/7_CUDALibraries/" subdir
+        "./8_Android/",                             // "/8_Android/" subdir
+        "./samples/",                               // "/samples/" subdir
+
+        "../",                                      // up 1 in tree
+        "../common/",                               // up 1 in tree, "/common/" subdir
+        "../common/data/",                          // up 1 in tree, "/common/data/" subdir
+        "../data/",                                 // up 1 in tree, "/data/" subdir
+        "../src/",                                  // up 1 in tree, "/src/" subdir
+        "../inc/",                                  // up 1 in tree, "/inc/" subdir
+
+        "../0_Simple/<executable_name>/data/",       // up 1 in tree, "/0_Simple/<executable_name>/" subdir
+        "../1_Utilities/<executable_name>/data/",    // up 1 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../2_Graphics/<executable_name>/data/",     // up 1 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../3_Imaging/<executable_name>/data/",      // up 1 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../4_Financial/<executable_name>/data/",    // up 1 in tree, "/4_Financial/<executable_name>/" subdir
+        "../5_Simulations/<executable_name>/data/",  // up 1 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../6_Advanced/<executable_name>/data/",     // up 1 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../7_CUDALibraries/<executable_name>/data/",// up 1 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../8_Android/<executable_name>/data/",      // up 1 in tree, "/8_Android/<executable_name>/" subdir
+        "../samples/<executable_name>/data/",        // up 1 in tree, "/samples/<executable_name>/" subdir
+        "../../",                                        // up 2 in tree
+        "../../common/",                                 // up 2 in tree, "/common/" subdir
+        "../../common/data/",                            // up 2 in tree, "/common/data/" subdir
+        "../../data/",                                   // up 2 in tree, "/data/" subdir
+        "../../src/",                                    // up 2 in tree, "/src/" subdir
+        "../../inc/",                                    // up 2 in tree, "/inc/" subdir
+        "../../sandbox/<executable_name>/data/",         // up 2 in tree, "/sandbox/<executable_name>/" subdir
+        "../../0_Simple/<executable_name>/data/",        // up 2 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../1_Utilities/<executable_name>/data/",     // up 2 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../2_Graphics/<executable_name>/data/",      // up 2 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../3_Imaging/<executable_name>/data/",       // up 2 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../4_Financial/<executable_name>/data/",     // up 2 in tree, "/4_Financial/<executable_name>/" subdir
+        "../../5_Simulations/<executable_name>/data/",   // up 2 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../6_Advanced/<executable_name>/data/",      // up 2 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../8_Android/<executable_name>/data/",       // up 2 in tree, "/8_Android/<executable_name>/" subdir
+        "../../samples/<executable_name>/data/",         // up 2 in tree, "/samples/<executable_name>/" subdir
+        "../../../",                                        // up 3 in tree
+        "../../../src/<executable_name>/",                  // up 3 in tree, "/src/<executable_name>/" subdir
+        "../../../src/<executable_name>/data/",             // up 3 in tree, "/src/<executable_name>/data/" subdir
+        "../../../src/<executable_name>/src/",              // up 3 in tree, "/src/<executable_name>/src/" subdir
+        "../../../src/<executable_name>/inc/",              // up 3 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../sandbox/<executable_name>/",              // up 3 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../sandbox/<executable_name>/data/",         // up 3 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../sandbox/<executable_name>/src/",          // up 3 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../sandbox/<executable_name>/inc/",          // up 3 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../0_Simple/<executable_name>/data/",        // up 3 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../1_Utilities/<executable_name>/data/",     // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../2_Graphics/<executable_name>/data/",      // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../3_Imaging/<executable_name>/data/",       // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../4_Financial/<executable_name>/data/",     // up 3 in tree, "/4_Financial/<executable_name>/" subdir
+        "../../../5_Simulations/<executable_name>/data/",   // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../6_Advanced/<executable_name>/data/",      // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../8_Android/<executable_name>/data/",       // up 3 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../samples/<executable_name>/data/",         // up 3 in tree, "/samples/<executable_name>/" subdir
+        "../../../common/",                                 // up 3 in tree, "../../../common/" subdir
+        "../../../common/data/",                            // up 3 in tree, "../../../common/data/" subdir
+        "../../../data/",                                   // up 3 in tree, "../../../data/" subdir
+        "../../../../",                                // up 4 in tree
+        "../../../../src/<executable_name>/",          // up 4 in tree, "/src/<executable_name>/" subdir
+        "../../../../src/<executable_name>/data/",     // up 4 in tree, "/src/<executable_name>/data/" subdir
+        "../../../../src/<executable_name>/src/",      // up 4 in tree, "/src/<executable_name>/src/" subdir
+        "../../../../src/<executable_name>/inc/",      // up 4 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../../sandbox/<executable_name>/",      // up 4 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../../sandbox/<executable_name>/data/", // up 4 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../../sandbox/<executable_name>/src/",  // up 4 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../../sandbox/<executable_name>/inc/",   // up 4 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../../0_Simple/<executable_name>/data/",     // up 4 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../2_Graphics/<executable_name>/data/",   // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../3_Imaging/<executable_name>/data/",    // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../4_Financial/<executable_name>/data/",  // up 4 in tree, "/4_Financial/<executable_name>/" subdir
+        "../../../../5_Simulations/<executable_name>/data/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../../6_Advanced/<executable_name>/data/",   // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../8_Android/<executable_name>/data/",    // up 4 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../samples/<executable_name>/data/",      // up 4 in tree, "/samples/<executable_name>/" subdir
+        "../../../../common/",                              // up 4 in tree, "../../../common/" subdir
+        "../../../../common/data/",                         // up 4 in tree, "../../../common/data/" subdir
+        "../../../../data/",                                // up 4 in tree, "../../../data/" subdir
+        "../../../../../",                                // up 5 in tree
+        "../../../../../src/<executable_name>/",          // up 5 in tree, "/src/<executable_name>/" subdir
+        "../../../../../src/<executable_name>/data/",     // up 5 in tree, "/src/<executable_name>/data/" subdir
+        "../../../../../src/<executable_name>/src/",      // up 5 in tree, "/src/<executable_name>/src/" subdir
+        "../../../../../src/<executable_name>/inc/",      // up 5 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../../../sandbox/<executable_name>/",      // up 5 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../../../sandbox/<executable_name>/data/", // up 5 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../../../sandbox/<executable_name>/inc/",   // up 5 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../../../0_Simple/<executable_name>/data/",     // up 5 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../../2_Graphics/<executable_name>/data/",   // up 5 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../../3_Imaging/<executable_name>/data/",    // up 5 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../../4_Financial/<executable_name>/data/",  // up 5 in tree, "/4_Financial/<executable_name>/" subdir
+        "../../../../../5_Simulations/<executable_name>/data/",// up 5 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../../../6_Advanced/<executable_name>/data/",   // up 5 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../../8_Android/<executable_name>/data/",    // up 5 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../../samples/<executable_name>/data/",      // up 5 in tree, "/samples/<executable_name>/" subdir
+        "../../../../../common/",                         // up 5 in tree, "../../../common/" subdir
+        "../../../../../common/data/",                    // up 5 in tree, "../../../common/data/" subdir
+    };
+
+    // Extract the executable name
+    std::string executable_name;
+
+    if (executable_path != 0)
+    {
+        executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        // Windows path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('\\');
+        executable_name.erase(0, delimiter_pos + 1);
+
+        if (executable_name.rfind(".exe") != std::string::npos)
+        {
+            // we strip .exe, only if the .exe is found
+            executable_name.resize(executable_name.size() - 4);
+        }
+
+#else
+        // Linux & OSX path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('/');
+        executable_name.erase(0,delimiter_pos+1);
+#endif
+    }
+
+    // Loop over all search paths and return the first hit
+    for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i)
+    {
+        std::string path(searchPath[i]);
+        size_t executable_name_pos = path.find("<executable_name>");
+
+        // If there is executable_name variable in the searchPath
+        // replace it with the value
+        if (executable_name_pos != std::string::npos)
+        {
+            if (executable_path != 0)
+            {
+                path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);
+            }
+            else
+            {
+                // Skip this path entry if no executable argument is given
+                continue;
+            }
+        }
+
+#ifdef _DEBUG
+        printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+        // Test if the file exists
+        path.append(filename);
+        FILE *fp;
+        FOPEN(fp, path.c_str(), "rb");
+
+        if (fp != NULL)
+        {
+            fclose(fp);
+            // File found
+            // returning an allocated array here for backwards compatibility reasons
+            char *file_path = (char *) malloc(path.length() + 1);
+            STRCPY(file_path, path.length() + 1, path.c_str());
+            return file_path;
+        }
+
+        if (fp)
+        {
+            fclose(fp);
+        }
+    }
+
+    // File not found
+    return 0;
+}
+
+#endif
diff --git a/imageinfo.h b/imageinfo.h
new file mode 100644
index 0000000..b84a815
--- /dev/null
+++ b/imageinfo.h
@@ -0,0 +1,15 @@
+#pragma once
+#include "managed.h"
+
+class __align__(128) ImageInfo : public Managed{
+public:
+    // Image size
+    int cols;
+    int rows;
+
+    // Total number of pixels
+    int np;
+
+    // Total number of bytes (may be different when padded)
+    int nb;
+};
diff --git a/linestate.h b/linestate.h
new file mode 100644
index 0000000..3d0c8f1
--- /dev/null
+++ b/linestate.h
@@ -0,0 +1,23 @@
+#pragma once
+#include <string.h> // memset()
+#include "algorithmparameters.h"
+#include "cameraparameters.h"
+#include "managed.h"
+#include <vector_types.h> // float4
+
+class __align__(128) LineState : public Managed {
+public:
+    char *used_pixels; // disparity*/*/
+    int n;
+    int s; // stride
+    int l; // length
+    void resize(int n)
+    {
+        cudaMallocManaged (&used_pixels,        sizeof(char) * n);
+        memset            (used_pixels,      0, sizeof(char) * n);
+    }
+    ~LineState()
+    {
+        cudaFree (used_pixels);
+    }
+};
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000..3370ef7
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,879 @@
+#ifdef _WIN32
+#include <windows.h>
+#include <ctime>
+#include <direct.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include <sys/types.h>
+#include <dirent.h>
+
+
+// Includes CUDA
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_texture_types.h>
+#include <vector_types.h>
+#include <unistd.h>
+
+// CUDA helper functions
+#include "helper_cuda.h"         // helper functions for CUDA error check
+
+#include <map> // multimap
+
+#include <sys/stat.h> // mkdir
+#include <sys/types.h> // mkdir
+
+
+//#include "camera.h"
+#include "algorithmparameters.h"
+#include "globalstate.h"
+#include "fusibile.h"
+
+#include "main.h"
+#include "fileIoUtils.h"
+#include "cameraGeometryUtils.h"
+#include "mathUtils.h"
+#include "displayUtils.h"
+#include "point_cloud_list.h"
+
+#define MAX_NR_POINTS 500000
+
+struct InputData{
+    string path;
+    //int id;
+    string id;
+    int camId;
+    Camera cam;
+    Mat_<float> depthMap;
+    Mat_<Vec3b> inputImage;
+    Mat_<Vec3f> normals;
+};
+
+int getCameraFromId(string id, vector<Camera> &cameras){
+    for(int i =0; i< cameras.size(); i++) {
+        //cout << "Checking camera id " << i << " cameraid " << cameras[i].id << endl;
+        if(cameras[i].id.compare(id) == 0)
+            return i;
+    }
+    return -1;
+}
+static void get_subfolders(
+                           const char *dirname,
+                           vector<string> &subfolders)
+{
+    DIR *dir;
+    struct dirent *ent;
+
+    // Open directory stream
+    dir = opendir (dirname);
+    if (dir != NULL) {
+        //cout << "Dirname is " << dirname << endl;
+        //cout << "Dirname type is " << ent->d_type << endl;
+        //cout << "Dirname type DT_DIR " << DT_DIR << endl;
+
+        // Print all files and directories within the directory
+        while ((ent = readdir (dir)) != NULL) {
+            //cout << "INSIDE" << endl;
+            //if(ent->d_type == DT_DIR)
+            {
+                char* name = ent->d_name;
+                if(strcmp(name,".") == 0 || strcmp(ent->d_name,"..") == 0)
+                    continue;
+                //printf ("dir %s/\n", name);
+                subfolders.push_back(string(name));
+            }
+        }
+
+        closedir (dir);
+
+    } else {
+        // Could not open directory
+        printf ("Cannot open directory %s\n", dirname);
+        exit (EXIT_FAILURE);
+    }
+}
+
+static void print_help ()
+{
+    printf ( "\nfusibile\n" );
+}
+
+/* process command line arguments
+ * Input: argc, argv - command line arguments
+ * Output: inputFiles, outputFiles, parameters, gt_parameters, no_display - algorithm parameters
+ */
+static int getParametersFromCommandLine ( int argc,
+                                          char** argv,
+                                          InputFiles &inputFiles,
+                                          OutputFiles &outputFiles,
+                                          AlgorithmParameters &parameters,
+                                          GTcheckParameters &gt_parameters,
+                                          bool &no_display )
+{
+    const char* algorithm_opt     = "--algorithm=";
+    const char* maxdisp_opt       = "--max-disparity=";
+    const char* blocksize_opt     = "--blocksize=";
+    const char* cost_tau_color_opt   = "--cost_tau_color=";
+    const char* cost_tau_gradient_opt   = "--cost_tau_gradient=";
+    const char* cost_alpha_opt   = "--cost_alpha=";
+    const char* cost_gamma_opt   = "--cost_gamma=";
+    const char* disparity_tolerance_opt = "--disp_tol=";
+    const char* normal_tolerance_opt = "--norm_tol=";
+    const char* border_value = "--border_value="; //either constant scalar or -1 = REPLICATE
+    const char* gtDepth_divFactor_opt = "--gtDepth_divisionFactor=";
+    const char* gtDepth_tolerance_opt = "--gtDepth_tolerance=";
+    const char* gtDepth_tolerance2_opt = "--gtDepth_tolerance2=";
+    const char* nodisplay_opt     = "-no_display";
+    const char* colorProc_opt = "-color_processing";
+    const char* num_iterations_opt = "--iterations=";
+    const char* self_similariy_n_opt = "--ss_n=";
+    const char* ct_epsilon_opt = "--ct_eps=";
+    const char* cam_scale_opt = "--cam_scale=";
+    const char* num_img_processed_opt = "--num_img_processed=";
+    const char* n_best_opt = "--n_best=";
+    const char* cost_comb_opt = "--cost_comb=";
+    const char* cost_good_factor_opt = "--good_factor=";
+    const char* depth_min_opt = "--depth_min=";
+    const char* depth_max_opt = "--depth_max=";
+    //    const char* scale_opt         = "--scale=";
+    const char* outputPath_opt = "-output_folder";
+    const char* calib_opt = "-calib_file";
+    const char* gt_opt = "-gt";
+    const char* gt_nocc_opt = "-gt_nocc";
+    const char* occl_mask_opt = "-occl_mask";
+    const char* gt_normal_opt = "-gt_normal";
+    const char* images_input_folder_opt = "-images_folder";
+    const char* p_input_folder_opt = "-p_folder";
+    const char* krt_file_opt = "-krt_file";
+    const char* camera_input_folder_opt = "-camera_folder";
+    const char* bounding_folder_opt = "-bounding_folder";
+    const char* viewSelection_opt = "-view_selection";
+    const char* initial_seed_opt = "--initial_seed";
+
+    const char* disp_thresh_opt = "--disp_thresh=";
+    const char* normal_thresh_opt = "--normal_thresh=";
+    const char* num_consistent_opt = "--num_consistent=";
+
+    //read in arguments
+    for ( int i = 1; i < argc; i++ )
+    {
+        if ( argv[i][0] != '-' )
+        {
+            inputFiles.img_filenames.push_back ( argv[i] );
+            /*if( inputFiles.imgLeft_filename.empty() )
+              inputFiles.imgLeft_filename = argv[i];
+              else if( inputFiles.imgRight_filename.empty() )
+              inputFiles.imgRight_filename = argv[i];
+              */
+        }
+        else if ( strncmp ( argv[i], algorithm_opt, strlen ( algorithm_opt ) ) == 0 )
+        {
+            char* _alg = argv[i] + strlen ( algorithm_opt );
+            parameters.algorithm = strcmp ( _alg, "pm" ) == 0 ? PM_COST :
+            strcmp ( _alg, "ct" ) == 0 ? CENSUS_TRANSFORM :
+            strcmp ( _alg, "sct" ) == 0 ? SPARSE_CENSUS :
+            strcmp ( _alg, "ct_ss" ) == 0 ? CENSUS_SELFSIMILARITY :
+            strcmp ( _alg, "adct" ) == 0 ? ADCENSUS :
+            strcmp ( _alg, "adct_ss" ) == 0 ? ADCENSUS_SELFSIMILARITY :
+            strcmp ( _alg, "pm_ss" ) == 0 ? PM_SELFSIMILARITY : -1;
+            if ( parameters.algorithm < 0 )
+            {
+                printf ( "Command-line parameter error: Unknown stereo algorithm\n\n" );
+                print_help ();
+                return -1;
+            }
+        }
+        else if ( strncmp ( argv[i], cost_comb_opt, strlen ( cost_comb_opt ) ) == 0 )
+        {
+            char* _alg = argv[i] + strlen ( algorithm_opt );
+            parameters.cost_comb = strcmp ( _alg, "all" ) == 0 ? COMB_ALL :
+            strcmp ( _alg, "best_n" ) == 0 ? COMB_BEST_N :
+            strcmp ( _alg, "angle" ) == 0 ? COMB_ANGLE :
+            strcmp ( _alg, "good" ) == 0 ? COMB_GOOD : -1;
+            if ( parameters.cost_comb < 0 )
+            {
+                printf ( "Command-line parameter error: Unknown cost combination method\n\n" );
+                print_help ();
+                return -1;
+            }
+        }
+        else if ( strncmp ( argv[i], maxdisp_opt, strlen ( maxdisp_opt ) ) == 0 )
+        {
+            if ( sscanf ( argv[i] + strlen ( maxdisp_opt ), "%f", &parameters.max_disparity ) != 1 ||
+                 parameters.max_disparity < 1  )
+            {
+                printf ( "Command-line parameter error: The max disparity (--maxdisparity=<...>) must be a positive integer \n" );
+                print_help ();
+                return -1;
+            }
+        }
+        else if ( strncmp ( argv[i], blocksize_opt, strlen ( blocksize_opt ) ) == 0 )
+        {
+            int k_size;
+            if ( sscanf ( argv[i] + strlen ( blocksize_opt ), "%d", &k_size ) != 1 ||
+                 k_size < 1 || k_size % 2 != 1 )
+            {
+                printf ( "Command-line parameter error: The block size (--blocksize=<...>) must be a positive odd number\n" );
+                return -1;
+            }
+            parameters.box_hsize = k_size;
+            parameters.box_vsize = k_size;
+        }
+        else if ( strncmp ( argv[i], cost_good_factor_opt, strlen ( cost_good_factor_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( cost_good_factor_opt ), "%f", &parameters.good_factor );
+        }
+        else if ( strncmp ( argv[i], cost_tau_color_opt, strlen ( cost_tau_color_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( cost_tau_color_opt ), "%f", &parameters.tau_color );
+        }
+        else if ( strncmp ( argv[i], cost_tau_gradient_opt, strlen ( cost_tau_gradient_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( cost_tau_gradient_opt ), "%f", &parameters.tau_gradient );
+        }
+        else if ( strncmp ( argv[i], cost_alpha_opt, strlen ( cost_alpha_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( cost_alpha_opt ), "%f", &parameters.alpha );
+        }
+        else if ( strncmp ( argv[i], cost_gamma_opt, strlen ( cost_gamma_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( cost_gamma_opt ), "%f", &parameters.gamma );
+        }
+        else if ( strncmp ( argv[i], border_value, strlen ( border_value ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( border_value ), "%d", &parameters.border_value );
+        }
+        else if ( strncmp ( argv[i], num_iterations_opt, strlen ( num_iterations_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( num_iterations_opt ), "%d", &parameters.iterations );
+        }
+        else if ( strncmp ( argv[i], disparity_tolerance_opt, strlen ( disparity_tolerance_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( disparity_tolerance_opt ), "%f", &parameters.dispTol );
+        }
+        else if ( strncmp ( argv[i], normal_tolerance_opt, strlen ( normal_tolerance_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( normal_tolerance_opt ), "%f", &parameters.normTol );
+        }
+        else if ( strncmp ( argv[i], self_similariy_n_opt, strlen ( self_similariy_n_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( self_similariy_n_opt ), "%d", &parameters.self_similarity_n );
+        }
+        else if ( strncmp ( argv[i], ct_epsilon_opt, strlen ( ct_epsilon_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( ct_epsilon_opt ), "%f", &parameters.census_epsilon );
+        }
+        else if ( strncmp ( argv[i], cam_scale_opt, strlen ( cam_scale_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( cam_scale_opt ), "%f", &parameters.cam_scale );
+        }
+        else if ( strncmp ( argv[i], num_img_processed_opt, strlen ( num_img_processed_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( num_img_processed_opt ), "%d", &parameters.num_img_processed );
+        }
+        else if ( strncmp ( argv[i], n_best_opt, strlen ( n_best_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( n_best_opt ), "%d", &parameters.n_best );
+        }
+        else if ( strncmp ( argv[i], gtDepth_divFactor_opt, strlen ( gtDepth_divFactor_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( gtDepth_divFactor_opt ), "%f", &gt_parameters.divFactor );
+        }
+        else if ( strncmp ( argv[i], gtDepth_tolerance_opt, strlen ( gtDepth_tolerance_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( gtDepth_tolerance_opt ), "%f", &gt_parameters.dispTolGT );
+        }
+        else if ( strncmp ( argv[i], gtDepth_tolerance2_opt, strlen ( gtDepth_tolerance2_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( gtDepth_tolerance2_opt ), "%f", &gt_parameters.dispTolGT2 );
+        }
+        else if ( strncmp ( argv[i], depth_min_opt, strlen ( depth_min_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( depth_min_opt ), "%f", &parameters.depthMin );
+        }
+        else if ( strncmp ( argv[i], depth_max_opt, strlen ( depth_max_opt ) ) == 0 )
+        {
+            sscanf ( argv[i] + strlen ( depth_max_opt ), "%f", &parameters.depthMax );
+        }
+        else if ( strcmp ( argv[i], viewSelection_opt ) == 0 )
+            parameters.viewSelection = true;
+        else if ( strcmp ( argv[i], nodisplay_opt ) == 0 )
+            no_display = true;
+        else if ( strcmp ( argv[i], colorProc_opt ) == 0 )
+            parameters.color_processing = true;
+        else if ( strcmp ( argv[i], "-o" ) == 0 )
+            outputFiles.disparity_filename = argv[++i];
+        else if ( strcmp ( argv[i], outputPath_opt ) == 0 )
+            outputFiles.parentFolder = argv[++i];
+        else if ( strcmp ( argv[i], calib_opt ) == 0 )
+            inputFiles.calib_filename = argv[++i];
+        else if ( strcmp ( argv[i], gt_opt ) == 0 )
+            inputFiles.gt_filename = argv[++i];
+        else if ( strcmp ( argv[i], gt_nocc_opt ) == 0 )
+            inputFiles.gt_nocc_filename = argv[++i];
+        else if ( strcmp ( argv[i], occl_mask_opt ) == 0 )
+            inputFiles.occ_filename = argv[++i];
+        else if ( strcmp ( argv[i], gt_normal_opt ) == 0 )
+            inputFiles.gt_normal_filename = argv[++i];
+        else if ( strcmp ( argv[i], images_input_folder_opt ) == 0 )
+            inputFiles.images_folder = argv[++i];
+        else if ( strcmp ( argv[i], p_input_folder_opt ) == 0 )
+            inputFiles.p_folder = argv[++i];
+        else if ( strcmp ( argv[i], krt_file_opt ) == 0 )
+            inputFiles.krt_file = argv[++i];
+        else if ( strcmp ( argv[i], camera_input_folder_opt ) == 0 )
+            inputFiles.camera_folder = argv[++i];
+        else if ( strcmp ( argv[i], initial_seed_opt ) == 0 )
+            inputFiles.seed_file = argv[++i];
+        else if ( strcmp ( argv[i], bounding_folder_opt ) == 0 )
+            inputFiles.bounding_folder = argv[++i];
+        else if ( strncmp ( argv[i], disp_thresh_opt, strlen (disp_thresh_opt) ) == 0 )
+            sscanf ( argv[i] + strlen (disp_thresh_opt), "%f", &parameters.depthThresh );
+        else if ( strncmp ( argv[i], normal_thresh_opt, strlen (normal_thresh_opt) ) == 0 ) {
+            float angle_degree;
+            sscanf ( argv[i] + strlen (normal_thresh_opt), "%f", &angle_degree );
+            parameters.normalThresh = angle_degree * M_PI / 180.0f;
+        }
+        else if ( strncmp ( argv[i], num_consistent_opt, strlen (num_consistent_opt) ) == 0 )
+            sscanf ( argv[i] + strlen (num_consistent_opt), "%d", &parameters.numConsistentThresh );
+        else
+        {
+            printf ( "Command-line parameter error: unknown option %s\n", argv[i] );
+            //return -1;
+        }
+    }
+    //cout << "KRt file is " << inputFiles.krt_file << endl;
+
+    return 0;
+}
+
+static void selectViews ( CameraParameters &cameraParams, int imgWidth, int imgHeight, bool viewSel ) {
+    vector<Camera> cameras = cameraParams.cameras;
+    Camera ref = cameras[cameraParams.idRef];
+
+    int x = imgWidth / 2;
+    int y = imgHeight / 2;
+
+    cameraParams.viewSelectionSubset.clear ();
+
+    Vec3f viewVectorRef = getViewVector ( ref, x, y, cameraParams.rectified );
+
+    // TODO hardcoded value makes it a parameter
+    float minimum_angle_degree = 10;
+    float maximum_angle_degree = 30;
+    float minimum_angle_radians = minimum_angle_degree * M_PI / 180.0f;
+    float maximum_angle_radians = maximum_angle_degree * M_PI / 180.0f;
+    printf("Accepted intersection angle of central rays is %f to %f degrees\n", minimum_angle_degree, maximum_angle_degree);
+    for ( size_t i = 0; i < cameras.size (); i++ ) {
+        //if ( i == cameraParams.idRef && !cameraParams.rectified )
+        //  continue;
+
+        if ( !viewSel ) { //select all views, dont perform selection
+            cameraParams.viewSelectionSubset.push_back ( i );
+            continue;
+        }
+
+        Vec3f vec = getViewVector ( cameras[i], x, y, cameraParams.rectified );
+
+        float angle = getAngle ( viewVectorRef, vec );
+        if ( angle > minimum_angle_radians && angle < maximum_angle_radians ) //0.6 select if angle between 5.7 and 34.8 (0.6) degrees (10 and 30 degrees suggested by some paper)
+        {
+            cameraParams.viewSelectionSubset.push_back ( i );
+            printf("Accepting camera %ld with angle\t %f degree (%f radians)\n", i, angle*180.0f/M_PI, angle);
+        }
+        else
+            printf("Discarding camera %ld with angle\t %f degree (%f radians)\n", i, angle*180.0f/M_PI, angle);
+    }
+
+}
+
+static void addImageToTextureUint (vector<Mat_<uint8_t> > &imgs, cudaTextureObject_t texs[])
+{
+    for (unsigned int i=0; i<imgs.size(); i++)
+    {
+        int rows = imgs[i].rows;
+        int cols = imgs[i].cols;
+        // Create channel with uint8_t point type
+        cudaChannelFormatDesc channelDesc =
+        //cudaCreateChannelDesc (8,
+        //0,
+        //0,
+        //0,
+        //cudaChannelFormatKindUnsigned);
+        cudaCreateChannelDesc<char>();
+        // Allocate array with correct size and number of channels
+        cudaArray *cuArray;
+        checkCudaErrors(cudaMallocArray(&cuArray,
+                                        &channelDesc,
+                                        cols,
+                                        rows));
+
+        checkCudaErrors (cudaMemcpy2DToArray (cuArray,
+                                              0,
+                                              0,
+                                              imgs[i].ptr<uint8_t>(),
+                                              imgs[i].step[0],
+                                              cols*sizeof(uint8_t),
+                                              rows,
+                                              cudaMemcpyHostToDevice));
+
+        // Specify texture
+        struct cudaResourceDesc resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType         = cudaResourceTypeArray;
+        resDesc.res.array.array = cuArray;
+
+        // Specify texture object parameters
+        struct cudaTextureDesc texDesc;
+        memset(&texDesc, 0, sizeof(texDesc));
+        texDesc.addressMode[0]   = cudaAddressModeWrap;
+        texDesc.addressMode[1]   = cudaAddressModeWrap;
+        texDesc.filterMode       = cudaFilterModePoint;
+        texDesc.readMode         = cudaReadModeElementType;
+        texDesc.normalizedCoords = 0;
+
+        // Create texture object
+        cudaTextureObject_t &texObj = texs[i];
+        checkCudaErrors(cudaCreateTextureObject(&(texs[i]), &resDesc, &texDesc, NULL));
+        //texs[i] = texObj;
+    }
+    return;
+}
+static void addImageToTextureFloatColor (vector<Mat > &imgs, cudaTextureObject_t texs[])
+{
+    for (int i=0; i<imgs.size(); i++)
+    {
+        int rows = imgs[i].rows;
+        int cols = imgs[i].cols;
+        // Create channel with floating point type
+        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
+
+        // Allocate array with correct size and number of channels
+        cudaArray *cuArray;
+        checkCudaErrors(cudaMallocArray(&cuArray,
+                                        &channelDesc,
+                                        cols,
+                                        rows));
+
+        checkCudaErrors (cudaMemcpy2DToArray (cuArray,
+                                              0,
+                                              0,
+                                              imgs[i].ptr<float>(),
+                                              imgs[i].step[0],
+                                              cols*sizeof(float)*4,
+                                              rows,
+                                              cudaMemcpyHostToDevice));
+
+        // Specify texture
+        struct cudaResourceDesc resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType         = cudaResourceTypeArray;
+        resDesc.res.array.array = cuArray;
+
+        // Specify texture object parameters
+        struct cudaTextureDesc texDesc;
+        memset(&texDesc, 0, sizeof(texDesc));
+        texDesc.addressMode[0]   = cudaAddressModeWrap;
+        texDesc.addressMode[1]   = cudaAddressModeWrap;
+        texDesc.filterMode       = cudaFilterModeLinear;
+        texDesc.readMode         = cudaReadModeElementType;
+        texDesc.normalizedCoords = 0;
+
+        // Create texture object
+        cudaTextureObject_t &texObj = texs[i];
+        checkCudaErrors(cudaCreateTextureObject(&(texs[i]), &resDesc, &texDesc, NULL));
+    }
+    return;
+}
+
+static void addImageToTextureFloatGray (vector<Mat > &imgs, cudaTextureObject_t texs[])
+{
+    for (int i=0; i<imgs.size(); i++)
+    {
+        int rows = imgs[i].rows;
+        int cols = imgs[i].cols;
+        // Create channel with floating point type
+        cudaChannelFormatDesc channelDesc =
+        cudaCreateChannelDesc (32,
+                               0,
+                               0,
+                               0,
+                               cudaChannelFormatKindFloat);
+        // Allocate array with correct size and number of channels
+        cudaArray *cuArray;
+        checkCudaErrors(cudaMallocArray(&cuArray,
+                                        &channelDesc,
+                                        cols,
+                                        rows));
+
+        checkCudaErrors (cudaMemcpy2DToArray (cuArray,
+                                              0,
+                                              0,
+                                              imgs[i].ptr<float>(),
+                                              imgs[i].step[0],
+                                              cols*sizeof(float),
+                                              rows,
+                                              cudaMemcpyHostToDevice));
+
+        // Specify texture
+        struct cudaResourceDesc resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType         = cudaResourceTypeArray;
+        resDesc.res.array.array = cuArray;
+
+        // Specify texture object parameters
+        struct cudaTextureDesc texDesc;
+        memset(&texDesc, 0, sizeof(texDesc));
+        texDesc.addressMode[0]   = cudaAddressModeWrap;
+        texDesc.addressMode[1]   = cudaAddressModeWrap;
+        texDesc.filterMode       = cudaFilterModeLinear;
+        texDesc.readMode         = cudaReadModeElementType;
+        texDesc.normalizedCoords = 0;
+
+        // Create texture object
+        cudaTextureObject_t &texObj = texs[i];
+        checkCudaErrors(cudaCreateTextureObject(&(texs[i]), &resDesc, &texDesc, NULL));
+        //texs[i] = texObj;
+    }
+    return;
+}
+static int runFusibile (int argc,
+                        char **argv,
+                        OutputFiles &outputFiles,
+                        AlgorithmParameters &algParameters,
+                        bool no_display,
+                        Results &results
+                       )
+{
+    InputFiles inputFiles;
+    string ext = ".png";
+
+    string results_folder = "results/";
+
+    const char* results_folder_opt     = "-input_folder";
+    const char* p_input_folder_opt = "-p_folder";
+    const char* krt_file_opt = "-krt_file";
+    const char* images_input_folder_opt = "-images_folder";
+    const char* gt_opt = "-gt";
+    const char* gt_nocc_opt = "-gt_nocc";
+    const char* pmvs_folder_opt = "--pmvs_folder";
+    //read in arguments
+    for ( int i = 1; i < argc; i++ )
+    {
+        if ( strcmp ( argv[i], results_folder_opt ) == 0 ){
+            results_folder = argv[++i];
+            cout << "input folder is " << results_folder << endl;
+
+        }else if ( strcmp ( argv[i], p_input_folder_opt ) == 0 ){
+            inputFiles.p_folder = argv[++i];
+        }
+        else if ( strcmp ( argv[i], krt_file_opt ) == 0 )
+            inputFiles.krt_file = argv[++i];
+        else if ( strcmp ( argv[i], images_input_folder_opt ) == 0 ){
+            inputFiles.images_folder = argv[++i];
+
+        }else if ( strcmp ( argv[i], gt_opt ) == 0 ){
+            inputFiles.gt_filename = argv[++i];
+
+        }else if ( strcmp ( argv[i], gt_nocc_opt ) == 0 ){
+            inputFiles.gt_nocc_filename = argv[++i];
+        }
+        else if ( strncmp ( argv[i], pmvs_folder_opt, strlen ( pmvs_folder_opt ) ) == 0 ) {
+            inputFiles.pmvs_folder = argv[++i];
+        }
+    }
+
+    if (inputFiles.pmvs_folder.size()>0) {
+        inputFiles.images_folder = inputFiles.pmvs_folder + "/visualize/";
+        inputFiles.p_folder = inputFiles.pmvs_folder + "/txt/";
+    }
+    cout <<"image folder is " << inputFiles.images_folder << endl;
+    cout <<"p folder is " << inputFiles.images_folder << endl;
+    cout <<"pmvs folder is " << inputFiles.pmvs_folder << endl;
+
+    GTcheckParameters gtParameters;
+
+    gtParameters.dispTolGT = 0.1f;
+    gtParameters.dispTolGT2 = 0.02f;
+    gtParameters.divFactor = 1.0f;
+    // create folder to store result images
+    time_t timeObj;
+    time ( &timeObj );
+    tm *pTime = localtime ( &timeObj );
+
+    vector <Mat_<Vec3f> > view_vectors;
+
+
+    time(&timeObj);
+    pTime = localtime(&timeObj);
+
+    char output_folder[256];
+    sprintf(output_folder, "%s/consistencyCheck-%04d%02d%02d-%02d%02d%02d/",results_folder.c_str(), pTime->tm_year+1900, pTime->tm_mon+1,pTime->tm_mday,pTime->tm_hour, pTime->tm_min, pTime->tm_sec);
+#if defined(_WIN32)
+    _mkdir(output_folder);
+#else
+    mkdir(output_folder, 0777);
+#endif
+
+    vector<string> subfolders;
+    get_subfolders(results_folder.c_str(), subfolders);
+    std::sort(subfolders.begin(), subfolders.end());
+
+    vector< Mat_<Vec3b> > warpedImages;
+    vector< Mat_<Vec3b> > warpedImages_inverse;
+    //vector< Mat_<float> > depthMaps;
+    vector< Mat_<float> > updateMaps;
+    vector< Mat_<Vec3f> > updateNormals;
+    vector< Mat_<float> > depthMapConsistent;
+    vector< Mat_<Vec3f> > normalsConsistent;
+    vector< Mat_<Vec3f> > groundTruthNormals;
+    vector< Mat_<uint8_t> > valid;
+
+
+    map< int,string> consideredIds;
+    for(int i=0;i<subfolders.size();i++) {
+        //make sure that it has the right format (DATE_TIME_INDEX)
+        size_t n = std::count(subfolders[i].begin(), subfolders[i].end(), '_');
+        if(n < 2)
+            continue;
+        if (subfolders[i][0] != '2')
+            continue;
+
+        //get index
+        //unsigned found = subfolders[i].find_last_of("_");
+        //find second index
+        unsigned posFirst = subfolders[i].find_first_of("_") +1;
+        unsigned found = subfolders[i].substr(posFirst).find_first_of("_") + posFirst +1;
+        string id_string = subfolders[i].substr(found);
+        //InputData dat;
+
+        //consideredIds.push_back(id_string);
+        consideredIds.insert(pair<int,string>(i,id_string));
+        //cout << "id_string is " << id_string << endl;
+        //cout << "i is " << i << endl;
+        //char outputPath[256];
+        //sprintf(outputPath, "%s.png", id_string);
+
+        if( access( (id_string + ".png").c_str(), R_OK ) != -1 )
+            inputFiles.img_filenames.push_back((id_string + ".png"));
+        else if( access( (id_string + ".jpg").c_str(), R_OK ) != -1 )
+            inputFiles.img_filenames.push_back((id_string + ".jpg"));
+        else if( access( (id_string + ".ppm").c_str(), R_OK ) != -1 )
+            inputFiles.img_filenames.push_back((id_string + ".ppm"));
+    }
+    size_t numImages = inputFiles.img_filenames.size ();
+    cout << "numImages is " << numImages << endl;
+    cout << "img_filenames is " << inputFiles.img_filenames.size() << endl;
+    algParameters.num_img_processed = min ( ( int ) numImages, algParameters.num_img_processed );
+
+    vector<Mat_<Vec3b> > img_color; // imgLeft_color, imgRight_color;
+    vector<Mat_<uint8_t> > img_grayscale;
+    for ( size_t i = 0; i < numImages; i++ ) {
+        //printf ( "Opening image %ld: %s\n", i, ( inputFiles.images_folder + inputFiles.img_filenames[i] ).c_str () );
+        img_grayscale.push_back ( imread ( ( " ", inputFiles.images_folder + inputFiles.img_filenames[i] ), IMREAD_GRAYSCALE ) );
+        if ( algParameters.color_processing ) {
+            img_color.push_back ( imread ( ( " ", inputFiles.images_folder + inputFiles.img_filenames[i] ), IMREAD_COLOR ) );
+        }
+
+        if ( img_grayscale[i].rows == 0 ) {
+            printf ( "Image seems to be invalid\n" );
+            return -1;
+        }
+    }
+
+    size_t avail;
+    size_t total;
+    cudaMemGetInfo( &avail, &total );
+    size_t used = total - avail;
+    printf("Device memory used: %fMB\n", used/1000000.0f);
+
+    GlobalState *gs = new GlobalState;
+	gs->cameras = new CameraParameters_cu;
+	gs->pc = new PointCloud;
+    cudaMemGetInfo( &avail, &total );
+    used = total - avail;
+    printf("Device memory used: %fMB\n", used/1000000.0f);
+
+
+    uint32_t numPixels = ( uint32_t ) img_grayscale[0].rows * ( uint32_t ) img_grayscale[0].cols;
+
+    uint32_t rows = img_grayscale[0].rows;
+    uint32_t cols = img_grayscale[0].cols;
+
+    CameraParameters camParams = getCameraParameters (*(gs->cameras),
+                                                      inputFiles,algParameters.depthMin,
+                                                      algParameters.depthMax,
+                                                      algParameters.cam_scale,
+                                                      false);
+    printf("Camera size is %lu\n", camParams.cameras.size());
+
+    for ( int i = 0; i < algParameters.num_img_processed; i++ ) {
+        algParameters.min_disparity = disparityDepthConversion ( camParams.f, camParams.cameras[i].baseline, camParams.cameras[i].depthMax );
+        algParameters.max_disparity = disparityDepthConversion ( camParams.f, camParams.cameras[i].baseline, camParams.cameras[i].depthMin );
+    }
+
+    selectViews ( camParams, cols, rows, false);
+    int numSelViews = camParams.viewSelectionSubset.size ();
+    cout << "Selected views: " << numSelViews << endl;
+    gs->cameras->viewSelectionSubsetNumber = numSelViews;
+    ofstream myfile;
+    for ( int i = 0; i < numSelViews; i++ ) {
+        cout << camParams.viewSelectionSubset[i] << ", ";
+        gs->cameras->viewSelectionSubset[i] = camParams.viewSelectionSubset[i];
+    }
+    cout << endl;
+
+    vector<InputData> inputData;
+
+    cout << "Reading normals and depth from disk" << endl;
+    cout << "Size consideredIds is " << consideredIds.size() << endl;
+    for (map<int,string>::iterator it=consideredIds.begin(); it!=consideredIds.end(); ++it){
+
+        //get corresponding camera
+        int i = it->first;
+        string id = it->second;//consideredIds[i];
+        //int id = atoi(id_string.c_str());
+        int camIdx = getCameraFromId(id,camParams.cameras);
+        //cout << "id is " << id << endl;
+        //cout << "camIdx is " << camIdx << endl;
+        if(camIdx < 0)// || camIdx == camParams.idRef)
+            continue;
+
+        InputData dat;
+        dat.id = id;
+        dat.camId = camIdx;
+        dat.cam = camParams.cameras[camIdx];
+        dat.path = results_folder + subfolders[i];
+        dat.inputImage = imread((" ",inputFiles.images_folder + id + ext), IMREAD_COLOR);
+
+        //read normal
+        cout << "Reading normal " << i << endl;
+        readDmbNormal((dat.path + "/normals.dmb").c_str(),dat.normals);
+
+        //read depth
+        cout << "Reading disp " << i << endl;
+        readDmb((dat.path + "/disp.dmb").c_str(),dat.depthMap);
+
+        //inputData.push_back(move(dat));
+        inputData.push_back(dat);
+
+    }
+    // run gpu run
+    // Init parameters
+    gs->params = &algParameters;
+
+
+    // Init ImageInfo
+    //gs->iminfo.cols = img_grayscale[0].cols;
+    //gs->iminfo.rows = img_grayscale[0].rows;
+    gs->cameras->cols = img_grayscale[0].cols;
+    gs->cameras->rows = img_grayscale[0].rows;
+    gs->params->cols = img_grayscale[0].cols;
+    gs->params->rows = img_grayscale[0].rows;
+    gs->resize (img_grayscale.size());
+    gs->pc->resize (img_grayscale[0].rows * img_grayscale[0].cols);
+	PointCloudList pc_list;
+    pc_list.resize (img_grayscale[0].rows * img_grayscale[0].cols);
+    pc_list.size=0;
+    pc_list.rows = img_grayscale[0].rows;
+    pc_list.cols = img_grayscale[0].cols;
+    gs->pc->rows = img_grayscale[0].rows;
+    gs->pc->cols = img_grayscale[0].cols;
+
+    // Resize lines
+    for (int i = 0; i<img_grayscale.size(); i++)
+    {
+        gs->lines[i].resize(img_grayscale[0].rows * img_grayscale[0].cols);
+        gs->lines[i].n = img_grayscale[0].rows * img_grayscale[0].cols;
+        //gs->lines.s = img_grayscale[0].step[0];
+        gs->lines[i].s = img_grayscale[0].cols;
+        gs->lines[i].l = img_grayscale[0].cols;
+    }
+
+    vector<Mat > img_grayscale_float           (img_grayscale.size());
+    vector<Mat > img_color_float               (img_grayscale.size());
+    vector<Mat > img_color_float_alpha         (img_grayscale.size());
+    vector<Mat > normals_and_depth             (img_grayscale.size());
+    vector<Mat_<uint16_t> > img_grayscale_uint (img_grayscale.size());
+    for (int i = 0; i<img_grayscale.size(); i++)
+    {
+        //img_grayscale[i].convertTo(img_grayscale_float[i], CV_32FC1, 1.0/255.0); // or CV_32F works (too)
+        img_grayscale[i].convertTo(img_grayscale_float[i], CV_32FC1); // or CV_32F works (too)
+        img_grayscale[i].convertTo(img_grayscale_uint[i], CV_16UC1); // or CV_32F works (too)
+        if(algParameters.color_processing) {
+            vector<Mat_<float> > rgbChannels ( 3 );
+            img_color_float_alpha[i] = Mat::zeros ( img_grayscale[0].rows, img_grayscale[0].cols, CV_32FC4 );
+            img_color[i].convertTo (img_color_float[i], CV_32FC3); // or CV_32F works (too)
+            Mat alpha( img_grayscale[0].rows, img_grayscale[0].cols, CV_32FC1 );
+            split (img_color_float[i], rgbChannels);
+            rgbChannels.push_back( alpha);
+            merge (rgbChannels, img_color_float_alpha[i]);
+        }
+        /* Create vector of normals and disparities */
+        vector<Mat_<float> > normal ( 3 );
+        normals_and_depth[i] = Mat::zeros ( img_grayscale[0].rows, img_grayscale[0].cols, CV_32FC4 );
+        split (inputData[i].normals, normal);
+        normal.push_back( inputData[i].depthMap);
+        merge (normal, normals_and_depth[i]);
+
+    }
+    int64_t t = getTickCount () - t;
+
+    // Copy images to texture memory
+    if (algParameters.saveTexture)
+        if (algParameters.color_processing)
+            addImageToTextureFloatColor (img_color_float_alpha, gs->imgs);
+        else
+            addImageToTextureFloatGray (img_grayscale_float, gs->imgs);
+
+        addImageToTextureFloatColor (normals_and_depth, gs->normals_depths);
+
+#define pow2(x) ((x)*(x))
+#define get_pow2_norm(x,y) (pow2(x)+pow2(y))
+
+    runcuda(*gs, pc_list, numSelViews);
+    Mat_<Vec3f> norm0 = Mat::zeros ( img_grayscale[0].rows, img_grayscale[0].cols, CV_32FC3 );
+    Mat_<float> distImg;
+    char plyFile[256];
+    sprintf ( plyFile, "%s/final3d_model.ply", output_folder);
+    printf("Writing ply file %s\n", plyFile);
+    //storePlyFileAsciiPointCloud ( plyFile, pc_list, inputData[0].cam, distImg);
+    storePlyFileBinaryPointCloud ( plyFile, pc_list, inputData[0].cam, distImg);
+    char xyzFile[256];
+    sprintf ( xyzFile, "%s/final3d_model.xyz", output_folder);
+    printf("Writing ply file %s\n", xyzFile);
+    //storeXYZPointCloud ( xyzFile, pc_list, inputData[0].cam, distImg);
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+    if ( argc < 3 )
+    {
+        print_help ();
+        return 0;
+    }
+
+    InputFiles inputFiles;
+    OutputFiles outputFiles;
+	AlgorithmParameters* algParameters = new AlgorithmParameters;
+    GTcheckParameters gtParameters;
+    bool no_display = false;
+
+    int ret = getParametersFromCommandLine ( argc, argv, inputFiles, outputFiles, *algParameters, gtParameters, no_display );
+    if ( ret != 0 )
+        return ret;
+
+    Results results;
+    ret = runFusibile ( argc, argv, outputFiles, *algParameters, no_display, results);
+
+    return 0;
+}
+
diff --git a/main.h b/main.h
new file mode 100644
index 0000000..4d368d5
--- /dev/null
+++ b/main.h
@@ -0,0 +1,99 @@
+#pragma once
+
+#include "opencv2/calib3d/calib3d.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/core/core.hpp"
+#include "opencv2/highgui/highgui.hpp"
+//#include "opencv2/contrib/contrib.hpp"
+#if CV_MAJOR_VERSION == 3
+#include "opencv2/core/utility.hpp"
+#endif
+
+#include <omp.h>
+#include <stdint.h>
+
+using namespace cv;
+using namespace std;
+
+typedef Vec<uint16_t, 2> Vec2us;
+
+// parameters for comparison with ground truth
+struct GTcheckParameters {
+    GTcheckParameters () : gtCheck ( false ), noccCheck ( false ), scale ( 150.0f ), dispTolGT ( 0.5f ), divFactor ( 4.0f ) {}
+    bool gtCheck;
+    bool noccCheck;
+    float scale; // scaling factor just for visualization of error
+    float dispTolGT;
+    float dispTolGT2;
+    //division factor dependent on ground truth data (to get disparity value: imgValue/divFactor)
+    float divFactor; //Middleburry small images: 4, big images third: 3, Kitti: 255
+};
+
+//pathes to input images (camera images, ground truth, ...)
+struct InputFiles {
+    InputFiles () : gt_filename ( "" ), gt_nocc_filename ( "" ), occ_filename ( "" ), gt_normal_filename ( "" ), calib_filename ( "" ), images_folder ( "" ), p_folder ( "" ), camera_folder ( "" ), pmvs_folder("") {}
+    vector<string> img_filenames; // input camera images (only filenames, path is set in images_folder), names can also be used for calibration data (e.g. for Strecha P, camera)
+    string gt_filename; // ground truth image
+    string gt_nocc_filename; // non-occluded ground truth image (as provided e.g. by Kitti)
+    string occ_filename; // occlusion mask (binary map of all the points that are occluded) (as provided e.g. by Middleburry)
+    string gt_normal_filename; // ground truth normal map (for Strecha)
+    string calib_filename; // calibration file containing camera matrices (P) (as provided e.g. by Kitti)
+    string images_folder; // path to camera input images
+    string p_folder; // path to camera projection matrix P (Strecha)
+    string krt_file; // path to camera projection matrix P (Strecha)
+    string camera_folder; // path to camera calibration matrix K (Strecha)
+    string bounding_folder; //path to bounding volume (Strecha)
+    string seed_file; //path to bounding volume (Strecha)
+    string pmvs_folder; //path to bounding volume (Strecha)
+};
+
+//pathes to output files
+struct OutputFiles {
+    OutputFiles () : parentFolder ( "results" ), disparity_filename ( 0 ) {}
+    const char* parentFolder;
+    char* disparity_filename;
+};
+
+struct Camera {
+    Camera () : baseline ( 0.54f ), P ( Mat::eye ( 3,4,CV_32F ) ), R ( Mat::eye ( 3,3,CV_32F ) ), reference ( false ), depthMin ( 2.0f ), depthMax ( 20.0f ) {}
+    Mat_<float> P;
+    Mat_<float> P_inv;
+    Mat_<float> M_inv;
+    //Mat_<float> K;
+    Mat_<float> R;
+    Mat_<float> t;
+    Vec3f C;
+    float baseline;
+    bool reference;
+    float depthMin; //this could be figured out from the bounding volume (not done right now, but that's why this parameter is here as well and not only in AlgorithmParameters)
+    float depthMax; //this could be figured out from the bounding volume (not done right now, but that's why this parameter is here as well and not only in AlgorithmParameters)
+    //int id; //corresponds to the image name id (eg. 0-10), independent of order in argument list, just dependent on name
+    string id;
+    Mat_<float> K;
+    Mat_<float> K_inv;
+    //float f;
+};
+
+//parameters for camera geometry setup (assuming that K1 = K2 = K, P1 = K [I | 0] and P2 = K [R | t])
+struct CameraParameters {
+    CameraParameters () : rectified ( false ), idRef ( 0 ) {}
+    Mat_<float> K; //if K varies from camera to camera: K and f need to be stored within Camera
+    Mat_<float> K_inv; //if K varies from camera to camera: K and f need to be stored within Camera
+    float f;
+    bool rectified;
+    vector<Camera> cameras;
+    int idRef;
+    vector<int> viewSelectionSubset;
+};
+
+struct Results {
+    Results () : error_occ ( 1.0f ), error_noc ( 1.0f ), valid_pixels ( 0.0f ), error_valid ( 1.0f ), error_valid_all ( 1.0f ), total_runtime ( 0.0f ), runtime_per_pixel ( 0.0f ) {}
+    float error_occ;
+    float error_noc;
+    float valid_pixels; // passed occlusion check
+    float valid_pixels_gt;
+    float error_valid;
+    float error_valid_all;
+    double total_runtime;
+    double runtime_per_pixel;
+};
diff --git a/managed.h b/managed.h
new file mode 100644
index 0000000..6e25388
--- /dev/null
+++ b/managed.h
@@ -0,0 +1,13 @@
+#pragma once
+class Managed {
+public:
+  void *operator new(size_t len) {
+    void *ptr;
+    cudaMallocManaged(&ptr, len);
+    return ptr;
+  }
+
+  void operator delete(void *ptr) {
+    cudaFree(ptr);
+  }
+};
diff --git a/mathUtils.h b/mathUtils.h
new file mode 100644
index 0000000..9e14445
--- /dev/null
+++ b/mathUtils.h
@@ -0,0 +1,31 @@
+/*
+ * some math helper functions
+ */
+
+#pragma once
+
+#ifndef M_PI
+#define M_PI    3.14159265358979323846f
+#endif
+#define M_PI_float    3.14159265358979323846f
+
+//rounding of positive float values
+#if defined(_WIN32)
+static float roundf ( float val ) {
+    return floor ( val+0.5f );
+};
+#endif
+
+/* get angle between two vectors in 3D
+ * Input: v1,v2 - vectors
+ * Output: angle in radian
+ */
+static float getAngle ( Vec3f v1, Vec3f v2 ) {
+    float angle = acosf ( v1.dot ( v2 ) );
+    //if angle is not a number the dot product was 1 and thus the two vectors should be identical --> return 0
+    if ( angle != angle )
+        return 0.0f;
+    if ( acosf ( v1.dot ( v2 ) ) != acosf ( v1.dot ( v2 ) ) )
+        cout << acosf ( v1.dot ( v2 ) ) << " / " << v1.dot ( v2 )<< " / " << v1<< " / " << v2 << endl;
+    return angle;
+};
diff --git a/mia-utils.h b/mia-utils.h
new file mode 100644
index 0000000..cc63e49
--- /dev/null
+++ b/mia-utils.h
@@ -0,0 +1,50 @@
+#ifndef __MIA_UTILS__
+#define __MIA_UTILS__
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <glib-2.0/glib.h>
+
+void alloc_vector
+(float **vector,   /* vector */
+ long  n);          /* size */
+
+void alloc_matrix
+(float ***matrix,  /* matrix */
+ long  nx,         /* size in x direction */
+ long  ny);         /* size in y direction */
+
+void alloc_matrix_d
+(double ***matrix,  /* matrix */
+ long  nx,         /* size in x direction */
+ long  ny);         /* size in y direction */
+
+/* allocates storage for matrix of size nx * ny */
+void disalloc_vector
+
+(float *vector,    /* vector */
+ long  n);          /* size */
+
+/* disallocates storage for a vector of size n */
+void disalloc_vector
+
+(float *vector,    /* vector */
+ long  n);          /* size */
+
+
+/*--------------------------------------------------------------------------*/
+
+void disalloc_matrix
+
+(float **matrix,   /* matrix */
+ long  nx,         /* size in x direction */
+ long  ny);         /* size in y direction */
+
+void disalloc_matrix_d
+
+(double **matrix,   /* matrix */
+ long  nx,         /* size in x direction */
+ long  ny);         /* size in y direction */
+/* disallocates storage for matrix of size nx * ny */
+
+#endif
diff --git a/point_cloud.h b/point_cloud.h
new file mode 100644
index 0000000..588315a
--- /dev/null
+++ b/point_cloud.h
@@ -0,0 +1,31 @@
+#pragma once
+#include <string.h> // memset()
+#include "algorithmparameters.h"
+#include "cameraparameters.h"
+#include "managed.h"
+#include <vector_types.h> // float4
+
+class __align__(128) Point_cu : public Managed {
+public:
+    float4 normal; // Normal
+    float4 coord; // Point coordinate
+    float texture; // Average texture color
+};
+
+
+class __align__(128) PointCloud : public Managed {
+public:
+    Point_cu *points;
+    int rows;
+    int cols;
+    int size;
+    void resize(int n)
+    {
+        cudaMallocManaged (&points,    sizeof(Point_cu) * n);
+        memset            (points,  0, sizeof(Point_cu) * n);
+    }
+    ~PointCloud()
+    {
+        cudaFree (points);
+    }
+};
diff --git a/point_cloud_list.h b/point_cloud_list.h
new file mode 100644
index 0000000..a0cfd2f
--- /dev/null
+++ b/point_cloud_list.h
@@ -0,0 +1,40 @@
+#pragma once
+#include <string.h> // memset()
+#include <stdlib.h> // malloc(), realloc(), free()
+#include "algorithmparameters.h"
+#include "cameraparameters.h"
+#include "managed.h"
+#include <vector_types.h> // float4
+
+class Point_li {
+public:
+    float4 normal; // Normal
+    float4 coord; // Point coordinate
+    float texture; // Average texture color
+};
+
+
+class PointCloudList {
+public:
+    Point_li *points;
+    int rows;
+    int cols;
+    unsigned int size;
+    unsigned int maximum;
+    void resize(int n)
+    {
+        maximum=n;
+        points = (Point_li *) malloc (sizeof(Point_li) * n);
+        memset            (points,  0, sizeof(Point_li) * n);
+    }
+    void increase_size(int n)
+    {
+        maximum=n;
+        points = (Point_li *) realloc (points, n * sizeof(Point_li));
+        printf("New size of point cloud list is %d\n", n);
+    }
+    ~PointCloudList()
+    {
+        free (points);
+    }
+};
diff --git a/test.cpp b/test.cpp
new file mode 100644
index 0000000..bdefefc
--- /dev/null
+++ b/test.cpp
@@ -0,0 +1,15 @@
+#include "cuda_runtime.h"
+
+#include <stdio.h>
+#include "globalstate.h"
+
+
+
+int main()
+{
+    int *c;
+    GlobalState *gs = new GlobalState;
+    CHECK(cudaMallocManaged(&c, sizeof(int)));
+    *c = 0;
+    return 0;
+}
diff --git a/vector_operations.h b/vector_operations.h
new file mode 100644
index 0000000..6f619e9
--- /dev/null
+++ b/vector_operations.h
@@ -0,0 +1,52 @@
+/* vim: ft=cpp
+ * */
+#pragma once
+#include <vector_types.h> // float4
+#include "config.h"
+//__device__ float4 operator*(float4 a, float4 b);
+//__device__ float4 operator-(float4 a, float4 b);
+//__device__ float4 operator-(float4 a);
+//__device__ float4 operator+(float4 a, float4 b);
+//__device__ float4 operator/(float4 a, float k);
+static __device__ float4 operator*(float4 a, float4 b) {
+    return make_float4(a.x*b.x,
+                       a.y*b.y,
+                       a.z*b.z,
+                       0);
+}
+static __device__ float4 operator-(float4 a, float4 b) {
+    return make_float4(a.x-b.x,
+                       a.y-b.y,
+                       a.z-b.z,
+                       0);
+}
+static __device__ float4 operator-(float4 a) {
+    return make_float4(-a.x,
+                       -a.y,
+                       -a.z,
+                       0);
+}
+static __device__ float4 operator+(float4 a, float4 b) {
+    return make_float4(a.x+b.x,
+                       a.y+b.y,
+                       a.z+b.z,
+                       0);
+}
+static __device__ float4 operator/(float4 a, float k) {
+    return make_float4(a.x/k,
+                       a.y/k,
+                       a.z/k,
+                       0);
+}
+static __device__ float l1_float4 (float4 a) {
+    return ( fabsf (a.x) +
+             fabsf (a.y) +
+             fabsf (a.z))*0.3333333f;
+
+}
+static __device__ float l2_float4 (float4 a) {
+    return sqrtf( pow2 (a.x) +
+             pow2 (a.y) +
+             pow2 (a.z));
+
+}