diff --git a/README.md b/README.md
index 1cbbcbd90..9acd7cb0d 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,3 @@
-# Welcome to the COAST Repository
-
-Welcome to the repository for COAST (COmpiler-Assisted Software fault Tolerance), BYU's tool for automated software mitigation! To get started, please refer to our [documentation pages](https://coast-compiler.readthedocs.io/en/stable/).
+# Welcome to the COAST Repository
+
+Welcome to the repository for COAST (COmpiler-Assisted Software fault Tolerance), BYU's tool for automated software mitigation! To get started, please refer to our [documentation pages](https://coast-compiler.readthedocs.io/en/latest/).
diff --git a/docs/source/cfcss.rst b/docs/source/cfcss.rst
index 1c23f00fc..aa04720c7 100644
--- a/docs/source/cfcss.rst
+++ b/docs/source/cfcss.rst
@@ -110,6 +110,12 @@ To implement the control flow checking, we inserted a set of instructions at the
 One of the optimizations we used was to only insert the extra XOR operation when :math:`D_n−1` was :math:`\neq 0`. This is one reason why the buffer block fix worked.
 
 
+Notes
+=============
+
+This pass was created for the purposes of studying LLVM IR and the LLVM C++ framework.  It is not actively being maintained.
+
+
 .. rubric:: Footnotes
 
 .. [#f1] N. Oh, P. P. Shirvani, and E. J. McCluskey, "Control-flow checking by software signatures," *IEEE Transactions on Reliability*\ , vol. 51, no. 1, pp. 111–122, Mar. 2002.
diff --git a/docs/source/eclipse.rst b/docs/source/eclipse.rst
index 784cf9e9c..5db85e59a 100644
--- a/docs/source/eclipse.rst
+++ b/docs/source/eclipse.rst
@@ -25,3 +25,27 @@ Building the projects
 2. Call the target name ``all`` and click OK.
 3. To build your pass, right click on the build folder and click "Make Targets -> Build -> Build" (with the target ``all`` selected).
 4. After the first time that you’ve done this, you can rebuild all your passes by pressing ``F9``.
+
+
+Fixing the CDT settings
+========================
+
+The default settings of the project are not sufficient to allow the Eclipse CDT indexer to work correctly.  While not necessary to fix the CDT settings, it allows you to use the auotcomplete functionality of Eclipse.
+
+1. Right-click on the project and select "Properties"
+#. Under "C/C++ General" select "Paths and Symbols"
+#. Add a new Include Directory using the "Add" button
+#. Select "File System"
+#. Navigate to the repository root, then select ``llvm/include``
+#. Check the box "Add to all languages," then click "OK"
+#. On the left pane, select "Preprocessor Include Paths, Macros, etc"
+#. On the "Providers" select "CDT GCC Built-in Compiler Settings"
+#. Edit the "Command to get compiler specs" by putting ``std=c++11`` right before ``${INPUTS}``
+#. Move the entry "CDT GCC Built-in Compiler Settings" to the top of the list using the "Move Up" button
+#. Select "Apply and Close"
+
+1. Select "Window" -> "Preferences"
+#. Select "C/C++" -> "Build" -> "Settings"
+#. Under the "Discovery" tab select "CDT GCC Built-in Compiler Settings"
+#. Edit the "Command to get compiler specs" the same as before
+#. Select "Apply and Close"
diff --git a/docs/source/index.rst b/docs/source/index.rst
index f76987b71..f52c10024 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -18,6 +18,7 @@ COAST
     passes
     troubleshooting
     cfcss
+    release_notes
 
 Folder guide
 ==============
diff --git a/docs/source/passes.rst b/docs/source/passes.rst
index dc45e4a83..e0ffaa901 100644
--- a/docs/source/passes.rst
+++ b/docs/source/passes.rst
@@ -43,6 +43,8 @@ These options are only applicable to the ``-DWC`` and ``-TMR`` passes.
     +---------------------------+-----------------------------------------------------+
     |    ``-noStoreAddrSync``   | Don’t synchronize the address on data stores (C5).  |
     +---------------------------+-----------------------------------------------------+
+    |     ``-storeDataSync``    | Force synchronizing data on data stores (C4).       |
+    +---------------------------+-----------------------------------------------------+
 
 .. table::
     :widths: 25 40
@@ -129,6 +131,47 @@ In-code Directives
     |                      | instead of modifying the function body.               |
     +----------------------+-------------------------------------------------------+
 
+.. versionadded:: Oct2019
+
+.. table::
+    :widths: 25 40
+
+    +----------------------------------+----------------------------------------------------+
+    |                                  | Used to mark global variables as ones that the     |
+    | ``__COAST_VOLATILE``             | pass should not remove, even if it does not appear |
+    |                                  | to be used.                                        |
+    +----------------------------------+----------------------------------------------------+
+    |                                  | Ignore checks for global variable replication in   |
+    |  ``__COAST_IGNORE_GLOBAL(name)`` | function following this directive.                 |
+    |                                  |                                                    |
+    |                                  | See section `Replication Scope`_.                  |
+    +----------------------------------+----------------------------------------------------+
+
+.. table::
+    :widths: 25 40
+
+    +------------------------------------+-------------------------------------------------+
+    |                                    | Give the name of a ``malloc()``-like function   |
+    | ``MALLOC_WRAPPER_REGISTER(fname)`` | that will be replicated.  Should be treated the |
+    |                                    | same as a function prototype.                   |
+    +------------------------------------+-------------------------------------------------+
+    |                                    | Make a call to the function registered using    |
+    | ``MALLOC_WRAPPER_CALL(fname, x)``  | the above macro.  This will be replicated by    |
+    |                                    | COAST, using the clones of the arguments.       |
+    +------------------------------------+-------------------------------------------------+
+    |                                    | Give the name of a ``printf()``-like function   |
+    | ``PRINTF_WRAPPER_REGISTER(fname)`` | that will be replicated.  Should be treated the |
+    |                                    | same as a function prototype.                   |
+    +------------------------------------+-------------------------------------------------+
+    | ``PRINTF_WRAPPER_CALL              | Make a call to the function registered using    |
+    | (fname, fmt, ...)``                | the above macro.  This will be replicated by    |
+    |                                    | COAST, using the clones of the arguments.       |
+    +------------------------------------+-------------------------------------------------+
+    |                                    | Make your own wrapper function for COAST to     |
+    | ``GENERIC_COAST_WRAPPER(fname)``   | replicate calls to.  Used in both declaring and |
+    |                                    | calling the function.                           |
+    +------------------------------------+-------------------------------------------------+
+
 
 See the file COAST.h_
 
@@ -147,6 +190,8 @@ The `default file`_ contains functions we have identified as commonly treated di
 .. _default file: https://github.com/byuccl/coast/blob/master/projects/dataflowProtection/functions.config
 
 
+.. _when_repl_cmds:
+
 When to use replication command line options
 ----------------------------------------------
 
@@ -197,17 +242,27 @@ The first option, ``-noMemReplication``, should be used whenever memory has a se
 
 The option ``-noStoreAddrSync`` corresponds to C5. In EDDI, memory was simply duplicated and each duplicate was offset from the original value by a constant. However, COAST runs before the linker, and thus has no notion of an address space. We implement rules C3 and C5, checking addresses before stores and loads, for data structures such as arrays and structs that have an offset from a base address. These offsets, instead of the base addresses, are compared in the synchronization logic.
 
+.. versionchanged:: Oct2019
+
+As of the October 2019 release, COAST no longer syncs before storing data.  Test data indicated that, in many cases, the number of synchronization points generated by this rule limited the effective protection that the replication of variables afforded.  This behavior can be overridden using the ``-storeDataSync`` flag.
+
+.. _repl_scope:
+
 Replication Scope
 --------------------
 
-The user can specify any functions and global variables that should not be protected using ``-ignoreFns`` and ``-ignoreGlbls``. At minimum, these options should be used to exclude code that interacts with hard- ware devices (GPIO, UART) from the SoR. Replicating this code is likely to lead to errors. The option ``-replicateFnCalls`` causes user functions to be called in a coarse grained way, meaning the call is replicated instead of fine-grained instruction replication within the function body. Library function calls can also be excluded from replication via the flag ``-skipLibCalls``, which causes those calls to only be executed once. These two options should be used when multiple independent copies of a return value should be generated, instead of a single return value propagating through all replicated instructions. Changing the scope of replication can cause problems across function calls.
+The user can specify any functions and global variables that should not be protected using ``-ignoreFns`` and ``-ignoreGlbls``. At minimum, these options should be used to exclude code that interacts with hardware devices (GPIO, UART) from the SoR. Replicating this code is likely to lead to errors. The option ``-replicateFnCalls`` causes user functions to be called in a coarse grained way, meaning the call is replicated instead of fine-grained instruction replication within the function body. Library function calls can also be excluded from replication via the flag ``-skipLibCalls``, which causes those calls to only be executed once. These two options should be used when multiple independent copies of a return value should be generated, instead of a single return value propagating through all replicated instructions. Changing the scope of replication can cause problems across function calls.
+
+.. versionadded:: Oct2019
+
+Before processing the IR code, COAST begins by checking to make sure the replication scope rules it was given are consistent.  It checks to make sure all cloned globals are only used in functions that are also protected.  If they are not, the compilation will fail, with an error message informating the user which global is used in which function.  The user has the option to ignore these checks if they feel that it is safe.  This is done using the ``__COAST_IGNORE_GLOBAL`` macro mentioned above.
 
 Other Options
 ----------------
 
 **Error Logging**\ : This option was developed for tests in a radiation beam, where upsets are stochastically distributed, unlike fault injection tests where one upset is guaranteed for each run. COAST can be instructed to keep track of the number of corrected faults via the flag ``-countErrors``. This flag allows the program to detect corrected upsets, which yields more precise results on the number of radiation-induced SEUs. This option is only applicable to TMR because DWC halts on the first error. A global variable, ``TMR_ERROR_CNT``, is incremented each time that all three copies of the datum do not agree. If this global is not present in the source code then the pass creates it. The user can print this value at the end of program execution, or read it using a debugging tool.
 
-**Error Handlers**\ : The user has the choice of how to handle DWC and CFCSS errors because these are uncorrectable. The default behavior is to create ``abort()`` function calls if errors are detected. However, user functions can be called in place of ``abort()``. In order to do so, the source code needs a definition for the function ``void FAULT_DETECTED_DWC()`` or ``void FAULT_DETECTED_CFCSS`` for DWC and CFCSS, respectively.
+**Error Handlers**\ : The user has the choice of how to handle DWC and CFCSS errors because these are uncorrectable. The default behavior is to create ``abort()`` function calls if errors are detected. However, user functions can be called in place of ``abort()``. In order to do so, the source code needs a definition for the function ``void FAULT_DETECTED_DWC()`` or ``void FAULT_DETECTED_CFCSS()`` for DWC and CFCSS, respectively.
 
 **Input Initialization**\ : Global variables with initial values provide an interesting problem for testing. By default, these initial values are assigned to each replicate at compile time. This models the scenario where the SoR expands into the source of the data. However, this does not accurately model the case when code inputs need to be replicated at runtime. This could happen, for instance, if a UART was feeding data into a program and storing the result in a global variable. When global variables are listed using ``-runtimeInitGlbls`` the pass inserts ``memcpy()`` calls to copy global variable data into the replicates at runtime. This supports scalar values as well as aggregate data types, such as arrays and structures.
 
@@ -215,9 +270,57 @@ Other Options
 
 By default, COAST groups copies of instructions before synchronization points, effectively partitioning regions of code into segments where each copy of the program runs uninterrupted. Alternately, the user can specify that instructions should be interleaved using ``-i``.
 
-**Printing Status Messages**\ : Using the ``-verbose`` flag will print more information about what the pass is doing. This includes removing unused functions and unused global strings. These are mainly helpful for examining when your code is not behaving exactly as expected.
+**Printing Status Messages**\ : Using the ``-verbose`` flag will print more information about what the pass is doing. This includes removing unused functions and unused global strings.
+
+If you are developing passes, then on occasion you might need to include more printing statements. Using the ``-dumpModule`` flag causes the pass to print out the entirety of the LLVM module to the command line in LLVM IR format.
+
+
+.. _dbg_tools:
+
+Debugging Tools
+=================
+
+COAST verbose output
+--------------------
 
-If you are developing passes, then on occasion you might need to include more printing statements. Using ``-dumpModule`` causes the pass to print out the entirety of the LLVM module to the command line in a format that can be tested using ``lli``. This is mainly helpful if the pass is not cleaning up after itself properly. The function ``dumpModule()`` can also be placed in different places in the code for additional debugging capabilities.
+As mentioned above, COAST supports the ``-verbose`` and ``-dumpModule`` flags.  The ``-verbose`` output lists alls of the in-code directives processed, which functions are having their signatures changed, as well as any unused globals or functions being removed.  COAST will also print warnings or errors about unsupported language constructs being used.
+
+Using the ``-dumpModule`` flag is useful to get an idea of what COAST is doing if it's failing to finish compilation.  The function ``dumpModule()`` can also be placed in different places in the code for additional debugging capabilities.  Since the module will be output to the ``stderr`` stream, and it can be quite a lot of data, it is important to redirect the output properly.
+
+Example: ``opt -TMR -dumpModule input.bc -o output.bc > dump.ll 2>&1``
+
+
+Debug Statements
+-----------------
+
+By default, the Debug Statements pass will add code to the beginning of every basic block that prints out the function name followed by the name of the basic block.  For example, you would expect the first message to be ``main->entry``.  This can produce 100s of MegaBytes of data, so it is important to redirect this output to a file, as shown in the example above.  This verbose output represents a complete call graph of the execution, although trawling through all of this data can be quite difficult.
+
+.. versionadded:: Oct2019
+
+There is an option to only add print statements to certain functions.  Pass ``-fnPrintList=`` with a comma-separated list of function names that will be instrumented with the print statements.  This will allow examining smaller parts of the execution at a time.
+
+
+Small Profiler
+-----------------
+
+.. versionadded:: Oct2019
+
+The Small Profiler is a pass which simply counts the number of calls to each function in the module.  It creates global variables that correspond to each function in the module.  Each time a function is called, the corresponding global variable is incremented.  The pass adds a call to a function named ``PRINT_PROFILE_STATS`` immediately before the ``main`` function exits.  If the program does not terminate, calls to this function may be inserted manually by the programmer.
+
+This pass also has two command line parameters:
+
+.. table::
+    :widths: 25 40
+
+    +---------------------------+-----------------------------------------------------+
+    | Command line option       | Effect                                              |
+    +===========================+=====================================================+
+    |                           | The name of the function that is used to print      |
+    |      ``printFnName``      | the stats.  The default is ``printf``.  This flag   |
+    |                           | is for if the platform does not support ``printf``. |
+    +---------------------------+-----------------------------------------------------+
+    |      ``noPrint``          | Do not insert the call to ``PRINT_PROFILE_STATS``.  |
+    +---------------------------+-----------------------------------------------------+
 
 
 .. rubric:: Footnotes
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
new file mode 100644
index 000000000..ac7cda33b
--- /dev/null
+++ b/docs/source/release_notes.rst
@@ -0,0 +1,61 @@
+.. This document explains the changes in the releases
+
+Release Notes
+**************
+
+October 2019
+==============
+
+
+Features
+---------
+
+- Support for ``invoke`` instructions.
+
+- Replication rules, does NOT sync on stores by default, added flag to enable turning that on (``-storeDataSync``).
+
+- Support for compiling multiple files in the same project at different times (using the ``-noMain`` flag).
+
+- Before running the pass, validates that the replication rules given to COAST are consistent with themselves.
+
+- Can sync on vector types.
+
+- Added more unit tests, along with a test driver.
+
+
+Directives
+------------
+
+- Added directive ``__SKIP_FN_CALL`` that has the same behavior as ``-skipFnCalls=`` command line parameter.
+
+- Can add option to not check globals crossing Sphere of Replication (``__COAST_IGNORE_GLOBAL(name)``).
+
+- Added directive macro for marking variables as volatile.
+
+- Treats any globals or functions marked with ``__attribute__((used))`` as volatile and will not remove them.  Also true for globals used in functions marked as "used".
+
+- Added wrapper macros for calling a function with the clones of the arguments.  Useful for ``printf()`` and ``malloc()``, etc, when you only want specific calls to be replicated.
+
+
+Bug Fixes
+-------------
+
+Thanks to Christos Gentsos for pointing out some errors in the code base.
+
+- Allow more usage of function pointers by printing warning message instead of crashing. 
+
+- Added various missing ``nullptr`` checks.
+
+- Fixed crashing on some ``void`` return type functions.
+
+- Better cleanup of stale pointers.
+
+
+Debugging Tools
+-----------------
+
+- Added an option to the ``DebugStatements`` pass that only adds print statements to specified functions.
+
+- Created a simplistic profiling pass called ``SmallProfile`` that can collect function call counts.
+
+- Support for preserving debug info when source is compiled with debug flags.
diff --git a/docs/source/setup.rst b/docs/source/setup.rst
index 24470f2d8..7833892ed 100644
--- a/docs/source/setup.rst
+++ b/docs/source/setup.rst
@@ -13,7 +13,6 @@ Prerequisites
 ================
 
 - Have a version of Linux that has ``cmake`` and ``make`` installed.
-- Make sure ``git`` is installed and clone this repository.
 
 For reference, development of this tool has been done on Ubuntu 16.04.
 
@@ -30,7 +29,7 @@ Building LLVM
 
  ``cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Debug -DLLVM_ENABLE_ASSERTIONS=On ../llvm/``
 
- See the README in the "build" folder for more information.
+ See the ``README.md`` in the "build" folder for more information on how to further configure LLVM.
 
 - Run ``make``.  This may take quite a while, up to an hour or more if you do not parallelize the job.  Adding the flag ``-jn`` allows you to parallelize across ``n`` cores.
 
diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst
index 4e124852a..2843f3e2a 100644
--- a/docs/source/troubleshooting.rst
+++ b/docs/source/troubleshooting.rst
@@ -3,7 +3,7 @@
 Troubleshooting
 *******************
 
-Although it is unlikely, there is a possibility that COAST could cause user code to crash. This is most often due to complications over what should be replicated, as described in Section 3.1.2. If the crash occurs during compilation, please submit a report to jgoeders@byu.edu. If the code compiles but does not run properly, here are several steps we have found helpful. Note that running with DWC often exposes these errors, but TMR silently masks incorrect execution, which can make debugging difficult.
+Although it is unlikely, there is a possibility that COAST could cause user code to crash. This is most often due to complications over what should be replicated, as described in the :ref:`when_repl_cmds` and :ref:`repl_scope` sections. If the crash occurs during compilation, please submit a report to jgoeders@byu.edu. If the code compiles but does not run properly, here are several steps we have found helpful. Note that running with DWC often exposes these errors, but TMR silently masks incorrect execution, which can make debugging difficult.
 
 Troubleshooting Ideas
 =======================
@@ -11,8 +11,9 @@ Troubleshooting Ideas
 - Check to see if the program runs using ``lli`` before and after the optimizer, then test if the generated binary runs on your platform. This allows you to test that ``llc`` is operating properly.
 - You cannot replicate functions that are passed by reference into library calls. This may or may not be possible in user calls. Use ``-ignoreFns`` for these.
 - For systems with limited resources, duplicating or triplicating code can take up too much RAM or ROM and cause the processor to halt. Test if a smaller program can run.
-- The majority of bugs that we have encountered have stemmed from incorrect usage of customization. Please refer to Table 2 and ensure that each function call behaves properly. Many of these bugs have stemmed from user wrappers to ``malloc()`` and ``free()``. The call was not replicated, so all of the instructions operated on a single piece of data, which caused multiple ``free()`` calls on the same memory address.
+- The majority of bugs that we have encountered have stemmed from incorrect usage of customization. Please refer to :ref:`when_repl_cmds` and ensure that each function call behaves properly. Many of these bugs have stemmed from user wrappers to ``malloc()`` and ``free()``. The call was not replicated, so all of the instructions operated on a single piece of data, which caused multiple ``free()`` calls on the same memory address.
 - Another point of customization to be aware of is how to handle hardware interactions. Calls to hardware resources, such as a UART, should be marked so they are not replicated unless specifically required.
-- Be aware of synchronization logic. If a variable changes between accesses of instruction copies, then the copies will fail when compared.
-- Use the ``-debugStatements`` flag to explore the IR and find the exact point of failure.
+- Be aware of synchronization logic. If a variable changes between accesses of instruction copies, such as volatile hardware registers, then the copies will fail when compared.
+- Use the ``-debugStatements`` flag to explore the IR and find the exact point of failure.  See the :ref:`dbg_tools` section for more information.
 - You may get an error that looks something like ``undefined symbol: ZTV18dataflowProtection`` when you try to run DWC or TMR. This occurs when you do not load the dataflowProtection pass before the DWC or TMR pass. Include ``-load <Path to dataflow protection.so>`` in your call to ``opt``.
+- If compiling a C++ project, be aware that the compiler will often `mangle <https://en.wikipedia.org/wiki/Name_mangling#C++>`_ the names of functions.  In this case, the function names passed in to COAST may need to be changed.  Examine the LLVM IR output being given to ``opt`` to make sure they are correct.
diff --git a/llvm/include/llvm/Config/abi-breaking.h b/llvm/include/llvm/Config/abi-breaking.h
new file mode 100644
index 000000000..d3ea6dffe
--- /dev/null
+++ b/llvm/include/llvm/Config/abi-breaking.h
@@ -0,0 +1,4 @@
+// for some reason, things expect this to exist, but it didn't
+// Oct 2019
+#define LLVM_ENABLE_ABI_BREAKING_CHECKS 0
+
diff --git a/projects/CFCSS/CFCSS.cpp b/projects/CFCSS/CFCSS.cpp
index 473fabb2f..586742cca 100644
--- a/projects/CFCSS/CFCSS.cpp
+++ b/projects/CFCSS/CFCSS.cpp
@@ -207,7 +207,7 @@ void CFCSS::BubbleSort(){
 			if((graph[j]->num) > (graph[j+1]->num)){
 				CFCSS::BBNode* b = graph[j];
 				graph[j] = graph[j+1];
-				graph[j] = b;
+				graph[j+1] = b;
 			}
 		}
 	}
@@ -760,7 +760,7 @@ bool CFCSS::runOnModule(Module &M) {
 				continue;
 			}
 //			assert(calledF && "Called function is valid");
-			else if (!calledF->isDeclaration() && !shouldSkipF(calledF->getName())){
+			else if (!calledF->isDeclaration() && !shouldSkipF(calledF->getName()) && !skipFnCl(calledF)){
 				updateCallInsts(callI, bn, IT1, RTS, RTSA);
 				callInstList.push_back(callI);
 				callCount[calledF->getName()] += 1;
diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt
index 21987960b..d55aa33db 100644
--- a/projects/CMakeLists.txt
+++ b/projects/CMakeLists.txt
@@ -22,8 +22,8 @@ include_directories ( ${LLVM_INCLUDE_DIRS})
 
 add_subdirectory (debugStatements)
 add_subdirectory (exitMarker)
-add_subdirectory (errorBlocks)
 add_subdirectory (CFCSS)
 add_subdirectory (dataflowProtection)
 add_subdirectory (DWC)
 add_subdirectory (TMR)
+add_subdirectory (smallProfile)
diff --git a/projects/dataflowProtection/cloning.cpp b/projects/dataflowProtection/cloning.cpp
index 4c485f2b0..e3cdc5095 100644
--- a/projects/dataflowProtection/cloning.cpp
+++ b/projects/dataflowProtection/cloning.cpp
@@ -13,6 +13,7 @@
 #include <llvm/IR/Intrinsics.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/DIBuilder.h>
+
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Transforms/Utils/Cloning.h>
 #include <llvm/Analysis/AliasSetTracker.h>
@@ -36,11 +37,15 @@ using namespace llvm;
 // Initialization
 //----------------------------------------------------------------------------//
 void dataflowProtection::populateValuesToClone(Module& M) {
+	//why was this here? Makes it impossible to clone local variables with in-code directives
+	//Because some pointers become stale. Therefore, second set of Instructions that is not volatile
+	// contains the instructions marked as such by the annotations.
 	instsToClone.clear();
+	instsToClone.insert(instsToCloneAnno.begin(), instsToCloneAnno.end());
 	// globalsToClone.clear();
 	constantExprToClone.clear();
 
-	for(auto F : fnsToClone) {
+	for (auto F : fnsToClone) {
 		if (std::find(coarseGrainedUserFunctions.begin(), coarseGrainedUserFunctions.end(),
 				F->getName()) != coarseGrainedUserFunctions.end()) {
 //			errs() << F->getName() << " is coarse grained. Not replicating.\n";
@@ -77,7 +82,7 @@ void dataflowProtection::populateValuesToClone(Module& M) {
 					}
 
 					// skip bitcasts and print a warning message, because this might skip more than bitcasts
-					if (!isIndirectFunctionCall(ci, "populateValuesToClone",false)) {
+					if (!isIndirectFunctionCall(ci, "populateValuesToClone", false)) {
 						Function* cF = ci->getCalledFunction();
 						if (std::find(skipLibCalls.begin(), skipLibCalls.end(),
 								cF->getName()) != skipLibCalls.end()) {
@@ -93,6 +98,14 @@ void dataflowProtection::populateValuesToClone(Module& M) {
 								continue;
 							}
 						}
+
+						// skip replicating debug function calls, the debugger only knows about the
+						//  original variable names anyway.
+						if (cF->getName().startswith_lower("llvm.dbg.") ||
+								cF->getName().startswith_lower("llvm.lifetime.")) {
+							continue;
+						}
+
 					} else {	// it is an indirect function call
 
 						Value* calledValue = ci->getCalledValue();
@@ -137,8 +150,19 @@ void dataflowProtection::populateValuesToClone(Module& M) {
 
 				//We don't clone terminators
 				//Invoke is "designed to operate as a standard call instruction in most regards" - don't clone
-				if (I.isTerminator() || isa<InvokeInst>(I))
-					continue;
+				if (I.isTerminator() || isa<InvokeInst>(I)) {
+					//we do need to clone the invokes if the function they call is marked as coarse-grained
+					if (InvokeInst* invInst = dyn_cast<InvokeInst>(&I)) {
+						if (std::find(coarseGrainedUserFunctions.begin(), coarseGrainedUserFunctions.end(),
+								invInst->getCalledFunction()->getName()) != coarseGrainedUserFunctions.end()) {
+							;	//add it to the list
+						} else {
+							continue;
+						}
+					} else {
+						continue;
+					}
+				}
 
 				//Don't clone stores to external globals - assumed to be devices
 				if (StoreInst* SI = dyn_cast<StoreInst>(&I)) {
@@ -155,6 +179,7 @@ void dataflowProtection::populateValuesToClone(Module& M) {
 					continue;
 				}
 
+//				if (instsToClone.empty())
 				instsToClone.insert(&I);
 			}
 		}
@@ -169,6 +194,7 @@ void dataflowProtection::populateValuesToClone(Module& M) {
 		}
 
 		//Don't clone ISR function pointers
+		//TODO: not a good way of checking for ISRs
 		if (globalName.startswith("__vector") || globalName.startswith("isr_")) {
 //			errs() << "WARNING: not duplicating global value " << g.getName() << ", assuming it is llvm-created\n";
 			continue;
@@ -194,6 +220,7 @@ void dataflowProtection::populateValuesToClone(Module& M) {
 // Modify functions
 //----------------------------------------------------------------------------//
 void dataflowProtection::populateFnWorklist(Module& M) {
+
 	//Populate a set with all user-defined functions
 	std::set<Function*> fnList;
 	for (auto & fn_it : M) {
@@ -201,6 +228,7 @@ void dataflowProtection::populateFnWorklist(Module& M) {
 		if (std::find(unsupportedFunctions.begin(), unsupportedFunctions.end(),
 				fn_it.getName()) != unsupportedFunctions.end()) {
 			errs() << "ERROR: \n    " << fn_it.getName() << ": function is not supported!\n\n\n";
+			// definitely will quit
 			std::exit(-1);
 			assert(false && "Function is not supported!");
 		}
@@ -215,6 +243,7 @@ void dataflowProtection::populateFnWorklist(Module& M) {
 			continue;
 		}
 
+		// skip user marked coarse-grained functions
 		if (std::find(coarseGrainedUserFunctions.begin(), coarseGrainedUserFunctions.end(),
 				fn_it.getName())!=coarseGrainedUserFunctions.end()) {
 			continue;
@@ -239,7 +268,7 @@ void dataflowProtection::populateFnWorklist(Module& M) {
 						//Skip any thing that doesn't have a called function, print warnings
 						if (isIndirectFunctionCall(CI, "populateFnWorklist"))
 							continue;
-						if (CI->getCalledFunction()->isDeclaration()){
+						if (CI->getCalledFunction()->isDeclaration()) {
 							continue;
 						} else if(fnsToSkip.find(CI->getCalledFunction())==fnsToSkip.end()) {
 							fnsToSkip.insert(CI->getCalledFunction());
@@ -301,7 +330,7 @@ void dataflowProtection::populateFnWorklist(Module& M) {
 	}
 
 	//Make sure coarse grained functions aren't modified
-	for (auto it : fnsToClone){
+	for (auto it : fnsToClone) {
 		if (std::find(coarseGrainedUserFunctions.begin(), coarseGrainedUserFunctions.end(),
 				it->getName())!=coarseGrainedUserFunctions.end()) {
 			fnsToClone.erase(it);
@@ -312,6 +341,7 @@ void dataflowProtection::populateFnWorklist(Module& M) {
 
 void dataflowProtection::cloneFunctionArguments(Module & M) {
 	std::vector<Function*> functionsToFix;
+	int warnedFnPtrs = 0;
 
 	//If we aren't replicating everything by default then don't update fn sig
 	//There won't be any clones to pass into it
@@ -376,6 +406,7 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 
 		// See if what is passed in has a clone
 		for (auto u : F->users()) {
+
 			//Ignore global annotations - globals containing bitcasts
 			if (auto ce = dyn_cast<ConstantExpr>(u)) {
 				if (ce->isCast()) {
@@ -398,9 +429,6 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 				}
 				continue;
 			}
-//			for (auto aliasIterator = M.alias_begin(); aliasIterator != M.alias_end(); aliasIterator++) {
-//				errs() << *aliasIterator << "\n";
-//			}
 
 			//check for invoke instructions
 			if (InvokeInst* invInst = dyn_cast<InvokeInst>(u)) {
@@ -413,7 +441,7 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 				 * user-defined functions. If this is a library call, it must not be touched,
 				 * because we can't change the body of library functions, and invoke instructions
 				 * are treated as terminator instructions, so we can't just replicate the call.
-				 * However, library calls shouldn't show up in this function.
+				 * However, library calls shouldn't show up in this part of the pass.
 				 */
 
 				// clone the operands
@@ -425,8 +453,6 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 				continue;
 			}
 
-			CallInst * callInst = dyn_cast<CallInst>(u);
-
 			//Handle arrays of function pointers by marking what should be modified
 			if (ConstantArray* ca = dyn_cast<ConstantArray>(u)) {
 				#ifndef NO_FN_PTR_ARRAY
@@ -443,13 +469,26 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 				continue;
 			}
 
-
+			CallInst * callInst = dyn_cast<CallInst>(u);
 			if (!callInst) {
-				errs() << err_string << " User is not a CallInst!\n" << *u << "\n";
-				errs() << "User of function " << F->getName() << "\n";
-				errs() << "Type: " << *(u->getType()) << "\n\n";
+				// then it's probably something with function pointers
+				if (verboseFlag) {
+					if (!warnedFnPtrs) {
+						errs() << warn_string << " function pointers (" << F->getName();
+						errs() << ") are not supported by COAST.  Use at your own risk\n";
+						warnedFnPtrs = 1;
+					}
+					errs() << *u << "\n";
+				}
+				continue;
+			}
+
+			// It's possible that the function user is not actually a call to the function, but a call
+			//  to some other function that passes this one as a parameter.
+			Function* CF = callInst->getCalledFunction();
+			if (CF != F) {
+				continue;
 			}
-			assert(callInst && "User is not a call instruction");
 
 			for (unsigned int i = 0; i < callInst->getNumArgOperands(); i++) {
 				if (willBeCloned(callInst->getArgOperand(i))) {
@@ -457,6 +496,9 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 				}
 			}
 		}
+		warnedFnPtrs = 0;
+
+		// TODO: some of the arguments could be specifically requested to NOT clone them
 
 		// Check if any parameters need clones
 		bool needClones = false;
@@ -470,13 +512,15 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 
 		// Now clone the function arguments
 		std::vector<Type*> params;
-		for (unsigned int i = 0; i < F->getFunctionType()->params().size();
-				i++) {
-			params.push_back(F->getFunctionType()->getParamType(i));
-			if (cloneArg[i]){
-				params.push_back(F->getFunctionType()->getParamType(i));
+		for (unsigned int i = 0; i < F->getFunctionType()->params().size(); i++) {
+
+			Type* nextType = F->getFunctionType()->getParamType(i);
+			params.push_back(nextType);
+
+			if (cloneArg[i]) {
+				params.push_back(nextType);
 				if (TMR) {
-					params.push_back(F->getFunctionType()->getParamType(i));
+					params.push_back(nextType);
 				}
 			}
 		}
@@ -487,13 +531,12 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 				F->getFunctionType()->getReturnType(), paramArray, false);
 
 		std::string Fname;
-		if(!TMR)
+		if (!TMR)
 			Fname= F->getName().str() + "_DWC";
 		else
 			Fname= F->getName().str() + "_TMR";
 		Constant * c = M.getOrInsertFunction(Fname, Ftype);
 		Function * Fnew = dyn_cast<Function>(c);
-
 		assert(Fnew && "New function is non-void");
 		newFunctions.push_back(Fnew);
 
@@ -530,13 +573,6 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 			i++;
 		}
 
-//		errs() << "Function arguments: \n";
-//		for(auto p = Fnew->arg_begin(); p != Fnew->arg_end(); p++){
-//			errs() << *p << "\n";
-//		}
-//		errs() << "\n";
-
-
 		SmallVector<ReturnInst*, 8> returns;
 		CloneFunctionInto(Fnew, F, paramMap, true, returns);
 		origFunctions.push_back(F);
@@ -598,7 +634,13 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 				assert(callInst && "Replacing function calls in cloneFnArgs");
 
 				Function* parentFn = callInst->getParent()->getParent();
-				if (fnsToClone.find(parentFn)==fnsToClone.end()) {
+				if (fnsToClone.find(parentFn) == fnsToClone.end()) {
+					continue;
+				}
+
+				// if the use of the function is actually a function pointer *in* the call,
+				//  then need to skip doing anything to this one
+				if (callInst->getCalledFunction() != F) {
 					continue;
 				}
 
@@ -620,10 +662,17 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 
 				ArrayRef<Value*>* callArgs;
 				callArgs = new ArrayRef<Value*>(args);
-
-				//The casting here is to stop from complaining that the Create call doesn't have the right types
-				CallInst * newCallInst = CallInst::Create((Value*) Fnew, *callArgs,
-						Twine(callInst->getName()), (Instruction*) callInst);
+				CallInst* newCallInst;
+
+				//turns out that void returning function calls have no name, so have to be careful here
+				if (Fnew->getReturnType() == Type::getVoidTy(M.getContext())) {
+					newCallInst = CallInst::Create((Value*) Fnew, *callArgs);
+					newCallInst->insertBefore(callInst);
+				} else {
+					//The casting here is to stop from complaining that the Create call doesn't have the right types
+					newCallInst = CallInst::Create((Value*) Fnew, *callArgs,
+							Twine(callInst->getName()), (Instruction*) callInst);
+				}
 
 				//Deal with function calls inside function args when casted - not recognized as callInsts
 				for (auto ops : newCallInst->operand_values()) {
@@ -644,6 +693,16 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 				}
 
 				// Replace all uses of the original call instruction with the new one
+				if (callInst->getType() != newCallInst->getType()) {
+					if (F->hasName()) {
+						errs() << "Looking at function '" << F->getName() << "'\n";
+						errs() << *F->getFunctionType() << "\n";
+					}
+					errs() << *callInst << "\n";
+					errs() << *newCallInst << "\n";
+				}
+				assert(callInst->getType() == newCallInst->getType());
+
 				callInst->replaceAllUsesWith(newCallInst);
 				callInst->eraseFromParent();
 			} else if (InvokeInst* invInst = dyn_cast<InvokeInst>(u)) {
@@ -695,6 +754,7 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 				invInst->eraseFromParent();
 			} else {
 				assert(false && "wrong type!\n");
+//				TODO: what would cause this to fail?
 			}
 
 		}
@@ -757,12 +817,13 @@ void dataflowProtection::cloneFunctionArguments(Module & M) {
 
 }
 
+//#define DBG_UPDATE_CALLS
 void dataflowProtection::updateCallInsns(Module & M) {
 
 	for (auto &F : M) {
 		//If we are skipping the function, don't update the call instructions
-		if(fnsToCloneAndSkip.find(&F)!=fnsToCloneAndSkip.end()){
-			if(fnsToClone.find(&F)==fnsToClone.end()){
+		if (fnsToCloneAndSkip.find(&F)!=fnsToCloneAndSkip.end()) {
+			if (fnsToClone.find(&F)==fnsToClone.end()) {
 				continue;
 			}
 		}
@@ -770,10 +831,11 @@ void dataflowProtection::updateCallInsns(Module & M) {
 		for (auto & bb : F) {
 			for (auto & I : bb) {
 				if (CallInst * CI = dyn_cast<CallInst>(&I)) {
-
 					Function * Fcalled = CI->getCalledFunction();
+
 					if (argNumsCloned.find(Fcalled) != argNumsCloned.end()) {
 						auto argsCloned = argNumsCloned[Fcalled];
+
 						for (auto argNum : argsCloned) {
 							Value* op = CI->getArgOperand(argNum);
 							if (isCloned(op)) {
@@ -781,7 +843,7 @@ void dataflowProtection::updateCallInsns(Module & M) {
 								CI->setArgOperand(argNum + 1, clone1);
 
 								Value* clone2;
-								if(TMR){
+								if (TMR) {
 									clone2 = getClone(op).second;
 									CI->setArgOperand(argNum + 2, clone2);
 								}
@@ -789,10 +851,29 @@ void dataflowProtection::updateCallInsns(Module & M) {
 						}
 					}
 				}
+			}
+		}
+	}
+	errs() << "\n";
+}
+
+void dataflowProtection::updateInvokeInsns(Module & M) {
+
+	for (auto &F : M) {
+		//If we are skipping the function, don't update the call instructions
+		if (fnsToCloneAndSkip.find(&F)!=fnsToCloneAndSkip.end()) {
+			if (fnsToClone.find(&F)==fnsToClone.end()) {
+				continue;
+			}
+		}
 
+		for (auto & bb : F) {
+			for (auto & I : bb) {
 				//also need to update Invoke instructions
 				if (InvokeInst* invInst = dyn_cast<InvokeInst>(&I)) {
 					Function* Fcalled = invInst->getCalledFunction();
+
+					//clone the arguments
 					if (argNumsCloned.find(Fcalled) != argNumsCloned.end()) {
 						auto argsCloned = argNumsCloned[Fcalled];
 
@@ -816,6 +897,7 @@ void dataflowProtection::updateCallInsns(Module & M) {
 	}
 }
 
+//#define DEBUGGING_MEMSET
 //----------------------------------------------------------------------------//
 // Fine-grained cloning of instructions
 //----------------------------------------------------------------------------//
@@ -825,70 +907,142 @@ bool dataflowProtection::cloneInsns() {
 
 	//Populate the clone list
 	for (auto I : instsToClone) {
-		Instruction* newI1 = I->clone();
+		Instruction* newI1;
 		Instruction* newI2;
+		if (InvokeInst* invInst = dyn_cast<InvokeInst>(I) ) {
+			if (invInst->getCalledFunction()->getReturnType()->isVoidTy()) {
+				continue;
+			}
+			Function* Fparent = invInst->getParent()->getParent();
 
-		if (!I->getType()->isVoidTy()) {
-			newI1->setName(I->getName() + ".DWC");
-		}
+			//we need to create a new basic block to branch to on success
+			BasicBlock* beforeBlock = invInst->getParent();
+			BasicBlock* afterBlock = invInst->getNormalDest();
+			BasicBlock* landingBlock = invInst->getUnwindDest();
+
+			const Twine& blockName1 = Fparent->getName() + ".invoke.DWC";
+			BasicBlock* newBlock1 = BasicBlock::Create(Fparent->getContext(), \
+					blockName1, Fparent, afterBlock);
+			afterBlock = invInst->getNormalDest();
+
+			//set original invoke to have new normal destination
+			invInst->setNormalDest(newBlock1);
+
+			//make a dummy instruction so we have somewhere to put the invoke
+			ConstantInt* nothing = ConstantInt::get(IntegerType::getInt16Ty(Fparent->getContext()), 1, false);
+			BinaryOperator* dummy1 = BinaryOperator::CreateNeg(nothing, "dummy1", newBlock1);
 
-		newI1->insertAfter(I);
+			//that contains a copy of the same invoke instruction
+			InvokeInst* newInv1 = dyn_cast<InvokeInst>(invInst->clone());
+			InvokeInst* newInv2;
+			newInv1->setName(invInst->getName() + ".DWC");
+			newInv1->insertAfter(dummy1);
+			dummy1->eraseFromParent();
+
+			//the new one will have the same unwind location
+			newInv1->setUnwindDest(landingBlock);
+
+			if (TMR) {
+				const Twine& blockName2 = Fparent->getName() + ".invoke.TMR";
+				BasicBlock* newBlock2 = BasicBlock::Create(Fparent->getContext(), blockName2, Fparent);
+				newBlock2->moveAfter(newBlock1);
+
+				BinaryOperator* dummy2 = BinaryOperator::CreateNeg(nothing, "dummy2", newBlock2);
+
+				newInv2 = dyn_cast<InvokeInst>(invInst->clone());
+				newInv2->setName(invInst->getName() + ".TMR");
+				newInv2->insertAfter(dummy2);
+				dummy2->eraseFromParent();
+
+				newInv2->setUnwindDest(landingBlock);
+				newInv1->setNormalDest(newBlock2);
+				newInv2->setNormalDest(afterBlock);
+
+//				errs() << " - new basic block:\n" << *newBlock1 << "\n";
+//				errs() << " - new TMR basic block:\n" << *newBlock2 << "\n";
+//				errs() << " - next:\n" << *newInv2->getNormalDest() << "\n";
+
+				newI2 = dyn_cast<Instruction>(newInv2);
+			} else {
+				newInv1->setNormalDest(afterBlock);
+//				errs() << " - new basic block:\n" << *newBlock1 << "\n";
+			}
+			//for the map
+			newI1 = dyn_cast<Instruction>(newInv1);
+
+		} else {	//everything else besides InvokeInst
+			newI1 = I->clone();
 
-		if(TMR){
-			newI2 = I->clone();
 			if (!I->getType()->isVoidTy()) {
-				newI2->setName(I->getName() + ".TMR");
+				newI1->setName(I->getName() + ".DWC");
 			}
 
-			newI2->insertAfter(newI1);
+			newI1->insertAfter(I);
+
+			if (TMR) {
+				newI2 = I->clone();
+				if (!I->getType()->isVoidTy()) {
+					newI2->setName(I->getName() + ".TMR");
+				}
+
+				newI2->insertAfter(newI1);
+			}
 		}
 
-		instsCloned.push_back(std::pair<Instruction*,Instruction*>(newI1,newI2));
-		cloneMap[I] = ValuePair(newI1,newI2);
+		instsCloned.push_back(std::pair<Instruction*,Instruction*>(newI1, newI2));
+		cloneMap[I] = ValuePair(newI1, newI2);
 	}
 
 	//Iterate over the clone list and change references
 	for (auto clone : instsCloned) {
 		//Iterate over the operands in the instruction
+
 		for (unsigned i = 0; i < clone.first->getNumOperands(); i++) {
 			//If the operand is found in the map change the reference
 			Value* op = clone.first->getOperand(i);
-			if (cloneMap.find(op) != cloneMap.end()){ //If we found it
-				if(noMemReplicationFlag){ //Not replicating memory
+
+			//skip changing basic block references on the invoke instructions,
+			// we already set them up correctly above
+			if (isa<InvokeInst>(clone.first) && isa<BasicBlock>(op)) {
+				continue;
+			}
+
+			if (cloneMap.find(op) != cloneMap.end()) { 	//If we found it
+				if (noMemReplicationFlag) { 			//Not replicating memory
 					//If we aren't replicating memory then we should not change the load inst. address
-					if(dyn_cast<LoadInst>(clone.first)){ //Don't change load instructions
+					if (dyn_cast<LoadInst>(clone.first)) { //Don't change load instructions
 						assert(clone.first && "Clone exists when updating operand");
 						clone.first->setOperand(i, op);
-						if(TMR){
+						if (TMR) {
 							assert(clone.second && "Clone exists when updating operand");
 							clone.second->setOperand(i, op);
 						}
-					} else{ //Else update as normal
+					} else { //Else update as normal
 						clone.first->setOperand(i, cloneMap[op].first);
-						if(TMR){
+						if (TMR) {
 							clone.second->setOperand(i, cloneMap[op].second);
 						}
 					}
-				} else{ //Replicating memory
+				} else { //Replicating memory
 					clone.first->setOperand(i, cloneMap[op].first);
-					if(TMR){
+					if (TMR) {
 						clone.second->setOperand(i, cloneMap[op].second);
 					}
 				}
-			} else if(ConstantExpr* ce = dyn_cast<ConstantExpr>(op)){
+			} else if (ConstantExpr* ce = dyn_cast<ConstantExpr>(op)) {
 				//Don't need to update references to constant ints
 				assert(ce && "Null ConstantExpr ce");
-				if(isa<ConstantInt>(ce->getOperand(0))){
+				if (isa<ConstantInt>(ce->getOperand(0))) {
 					continue;
 				}
 
-				if(!willBeCloned(ce->getOperand(0))){
+				if (!willBeCloned(ce->getOperand(0))) {
 					continue;
 				}
 
 				//Don't mess with loads with inline GEPs
-				if(noMemReplicationFlag){
-					if(ce->isGEPWithNoNotionalOverIndexing()){
+				if (noMemReplicationFlag) {
+					if (ce->isGEPWithNoNotionalOverIndexing()) {
 						continue;
 					}
 				}
@@ -906,14 +1060,83 @@ bool dataflowProtection::cloneInsns() {
 				 * This only is a problem when the noMemReplication flag, therefore it's OK to skip changing
 				 *  the instruction arguments, since it would all be the same argument anyway.
 				 */
+				//might be an inline reference to a global variable. example:
+				//%0 = load <4 x i32>, <4 x i32>* bitcast ([2 x [8 x i32]]* @matrix to <4 x i32>*), align 16, !tbaa !2
+				//in the following code segment, the leading underscores in names represent levels of indirection
 				if (ce->isCast()) {
-					if (verboseFlag) {
-						errs() << warn_string << ": In cloneInsns() skipping processing cloned ConstantExpr:\n";
+					if (noMemReplicationFlag)
+						continue;
+
+					Value* _op = ce->getOperand(0);
+					if (isCloned(_op)) {
+//						errs() << *_op << "\n";
+						ConstantExpr* ce1 = dyn_cast<ConstantExpr>(clone.first->getOperand(i));
+						Value* _op1 = cloneMap[_op].first;
+						assert(_op1 && "valid clone");
+//						errs() << *_op1 << "\n";
+						Constant* _nop1 = dyn_cast<Constant>(_op1);
+						Constant* nce1 = ce1->getWithOperandReplaced(0, _nop1);
+//						errs() << *nce1 << "\n";
+						clone.first->setOperand(i, nce1);
+						if (TMR) {
+							ConstantExpr* ce2 = dyn_cast<ConstantExpr>(clone.second->getOperand(i));
+							Value* _op2 = cloneMap[_op].second;
+							assert(_op2 && "valid second clone");
+							Constant* _nop2 = dyn_cast<Constant>(_op2);
+							Constant* nce2 = ce2->getWithOperandReplaced(0, _nop2);
+							clone.second->setOperand(i, nce2);
+						}
+						continue;
+					}
+					//could be something ugly like:
+					//%2 = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([2 x [8 x i32]], [2 x [8 x i32]]* @matrix, i64 0, i64 0, i64 4) to <4 x i32>*), align 16, !tbaa !2
+					ConstantExpr* innerGEPclone1 = dyn_cast<ConstantExpr>(_op);
+					if (innerGEPclone1 && innerGEPclone1->isGEPWithNoNotionalOverIndexing()) {
+
+						//get the place to update
+						ConstantExpr* innerGEPclone1 = dyn_cast<ConstantExpr>(ce->getOperand(0));
+//						errs() << " - " << *innerGEPclone1 << "\n";
+
+						//this next thing is what has the clone(s)
+						Value* GEPvalOrig = innerGEPclone1->getOperand(0);
+//						errs() << " - " << *GEPvalOrig << "\n";
+						//get the clone
+						Value* GEPvalClone1 = cloneMap[GEPvalOrig].first;
+						assert(GEPvalClone1 && "valid clone");
+
+						//replace uses
+						Constant* newGEPclone1 = innerGEPclone1->getWithOperandReplaced(
+								0, dyn_cast<Constant>(GEPvalClone1));
+						Constant* newCE = ConstantExpr::getCast(
+								ce->getOpcode(), newGEPclone1, ce->getType());
+						clone.first->setOperand(i, newCE);
+//						errs() << " - " << *ce << "\n";
+//						errs() << " - " << *clone.first << "\n";
+
+						if (TMR) {
+							ConstantExpr* ce2 = dyn_cast<ConstantExpr>(clone.second->getOperand(i));
+							ConstantExpr* innerGEPclone2 = dyn_cast<ConstantExpr>(ce2->getOperand(0));
+							Value* GEPvalClone2 = cloneMap[GEPvalOrig].second;
+							assert(GEPvalClone2 && "valid second clone");
+							Constant* newGEPclone2 = innerGEPclone2->getWithOperandReplaced(
+									0, dyn_cast<Constant>(GEPvalClone2));
+							Constant* newCE2 = ConstantExpr::getCast(
+									ce2->getOpcode(), newGEPclone2, ce2->getType());
+							clone.second->setOperand(i, newCE2);
+						}
+					}
+					//otherwise, throw an error
+					else if (verboseFlag) {
+						errs() << warn_string << " In cloneInsns() skipping processing cloned ConstantExpr:\n";
 						errs() << " " << *ce << "\n";
 					}
 					continue;
 				}
 
+				if (!willBeCloned(ce->getOperand(0))) {
+					continue;
+				}
+
 				// error checking here for things missing in the cloneMap
 				//  if this is NULL, then that means we just inserted the operand
 				//	into the map, and therefore it wasn't in there before
@@ -921,20 +1144,19 @@ bool dataflowProtection::cloneInsns() {
 				// how did this get in the list, but not in the map?
 				Value* v_temp = cloneMap[ce->getOperand(0)].first;
 				if (v_temp == nullptr) {
-					errs() << err_string << " in CloneInsns!\n";
+					errs() << err_string << " in cloneInsns!\n";
 					errs() << *ce << "\n";
 				}
 				assert(v_temp && "ConstantExpr is in cloneMap");
 
 				Constant* newOp1 = dyn_cast<Constant>(v_temp);
-				// Constant* newOp1 = dyn_cast<Constant>(cloneMap[ce->getOperand(0)].first);
 				assert(newOp1 && "Null Constant newOp1");
 				Constant* c1 = ce->getWithOperandReplaced(0, newOp1);
 				ConstantExpr* eNew1 = dyn_cast<ConstantExpr>(c1);
 				assert(eNew1 && "Null ConstantExpr eNew1");
 				clone.first->setOperand(i, eNew1);
 
-				if(TMR){
+				if (TMR) {
 					Constant* newOp2 = dyn_cast<Constant>(cloneMap[ce->getOperand(0)].second);
 					assert(newOp2 && "Null Constant newOp2");
 					Constant* c2 = ce->getWithOperandReplaced(0, newOp2);
@@ -942,9 +1164,9 @@ bool dataflowProtection::cloneInsns() {
 					assert(eNew2 && "Null ConstantExpr eNew2");
 					clone.second->setOperand(i, eNew2);
 				}
-			} else{
+			} else {
 				clone.first->setOperand(i, op);
-				if(TMR){
+				if (TMR) {
 					assert(clone.second && "Clone exists to set operand");
 					clone.second->setOperand(i, op);
 				}
@@ -983,6 +1205,7 @@ void dataflowProtection::cloneConstantExpr() {
 			//assert(eNew->isGEPWithNoNotionalOverIndexing());
 			cloneMap[e] = ValuePair(e1,e2);
 		} else {
+//			TODO: what could cause this to fail?
 			assert(false && "Constant expr to clone not matching expected form");
 		}
 	}
@@ -993,11 +1216,11 @@ void dataflowProtection::cloneConstantExpr() {
 //----------------------------------------------------------------------------//
 void dataflowProtection::cloneGlobals(Module & M) {
 
-	if(noMemReplicationFlag)
+	if (noMemReplicationFlag)
 		return;
 
 	if (verboseFlag) {
-		for (auto g: globalsToClone) {
+		for (auto g : globalsToClone) {
 			errs() << "Cloning global: " << g->getName() << "\n";
 		}
 	}
@@ -1011,15 +1234,15 @@ void dataflowProtection::cloneGlobals(Module & M) {
 
 	for (auto g : globalsToClone) {
 		//Skip specified globals
-		if(std::find(ignoreGlbl.begin(), ignoreGlbl.end(), g->getName().str()) != ignoreGlbl.end()){
-			if(verboseFlag) errs() << "Not replicating " << g->getName() << "\n";
+		if (std::find(ignoreGlbl.begin(), ignoreGlbl.end(), g->getName().str()) != ignoreGlbl.end()) {
+			if (verboseFlag) errs() << "Not replicating " << g->getName() << "\n";
 			continue;
 		}
 
 		GlobalVariable* gNew = copyGlobal(M, g, "_DWC");
 
 		GlobalVariable* gNew2;
-		if(TMR){
+		if (TMR) {
 			gNew2 = copyGlobal(M, g, "_TMR");
 		}
 
@@ -1051,7 +1274,7 @@ GlobalVariable * dataflowProtection::copyGlobal(Module & M, GlobalVariable * g,
 	gNew->setUnnamedAddr(g->getUnnamedAddr());
 
 	if (verboseFlag)
-		errs() << "New dupicate global: " << gNew->getName() << "\n";
+		errs() << "New duplicate global: " << gNew->getName() << "\n";
 
 	return gNew;
 }
@@ -1070,6 +1293,8 @@ void dataflowProtection::addGlobalRuntimeInit(Module & M) {
 		arg_type_v.push_back(Type::getInt64Ty(M.getContext()));
 		ArrayRef<Type*> arg_type = ArrayRef<Type*>(arg_type_v);
 
+		// this is an Eclipse error because the definition comes from a built file with an include guard,
+		//  but this is a correct enum in Intrinsic.
 		Function * fun = Intrinsic::getDeclaration(&M, Intrinsic::memcpy, arg_type);
 		IRBuilder<> Builder(&(*(M.getFunction("main")->begin()->begin())));
 
@@ -1132,6 +1357,7 @@ void dataflowProtection::cloneMetadata(Module& M, Function* Fnew) {
 	 * 7: retainedNodes
 	 */
 
+#if 0
 	/* Print out all of the operands in the DISubprogram
  	for (int i = 0; i < N->getNumOperands(); i+=1) {
 		const MDOperand& mop = N->getOperand(i);
@@ -1145,6 +1371,7 @@ void dataflowProtection::cloneMetadata(Module& M, Function* Fnew) {
 			errs() << i << ": " << mop << '\n';
 		}
 	} */
+#endif
 
 	//have to make new types, based on signature of new function
 	DISubroutineType* dtype = autoSp->getType();
diff --git a/projects/dataflowProtection/dataflowProtection.cpp b/projects/dataflowProtection/dataflowProtection.cpp
index a26e6d557..0be5726a3 100644
--- a/projects/dataflowProtection/dataflowProtection.cpp
+++ b/projects/dataflowProtection/dataflowProtection.cpp
@@ -15,6 +15,7 @@ cl::opt<bool> noMemReplicationFlag ("noMemReplication", cl::desc("Do not duplica
 cl::opt<bool> noLoadSyncFlag ("noLoadSync", cl::desc("Do not synchronize on data loads"));
 cl::opt<bool> noStoreDataSyncFlag ("noStoreDataSync", cl::desc("Do not synchronize data on data stores"));
 cl::opt<bool> noStoreAddrSyncFlag ("noStoreAddrSync", cl::desc("Do not synchronize address on data stores"));
+cl::opt<bool> storeDataSyncFlag ("storeDataSync", cl::desc("Force synchronize data on data stores (not default)"));
 
 //Replication scope
 //note: any changes to list names must also be changed at the top of utils.cpp
@@ -32,6 +33,7 @@ cl::opt<bool> SegmentFlag ("s", cl::desc("Segment instructions, rather than inte
 cl::list<std::string> globalsToRuntimeInitCl("runtimeInitGlobals", cl::CommaSeparated, cl::ZeroOrMore);
 cl::opt<bool> dumpModuleFlag ("dumpModule", cl::desc("Print out the module immediately before pass concludes. Option is for pass debugging."));
 cl::opt<bool> verboseFlag ("verbose", cl::desc("Increase the amount of output"));
+cl::opt<bool> noMainFlag ("noMain", cl::desc("There is no 'main' function in this module"));
 
 //--------------------------------------------------------------------------//
 // Top level behavior
@@ -67,6 +69,9 @@ bool dataflowProtection::run(Module &M, int numClones) {
 	// First figure out which instructions are going to be cloned
 	populateValuesToClone(M);
 
+	// validate that the configuration parameters can be followed safely
+	verifyOptions(M);
+
 	// Now add new arguments to functions
 	// (In LLVM you can't change a function signature, so we have to make new functions)
 	// populateValuesToClone has to be called before this so we know which
@@ -74,6 +79,9 @@ bool dataflowProtection::run(Module &M, int numClones) {
 	cloneFunctionArguments(M);
 	removeOrigFunctions();
 
+	// deal with function wrappers
+	updateFnWrappers(M);
+
 	// Once again figure out which instructions are going to be cloned
 	// This need to be re-run after creating the new functions as the old
 	// pointers will be stale
@@ -86,6 +94,7 @@ bool dataflowProtection::run(Module &M, int numClones) {
 
 	// Change clones to depend on the duplications
 	updateCallInsns(M);
+	updateInvokeInsns(M);
 
 	//Insert error detection/handling
 	insertErrorFunction(M, numClones);
@@ -108,6 +117,8 @@ bool dataflowProtection::run(Module &M, int numClones) {
 	// This is executed if code is segmented instead of interleaved
 	moveClonesToEndIfSegmented(M);
 
+//	removeUnusedFunctions(M);
+
 	//Option executed when -dumpModule is passed in
 	dumpModule(M);
 
diff --git a/projects/dataflowProtection/dataflowProtection.h b/projects/dataflowProtection/dataflowProtection.h
index 6f9ee6bb0..8c2c083cb 100644
--- a/projects/dataflowProtection/dataflowProtection.h
+++ b/projects/dataflowProtection/dataflowProtection.h
@@ -11,8 +11,7 @@
 #include <llvm/IR/Constants.h>
 #include <llvm/IR/Instructions.h>
 
-//trying to fix the issue with instructions not segmenting correctly
-#define SYNC_POINT_FIX
+#define FIX_STORE_SEGMENTING
 
 using namespace llvm;
 
@@ -38,9 +37,11 @@ class dataflowProtection : public ModulePass {
   const std::string no_xMR_anno    = "no_xMR";
   const std::string xMR_anno       = "xMR";
   const std::string xMR_call_anno  = "xMR_call";
+  const std::string skip_call_anno = "coast_call_once";
   const std::string default_xMR    = "set_xMR_default";
   const std::string default_no_xMR = "set_no_xMR_default";
   const std::string default_global = "__xMR_DEFAULT_BEHAVIOR__";
+  const std::string coast_volatile = "coast_volatile";
 
   //----------------------------------------------------------------------------//
   // Constant strings for fancy printing
@@ -61,12 +62,18 @@ class dataflowProtection : public ModulePass {
   std::set<Instruction*> instsToSkip;
   std::set<GlobalVariable*> globalsToClone;
   std::set<GlobalVariable*> globalsToSkip;
+  std::set<GlobalVariable*> volatileGlobals;
+  std::set<Function*> usedFunctions; 	/* marked with __attribute__((used)) */
   std::set<GlobalVariable*> globalsToRuntimeInit;
   std::set<ConstantExpr*> constantExprToClone;
   std::set<Function*> fnsUsedIndirectly;
   std::set<Type*> indirectFnSignatures;
 
+  std::set<Instruction*> instsToCloneAnno;
+  std::set<Instruction*> wrapperInsts;
+
   std::vector<Instruction*> syncPoints;
+  std::vector<Instruction*> newSyncPoints;		//added while processing old ones
   std::map<Value*,ValuePair> cloneMap;
   std::map<Function*, BasicBlock*> errBlockMap;
   std::map<Function*, Function*> functionMap;
@@ -95,6 +102,7 @@ class dataflowProtection : public ModulePass {
   void populateFnWorklist(Module& M);
   void cloneFunctionArguments(Module& M);
   void updateCallInsns(Module& M);
+  void updateInvokeInsns(Module& M);
   // Clone instructions
   bool cloneInsns();
   // Clone constants
@@ -111,7 +119,7 @@ class dataflowProtection : public ModulePass {
   void populateSyncPoints(Module& M);
   // Insert synchronization logic
   void processSyncPoints(Module& M, int numClones);
-  void syncGEP(GetElementPtrInst* currGEP, GlobalVariable* TMRErrorDetected);
+  bool syncGEP(GetElementPtrInst* currGEP, GlobalVariable* TMRErrorDetected);
   void syncStoreInst(StoreInst* currStoreInst, GlobalVariable* TMRErrorDetected);
   void processCallSync(CallInst* currCallInst, GlobalVariable* TMRErrorDetected);
   void syncTerminator(TerminatorInst* currTerminator, GlobalVariable* TMRErrorDetected);
@@ -120,8 +128,9 @@ class dataflowProtection : public ModulePass {
   void insertErrorFunction(Module& M, int numClones);
   void createErrorBlocks(Module& M, int numClones);
   // TMR error detection
-  void insertTMRCorrectionCount(Instruction* cmpInst, GlobalVariable* TMRErrorDetected);
+  void insertTMRCorrectionCount(Instruction* cmpInst, GlobalVariable* TMRErrorDetected, bool updateSyncPoint = false);
   void insertTMRDetectionFlag(Instruction* cmpInst, GlobalVariable* TMRErrorDetected);
+  void insertVectorTMRCorrectionCount(Instruction* cmpInst, Instruction* cmpInst2, GlobalVariable* TMRErrorDetected);
 
   //----------------------------------------------------------------------------//
   // utils.cpp
@@ -130,6 +139,7 @@ class dataflowProtection : public ModulePass {
   void removeUnusedFunctions(Module& M);
   void processCommandLine(Module& M, int numClones);
   void processAnnotations(Module& M);
+  void verifyOptions(Module& M);
   // Cleanup
   void removeAnnotations(Module& M);
   void removeOrigFunctions();
@@ -146,14 +156,22 @@ class dataflowProtection : public ModulePass {
   int getArrayTypeElementBitWidth(Module& M, ArrayType * arrayType);
   void recursivelyVisitCalls(Module& M, Function* F, std::set<Function*> &functionList);
   bool isISR(Function& F);
+  void walkInstructionUses(Instruction* I, bool xMR);
   void cloneMetadata(Module& M, Function* Fnew);
   // Synchronization utilities
   bool isSyncPoint(Instruction* I);
+#ifdef FIX_STORE_SEGMENTING
+  bool isStoreMovePoint(StoreInst* SI);
+#endif
+  bool isCallMovePoint(CallInst* ci);
+  bool checkCoarseSync(StoreInst* inst);
   // Miscellaneous
   bool isIndirectFunctionCall(CallInst* CI, std::string errMsg, bool print=true);
+  std::string getRandomString(std::size_t len);
   int getFunctionsFromConfig();
   void getFunctionsFromCL();
   void dumpModule(Module& M);
+  void updateFnWrappers(Module& M);
 
 };
 
diff --git a/projects/dataflowProtection/functions.config b/projects/dataflowProtection/functions.config
index 952142e41..fa0974e1f 100644
--- a/projects/dataflowProtection/functions.config
+++ b/projects/dataflowProtection/functions.config
@@ -9,7 +9,7 @@
 # Ways to handle function calls
 # Call once, unmodified. Value will propogate through other replicated instructions
 # pass -skipLibCalls=[list] to skip library calls
-skipLibCalls = rand, srand, printf, abort, exit, atexit, assert, clock, printf, fprintf, sprintf, scanf, getchar, getc, ungetc, fopen, fclose, fgetpos, fflush, fread, ftell, fwrite, memchr, memcmp, setlocale, localeconv, fgets, fgetc, rewind, rename, puts, _ZNSolsEPFRSoS_E, _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
+skipLibCalls = rand, srand, printf, abort, exit, atexit, assert, clock, printf, fprintf, sprintf, scanf, getchar, getc, ungetc, fopen, fclose, fgetpos, fflush, fread, ftell, fwrite, memchr, memcmp, setlocale, localeconv, fgets, fgetc, rewind, rename, puts, _ZNSolsEPFRSoS_E, _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc, _ZStlsIdcSt11char_traitsIcEERSt13basic_ostreamIT0_T1_ES6_RKSt7complexIT_E
 # clocal.h? (returns pointer)
 
 # pass -ignoreFns=[list] to skip user calls
diff --git a/projects/dataflowProtection/synchronization.cpp b/projects/dataflowProtection/synchronization.cpp
index 2375c7692..78cf97ab5 100644
--- a/projects/dataflowProtection/synchronization.cpp
+++ b/projects/dataflowProtection/synchronization.cpp
@@ -1,4 +1,4 @@
-//This file holds all of the logic relevant to synchronization points - error functions or voting
+// This file holds all of the logic relevant to synchronization points, error functions, and voting
 
 #include "dataflowProtection.h"
 
@@ -9,31 +9,42 @@
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/IR/Dominators.h>
 #include <llvm/Analysis/LoopInfo.h>
+#include <llvm/IR/IRBuilder.h>
 
-//Command line option
+// Command line options
 extern cl::opt<bool> OriginalReportErrorsFlag;
 extern cl::opt<bool> ReportErrorsFlag;
 extern cl::opt<bool> noLoadSyncFlag;
 extern cl::opt<bool> noStoreDataSyncFlag;
 extern cl::opt<bool> noStoreAddrSyncFlag;
 extern cl::opt<bool> noMemReplicationFlag;
+extern cl::opt<bool> storeDataSyncFlag;
 extern cl::opt<bool> verboseFlag;
+extern cl::opt<bool> noMainFlag;
 
 using namespace llvm;
 
+// commonly used strings
+std::string fault_function_name = "FAULT_DETECTED_DWC";
+std::string tmr_vote_inst_name = "vote";
+std::string tmr_global_count_name = "TMR_ERROR_CNT";
+
 //----------------------------------------------------------------------------//
 // Obtain synchronization points
 //----------------------------------------------------------------------------//
 void dataflowProtection::populateSyncPoints(Module& M) {
-	for(auto F : fnsToClone) {
-		if(F->getName().startswith("FAULT_DETECTED")) //Don't sync in err handler
+	for (auto F : fnsToClone) {
+		if (F->getName().startswith("FAULT_DETECTED")) //Don't sync in err handler
 			continue;
 
 		for (auto & bb : *F) {
 			for (auto & I : bb) {
 
 				//Sync before branches
-				if (I.isTerminator()){
+				if (I.isTerminator()) {
+					//skip syncing on unreachable instructions
+					if (UnreachableInst* unreach = dyn_cast<UnreachableInst>(&I))
+						continue;
 					syncPoints.push_back(&I);
 				}
 
@@ -48,29 +59,52 @@ void dataflowProtection::populateSyncPoints(Module& M) {
 						continue;
 					}
 
-					if (CI->getCalledFunction()->hasExternalLinkage()
-							&& CI->getCalledFunction()->isDeclaration()) {
+					Function* calledF = CI->getCalledFunction();
+
+					//skip debug function calls
+					if (calledF->hasName()) {
+						if (calledF->getName().startswith_lower("llvm.dbg.") ||
+								calledF->getName().startswith_lower("llvm.lifetime."))
+						{
+							continue;
+						}
+					}
+
+					// skip functions that are marked as "wrapper" functions
+					//  see updateFnWrappers()
+					if (wrapperInsts.find(CI) != wrapperInsts.end()) {
+						continue;
+					}
+
+					if (calledF->hasExternalLinkage() && calledF->isDeclaration()) {
 						syncPoints.push_back(&I);
+//						errs() << "Adding " << CI->getCalledFunction()->getName() << " to syncpoints\n";
 					}
 				}
 
 				//Sync data on all stores unless explicitly instructed not to
 				if (StoreInst* SI = dyn_cast<StoreInst>(&I)) {
 					//Don't sync pointers, they will be different
-					if(SI->getOperand(0)->getType()->isPointerTy()){
+					if (SI->getOperand(0)->getType()->isPointerTy()) {
 						continue;
-					} else if(dyn_cast<PtrToIntInst>(SI->getOperand(0))){
+					} else if(dyn_cast<PtrToIntInst>(SI->getOperand(0))) {
 						//Likewise, don't check casted pointers
 						continue;
 					}
-#ifdef SYNC_POINT_FIX
 					// if this is not a cloned instruction
 					else if ( ( (getClone(&I).first == &I) || (getClone(&I).second == &I) ) &&
 							   !noMemReplicationFlag ) {
 						continue;
 					}
-#endif
-					syncPoints.push_back(&I);
+					//by default, we don't sync on stores, unless specifically told to
+					//have to sync on stores, data and addr, if no mem replication
+					else if (!noMemReplicationFlag && !storeDataSyncFlag) {
+						continue;
+					}
+					//otherwise, go ahead and add it to the list of sync-points
+					else {
+						syncPoints.push_back(&I);
+					}
 				}
 
 				//Sync offsets of GEPs
@@ -92,23 +126,39 @@ void dataflowProtection::processSyncPoints(Module & M, int numClones) {
 	if (syncPoints.size() == 0)
 		return;
 
-	GlobalVariable* TMRErrorDetected = M.getGlobalVariable("TMR_ERROR_CNT");
+	GlobalVariable* TMRErrorDetected = M.getGlobalVariable(tmr_global_count_name);
 
 	//Look for the variable first. If it doesn't exist, make one
 	//If it is unneeded, it is erased at the end of this function
-	if(!TMRErrorDetected){
-		if(TMR && ReportErrorsFlag && verboseFlag) errs() << "Could not find TMR_ERROR_CNT flag! Creating one...\n";
-		TMRErrorDetected = cast<GlobalVariable>(M.getOrInsertGlobal("TMR_ERROR_CNT",
+	if (!TMRErrorDetected) {
+		if(TMR && ReportErrorsFlag && verboseFlag)
+			errs() << "Could not find " << tmr_global_count_name << " flag! Creating one...\n";
+
+		TMRErrorDetected = cast<GlobalVariable>(M.getOrInsertGlobal(tmr_global_count_name,
 														IntegerType::getInt32Ty(M.getContext())));
-		TMRErrorDetected->setConstant(false);
-		TMRErrorDetected->setInitializer(ConstantInt::getNullValue(IntegerType::getInt32Ty(M.getContext())));
-		TMRErrorDetected->setUnnamedAddr( GlobalValue::UnnamedAddr() );
-		TMRErrorDetected->setAlignment(4);
+		// if there is no main in this module, keep this global as extern
+		if (noMainFlag) {
+			TMRErrorDetected->setExternallyInitialized(true);
+			TMRErrorDetected->setLinkage(GlobalValue::LinkageTypes::ExternalLinkage);
+		} else {
+			// otherwise, will be initialized to 0
+			TMRErrorDetected->setConstant(false);
+			TMRErrorDetected->setInitializer(ConstantInt::getNullValue(IntegerType::getInt32Ty(M.getContext())));
+			TMRErrorDetected->setUnnamedAddr( GlobalValue::UnnamedAddr() );
+			TMRErrorDetected->setAlignment(4);
+		}
 		globalsToSkip.insert(TMRErrorDetected);
 	}
 	assert(TMRErrorDetected != nullptr);
 
+	// Some of the syncpoints may be invalidated during this process, but we can't remove them
+	//  from this list we're iterating over.  Make a list to delete them later.
+	std::vector<Instruction*> deleteItLater;
+
 	for (auto I : syncPoints) {
+
+		assert(I && "How did a null pointer get into syncpoints?");
+
 		if (StoreInst* currStoreInst = dyn_cast<StoreInst>(I)) {
 			if(!noStoreDataSyncFlag){
 				syncStoreInst(currStoreInst, TMRErrorDetected);
@@ -121,80 +171,97 @@ void dataflowProtection::processSyncPoints(Module & M, int numClones) {
 
 		} else if(GetElementPtrInst* currGEP = dyn_cast<GetElementPtrInst>(I)) {
 
-			if(noMemReplicationFlag){
-				//Goal: Don't duplicate memory -> don't sync GEPs, should only be one
-				//Double check that the GEP isn't used in an unexpected way
-				if(currGEP->getNumUses()!=1){ //Single load, easy
-					if(currGEP->getNumUses() != numClones){ //If they match we are ok
-						for(auto u : currGEP->users()){
-							if(!dyn_cast<StoreInst>(u) && !dyn_cast<LoadInst>(u) && !dyn_cast<CallInst>(u) && !dyn_cast<PHINode>(u)){
-								assert(false && "GEP unknown uses");
-							}
-						}
-					}
-				}
+			// default is DON'T sync on addresses, can only do that when there is no second
+			//  copy in memory
+			if (!noMemReplicationFlag) {
 				continue;
 			}
 
-			if(noLoadSyncFlag){
+			if (noLoadSyncFlag) {
 				//Don't sync address of loads
-				if( dyn_cast<LoadInst>(currGEP->user_back()) ){
+				if ( dyn_cast<LoadInst>(currGEP->user_back()) ) {
 					continue;
-				} else if(GetElementPtrInst* nextGEP = dyn_cast<GetElementPtrInst>(currGEP->user_back())){
+				} else if (GetElementPtrInst* nextGEP = dyn_cast<GetElementPtrInst>(currGEP->user_back())) {
 					//Don't want to sync GEPs that feed GEPs of load inst
-					if(nextGEP->getNumUses() == 1){
-						if( dyn_cast<LoadInst>(nextGEP->user_back()) ){
+					if (nextGEP->getNumUses() == 1) {
+						if ( dyn_cast<LoadInst>(nextGEP->user_back()) ) {
 							continue;
 						}
 					}
 				}
 			}
 
-			if(noStoreAddrSyncFlag){
+			if (noStoreAddrSyncFlag) {
 				//Don't address of stores
-				if( dyn_cast<StoreInst>(currGEP->user_back()) ){
+				if ( dyn_cast<StoreInst>(currGEP->user_back()) ) {
 					continue;
-				} else if(GetElementPtrInst* nextGEP = dyn_cast<GetElementPtrInst>(currGEP->user_back())){
-					//Don't want to sync GEPs that feed GEPs of load inst
-					if(nextGEP->getNumUses() == 1){
-						if( dyn_cast<StoreInst>(nextGEP->user_back()) ){
+				} else if (GetElementPtrInst* nextGEP = dyn_cast<GetElementPtrInst>(currGEP->user_back())) {
+					//Don't want to sync GEPs that feed GEPs of store inst
+					if (nextGEP->getNumUses() == 1) {
+						if ( dyn_cast<StoreInst>(nextGEP->user_back()) ) {
 							continue;
 						}
 					}
 				}
 			}
 
-			syncGEP(currGEP, TMRErrorDetected);
-		} else{
+			//else there is noMemReplication
+			if (syncGEP(currGEP, TMRErrorDetected)) {
+				deleteItLater.push_back(I);
+			}
+		} else {
+			// more detailed information about the failure
+			if (BasicBlock* wrongBB = dyn_cast<BasicBlock>(I)) {
+				errs() << "Something is wrong here...\n";
+				errs() << wrongBB << ": " << *wrongBB << "\n";
+				errs() << "\tin " << wrongBB->getParent()->getName() << "\n";
+				errs() << "\tPrev node = " << wrongBB->getPrevNode()->getName() << "\n";
+				errs() << *wrongBB->getParent() << '\n';
+			} else {
+				errs() << I << "\n";
+				errs() << *I << "\n";
+				errs() << "\tin " << I->getParent()->getName() << "\n";
+			}
 			assert(false && "Synchronizing at an unrecognized instruction type");
 		}
+
 	}
 
-	if(!TMR)
+	// delete the now-invalid pointers
+	for (auto it : deleteItLater) {
+		syncPoints.erase(std::find(syncPoints.begin(), syncPoints.end(), it));
+	}
+
+	// we found some new ones while doing stuff above
+	// these will be used for moving sync instructions around
+	for (auto ns : newSyncPoints) {
+		syncPoints.push_back(ns);
+	}
+
+	if(!TMR && TMRErrorDetected->getNumUses() < 1)
 		TMRErrorDetected->eraseFromParent();
 }
 
-void dataflowProtection::syncGEP(GetElementPtrInst* currGEP, GlobalVariable* TMRErrorDetected) {
+/*
+ * Returns true if it invalidates the pointer to currGEP.  The calling function is responsible
+ *  for handling this.
+ */
+bool dataflowProtection::syncGEP(GetElementPtrInst* currGEP, GlobalVariable* TMRErrorDetected) {
 	//2 forms of GEP with different number of arguments
 	//Offset is the last argument
 	std::vector<Instruction*> syncInsts;
 	Value* orig = currGEP->getOperand(currGEP->getNumOperands()-1);
 
-	if(!isCloned(currGEP)){
-		//Spell the type out so eclipse doesn't think there is an error
-		std::vector<Instruction*>::const_iterator loc = std::find(syncPoints.begin(),syncPoints.end(),currGEP);
-			if(loc != syncPoints.end()){
-//				errs() << "Erasing " << *currGEP << " from syncPoints\n";
-				syncPoints.erase(loc);
-			} else{
-//				errs() << "COULDNT FIND " << *currGEP << "\n";
-			}
-		return;
+	if (!isCloned(currGEP)) {
+	/* Don't remove items from a vector we're currently iterating over.
+	 * Calling function is responsible for this.
+	 */
+		return true;
 	}
 
-	if (!isCloned(orig)){
+	if (!isCloned(orig)) {
 		startOfSyncLogic[currGEP] = currGEP;
-		return;
+		return false;
 	}
 
 	Value* clone1 = getClone(orig).first;
@@ -215,10 +282,10 @@ void dataflowProtection::syncGEP(GetElementPtrInst* currGEP, GlobalVariable* TMR
 
 	startOfSyncLogic[currGEP] = cmp;
 
-	if(TMR){
+	if (TMR) {
 		Value* clone2 = getClone(orig).second;
 		assert(clone2 && "Clone exists when syncing at store");
-		SelectInst* sel = SelectInst::Create(cmp,orig,clone2,"sel",currGEP);
+		SelectInst* sel = SelectInst::Create(cmp,orig,clone2,tmr_vote_inst_name,currGEP);
 
 		syncInsts.push_back(cmp);
 		syncInsts.push_back(sel);
@@ -235,12 +302,15 @@ void dataflowProtection::syncGEP(GetElementPtrInst* currGEP, GlobalVariable* TMR
 //			assert(numUses == 2 &&  "Instruction only used in GEP synchronization");
 //		}
 
-		insertTMRCorrectionCount(cmp,TMRErrorDetected);
-	} else{
+		insertTMRCorrectionCount(cmp, TMRErrorDetected);
+	} else {		//DWC
 		Function* currFn = currGEP->getParent()->getParent();
 		splitBlocks(cmp, errBlockMap[currFn]);
+		//fix invalidated pointer - see note in processCallSync()
+		startOfSyncLogic[currGEP] = currGEP;
 	}
 
+	return false;
 }
 
 void dataflowProtection::syncStoreInst(StoreInst* currStoreInst, GlobalVariable* TMRErrorDetected) {
@@ -253,12 +323,12 @@ void dataflowProtection::syncStoreInst(StoreInst* currStoreInst, GlobalVariable*
 
 	// No need to sync if value is not cloned
 	//Additionally, makes sure we don't sync on copies, unless we are forced to sync here
-	if (!isCloned(orig) && !noMemReplicationFlag){
+	if (!isCloned(orig) && !noMemReplicationFlag) {
 		return;
 	}
-	else if(noMemReplicationFlag){
+	else if (noMemReplicationFlag) {
 		//Make sure we don't sync on single return points when memory isn't duplicated
-		if(!dyn_cast<StoreInst>(orig) && !isCloned(orig)){
+		if (!dyn_cast<StoreInst>(orig) && !isCloned(orig)) {
 			return;
 		}
 	}
@@ -267,7 +337,7 @@ void dataflowProtection::syncStoreInst(StoreInst* currStoreInst, GlobalVariable*
 	assert(clone1 && "Cloned value exists");
 
 	//Disabling synchronization on constant store
-	if(dyn_cast<ConstantInt>(orig)){
+	if (dyn_cast<ConstantInt>(orig)) {
 		return;
 	}
 
@@ -288,10 +358,10 @@ void dataflowProtection::syncStoreInst(StoreInst* currStoreInst, GlobalVariable*
 	syncInsts.push_back(cmp);
 	startOfSyncLogic[currStoreInst] = cmp;
 
-	if(TMR){
+	if (TMR) {
 		Value* clone2 = getClone(orig).second;
 		assert(clone2 && "Clone exists when syncing at store");
-		SelectInst* sel = SelectInst::Create(cmp,orig,clone2,"sel",currStoreInst);
+		SelectInst* sel = SelectInst::Create(cmp,orig,clone2,tmr_vote_inst_name,currStoreInst);
 		syncInsts.push_back(sel);
 
 		assert(getClone(currStoreInst).first && "Store instruction has a clone");
@@ -301,18 +371,18 @@ void dataflowProtection::syncStoreInst(StoreInst* currStoreInst, GlobalVariable*
 		dyn_cast<StoreInst>(getClone(currStoreInst).second)->setOperand(0,sel);
 
 		//Make sure that the voted value is propagated downstream
-		if(orig->getNumUses() != 2){
-			if(Instruction* origInst = dyn_cast<Instruction>(orig)){
+		if (orig->getNumUses() != 2) {
+			if (Instruction* origInst = dyn_cast<Instruction>(orig)) {
 				DominatorTree DT = DominatorTree(*origInst->getParent()->getParent());
-				for(auto u : origInst->users()){
+				for (auto u : origInst->users()) {
 					//Find any and all instructions that were not updated
-					if(std::find(syncInsts.begin(),syncInsts.end(),u) == syncInsts.end()){
+					if (std::find(syncInsts.begin(),syncInsts.end(),u) == syncInsts.end()) {
 						//Get all operands that should be updated
-						for(unsigned int opNum=0; opNum < u->getNumOperands(); opNum++){
+						for (unsigned int opNum=0; opNum < u->getNumOperands(); opNum++) {
 							//Update if and only if the instruction is dominated by sel
-							if(u->getOperand(opNum) == orig && DT.dominates(sel,dyn_cast<Instruction>(u))){
+							if (u->getOperand(opNum) == orig && DT.dominates(sel,dyn_cast<Instruction>(u))) {
 								u->setOperand(opNum,sel);
-								if(isCloned(u)){
+								if (isCloned(u)) {
 									dyn_cast<Instruction>(getClone(u).first)->setOperand(opNum,sel);
 									dyn_cast<Instruction>(getClone(u).second)->setOperand(opNum,sel);
 								}
@@ -323,8 +393,8 @@ void dataflowProtection::syncStoreInst(StoreInst* currStoreInst, GlobalVariable*
 			}
 		}
 
-		insertTMRCorrectionCount(cmp,TMRErrorDetected);
-	} else{
+		insertTMRCorrectionCount(cmp, TMRErrorDetected);
+	} else {		//DWC
 		Function* currFn = currStoreInst->getParent()->getParent();
 		splitBlocks(cmp, errBlockMap[currFn]);
 		//fix invalidated pointer - see note in processCallSync()
@@ -337,6 +407,18 @@ void dataflowProtection::processCallSync(CallInst* currCallInst, GlobalVariable*
 	//Don't compare pointer values either
 	std::vector<Instruction*> syncInsts;
 
+	/* We need to check if any of the parameters in the call instruction are actually arguments
+	 * passed into the function that this CallInst is in.  We need to make a list of the arguments
+	 * to compare easier later.
+	 */
+	Function* enclosingFunction = currCallInst->getParent()->getParent();
+	assert(enclosingFunction && "function exists");
+	std::list<Value*> argVals;
+	for (auto arg = enclosingFunction->arg_begin(); arg != enclosingFunction->arg_end(); arg++) {
+//		errs() << "  " << *arg << " (@ " << arg << ")\n";
+		argVals.push_back(dyn_cast<Value>(arg));
+	}
+
 	std::deque<Value*> cloneableOperandsList;
 	for (unsigned int it = 0; it < currCallInst->getNumArgOperands(); it++) {
 		if (isa<Constant>(currCallInst->getArgOperand(it))
@@ -346,12 +428,12 @@ void dataflowProtection::processCallSync(CallInst* currCallInst, GlobalVariable*
 			continue;
 		cloneableOperandsList.push_back(currCallInst->getArgOperand(it));
 	}
-	if (cloneableOperandsList.size() == 0){
+	if (cloneableOperandsList.size() == 0) {
 		startOfSyncLogic[currCallInst] = currCallInst;
 		return;
 	}
 
-	//We now have a list of (an unknown number) operands, insert comparisons for all of them
+	//We now have a list of (an unknown number of) operands, insert comparisons for all of them
 	std::deque<Value*> cmpInstList;
 	std::vector<Instruction*> syncHelperList;
 	BasicBlock* currBB = currCallInst->getParent();
@@ -387,39 +469,41 @@ void dataflowProtection::processCallSync(CallInst* currCallInst, GlobalVariable*
 
 		syncInsts.push_back(cmp);
 
-		if(TMR){
-			SelectInst* sel = SelectInst::Create(cmp,orig,clones.second,"sel",currCallInst);
+		if (TMR) {
+			SelectInst* sel = SelectInst::Create(cmp, orig, clones.second, tmr_vote_inst_name, currCallInst);
 			syncInsts.push_back(sel);
 
 			currCallInst->replaceUsesOfWith(orig,sel);
-			dyn_cast<CallInst>(getClone(currCallInst).first)->replaceUsesOfWith(clones.first,sel);
-			dyn_cast<CallInst>(getClone(currCallInst).second)->replaceUsesOfWith(clones.second,sel);
+			dyn_cast<CallInst>(getClone(currCallInst).first)->replaceUsesOfWith(clones.first, sel);
+			dyn_cast<CallInst>(getClone(currCallInst).second)->replaceUsesOfWith(clones.second, sel);
 
 			//If something fails this assertion, it means that it is used after the call synchronization
 			//Might have to change it later in case we find a case where this is ok
 			//But extensive tests haven't found a case where this is necessary
+			// update: The condition does NOT hold if the operand is one that is passed in by an argument,
+			// and it hasn't been alloca'd; then every reference is to the original argument.
 			int useCount = orig->getNumUses();
-			if(useCount != 2){
-				if(Instruction* origInst = dyn_cast<Instruction>(orig)){
+			if (useCount != 2) {
+				if (Instruction* origInst = dyn_cast<Instruction>(orig)) {
 					DominatorTree DT = DominatorTree(*origInst->getParent()->getParent());
 					std::vector<Instruction*> uses;
-					for(auto uu : orig->users()){
+					for (auto uu : orig->users()) {
 						uses.push_back(dyn_cast<Instruction>(uu));
 					}
-					for(auto u : uses){
+					for (auto u : uses) {
 						//Find any and all instructions that were not updated
-						if(std::find(syncInsts.begin(),syncInsts.end(),u) == syncInsts.end()){
-							if(!DT.dominates(sel,dyn_cast<Instruction>(u))){
+						if (std::find(syncInsts.begin(),syncInsts.end(),u) == syncInsts.end()) {
+							if (!DT.dominates(sel, dyn_cast<Instruction>(u))) {
 								useCount--;
 							//Get all operands that should be updated
-							} else{
-								for(unsigned int opNum=0; opNum < u->getNumOperands(); opNum++){
+							} else {
+								for (unsigned int opNum=0; opNum < u->getNumOperands(); opNum++) {
 									//Update if and only if the instruction is dominated by sel
-									if(u->getOperand(opNum) == orig && DT.dominates(sel,dyn_cast<Instruction>(u))){
-										u->setOperand(opNum,sel);
-										if(isCloned(u)){
-											dyn_cast<Instruction>(getClone(u).first)->setOperand(opNum,sel);
-											dyn_cast<Instruction>(getClone(u).second)->setOperand(opNum,sel);
+									if (u->getOperand(opNum) == orig && DT.dominates(sel, dyn_cast<Instruction>(u))) {
+										u->setOperand(opNum, sel);
+										if (isCloned(u)) {
+											dyn_cast<Instruction>(getClone(u).first)->setOperand(opNum, sel);
+											dyn_cast<Instruction>(getClone(u).second)->setOperand(opNum, sel);
 										}
 										useCount--;
 									}
@@ -429,16 +513,25 @@ void dataflowProtection::processCallSync(CallInst* currCallInst, GlobalVariable*
 					}
 				}
 			}
-			assert(useCount==2 && "Instruction only used in call sync");
-			insertTMRCorrectionCount(cmp,TMRErrorDetected);
-		} else{
+
+			// if it's not an argument, then we can assert that there can only be 2 uses
+			if (std::find(argVals.begin(), argVals.end(), orig) == argVals.end()) {
+				if (useCount != 2) {
+					errs() << *currCallInst << "\n";
+					errs() << *orig << "\n";
+				}
+				assert(useCount==2 && "Instruction only used in call sync");
+				// TODO: examine what could cause this to fail
+			}
+			insertTMRCorrectionCount(cmp, TMRErrorDetected);
+		} else {		//DWC
 			cmpInstList.push_back(cmp);
 			syncHelperMap[currBB].push_back(cmp);
 		}
 	}
 
-	if(!TMR){
-		if(cmpInstList.size() == 0){
+	if (!TMR) {
+		if (cmpInstList.size() == 0) {
 			return;
 		}
 
@@ -461,6 +554,7 @@ void dataflowProtection::processCallSync(CallInst* currCallInst, GlobalVariable*
 		syncHelperMap[currBB].pop_back();
 		splitBlocks(reducedCompare,	errBlockMap[currCallInst->getParent()->getParent()]);
 		/*
+		 * NOTE:
 		 * splitting the blocks invalidates the previously set value in the map
 		 * startOfSyncLogic, set to be the instruction that compares the operands of the
 		 * function called by the currCallInst. That instruction is deleted from its
@@ -477,7 +571,7 @@ void dataflowProtection::syncTerminator(TerminatorInst* currTerminator, GlobalVa
 
 	//Only sync if there are arguments to duplicate
 	if (isa<BranchInst>(currTerminator)) {
-		if (currTerminator->getNumSuccessors() < 2){ //1 successor, or none - unconditional
+		if (currTerminator->getNumSuccessors() < 2){ //1 successor, or none = unconditional
 			startOfSyncLogic[currTerminator] = currTerminator;
 			return;
 		}
@@ -501,16 +595,17 @@ void dataflowProtection::syncTerminator(TerminatorInst* currTerminator, GlobalVa
 		return;
 	}
 
-	if(TMR){
+	if (TMR) {
 		std::vector<Instruction*> syncInsts;
 		Value* op = currTerminator->getOperand(0);
 
 		if (!isCloned(op))
-				return;
+			return;
 
 		Instruction* clone1 = dyn_cast<Instruction>(getClone(op).first);
 		Instruction* clone2 = dyn_cast<Instruction>(getClone(op).second);
 		assert(clone1 && clone2 && "Instruction has clones");
+		// TODO: examine what could cause this assertion to fail
 
 		//Make sure we're inserting the right type of comparison
 		Instruction::OtherOps cmp_op;
@@ -518,7 +613,7 @@ void dataflowProtection::syncTerminator(TerminatorInst* currTerminator, GlobalVa
 		Type* opType = op->getType();
 
 		//if it's a pointer type, is it ever safe to compare return values?
-		// could have been allocated with malloc
+		// could have been allocated with malloc()
 		// you would have to dereference the pointer to compare the insides of it
 		if (opType->isPointerTy()) {
 			if (verboseFlag) {
@@ -656,21 +751,25 @@ void dataflowProtection::syncTerminator(TerminatorInst* currTerminator, GlobalVa
 		}
 		assert(cmp_op && "return type not supported!");
 
-		Instruction* cmp = CmpInst::Create(cmp_op, cmp_eq, op, clone1,
-			"cmp", currTerminator);
+		Instruction* cmp = CmpInst::Create(cmp_op, cmp_eq, op, clone1, "vcmp", currTerminator);
 
 		startOfSyncLogic[currTerminator] = cmp;
 
-		SelectInst* sel = SelectInst::Create(cmp,op,clone2,"sel",currTerminator);
+		SelectInst* sel = SelectInst::Create(cmp, op, clone2, tmr_vote_inst_name, currTerminator);
 
-		currTerminator->replaceUsesOfWith(op,sel);
+		currTerminator->replaceUsesOfWith(op, sel);
 
 		syncInsts.push_back(cmp);
 		syncInsts.push_back(sel);
 
 		// Too many cases to account for each possibility, this is removed
 		// assert(numUses == 2 && "Instruction only used in terminator synchronization");
-		insertTMRCorrectionCount(cmp,TMRErrorDetected);
+
+		// This function invalidates the line that assigns "cmp" as the map value for currTerminator,
+		//  because the same terminator instruction will no longer exist, if we are inserting
+		//  TMR error count instructions.
+		insertTMRCorrectionCount(cmp, TMRErrorDetected, true);
+
 	} else {		//DWC
 		if (!isCloned(currTerminator->getOperand(0))) {
 			return;
@@ -755,7 +854,6 @@ void dataflowProtection::syncTerminator(TerminatorInst* currTerminator, GlobalVa
 					firstTime = 0;
 					syncPointLater = extract0;
 				}
-//				eCmp[i] = CmpInst::Create(cmp_op, cmp_eq, extract0, extract1, cmpName);
 				eCmp.push_back(CmpInst::Create(cmp_op, cmp_eq, extract0, extract1, cmpName));
 
 				//debug
@@ -835,7 +933,6 @@ void dataflowProtection::syncTerminator(TerminatorInst* currTerminator, GlobalVa
 	}
 }
 
-//#define DEBUG_SIMD_SYNCING
 void dataflowProtection::splitBlocks(Instruction* I, BasicBlock* errBlock) {
 	//Split at I, return a pointer to the new error block
 
@@ -855,12 +952,10 @@ void dataflowProtection::splitBlocks(Instruction* I, BasicBlock* errBlock) {
 	//Delete originalBlock's terminator
 	originalBlock->getTerminator()->eraseFromParent();
 	//create conditional branch
-	//there are some times it will try to branch on a vector value. This is not supported
+	//there are some times it will try to branch on a vector value.
 	//Instead need to insert additional compare logic. Only necessary with DWC.
-	if(!newCmpInst->getType()->isIntegerTy(1) && !TMR){
+	if (!newCmpInst->getType()->isIntegerTy(1) && !TMR) {
 		//it is possible that the value being compared is a vector type instead of a basic type
-//		errs() << "Not a boolean branch!\n" << *newCmpInst << "\n";
-//		assert(newCmpInst->getType()->isIntegerTy(1) && "Incorrect branching!\n");
 
 		//need to sign extend the boolean vector
 		int numElements = newCmpInst->getType()->getVectorNumElements();
@@ -881,11 +976,6 @@ void dataflowProtection::splitBlocks(Instruction* I, BasicBlock* errBlock) {
 		vecToScalar->setName("b_cast");
 		vecToScalar->insertAfter(signExt);
 
-#ifdef DEBUG_SIMD_SYNCING
-		errs() << "SExt: " << *signExt << "\n";
-		errs() << "Bcast: " << *vecToScalar << "\n";
-#endif
-
 		//create one more compare instruction
 		CmpInst* nextCmpInst = CmpInst::Create(Instruction::OtherOps::ICmp,CmpInst::ICMP_EQ,vecToScalar,newIntVec);
 		nextCmpInst->setName("simdSync");
@@ -897,7 +987,8 @@ void dataflowProtection::splitBlocks(Instruction* I, BasicBlock* errBlock) {
 		startOfSyncLogic[newTerm] = newCmpInst;
 		//this map will help with moving things later if the code is segmented
 		simdMap[newCmpInst] = std::make_tuple(signExt, vecToScalar, nextCmpInst);
-	}else{
+
+	} else {
 		BranchInst* newTerm;
 		newTerm = BranchInst::Create(newBlock, errBlock, newCmpInst, originalBlock);
 		startOfSyncLogic[newTerm] = newCmpInst;
@@ -912,17 +1003,25 @@ void dataflowProtection::splitBlocks(Instruction* I, BasicBlock* errBlock) {
 void dataflowProtection::insertErrorFunction(Module &M, int numClones) {
 	Type* t_void = Type::getVoidTy(M.getContext());
 
+	//have to update fault detection block name so it's unique to this module
+	// that way the output code can be included in a library file
+//	TODO: check if it exists. If it does, use it, otherwise, make random
+//	std::string random_suffix = getRandomString(12);
+//	fault_function_name += random_suffix;
+
 	Constant* c;
 	if(numClones==2)
-		c = M.getOrInsertFunction("FAULT_DETECTED_DWC", t_void, NULL);
+		c = M.getOrInsertFunction(fault_function_name, t_void, NULL);
 	else
 		return;
 
 	Function* errFn = dyn_cast<Function>(c);
 	assert(errFn && "Fault detection function is non-void");
+	errFn->addFnAttr(Attribute::get(M.getContext(), "noinline"));
 
 	//If the user has declared their own error handler, use that
 	if( errFn->getBasicBlockList().size() != 0){
+		if (verboseFlag) errs() << info_string << " Found existing DWC error handler function\n";
 		return;
 	}
 
@@ -932,8 +1031,7 @@ void dataflowProtection::insertErrorFunction(Module &M, int numClones) {
 	assert(abortF && "Abort function detected");
 
 	//Create a basic block that calls abort
-	BasicBlock* bb = BasicBlock::Create(M.getContext(), Twine("entry"), errFn,
-	NULL);
+	BasicBlock* bb = BasicBlock::Create(M.getContext(), Twine("entry"), errFn, NULL);
 	CallInst* new_abort = CallInst::Create(abortF, "", bb);
 	UnreachableInst* term = new UnreachableInst(M.getContext(), bb);
 }
@@ -944,11 +1042,12 @@ void dataflowProtection::createErrorBlocks(Module &M, int numClones) {
 	//Create an error handler block for each function - they can't share one
 	Constant* c;
 	if(numClones == 2)
-		c = M.getOrInsertFunction("FAULT_DETECTED_DWC",t_void, NULL);
+		c = M.getOrInsertFunction(fault_function_name, t_void, NULL);
 	else
 		return;
 
 	Function* errFn = dyn_cast<Function>(c);
+	assert(errFn && "error function exists");
 
 	for (auto & F : M) {
 		if (F.getBasicBlockList().size() == 0)
@@ -968,6 +1067,17 @@ void dataflowProtection::createErrorBlocks(Module &M, int numClones) {
 		UnreachableInst* term = new UnreachableInst(errBlock->getContext(),
 				errBlock);
 
+		// have to give the call some debug info, or compilation issues
+		Instruction* lastInst = originalBlock->getTerminator();
+//		while (lastInst->is)
+		if (lastInst->getDebugLoc()) {
+			dwcFailCall->setDebugLoc(lastInst->getDebugLoc());
+//			errs() << *dwcFailCall << "\n";
+		} else {
+//			errs() << *lastInst << "\n";
+			;
+		}
+
 		errBlockMap[&F] = errBlock;
 	}
 
@@ -977,7 +1087,7 @@ void dataflowProtection::createErrorBlocks(Module &M, int numClones) {
 // TMR error detection
 //----------------------------------------------------------------------------//
 void dataflowProtection::insertTMRDetectionFlag(Instruction* cmpInst, GlobalVariable* TMRErrorDetected) {
-	if(!OriginalReportErrorsFlag){
+	if (!OriginalReportErrorsFlag) {
 		return;
 	}
 
@@ -998,26 +1108,30 @@ void dataflowProtection::insertTMRDetectionFlag(Instruction* cmpInst, GlobalVari
 		cmp_op = Instruction::OtherOps::ICmp;
 		cmp_eq = CmpInst::ICMP_EQ;
 	}
-	Instruction* cmpInst2 = CmpInst::Create(cmp_op, cmp_eq, orig, clone2, "cmp",nextInst);
-	BinaryOperator* andCmps = BinaryOperator::CreateAnd(cmpInst,cmpInst2,"cmpReduction",nextInst);
+	Instruction* cmpInst2 = CmpInst::Create(cmp_op, cmp_eq, orig, clone2, "cmp", nextInst);
+	BinaryOperator* andCmps = BinaryOperator::CreateAnd(cmpInst, cmpInst2, "cmpReduction", nextInst);
 
 	//Insert a load, or after the sel inst
 	LoadInst* LI = new LoadInst(TMRErrorDetected, "errFlagLoad", nextInst);
-	CastInst* castedCmp = CastInst::CreateZExtOrBitCast(andCmps,LI->getType(),"extendedCmp",LI);
+	CastInst* castedCmp = CastInst::CreateZExtOrBitCast(andCmps, LI->getType(), "extendedCmp", LI);
 
-	BinaryOperator* BI = BinaryOperator::CreateAdd(LI,castedCmp,"errFlagCmp",nextInst);
-	StoreInst* SI = new StoreInst(BI,TMRErrorDetected,nextInst);
+	BinaryOperator* BI = BinaryOperator::CreateAdd(LI, castedCmp, "errFlagCmp", nextInst);
+	StoreInst* SI = new StoreInst(BI, TMRErrorDetected, nextInst);
 }
 
-void dataflowProtection::insertTMRCorrectionCount(Instruction* cmpInst, GlobalVariable* TMRErrorDetected) {
-	if(OriginalReportErrorsFlag){
-		insertTMRDetectionFlag(cmpInst,TMRErrorDetected);
+void dataflowProtection::insertTMRCorrectionCount(Instruction* cmpInst, GlobalVariable* TMRErrorDetected, bool updateSyncPoint) {
+	assert(cmpInst && "valid compare instruction");
+	assert(TMRErrorDetected && "valid TMR count global");
+
+	if (OriginalReportErrorsFlag) {
+		insertTMRDetectionFlag(cmpInst, TMRErrorDetected);
 		return;
-	} else if(!ReportErrorsFlag){
+	} else if (!ReportErrorsFlag) {
 		return;
 	}
 
 	Instruction* nextInst = cmpInst->getNextNode();
+	// value being synchronized on
 	Value* orig = dyn_cast<Value>(cmpInst->getOperand(0));
 	assert(orig && "Original operand exists");
 
@@ -1034,8 +1148,17 @@ void dataflowProtection::insertTMRCorrectionCount(Instruction* cmpInst, GlobalVa
 		cmp_op = Instruction::OtherOps::ICmp;
 		cmp_eq = CmpInst::ICMP_EQ;
 	}
-	Instruction* cmpInst2 = CmpInst::Create(cmp_op, cmp_eq, orig, clone2, "cmp",nextInst);
-	BinaryOperator* andCmps = BinaryOperator::CreateAnd(cmpInst,cmpInst2,"cmpReduction",nextInst);
+	// compare the original with the 2nd clone
+	Instruction* cmpInst2 = CmpInst::Create(cmp_op, cmp_eq, orig, clone2, "cmp", nextInst);
+
+	/* Trying to add support to detecting errors in vector types */
+	if (cmpInst->getType()->isVectorTy()) {
+		insertVectorTMRCorrectionCount(cmpInst, cmpInst2, TMRErrorDetected);
+		return;
+	}
+
+	// AND the two compares together to see if either compare failed
+	BinaryOperator* andCmps = BinaryOperator::CreateAnd(cmpInst, cmpInst2, "cmpReduction", nextInst);
 
 	if (!andCmps->getType()->isIntegerTy(1)) {
 		errs() << "TMR detector can't branch on " << *(andCmps->getType()) << ".  Disable vectorization? (-fno-vectorize) \n";
@@ -1045,26 +1168,35 @@ void dataflowProtection::insertTMRCorrectionCount(Instruction* cmpInst, GlobalVa
 	}
 
 	BasicBlock* originalBlock = cmpInst->getParent();
+	// create a new basic block to increment the counter, if there was an error
 	BasicBlock* errBlock = BasicBlock::Create(originalBlock->getContext(),
 			"errorHandler." + Twine(originalBlock->getParent()->getName()),
 			originalBlock->getParent(), originalBlock);
 
 	//Populate new block -- load global counter, increment, store
 	LoadInst* LI = new LoadInst(TMRErrorDetected, "errFlagLoad", errBlock);
-	Constant* one = ConstantInt::get(LI->getType(),1,false);
-	BinaryOperator* BI = BinaryOperator::CreateAdd(LI,one,"errFlagAdd",errBlock);
-	StoreInst* SI = new StoreInst(BI,TMRErrorDetected,errBlock);
+	Constant* one = ConstantInt::get(LI->getType(), 1, false);
+	BinaryOperator* BI = BinaryOperator::CreateAdd(LI, one, "errFlagAdd", errBlock);
+	StoreInst* SI = new StoreInst(BI, TMRErrorDetected, errBlock);
 
 	//Split blocks, deal with terminators
 	const Twine& name = originalBlock->getParent()->getName() + ".cont";
+	// the "vote" instruction is the first one in the new BB
 	BasicBlock* originalBlockContinued = originalBlock->splitBasicBlock(nextInst, name);
 
+	// splitting blocks adds an unconditional branch to the new BB; remove it
 	originalBlock->getTerminator()->eraseFromParent();
-	BranchInst* condGoToErrBlock = BranchInst::Create(originalBlockContinued,errBlock,andCmps,originalBlock);
+	BranchInst* condGoToErrBlock = BranchInst::Create(originalBlockContinued, errBlock, andCmps, originalBlock);
 
-	BranchInst* returnToBB = BranchInst::Create(originalBlockContinued,errBlock);
+	// add a branch instruction to the error block to unconditionally go to the continue block
+	BranchInst* returnToBB = BranchInst::Create(originalBlockContinued, errBlock);
 	errBlock->moveAfter(originalBlock);
 
+	// if terminator for originalBlock was a sync point, be sure to mark the new terminator as such as well
+	if (updateSyncPoint) {
+		newSyncPoints.push_back(condGoToErrBlock);
+	}
+
 	//Update how to divide up blocks
 	std::vector<Instruction*> syncHelperList;
 	syncHelperMap[originalBlock] = syncHelperList;
@@ -1074,3 +1206,67 @@ void dataflowProtection::insertTMRCorrectionCount(Instruction* cmpInst, GlobalVa
 	syncCheckMap[originalBlock] = condGoToErrBlock;
 	startOfSyncLogic[condGoToErrBlock] = cmpInst;
 }
+
+// invalidates the first two arguments
+void dataflowProtection::insertVectorTMRCorrectionCount(Instruction* cmpInst, Instruction* cmpInst2, GlobalVariable* TMRErrorDetected) {
+	//don't support pointers (yet)
+	if (cmpInst->getType()->isPtrOrPtrVectorTy()) {
+		assert(false && "not supporting TMR detector with vectors of pointers");
+	}
+
+	//change the comparisons to be NotEqual so we can add the results for a total error count
+	Instruction::OtherOps cmp_op;
+	CmpInst::Predicate cmp_neq;
+	Type* vType = cmpInst->getOperand(0)->getType();
+	if (vType->isIntOrIntVectorTy()) {
+		//integer type
+		cmp_op = Instruction::OtherOps::ICmp;
+		cmp_neq = CmpInst::ICMP_NE;
+	} else if (vType->isFPOrFPVectorTy()) {
+		//floating point type
+		cmp_op = Instruction::OtherOps::FCmp;
+		cmp_neq = CmpInst::FCMP_UNE;
+	} else {
+		assert(false && "unsupported vector type");
+	}
+	CmpInst* newCmpInst = CmpInst::Create(cmp_op, cmp_neq, \
+			cmpInst->getOperand(0), cmpInst->getOperand(1), "ncmp");
+	newCmpInst->insertAfter(cmpInst);
+	CmpInst* newCmpInst2 = CmpInst::Create(cmp_op, cmp_neq, \
+			cmpInst2->getOperand(0), cmpInst2->getOperand(1), "ncmp");
+	newCmpInst2->insertAfter(cmpInst2);
+	cmpInst2->replaceAllUsesWith(newCmpInst2);
+	cmpInst2->eraseFromParent();
+
+	//need to OR the two cmp's first
+	BinaryOperator* cmpOr = BinaryOperator::Create(Instruction::BinaryOps::Or, \
+			newCmpInst, newCmpInst2, "reduceOr");
+	cmpOr->insertAfter(newCmpInst2);
+
+	BasicBlock* thisBlock = newCmpInst->getParent();
+	IRBuilder<> builder(thisBlock);
+
+	VectorType* typ = dyn_cast<VectorType>(newCmpInst->getType());
+	//have to extract each element
+	uint64_t nTypes = typ->getNumElements();
+	VectorType* newVType = VectorType::get(TMRErrorDetected->getValueType(), nTypes);
+
+	//zero-extend the cmpOr result to be the same size as the TMR error counter
+	CastInst* zext = CastInst::Create(Instruction::CastOps::ZExt, cmpOr, newVType);
+	zext->insertAfter(cmpOr);
+
+	//the alternative to this is to implement a similar approach as is found in how
+	// syncTerminator() deals with struct types
+	CallInst* redAdd = builder.CreateAddReduce(zext);
+	redAdd->moveAfter(zext);
+
+	//add this to the global
+	//if there were no errors, then it's just adding 0
+	LoadInst* LI = new LoadInst(TMRErrorDetected, "errFlagLoad", thisBlock);
+	BinaryOperator* BI = BinaryOperator::CreateAdd(LI, redAdd, "errFlagAdd", thisBlock);
+	StoreInst* SI = new StoreInst(BI, TMRErrorDetected, thisBlock);
+	LI->moveAfter(redAdd);		BI->moveAfter(LI);		SI->moveAfter(BI);
+
+	return;
+}
+
diff --git a/projects/dataflowProtection/utils.cpp b/projects/dataflowProtection/utils.cpp
index 570f4cd9b..474844cb5 100644
--- a/projects/dataflowProtection/utils.cpp
+++ b/projects/dataflowProtection/utils.cpp
@@ -8,13 +8,15 @@
 #include <fstream>
 #include <sstream>
 #include <algorithm>
+#include <ctime>
+#include <cstdlib>
 
 #include <llvm/IR/Module.h>
 #include "llvm/Support/CommandLine.h"
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/raw_ostream.h>
-
 #include <llvm/IR/Constants.h>
+#include <llvm/IR/IRBuilder.h>
 #include "llvm/ADT/StringRef.h"
 
 // Command line options
@@ -27,6 +29,7 @@ extern cl::list<std::string> ignoreGlblCl;
 extern cl::list<std::string> globalsToRuntimeInitCl;
 extern cl::opt<bool> noMemReplicationFlag;
 extern cl::opt<bool> noStoreDataSyncFlag;
+extern cl::opt<bool> storeDataSyncFlag;
 extern cl::opt<bool> ReportErrorsFlag;
 extern cl::opt<std::string> configFileLocation;
 extern cl::opt<bool> dumpModuleFlag;
@@ -47,6 +50,9 @@ std::list<std::string> coarseGrainedUserFunctions;
 std::list<std::string> ignoreGlbl;
 std::list<std::string> clGlobalsToRuntimeInit;
 
+//track functions that we should ignore invalid SOR crossings
+std::map<GlobalVariable*, Function*> globalCrossMap;
+
 //also, there are some functions that are not supported
 //it is in here instead of the config file because we don't want users touching it
 std::list<std::string> unsupportedFunctions = {"fscanf", "scanf", "fgets", "gets", "sscanf", "__isoc99_fscanf"};
@@ -56,18 +62,36 @@ using namespace llvm;
 //----------------------------------------------------------------------------//
 // Miscellaneous
 //----------------------------------------------------------------------------//
-bool dataflowProtection::isIndirectFunctionCall(CallInst* CI, std::string errMsg, bool print){
+bool dataflowProtection::isIndirectFunctionCall(CallInst* CI, std::string errMsg, bool print) {
 	//This partially handles bitcasts and other inline LLVM functions
-	if(CI->getCalledFunction() == nullptr){
-		if(print || verboseFlag)
+	if (CI->getCalledFunction() == nullptr) {
+		// probably don't want to hear about skipping inline assembly, clean up output
+		if( (print || verboseFlag) && !CI->isInlineAsm())
 			errs() << warn_string << " in " << errMsg << " skipping:\n\t" << *CI << "\n";
 		return true;
-	}else{
+	} else {
 		return false;
 	}
 }
 
-void dataflowProtection::getFunctionsFromCL(){
+// returns a string of random characters of the requested size
+// used to name-mangle the DWC error handler block (under construction)
+std::string dataflowProtection::getRandomString(std::size_t len) {
+	//init rand
+	std::srand(time(0));
+
+	const char chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+	int charLen = sizeof(chars) - 1;
+	std::string result = "";
+
+	for (size_t i = 0; i < len; i+=1) {
+		result += chars[rand() % charLen];
+	}
+
+	return result;
+}
+
+void dataflowProtection::getFunctionsFromCL() {
 	//copy all (fixed) things from the command line to the internal, editable lists
 	for(auto x : skipLibCallsCl){
 		skipLibCalls.push_back(x);
@@ -93,30 +117,35 @@ void dataflowProtection::getFunctionsFromCL(){
 //function to extract function names from the configuration file
 //lists already exist, created in dataflowProtection.cpp
 //return value indicates success or failure
-int dataflowProtection::getFunctionsFromConfig(){
+int dataflowProtection::getFunctionsFromConfig() {
 	std::string filename;
-	if(configFileLocation!=""){
+	if (configFileLocation!="") {
 		filename = configFileLocation;
-	} else{
-		std::string coast = std::getenv("COAST_ROOT");
-		filename = coast + "/projects/dataflowProtection/functions.config";
+	} else {
+		char* coast = std::getenv("COAST_ROOT");
+		if (coast) {
+			filename = std::string(coast) + "/projects/dataflowProtection/functions.config";
+		} else {
+			// just look in the current directory
+			filename = "functions.config";
+		}
 	}
 	std::ifstream ifs(filename, std::ifstream::in);
 
-	if(!ifs.is_open()){
+	if (!ifs.is_open()) {
 		errs() << "ERROR: No configuration file found at " << filename << '\n';
 		errs() << "         Please pass one in using -configFile\n";
-		return 0;
+		return -1;
 	}
 
 	std::list<std::string>* lptr;
 	std::string line;
-	while(getline(ifs, line)){
-		if(line.length() == 0){  //Blank line
+	while (getline(ifs, line)) {
+		if (line.length() == 0) {  //Blank line
 			continue;
 		}
 
-		if(line[0] == '#'){ //# is the comment symbol
+		if (line[0] == '#') { //# is the comment symbol
 			continue;
 		}
 
@@ -145,9 +174,9 @@ int dataflowProtection::getFunctionsFromConfig(){
 		}
 
 		//insert all options into vector
-		while(iss.good()){
+		while (iss.good()) {
 			getline(iss, substr, ',');
-			if(substr.length() == 0)
+			if (substr.length() == 0)
 				continue;
 			lptr->push_back(substr);
 		}
@@ -160,15 +189,15 @@ int dataflowProtection::getFunctionsFromConfig(){
 //If -dumpModule is passed in, then print the entire module out
 //This is helpful when the pass crashes on cleanup
 //It is in a format that can be pasted into an *.ll file and run
-void dataflowProtection::dumpModule(Module& M){
-	if(!dumpModuleFlag)
+void dataflowProtection::dumpModule(Module& M) {
+	if (!dumpModuleFlag)
 		return;
 
-	for(GlobalVariable& g : M.getGlobalList()){
-			errs() << g << "\n";
+	for (GlobalVariable& g : M.getGlobalList()) {
+		errs() << g << "\n";
 	}
 	errs() << "\n";
-	for(auto &f : M){
+	for (auto &f : M) {
 		errs() << f << "\n";
 	}
 }
@@ -177,16 +206,24 @@ void dataflowProtection::dumpModule(Module& M){
 // Initialization code
 //----------------------------------------------------------------------------//
 void dataflowProtection::removeUnusedFunctions(Module& M) {
+
+	//get reference to main() function
+	Function* mainFunction = M.getFunction("main");
+	//If we don't have a main, don't remove any functions
+	if (!mainFunction) {
+		return;
+	}
+
 	//Populate a list of all functions in the module
 	std::set<Function*> functionList;
-	for(auto & F : M){
+	for (auto & F : M) {
 		//Ignore external function declarations
-		if(F.hasExternalLinkage() && F.isDeclaration()){
+		if (F.hasExternalLinkage() && F.isDeclaration()) {
 			continue;
 		}
 
 		//Don't erase fault handlers
-		if(F.getName().startswith("FAULT_DETECTED_")){
+		if (F.getName().startswith("FAULT_DETECTED_")) {
 			continue;
 		}
 
@@ -197,77 +234,75 @@ void dataflowProtection::removeUnusedFunctions(Module& M) {
 		if(F.getNumUses() != 0)
 			continue;
 
-		functionList.insert(&F);
-	}
+		if (usedFunctions.find(&F) != usedFunctions.end())
+			continue;
 
-	Function* mainFunction = M.getFunction("main");
-	if(!mainFunction) { //If we don't have a main, don't remove any
-		return;
+		functionList.insert(&F);
 	}
 
-	recursivelyVisitCalls(M,mainFunction,functionList);
+	recursivelyVisitCalls(M, mainFunction, functionList);
 
-	if(functionList.size() == 0){
+	if (functionList.size() == 0) {
 		return;
 	}
 
-	if(functionList.size()>0)
+	// TODO: fix assertion - it's possible for a xMR'd function to be in the list of no uses,
+	//  if it's used as a function pointer only
+	if(functionList.size() > 0)
 		if(verboseFlag) errs() << "The following functions are unused, removing them: \n";
-	for(auto q : functionList){
-		assert(fnsToClone.find(q)==fnsToClone.end() && "The specified function is not called, so is being removed");
+	for (auto q : functionList) {
+		if (fnsToClone.find(q) != fnsToClone.end()) {
+			errs() << "Failed removing function '" << q->getName() << "'\n";
+		}
+		assert( (fnsToClone.find(q) == fnsToClone.end()) && "The specified function is not called, so is being removed");
 		if(verboseFlag) errs() << "    " << q->getName() << "\n";
 		q->eraseFromParent();
 	}
 
 }
 
-void dataflowProtection::processCommandLine(Module& M, int numClones){
-	if(InterleaveFlag == SegmentFlag){
+void dataflowProtection::processCommandLine(Module& M, int numClones) {
+	if (InterleaveFlag == SegmentFlag) {
 		SegmentFlag = true;
 	}
 	TMR = (numClones==3);
 
-	if(noMemReplicationFlag && noStoreDataSyncFlag){
-		errs() << "WARNING: noMemDuplication and noStoreDataSync set simultaneously. Recommend not setting the two together.\n";
+	if (noMemReplicationFlag && noStoreDataSyncFlag) {
+		errs() << warn_string << " noMemDuplication and noStoreDataSync set simultaneously. Recommend not setting the two together.\n";
+	}
+
+	if (noStoreDataSyncFlag && storeDataSyncFlag) {
+		errs() << err_string << " conflicting flags for store and noStore!\n";
+		exit(-1);
 	}
 
 	//copy command line lists to internal lists
 	getFunctionsFromCL();
 
-//	errs() << "Content of skipLibCalls (before loading config):\n";
-//	for(auto li : skipLibCalls){
-//		errs() << li << "\n";
-//	}
-
-	if(getFunctionsFromConfig()){
+	if (getFunctionsFromConfig()) {
 		assert("Configuration file error!" && false);
 	}
 
-//	errs() << "Content of skipLibCalls (after loading config):\n";
-//	for(auto li : skipLibCalls){
-//		errs() << li << "\n";
-//	}
-
-	if(skipFn.size() == 0){
-		for (auto & fn_it : M){
+	if (skipFn.size() == 0) {
+		for (auto & fn_it : M) {
 
 			if (fn_it.isDeclaration()) { //Ignore library calls
 				continue;
 			}
 
-			if(isISR(fn_it)){ //Don't erase ISRs
+			if (isISR(fn_it)) { //Don't erase ISRs
 				continue;
 			}
 
-			if(xMR_default)
+			if (xMR_default)
 				fnsToClone.insert(&fn_it);
 		}
 	} else {
 		for (auto fcn : skipFn) {
 			Function* f = M.getFunction(StringRef(fcn));
-			if(!f){
-				errs() << "\nERROR:Specified function does not exist!\n";
-				errs() << "Check the spelling, check if the optimizer inlined it\n\n";
+			if (!f) {
+				errs() << "\n" << err_string << "Specified function " << fcn << " does not exist!\n";
+				errs() << "Check the spelling, check if the optimizer inlined it, of if name was mangled\n\n";
 				assert(f);
 			}
 			fnsToSkip.insert(f);
@@ -276,54 +311,104 @@ void dataflowProtection::processCommandLine(Module& M, int numClones){
 
 }
 
-void dataflowProtection::processAnnotations(Module& M){
+void dataflowProtection::processAnnotations(Module& M) {
 	//Inspired by http://bholt.org/posts/llvm-quick-tricks.html
 	auto global_annos = M.getNamedGlobal("llvm.global.annotations");
-	if(global_annos){
+	if (global_annos) {
 		auto a = cast<ConstantArray>(global_annos->getOperand(0));
-		for(int i=0; i < a->getNumOperands(); i++){
-			auto e = cast<ConstantStruct>(a->getOperand(i));
-
-			auto anno = cast<ConstantDataArray>(cast<GlobalVariable>(e->getOperand(1)->getOperand(0))->getOperand(0))->getAsCString();
-
-			//Function annotations
-			if(auto fn = dyn_cast<Function>(e->getOperand(0)->getOperand(0))){
-				if(anno == no_xMR_anno) {
-					if(verboseFlag) errs() << "Directive: do not clone function '" << fn->getName() << "'\n";
-					fnsToSkip.insert(fn);
-					if(fnsToClone.find(fn)!=fnsToClone.end())
-						fnsToClone.erase(fn);
-				} else if(anno == xMR_anno) {
-					if(verboseFlag) errs() << "Directive: clone function '" << fn->getName() << "'\n";
-					fnsToClone.insert(fn);
-				} else if(anno == xMR_call_anno){
-					if(verboseFlag) errs() << "Directive: replicate calls to function '" << fn->getName() << "'\n";
-					coarseGrainedUserFunctions.push_back(fn->getName());
-				} else {
-					assert(false && "Invalid option on function");
-				}
+		//check that it is the right type
+		if (a) {
+			for (int i=0; i < a->getNumOperands(); i++) {
+				auto e = cast<ConstantStruct>(a->getOperand(i));
+
+				//extract data
+				auto anno = cast<ConstantDataArray>(cast<GlobalVariable>(e->getOperand(1)->getOperand(0))->getOperand(0))->getAsCString();
+
+				//Function annotations
+				if (auto fn = dyn_cast<Function>(e->getOperand(0)->getOperand(0))) {
+					if (anno == no_xMR_anno) {
+						if(verboseFlag) errs() << "Directive: do not clone function '" << fn->getName() << "'\n";
+						fnsToSkip.insert(fn);
+						if (fnsToClone.find(fn)!=fnsToClone.end())
+							fnsToClone.erase(fn);
+					} else if (anno == xMR_anno) {
+						if(verboseFlag) errs() << "Directive: clone function '" << fn->getName() << "'\n";
+						fnsToClone.insert(fn);
+					} else if (anno == xMR_call_anno) {
+						if(verboseFlag) errs() << "Directive: replicate calls to function '" << fn->getName() << "'\n";
+						coarseGrainedUserFunctions.push_back(fn->getName());
+					} else if (anno == skip_call_anno) {
+						if(verboseFlag) errs() << "Directive: do not clone calls to function '"  << fn->getName() << "'\n";
+						skipLibCalls.push_back(fn->getName());
+						//TODO: do we need to worry about duplicates?
+					} else if (anno.startswith("no-verify-")) {
+						StringRef global_name = anno.substr(10, anno.size() - 10);
+
+						GlobalValue* glbl = M.getNamedValue(global_name);
+						if (glbl) {
+							GlobalVariable* glblVar = dyn_cast<GlobalVariable>(glbl);
+							if (glblVar) {
+								globalCrossMap[glblVar] = fn;
+								errs() << "Directive: ignoring global \"" << global_name
+										<< "\" being used in unprotected function \"" << fn->getName() << "\"\n";
+							}
+						} else {
+							errs() << warn_string << " global " << global_name << " doesn't exist\n";
+						}
 
-			}
-			//Global annotations
-			else if(auto gv = dyn_cast<GlobalVariable>(e->getOperand(0)->getOperand(0))){
-				if(anno == no_xMR_anno) {
-					if(verboseFlag) errs() << "Directive: do not clone global variable '" << gv->getName() << "'\n";
-					globalsToSkip.insert(gv);
-				} else if(anno == xMR_anno) {
-					if(verboseFlag) errs() << "Directive: clone global variable '" << gv->getName() << "'\n";
-					globalsToClone.insert(gv);
-				} else if(anno==default_xMR){
-					if(verboseFlag) errs() << "Directive: set xMR as default\n";
-				} else if(anno==default_no_xMR){
-					if(verboseFlag) errs() << "Directive: set no xMR as default\n";
-					xMR_default = false;
-				} else {
-					if(verboseFlag) errs() << "Directive: " << anno << "\n";
-					assert(false && "Invalid option on global value");
+					} else {
+						assert(false && "Invalid option on function");
+					}
+
+				}
+				//Global annotations
+				else if (auto gv = dyn_cast<GlobalVariable>(e->getOperand(0)->getOperand(0))) {
+					if (anno == no_xMR_anno) {
+						if(verboseFlag) errs() << "Directive: do not clone global variable '" << gv->getName() << "'\n";
+						globalsToSkip.insert(gv);
+					} else if (anno == xMR_anno) {
+						if(verboseFlag) errs() << "Directive: clone global variable '" << gv->getName() << "'\n";
+						globalsToClone.insert(gv);
+					} else if (anno == default_xMR) {
+						if(verboseFlag) errs() << "Directive: set xMR as default\n";
+					} else if (anno == default_no_xMR) {
+						if(verboseFlag) errs() << "Directive: set no xMR as default\n";
+						xMR_default = false;
+					} else if (anno == coast_volatile) {
+						if(verboseFlag) errs() << "Directive: don't remove '" << gv->getName() << "'\n";
+						volatileGlobals.insert(gv);
+					} else {
+						if(verboseFlag) errs() << "Directive: " << anno << "\n";
+						assert(false && "Invalid option on global value");
+					}
+				}
+				else {
+					assert(false && "Non-function annotation");
 				}
 			}
-			else{
-				assert(false && "Non-function annotation");
+		} else {
+			errs() << warn_string << " global annotations of wrong type!\n" << *global_annos << "\n";
+		}
+	}
+
+	// get the data from the list of "used" globals, and add it to volatileGlobals
+	auto used_annos = M.getNamedGlobal("llvm.used");
+	if (used_annos) {
+		auto ua = cast<ConstantArray>(used_annos->getOperand(0));
+		if (ua) {
+			for (int i=0; i < ua->getNumOperands(); i++) {
+				auto element = ua->getOperand(i);
+				if (BitCastOperator* bc = dyn_cast<BitCastOperator>(element)) {
+					errs() << " >>> Hooray, found a bitcast!\n";
+					if (GlobalVariable* gv = dyn_cast<GlobalVariable>(bc->getOperand(0))) {
+						errs() << *gv << "\n";
+						volatileGlobals.insert(gv);
+					} else if (Function* fn = dyn_cast<Function>(bc->getOperand(0))) {
+						errs() << " <<< found a used function:\n";
+						errs() << fn->getName() << "\n";
+						usedFunctions.insert(fn);
+					}
+				}	// TODO: what if it doesn't have to be bit-casted?
 			}
 		}
 	}
@@ -334,12 +419,19 @@ void dataflowProtection::processAnnotations(Module& M){
 			for(auto &I : bb){
 				if( auto CI = dyn_cast<CallInst>(&I) ){
 					// have to skip any bitcasts in function calls because they aren't actually a function
-					if(isIndirectFunctionCall(CI, "processAnnotations"))
+					if(isIndirectFunctionCall(CI, "processAnnotations", false))
 						continue;
 					if(CI->getCalledFunction()->getName() == "llvm.var.annotation"){
 						//Get variable
 						auto adr = dyn_cast<BitCastInst>(CI->getOperand(0));
-						auto var = dyn_cast<AllocaInst>(adr->getOperand(0));
+						AllocaInst* var;
+						if (!adr) {
+							//there could be no bitcast if the alloca is already of type i8
+							var = dyn_cast<AllocaInst>(CI->getOperand(0));
+						} else {
+							var = dyn_cast<AllocaInst>(adr->getOperand(0));
+						}
+						assert(var && "valid alloca");
 
 						auto ce = dyn_cast<ConstantExpr>(CI->getOperand(1));
 						auto gv  = dyn_cast<GlobalVariable>(ce->getOperand(0));
@@ -349,10 +441,17 @@ void dataflowProtection::processAnnotations(Module& M){
 							if(anno == no_xMR_anno){
 								if(verboseFlag) errs() << "Directive: do not clone local variable '" << *var << "'\n";
 								instsToSkip.insert(var);
+								walkInstructionUses(var, false);
 							} else if(anno == xMR_anno){
 								if(verboseFlag) errs() << "Directive: clone local variable '" << *var << "'\n";
-								instsToClone.insert(var);
+								instsToCloneAnno.insert(var);
+								//if this is all we do, it will only clone the `alloca` instruction, but
+								// we want it to clone all instructions that use the same variable
+								walkInstructionUses(var, true);
+								//how do we get the syncpoints to happen?
+								//have to add them manually
 							} else{
+								errs() << anno << "\n";
 								assert(false && "Unrecognized variable annotation");
 							}
 						} else{
@@ -368,23 +467,25 @@ void dataflowProtection::processAnnotations(Module& M){
 //----------------------------------------------------------------------------//
 // Cleanup
 //----------------------------------------------------------------------------//
-void dataflowProtection::removeAnnotations(Module& M){
+void dataflowProtection::removeAnnotations(Module& M) {
 	auto global_annos = M.getNamedGlobal("llvm.global.annotations");
-	if(!global_annos)
+	if (!global_annos)
+		return;
+	auto a = cast<ConstantArray>(global_annos->getOperand(0));
+	if (!a)
 		return;
 
 	std::set<GlobalVariable*> anno_strings;
 
 	//Populate a list of global strings that are only used in annotations
-	auto a = cast<ConstantArray>(global_annos->getOperand(0));
-	for(int i=0; i < a->getNumOperands(); i++){
+	for (int i=0; i < a->getNumOperands(); i++) {
 		auto e = cast<ConstantStruct>(a->getOperand(i)); //This is part of global_anno
 		auto anno = cast<GlobalVariable>(e->getOperand(1)->getOperand(0)); //This is the global string
 
-		for(int j=0; j < e->getNumOperands(); j++){
-			if(e->getOperand(j)->getNumOperands() >= 1){
-				if(auto cs = dyn_cast<GlobalVariable>(e->getOperand(j)->getOperand(0))){
-					if(cs->getSection() == "llvm.metadata"){
+		for (int j=0; j < e->getNumOperands(); j++) {
+			if (e->getOperand(j)->getNumOperands() >= 1) {
+				if (auto cs = dyn_cast<GlobalVariable>(e->getOperand(j)->getOperand(0))) {
+					if (cs->getSection() == "llvm.metadata") {
 						anno_strings.insert(cs);
 					}
 				}
@@ -398,9 +499,9 @@ void dataflowProtection::removeAnnotations(Module& M){
 	for (auto &F : M) {
 		for (auto & bb : F) {
 			for (auto & I : bb) {
-				if(auto CI = dyn_cast<CallInst>(&I)){
+				if (auto CI = dyn_cast<CallInst>(&I)) {
 					auto called = CI->getCalledFunction();
-					if(called->getName() == "llvm.var.annotation"){
+					if ( (called != nullptr) && (called->getName() == "llvm.var.annotation") ) {
 						lva = called;
 						toRemove.insert(CI);
 					}
@@ -409,24 +510,36 @@ void dataflowProtection::removeAnnotations(Module& M){
 		}
 	}
 
-	for(auto rm : toRemove){
+	for (auto rm : toRemove) {
 		auto op0 = dyn_cast<Instruction>(rm->getOperand(0));
-		rm->getParent()->getInstList().erase(rm);
-		if(op0)
-			op0->getParent()->getInstList().erase(op0);
+		if (rm->getNumUses() < 1) {
+			if (rm->getParent()) {
+				rm->eraseFromParent();
+			}
+		}
+		//do this 2nd so that the one possible user is removed first
+		if (op0 && op0->getNumUses() < 1) {
+			if (op0->getParent()) {
+				op0->eraseFromParent();
+			}
+			//we probably added this (which is probably a bitcast) to the list of instructions to clone
+			if (std::find(instsToCloneAnno.begin(), instsToCloneAnno.end(), op0) != instsToCloneAnno.end()) {
+				instsToCloneAnno.erase(op0);
+			}
+		}
 	}
 
-	if(lva){
+	if (lva) {
 		lva->removeFromParent();
 	}
 
 	//Remove global annotations
 	M.getGlobalList().erase(global_annos);
-	for(auto a_s : anno_strings){
+	for (auto a_s : anno_strings) {
 		a_s->eraseFromParent();
 	}
 
-	if(auto default_behavior = M.getNamedGlobal(default_global)){
+	if (auto default_behavior = M.getNamedGlobal(default_global)) {
 		default_behavior->eraseFromParent();
 	}
 }
@@ -457,11 +570,16 @@ void dataflowProtection::removeUnusedErrorBlocks(Module & M) {
 	}
 }
 
-void dataflowProtection::removeUnusedGlobals(Module& M){
+void dataflowProtection::removeUnusedGlobals(Module& M) {
 	std::vector<GlobalVariable*> unusedGlobals;
 
 	for (GlobalVariable & g : M.getGlobalList()) {
-		if (g.getNumUses() == 0) {
+		if (volatileGlobals.find(&g) != volatileGlobals.end()) {
+			// skip removing globals marked as volatile
+			// it's possible the same feature could be implemented by marking variables with
+			//  the attribute "used", instead of an annotation
+			continue;
+		} else if (g.getNumUses() == 0) {
 			StringRef gName = g.getName();
 			//Don't touch ISR related variables
 			if (!(gName.startswith("llvm") || gName.startswith("__vector") || gName.startswith("isr_"))) {
@@ -469,10 +587,17 @@ void dataflowProtection::removeUnusedGlobals(Module& M){
 			}
 		} else if (g.getNumUses() == 1) {
 			for (auto u : g.users()) {
+				if (Instruction* UI = dyn_cast<Instruction>(u)) {
+					//If it's in a function marked as __attribute__((used)), then skip this
+					Function* parentF = UI->getParent()->getParent();
+					if (usedFunctions.find(parentF) != usedFunctions.end()) {
+						continue;
+					}
+				}
 				//Account for instructions that will be cleaned up at the end of the pass
 				//it could also be a call instruction to a library function that has side effects, but
 				// we ignore the return value
-				if (u->getNumUses() == 0 && !isa<StoreInst>(u) && !isa<CallInst>(u)) {
+				if ( (u->getNumUses() == 0) && !isa<StoreInst>(u) && !isa<CallInst>(u) && !isa<InvokeInst>(u)) {
 					unusedGlobals.push_back(&g);
 				}
 			}
@@ -483,7 +608,11 @@ void dataflowProtection::removeUnusedGlobals(Module& M){
 		if (verboseFlag) {
 			errs() << "Removing unused global: " << ug->getName() << "\n";
 		}
-		ug->eraseFromParent();
+		if (ug->getParent()) {
+			ug->eraseFromParent();
+		} else {
+			errs() << warn_string << " global parent doesn't exist?\n" << *ug << "\n";
+		}
 	}
 }
 
@@ -502,41 +631,55 @@ void dataflowProtection::checkForUnusedClones(Module & M) {
 
 			//Used only in a single external function call, eg printf
 			if (orig->hasOneUse() && isa<CallInst>(orig->user_back())) {
-				if (CallInst* CI = dyn_cast<CallInst>(orig->user_back())){
-					if(isIndirectFunctionCall(CI, "checkForUnusedClones"))
+				if (CallInst* CI = dyn_cast<CallInst>(orig->user_back())) {
+					if (isIndirectFunctionCall(CI, "checkForUnusedClones"))
 						continue;
-					else if(CI->getCalledFunction()->hasExternalLinkage())
+					else if (CI->getCalledFunction()->hasExternalLinkage())
 						continue;
 				}
 			}
 
 			//If original is only used in external function calls
-			if(Instruction* inst = dyn_cast<Instruction>(orig)){
+			if (Instruction* inst = dyn_cast<Instruction>(orig)) {
 				//accumulator - proof by contradiction
 				bool allExternal = true;
-				for(auto u : inst->users()){
-					if (CallInst* ci = dyn_cast<CallInst>(u)){
+				for (auto u : inst->users()) {
+					if (CallInst* ci = dyn_cast<CallInst>(u)) {
 						//make sure we're not calling a function on a null pointer
-						if(isIndirectFunctionCall(ci, "checkForUnusedClones"))
+						if (isIndirectFunctionCall(ci, "checkForUnusedClones"))
 							continue;
-						else if(ci->getCalledFunction()->hasExternalLinkage())
+						else if (ci->getCalledFunction()->hasExternalLinkage())
 							continue;
-						else{
+						else {
 							allExternal = false;
 							break;
 						}
 					}
 				}
 				if(allExternal) continue;
+
+				// sometimes clones are erroneously created when the instructions were supposed to be skipped
+				if (instsToSkip.find(inst) != instsToSkip.end()) {
+					if (verboseFlag) errs() << "Removing unused local variable: " << *inst << "\n";
+					inst->eraseFromParent();
+
+					if (TMR) {
+						Instruction* inst2 = dyn_cast<Instruction>(cloneM.second.second);
+						if (verboseFlag) errs() << "Removing unused local variable: " << *inst2 << "\n";
+						inst2->eraseFromParent();
+					}
+				}
+
+				//TODO: add here, also when function calls are supposed to be skipped
 			}
 
 			//Global duplicated strings aren't used in uncloned printfs. Remove the unused clones
 			if (ConstantExpr* ce = dyn_cast<ConstantExpr>(clone)) {
 				if(verboseFlag) errs() << "Removing unused global string: " << *ce << "\n";
 				ce->destroyConstant();
-				if(TMR){
+				if (TMR) {
 					ConstantExpr* ce2 = dyn_cast<ConstantExpr>(cloneM.second.second);
-					if(verboseFlag) errs() << "Removing unused global string: " << *ce2 << "\n";
+					if (verboseFlag) errs() << "Removing unused global string: " << *ce2 << "\n";
 					ce2->destroyConstant();
 				}
 				continue;
@@ -547,20 +690,36 @@ void dataflowProtection::checkForUnusedClones(Module & M) {
 			}
 
 			//If using noMemDuplicationFlag then don't worry about unused arguments
-			if(noMemReplicationFlag){
-				if(dyn_cast<Argument>(orig)){
+			if (noMemReplicationFlag) {
+				if (dyn_cast<Argument>(orig)) {
 					continue;
 				}
 			}
 
-			errs() << "ERROR when updating cloned instructions.\n";
-			errs() << "More about " << *clone << ":\n";
-			errs() << "  Orig:" << *orig << "\n";
-			errs() << "  Orig has " << orig->getNumUses() << " uses\n";
-			Instruction* tmp = dyn_cast<Instruction>(orig->user_back());
-			errs() << "      " << *orig->user_back() << " in " << tmp->getParent()->getName() << "\n";
-			errs() << "\n" << *clone << " has no users\n\n";
-			assert(false && "Clone has no users");
+			// Doesn't work yet because have to get rid of all references to these instructions
+			//  or move for segmenting breaks.
+//			if(Instruction* inst = dyn_cast<Instruction>(clone)) {
+//				if (verboseFlag)
+//					errs() << "Removing unused clone: " << *inst << "\n";
+//				inst->eraseFromParent();
+//				if (TMR) {
+//					Instruction* inst2 = dyn_cast<Instruction>(cloneM.second.second);
+//					inst2->eraseFromParent();
+//				}
+//
+//				continue;
+//			}
+
+			errs() << info_string << " unused clone: " << *clone << ":\n";
+//			errs() << err_string << " when updating cloned instructions.\n";
+//			errs() << "More about " << *clone << ":\n";
+//			errs() << "  Orig:" << *orig << "\n";
+//			errs() << "  Orig has " << orig->getNumUses() << " uses\n";
+//			Instruction* tmp = dyn_cast<Instruction>(orig->user_back());
+//			errs() << "      " << *orig->user_back() << " in " << tmp->getParent()->getName() << "\n";
+//			errs() << "\n" << *clone << " has no users\n\n";
+//			errs() << *tmp->getParent() << "\n";
+//			assert(false && "Clone has no users");
 		}
 	}
 }
@@ -574,17 +733,17 @@ bool dataflowProtection::willBeSkipped(Instruction* I){
 
 bool dataflowProtection::willBeCloned(Value* v) {
 	Instruction* I = dyn_cast<Instruction>(v);
-	if(I){ 
+	if (I) {
 		return instsToClone.find(I) != instsToClone.end();
 	}
 
 	GlobalVariable* g = dyn_cast<GlobalVariable>(v);
-	if(g){
+	if (g) {
 		return globalsToClone.find(g) != globalsToClone.end();
 	}
 
 	ConstantExpr* e = dyn_cast<ConstantExpr>(v);
-	if(e){
+	if (e) {
 		return constantExprToClone.find(e) != constantExprToClone.end();
 	}
 
@@ -607,41 +766,60 @@ ValuePair dataflowProtection::getClone(Value* I) {
 		return cloneMap[I];
 }
 
+//helper function
+//#define DEBUG_INST_MOVING
 void dataflowProtection::moveClonesToEndIfSegmented(Module & M){
-	if(InterleaveFlag)
+	if (InterleaveFlag)
 		return;
 
-	for(auto F : fnsToClone) {
+#ifdef DEBUG_INST_MOVING
+	int flag = 0;
+#endif
+	for (auto F : fnsToClone) {
 		for (auto & bb : *F) {
 
+#ifdef DEBUG_INST_MOVING
+			if (bb.getName() == "entry" && F->getName() == "main") {
+				flag = 1;
+			}
+
+			if (flag) {
+				errs() << F->getName() << "\n";
+				errs() << bb << "\n";
+			}
+#endif
+
 			//Populate list of things to move before
 			std::queue<Instruction*> movePoints;
-			for(auto &I : bb){
-				if(CallInst* CI = dyn_cast<CallInst>(&I)){
+			for (auto &I : bb) {
+				if (CallInst* CI = dyn_cast<CallInst>(&I)) {
 					/* Fixed an issue where the clone was considered a syncPoint, but wasn't
 					 * in the startOfSyncLogic map, so it was inserting a new element and
 					 * putting in the default Instruction* value (whatever that is) into the
 					 * movePoints map
 					*/
-					if(isSyncPoint(CI) && (startOfSyncLogic.find(&I) != startOfSyncLogic.end()) ){
+					if (isSyncPoint(CI) && (startOfSyncLogic.find(&I) != startOfSyncLogic.end()) ) {
 //						errs() << "    Move point at CI sync" << *startOfSyncLogic[&I] << "\n";
 						movePoints.push(startOfSyncLogic[&I]);
-					} else{
+					}
+					else if (CI->getCalledFunction() != nullptr && CI->getCalledFunction()->isIntrinsic()) {
+						;	//don't add intrinsics, because they will be expanded underneath (in assembly)
+							// to be a series of inline instructions, not an actual call
+					}
+					else {
 //						errs() << "    Move point at CI " << I << "\n";
 						movePoints.push(&I);
 					}
-				} else if(TerminatorInst* TI = dyn_cast<TerminatorInst>(&I)){
-					if(isSyncPoint(TI)){
+				} else if(TerminatorInst* TI = dyn_cast<TerminatorInst>(&I)) {
+					if (isSyncPoint(TI)) {
 //						errs() << "    Move point at TI sync " << *startOfSyncLogic[&I] << "\n";
 						movePoints.push(startOfSyncLogic[&I]);
-					} else{
+					} else {
 //						errs() << "    Move point at TI" << I << "\n";
 						movePoints.push(&I);
 					}
-				} else if(StoreInst* SI = dyn_cast<StoreInst>(&I)){
-					if(isSyncPoint(SI)){
-//						errs() << "    Move point at SI" << *startOfSyncLogic[&I] << "\n";
-#ifdef SYNC_POINT_FIX
+				} else if(StoreInst* SI = dyn_cast<StoreInst>(&I)) {
+					if (isSyncPoint(SI)) {
 						/*
 						 * One problem we saw was when a basic block was split, the instruction which
 						 * is the startOfSyncLogic for a following instruction would be in the block
@@ -652,15 +830,22 @@ void dataflowProtection::moveClonesToEndIfSegmented(Module & M){
 						if ( (startOfSyncLogic.find(&I) != startOfSyncLogic.end() ) && \
 							 (startOfSyncLogic[&I]->getParent() == I.getParent()) ) {
 							movePoints.push(startOfSyncLogic[&I]);
+//							errs() << "    Move point at SI" << *startOfSyncLogic[&I] << "\n";
 						} else {
 							movePoints.push(&I);
+//							errs() << "    Move point at SI: " << *SI << "\n";
 						}
-#else					//old stuff
-						movePoints.push(startOfSyncLogic[&I]);
-#endif
 					}
-				} else if(GetElementPtrInst* GI = dyn_cast<GetElementPtrInst>(&I)){
-#ifdef SYNC_POINT_FIX
+#ifdef FIX_STORE_SEGMENTING
+					/* There is a case where we need to keep the stores next to each other, as in the
+					 * load-increment-store pattern.  For StoreInst's which aren't syncpoints, this would
+					 * cause the variable to be incremented twice.  Check for if it has a clone and if
+					 * the type being stored is not a pointer. */
+					else if (isStoreMovePoint(SI)) {
+						movePoints.push(&I);
+					}
+#endif
+				} else if(GetElementPtrInst* GI = dyn_cast<GetElementPtrInst>(&I)) {
 					if (isSyncPoint(GI)) {
 						//not all GEP syncpoints have a corresponding entry in the map
 						if ( (startOfSyncLogic.find(&I) != startOfSyncLogic.end() ) &&
@@ -670,13 +855,6 @@ void dataflowProtection::moveClonesToEndIfSegmented(Module & M){
 							movePoints.push(&I);
 						}
 					}
-#else
-					if (isSyncPoint(GI)) {
-//						errs() << "    Source instruct:" << I << "\n";
-//						errs() << "    Move point at GI" << *startOfSyncLogic[&I] << "\n";
-						movePoints.push(startOfSyncLogic[&I]);
-					}
-#endif
 				}
 			}
 
@@ -685,76 +863,82 @@ void dataflowProtection::moveClonesToEndIfSegmented(Module & M){
 
 			//Move all clones before the sync points
 			for(auto & I : bb) {
+#ifdef DEBUG_INST_MOVING
+				if (flag) {
+					errs() << I << "\n";
+				}
+#endif
 				//see if it's a clone
-				if(PHINode* PN = dyn_cast<PHINode>(&I)){
+				if (PHINode* PN = dyn_cast<PHINode>(&I)) {
 					//don't move it, phi nodes must be at the start
-#ifdef SYNC_POINT_FIX
-				} else if ( (getClone(&I).first != &I) && !(isSyncPoint(&I)) ) {
-#else
-				} else if (getClone(&I).first != &I) {
+				} else if ( (getClone(&I).first != &I) && !(isSyncPoint(&I))
+#ifdef FIX_STORE_SEGMENTING
+						&& !(isStoreMovePoint(dyn_cast<StoreInst>(&I)))
 #endif
+						&& !(isCallMovePoint(dyn_cast<CallInst>(&I)))
+						/* could also check if it's the head of the list */
+				) {
 					Instruction* cloneI1 = dyn_cast<Instruction>(getClone(&I).first);
 					listI1.push_back(cloneI1);
-					if(TMR){
+					if (TMR) {
 						Instruction* cloneI2 = dyn_cast<Instruction>(getClone(&I).second);
 						listI2.push_back(cloneI2);
 					}
 
-#ifdef SYNC_POINT_FIX
 				}
-				// this is a separate condition, not dependent on it being a cloned instruction
-				if(&I == movePoints.front()){
-#else
-				} else if(&I == movePoints.front()){
-#endif
+
+				if (&I == movePoints.front()) {
 					Instruction* inst = movePoints.front();
-					for(auto it : listI1){
+					for (auto it : listI1) {
 						it->moveBefore(movePoints.front());
 					}
 					listI1.clear();
 
-					for(auto it2 : listI2){
+					for (auto it2 : listI2) {
 						it2->moveBefore(movePoints.front());
 					}
 					listI2.clear();
 
 					movePoints.pop();
 
+#ifdef DEBUG_INST_MOVING
+					if (flag) {
+						errs() << bb << "\n";
+					}
+#endif
 				}
 			}
 
 
 			//Move all sync logic to before the branch
-			if(!TMR || ReportErrorsFlag){
-				if(syncCheckMap.find(&bb) != syncCheckMap.end()){ //If block has been split
+			if (!TMR || ReportErrorsFlag) {
+				if (syncCheckMap.find(&bb) != syncCheckMap.end()) { //If block has been split
 
 					Instruction* cmpInst = syncCheckMap[&bb]; //Get instruction block split on
 					assert(cmpInst && "Block split and the cmpInst stuck around");
 					cmpInst->moveBefore(cmpInst->getParent()->getTerminator());
 
-					if(syncHelperMap.find(&bb) != syncHelperMap.end()){ //Move logic before it
-						for(auto I : syncHelperMap[&bb]){
+					if (syncHelperMap.find(&bb) != syncHelperMap.end()) { //Move logic before it
+						for (auto I : syncHelperMap[&bb]) {
 							assert(I && "Moving valid instructions\n");
 							I->moveBefore(cmpInst);
 						}
 					}
 
 					//if there are SIMD instructions, need to move the special compare operators
-					if(simdMap.find(cmpInst) != simdMap.end()){
+					if (simdMap.find(cmpInst) != simdMap.end()) {
 						std::get<0>(simdMap[cmpInst])->moveBefore(cmpInst->getParent()->getTerminator());
 						std::get<1>(simdMap[cmpInst])->moveBefore(cmpInst->getParent()->getTerminator());
 						std::get<2>(simdMap[cmpInst])->moveBefore(cmpInst->getParent()->getTerminator());
 					}
 				}
 			}
-			//cleanup for some things
-			//in case didn't get cleared earlier
-			listI1.clear();
-			listI2.clear();
-			//empty the queue
-			while (!movePoints.empty())
-				movePoints.pop();
 
+#ifdef DEBUG_INST_MOVING
+			if (flag) {
+				flag = 0;
+			}
+#endif
 		}
 	}
 }
@@ -785,16 +969,16 @@ int dataflowProtection::getArrayTypeElementBitWidth(Module & M, ArrayType * arra
 
 }
 
-void dataflowProtection::recursivelyVisitCalls(Module& M, Function* F, std::set<Function*> &functionList){
+void dataflowProtection::recursivelyVisitCalls(Module& M, Function* F, std::set<Function*> &functionList) {
 	//If we've already deleted this function from the list
-	if(functionList.find(F)==functionList.end())
+	if (functionList.find(F)==functionList.end())
 		return;
 
 	functionList.erase(F);
 
-	for(auto & bb : *F){
-		for(auto & I : bb){
-			if(CallInst* CI = dyn_cast<CallInst>(&I)){
+	for (auto & bb : *F) {
+		for (auto & I : bb) {
+			if (CallInst* CI = dyn_cast<CallInst>(&I)) {
 				recursivelyVisitCalls(M,CI->getCalledFunction(),functionList);
 			}
 		}
@@ -802,7 +986,8 @@ void dataflowProtection::recursivelyVisitCalls(Module& M, Function* F, std::set<
 
 }
 
-bool dataflowProtection::isISR(Function& F){
+//TODO: this is not sound logic
+bool dataflowProtection::isISR(Function& F) {
 	bool ans = F.getName().endswith("ISR") || F.getName().endswith("isr");
 	return ans;
 }
@@ -810,10 +995,244 @@ bool dataflowProtection::isISR(Function& F){
 //----------------------------------------------------------------------------//
 // Synchronization utilities
 //----------------------------------------------------------------------------//
-bool dataflowProtection::isSyncPoint(Instruction* I){
-	if(isa<StoreInst>(I) || isa<CallInst>(I) || isa<TerminatorInst>(I) || isa<GetElementPtrInst>(I))
+bool dataflowProtection::isSyncPoint(Instruction* I) {
+	if (isa<StoreInst>(I) || isa<CallInst>(I) || isa<TerminatorInst>(I) || isa<GetElementPtrInst>(I))
 		return std::find(syncPoints.begin(), syncPoints.end(), I) != syncPoints.end();
 	else
 		return false;
 }
 
+#ifdef FIX_STORE_SEGMENTING
+bool dataflowProtection::isStoreMovePoint(StoreInst* SI) {
+	if ( 	(getClone(SI).first == SI) ||						/* Doesn't have a clone */
+			(SI->getOperand(0)->getType()->isPointerTy()) ||	/* Storing a pointer type */
+			(dyn_cast<PtrToIntInst>(SI->getOperand(0))) ) 		/* Casted pointer */
+	{
+		return false;
+	}
+	// otherwise, we need to segment them together
+	else
+		return true;
+}
+#endif
+
+bool dataflowProtection::isCallMovePoint(CallInst* ci) {
+	if ( (getClone(ci)).first == ci) {
+		return false;
+	} else {
+		return true;
+	}
+}
+
+/*
+ * returns true if this will try to sync on a coarse-grained function return value
+ * these should be avoided for things like the case of malloc()
+ * if returns false, then it's OK to sync on the value
+ */
+bool dataflowProtection::checkCoarseSync(StoreInst* inst) {
+	//need to check for if this value came from a replicated function call
+	Value* op0 = inst->getOperand(0);
+	if (CallInst* CI = dyn_cast<CallInst>(op0)) {
+		Function* calledF = CI->getCalledFunction();
+		if (calledF && (std::find(coarseGrainedUserFunctions.begin(), coarseGrainedUserFunctions.end(),
+				calledF->getName()) != coarseGrainedUserFunctions.end()) ) {
+			//then we've got a coarse-grained value
+			return true;
+		}
+	} else if (InvokeInst* II = dyn_cast<InvokeInst>(op0)) {
+		Function* calledF = II->getCalledFunction();
+		if (calledF && (std::find(coarseGrainedUserFunctions.begin(), coarseGrainedUserFunctions.end(),
+				calledF->getName()) != coarseGrainedUserFunctions.end()) ) {
+			//again
+			return true;
+		}
+	}
+	return false;
+}
+
+//visit all uses of an instruction and see if they are also instructions to add to clone list
+void dataflowProtection::walkInstructionUses(Instruction* I, bool xMR) {
+
+	//add it to clone or skip list, depending on annotation, passed through argument xMR
+	std::set<Instruction*> * addSet;
+	if (xMR) {
+		addSet = &instsToCloneAnno;
+	} else {
+		addSet = &instsToSkip;
+	}
+
+	for (auto U : I->users()) {
+		if (auto instUse = dyn_cast<Instruction>(U)) {
+			CallInst* CI = dyn_cast<CallInst>(instUse);
+			StoreInst* SI = dyn_cast<StoreInst>(instUse);
+			PHINode* phiInst = dyn_cast<PHINode>(instUse);
+
+			//should we add it to the list?
+			if (phiInst) {
+				;
+			} else if (CI) {
+				//skip all call instructions for now
+				;
+			} else if (TerminatorInst* TI = dyn_cast<TerminatorInst>(instUse)) {
+				//this should become a syncpoint
+				// really? needs more testing
+//				if (xMR) syncPoints.push_back(instUse);
+			} else if (SI && (noMemReplicationFlag) ) {
+				//don't replicate store instructions if flags
+				//also, this will become a syncpoint
+//				if (xMR) syncPoints.push_back(instUse);
+			} else {
+				addSet->insert(instUse);
+//				errs() << *instUse << "\n";
+			}
+
+			//should we visit its uses?
+			//as long as it has more than 1 uses
+			if ( (instUse->getNumUses() > 0) && !phiInst) {
+				//recursive call
+				walkInstructionUses(instUse, xMR);
+			}
+
+		}
+	}
+}
+
+/*
+ * verify that all of the options used to configure COAST for this pass are safe to follow
+ */
+void dataflowProtection::verifyOptions(Module& M) {
+	std::map< GlobalVariable*, std::set<Function*> > glblFnMap;
+
+	// check that the globals being cloned are only used in protected functions
+	for (auto g : globalsToClone) {
+		// get all the users
+		for (auto u : g->users()) {
+			// is it an instruction?
+			if (Instruction* UI = dyn_cast<Instruction>(u)) {
+				Function* parentF = UI->getParent()->getParent();
+
+				// have we been asked to skip it?
+				if (globalCrossMap.find(g) != globalCrossMap.end()) {
+					if (globalCrossMap[g] == parentF) {
+						// skip if it's the marked function
+						continue;
+					}
+				}
+
+				// is the instruction in a protected function?
+				if (fnsToClone.find(parentF) == fnsToClone.end()) {
+					if (glblFnMap.find(g) == glblFnMap.end()) {
+						std::set<Function*> tempSet;
+						glblFnMap[g] = tempSet;
+					}
+
+					glblFnMap[g].insert(parentF);
+				}
+
+			}
+		}
+	}
+
+	// print warning messages
+	for (auto item : glblFnMap) {
+		errs() << err_string << " global \"" << item.first->getName() << "\"\n\tused in functions: ";
+		for (auto fns : item.second) {
+			errs() << "\"" << fns->getName() << "\", ";
+		}
+		errs() << "\nwhich are not protected\n";
+	}
+
+	if (glblFnMap.size() > 0) {
+		std::exit(-1);
+	}
+
+}
+
+
+void dataflowProtection::updateFnWrappers(Module& M) {
+	std::string wrapperFnEnding = "_COAST_WRAPPER";
+	// have to create a map and edit afterwards; editing users while iterating over them is a bad idea
+	std::map<Function*, Function*> wrapperMap;
+	std::set<Function*> wrapperFns;
+
+	// update fn replication wrappers
+	for (auto &fn : M) {
+		StringRef fnName = fn.getName();
+		// this should end with wrapperFnEnding
+		if (fnName.endswith(wrapperFnEnding)) {
+			wrapperFns.insert(&fn);
+
+			// find the matching function name
+			StringRef normalFnName = fnName.substr(0, fnName.size() - wrapperFnEnding.size());
+			Constant* fnC = M.getOrInsertFunction(normalFnName, fn.getFunctionType());
+			if (!fnC) {
+				errs() << "Matching function call to '" << normalFnName << "' doesn't exist!\n";
+				exit(-1);
+			}
+			else {
+				if (verboseFlag)
+					errs() << info_string << " Found wrapper match: '" << normalFnName << "'\n";
+			}
+
+			Function* normalFn = dyn_cast<Function>(fnC);
+			wrapperMap[&fn] = normalFn;
+
+			// find all CallInsts with target of fnName function
+//			for (auto u : fn.users()) {
+//				if (CallInst* uc = dyn_cast<CallInst>(u)) {
+//					wrapperMap[uc] = normalFn;
+//				} else if (BitCastInst* bc = dyn_cast<BitCastInst>(u)) {
+//					wrapperMap[uc] = normalFn;
+//				}
+//			}
+		}
+	}
+
+	for (auto &fn : M) {
+		for (auto &bb: fn) {
+			for (auto &I : bb) {
+
+				//look for call instructions
+				if (CallInst* ci = dyn_cast<CallInst>(&I)) {
+
+					auto op0 = ci->getOperand(0);
+					Function* calledF;
+
+					Value* v = ci->getCalledValue();
+					calledF = dyn_cast<Function>(v->stripPointerCasts());
+					auto found = wrapperMap.find(calledF);
+
+					if (found != wrapperMap.end()) {
+//						errs() << "-" << *ci << "\n";
+
+						if (dyn_cast<Function>(v)) {
+							ci->setCalledFunction(found->second);
+//							errs() << " -" << *ci << "\n";
+							// duplicate this call, but only if it's in the list of functions to clone
+							if (fnsToClone.find(&fn) != fnsToClone.end()) {
+								instsToCloneAnno.insert(ci);
+								wrapperInsts.insert(ci);
+							}
+						} else if (BitCastOperator* bco = dyn_cast<BitCastOperator>(v)) {
+							errs() << err_string << " wrapper function has bad signature, it has been bitcasted in the call, which is not supported.\n";
+							errs() << *bco << "\n";
+							errs() << *bco->getOperand(0) << "\n";
+							errs() << *(v->stripPointerCasts()) << "\n";
+							ci->eraseFromParent();
+						}
+					}
+				}
+			}
+		}
+	}
+
+	for (auto fn : wrapperFns) {
+		// remove unused wrapper functions
+		if (fn->getNumUses() > 0) {
+			errs() << "Missed replacing function call for " << fn->getName() << "\n";
+			errs() << *(*fn->user_begin()) << "\n";
+			assert(false);
+		}
+		fn->eraseFromParent();
+	}
+}
diff --git a/projects/debugStatements/debugStatements.cpp b/projects/debugStatements/debugStatements.cpp
index 768123bbc..1f00fc770 100644
--- a/projects/debugStatements/debugStatements.cpp
+++ b/projects/debugStatements/debugStatements.cpp
@@ -17,6 +17,10 @@
 
 using namespace llvm;
 
+// list of functions to add print statements to
+// if nothing, do all
+cl::list<std::string> fnPrintList("fnPrintList", cl::desc("Specify functions to instrument. Defaults to all."), cl::CommaSeparated, cl::ZeroOrMore);
+
 
 //--------------------------------------------------------------------------//
 // Top level behavior
@@ -52,10 +56,18 @@ bool DebugStatements::runOnModule(Module &M) {
 	StringRef arrow = StringRef("-->");
 	StringRef newLineChar = StringRef("\n");
 
+	bool specificFlag = (fnPrintList.size() > 0);
 
 	for (auto &F : M) {
-		if(F.getBasicBlockList().size()  == 0)
+		if (F.getBasicBlockList().size()  == 0)
+			continue;
+
+		// if there's something in the list, and this function isn't, continue
+		if (specificFlag && (std::find(fnPrintList.begin(), fnPrintList.end(), F.getName().str()) == fnPrintList.end()) ) {
 			continue;
+		}
+//		errs() << F.getName() << "\n";
+
 		BasicBlock* entryBlock = &F.getEntryBlock();
 
 		//Variable def'ns
diff --git a/projects/smallProfile/CMakeLists.txt b/projects/smallProfile/CMakeLists.txt
new file mode 100644
index 000000000..85ae5a45c
--- /dev/null
+++ b/projects/smallProfile/CMakeLists.txt
@@ -0,0 +1,5 @@
+cmake_minimum_required(VERSION 3.5)
+
+add_llvm_loadable_module(SmallProfile
+	smallProfile.cpp
+	)
diff --git a/projects/smallProfile/smallProfile.cpp b/projects/smallProfile/smallProfile.cpp
new file mode 100644
index 000000000..8be36514b
--- /dev/null
+++ b/projects/smallProfile/smallProfile.cpp
@@ -0,0 +1,384 @@
+/*
+ * smallProfile.cpp
+ *
+ * Instruments code to count and report number of calls to each function
+ *
+ * Copyright BYU CCL
+ * August 2019
+ */
+
+#define DEBUG_TYPE "debugStatements"
+
+#include <string>
+#include <set>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/Instructions.h>
+#include "llvm/Support/raw_ostream.h"
+#include <llvm/IR/Constants.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/CommandLine.h>
+
+using namespace llvm;
+
+//--------------------------------------------------------------------------//
+// Command line options for the pass
+//--------------------------------------------------------------------------//
+cl::opt<std::string> printFnNameCl("printFnName", cl::desc("Name of printing function"));
+cl::opt<bool> noPrintFlag ("noPrint", cl::desc("Does not insert call to profile printing function"));
+
+
+//--------------------------------------------------------------------------//
+// Class spec
+//--------------------------------------------------------------------------//
+class SmallProfile : public ModulePass {
+public:
+	static char ID;             // Pass identification
+	SmallProfile() : ModulePass(ID) { }
+
+	bool runOnModule(Module &M);
+	void profileLocalFunctions(Module &M);
+	void profileExternalFunctions(Module &M);
+
+	void insertProfilePrintFunction(Module &M);
+	Function* createProfilePrintFunction(Module &M);
+	GetElementPtrInst* getGEPforPrint(Module &M, StringRef* varName, BasicBlock*& bb);
+	Function* getPrintFunction(Module &M);
+
+	GlobalVariable* createGlobalCounter(Module &M, Function* fn);
+	void incrementCounter(GlobalVariable* cntr, Instruction* insertHere, bool extCall);
+
+private:
+	// constant strings
+	StringRef newLineChar = StringRef("\n");
+	StringRef formatInt = StringRef(": %d");
+
+	// important pointers
+	Function* mainFunc = nullptr;
+	ReturnInst* mainReturn = nullptr;
+	Type* type_i32 = nullptr;
+
+	// containers
+	std::vector<std::pair<Function*, GlobalVariable*> > profPairs;
+	std::set<Function*> funcsToLookFor;
+};
+
+char SmallProfile::ID = 0;
+static RegisterPass<SmallProfile> X("SmallProfile",
+		"Insert profiling instructions into the IR", false, true);
+
+
+//--------------------------------------------------------------------------//
+// Functions
+//--------------------------------------------------------------------------//
+
+/*
+ * Get a reference to the print function
+ */
+Function* SmallProfile::getPrintFunction(Module &M) {
+
+	// types we'll need
+	Type *charPointerType = PointerType::get(IntegerType::get(M.getContext(), 8), 0);
+	Type* type_i32 = Type::getInt32Ty(M.getContext());
+	FunctionType *printfTy = FunctionType::get(type_i32, std::vector<Type*> (1, charPointerType), true);
+
+	// name of the print function
+	std::string printFnName;
+	if (printFnNameCl != "") {
+		printFnName = printFnNameCl;
+	} else {
+		printFnName = "printf";
+	}
+
+	// make the function
+	Constant* printfc = M.getOrInsertFunction(printFnName, printfTy);
+	Function* print = dyn_cast<Function>(printfc);
+	assert(print && "Print function not defined");
+
+	return print;
+}
+
+/*
+ * insert instructions to increment global into entry block of functions we have the body of
+ */
+void SmallProfile::profileLocalFunctions(Module &M) {
+
+	std::string glblNm;
+
+	for (auto &F : M) {
+		StringRef fnName = F.getName();
+		// skip the debug information function calls
+		if (fnName.startswith_lower("llvm.dbg") || fnName.startswith_lower("llvm.lifetime."))
+			continue;
+
+		// external function calls
+		if(F.getBasicBlockList().size() == 0) {
+			// if we don't have the function body, we'll have to look at the calls of it instead
+			funcsToLookFor.insert(&F);
+		} else {
+
+			// create a global to increment for each function
+			GlobalVariable* nextCnt = createGlobalCounter(M, &F);
+
+			// Add calls to increment counter
+			Instruction* insertHere = F.getEntryBlock().getFirstNonPHIOrDbg();
+			if (isa<LandingPadInst>(insertHere)) {
+				insertHere = insertHere->getNextNode();
+			}
+			incrementCounter(nextCnt, insertHere, false);
+		}
+
+		// get the spot for printing at the end
+		if (F.getName() == "main") {
+			mainFunc = &F;
+
+			//find the last instruction
+			for(auto &bb : F){
+				if(ReturnInst* RI = dyn_cast<ReturnInst>(bb.getTerminator())){
+					mainReturn = RI;
+				}
+			}
+		}
+	}
+
+	return;
+}
+
+/*
+ * for functions that we don't have the body of, instructions to increment globals must come right
+ * before the function call
+ */
+void SmallProfile::profileExternalFunctions(Module &M) {
+
+	std::string glblNm;
+
+	// add profiling statements to all calls of external functions
+	for (auto &F : M) {
+		for (auto &bb : F) {
+			for (auto &I : bb) {
+				if (CallInst* CI = dyn_cast<CallInst>(&I)) {
+					Function* calledF = CI->getCalledFunction();
+					// skip function pointers, sorry
+					if (!calledF)
+						continue;
+
+					if (funcsToLookFor.find(calledF) != funcsToLookFor.end()) {
+						// create a global, unless it exists already
+						GlobalVariable* nextCnt = createGlobalCounter(M, calledF);
+
+						// Add calls to increment counter
+						incrementCounter(nextCnt, CI, true);
+					}
+				}
+			}
+		}
+	}
+
+	return;
+}
+
+/*
+ * Right before the program completes, print the value of each of the global counters.
+ * If your program never "finishes," but runs forever, you can insert a call to
+ * PRINT_PROFILE_STATS wherever you want this to be printed.
+ */
+void SmallProfile::insertProfilePrintFunction(Module &M) {
+
+	Function* printFn = getPrintFunction(M);
+
+	// create the stats function
+	Function* printStatsFn = createProfilePrintFunction(M);
+
+	// don't print if there is no main function, or if flag said not to
+	if (mainReturn == nullptr || noPrintFlag) {
+		errs() << "\033[0;33mSmallProfile\033[0m: skipping inserting call to " << printStatsFn->getName() << "\n";
+	} else {
+		// insert a function call to stats function
+		Twine callName = Twine("callStats");
+		ArrayRef<Value*> statsCallArgs;
+		CallInst* statsCall = CallInst::Create(printStatsFn, "", mainReturn);
+//		Idea: http://www.cplusplus.com/reference/cstdlib/atexit/
+	}
+
+	// holds args for the call instructions
+	std::vector<Value*> sepArgs;
+	std::vector<Value*> newlineArgs;
+	BasicBlock* entryBlock = &(printStatsFn->getEntryBlock());
+
+	// Arguments for printing a new line
+	GetElementPtrInst* newlineGEP = getGEPforPrint(M, &newLineChar, entryBlock);
+	newlineArgs.push_back(newlineGEP);
+	ArrayRef<Value*>* callArgsNewline = new ArrayRef<Value*>(newlineArgs);
+	// For printing the count
+	GetElementPtrInst* decGEP = getGEPforPrint(M, &formatInt, entryBlock);
+
+	// where to put instructions
+	Instruction* insertionPoint = entryBlock->getFirstNonPHI();
+	Instruction* returnPoint = entryBlock->getTerminator();
+	if (isa<LandingPadInst>(insertionPoint)) {
+		insertionPoint = insertionPoint->getNextNode();
+	}
+
+	//Move all the GEPs to the front of the entry block to dominate all uses
+	newlineGEP->moveBefore(insertionPoint);
+	decGEP->moveBefore(newlineGEP);
+
+	for (auto p : profPairs) {
+		//Variable def'ns
+		Function* F = p.first;
+		StringRef fnName = F->getName();
+		std::vector<Value*> fnArgs;
+		std::vector<Value*> cntArgs;
+
+		// Arguments for printing the function name
+		GetElementPtrInst* fnGEP = getGEPforPrint(M, &fnName, entryBlock);
+		fnArgs.push_back(fnGEP);
+		ArrayRef<Value*>* fnCallArgs = new ArrayRef<Value*>(fnArgs);
+
+		// Arguments for printing the profile count
+		cntArgs.push_back(decGEP);
+		LoadInst* LI = new LoadInst(p.second, "glblLoad");
+		cntArgs.push_back(LI);
+		ArrayRef<Value*>* cntCallArgs = new ArrayRef<Value*>(cntArgs);
+
+		//Create all the function calls: function arrow bb newline
+		CallInst* newlinePrint = CallInst::Create(printFn, *callArgsNewline, "", returnPoint);
+		CallInst* cntPrint = CallInst::Create(printFn, *cntCallArgs, "", newlinePrint);
+		CallInst* fnNamePrint = CallInst::Create(printFn, *fnCallArgs, "", cntPrint);
+
+		LI->insertBefore(fnNamePrint);
+		fnGEP->moveBefore(insertionPoint);
+	}
+
+	return;
+}
+
+/*
+ * Creates a function that will hold all of the printing calls.
+ */
+Function* SmallProfile::createProfilePrintFunction(Module &M) {
+
+	FunctionType* statsCallType = FunctionType::get(Type::getVoidTy(M.getContext()), false);
+	Constant* c = M.getOrInsertFunction("PRINT_PROFILE_STATS", statsCallType);
+	Function* printStatsFn = dyn_cast<Function>(c);
+	assert(printStatsFn && "Profiling function is non-void");
+
+	// Create a basic block that holds all the print functions, as long as it doesn't exist already
+	if (printStatsFn->getBasicBlockList().size() == 0) {
+		BasicBlock* bbe = BasicBlock::Create(M.getContext(), Twine("entry"), printStatsFn);
+		ReturnInst* statsRet = ReturnInst::Create(M.getContext(), bbe);
+	}
+
+	return printStatsFn;
+}
+
+/*
+ * Creates a global variable that tracks function calls based on the name of the function.
+ * This checks to see if one exists before creating it.
+ */
+GlobalVariable* SmallProfile::createGlobalCounter(Module &M, Function* fn) {
+
+	// the name of the global variable
+	std::string glblNm;
+	StringRef fnName = fn->getName();
+	glblNm = "__" + fnName.str() + "_profCnt";
+
+	// create the global
+	GlobalVariable* nextCnt = M.getGlobalVariable(StringRef(glblNm));
+
+	if (nextCnt == nullptr) {
+		nextCnt = cast<GlobalVariable>(M.getOrInsertGlobal(StringRef(glblNm), type_i32));
+
+		// set the correct attributes, making it local instead of extern
+		nextCnt->setConstant(false);
+		nextCnt->setInitializer(ConstantInt::getNullValue(type_i32));
+		nextCnt->setUnnamedAddr( GlobalValue::UnnamedAddr() );
+		nextCnt->setAlignment(4);
+
+		// add to list for later
+		std::pair<Function*, GlobalVariable*> tmpPair = std::make_pair(fn, nextCnt);
+		profPairs.push_back(tmpPair);
+	}
+
+	assert(nextCnt && "Global variable counter exists");
+	return nextCnt;
+}
+
+/*
+ * Create instructions that increment a global variable and inserts them at the specified point.
+ */
+void SmallProfile::incrementCounter(GlobalVariable* cntr, Instruction* insertHere, bool extCall) {
+
+	LoadInst* LI = new LoadInst(cntr, "cntLoad");
+	Constant* one = ConstantInt::get(LI->getType(), 1, false);
+	BinaryOperator* BI = BinaryOperator::CreateAdd(LI, one, "incCnt");
+	StoreInst* SI = new StoreInst(BI, cntr);
+	LI->insertBefore(insertHere);
+	BI->insertAfter(LI);
+	SI->insertAfter(BI);
+
+	if (extCall) {
+		//if there's debug information for the call, copy it to new instructions
+		if (auto dbgLoc = insertHere->getDebugLoc()) {
+			LI->setDebugLoc(dbgLoc);
+			BI->setDebugLoc(dbgLoc);
+			SI->setDebugLoc(dbgLoc);
+		}
+	}
+
+	return;
+}
+
+/*
+ * creates the correct GEP instruction to load a global string for printing
+ */
+GetElementPtrInst* SmallProfile::getGEPforPrint(Module &M, StringRef* varName, BasicBlock*& bb){
+
+	Type* type_i8 = Type::getInt8Ty(M.getContext());
+
+	//Create a char array (i8s)
+	ArrayType * type_i8_array = ArrayType::get(type_i8,(unsigned long long int)(varName->size()+1));
+	Constant * dataInit = ConstantDataArray::getString(M.getContext(), *varName);
+
+	//Create a global variable and init to the array of i8s
+	GlobalVariable * globalVal = dyn_cast<GlobalVariable>(
+			M.getOrInsertGlobal(*varName, type_i8_array));
+	globalVal->setConstant(true);
+	globalVal->setInitializer(dataInit);
+	globalVal->setLinkage(GlobalVariable::PrivateLinkage);
+	globalVal->setUnnamedAddr( GlobalValue::UnnamedAddr() );
+	globalVal->setAlignment(1);
+
+
+	//Create constants for GEP arguments
+	ConstantInt* zeroCI = ConstantInt::get(IntegerType::getInt32Ty(M.getContext()),0,false);
+	Value* zeroVal = dyn_cast<Value>(zeroCI);
+
+	//Assemble the GEP instruction
+	std::vector<Value*> gepArgs;
+	gepArgs.push_back(zeroVal);
+	gepArgs.push_back(zeroVal);
+	ArrayRef<Value*>* gepArgsArray;
+	gepArgsArray = new ArrayRef<Value*>(gepArgs);
+
+	//Insert the instruction into basic block
+	GetElementPtrInst* gep = GetElementPtrInst::CreateInBounds(type_i8_array,
+			globalVal,*gepArgsArray,varName->str(),bb->getTerminator());
+
+	return gep;
+}
+
+/*
+ * Top level function that controls the pass
+ */
+bool SmallProfile::runOnModule(Module &M) {
+
+	this->type_i32 = Type::getInt32Ty(M.getContext());
+
+	profileLocalFunctions(M);
+
+	profileExternalFunctions(M);
+
+	insertProfilePrintFunction(M);
+
+	return true;
+}
diff --git a/tests/.gitignore b/tests/.gitignore
index 4debdb849..f19bda4ba 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1,5 +1,7 @@
 *.ll
 *.bc
+*.lbc
+*.bcpp
 *.o
 *.elf
 *.map
@@ -9,4 +11,4 @@ build
 *.d_raw
 *.log
 a.out
-*.out
\ No newline at end of file
+*.out
diff --git a/tests/COAST.h b/tests/COAST.h
index 9951d576e..448f51969 100644
--- a/tests/COAST.h
+++ b/tests/COAST.h
@@ -10,9 +10,29 @@
 
 //Macro for function calls - same as replicateFnCalls
 #define __xMR_FN_CALL __attribute__((annotate("xMR_call")))
+//same as skipLibCalls
+#define __SKIP_FN_CALL __attribute__((annotate("coast_call_once")))
 
-//Macros to set the default behavior of the code 
+//Macros to set the default behavior of the code
 #define __DEFAULT_xMR int __attribute__((annotate("set_xMR_default"))) __xMR_DEFAULT_BEHAVIOR__;
 #define __DEFAULT_NO_xMR int __attribute__((annotate("set_no_xMR_default"))) __xMR_DEFAULT_BEHAVIOR__;
 
+//The variable should not be optimized away
+#define __COAST_VOLATILE __attribute__((annotate("coast_volatile")))
+
+//register a function as one which wraps malloc()
+#define MALLOC_WRAPPER_REGISTER(fname) void* fname##_COAST_WRAPPER(size_t size);
+#define MALLOC_WRAPPER_CALL(fname, x) fname##_COAST_WRAPPER((x))
+
+// also one which wraps printf, or something like it
+#define PRINTF_WRAPPER_REGISTER(fname) int fname##_COAST_WRAPPER(const char* format, ...);
+#define PRINTF_WRAPPER_CALL(fname, fmt, ...) fname##_COAST_WRAPPER(fmt, __VA_ARGS__)
+
+#define GENERIC_COAST_WRAPPER(fname) fname##_COAST_WRAPPER
+
+// COAST normally checks that a replicated global is used only in
+//  protected functions.  This is a directive that goes right before
+//  a function, with the name of the global to ignore boundary crossing
+#define __COAST_IGNORE_GLOBAL(name) __attribute__((annotate("no-verify-"#name)))
+
 #endif
diff --git a/tests/TMRregression/Makefile b/tests/TMRregression/Makefile
index d80224e13..bda6a71a7 100644
--- a/tests/TMRregression/Makefile
+++ b/tests/TMRregression/Makefile
@@ -24,7 +24,7 @@ CPPFLAGS := -stdlib=libc++ -I/usr/include/c++/5.4.0/ -I/usr/include/x86_64-linux
 # CPPSTD - C++ standard to use when compiling files, default is c++11
 ###########################################################
 
-LLVMROOT = $(HOME)/llvm
+LLVMROOT = $(HOME)/coast
 CLANG = $(LLVMROOT)/build/bin/clang
 CLANG++ = $(LLVMROOT)/build/bin/clang++
 LLVMDIS = $(LLVMROOT)/build/bin/llvm-dis
@@ -97,10 +97,13 @@ $(TARGET).clang.bc: $(BCFILES) $(BCPPFILES)
 %.bcpp: $(SRCFOLDER)/%.cpp
 	$(CLANG++) -std=$(CPPSTD) -emit-llvm $(CPPFLAGS) $< -c -o $@
 
-.PHONY: clean
+.PHONY: clean small_clean
 
-clean:
-	@rm -f *.bc *.bcpp *.s *.ll $(TARGET)
+small_clean:
+	@rm -f *.bc *.bcpp *.s $(TARGET)
+
+clean: small_clean
+	@rm -f *.ll
 
 cfg: $(TARGET).opt.ll $(TARGET).clang.ll
 	@rm -rf cfg
diff --git a/tests/TMRregression/MiBenchTestDriver.py b/tests/TMRregression/MiBenchTestDriver.py
index cb3c12f9a..56f22f84b 100644
--- a/tests/TMRregression/MiBenchTestDriver.py
+++ b/tests/TMRregression/MiBenchTestDriver.py
@@ -269,7 +269,7 @@ def printIntro(logFileName, numConfigurations, verboseFlag):
     if verboseFlag:
         print("**Verbose mode enabled**")
     print("-----------------------------------------------------")
-    print("Tests beginning...", end="\r")
+    print("Tests beginning...", end='\r')
 
 def updateConsoleDisplay(srcFolder, totalRuns, testNum):
     print("Testing in: " + srcFolder)
@@ -284,7 +284,7 @@ def main():
     args = parser.parse_args()
     verboseFlag = args.verbose
 
-    progFolder = os.path.expanduser("~/llvm/tests/TMRregression/")
+    progFolder = os.path.expanduser("~/coast/tests/TMRregression/")
     # the output checker is a separate functionality
     # it will only run that, then exit
     if args.check:
@@ -298,12 +298,12 @@ def main():
     # it is a lot of work to figure out how to extract all of the following data
     #   automatically; it is actually less time consuming to just pull it all
     #   out by hand
-    folder_list = ["~/llvm/tests/MiBench/automotive/basicmath/",
-                    "~/llvm/tests/MiBench/automotive/bitcount/",
-                    "~/llvm/tests/MiBench/automotive/qsort/",
-                    "~/llvm/tests/MiBench/automotive/susan/",
-                    "~/llvm/tests/MiBench/network/dijkstra/",
-                    "~/llvm/tests/MiBench/network/patricia/"]
+    folder_list = ["~/coast/tests/MiBench/automotive/basicmath/",
+                    "~/coast/tests/MiBench/automotive/bitcount/",
+                    "~/coast/tests/MiBench/automotive/qsort/",
+                    "~/coast/tests/MiBench/automotive/susan/",
+                    "~/coast/tests/MiBench/network/dijkstra/",
+                    "~/coast/tests/MiBench/network/patricia/"]
     target_list = [
             ['basicmath_small',
              'basicmath_large'],
diff --git a/tests/TMRregression/TMRregressionTest.py b/tests/TMRregression/TMRregressionTest.py
index 26aabd308..0d44defb0 100644
--- a/tests/TMRregression/TMRregressionTest.py
+++ b/tests/TMRregression/TMRregressionTest.py
@@ -220,16 +220,18 @@ def main():
     logFileName = "regResults-{}-{}-{}-{}.log".format(now.month, now.day, now.hour, now.minute)
     # which pass configurations to run
     # OPTS = ["", "\"-TMR -s\"", "\"-TMR -i\""]
-    OPTS = ["", "\"-DWC -i\"", "\"-DWC -s\"",
-            "\"-TMR -i\"", "\"-TMR -s\"", "\"-CFCSS\"",
-			"\"-CFCSS -DWC -i\"", "\"-CFCSS -DWC -s\"", "\"-DWC -i -CFCSS\"",
-			"\"-DWC -s -CFCSS\"", "\"-CFCSS -TMR -i\"", "\"-CFCSS -TMR -s\"",
-			"\"-TMR -i -CFCSS\"", "\"-TMR -s -CFCSS\"", "\"-TMR -s -countErrors\"",
-			"\"-CFCSS -TMR -s -countErrors\"", "\"-TMR -s -countErrors -CFCSS\"",
-			"\"-DWC -noMemReplication\"", "\"-TMR -noMemReplication\"", # funny rules
+    OPTS = ["", "\"-DWC\"", "\"-TMR\"", "\"-TMR -s -countErrors\"",
+			"\"-DWC -noMemReplication\"", "\"-TMR -noMemReplication\"",
 			"\"-DWC -noLoadSync\"", "\"-TMR -noLoadSync\"",
 			"\"-DWC -noStoreDataSync\"", "\"-TMR -noStoreDataSync\"",
-			"\"-DWC -noStoreAddrSync\"", "\"-TMR -noStoreAddrSync\""]
+			"\"-DWC -noStoreAddrSync\"", "\"-TMR -noStoreAddrSync\"",
+            "\"-DWC -noMemReplication -noLoadSync\"",
+            "\"-TMR -noMemReplication -noLoadSync\"",
+            "\"-DWC -noMemReplication -noStoreDataSync\"",
+            "\"-TMR -noMemReplication -noStoreDataSync\"",
+            "\"-DWC -noMemReplication -noStoreAddrSync\"",
+            "\"-TMR -noMemReplication -noStoreAddrSync\""
+            ]
     # optimization levels passed to clang
     OPT_LEVELS = [" ","-O2 "]
     # base folder for tests
diff --git a/tests/TMRregression/testSingle.sh b/tests/TMRregression/testSingle.sh
index 6555f9cff..b79d5cfe0 100755
--- a/tests/TMRregression/testSingle.sh
+++ b/tests/TMRregression/testSingle.sh
@@ -1,2 +1,2 @@
 rm output.opt.bc
-make OPT_PASSES="-DWC" OPT_FLAGS="" SRCFILES="helloWorld.cpp" SRCFOLDER=./unitTests
+make OPT_PASSES="-DWC" OPT_FLAGS="" SRCFILES="load_store.c" SRCFOLDER=./unitTests
diff --git a/tests/TMRregression/unitTestDriver.py b/tests/TMRregression/unitTestDriver.py
new file mode 100755
index 000000000..2e731236b
--- /dev/null
+++ b/tests/TMRregression/unitTestDriver.py
@@ -0,0 +1,130 @@
+#!/usr/bin/python3
+
+# Instrument tests on the unitTest folder
+# easier to do it here than in the buildbot script
+# NOTE: passing in a list or arguments for dataflowProtection, we normally expect
+#  it to start with a '-', however, argparse doesn't like that. Add an extra space
+#  before your list of arguments
+
+import os
+import sys
+import argparse
+import subprocess
+import shlex
+
+singleFlag = False
+verboseFlag = False
+
+# class that represents a configuration
+class runConfig(object):
+    """docstring for runConfig."""
+    def __init__(self, f, ef=None, xc=None, op=None, nm=None):
+        self.fname = f
+        self.extraFiles = ef
+        self.xcFlg = xc
+        self.optFlg = op
+        self.noMemFlg = nm
+
+# keep this up to date manually
+# dictionary of specific flags for each unitTest
+customConfigs = [
+    runConfig("annotations.c"),
+    runConfig("argSync.c", xc="-O3"),
+    runConfig("atomics.c"),
+    runConfig("basicIR.c"),
+    runConfig("bsearch_strcmp.c"),
+    runConfig("classTest.cpp"),
+    runConfig("exceptions.cpp", \
+        op="-replicateFnCalls=_ZNSt12_Vector_baseIiSaIiEE11_M_allocateEm,_ZSt27__uninitialized_default_n_aIPimiET_S1_T0_RSaIT1_E",  \
+        nm="-ignoreFns=_ZNSt12_Vector_baseIiSaIiEE13_M_deallocateEPim"),
+    runConfig("fSigTypes.c", \
+        ef="fSigTypes_ext.c"),
+    runConfig("helloWorld.cpp"),
+    runConfig("inlining.c", \
+        xc="-O2"),
+    runConfig("load_store.c"),
+    runConfig("mallocTest.c", \
+        nm="-skipLibCalls=free"),
+    runConfig("nestedCalls.c", \
+        op="-replicateFnCalls=memset"),
+    runConfig("ptrArith.c"),
+    runConfig("returnPointer.c"),
+    runConfig("segmenting.c"),
+    runConfig("simd.c", \
+        xc="-O3"),
+    runConfig("structCompare.c"),
+    runConfig("testFuncPtrs.c"),
+    runConfig("time_c.c"),
+    runConfig("vecTest.cpp", \
+        op="-replicateFnCalls=_ZNSt12_Vector_baseIiSaIiEE11_M_allocateEm,_ZSt34__uninitialized_move_if_noexcept_aIPiS0_SaIiEET0_T_S3_S2_RT1_", \
+        nm="-ignoreFns=_ZNSt12_Vector_baseIiSaIiEE13_M_deallocateEPim"),
+    runConfig("verifyOptions.c"),
+    runConfig("whetstone.c"),
+    runConfig("zeroInit.c"),
+]
+
+def run(cfg, config, dir_path):
+    # first clean before compiling
+    clean = subprocess.Popen(['make', '-C', dir_path, 'small_clean'])
+    clean.wait()
+    # now build the test
+    cmd = "make -C {} SRCFOLDER=./unitTests 'SRCFILES={}' 'XCFLAGS={}' 'OPT_PASSES={}'"
+    fls = cfg.fname + " " + cfg.extraFiles if cfg.extraFiles else cfg.fname
+    xcf = cfg.xcFlg if cfg.xcFlg else ""
+    ps = config + " " + cfg.optFlg \
+            if cfg.optFlg else config
+    ps = ps + " " + cfg.noMemFlg \
+            if (cfg.noMemFlg and "noMemReplication" in config) \
+            else ps
+    command = cmd.format(dir_path, fls, xcf, ps)
+    if singleFlag or verboseFlag:
+        print(command)
+    p = subprocess.Popen(shlex.split(command))
+    p.wait()
+    # print(" --- return code: {}".format(p.returncode))
+    return p.returncode
+
+
+def main():
+    global singleFlag, verboseFlag
+    # CL arguments
+    parser = argparse.ArgumentParser(description='Process commands for unit tests')
+    parser.add_argument('config', help='configuration, without any file-specific flags')
+    parser.add_argument('--single-run', '-s', help='Run only one file')
+    parser.add_argument('--verbose', '-v', help='extra output', action='store_true')
+    args = parser.parse_args()
+
+    if args.verbose:
+        verboseFlag = True
+
+    # test dir
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+
+    if args.single_run:
+        single = [x for x in customConfigs if x.fname == args.single_run]
+        returnVal = 0
+        if len(single) > 0:
+            singleFlag = True
+            returnVal = run(single[0], args.config.lstrip(), dir_path)
+        else:
+            print("File name not found!")
+        return returnVal
+    else:
+        for cfg in customConfigs:
+            returnVal = run(cfg, args.config.lstrip(), dir_path)
+            if returnVal != 0:
+                if cfg.fname == "verifyOptions.c":
+                    # this shouldn't be tested for success, because it's
+                    #  supposed to fail
+                    continue
+                else:
+                    return returnVal
+    # clean one more time if we did all of them
+    clean = subprocess.Popen(['make', '-C', dir_path, 'clean'])
+    clean.wait()
+    return returnVal
+
+
+if __name__ == "__main__":
+    returnVal = main()
+    sys.exit(returnVal)
diff --git a/tests/TMRregression/unitTests/annotations.c b/tests/TMRregression/unitTests/annotations.c
new file mode 100644
index 000000000..bc5a290e9
--- /dev/null
+++ b/tests/TMRregression/unitTests/annotations.c
@@ -0,0 +1,64 @@
+/*
+ * annotations.c
+ * This file created to test how COAST treats local variables that have been
+ *  annotated to be xMR'd.
+ * The most important requirement is that it actually syncs on the values.
+ * We add in some dynamically allocated structs as examples of things we
+ *  wouldn't want to xMR
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "../../COAST.h"
+__DEFAULT_NO_xMR
+
+//struct for holding data
+typedef struct _data_struct {
+    int d0;
+    int d1;
+} data_t;
+
+
+// arbitrary math operations
+int doMath(data_t* d, int a) {
+    int __xMR x, y;
+    int __xMR result;
+    x = d->d0 * a;
+    y = d->d1 / a;
+    result = ((x + y) >> 2) | 0x01;
+    return result;
+}
+
+int moreMath(int a, int b) __xMR {
+    int __NO_xMR p = 2;
+    int q = 4;
+    return (a * p) + (b << q);
+}
+
+
+int main() {
+    int __xMR result;
+    int status = 0;
+
+    data_t* myData = (data_t*) malloc(sizeof(data_t));
+    myData->d0 = 21;
+    myData->d1 = 47;
+    //expected result: 17
+
+    result = doMath(myData, 2);
+    printf("Result = %d\n", result);
+    if (result != 17) {
+        printf("Error!\n");
+        status |= -1;
+    }
+
+    //expected result: 52
+    result = moreMath(2, 3);
+    printf("Result = %d\n", result);
+    if (result != 52) {
+        printf("Error!\n");
+        status |= -1;
+    }
+
+    return status;
+}
diff --git a/tests/TMRregression/unitTests/argSync.c b/tests/TMRregression/unitTests/argSync.c
new file mode 100644
index 000000000..7ebf00a49
--- /dev/null
+++ b/tests/TMRregression/unitTests/argSync.c
@@ -0,0 +1,44 @@
+/*
+ * In processCallSync(), assert for number of users of original in function call
+ * is it OK to have more than 2 uses if it's an argument?
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "../../COAST.h"
+__DEFAULT_NO_xMR
+
+// borrowed from FFT benchmark in MiBench
+// run with -O3, but not inlined
+unsigned __attribute((noinline)) NumberOfBitsNeeded ( unsigned PowerOfTwo ) __xMR
+{
+    unsigned i;
+
+    if ( PowerOfTwo < 2 )
+    {
+        fprintf (
+            stderr,
+            ">>> Error in fftmisc.c: argument %d to NumberOfBitsNeeded is too small.\n",
+            PowerOfTwo );
+
+        exit(1);
+    }
+
+    for ( i=0; ; i++ )
+    {
+        if ( PowerOfTwo & (1 << i) )
+            return i;
+    }
+}
+
+int runTest(int a) __xMR {
+    int x = NumberOfBitsNeeded(a);
+    return x;
+}
+
+
+int main() {
+    int x = runTest(32);
+    printf("%d\n", x);
+}
diff --git a/tests/TMRregression/unitTests/arm_locks.c b/tests/TMRregression/unitTests/arm_locks.c
new file mode 100644
index 000000000..6f41a0016
--- /dev/null
+++ b/tests/TMRregression/unitTests/arm_locks.c
@@ -0,0 +1,29 @@
+/*
+ * arm_locks.c
+ *
+ * This is to test the synchronization primitives from the ARM ISA.
+ */
+
+#include <stdio.h>
+#include <arm_acle.h>
+
+
+/**************************** COAST configuration *****************************/
+#include "../../COAST.h"
+__DEFAULT_NO_xMR
+
+
+#ifdef __arm
+void swap(unsigned int* a, unsigned int* b) __xMR {
+    *a = __swp(*a, (unsigned int*)b);
+}
+#endif
+
+
+int main() {
+    unsigned int x = 0x55;
+    unsigned int y = 0xAA;
+    swap(&x, &y);
+
+    printf("x = 0x%02X, y = 0x%02X\r\n", x, y);
+}
diff --git a/tests/TMRregression/unitTests/atomics.c b/tests/TMRregression/unitTests/atomics.c
new file mode 100644
index 000000000..2824c4d7d
--- /dev/null
+++ b/tests/TMRregression/unitTests/atomics.c
@@ -0,0 +1,26 @@
+/*
+ * atomics.c
+ *
+ * Test atomic operations and the effect of COAST SoR crossings on them.
+ * requires -std=c11
+ */
+
+#include <stdio.h>
+#include <stdatomic.h>
+
+#include "../../COAST.h"
+__DEFAULT_NO_xMR
+
+
+void incAtomic(atomic_uint* at) __xMR {
+    atomic_fetch_add(at, 1);
+}
+
+int main() {
+    atomic_uint counter;
+    atomic_init(&counter, 1);
+
+    incAtomic(&counter);
+
+    printf("counter = %d\n", counter);
+}
diff --git a/tests/TMRregression/unitTests/basicIR.c b/tests/TMRregression/unitTests/basicIR.c
new file mode 100644
index 000000000..eb7189966
--- /dev/null
+++ b/tests/TMRregression/unitTests/basicIR.c
@@ -0,0 +1,32 @@
+/*
+ * basicIR.c
+ * This benchmark designed to create basic LLVM IR that is easy to check if
+ *  all of the replication rules are being followed correctly
+ */
+
+#include <stdio.h>
+
+int globalArr[] = {0, 0};
+
+int main() {
+    //load
+    int* xp = &globalArr[0];
+    xp+=1;
+    int x = *xp;
+
+    //ops
+    x = ((x + 5) * 3) >> 1;
+
+    //store
+    globalArr[0] = x;
+
+    //expected output: 7
+    printf("Result: %d\n", globalArr[0]);
+
+    if (globalArr[0] != 7) {
+        printf("Error!\n");
+        return -1;
+    } else {
+        return 0;
+    }
+}
diff --git a/tests/TMRregression/unitTests/bsearch_strcmp.c b/tests/TMRregression/unitTests/bsearch_strcmp.c
index b124afd12..e29eab3b7 100644
--- a/tests/TMRregression/unitTests/bsearch_strcmp.c
+++ b/tests/TMRregression/unitTests/bsearch_strcmp.c
@@ -17,18 +17,23 @@ int __attribute__((annotate("no_xMR"))) compareints (const void * a, const void
 int values[] = { 50, 20, 60, 40, 10, 30 };
 char strvalues[][20] = {"some","example","strings","here"};
 
+// returns true (1) if in the array, false (0) otherwise
 int test1(){
     int * pItem;
     int key = 40;
     qsort (values, 6, sizeof (int), compareints);
     pItem = (int*) bsearch (&key, values, 6, sizeof (int), compareints);
-    if (pItem!=NULL)
+    if (pItem!=NULL) {
         printf ("%d is in the array.\n",*pItem);
-    else
+        return 1;
+    }
+    else {
         printf ("%d is not in the array.\n",key);
-    return 0;
+        return 0;
+    }
 }
 
+// returns true (1) if in the array, false (0) otherwise
 int test2(){
     char * pItem;
     char key[20] = "example";
@@ -39,16 +44,25 @@ int test2(){
     /* search for the key: */
     pItem = (char*) bsearch (key, strvalues, 4, 20, (int(*)(const void*,const void*)) strcmp);
 
-    if (pItem!=NULL)
+    if (pItem!=NULL) {
         printf ("%s is in the array.\n",pItem);
-    else
+        return 1;
+    }
+    else {
         printf ("%s is not in the array.\n",key);
-    return 0;
+        return 0;
+    }
 }
 
 int main ()
 {
-    test1();
-    test2();
-    return 0;
+    int x0 = test1();
+    int x1 = test2();
+    //expect both to be true
+    if (x0 && x1) {
+        return 0;
+    } else {
+        printf("Error!\n");
+        return -1;
+    }
 }
diff --git a/tests/TMRregression/unitTests/classTest.cpp b/tests/TMRregression/unitTests/classTest.cpp
index b519bf236..5d3b0e6f2 100644
--- a/tests/TMRregression/unitTests/classTest.cpp
+++ b/tests/TMRregression/unitTests/classTest.cpp
@@ -33,5 +33,9 @@ int main() {
     int x = mc.getStuff();
     std::cout << "stuff = " << x << "\n";
 
-    return 0;
+    //expected: 0
+    if (x) {
+        printf("Error!\n");
+    }
+    return x;
 }
diff --git a/tests/TMRregression/unitTests/exceptions.cpp b/tests/TMRregression/unitTests/exceptions.cpp
index 40c63c6bc..befbabc31 100644
--- a/tests/TMRregression/unitTests/exceptions.cpp
+++ b/tests/TMRregression/unitTests/exceptions.cpp
@@ -6,13 +6,16 @@
  * need to add
  * -replicateFnCalls=_ZNSt12_Vector_baseIiSaIiEE11_M_allocateEm,_ZSt27__uninitialized_default_n_aIPimiET_S1_T0_RSaIT1_E
  * to -DWC or -TMR invocation
+ *
+ * When compiled with -noMemReplication, also add the following options:
+ *  - ignoreFns=_ZNSt12_Vector_baseIiSaIiEE13_M_deallocateEPim
+ * this gets rid of a double free() error
  */
 
 #include <iostream>
 #include <stdexcept>
 #include <vector>
 
-using namespace std;
 
 double division(int a, int b) {
     if( b == 0 ) {
@@ -29,15 +32,17 @@ int main () {
     int x = 50;
     int y = 0;
     double z = 0;
+    int exceptionCount = 0;
 
     double m = multiplication(x, y);
 
     //example of a user-defined function which throws an exception
     try {
         z = division(x, y);
-        cout << z << endl;
+        std::cout << z << std::endl;
     } catch (const char* msg) {
-        cerr << msg << endl;
+        std::cerr << msg << std::endl;
+        exceptionCount+=1;
     }
 
     //example of a library function which throws an exception
@@ -47,7 +52,15 @@ int main () {
         myvector.at(20)=100;      // vector::at throws an out-of-range
     } catch (const std::out_of_range& oor) {
         std::cerr << "Out of Range error: " << oor.what() << '\n';
+        exceptionCount+=1;
     }
 
-    return 0;
+    //expected results
+    if (exceptionCount == 2) {
+        printf("Success!\n");
+        return 0;
+    } else {
+        printf("Error!\n");
+        return -1;
+    }
 }
diff --git a/tests/TMRregression/unitTests/fSigTypes.c b/tests/TMRregression/unitTests/fSigTypes.c
index 3d1a1dcea..ef051abbd 100644
--- a/tests/TMRregression/unitTests/fSigTypes.c
+++ b/tests/TMRregression/unitTests/fSigTypes.c
@@ -27,13 +27,28 @@ int main() {
     inc(b);
     c = add(a, b);
     print(c);
+    //check
+    if (c != 3) {
+        printf("Error!\n");
+        return -1;
+    }
 
     incx();
     d = test();
     print(d);
+    //check
+    if (d != 1) {
+        printf("Error!\n");
+        return -1;
+    }
 
     e = (float)add(b, c);
     printf("value is : %f\n", e);
+    //check
+    if (e != 5.0) {
+        printf("Error!\n");
+        return -1;
+    }
 
     return 0;
 }
diff --git a/tests/TMRregression/unitTests/helloWorld.cpp b/tests/TMRregression/unitTests/helloWorld.cpp
index 89a13e4c7..f67742cf7 100644
--- a/tests/TMRregression/unitTests/helloWorld.cpp
+++ b/tests/TMRregression/unitTests/helloWorld.cpp
@@ -2,4 +2,5 @@
 
 int main() {
     std::cout << "Hello there!" << std::endl;
+    return 0;
 }
diff --git a/tests/TMRregression/unitTests/inlining.c b/tests/TMRregression/unitTests/inlining.c
new file mode 100644
index 000000000..e3a67ea67
--- /dev/null
+++ b/tests/TMRregression/unitTests/inlining.c
@@ -0,0 +1,60 @@
+/*
+ * inlining.c
+ * Unit test to see what COAST can do with function inlining, and the problems
+ *  it can cause when combined with Replication scope and memory aliasing.
+ * Compile with -O2
+ *
+ * To try waiting to inline until after COAST has run, do
+ *  XCFLAGS=-O2 -Xclang -disable-llvm-passes -Rpass=inline
+ *  OPT_PASSES=-O2 -disable-inlining -pass-remarks=inline -DWC -verbose -inline
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "../../COAST.h"
+__DEFAULT_NO_xMR
+
+
+// simulates system registers
+static uint32_t fakeOutput[8];
+static uint32_t idx = 0;
+
+void globalWrite(uint32_t x) {
+    fakeOutput[idx++] = x;
+}
+
+
+uint32_t __attribute__((noinline))
+replicateThis(uint32_t a, uint32_t b) __xMR {
+    uint32_t y = (a + b) << 1;  //random math
+    globalWrite(y);
+    return y;
+}
+
+uint32_t leaveThisAlone(uint32_t c, uint32_t d) __NO_xMR {
+    uint32_t z = (c - d) ^ (uint32_t)0x0F;
+    globalWrite(z);
+    return z;
+}
+
+
+int main() {
+    uint32_t y = replicateThis(2, 3);   //expect 10
+    uint32_t z = leaveThisAlone(4, 1);  //expect 12
+
+    printf("%d, %d\n", y, z);
+
+    if ( (y != 10) || (z != 12) ) {
+        printf("error!\n");
+        return -1;
+    }
+
+    //read output so doesn't get optimized out
+    for (uint32_t i = 0; i < 8; i+=1) {
+        printf("%d, ", fakeOutput[i]);
+    }
+    printf("\n");
+
+    return 0;
+}
diff --git a/tests/TMRregression/unitTests/load_store.c b/tests/TMRregression/unitTests/load_store.c
new file mode 100644
index 000000000..68dbc9fe4
--- /dev/null
+++ b/tests/TMRregression/unitTests/load_store.c
@@ -0,0 +1,48 @@
+/*
+ * load_store.c
+ * This benchmark designed to create LLVM IR that shows how storing
+ *  the first copy before loading the second can create errors. Run
+ *  with -DWC.
+ * Don't compile with any optimizations, or it will compile to only
+ *  print the first string.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+
+#include "../../COAST.h"
+__DEFAULT_NO_xMR
+
+// #define EXECUTE_CORRECT
+#ifdef EXECUTE_CORRECT
+#define FUNCTION_TAG
+#else
+#define FUNCTION_TAG __xMR
+#endif
+
+struct myStruct
+{
+    uint32_t x;
+    uint32_t y;
+};
+
+
+void touchStruct(struct myStruct* ms) FUNCTION_TAG {
+    (ms->x)++;
+
+    if ( (ms->x) == 1) {
+        printf("ms == 1\r\n");
+    } else {
+        printf("ms == %d\r\n", ms->x);
+    }
+
+    return;
+}
+
+
+int main() {
+    struct myStruct ms;
+    ms.x = 0;
+    ms.y = 0;
+    touchStruct(&ms);
+}
diff --git a/tests/TMRregression/unitTests/mallocTest.c b/tests/TMRregression/unitTests/mallocTest.c
index f6be3e2d1..bad462544 100644
--- a/tests/TMRregression/unitTests/mallocTest.c
+++ b/tests/TMRregression/unitTests/mallocTest.c
@@ -8,26 +8,29 @@
 #define ARRAY_SIZE  4
 #define ELEMENT_SIZE    10
 
-typedef struct{
+typedef struct {
     float a;
     float b;
 } inner_struct;
 
-typedef struct{
+typedef struct {
     int x;
     int y;
     int array[ARRAY_SIZE];
     inner_struct z;
-}outer_struct;
+} outer_struct;
 
-outer_struct* __attribute__((annotate("xMR_call"))) alloc_struct(){
+outer_struct* __attribute__((annotate("xMR_call"))) alloc_struct() {
     outer_struct* st = (outer_struct*) malloc(sizeof(outer_struct));
     return st;
 }
 
-int main(){
+int main() {
     //don't even need to do anything with the struct, just create it and destroy it
     outer_struct* st = alloc_struct();
     free(st);
     printf("Finished\n");
+    //this unit test considered to have succeeded
+    // if there are no memory leaks (double free corruption, etc)
+    return 0;
 }
diff --git a/tests/TMRregression/unitTests/ptrArith.c b/tests/TMRregression/unitTests/ptrArith.c
new file mode 100644
index 000000000..7586318b0
--- /dev/null
+++ b/tests/TMRregression/unitTests/ptrArith.c
@@ -0,0 +1,177 @@
+/*
+ * ptrArith.c
+ *
+ * pointer arithmetic. Is it safe?
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "../../COAST.h"
+__DEFAULT_NO_xMR
+
+
+/***************************** utility functions ******************************/
+void print1dArray(int a[], int size) {
+    for (int i = 0; i < size; i+=1) {
+        printf("%2d ", a[i]);
+    }
+    puts("");
+}
+
+void print2dArray(int* a[], int rows, int cols) {
+    for (int i = 0; i < rows; i+=1) {
+        for (int j = 0; j < cols; j+=1) {
+            printf("%2d @ %p\n", a[i][j], &a[i][j]);
+        }
+    }
+}
+
+/******************************** mutate array ********************************/
+// do some pointer arithmetic
+void mutateArray(int* ar) __xMR {
+    int* a7 = &ar[7];
+    // ar[7] = 2;
+    *ar = 1;            //index 0
+    *(ar + 1) = 2;      //index 1
+    ar += 3;
+    *ar += 4;           //index 3
+    ar[2] -= 1;         //index 5
+    *(ar + 3) = 7;      //index 6
+    *a7 |= (ar[0] >> ar[1]) - ar[2];           //index 7
+    // *a7 ^= 1;
+}
+
+/********************************** xor swap **********************************/
+void xorSwap(int* x, int* y) __xMR {
+    if (x != y) {
+        *x ^= *y;
+        *y ^= *x;
+        *x ^= *y;
+    }
+}
+
+/******************************** pointer math ********************************/
+void doPtrMath(int* ptr0, int* ptr1) __xMR {
+    ptr0[0] = ptr0[2] ^ ptr1[7];    //yields 2
+    ptr0++;
+    ptr0[1] += ptr1[4];             //yields 7
+}
+
+/***************************** pointer increment ******************************/
+void ptrInc(int* p) __xMR {
+    (*p)++;
+}
+
+/****************************** double pointers *******************************/
+void ptr2d(int** p) __xMR {
+    *(*(p + 1)) = 42;
+    p++;
+    *(*p - 1) <<= 2;
+}
+
+/****************************** double crossing *******************************/
+void fakeLibFunc(int* p) {
+    (*p) &= 0xFF;
+}
+
+void doubleCross(int* p) __xMR {
+    fakeLibFunc(p);
+    (*p) += 1;
+}
+
+/****************************** storing pointers ******************************/
+void doNothing(int* p) {
+    int* q = p + 1;
+}
+
+void storePtr(int** pp, int* p) __xMR {
+    p = *pp;
+    p += 1;      // this makes it a GEP instead
+    doNothing(p);
+    *p = 7;
+}
+
+/***************************** function pointers ******************************/
+//typedef function pointer
+typedef void (*MyFnType)(int* arg0);
+
+void intMath(int* arg0) {
+    *arg0 += 2;
+}
+
+void callFnPtr(int* x, MyFnType fnPtr) __xMR {
+    (*fnPtr)(x);
+}
+
+
+/************************************ main ************************************/
+#define ASIZE 8
+int main() {
+    ///////////////////////////// mutate array /////////////////////////////
+    int a1[ASIZE];
+    int a2[ASIZE] = {1, 2, 0, 4, 0, -1, 7, 5};   //golden
+    memset(a1, 0, ASIZE * sizeof(int));
+
+    mutateArray(a1);
+
+    print1dArray(a1, ASIZE);
+
+    if (memcmp(a1, a2, ASIZE * sizeof(int))) {
+        printf(" !! Error !!\n");
+    }
+
+    /////////////////////////////// xor swap ///////////////////////////////
+    int x = 0x55;
+    int y = 0xFA;
+    xorSwap(&x, &y);
+    printf(" 0x%02X, 0x%02X\n", x, y);
+
+    ///////////////////////////// pointer math /////////////////////////////
+    int array0[ASIZE] = {1, 2, 3, 4, 5, 6, 7, 8};
+    int array1[ASIZE] = {8, 7, 6, 5, 4, 3, 2, 1};
+    doPtrMath(array0, array1);
+    //expected array0 = {2, 2, 7}
+
+    print1dArray(array0, ASIZE);
+
+    ////////////////////////// pointer increment ///////////////////////////
+    int incThis = 2;
+    ptrInc(&incThis);
+    printf(" %d\n", incThis);
+    if (incThis != 3) {
+        puts("Error!");
+    }
+
+    /////////////////////////// double pointers ////////////////////////////
+    int a3[2] = {1, 2};
+    int a4[2] = {3, 4};
+    int* square[2] = {a3, a4};
+    print2dArray(square, 2, 2);
+
+    int** sp = &square[0];
+    printf("%p, %p\n", sp, *sp);
+    ptr2d(&square[0]);
+
+    //expects { {1, 8}, {42, 4} }
+    print2dArray(square, 2, 2);
+
+    /////////////////////////// double crossing ////////////////////////////
+    int val = 0xAAAA;
+    doubleCross(&val);
+    //expected 0x00AB
+    printf(" 0x%04X\n", val);
+
+    /////////////////////////// storing pointers ///////////////////////////
+    int sp0[4] = {4, 8, 16, 32};
+    int* spp0 = &sp0[0];
+    storePtr(&spp0, &sp0[3]);
+    print1dArray(sp0, 4);
+
+    ////////////////////////// function pointers ///////////////////////////
+    int fnX = 3;
+    callFnPtr(&fnX, &intMath);
+    printf("%2d\n", fnX);
+
+    return 0;
+}
diff --git a/tests/TMRregression/unitTests/returnPointer.c b/tests/TMRregression/unitTests/returnPointer.c
index 90139276e..3b9ed0bfd 100644
--- a/tests/TMRregression/unitTests/returnPointer.c
+++ b/tests/TMRregression/unitTests/returnPointer.c
@@ -23,5 +23,14 @@ int main() {
 
     printf("x = %d\n", x);
 
+    //expected results: x == 1
+    if (x == 1) {
+        printf("Success!\n");
+        return 0;
+    } else {
+        printf("Error!\n");
+        return -1;
+    }
+
     return 0;
 }
diff --git a/tests/TMRregression/unitTests/segmenting.c b/tests/TMRregression/unitTests/segmenting.c
new file mode 100644
index 000000000..c8917ad81
--- /dev/null
+++ b/tests/TMRregression/unitTests/segmenting.c
@@ -0,0 +1,29 @@
+/*
+ * segmenting.c
+ * Are the basic blocks segmented correctly? This test particularly includes reordering
+ *  of function calls that have been marked to be replicated
+ * run with -replicateFnCalls=simpleMath
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "../../COAST.h"
+
+
+int simpleMath(int x, int y) {
+    return x + y;
+}
+
+
+int main() {
+    int a = 10;
+    int b = 20;
+
+    int result = simpleMath(a, b);
+
+    if (result != 30) {
+        printf("Error! %d\n", result);
+    } else {
+        printf("Success!\n");
+    }
+}
\ No newline at end of file
diff --git a/tests/TMRregression/unitTests/simd.c b/tests/TMRregression/unitTests/simd.c
index b849d0b6a..6ef12631a 100644
--- a/tests/TMRregression/unitTests/simd.c
+++ b/tests/TMRregression/unitTests/simd.c
@@ -1,44 +1,157 @@
-// This unit test is to see how the LLVM IR represents SIMD instructions
-// All this does is double all of the values in a matrix
-// have to make sure the XCFLAGS="-O3"
+/*
+ * simd.c
+ *
+ * This unit test is to see how the LLVM IR represents SIMD instructions
+ * All this does is double all of the values in a matrix
+ * have to make sure the flag XCFLAGS="-O3"
+ */
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdbool.h>
 
+/***************************** Intrinsic settings *****************************/
+#define WITH_INTRINSICS
+#ifdef WITH_INTRINSICS
+#ifdef __x86_64
+#include <emmintrin.h>
+#include <immintrin.h>
+#elif __arm
+#include <arm_neon.h>
+#endif
+#endif  /* WITH_INTRINSICS */
+
+
+/**************************** COAST configuration *****************************/
+#include "../../COAST.h"
+__DEFAULT_NO_xMR
+
+//all values in the matrix will be multiplied by this value
 #define SCALAR      2
 
-// #define ARRAY_SIZE  4
-// unsigned int matrix[ARRAY_SIZE][ARRAY_SIZE] = {
-//     {1, 2, 3, 4},
-//     {5, 6, 7, 8},
-//     {9, 10, 11, 12},
-//     {13, 14, 15, 16}
-// };
-
-// #define ARRAY_SIZE  5
-// unsigned int matrix[ARRAY_SIZE][ARRAY_SIZE] = {
-//     {1, 2, 3, 4, 5},
-//     {6, 7, 8, 9, 10},
-//     {11, 12, 13, 14, 15},
-//     {16, 17, 18, 19, 20},
-//     {21, 22, 23, 24, 25}
-// };
-
-#define ARRAY_SIZE  2
-#define ARRAY_SIZE2 8
-unsigned int matrix[ARRAY_SIZE][ARRAY_SIZE2] = {
+//this is only used when compiled with flag -countErrors
+unsigned int __NO_xMR TMR_ERROR_CNT = 0;
+
+//define the testing type
+typedef unsigned int test_t;
+// typedef float test_t;
+
+
+#define ROW_SIZE  2
+#define COL_SIZE 8
+test_t matrix0[ROW_SIZE][COL_SIZE] = {
     {1, 2, 3, 4, 5, 6, 7, 8},
     {9, 10, 11, 12, 13, 14, 15, 16}
 };
+test_t golden0[ROW_SIZE][COL_SIZE] = {
+    {2, 4, 6, 8, 10, 12, 14, 16},
+    {18, 20, 22, 24, 26, 28, 30, 32}
+};
 
-int main(){
+// don't inline the matrix multiply call so it can be xMR'd correctly
+__attribute__((noinline))
+#ifdef WITH_INTRINSICS
+#ifdef __x86_64
+// hand optimized for x86_64 architecture with the SSE2 extension
+void scalarMultiply(test_t scalar) __xMR {
+    // const __m128 scalar_vec = _mm_set1_epi32(scalar);
+    const __m128 scalar_vec = _mm_set1_ps(scalar);
+
+    __m128i r0 = _mm_load_si128((__m128i *)&matrix0[0][0]);     // load integers
+    __m128 m0 = _mm_cvtepi32_ps(r0);                            // convert to float
+    m0 = _mm_mul_ps(m0, scalar_vec);                            // multiply by scalar
+    __m128i w0 = _mm_cvtps_epi32(m0);                           // convert to int
+    _mm_store_si128((__m128i *)&matrix0[0][0], w0);             // store integers
+
+    __m128i r1 = _mm_load_si128((__m128i *)&matrix0[0][4]);
+    __m128 m1 = _mm_cvtepi32_ps(r1);
+    m1 = _mm_mul_ps(m1, scalar_vec);
+    __m128i w1 = _mm_cvtps_epi32(m1);
+    _mm_store_si128((__m128i *)&matrix0[0][4], w1);
+
+    __m128i r2 = _mm_load_si128((__m128i *)&matrix0[1][0]);
+    __m128 m2 = _mm_cvtepi32_ps(r2);
+    m2 = _mm_mul_ps(m2, scalar_vec);
+    __m128i w2 = _mm_cvtps_epi32(m2);
+    _mm_store_si128((__m128i *)&matrix0[1][0], w2);
+
+    __m128i r3 = _mm_load_si128((__m128i *)&matrix0[1][4]);
+    __m128 m3 = _mm_cvtepi32_ps(r3);
+    m3 = _mm_mul_ps(m3, scalar_vec);
+    __m128i w3 = _mm_cvtps_epi32(m3);
+    _mm_store_si128((__m128i *)&matrix0[1][4], w3);
+}
+#elif __arm
+// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0472j/chr1360928373893.html
+// https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
+void scalarMultiply(test_t scalar) __xMR {
+    uint32x4_t scalar_vec = vdupq_n_u32(scalar);
+
+    uint32x4_t r0 = vld1q_u32(&matrix0[0][0]);      // load integers
+    uint32x4_t m0 = vmulq_u32(r0, scalar_vec);      // parallel multiply
+    vst1q_u32(&matrix0[0][0], m0);                  // store integers
+
+    uint32x4_t r1 = vld1q_u32(&matrix0[0][4]);
+    uint32x4_t m1 = vmulq_u32(r1, scalar_vec);
+    vst1q_u32(&matrix0[0][4], m1);
+
+    uint32x4_t r2 = vld1q_u32(&matrix0[1][0]);
+    uint32x4_t m2 = vmulq_u32(r2, scalar_vec);
+    vst1q_u32(&matrix0[1][0], m2);
+
+    uint32x4_t r3 = vld1q_u32(&matrix0[1][4]);
+    uint32x4_t m3 = vmulq_u32(r3, scalar_vec);
+    vst1q_u32(&matrix0[1][4], m3);
+}
+#endif /* __x86_64 */
+
+#else
+void scalarMultiply(test_t scalar) __xMR {
     unsigned short i, j;
-    for(i = 0; i < ARRAY_SIZE; i++){
-        for(j = 0; j < ARRAY_SIZE2; j++){
-            matrix[i][j] *= SCALAR;
+    for (i = 0; i < ROW_SIZE; i++) {
+        for (j = 0; j < COL_SIZE; j++) {
+            matrix0[i][j] *= scalar;
         }
     }
-    printf("%d\n", matrix[0][0]);
-    //expected result: SCALAR * 1
-    printf("thing: %d\n", 4);
+}
+#endif  /* WITH_INTRINSICS */
+
+int matrixMatch(test_t mat[ROW_SIZE][COL_SIZE], test_t golden[ROW_SIZE][COL_SIZE]) __NO_xMR {
+    unsigned short i, j;
+    for (i = 0; i < ROW_SIZE; i++) {
+        for (j = 0; j < COL_SIZE; j++) {
+            if (mat[i][j] != golden[i][j]) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+void printMatrix(test_t mat[ROW_SIZE][COL_SIZE]) {
+    unsigned short i, j;
+    for (i = 0; i < ROW_SIZE; i++) {
+        printf("{ ");
+        for (j = 0; j < COL_SIZE; j++) {
+            printf("%2d, ", mat[i][j]);
+            // printf("%2f, ", mat[i][j]);
+        }
+        puts("},");
+    }
+    puts("");
+}
+
+int main(){
+    scalarMultiply((test_t)SCALAR);
+
+    printMatrix(matrix0);
+    printf("TMR errors: %d\n", TMR_ERROR_CNT);
+
+    if (matrixMatch(matrix0, golden0)) {
+        printf("Success!\n");
+        return 0;
+    } else {
+        printf("Error!\n");
+        return -1;
+    }
 }
diff --git a/tests/TMRregression/unitTests/structCompare.c b/tests/TMRregression/unitTests/structCompare.c
index 223ef162b..36e24953e 100644
--- a/tests/TMRregression/unitTests/structCompare.c
+++ b/tests/TMRregression/unitTests/structCompare.c
@@ -27,13 +27,16 @@ int structCompare(testStruct_t d0, testStruct_t d1) {
 }
 
 int main() {
+    int returnVal = 0;
+
     testStruct_t d0 = {1, 2};
     testStruct_t d1 = newStruct();
     if (structCompare(d0, d1)) {
         printf("Equal!\n");
     } else {
         printf("Not equal!\n");
+        returnVal = -1;
     }
 
-    return 0;
+    return returnVal;
 }
diff --git a/tests/TMRregression/unitTests/testFuncPtrs.c b/tests/TMRregression/unitTests/testFuncPtrs.c
index 7d7c94331..d49bf9144 100644
--- a/tests/TMRregression/unitTests/testFuncPtrs.c
+++ b/tests/TMRregression/unitTests/testFuncPtrs.c
@@ -13,22 +13,32 @@ int sub(int i, int j)
    return (i - j);
 }
 
-void print(int x, int y, int (*func)())
+int print(int x, int y, int (*func)())
 {
-        printf("value is : %d\n", (*func)(x, y));
+    int val = (*func)(x, y);
+    printf("value is : %d\n", val);
+    //check values
+    if ( (val == 300) || (val == -100) ) {
+        return 0;
+    } else {
+        return val;
+    }
 }
 
 int main()
 {
-    // test with calling function pointers
+    int returnVal = 0;
     int x=100, y=200;
-    print(x,y,add);     // expected output: 300
-    print(x,y,sub);     // expected output: -100
+
+    // test with calling function pointers
+    returnVal |= print(x, y, add);     // expected output: 300
+    returnVal |= print(x, y, sub);     // expected output: -100
 
     // see if we can create arrays of function pointers
     int (* pBitCntFunc[2])(int, int) = {
-      add, sub
+        add,
+        sub
     };
 
-    return 0;
+    return returnVal;
 }
diff --git a/tests/TMRregression/unitTests/time_c.c b/tests/TMRregression/unitTests/time_c.c
index cc09554c3..a749bda95 100644
--- a/tests/TMRregression/unitTests/time_c.c
+++ b/tests/TMRregression/unitTests/time_c.c
@@ -46,4 +46,5 @@ int main() {
     printf("Using clock: %ld clicks to run (%f seconds)\n", dur, ((float)dur)/CLOCKS_PER_SEC);
 
     return 0;
+    //is there a good way to make this unit test self-checking?
 }
diff --git a/tests/TMRregression/unitTests/vecTest.cpp b/tests/TMRregression/unitTests/vecTest.cpp
new file mode 100644
index 000000000..c5fd7d7ca
--- /dev/null
+++ b/tests/TMRregression/unitTests/vecTest.cpp
@@ -0,0 +1,45 @@
+/*
+ * vecTest.cpp
+ * Basic unit test to exercise using STL containers
+ *
+ * Notes:
+ * Because the STL does all of the memory management for you, all of the "new" and
+ *  "delete" operators are hidden inside of what amount to wrapper functions.  These
+ *  should be treated the same as malloc() wrapper functions and the calls should
+ *  be replicated.  However, they are quite difficult to find.  Function name
+ *  mangling turns "new" into "_Znwm" and "delete" into "_ZdlPv".
+ *
+ * Use the following regex to look for candidates to replicate
+ *  "  %call.* = (call|invoke).*@_Z\w*\(.*%\w*.DWC.*$"
+ *
+ * For this particular test, the following functions must be replicated:
+ *  - _ZNSt12_Vector_baseIiSaIiEE11_M_allocateEm
+ *  - _ZSt34__uninitialized_move_if_noexcept_aIPiS0_SaIiEET0_T_S3_S2_RT1_
+ *
+ * When compiled with -noMemReplication, also add the following options:
+ *  - ignoreFns=_ZNSt12_Vector_baseIiSaIiEE13_M_deallocateEPim
+ * this gets rid of a double free() error
+ */
+
+#include <iostream>
+#include <vector>
+
+#define SIZE 4
+
+int main() {
+    std::vector<int> vec;
+    for (int i = 0; i < SIZE; i+=1) {
+        vec.push_back(i);
+    }
+
+    std::size_t vsize = vec.size();
+    std::cout << "vector size: " << vsize << "\n";
+
+    //check
+    if (vsize == SIZE) {
+        return 0;
+    } else {
+        return -1;
+    }
+
+}
diff --git a/tests/TMRregression/unitTests/verifyOptions.c b/tests/TMRregression/unitTests/verifyOptions.c
new file mode 100644
index 000000000..18ebf3ba0
--- /dev/null
+++ b/tests/TMRregression/unitTests/verifyOptions.c
@@ -0,0 +1,39 @@
+/*
+ * verifyOptions.c
+ * 
+ * Unit test to see if COAST can detect when replication rules
+ * are being violated.
+ */
+
+#include <stdio.h>
+#include "../../COAST.h"
+__DEFAULT_NO_xMR
+
+
+int __xMR myGlobal = 0;
+
+
+void incGlbl(void) __xMR {
+    myGlobal++;
+}
+
+void decGlbl(void) __NO_xMR {
+    myGlobal--;
+}
+
+__COAST_IGNORE_GLOBAL(myGlobal)
+void mulGlbl(void) {
+    myGlobal *= 2;
+}
+
+
+int main() {
+
+    printf("%d, ", myGlobal);
+    incGlbl();
+    printf("%d, ", myGlobal);
+    decGlbl();
+    printf("%d\n", myGlobal);
+
+    return 0;
+}
diff --git a/tests/TMRregression/unitTests/whets.c b/tests/TMRregression/unitTests/whets.c
new file mode 100644
index 000000000..c8ad58cbd
--- /dev/null
+++ b/tests/TMRregression/unitTests/whets.c
@@ -0,0 +1,1255 @@
+/*
+ *  Document:         Whets.c
+ *  File Group:       Classic Benchmarks
+ *  Creation Date:    6 November 1996
+ *  Revision Date:
+ *
+ *  Title:            Whetstone Benchmark in C/C++
+ *  Keywords:         WHETSTONE BENCHMARK PERFORMANCE MIPS
+ *                    MWIPS MFLOPS
+ *
+ *  Abstract:         C or C++ version of Whetstone one of the
+ *                    Classic Numeric Benchmarks with example
+ *                    results on P3 to P6 based PCs.
+ *
+ *  Contributor:      Roy Longbottom 101323.2241@compuserve.com
+	*                         or     Roy_Longbottom@compuserve.com
+ *
+ ************************************************************
+ *
+ *     C/C++ Whetstone Benchmark Single or Double Precision
+ *
+ *     Original concept        Brian Wichmann NPL      1960's
+ *     Original author         Harold Curnow  CCTA     1972
+ *     Self timing versions    Roy Longbottom CCTA     1978/87
+ *     Optimisation control    Bangor University       1987/90
+ *     C/C++ Version           Roy Longbottom          1996
+ *     Compatibility & timers  Al Aburto               1996
+ *
+ ************************************************************
+ *
+ *              Official version approved by:
+ *
+ *         Harold Curnow  100421.1615@compuserve.com
+ *
+ *      Happy 25th birthday Whetstone, 21 November 1997
+ *
+ ************************************************************
+ *
+ *     The program normally runs for about 100 seconds
+ *     (adjustable in main - variable duration). This time
+ *     is necessary because of poor PC clock resolution.
+ *     The original concept included such things as a given
+ *     number of subroutine calls and divides which may be
+ *     changed by optimisation. For comparison purposes the
+ *     compiler and level of optimisation should be identified.
+ *
+ ************************************************************
+ *
+ *     The original benchmark had a single variable I which
+ *     controlled the running time. Constants with values up
+ *     to 899 were multiplied by I to control the number
+ *     passes for each loop. It was found that large values
+ *     of I could overflow index registers so an extra outer
+ *     loop with a second variable J was added.
+ *
+ *     Self timing versions were produced during the early
+ *     days. The 1978 changes supplied timings of individual
+ *     loops and these were used later to produce MFLOPS and
+ *     MOPS ratings.
+ *
+ *     1987 changes converted the benchmark to Fortran 77
+ *     standards and removed redundant IF statements and
+ *     loops to leave the 8 active loops N1 to N8. Procedure
+ *     P3 was changed to use global variables to avoid over-
+ *     optimisation with the first two statements changed from
+ *     X1=X and Y1=Y to X=Y and Y=Z. A self time calibrating
+ *     version for PCs was also produced, the facility being
+ *     incorporated in this version.
+ *
+ *     This version has changes to avoid worse than expected
+ *     speed ratings, due to underflow, and facilities to show
+ *     that consistent numeric output is produced with varying
+ *     optimisation levels or versions in different languages.
+ *
+ *     Some of the procedures produce ever decreasing numbers.
+ *     To avoid problems, variables T and T1 have been changed
+ *     from 0.499975 and 0.50025 to 0.49999975 and 0.50000025.
+ *
+ *     Each section now has its own double loop. Inner loops
+ *     are run 100 times the loop constants. Calibration
+ *     determines the number of outer loop passes. The
+ *     numeric results produced in the main output are for
+ *     one pass on the outer loop. As underflow problems were
+ *     still likely on a processor 100 times faster than a 100
+ *     MHz Pentium, three sections have T=1.0-T inserted in the
+ *     outer loop to avoid the problem. The two loops avoid
+ *     index register overflows.
+ *
+ *     The first section is run ten times longer than required
+ *     for accuracy in calculating MFLOPS. This time is divided
+ *     by ten for inclusion in the MWIPS calculations.
+ *
+ *     This version has facilities for typing in details of the
+ *     particular run. This information is appended to file
+ *     whets.res along with the results.
+ *
+ *     Roy Longbottom  101323.2241@compuserve.com
+ *
+ ************************************************************
+ *
+ *     Whetstone benchmark results are available in whets.tbl
+ *     from ftp.nosc.mil/pub/aburto. The results include
+ *     further details of the benchmarks.
+ *
+ ************************************************************
+ *
+ *     Source code is available in C/C++, Fortran, Basic and
+ *     Visual Basic in the same format as this version. Pre-
+ *     compiled versions for PCs are also available via C++.
+ *     These comprise optimised and non-optimised versions
+ *     for DOS, Windows and NT.
+ *
+ *     This version compiles and runs correctly either as a
+ *     C or CPP program with a WATCOM and Borland compiler.
+ *
+ ************************************************************
+ *
+ * Example of initial calibration display (Pentium 100 MHz)
+ *
+ * Single Precision C/C++ Whetstone Benchmark
+ *
+ * Calibrate
+ *      0.17 Seconds          1   Passes (x 100)
+ *      0.77 Seconds          5   Passes (x 100)
+ *      3.70 Seconds         25   Passes (x 100)
+ *
+ * Use 676  passes (x 100)
+ *
+ * 676 passes are used for an approximate duration of 100
+ * seconds, providing an initial estimate of a speed rating
+ * of 67.6 MWIPS.
+ *
+ * This is followed by the table of results as below. Input
+ * statements are then supplied to type in the run details.
+ *
+ ************************************************************
+ *
+ * Examples of results from file whets.res
+ *
+ * Whetstone Single  Precision Benchmark in C/C++
+ *
+ * Month run         4/1996
+ * PC model          Escom
+ * CPU               Pentium
+ * Clock MHz         100
+ * Cache             256K
+ * H/W Options       Neptune chipset
+ * OS/DOS            Windows 95
+ * Compiler          Watcom C/C++ 10.5  Win386
+ * Options           No optimisation
+ * Run by            Roy Longbottom
+ * From              UK
+ * Mail              101323.2241@compuserve.com
+ *
+ * Loop content                 Result            MFLOPS     MOPS   Seconds
+ *
+ * N1 floating point    -1.12475025653839100      19.971              0.274
+ * N2 floating point    -1.12274754047393800      11.822              3.240
+ * N3 if then else       1.00000000000000000               11.659     2.530
+ * N4 fixed point       12.00000000000000000               13.962     6.430
+ * N5 sin,cos etc.       0.49904659390449520                2.097    11.310
+ * N6 floating point     0.99999988079071040       3.360             45.750
+ * N7 assignments        3.00000000000000000                2.415    21.810
+ * N8 exp,sqrt etc.      0.75110864639282230                1.206     8.790
+ *
+ * MWIPS                                          28.462            100.134
+ *
+ * Whetstone Single  Precision Benchmark in C/C++
+ *
+ * Compiler          Watcom C/C++ 10.5  Win386
+ * Options           -otexan -zp4 -om -fp5 -5r
+ *
+ * Loop content                 Result            MFLOPS     MOPS    Seconds
+ *
+ * N1 floating point    -1.12475025653839100      26.751               0.478
+ * N2 floating point    -1.12274754047393800      17.148               5.220
+ * N3 if then else       1.00000000000000000               19.922      3.460
+ * N4 fixed point       12.00000000000000000               15.978     13.130
+ * N5 sin,cos etc.       0.49904659390449520                2.663     20.810
+ * N6 floating point     0.99999988079071040      10.077              35.650
+ * N7 assignments        3.00000000000000000               22.877      5.380
+ * N8 exp,sqrt etc.      0.75110864639282230                1.513     16.370
+ *
+ * MWIPS                                          66.270             100.498
+ *
+ *
+ * Whetstone Double  Precision Benchmark in C/C++
+ *
+ * Compiler          Watcom C/C++ 10.5 Win32NT
+ * Options           -otexan -zp4 -om -fp5 -5r
+ *
+ * Loop content                 Result           MFLOPS      MOPS   Seconds
+ *
+ * N1 floating point    -1.12398255667391900     26.548               0.486
+ * N2 floating point    -1.12187079889284400     16.542               5.460
+ * N3 if then else       1.00000000000000000               19.647     3.540
+ * N4 fixed point       12.00000000000000000               15.680    13.500
+ * N5 sin,cos etc.       0.49902937281515140                3.019    18.520
+ * N6 floating point     0.99999987890802820      9.977              36.330
+ * N7 assignments        3.00000000000000000               22.620     5.490
+ * N8 exp,sqrt etc.      0.75100163018457870                1.493    16.740
+ *
+ * MWIPS                                         67.156             100.066
+ *
+ *  Note different numeric results to single precision. Slight variations
+ *  are normal with different compilers and sometimes optimisation levels.
+ *
+ **************************************************************************
+ *
+ *       Example results via Watcom C/C++ 10.5 Win386 (P6 Win32NT)
+ *
+ *
+ *            Single Precision Non-optimised Results -dMSC
+ *
+ *     MWIPS   MFLOPS  MFLOPS  MFLOPS  COS     EXP     FIXPT   IF     EQUAL
+ * Key           1       2       3     MOPS    MOPS    MOPS    MOPS    MOPS
+ *
+ * P3  3.07    0.860   0.815   0.328   0.355   0.160   1.70    1.32   0.264
+ * P4  10.0    4.68    3.51    1.27    0.482   0.298   5.73    5.20    1.18
+ * P5  28.5    20.0    11.8    3.36    2.10    1.21    14.0    11.7    2.42
+ * P6  81.7    47.5    37.8    10.9    3.91    2.43    51.2    42.8    7.85
+ *
+ *
+ *    Single Precision Optimised Results -otexan -zp4 -om -fp5 -5r -dMSC
+ *
+ *     MWIPS   MFLOPS  MFLOPS  MFLOPS  COS     EXP     FIXPT   IF     EQUAL
+ * Key           1       2       3     MOPS    MOPS    MOPS    MOPS    MOPS
+ *
+ * P3  5.68    0.928   0.884   0.673   0.461   0.275   2.36    2.16   0.638
+ * P4  16.4    5.09    4.03    2.66    0.526   0.342   6.36    6.00    5.28
+ * P5  66.3    26.8    17.1    10.1    2.66    1.51    16.0    19.9    22.9
+ * P6  161     50.3    45.2    31.5    4.46    2.77    102     20.6    119
+ *
+ *
+ *  Double Precision Optimised Results -otexan -zp4 -om -fp5 -5r -dMSC -dDP
+ *
+ *     MWIPS   MFLOPS  MFLOPS  MFLOPS  COS     EXP     FIXPT   IF     EQUAL
+ * Key           1       2       3     MOPS    MOPS    MOPS    MOPS    MOPS
+ *
+ * P3  5.20    0.818   0.775   0.604   0.525   0.268   2.20    2.05   0.538
+ * P4  16.5    4.74    3.76    2.51    0.627   0.343   6.22    6.45    4.09
+ * P5  67.9    26.9    16.7    10.1    3.06    1.51    15.8    19.9    22.8
+ * P6  167     50.3    43.5    31.5    5.37    2.83    81.3    20.6    119
+ *
+ *
+ *                             Systems
+ *
+ * Key System       CPU     MHz   Cache    Options          OS
+ *
+ * P3  Clone     AM80386DX  40    128K   with 387        Windows 95
+ * P4  Escom,    80486DX2   66.7  128K   CIS chipset     Windows 95
+ * P5  Escom,    Pentium   100    256K   Neptune chipset Windows 95
+ * P6  Dell Pro  PentPro   200    256K   440FX PCIset    NT 3.51
+ *
+ **************************************************************************
+ *
+ *                       Running Instructions
+ *
+ *      1.  In order to compile successfully, include timer option as
+ *          indicated below.
+ *      2.  If pre-compiled codes are to be distributed, compile with the
+ *          -DPRECOMP option or uncomment #define PRECOMP at PRECOMPILE
+ *          below. Also insert compiler name and optimisation details
+ *          at #define precompiler and #define preoptions.
+ *      3.  Compile and run for single precision results
+ *      4.  Compile with -DDP option or uncomment #define DP at PRECISION
+ *          below and run for double precision results.
+ *      5.  Run with maximum and no optimisation (minimum debug)
+ *      6.  Notify Roy Longbottom of other necessary changes
+ *      7.  Send results file whets.res to Roy Longbottom - with one
+ *          sample of each run and system details fully completed
+ *
+ *      Roy Longbottom  101323.2241@compuserve.com    6 November 1996
+ *
+ **************************************************************************
+ */
+
+/*
+    NOTE: unsupported calls to gets() have been removed
+    Must be compiled with -skipFns=dtime
+*/
+
+ #include <math.h>       /* for sin, exp etc.           */
+ #include <stdio.h>      /* standard I/O                */
+ #include <string.h>     /* for strcpy - 3 occurrences  */
+ #include <stdlib.h>     /* for exit   - 1 occurrence   */
+
+/***************************************************************/
+/* Timer options. You MUST uncomment one of the options below  */
+/* or compile, for example, with the '-DUNIX' option.          */
+/***************************************************************/
+/* #define Amiga       */
+#define UNIX
+/* #define UNIX_Old    */
+/* #define VMS         */
+/* #define BORLAND_C   */
+/* #define MSC         */
+/* #define MAC         */
+/* #define IPSC        */
+/* #define FORTRAN_SEC */
+/* #define GTODay      */
+/* #define CTimer      */
+/* #define UXPM        */
+/* #define MAC_TMgr    */
+/* #define PARIX       */
+/* #define POSIX       */
+/* #define WIN32       */
+
+
+/*PRECISION PRECISION PRECISION PRECISION PRECISION PRECISION PRECISION*/
+
+ /* #define DP */
+
+ #ifdef DP
+    #define SPDP double
+    #define Precision "Double"
+ #else
+    #define SPDP float
+    #define Precision "Single"
+ #endif
+
+
+/*PRECOMPILE  PRECOMPILE  PRECOMPILE  PRECOMPILE  PRECOMPILE  PRECOMPILE*/
+
+ /* #define PRECOMP */
+
+ #ifdef PRECOMP
+    #define precompiler "INSERT COMPILER NAME HERE"
+    #define preoptions  "INSERT OPTIMISATION OPTIONS HERE"
+ #endif
+
+
+ void whetstones(long xtra, long x100, int calibrate);
+ void pa(SPDP e[4], SPDP t, SPDP t2);
+ void po(SPDP e1[4], long j, long k, long l);
+ void p3(SPDP *x, SPDP *y, SPDP *z, SPDP t, SPDP t1, SPDP t2);
+ void pout(char title[22], float ops, int type, SPDP checknum,
+                  SPDP time, int calibrate, int section);
+
+
+ static SPDP loop_time[9];
+ static SPDP loop_mops[9];
+ static SPDP loop_mflops[9];
+ static SPDP TimeUsed;
+ static SPDP mwips;
+ static char headings[9][18];
+ static SPDP Check;
+ static SPDP results[9];
+
+
+main()
+{
+    int count = 10, calibrate = 1;
+    long xtra = 1;
+    int section;
+    long x100 = 100;
+    int duration = 100;
+    FILE *outfile;
+    char compiler[80], options[256], general[10][80] = {" "};
+    char *endit;
+
+    printf("\n");
+    printf("##########################################\n");
+    printf("%s Precision C/C++ Whetstone Benchmark\n\n", Precision);
+
+    /* Changed for COAST */
+    // outfile = fopen("whets.res","a+");
+    // if (outfile == NULL)
+    //   {
+    //    printf ("Cannot open results file \n\n");
+    //    printf("Press RETURN to exit\n");
+    //    gets(endit);
+    //    exit (0);
+    //   }
+
+  printf("Calibrate\n");
+  do
+   {
+    TimeUsed=0;
+
+    whetstones(xtra,x100,calibrate);
+
+    printf("%11.2f Seconds %10.0lf   Passes (x 100)\n",
+                                     TimeUsed,(SPDP)(xtra));
+    calibrate++;
+    count--;
+
+    if (TimeUsed > 2.0)
+      {
+       count = 0;
+      }
+       else
+      {
+       xtra = xtra * 5;
+      }
+   }
+
+   while (count > 0);
+
+   if (TimeUsed > 0) xtra = (long)((SPDP)(duration * xtra) / TimeUsed);
+   if (xtra < 1) xtra = 1;
+
+   calibrate = 0;
+
+   printf("\nUse %d  passes (x 100)\n", xtra);
+
+   printf("\n          %s Precision C/C++ Whetstone Benchmark",Precision);
+
+   #ifdef PRECOMP
+      printf("\n          Compiler  %s", precompiler);
+      printf("\n          Options   %s\n", preoptions);
+   #else
+      printf("\n");
+   #endif
+
+   printf("\nLoop content                  Result              MFLOPS "
+                                "     MOPS   Seconds\n\n");
+
+   TimeUsed=0;
+   whetstones(xtra,x100,calibrate);
+
+   printf("\nMWIPS            ");
+   if (TimeUsed>0)
+     {
+      mwips=(float)(xtra) * (float)(x100) / (10 * TimeUsed);
+     }
+      else
+     {
+      mwips = 0;
+     }
+
+   printf("%39.3f%19.3f\n\n",mwips,TimeUsed);
+
+   if (Check == 0) printf("Wrong answer  ");
+
+   /* Changed for COAST */
+   return 0;
+
+
+ /************************************************************************/
+ /*             Type details of hardware, software etc.                  */
+ /************************************************************************/
+
+/*
+ printf ("Enter the following which will be added with results to file WHETS.RES\n");
+ printf ("When submitting a number of results you need only provide details once\n");
+ printf ("but a cross reference such as an abbreviated CPU type would be useful.\n");
+ printf ("You can kill (exit or close) the program now and no data will be added.\n\n");
+
+ printf ("Date:       ");
+ gets(general[0]);
+
+ printf ("Computer:   ");
+ gets(general[1]);
+
+ printf ("CPU chip:   ");
+ gets(general[2]);
+
+ printf ("Clock MHz:  ");
+ gets(general[3]);
+
+ printf ("Cache size: ");
+ gets(general[4]);
+
+ printf ("H/W options:");
+ gets(general[5]);
+
+ printf ("OS version: ");
+ gets(general[6]);
+
+ #ifdef PRECOMP
+    strcpy (compiler, precompiler);
+    strcpy (options, preoptions);
+ #else
+    printf ("Compiler:   ");
+    gets(compiler);
+
+    printf ("Options:    ");
+    gets(options);
+ #endif
+
+ printf ("Your name:  ");
+ gets(general[7]);
+
+ printf ("From:       ");
+ gets(general[8]);
+
+ printf ("Email:      ");
+ gets(general[9]);
+*/
+
+ /************************************************************************/
+ /*               Add results to output file whets.res                   */
+ /************************************************************************/
+/*
+ fprintf (outfile, "\n");
+ fprintf (outfile, "##############################################\n");
+ fprintf (outfile, "Whetstone %s  Precision Benchmark in C/C++\n\n",Precision);
+ fprintf (outfile, "Date         %s\n", general[0]);
+ fprintf (outfile, "Model        %s\n", general[1]);
+ fprintf (outfile, "CPU          %s\n", general[2]);
+ fprintf (outfile, "Clock MHz    %s\n", general[3]);
+ fprintf (outfile, "Cache        %s\n", general[4]);
+ fprintf (outfile, "H/W options  %s\n", general[5]);
+ fprintf (outfile, "OS           %s\n", general[6]);
+ fprintf (outfile, "Compiler     %s\n", compiler);
+ fprintf (outfile, "Options      %s\n", options);
+ fprintf (outfile, "Run by       %s\n", general[7]);
+ fprintf (outfile, "From         %s\n", general[8]);
+ fprintf (outfile, "Email        %s\n", general[9]);
+ fprintf (outfile, "\n");
+
+ fprintf (outfile,"Loop content                   Result"
+            "              MFLOPS      MOPS   Seconds\n\n");
+
+ for (section=1; section<9; section++)
+    {
+     fprintf (outfile, "%s  %24.17f   ", headings[section],
+                                              results[section]);
+     if (loop_mops[section] == 99999)
+       {
+        fprintf (outfile,"  %9.3f           %9.3f\n",
+                 loop_mflops[section], loop_time[section]);
+       }
+       else
+       {
+        fprintf (outfile, "            %9.3f %9.3f\n",
+             loop_mops[section], loop_time[section], results[section]);
+       }
+    }
+
+ fprintf (outfile, "\nMWIPS             ");
+ fprintf (outfile, "%39.3f%20.3f\n\n",mwips,TimeUsed);
+ fprintf (outfile, "Results  to  load  to  spreadsheet             ");
+ fprintf (outfile, "     MWIPS   Mflops1   Mflops2   Mflops3   Cosmops"
+                      "   Expmops  Fixpmops    Ifmops    Eqmops\n");
+ fprintf (outfile, "Results  to  load  to  spreadsheet             ");
+
+ fprintf (outfile, " %9.3f %9.3f %9.3f", mwips, loop_mflops[1],
+                                                         loop_mflops[2]);
+ fprintf (outfile, " %9.3f %9.3f %9.3f", loop_mflops[6],
+                                             loop_mops[5], loop_mops[8]);
+ fprintf (outfile, " %9.3f %9.3f %9.3f\n\n", loop_mops[4],
+                                              loop_mops[3], loop_mops[7]);
+
+ fclose (outfile);
+
+ printf ("\n");
+ printf ("A new results file will have been created in the same directory as the\n");
+ printf (".EXE files if one did not already exist. If you made a mistake above, \n");
+ printf ("you can use a text editor to correct it, delete the results or copy \n");
+ printf ("them to a different file name. If you intend to run multiple tests you\n");
+ printf ("you may wish to rename WHETS.RES with a more informative title.\n\n");
+ printf ("Please submit feedback and results files as a posting in Section 12\n");
+ printf ("or to Roy_Longbottom@compuserve.com\n\n");
+*/
+
+}
+
+    void whetstones(long xtra, long x100, int calibrate)
+      {
+
+        long n1,n2,n3,n4,n5,n6,n7,n8,i,ix,n1mult;
+        SPDP x,y,z;
+        long j,k,l;
+        SPDP e1[4],timea,timeb, dtime();
+
+        SPDP t =  0.49999975;
+        SPDP t0 = t;
+        SPDP t1 = 0.50000025;
+        SPDP t2 = 2.0;
+
+        Check=0.0;
+
+        n1 = 12*x100;
+        n2 = 14*x100;
+        n3 = 345*x100;
+        n4 = 210*x100;
+        n5 = 32*x100;
+        n6 = 899*x100;
+        n7 = 616*x100;
+        n8 = 93*x100;
+        n1mult = 10;
+
+        /* Section 1, Array elements */
+
+        e1[0] = 1.0;
+        e1[1] = -1.0;
+        e1[2] = -1.0;
+        e1[3] = -1.0;
+        timea = dtime();
+         {
+            for (ix=0; ix<xtra; ix++)
+              {
+                for(i=0; i<n1*n1mult; i++)
+                  {
+                      e1[0] = (e1[0] + e1[1] + e1[2] - e1[3]) * t;
+                      e1[1] = (e1[0] + e1[1] - e1[2] + e1[3]) * t;
+                      e1[2] = (e1[0] - e1[1] + e1[2] + e1[3]) * t;
+                      e1[3] = (-e1[0] + e1[1] + e1[2] + e1[3]) * t;
+                  }
+                t = 1.0 - t;
+              }
+            t =  t0;
+         }
+        timeb = (dtime()-timea)/(SPDP)(n1mult);
+        pout("N1 floating point\0",(float)(n1*16)*(float)(xtra),
+                             1,e1[3],timeb,calibrate,1);
+
+        /* Section 2, Array as parameter */
+
+        timea = dtime();
+         {
+            for (ix=0; ix<xtra; ix++)
+              {
+                for(i=0; i<n2; i++)
+                  {
+                     pa(e1,t,t2);
+                  }
+                t = 1.0 - t;
+              }
+            t =  t0;
+         }
+        timeb = dtime()-timea;
+        pout("N2 floating point\0",(float)(n2*96)*(float)(xtra),
+                             1,e1[3],timeb,calibrate,2);
+
+        /* Section 3, Conditional jumps */
+        j = 1;
+        timea = dtime();
+         {
+            for (ix=0; ix<xtra; ix++)
+              {
+                for(i=0; i<n3; i++)
+                  {
+                     if(j==1)       j = 2;
+                     else           j = 3;
+                     if(j>2)        j = 0;
+                     else           j = 1;
+                     if(j<1)        j = 1;
+                     else           j = 0;
+                  }
+              }
+         }
+        timeb = dtime()-timea;
+        pout("N3 if then else  \0",(float)(n3*3)*(float)(xtra),
+                        2,(SPDP)(j),timeb,calibrate,3);
+
+        /* Section 4, Integer arithmetic */
+        j = 1;
+        k = 2;
+        l = 3;
+        timea = dtime();
+         {
+            for (ix=0; ix<xtra; ix++)
+              {
+                for(i=0; i<n4; i++)
+                  {
+                     j = j *(k-j)*(l-k);
+                     k = l * k - (l-j) * k;
+                     l = (l-k) * (k+j);
+                     e1[l-2] = j + k + l;
+                     e1[k-2] = j * k * l;
+                  }
+              }
+         }
+        timeb = dtime()-timea;
+        x = e1[0]+e1[1];
+        pout("N4 fixed point   \0",(float)(n4*15)*(float)(xtra),
+                                 2,x,timeb,calibrate,4);
+
+        /* Section 5, Trig functions */
+        x = 0.5;
+        y = 0.5;
+        timea = dtime();
+         {
+            for (ix=0; ix<xtra; ix++)
+              {
+                for(i=1; i<n5; i++)
+                  {
+                     x = t*atan(t2*sin(x)*cos(x)/(cos(x+y)+cos(x-y)-1.0));
+                     y = t*atan(t2*sin(y)*cos(y)/(cos(x+y)+cos(x-y)-1.0));
+                  }
+                t = 1.0 - t;
+              }
+            t = t0;
+         }
+        timeb = dtime()-timea;
+        pout("N5 sin,cos etc.  \0",(float)(n5*26)*(float)(xtra),
+                                 2,y,timeb,calibrate,5);
+
+        /* Section 6, Procedure calls */
+        x = 1.0;
+        y = 1.0;
+        z = 1.0;
+        timea = dtime();
+         {
+            for (ix=0; ix<xtra; ix++)
+              {
+                for(i=0; i<n6; i++)
+                  {
+                     p3(&x,&y,&z,t,t1,t2);
+                  }
+              }
+         }
+        timeb = dtime()-timea;
+        pout("N6 floating point\0",(float)(n6*6)*(float)(xtra),
+                                1,z,timeb,calibrate,6);
+
+        /* Section 7, Array refrences */
+        j = 0;
+        k = 1;
+        l = 2;
+        e1[0] = 1.0;
+        e1[1] = 2.0;
+        e1[2] = 3.0;
+        timea = dtime();
+         {
+            for (ix=0; ix<xtra; ix++)
+              {
+                for(i=0;i<n7;i++)
+                  {
+                     po(e1,j,k,l);
+                  }
+              }
+         }
+        timeb = dtime()-timea;
+        pout("N7 assignments   \0",(float)(n7*3)*(float)(xtra),
+                            2,e1[2],timeb,calibrate,7);
+
+        /* Section 8, Standard functions */
+        x = 0.75;
+        timea = dtime();
+         {
+            for (ix=0; ix<xtra; ix++)
+              {
+                for(i=0; i<n8; i++)
+                  {
+                     x = sqrt(exp(log(x)/t1));
+                  }
+              }
+         }
+        timeb = dtime()-timea;
+        pout("N8 exp,sqrt etc. \0",(float)(n8*4)*(float)(xtra),
+                                2,x,timeb,calibrate,8);
+
+        return;
+      }
+
+
+    void pa(SPDP e[4], SPDP t, SPDP t2)
+      {
+         long j;
+         for(j=0;j<6;j++)
+            {
+               e[0] = (e[0]+e[1]+e[2]-e[3])*t;
+               e[1] = (e[0]+e[1]-e[2]+e[3])*t;
+               e[2] = (e[0]-e[1]+e[2]+e[3])*t;
+               e[3] = (-e[0]+e[1]+e[2]+e[3])/t2;
+            }
+
+         return;
+      }
+
+    void po(SPDP e1[4], long j, long k, long l)
+      {
+         e1[j] = e1[k];
+         e1[k] = e1[l];
+         e1[l] = e1[j];
+         return;
+      }
+
+    void p3(SPDP *x, SPDP *y, SPDP *z, SPDP t, SPDP t1, SPDP t2)
+      {
+         *x = *y;
+         *y = *z;
+         *x = t * (*x + *y);
+         *y = t1 * (*x + *y);
+         *z = (*x + *y)/t2;
+         return;
+      }
+
+
+    void pout(char title[18], float ops, int type, SPDP checknum,
+              SPDP time, int calibrate, int section)
+      {
+        SPDP mops,mflops;
+
+        Check = Check + checknum;
+        loop_time[section] = time;
+        strcpy (headings[section],title);
+        TimeUsed =  TimeUsed + time;
+        if (calibrate == 1)
+
+          {
+              results[section] = checknum;
+          }
+        if (calibrate == 0)
+          {
+            printf("%s %24.17f    ",headings[section],results[section]);
+
+            if (type == 1)
+             {
+                if (time>0)
+                 {
+                    mflops = ops/(1000000L*time);
+                 }
+                else
+                 {
+                   mflops = 0;
+                 }
+                loop_mops[section] = 99999;
+                loop_mflops[section] = mflops;
+                printf(" %9.3f          %9.3f\n",
+                 loop_mflops[section], loop_time[section]);
+             }
+            else
+             {
+                if (time>0)
+                 {
+                   mops = ops/(1000000L*time);
+                 }
+                else
+                 {
+                   mops = 0;
+                 }
+                loop_mops[section] = mops;
+                loop_mflops[section] = 0;
+                printf("           %9.3f%9.3f\n",
+                 loop_mops[section], loop_time[section]);
+             }
+          }
+
+        return;
+      }
+
+
+/*****************************************************/
+/* Various timer routines.                           */
+/* Al Aburto, aburto@nosc.mil, 08 Oct 1996           */
+/*                                                   */
+/* t = dtime() outputs the current time in seconds.  */
+/* Use CAUTION as some of these routines will mess   */
+/* up when timing across the hour mark!!!            */
+/*                                                   */
+/* For timing I use the 'user' time whenever         */
+/* possible. Using 'user+sys' time is a separate     */
+/* issue.                                            */
+/*                                                   */
+/* Example Usage:                                    */
+/* [timer options added here]                        */
+/* main()                                            */
+/* {                                                 */
+/*  double starttime,benchtime,dtime();              */
+/*                                                   */
+/*  starttime = dtime();                             */
+/*  [routine to time]                                */
+/*  benchtime = dtime() - starttime;                 */
+/* }                                                 */
+/*                                                   */
+/* [timer code below added here]                     */
+/*****************************************************/
+
+/*********************************/
+/* Timer code.                   */
+/*********************************/
+/*******************/
+/*  Amiga dtime()  */
+/*******************/
+#ifdef Amiga
+#include <ctype.h>
+#define HZ 50
+
+SPDP dtime()
+{
+ SPDP q;
+
+ struct tt
+       {
+        long  days;
+        long  minutes;
+        long  ticks;
+       } tt;
+
+ DateStamp(&tt);
+
+ q = ((SPDP)(tt.ticks + (tt.minutes * 60L * 50L))) / (SPDP)HZ;
+
+ return q;
+}
+#endif
+
+/*****************************************************/
+/*  UNIX dtime(). This is the preferred UNIX timer.  */
+/*  Provided by: Markku Kolkka, mk59200@cc.tut.fi    */
+/*  HP-UX Addition by: Bo Thide', bt@irfu.se         */
+/*****************************************************/
+#ifdef UNIX
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#ifdef hpux
+#include <sys/syscall.h>
+#define getrusage(a,b) syscall(SYS_getrusage,a,b)
+#endif
+
+struct rusage rusage;
+
+SPDP dtime()
+{
+ SPDP q;
+
+ getrusage(RUSAGE_SELF,&rusage);
+
+ q = (SPDP)(rusage.ru_utime.tv_sec);
+ q = q + (SPDP)(rusage.ru_utime.tv_usec) * 1.0e-06;
+
+ return q;
+}
+#endif
+
+/***************************************************/
+/*  UNIX_Old dtime(). This is the old UNIX timer.  */
+/*  Use only if absolutely necessary as HZ may be  */
+/*  ill defined on your system.                    */
+/***************************************************/
+#ifdef UNIX_Old
+#include <sys/types.h>
+#include <sys/times.h>
+#include <sys/param.h>
+
+#ifndef HZ
+#define HZ 60
+#endif
+
+struct tms tms;
+
+SPDP dtime()
+{
+ SPDP q;
+
+ times(&tms);
+
+ q = (SPDP)(tms.tms_utime) / (SPDP)HZ;
+
+ return q;
+}
+#endif
+
+/*********************************************************/
+/*  VMS dtime() for VMS systems.                         */
+/*  Provided by: RAMO@uvphys.phys.UVic.CA                */
+/*  Some people have run into problems with this timer.  */
+/*********************************************************/
+#ifdef VMS
+#include time
+
+#ifndef HZ
+#define HZ 100
+#endif
+
+struct tbuffer_t
+       {
+        int proc_user_time;
+        int proc_system_time;
+        int child_user_time;
+        int child_system_time;
+       };
+struct tbuffer_t tms;
+
+SPDP dtime()
+{
+ SPDP q;
+
+ times(&tms);
+
+ q = (SPDP)(tms.proc_user_time) / (SPDP)HZ;
+
+ return q;
+}
+#endif
+
+/******************************/
+/*  BORLAND C dtime() for DOS */
+/******************************/
+#ifdef BORLAND_C
+#include <ctype.h>
+#include <dos.h>
+#include <time.h>
+
+#define HZ 100
+struct time tnow;
+
+SPDP dtime()
+{
+ SPDP q;
+
+ gettime(&tnow);
+
+ q = 60.0 * (SPDP)(tnow.ti_min);
+ q = q + (SPDP)(tnow.ti_sec);
+ q = q + (SPDP)(tnow.ti_hund)/(SPDP)HZ;
+
+ return q;
+}
+#endif
+
+/***************************************/
+/*  Microsoft C (MSC) dtime() for DOS  */
+/*  Also suitable for Watcom C/C++ and */
+/*  some other PC compilers            */
+/***************************************/
+#ifdef MSC
+#include <time.h>
+#include <ctype.h>
+
+#define HZ CLOCKS_PER_SEC
+clock_t tnow;
+
+SPDP dtime()
+{
+ SPDP q;
+
+ tnow = clock();
+ q = (SPDP)tnow / (SPDP)HZ;
+ return q;
+}
+#endif
+
+/*************************************/
+/*  Macintosh (MAC) Think C dtime()  */
+/*************************************/
+#ifdef MAC
+#include <time.h>
+
+#define HZ 60
+
+SPDP dtime()
+{
+ SPDP q;
+
+ q = (SPDP)clock() / (SPDP)HZ;
+
+ return q;
+}
+#endif
+
+/************************************************************/
+/*  iPSC/860 (IPSC) dtime() for i860.                       */
+/*  Provided by: Dan Yergeau, yergeau@gloworm.Stanford.EDU  */
+/************************************************************/
+#ifdef IPSC
+extern double dclock();
+
+SPDP dtime()
+{
+ SPDP q;
+
+ q = dclock();
+
+ return q;
+}
+#endif
+
+/**************************************************/
+/*  FORTRAN dtime() for Cray type systems.        */
+/*  This is the preferred timer for Cray systems. */
+/**************************************************/
+#ifdef FORTRAN_SEC
+
+fortran double second();
+
+SPDP dtime()
+{
+ SPDP q;
+
+ second(&q);
+
+ return q;
+}
+#endif
+
+/***********************************************************/
+/*  UNICOS C dtime() for Cray UNICOS systems.  Don't use   */
+/*  unless absolutely necessary as returned time includes  */
+/*  'user+system' time.  Provided by: R. Mike Dority,      */
+/*  dority@craysea.cray.com                                */
+/***********************************************************/
+#ifdef CTimer
+#include <time.h>
+
+SPDP dtime()
+{
+ SPDP q;
+ clock_t   clock(void);
+
+ q = (SPDP)clock() / (SPDP)CLOCKS_PER_SEC;
+
+ return q;
+}
+#endif
+
+/********************************************/
+/* Another UNIX timer using gettimeofday(). */
+/* However, getrusage() is preferred.       */
+/********************************************/
+#ifdef GTODay
+#include <sys/time.h>
+
+struct timeval tnow;
+
+SPDP dtime()
+{
+ SPDP q;
+
+ gettimeofday(&tnow,NULL);
+ q = (SPDP)tnow.tv_sec + (SPDP)tnow.tv_usec * 1.0e-6;
+
+ return q;
+}
+#endif
+
+/*****************************************************/
+/*  Fujitsu UXP/M timer.                             */
+/*  Provided by: Mathew Lim, ANUSF, M.Lim@anu.edu.au */
+/*****************************************************/
+#ifdef UXPM
+#include <sys/types.h>
+#include <sys/timesu.h>
+struct tmsu rusage;
+
+SPDP dtime()
+{
+ SPDP q;
+
+ timesu(&rusage);
+
+ q = (SPDP)(rusage.tms_utime) * 1.0e-06;
+
+ return q;
+}
+#endif
+
+/**********************************************/
+/*    Macintosh (MAC_TMgr) Think C dtime()    */
+/*   requires Think C Language Extensions or  */
+/*    #include <MacHeaders> in the prefix     */
+/*  provided by Francis H Schiffer 3rd (fhs)  */
+/*         skipschiffer@genie.geis.com        */
+/**********************************************/
+#ifdef MAC_TMgr
+#include <Timer.h>
+#include <stdlib.h>
+
+static TMTask   mgrTimer;
+static Boolean  mgrInited = false;
+static SPDP     mgrClock;
+
+#define RMV_TIMER RmvTime( (QElemPtr)&mgrTimer )
+#define MAX_TIME  1800000000L
+/* MAX_TIME limits time between calls to */
+/* dtime( ) to no more than 30 minutes   */
+/* this limitation could be removed by   */
+/* creating a completion routine to sum  */
+/* 30 minute segments (fhs 1994 feb 9)   */
+
+static void Remove_timer( )
+{
+ RMV_TIMER;
+ mgrInited = false;
+}
+
+SPDP dtime( )
+{
+ if( mgrInited ) {
+        RMV_TIMER;
+        mgrClock += (MAX_TIME + mgrTimer.tmCount)*1.0e-6;
+ } else {
+        if( _atexit( &Remove_timer ) == 0 ) mgrInited = true;
+        mgrClock = 0.0;
+}
+        if( mgrInited ) {
+                mgrTimer.tmAddr = NULL;
+                mgrTimer.tmCount = 0;
+                mgrTimer.tmWakeUp = 0;
+                mgrTimer.tmReserved = 0;
+                InsTime( (QElemPtr)&mgrTimer );
+                PrimeTime( (QElemPtr)&mgrTimer, -MAX_TIME );
+        }
+        return( mgrClock );
+}
+#endif
+
+/***********************************************************/
+/*  Parsytec GCel timer.                                   */
+/*  Provided by: Georg Wambach, gw@informatik.uni-koeln.de */
+/***********************************************************/
+#ifdef PARIX
+#include <sys/time.h>
+
+SPDP dtime()
+{
+ SPDP q;
+
+ q = (SPDP) (TimeNowHigh()) / (SPDP) CLK_TCK_HIGH;
+
+ return q;
+}
+#endif
+
+/************************************************/
+/*  Sun Solaris POSIX dtime() routine           */
+/*  Provided by: Case Larsen, CTLarsen.lbl.gov  */
+/************************************************/
+#ifdef POSIX
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/rusage.h>
+
+#ifdef __hpux
+#include <sys/syscall.h>
+#endif
+
+struct rusage rusage;
+
+SPDP dtime()
+{
+ SPDP q;
+
+ getrusage(RUSAGE_SELF,&rusage);
+
+ q = (SPDP)(rusage.ru_utime.tv_sec);
+ q = q + (SPDP)(rusage.ru_utime.tv_nsec) * 1.0e-09;
+
+ return q;
+}
+#endif
+
+
+/****************************************************/
+/*  Windows NT (32 bit) dtime() routine             */
+/*  Provided by: Piers Haken, piersh@microsoft.com  */
+/****************************************************/
+#ifdef WIN32
+#include <windows.h>
+
+SPDP dtime(void)
+{
+ SPDP q;
+
+ q = (SPDP)GetTickCount() * 1.0e-03;
+
+ return q;
+}
+#endif
diff --git a/tests/TMRregression/unitTests/zeroInit.c b/tests/TMRregression/unitTests/zeroInit.c
index 0c5410b33..22e1171ad 100644
--- a/tests/TMRregression/unitTests/zeroInit.c
+++ b/tests/TMRregression/unitTests/zeroInit.c
@@ -31,5 +31,10 @@ int main() {
 	printf("Calculated: %d, %d\n", acc1, acc2);
 	printf("  Expected: %d, %d\n", 4240, 4220);
 
-	return 0;
+	//check
+	if ( (acc1 == 4240) && (acc2 == 4220) ) {
+		return 0;
+	} else {
+		return -1;
+	}
 }
diff --git a/tests/hifive1/.gitignore b/tests/hifive1/.gitignore
new file mode 100644
index 000000000..27385d772
--- /dev/null
+++ b/tests/hifive1/.gitignore
@@ -0,0 +1,4 @@
+matrixMultiply/mm
+matrixMultiply.tmr/mm_tmr
+sha256/sha256
+sha256.tmr/sha256_tmr
diff --git a/tests/hifive1/sha256.tmr/sha256_tmr.c b/tests/hifive1/sha256.tmr/sha256_tmr.c
index 1a5951377..3c29f18e4 100644
--- a/tests/hifive1/sha256.tmr/sha256_tmr.c
+++ b/tests/hifive1/sha256.tmr/sha256_tmr.c
@@ -4,6 +4,13 @@
 #include "COAST.h"
 __DEFAULT_NO_xMR
 
+#define SIMULATED
+#ifdef SIMULATED
+#define N 2
+#else
+#define N 100
+#endif
+
 #include "platform.h"
 
 #define US_PER_S (1000 * 1000)
@@ -22,7 +29,7 @@ int main() {
     while (1) {
        	uint64_t t1 = get_timer_value();
 
-        for (int i = 0; i < 100; i++) {
+        for (int i = 0; i < N; i++) {
 			sha_run_test();
 			error = checkGolden();
 			if (error)
diff --git a/tests/hifive1/sha256/sha256.c b/tests/hifive1/sha256/sha256.c
index 9621176a9..c90ff4962 100644
--- a/tests/hifive1/sha256/sha256.c
+++ b/tests/hifive1/sha256/sha256.c
@@ -4,6 +4,13 @@
 #include "COAST.h"
 __DEFAULT_NO_xMR
 
+#define SIMULATED
+#ifdef SIMULATED
+#define N 2
+#else
+#define N 100
+#endif
+
 #include "platform.h"
 
 #define US_PER_S (1000 * 1000)
@@ -17,21 +24,21 @@ typedef uint32_t mm_t;
 
 int main() {
     uint32_t timer_freq = (unsigned) ( get_timer_freq() & 0xFFFFFFFF);
-    
+
     while (1) {
        	uint64_t t1 = get_timer_value();
 
-        for (int i = 0; i < 100; i++) {
+        for (int i = 0; i < N; i++) {
 			sha_run_test();
 			error = checkGolden();
 			if (error)
 				break;
 		}
-        uint64_t t2 = get_timer_value();        
-        
+        uint64_t t2 = get_timer_value();
+
         uint32_t t = US_PER_S * (t2 - t1) / (float)timer_freq;
-                
+
         printf("C:0 E:%u F:0 T:%uus\n", error, t);
 
     }
-}
\ No newline at end of file
+}
diff --git a/tests/makefiles/Makefile.common b/tests/makefiles/Makefile.common
index c0230d083..cf7737f28 100644
--- a/tests/makefiles/Makefile.common
+++ b/tests/makefiles/Makefile.common
@@ -4,7 +4,7 @@ error:
 	@echo "Please choose a target"
 	@exit 2
 
-MAKEFILES := Makefile $(shell find $(LEVEL)/makefiles)
+MAKEFILES := $(notdir Makefile $(shell find $(LEVEL)/makefiles/) )
 
 include $(LEVEL)/makefiles/config
 include $(LEVEL)/makefiles/colors
diff --git a/tests/makefiles/Makefile.compile.hercules b/tests/makefiles/Makefile.compile.hercules
index 6eb148092..a496f08d4 100644
--- a/tests/makefiles/Makefile.compile.hercules
+++ b/tests/makefiles/Makefile.compile.hercules
@@ -16,7 +16,7 @@ COMPILER_ROOT = $(CCS_ROOT)/../tools/compiler
 
 #Change these every time you update CCS
 ARM_GCC_DIR   = $(COMPILER_ROOT)/gcc-arm-none-eabi-7-2017-q4-major
-ARM_COMPILER  = $(COMPILER_ROOT)/ti-cgt-arm_18.1.4.LTS
+ARM_COMPILER  = $(COMPILER_ROOT)/ti-cgt-arm_18.12.2.LTS
 ARMCL 		  = $(ARM_COMPILER)/bin/armcl
 
 #--------------------------------------------------------------
@@ -40,7 +40,8 @@ HEAP_SIZE  = 0x800
 
 # the HALCoGen code is built for gcc, but we're using Clang
 #	so some predefined symbols don't match names exactly
-REDEFINES = -D'__WCHAR_T_TYPE__=unsigned short' -D'__SIZE_T_TYPE__=unsigned' -D'__PTRDIFF_T_TYPE__=int'
+REDEFINES = -D'__PTRDIFF_T_TYPE__=int'
+# -D'__WCHAR_T_TYPE__=unsigned short' -D'__SIZE_T_TYPE__=unsigned' 
 
 #--------------------------------------------------------------
 # Compiler Flags
@@ -48,8 +49,8 @@ REDEFINES = -D'__WCHAR_T_TYPE__=unsigned short' -D'__SIZE_T_TYPE__=unsigned' -D'
 #--------------------------------------------------------------
 DEV_INCS = -nostdinc -I "$(PROJ_DIR)/" -I "$(PROJ_DIR)/include/" -I "$(ARM_COMPILER)/include/" -I "$(THIS_DIR)"
 DEV_OTHER_FLAGS = --preproc_with_compile
-CLANG_FLAGS := -fcolor-diagnostics --target=armv7r-eabi $(REDEFINES) -fshort-wchar
-DEV_CFLAGS 	:= -mv7R4 --code_state=32 -g --abi=eabi --enum_type=packed --diag_warning=225 --diag_wrap=off --display_error_number --enum_type=packed --wchar_t=16
+CLANG_FLAGS := -fcolor-diagnostics --target=armv7r-eabi $(REDEFINES) -fshort-wchar $(USER_CFLAGS)
+DEV_CFLAGS 	:= -mv7R4 --code_state=32 -g --abi=eabi --enum_type=packed --diag_warning=225 --diag_wrap=off --display_error_number --enum_type=packed --wchar_t=16 --preproc_with_compile  --asm_define=__clang__
 CL_INCS 	:= -i"$(PROJ_DIR)/" -i"$(PROJ_DIR)/include/" -i"$(ARM_COMPILER)/include/"
 LINK_INCS 	:= -i"$(ARM_COMPILER)/lib/" -i"$(CCS_ROOT)/arm/include/"
 DEV_LFLAGS 	:= --issue_remarks -z -m"$(TARGET).map" --heap_size=$(HEAP_SIZE) --stack_size=$(STACK_SIZE) -i"$(ARM_COMPILER)/lib" -i"$(ARM_COMPILER)/include" --reread_libs --diag_wrap=off --display_error_number --warn_sections --xml_link_info="$(TARGET)_linkInfo.xml" --rom_model
@@ -106,14 +107,14 @@ ASM_OBJS=$(patsubst %.asm,$(BUILD_DIR)/%.o,$(notdir $(ASM_SRCS)))
 # the fact that some of the source files are already in assembly form means that
 #	 the parts of the code contained therein cannot be protected with COAST
 
-ifeq ($(SERIES),TMS570LC43xx)
-exe:
-	@make all -C ./Debug --quiet
+# ifeq ($(SERIES),TMS570LC43xx)
+# exe:
+# 	@make all -C ./Debug --quiet
 
-clean:
-	@make clean -C ./Debug --quiet
+# clean:
+# 	@make clean -C ./Debug --quiet
 
-else
+# else
 
 exe: $(BUILD_DIR)/$(TARGET).out
 
@@ -208,11 +209,11 @@ PRGMA_SRCH_PTRN := *.c.bak
 fix_pragmas:
 	sed -i -E 's/#pragma WEAK\([a-zA-Z0-9]*\)/__attribute__ ((weak))\n&/' $(PRGMA_SRCH_PTRN)
 
-endif
+# endif
 
 # make sure our library file exists
 # https://stackoverflow.com/questions/1789594/how-do-i-write-the-cd-command-in-a-makefile
-$(LIBRARY_NAME):
+$(ARM_COMPILER)/lib/$(LIBRARY_NAME):
 	@cd $(ARM_COMPILER)/lib ; \
 	echo -e $(COLOR_BLACK)Making run-time library$(NO_COLOR); \
 	./mklib --index=libc.a --pattern=$(LIBRARY_NAME) --compiler_bin_dir=$(ARM_COMPILER)/bin; \
diff --git a/tests/makefiles/Makefile.compile.llvmLLI b/tests/makefiles/Makefile.compile.llvmLLI
index 0b3d814f6..c97b335d2 100644
--- a/tests/makefiles/Makefile.compile.llvmLLI
+++ b/tests/makefiles/Makefile.compile.llvmLLI
@@ -28,14 +28,14 @@ $(TARGET).lbc: $(BCFILES) $(BCPPFILES)
 
 ################# CLANG #################
 # Lowest level target should depend on the Makefiles
-%.clang.bc: %.c $(MAKEFILES)
+%.clang.bc: %.c
 	@echo -e $(COLOR_BLUE) Building $@ $(NO_COLOR)
 	@echo '  'flags = $(CLANG_FLAGS)
 	@$(CLANG) $(INCS) $(CLANG_FLAGS) -emit-llvm $< -c -o $@
 
 ################ CLANG++ ################
 # Lowest level target should depend on the Makefiles
-%.clang.bcpp: %.cpp $(MAKEFILES)
+%.clang.bcpp: %.cpp
 	@echo -e $(COLOR_BLUE) Building $@ $(NO_COLOR)
 	@echo '  'flags = $(CLANG_FLAGS)
 	$(CLANG++) $(INCS) $(CLANG_FLAGS) -emit-llvm $< -c -o $@
diff --git a/tests/makefiles/Makefile.compile.pynq b/tests/makefiles/Makefile.compile.pynq
index f0f1e1593..799d287b2 100644
--- a/tests/makefiles/Makefile.compile.pynq
+++ b/tests/makefiles/Makefile.compile.pynq
@@ -2,7 +2,9 @@
 # Sources
 PROJECT_SRC := ${CURDIR} $(SRC_DIRS)
 CSRCS 		:= $(foreach dir,$(PROJECT_SRC),$(wildcard $(dir)/*.c))
-BUILD_DIR	:= ./build
+BUILD_DIR	?= ./build
+INC_DIRS	?=
+BC_FILES	:= $(patsubst %.c,$(BUILD_DIR)/%.bc,$(notdir $(CSRCS)))
 
 ################################################################################
 # dependencies
@@ -15,11 +17,20 @@ BSP_INC 	:= $(BSP_DIR)/include
 ################################################################################
 # Flags
 LIB_INCS	:= -I"$(SDK_TRIPLE)/lib/gcc/arm-none-eabi/7.2.1/include/"  -I"$(SDK_TRIPLE)/lib/gcc/arm-none-eabi/7.2.1/include-fixed/" -I"$(SDK_TRIPLE)/arm-none-eabi/include/" -I"$(SDK_TRIPLE)/arm-none-eabi/libc/usr/include/"
-SRC_INCS	:= -nostdinc -I"$(BSP_INC)/" $(LIB_INCS)
+NEW_INCS	:= $(addprefix -I,$(INC_DIRS))
+SRC_INCS	:= -nostdinc -I"$(BSP_INC)/" $(LIB_INCS) $(NEW_INCS)
 LIB_DIR		?= $(BSP_DIR)
 LIBS		:= -lxil -lgcc -lc
 LIBS 		+= $(PROJ_LIBS)
 
+# NEON support
+ifneq ($(ARM_NEON),)
+FPU_NAME := neon
+LIB_INCS := -I"$(COAST_ROOT)/build/lib/clang/7.0.0/include/" -I"$(COAST_ROOT)/llvm/tools/clang/lib/Headers/" $(LIB_INCS)
+else
+FPU_NAME := vfpv3
+endif
+
 # tricky stuff to make a comma-separated list
 # https://stackoverflow.com/a/7531247
 null	:=
@@ -29,6 +40,12 @@ LIBS	:= $(subst $(space),$(comma),$(strip $(LIBS)))
 
 CFLAGS 		:= -Wall -std=c99 $(USER_CFLAGS)
 CLANG_FLAGS := -fcolor-diagnostics -target arm-none-eabi $(CFLAGS) -fshort-enums -nostdlib
+# apparently clang does not correctly set some macros, so we have to do it manually
+#  if we want to use NEON intrinsics
+ifneq ($(ARM_NEON),)
+CLANG_FLAGS += -mfloat-abi=softfp -mfpu=neon-fp-armv8 -D"__ARM_NEON"
+endif
+CLANG_FLAGS += $(addprefix -D,$(USER_DEFS))
 
 LLC_FLAGS 	:= -asm-verbose -filetype=asm -march=arm -mcpu=cortex-a9 -mattr=+vfp3 -float-abi=hard
 
@@ -73,8 +90,8 @@ $(BUILD_DIR)/$(NEW_LINK_F): $(LNK_SCRIPT)
 ################################################################################
 # Link everything together                                          		   #
 ################################################################################
-LD_FLAGS	:= -fdiagnostics-color -fshort-enums -mcpu=cortex-a9 -mfpu=vfpv3 -mfloat-abi=hard -mhard-float -Wl,--build-id=none -specs=$(SPEC_SRC) -Wl,-T -Wl,$(BOARD_SW)/lscript.ld -Wl,-Map,$(BUILD_DIR)/$(TARGET).map
-LD_LIBS		:= -L$(BUILD_DIR) -L$(LIB_DIR) -Wl,--start-group,$(LIBS),--end-group
+LD_FLAGS	:= -fdiagnostics-color -fshort-enums -mcpu=cortex-a9 -mfpu=$(FPU_NAME) -mfloat-abi=hard -mhard-float -Wl,--build-id=none -specs=$(SPEC_SRC) -Wl,-T -Wl,$(BOARD_SW)/lscript.ld -Wl,-Map,$(BUILD_DIR)/$(TARGET).map
+LD_LIBS		:= -Wl,-L$(BUILD_DIR),-L$(LIB_DIR) -Wl,--start-group,$(LIBS),--end-group
 
 $(BUILD_DIR)/$(TARGET).elf: $(BSP_LIB) $(BUILD_DIR)/$(TARGET).o | $(BUILD_DIR)/$(NEW_LINK_F)
 	@echo -e $(COLOR_MAGENTA)linking with libraries $(NO_COLOR)
@@ -86,7 +103,7 @@ $(BUILD_DIR)/$(TARGET)_%.elf: $(BUILD_DIR)/$(TARGET)_%.o $(BSP_LIB) | $(BUILD_DI
 	@echo -e $(COLOR_MAGENTA)linking with libraries $(NO_COLOR)
 	@echo -e '  'flags = $(LD_FLAGS)
 	@echo -e '  'libs = $(LD_LIBS)
-	$(LD) -g -fdiagnostics-color -fshort-enums -mcpu=cortex-a9 -mfpu=vfpv3 -mfloat-abi=hard -mhard-float -Wl,--build-id=none -specs=$(SPEC_SRC) -Wl,-T -Wl,$(BOARD_SW)/lscript_$*.ld -Wl,-Map,$(BUILD_DIR)/$(TARGET).map $^ -o $@ -L$(BUILD_DIR) -L$(LIB_DIR) -Wl,--start-group,-lxil_$*,-lgcc,-lc,--end-group
+	$(LD) -g -fdiagnostics-color -fshort-enums -mcpu=cortex-a9 -mfpu=$(FPU_NAME) -mfloat-abi=hard -mhard-float -Wl,--build-id=none -specs=$(SPEC_SRC) -Wl,-T -Wl,$(BOARD_SW)/lscript_$*.ld -Wl,-Map,$(BUILD_DIR)/$(TARGET).map $^ -o $@ -L$(BUILD_DIR) -L$(LIB_DIR) -Wl,--start-group,-lxil_$*,-lgcc,-lc,--end-group
 
 ################################################################################
 # Create object file (machine code) from assembly  			                   #
@@ -142,7 +159,7 @@ $(BUILD_DIR)/$(TARGET)_%.opt.bc: $(BUILD_DIR)/$(TARGET)_%_linked.bc
 # Linking the modules together			                                       #
 ################################################################################
 
-$(BUILD_DIR)/$(TARGET)_linked.bc: $(patsubst %.c,$(BUILD_DIR)/%.bc,$(notdir $(CSRCS)))
+$(BUILD_DIR)/$(TARGET)_linked.bc: $(BC_FILES)
 	@echo -e $(COLOR_MAGENTA)linking .bc files $(NO_COLOR)
 	@$(LLVM_LINK) $^ -o $@
 
@@ -161,13 +178,21 @@ $(BUILD_DIR)/$(TARGET)_1_linked.bc: $(patsubst %.c,$(BUILD_DIR)/%.bc.1,$(notdir
 
 # have to create a separate rule for each source directory
 #  but we can use a special construct to generate rules
+SUFFIXES	:= _linked.bc _linked.ll .s .opt.ll .opt.bc .o .map .elf
+EXE_FILES	:= $(addprefix $(TARGET),$(SUFFIXES))
+EXE_FILES	:= $(addprefix $(BUILD_DIR)/, $(EXE_FILES))
 
 define bc_file_compile =
 dir := $(1)
 
-$$(BUILD_DIR)/%.bc: $$(dir)/%.c $(MAKEFILES) $(INC_FILES) | $$(BUILD_DIR)/
+C_SRCS	 	:= $$(wildcard $$(dir)/*.c)
+C_SRC_NEW	:= $$(addprefix $$(BUILD_DIR)/,$$(notdir $$(C_SRCS)))
+EXE_FILES 	+= $$(patsubst %.c,%.bc, $$(C_SRC_NEW))
+EXE_FILES 	+= $$(patsubst %.c,%.ll, $$(C_SRC_NEW))
+
+$$(BUILD_DIR)/%.bc: $$(dir)/%.c $(INC_FILES) | $$(BUILD_DIR)/
 	@echo -e $$(COLOR_BLUE)Building $$(notdir $$@)$$(NO_COLOR)
-	$$(CLANG) -emit-llvm $$(CLANG_FLAGS) $$(SRC_INCS) $$< -c -o $$@
+	@$$(CLANG) -emit-llvm $$(CLANG_FLAGS) $$(SRC_INCS) $$< -c -o $$@
 
 $$(BUILD_DIR)/%.bc.0: $$(dir)/%.c $(MAKEFILES) $(INC_FILES) | $$(BUILD_DIR)/
 	@echo -e $$(COLOR_BLUE)Building $$(notdir $$@)$$(NO_COLOR)
@@ -181,43 +206,58 @@ endef
 
 $(foreach dir,$(PROJECT_SRC),$(eval $(call bc_file_compile,$(dir))))
 
+
 ################################################################################
 # Rules for building the BSP		                                           #
 ################################################################################
 
 .PHONY: bsp bsp_lib bsp_include
 
-BSP_DIRS	:= $(shell find ${BSP_DIR}/libsrc -type f -print)
+BSP_DIRS	:= $(shell find ${BSP_DIR}/ps7_cortexa9_0/libsrc/*/src/ -type d -print)
 # all the dirs with source files for compiling BSP
-BSP_SRCS	:= $(filter %/src/Makefile,$(BSP_DIRS))
-BSP_SRCS	:= $(filter %/Makefile,$(BSP_SRCS))
-BSP_SRCS 	:= $(dir $(BSP_SRCS))
+BSP_SRCS	:= $(filter %/src/,$(BSP_DIRS))
+BSP_FRTOS_SRC := $(filter %/freertos10_xilinx_v1_1/src/,$(BSP_SRCS))
+BSP_STALN_SRC := $(filter %/standalone_v6_7/src/,$(BSP_SRCS))
+
 # debug
 # BSP_SRCS 	:= $(word 1, $(BSP_SRCS))
-BSP_SFLAGS	:= "SHELL=/bin/sh" "COMPILER=arm-none-eabi-gcc" "ARCHIVER=arm-none-eabi-ar" "COMPILER_FLAGS=  -O2 -c" "EXTRA_COMPILER_FLAGS=-mcpu=cortex-a9 -mfpu=vfpv3 -mfloat-abi=hard -nostartfiles -g -Wall -Wextra"
-BSP_BUILD	:= "BUILD_DIR=$(BUILD_DIR)"
+BSP_LIBX_NAME	:= libxil.a
+BSP_SFLAGS		:= "SHELL=/bin/sh"
+BSP_BUILD		:= "BUILD_DIR=$(abspath $(BUILD_DIR))"
+BSP_LEVEL		:= "LEVEL=$(abspath $(LEVEL))"
+BSP_LIB			:= $(BSP_DIR)/$(BSP_LIBX_NAME)
+BSP_LIB_PATH	:= "LIB=$(abspath $(BSP_LIB))"
+BSP_MAKE		:= $(CURDIR)/makefile.bspsrc
+BSP_ADD_DEF		:= "XDEFS=$(BSP_XDEFS)"
+BSP_ALL_FLAGS	:= $(BSP_SFLAGS) $(BSP_LEVEL) $(BSP_BUILD) $(BSP_ADD_DEF) $(BSP_LIB_PATH)
 
 
 # there are some weird Make errors that come from reading the target Makefile twice
 # not sure how to get rid of them, but they aren't causing any problems
 # https://www.gnu.org/software/make/manual/html_node/Error-Messages.html
 # see warning on ignoring & overriding
+# @{ { make -C $(1) -s include $(BSP_SFLAGS); } 2>&3 | sed 's/^/STDOUT: /'; } 3>&1 1>&2 | sed 's/^/STDERR: /' | sed '/: warning: /d'
 define bsp_inc_comp =
 @echo -e $(COLOR_MAGENTA)Running make include: $(NO_COLOR)$(1)
-@make -C $(1) -s include $(BSP_SFLAGS) 2>&1 |sed '/: warning: /d'
+@make -C $(1) --file=$(BSP_MAKE) --no-print-directory -s include $(BSP_SFLAGS) $(BSP_LEVEL) 2>&1 | sed '/: warning: /d'
 
 endef
 # $(BSP_BUILD)
 
 define bsp_lib_comp =
 @echo -e $(COLOR_MAGENTA)Running make libs $(1) $(NO_COLOR)
-@make -C $(1) -s libs $(BSP_SFLAGS) 2>&1 | sed '/: warning: /d'
+@make -C $(1) --file=$(BSP_MAKE) --no-print-directory -s libs $(BSP_ALL_FLAGS) -j4 2>&1 | sed '/: warning: /d'
+
+endef
+
+define bsp_lib_clean =
+@make -C $(1) --file=$(BSP_MAKE) --no-print-directory -s clean $(BSP_ALL_FLAGS) -j4 2>&1 | sed '/: warning: /d'
 
 endef
 
 bsp_lib:
 	@echo -e $(COLOR_YELLOW)compiling bsp $(NO_COLOR)
-	$(foreach dir,$(BSP_SRCS),$(call bsp_lib_comp,$(dir)))
+	$(foreach dir,$(BSP_SRCS),$(call bsp_lib_comp,$(abspath $(dir))))
 
 bsp_include:
 	@echo -e $(COLOR_YELLOW)including bsp $(NO_COLOR)
@@ -225,25 +265,43 @@ bsp_include:
 
 bsp: $(BSP_LIB)
 
-$(BSP_LIB):
-	make bsp_include
-	make bsp_lib
+$(BSP_LIB): | $(BUILD_DIR)/
+	@make --no-print-directory bsp_include
+	@make --no-print-directory bsp_lib
 
 ################################################################################
 # Debug stuff              			                                           #
 ################################################################################
 
-.PHONY: clean print
+.PHONY: clean superclean clean_exe print print_bsp clean_bsp clean_opt
+
+superclean: clean clean_bsp
 
+# the -delete flag is not POSIX, so use exec rm if otherwise
+# @find $(BUILD_DIR)/ ! -name $(BSP_LIBX_NAME) -type f -delete
 clean:
 	@rm -rf $(BUILD_DIR)
 
+clean_exe:
+	@rm -f $(EXE_FILES)
+
+clean_lib:
+	@rm -f $(BSP_LIB)
+
+clean_bsp:
+	$(foreach dir,$(BSP_SRCS),$(call bsp_lib_clean, $(dir)))
+
+clean_opt:
+	@rm -f $(BUILD_DIR)/$(TARGET).opt.bc
+
 print:
+	@echo $(CURDIR)
 	@echo $(PROJECT_SRC)
 	@echo $(CSRCS)
-	@echo $(BSP_DIR)
-	@echo $(LIBS)
 
 print_bsp:
 	@echo $(BSP_SRCS)
 	@echo $(words $(BSP_SRCS))
+	@echo $(BSP_FRTOS_SRC)
+	@echo $(BSP_LIB)
+	@echo $(BSP_ADD_DEF)
diff --git a/tests/makefiles/Makefile.compile.x86 b/tests/makefiles/Makefile.compile.x86
index d3ddcdc7a..137b17071 100644
--- a/tests/makefiles/Makefile.compile.x86
+++ b/tests/makefiles/Makefile.compile.x86
@@ -12,7 +12,7 @@ CPP_SRCS	= $(wildcard $(PROJECT_SRC)/*.cpp)
 BCFILES		= $(patsubst %.c,%.clang.bc,$(notdir $(C_SRCS)))
 BCPPFILES	= $(patsubst %.cpp,%.clang.bcpp,$(notdir $(CPP_SRCS)))
 
-CLANG_FLAGS := -fcolor-diagnostics
+CLANG_FLAGS := -fcolor-diagnostics $(USER_CFLAGS)
 XLFLAGS 	?= -lm
 INCS		:=-I$(LEVEL)
 
@@ -51,7 +51,7 @@ $(TARGET).lbc: $(BCFILES) $(BCPPFILES)
 
 ################# CLANG #################
 # Lowest level target should depend on the Makefiles
-%.clang.bc: %.c $(MAKEFILES)
+%.clang.bc: %.c
 	@echo -e $(COLOR_BLUE) Building $@ $(NO_COLOR)
 	@echo '  'flags = $(CLANG_FLAGS)
 	@$(CLANG) $(INCS) $(CLANG_FLAGS) -emit-llvm $< -c -o $@
diff --git a/tests/makefiles/Makefile.program b/tests/makefiles/Makefile.program
index 2e081bbbb..85de82295 100644
--- a/tests/makefiles/Makefile.program
+++ b/tests/makefiles/Makefile.program
@@ -62,7 +62,7 @@ else ifeq ($(BOARD), $(BOARD_TMS1224))
 	@rm -rf $(TMP_WORKSPACE)
 
 else ifeq ($(BOARD), $(BOARD_TMS4357))
-	@$(DEBUGSERVER_ROOT)/../../eclipse/ccstudio $(DSS_OPTIONS) "$(FILE_PATH) ./Debug/$(TARGET).out $(PROJ_DIR)/targetConfigs/TMS570LC43xx.ccxml"
+	@$(DEBUGSERVER_ROOT)/../../eclipse/ccstudio $(DSS_OPTIONS) "$(FILE_PATH) ./build/$(TARGET).out $(PROJ_DIR)/targetConfigs/TMS570LC43xx.ccxml"
 	@rm -rf $(TMP_WORKSPACE)
 
 else
diff --git a/tests/makefiles/config b/tests/makefiles/config
index a74d265dc..6dc81518c 100644
--- a/tests/makefiles/config
+++ b/tests/makefiles/config
@@ -12,6 +12,6 @@ BOARD ?= $(BOARD_LLI)
 export COAST_ROOT = $(LEVEL)/..
 
 VIVADO = /opt/Xilinx/Vivado/2018.2
-CCS_ROOT = /opt/ti/ccsv8/ccs_base
+CCS_ROOT = /opt/ti/ccs910/ccs/ccs_base
 export XILINX_SDK = /opt/Xilinx/SDK/2018.2
 FREEDOM_SDK=$(HOME)/freedom_sdk
diff --git a/tests/matrixMultiply/matrixMultiply.c b/tests/matrixMultiply/matrixMultiply.c
index a099e870a..5962fb64c 100644
--- a/tests/matrixMultiply/matrixMultiply.c
+++ b/tests/matrixMultiply/matrixMultiply.c
@@ -112,12 +112,12 @@ void matrix_multiply(int f_matrix[][side], int s_matrix[][side], unsigned r_matr
 }
 
 __attribute__((noinline))
-int checkGolden() {
+int checkGolden() __NO_xMR {
 	int __xMR num_of_errors = 0;
 	int i = 0;
 	int j = 0;
 
-	for(i=0; i<side; i++) {
+	for(i = 0; i < side; i++) {
 		for (j = 0; j < side; j++) {
 			if (golden[i][j] != results_matrix[i][j]) {
 				num_of_errors++;
diff --git a/tests/pynq/.gitignore b/tests/pynq/.gitignore
new file mode 100644
index 000000000..c3f4e5d0c
--- /dev/null
+++ b/tests/pynq/.gitignore
@@ -0,0 +1,2 @@
+matrixMultiply/Xilinx.spec
+matrixMultiply.tmr/Xilinx.spec
diff --git a/tests/pynq/matrixMultiply.tmr/Makefile b/tests/pynq/matrixMultiply.tmr/Makefile
index 4319189e1..076d6b59e 100644
--- a/tests/pynq/matrixMultiply.tmr/Makefile
+++ b/tests/pynq/matrixMultiply.tmr/Makefile
@@ -1,8 +1,8 @@
 LEVEL = ../..
 TARGET = mm_tmr
-OPT_PASSES = -TMR -verbose -countErrors -O3 
+OPT_PASSES = -O3 -TMR -verbose -countErrors
 OPT_FLAGS =
-USER_CFLAGS = -O3
+USER_CFLAGS = -O3 -g3
 
 BOARD = pynq
 
diff --git a/tests/pynq/matrixMultiply.tmr/mm_tmr.c b/tests/pynq/matrixMultiply.tmr/mm_tmr.c
index 4a9c24d0d..72f30368c 100644
--- a/tests/pynq/matrixMultiply.tmr/mm_tmr.c
+++ b/tests/pynq/matrixMultiply.tmr/mm_tmr.c
@@ -16,16 +16,18 @@ __DEFAULT_NO_xMR
 
 #define SIMULATED
 #ifdef SIMULATED
-#define N 2
+#define N 5
 #else
 #define N 1000
 #endif
 
-int32_t TMR_ERROR_CNT;
-unsigned error;
+#define US_PER_S (1000 * 1000)
 
+unsigned error;
 typedef uint32_t mm_t;
 
+uint32_t TMR_ERROR_CNT;
+
 #include "mm.inc"
 
 #include "../../mm_common/mm_common_tmr.c"
@@ -49,8 +51,8 @@ int main()
 		}
 		XTime_GetTime(&tEnd);
 
-		float t = ((float) (tEnd - tStart)) / COUNTS_PER_SECOND;
-		printf("C:%d E:%d F:%d T:%fs\n\r", CORE, error, TMR_ERROR_CNT, t);
+		uint32_t t = US_PER_S *((float) (tEnd - tStart)) / COUNTS_PER_SECOND;
+		printf("C:0 E:%u F:%u T:%uus\n\r", error, TMR_ERROR_CNT, t);
 	}
     return 0;
 }
diff --git a/tests/pynq/matrixMultiply/Makefile b/tests/pynq/matrixMultiply/Makefile
index 78206cde6..cec15fcec 100644
--- a/tests/pynq/matrixMultiply/Makefile
+++ b/tests/pynq/matrixMultiply/Makefile
@@ -2,7 +2,7 @@ LEVEL = ../..
 TARGET = mm
 OPT_PASSES = -O3
 OPT_FLAGS =
-USER_CFLAGS = -O3
+USER_CFLAGS = -O3 -g3
 
 BOARD = pynq
 
diff --git a/tests/pynq/matrixMultiply/mm.c b/tests/pynq/matrixMultiply/mm.c
index d68cb4f8d..7b2d08a05 100644
--- a/tests/pynq/matrixMultiply/mm.c
+++ b/tests/pynq/matrixMultiply/mm.c
@@ -16,13 +16,14 @@ __DEFAULT_NO_xMR
 
 #define SIMULATED
 #ifdef SIMULATED
-#define N 2
+#define N 5
 #else
 #define N 1000
 #endif
 
-unsigned error;
+#define US_PER_S (1000 * 1000)
 
+unsigned error;
 typedef uint32_t mm_t;
 
 #include "mm.inc"
@@ -48,8 +49,8 @@ int main()
 		}
 		XTime_GetTime(&tEnd);
 
-		float t = ((float) (tEnd - tStart)) / COUNTS_PER_SECOND;
-		printf("C:%d E:%d F:0 T:%fs\n\r", CORE, error, t);
+		uint32_t t = US_PER_S *((float) (tEnd - tStart)) / COUNTS_PER_SECOND;
+		printf("C:%d E:%d F:0 T:%uus\n\r", CORE, error, t);
 	}
     return 0;
 }