From 8e2d58d166281dcd70e4720b5376eb854856d6c8 Mon Sep 17 00:00:00 2001
From: Anthony Castaldo <TonyCastaldo@icl.utk.edu>
Date: Wed, 11 May 2022 15:36:54 -0400
Subject: [PATCH 1/5] Adds stevx2, openmp parallel-only, one node. stevx2.cc is
 the main routine called by users; stevx2_bisection.cc is the parallel
 reursive subroutine using omp tasks. This relies on geqrf and unmqr to
 parallelize orthogonalization of the eigenvectors. test_stevx2.cc performs
 simple tests, using the Kahan matrix (internal) to verify correct eigenvalue
 performance and correct eigenvectors. lapackpp contains the new sturm
 sequence and a tester, separately committed.

---
 include/slate/slate.hh  |  76 +++++
 lapackpp                |   2 +-
 src/stevx2.cc           | 651 ++++++++++++++++++++++++++++++++++++++++
 src/stevx2_bisection.cc | 416 +++++++++++++++++++++++++
 test/CMakeLists.txt     |   8 +-
 test/test.cc            |   1 +
 test/test.hh            |   1 +
 test/test_stevx2.cc     | 309 +++++++++++++++++++
 8 files changed, 1462 insertions(+), 2 deletions(-)
 create mode 100644 src/stevx2.cc
 create mode 100644 src/stevx2_bisection.cc
 create mode 100644 test/test_stevx2.cc
diff --git a/include/slate/slate.hh b/include/slate/slate.hh
index 39a3542a1..ccaa6f18b 100644
--- a/include/slate/slate.hh
+++ b/include/slate/slate.hh
@@ -996,6 +996,82 @@ void steqr2(
     Matrix<scalar_t>& Z,
     Options const& opts = Options());
 
+//-----------------------------------------
+template <typename real_t>
+struct stevx2_stein_array_t
+{
+    std::vector< lapack_int > iblock;
+    std::vector< lapack_int > isplit;
+    std::vector< real_t > work;
+    std::vector< lapack_int > iwork;
+    std::vector< lapack_int > ifail;
+};
+
+//-----------------------------------------
+template <typename real_t>
+struct stevx2_control_t
+{
+    int64_t   n;
+    const real_t* diag;
+    const real_t* offd;
+    lapack::Range range;        // Value or Index.
+    lapack::Job jobtype;        // NoVec or Vec.
+    int64_t   il;               // For Range Index least index desired.
+    int64_t   iu;               // For Range index max index desired.
+    stevx2_stein_array_t<real_t>* stein_arrays; // workspaces.
+    int64_t base_idx;           // Number of EV less than user's low threshold.
+    int64_t error;              // first error, if non-zero.
+    real_t* pval;               // where to store eigenvalues.
+    real_t* pvec;               // where to store eigenvectors.
+    int64_t* pmul;              // where to store Multiplicity.
+};
+
+//-----------------------------------------
+template <typename real_t>
+void stevx2_bisection(
+    stevx2_control_t<real_t>* control,
+    real_t lower_bound,
+    real_t upper_bound,
+    int64_t n_lt_low,
+    int64_t n_lt_hi,
+    int64_t num_ev);
+
+//-----------------------------------------
+template <typename scalar_t>
+void stevx2_get_col_vector(
+    Matrix<scalar_t>& source,
+    std::vector<scalar_t>& v,
+    int col);
+
+//-----------------------------------------
+template <typename scalar_t>
+void stevx2_put_col_vector(
+    std::vector<scalar_t>& v,
+    Matrix<scalar_t>& dest,
+    int col);
+
+//-----------------------------------------
+template <typename scalar_t>
+void stevx2_stmv(
+    const scalar_t* diag, const scalar_t* offd, const int64_t n,
+    std::vector< scalar_t >& X, std::vector< scalar_t >& Y);
+
+//-----------------------------------------
+template <typename scalar_t>
+scalar_t stevx2_stepe(
+    const scalar_t* diag,  const scalar_t* offd, int64_t n,
+    scalar_t u, std::vector< scalar_t >& v);
+
+//-----------------------------------------
+// stevx2()
+template <typename scalar_t>
+void stevx2(
+    const lapack::Job jobtype, const lapack::Range range,
+    const std::vector< scalar_t >& diag, const std::vector< scalar_t >& offd,
+    scalar_t vl, scalar_t vu, int64_t il, int64_t iu,
+    std::vector< scalar_t >& eig_val, std::vector< int64_t >& eig_mult,
+    Matrix< scalar_t >& eig_vec, MPI_Comm mpi_comm);
+
 } // namespace slate
 
 //-----------------------------------------
diff --git a/lapackpp b/lapackpp
index 0e40e4ba8..a42684000 160000
--- a/lapackpp
+++ b/lapackpp
@@ -1 +1 @@
-Subproject commit 0e40e4ba87786e0372195f4e83c277c4eda98d1b
+Subproject commit a42684000150619d8c80123e9311727b61f53d4f
diff --git a/src/stevx2.cc b/src/stevx2.cc
new file mode 100644
index 000000000..b66b06e3e
--- /dev/null
+++ b/src/stevx2.cc
@@ -0,0 +1,651 @@
+// Copyright (c) 2017-2020, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "internal/internal.hh"
+#include "internal/internal_util.hh"
+
+#include <string.h>
+#include <omp.h>
+#include <lapack/config.h>
+#include <lapack/util.hh> // Gives lapack::Job::NoVec or Vec.
+#include "lapack/fortran.h"
+
+#include <vector>
+#include <ctgmath>
+
+namespace slate {
+
+//------------------------------------------------------------------------------
+/// @ingroup svd_specialization
+//
+///  Only real versions (s,d).
+/// Symmetric Tridiagonal Eigenvalues/pairs by range.
+/// Computes a caller-selected range of eigenvalues and, optionally,
+/// eigenvectors of a symmetric tridiagonal matrix A.  Eigenvalues and
+/// eigenvectors can be selected by specifying either a range of values or a
+/// range of indices for the desired eigenvalues.
+/// This is similiar to LAPACK dstevx, with more output parameters.
+//
+// Upon return, the vector and matrix will have sizes indicating the number of
+// unique eigenvalues/pairs found. We also return a multiplicity vector of the
+// same size. A real symmetric matrix in NxN should have N distinct eigenvalues,
+// but these can exist within ULP (the unit of least precision) of each other,
+// making them represented by the same floating point number. In such cases,
+// the multiplicity vector can hold a number greater than 1, to indicate the
+// number of eigenvalues within ULP of the reported eigenvalue.
+//
+// Finding eigenvalues alone is much faster than finding eigenpairs; and the
+// majority of the time consumed when eigenvectors are found is in
+// orthogonalizing the eigenvectors; an O(N*K^2) operation.
+//
+// MPI strategy: Given a value range, use Sturm to convert this to an index
+// range; and then divide that range into P equal parts for each node to
+// process. For example given the value range [-0.5, +0.5) say Sturm on each
+// returns the numbers [5000, 6000], i.e. 5000 eigenvalues are less than -0.5,
+// and 6000 are less than +0.5. So we have 1000 eigenvalues in the range. Given
+// 8 nodes, that is index ranges [5000,5124], [5125,5249], [5250, 5374], etc.
+//
+// If the range is not evenly divisible by P, add 1 extra element to the first
+// R modulo(P) ranges. e.g. R=1003, P=8, 1003 mod 8 = 3, so the first 3 ranges
+// get 1 extra value. So index spans of 126,126,126,125,125,125,125,125 values;
+// [5000,5125] [5126,5251] [5252,5377] [5378,5502] [5503,5627] [5628,5752]
+// [5753,5877] [5878,6002]. The expected range for 1003 values; [5000,6002].
+//
+/// @param[in] jobtype enum: lapack::Job::NoVec, lapack::Job::Vec. Whether or
+/// not to compute eigenvectors.
+
+/// @param[in] range enum: lapack::Range::Value use vl, vu for range [vl, vu).
+/// lapack::Range::Index use il, iu for range [il, iu]. 1-relative indices;
+/// 1..n.
+
+/// @param[in] diag: Vector of [n] diagonal entries of A. diag.size() must be
+/// the order of the matrix.
+
+/// @param[in] offd: Vector onf [n-1] off-diagonal entries of A.
+
+/// @param[in] vl: Lowest eigenvalue included in desired range [vl, vu).  if
+/// less than Gerschgorin min; we use Gerschgorin min. Unused if the Range
+/// enum is lapack::Range::Index.
+
+/// @param[in] vu: Highest eigenvalue beyond desired range, [vl,vu).  if
+/// greater than Gerschgorin max, we use Gerschgorin max+ulp. Unused if the
+/// Range enum is lapack::Range::Index. If used, Note that all eigenvalues will
+/// be strictly less than vu, in a Range finding.
+
+/// @param[in] il: Low Index of range. Must be in range [1,n]. Unused if the
+/// Range enum is lapack::Range::Value.
+
+/// @param[in] iu int. High index of range. Must be in range [1,n], >=il. Unused
+/// if the Range enum is lapack::Range::Value.
+
+/// @param[out] Value: vector of desired precision. Resized to fit the number of
+/// eigenvalues found. pVal.size() is the number of unique eigenvalues found.
+
+/// @param[out] Mult: vector of type lapack_int. Multiplicity count for each
+/// eigenvalue in Value[]; the number of eigenvalues within ulp of the value.
+
+/// @param[out] RetVec: Pass an empty matrix; it will be resized to a 2D Matrix,
+/// n x pVal.size(); the orthonormal set of eigenvectors.  Each column 'i'
+/// represents the eigenvector for eigenvalue Value[i]. If jobtype =
+/// lapack::Job::NoVec, RetVec[] is not referenced or changed.
+///
+/// @param[in] mpi_comm: MPI communicator to distribute matrix across. To create
+/// the RetVec output matrix of eigenvectors.
+///
+/// @retval: zero for success. Otherwise negative to indicate failure: -i means
+/// the i-th argument had an illegal value.
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// stevx2_get_col_vector: Retrieves a single vector from a tiled matrix.
+template <typename scalar_t>
+void stevx2_get_col_vector(
+        Matrix<scalar_t>& source, std::vector<scalar_t>& v, int col)
+{
+    int64_t mb;                         // rows per tile.
+    int64_t nb = source.tileNb( 0 );    // assume fixed columns per tile.
+    int64_t mt = source.mt();           // Number of row tiles.
+    int64_t nt = source.nt();           // Number of column tiles.
+    (void) nt;                          // Don't complain about no use.
+
+    int64_t ii=0;                       // position within vector.
+    int64_t tidx = col / nb;
+    int64_t toff = col - tidx*nb;
+
+    for (int64_t row_tile=0; row_tile < mt; ++row_tile) {
+        if (source.tileIsLocal( row_tile, tidx )) {
+            auto tile = source( row_tile, tidx );
+            mb = source.tileMb(row_tile);   // get rows in THIS tile.
+            blas::copy(mb, &tile.at( 0, toff), 1, &v[ ii ], 1);
+            ii += mb;
+        }
+    }
+
+    return;
+}
+
+//------------------------------------------------------------------------------
+// stevx2_put_col_vector: Copy a single vector into a column of a tiled matrix.
+template <typename scalar_t>
+void stevx2_put_col_vector(
+        std::vector<scalar_t>& v, Matrix<scalar_t>& dest, int col)
+{
+    int64_t mb;                     // rows per tile.
+    int64_t nb = dest.tileNb( 0 );  // assume fixed columns per tile.
+    int64_t mt = dest.mt();         // Number of row tiles.
+    int64_t nt = dest.nt();         // Number of column tiles.
+    (void) nt;                      // Don't complain about no use.
+
+    int64_t ii=0;                     // position within vector.
+    int64_t tidx = col / nb;
+    int64_t toff = col - tidx*nb;
+    for (int64_t row_tile=0; row_tile < mt; ++row_tile) {
+        if (dest.tileIsLocal( row_tile, tidx)) {
+            auto tile = dest( row_tile, tidx );
+            mb = dest.tileMb(row_tile);   // Get rows in THIS tile.
+            blas::copy(mb, &v[ii], 1, &tile.at( 0, toff), 1);
+            ii += mb;
+        }
+    }
+
+    return;
+}
+
+//------------------------------------------------------------------------------
+// stevx2_put_col_vector: Copy a single vector into a column of a tiled matrix.
+template <typename scalar_t>
+void stevx2_put_col_vector(
+        scalar_t* v, Matrix<scalar_t>& dest, int col)
+{
+    int64_t mb;                     // rows per tile.
+    int64_t nb = dest.tileNb( 0 );  // assume fixed columns per tile.
+    int64_t mt = dest.mt();         // Number of row tiles.
+    int64_t nt = dest.nt();         // Number of column tiles.
+    (void) nt;                      // Don't complain about no use.
+
+    int64_t ii=0;                     // position within vector.
+    int64_t tidx = col / nb;
+    int64_t toff = col - tidx*nb;
+    for (int64_t row_tile=0; row_tile < mt; ++row_tile) {
+        if (dest.tileIsLocal( row_tile, tidx)) {
+            auto tile = dest( row_tile, tidx );
+            mb = dest.tileMb(row_tile);   // Get rows in THIS tile.
+            blas::copy(mb, &v[ii], 1, &tile.at( 0, toff), 1);
+            ii += mb;
+        }
+    }
+
+    return;
+}
+
+//------------------------------------------------------------------------------
+// STELG: Symmetric Tridiagonal Eigenvalue Least Greatest (Min and Max).
+// Finds the least and largest signed eigenvalues (not least magnitude).
+// begins with bounds by Gerschgorin disc. These may be over or under
+// estimated; Gerschgorin only ensures a disk will contain each. Thus we then
+// use those with bisection to find the actual minimum and maximum eigenvalues.
+// Note we could find the least magnitude eigenvalue by bisection between 0 and
+// each extreme value.
+// By Gerschgorin Circle Theorem;
+// All Eigval(A) are \in [\lamda_{min}, \lambda_{max}].
+// \lambda_{min} = min (i=0; i<n) diag[i]-|offd[i]| - |offd[i-1]|,
+// \lambda_{max} = max (i=0; i<n) diag[i]+|offd[i]| + |offd[i-1]|,
+// with offd[-1], offd[n] = 0.
+// Indexes above are 0 relative.
+// Although Gerschgorin is mentioned in ?larr?.f LAPACK files, it is coded
+// inline there.
+//------------------------------------------------------------------------------
+
+template <typename scalar_t>
+void stevx2_stelg(
+    const scalar_t* diag,  const scalar_t* offd, const lapack_int n,
+    scalar_t& Min, scalar_t& Max)
+{
+    int i;
+    scalar_t test, testdi, testdim1, min=__DBL_MAX__, max=-__DBL_MAX__;
+
+    for (i=0; i<n; i++) {
+        if (i == 0)
+            testdim1=0.;
+        else
+            testdim1=offd[i-1];
+
+        if (i == (n-1))
+            testdi=0;
+        else
+            testdi=offd[i];
+
+        test=diag[i] - fabs(testdi) - fabs(testdim1);
+        if (test < min)
+            min=test;
+
+        test=diag[i] + fabs(testdi) + fabs(testdim1);
+        if (test > max)
+            max=test;
+    }
+
+
+    scalar_t cp, minLB=min, minUB=max, maxLB=min, maxUB=max;
+    // Within that range, find the actual minimum.
+    for (;;) {
+        cp = (minLB+minUB)*0.5;
+        if (cp == minLB || cp == minUB) break;
+        if (lapack::sturm(n, &diag[0], &offd[0], cp) == n) {
+            minLB = cp;
+        }
+        else {
+            minUB = cp;
+        }
+    }
+
+    // Within that range, find the actual maximum.
+    for (;;) {
+        cp = (maxLB+maxUB)*0.5;
+        if (cp == maxLB || cp == maxUB) break;
+        if (lapack::sturm(n, &diag[0], &offd[0], cp) == n) {
+            maxUB=cp;
+        }
+        else {
+            maxLB=cp;
+        }
+    }
+
+    Min = minLB;
+    Max = maxUB;
+}
+
+//------------------------------------------------------------------------------
+// STMV: Symmetric Tridiagonal Matrix Vector multiply.
+// Matrix multiply; A * X = Y.
+// A = [diag[0], offd[0],
+//     [offd[0], diag[1], offd[1]
+//     [      0, offd[1], diag[2], offd[2],
+//     ...
+//     [ 0...0                     offd[n-2], diag[n-1] ]
+// LAPACK does not do just Y=A*X for a packed symmetric tridiagonal matrix.
+// This routine is necessary to determine if eigenvectors should be swapped.
+// This could be done by 3 daxpy, but more code and I think more confusing.
+//------------------------------------------------------------------------------
+template <typename scalar_t>
+void stevx2_stmv(
+    const scalar_t* diag, const scalar_t* offd, const int64_t n,
+    std::vector< scalar_t >& X, std::vector< scalar_t >& Y)
+{
+    int64_t i;
+    Y[0] = diag[0]*X[0] + offd[0]*X[1];
+    Y[n-1] = offd[n-2]*X[n-2] + diag[n-1]*X[n-1];
+
+    for (i = 1; i < (n-1); ++i) {
+        Y[i] = offd[i-1]*X[i-1] + diag[i]*X[i] + offd[i]*X[i+1];
+    }
+}
+
+//------------------------------------------------------------------------------
+// STEPE: Symmetric Tridiagonal EigenPair Error.
+// This routine is necessary to determine if eigenvectors should be swapped.
+// eigenpair error: If A*v = u*v, then A*v-u*v should == 0. We compute the
+// L_infinity norm of (A*v-u*v). (Max absolute value).
+//------------------------------------------------------------------------------
+template <typename scalar_t>
+scalar_t stevx2_stepe(
+    const scalar_t* diag,  const scalar_t* offd, int64_t n,
+    scalar_t u, std::vector< scalar_t >& v)
+{
+    int i;
+    scalar_t norm, temp;
+
+    std::vector< scalar_t > y;
+    y.resize(n);
+    stevx2_stmv(diag, offd, n, v, y);      // y = A*v
+
+    norm = fabs(y[0]-u*v[0]);              // init norm.
+    for (i = 0; i < n; ++i) {
+        temp = fabs(y[i] - u*v[i]);        // This should be zero.
+        if (temp > norm)
+            norm=temp;
+    }
+
+    return norm;
+}
+
+//------------------------------------------------------------------------------
+// This is the main routine; slate_stevx2
+// Arguments are described at the top of this source.
+//------------------------------------------------------------------------------
+template <typename scalar_t>
+void stevx2(
+    const lapack::Job jobtype, const lapack::Range range,
+    const std::vector< scalar_t >& diag, const std::vector< scalar_t >& offd,
+    scalar_t vl, scalar_t vu, const int64_t il, const int64_t iu,
+    std::vector< scalar_t >& eig_val, std::vector< int64_t >& eig_mult,
+    Matrix< scalar_t >& eig_vec, MPI_Comm mpi_comm)
+{
+    lapack_int n;
+    int i, max_threads;
+    // workspaces array.
+    std::vector< stevx2_stein_array_t<scalar_t> > stein_arrays;
+
+    // Check input arguments.
+    if (jobtype != lapack::Job::Vec
+        && jobtype != lapack::Job::NoVec) {
+            slate_error("Arg 1, illegal value of jobtype");
+    }
+
+/// @param[in] range enum: lapack::Range::Value use vl, vu for range [vl, vu).
+/// lapack::Range::Index use il, iu for range [il, iu]. 1-relative indices;
+/// 1..n.
+    if (range != lapack::Range::Value
+        && range != lapack::Range::Index) {
+        slate_error("Arg 2, illegal value of range");
+    }
+
+    if (diag.size() < 2) {
+        slate_error("arg 3, diag must be at least 2 elements");
+    }
+
+    if (offd.size() < (diag.size()-1)) {
+        slate_error("arg 4, offd must be at least diag.size()-1");
+    }
+
+    n = diag.size();
+
+    if (range == lapack::Range::Value
+        && vu <= vl ) {
+        slate_error("args 5 & 6, vu must be > vl");
+    }
+
+    if (range == lapack::Range::Index) {
+        if (il < 1
+            || il > n) {
+             slate_error("arg 7 illegal value of il");
+        } else
+        {
+            if (iu < il
+                || iu > n) {
+                slate_error("arg 8, illegal value of iu");
+            }
+        }
+    }
+
+    max_threads = omp_get_max_threads();
+
+    // Ensure we have a workspace per thread.
+    if (jobtype == lapack::Job::Vec) {
+        stein_arrays.resize( max_threads );
+    }
+
+    scalar_t glob_min_eval, glob_max_eval;
+
+    stevx2_control_t<scalar_t> control;
+    control.n = n;
+    control.diag = &diag[0];
+    control.offd = &offd[0];
+    control.jobtype = jobtype;
+    control.range = range;
+    control.il = il;
+    control.iu = iu;
+    control.stein_arrays = &stein_arrays[0];
+
+    // Find actual least and greatest eigenvalues.
+    stevx2_stelg<scalar_t>(control.diag, control.offd, control.n,
+                           glob_min_eval, glob_max_eval);
+
+    int64_t ev_less_than_vl=0, ev_less_than_vu=n, n_eig_vals=0;
+    if (range == lapack::Range::Value) {
+        // We don't call Sturm if we already know the answer.
+        if (vl >= glob_min_eval)
+            ev_less_than_vl = lapack::sturm(n, &diag[0], &offd[0], vl);
+        else
+            vl = glob_min_eval; // optimize for computing step size.
+
+        if (vu <= glob_max_eval)
+            ev_less_than_vu = lapack::sturm(n, &diag[0], &offd[0], vu);
+        else
+            vu = nexttoward(glob_max_eval, __DBL_MAX__);
+
+        // Compute the number of eigenvalues in [vl, vu).
+        n_eig_vals = (ev_less_than_vu - ev_less_than_vl);
+
+         control.base_idx = ev_less_than_vl;
+    }
+    else {
+        // lapack::Range::Index. iu, il already vetted by code above.
+        n_eig_vals = iu+1-il; // Index range is inclusive.
+
+        // We still must bisect by values to discover eigenvalues, though.
+        vl = glob_min_eval;
+        // nextoward is to ensure we include globMaxVal as a possibility.
+        vu = nexttoward(glob_max_eval, __DBL_MAX__);
+        control.base_idx = 0; // There are zero eigenvalues less than vl.
+    }
+
+    std::vector< scalar_t > pvec;
+
+    eig_val.resize(n);
+    eig_mult.resize(n);
+    if (jobtype == lapack::Job::Vec) {
+        pvec.resize(control.n * n_eig_vals);  // Make space.
+    }
+
+    // Finish set up of Control.
+    control.pval = &eig_val[0];
+    control.pmul = &eig_mult[0];
+    control.pvec = &pvec[0];
+    int64_t int_one=1;
+
+    // We launch the root task: The full range to subdivide.
+    #pragma omp parallel
+    {
+        #pragma omp single
+        {
+            #pragma omp task
+                stevx2_bisection(&control, vl, vu, -int_one, -int_one,
+                                 n_eig_vals);
+        }
+    }
+
+    // Now, all the eigenvalues should have unit eigenvectors in the array
+    // ~control.pvec.  We don't need to sort that, but we do want to compress
+    // it; in case of multiplicity.
+    // NOTE: An alternative exists here, if we can figure out how to make
+    // inverse iteration (STEIN) produce different eigenvectors for the same
+    // eigenvalue. Currently, STEIN starts with a random vector, and for the
+    // same eigenvalue always produces the same vector. However, with a custom
+    // version of STEIN, Mark Gates suggests using Gram Schmidt or CGS2 to
+    // orthogonalize the random vector against previous eigenvectors for the
+    // same eigenvalue, before beginning iteration, so perhaps it will converge
+    // to a different eigenvector.
+
+    int vectors_found = 0;
+    for (i=0; i<n_eig_vals; ++i) {
+        if (eig_mult[i] > 0) {
+            ++vectors_found;
+        }
+    }
+
+    // Compress array in case vectorsFound < nEigVals (due to multiplicities).
+    // Note pMul[] init to zeros: if still zero, that represents a vector not
+    // filled in due to multiplicity.
+    if (vectors_found < n_eig_vals) {
+        int j=0;
+        for (i=0; i<n_eig_vals; ++i) {
+            if (eig_mult[i] > 0) {          // If this is NOT a multiplicity,
+                eig_mult[j] = eig_mult[i];  // copy to next open slot j.
+                eig_val[j] = eig_val[i];
+                if (control.jobtype == lapack::Job::Vec) {
+                    if (j != i) {
+                        memcpy(&pvec[j*control.n], &pvec[i*control.n],
+                               control.n*sizeof(scalar_t));
+                    }
+                }
+
+                ++j;
+            } // end if we found a non-multiplicity eigenvalue.
+        }
+    } // end if compression is needed.
+
+    // resize matrices based on what we have found.
+    eig_val.resize(vectors_found);
+    eig_mult.resize(vectors_found);
+    pvec.resize(control.n * vectors_found);
+
+    // perform QR factorization, remember the descriptor.
+    int64_t p = 1, q = 1, nb = 256;
+
+    slate::Options const opts =  {
+        {slate::Option::Target, slate::Target::Host},
+        {slate::Option::MaxPanelThreads, max_threads}
+    };
+
+    slate::TriangularFactors<scalar_t> tri_factors;
+
+    // Inefficiency Here.
+    // This is likely not ideal. I need to return eig_vec[] to the caller, but
+    // I cannot use eig_vec.fromScaLAPACK() because the pvec[] array will be
+    // de-allocated upon return. So I need to duplicate the size of the final
+    // matrix, and copy pvec[] into it. Using twice as much storage as I'd
+    // like.  I can't just create it with n_eig_vals before starting bisection;
+    // that would work if compression were not necessary; but it is necessary;
+    // because we can get two or more theoretically distinct eigenvalues within
+    // the ULP of each other, thus represented by the same floating point
+    // number. A workable alternative would be to NOT call stein() during the
+    // bisection (as we currently do), just find all the eigenvalues first.
+    // Then remove any duplicates for multiplicity, so we just compress the
+    // eigenvalues vector, and THEN we can create eig_vec in its final size,
+    // and use a different omp task to parallelize finding all the eigenvectors
+    // with stein, for each of those values, and copy the vectors directly into
+    // eig_vec[] using the stevx2_put_col_vector() function. (Stein will only
+    // return a contiguous vector, but we only need one vector per thread for
+    // calling.) That is a bigger change than I can test in the time I have
+    // left on this project.
+
+    eig_vec = slate::Matrix<scalar_t>(
+        control.n, vectors_found, nb, p, q, mpi_comm);
+    eig_vec.insertLocalTiles();
+
+    for (i = 0; i < vectors_found; ++i) {
+        stevx2_put_col_vector(&pvec[control.n * i], eig_vec, i);
+    }
+
+    // We need to make Result same size as pVec.
+    // rows=control.n, columns=vectors_found, tile-size mb=nb=256, p=q=1.
+    auto ident = slate::Matrix<scalar_t>(
+        control.n, vectors_found, nb, p, q, mpi_comm);
+    ident.insertLocalTiles();
+    scalar_t zero=0.0;
+    scalar_t one= 1.0;
+    set(zero, one, ident);
+
+    slate::qr_factor(eig_vec, tri_factors);
+
+    // Extract just the Q of the QR, in normal form. We do this with unmqr; to
+    // multiply the factorized Q by the Identity matrix we just built. ungqr
+    // could do this without the extra space of ident, in half the flops, but
+    // SLATE does not have it yet.
+
+    slate::qr_multiply_by_q(
+        slate::Side::Left, slate::Op::NoTrans, eig_vec, tri_factors, ident);
+
+    slate::copy(ident, eig_vec);    // This is our return matrix.
+
+    //--------------------------------------------------------------------------
+    // When eigenvalue are crowded, it is possible that after orthogonalizing
+    // vectors, it can be better to swap neighboring eigenvectors. We just
+    // test all the pairs; basically ||(A*V-e*V)||_max is the error.  if BOTH
+    // vectors in a pair have less error by being swapped, we swap them.
+    //--------------------------------------------------------------------------
+    if (jobtype == lapack::Job::Vec) {
+        std::vector<scalar_t> y( control.n );
+        std::vector<scalar_t> test( 4 );            // Four test results.
+        std::vector<scalar_t> vec1( control.n );
+        std::vector<scalar_t> vec2( control.n );
+        stevx2_get_col_vector(eig_vec, vec2, 0);    // Copy vec[0] to temp vec2.
+
+        // Now on each iteration, we move the current vec2 to vec1, and
+        // load the next vec2. Our pairs are 0:1, 1:2, 2:3, etc.
+
+        for (i = 1; i < vectors_found-1; ++i) {
+            std::swap(vec1, vec2);
+            stevx2_get_col_vector(eig_vec, vec2, i);
+
+            // Compute 4 tests, one for each combination.
+            test[0] = stevx2_stepe(
+                &control.diag[0], &control.offd[0], control.n, eig_val[i], vec1);
+            test[1] = stevx2_stepe(
+                &control.diag[0], &control.offd[0], control.n, eig_val[i+1], vec2);
+            test[2] = stevx2_stepe(
+                &control.diag[0], &control.offd[0], control.n, eig_val[i], vec2);
+            test[3] = stevx2_stepe(
+                &control.diag[0], &control.offd[0], control.n, eig_val[i+1], vec1);
+
+            if ( (test[2] < test[0])        // val1 & vec2 beats val1 & vec1.
+                && (test[3] < test[1]) ) {  // val2 & vec1 beats val2 & vec2.
+                // Copy vec2 to vec1 original position.
+                stevx2_put_col_vector(vec2, eig_vec, (i-1) );
+
+                // Copy vec1 to vec2 original position.
+                stevx2_put_col_vector(vec1, eig_vec, i);
+
+                // We have now swapped eig_vec[i-1] and eig_vec[i]. At the top
+                // of the loop we are going to swap and then load the new
+                // vector into vec2, but because of the swap we really need
+                // vec1 to stay as it is.  So we swap here, so the vector swap
+                // at the top will reverse this with another swap.
+                std::swap(vec1, vec2);
+            } // end this swap.
+        } // end all swapping.
+    } // end if we are processing eigenvectors at all.
+
+    return;
+}
+
+template
+void stevx2_stelg<float>(
+    const float* diag,  const float* offd, const lapack_int n,
+    float& Min, float& Max);
+
+template
+void stevx2_stelg<double>(
+    const double* diag,  const double* offd, const lapack_int n,
+    double& Min, double& Max);
+
+template
+void stevx2_stmv<float>(
+    const float* diag, const float* offd, const int64_t n,
+    std::vector<float>& X, std::vector<float>& Y);
+
+template
+void stevx2_stmv<double>(
+    const double* diag, const double* offd, const int64_t n,
+    std::vector<double>& X, std::vector<double>& Y);
+
+template
+float stevx2_stepe<float>(
+    const float* diag,  const float* offd, const int64_t n,
+    const float u, std::vector<float>& v);
+
+template
+double stevx2_stepe<double>(
+    const double* diag,  const double* offd, const int64_t,
+    const double u, std::vector<double>& v);
+
+template
+void stevx2<float>(
+    const lapack::Job jobtype, const lapack::Range range,
+    const std::vector< float >& diag, const std::vector< float >& offd,
+    const float vl, const float vu, const int64_t il, const int64_t iu,
+    std::vector< float >& eig_val, std::vector< int64_t >& eig_mult,
+    Matrix< float >& eig_vec, MPI_Comm mpi_comm);
+
+template
+void stevx2<double>(
+    const lapack::Job jobtype, const lapack::Range range,
+    const std::vector< double >& diag, const std::vector< double >& offd,
+    const double vl, const double vu, const int64_t il, const int64_t iu,
+    std::vector< double >& eig_val, std::vector< int64_t >& eig_mult,
+    Matrix< double >& eig_vec, MPI_Comm mpi_comm);
+
+} // namespace slate.
diff --git a/src/stevx2_bisection.cc b/src/stevx2_bisection.cc
new file mode 100644
index 000000000..e95931a74
--- /dev/null
+++ b/src/stevx2_bisection.cc
@@ -0,0 +1,416 @@
+// Copyright (c) 2017-2020, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+// #include "slate/Matrix.hh"
+// #include "slate/Tile_blas.hh"
+// #include "slate/HermitianBandMatrix.hh"
+#include "internal/internal.hh"
+#include "internal/internal_util.hh"
+
+#include <string.h>
+#include <omp.h>
+#include <lapack/config.h>
+#include "lapack/fortran.h"
+
+namespace slate {
+
+//------------------------------------------------------------------------------
+/// @ingroup svd_specialization
+//
+//  Only real versions (s,d).
+//  This code is not designed to be called directly by users; it is a subroutine
+//  for stevx2.cc.
+//
+//  Specifically, this is a task-based parallel algorithm, the parameters are
+//  contained in the already initialized and populated stevx2_control_t; For
+//  example, from stevx2:
+//
+//   #pragma omp parallel
+//   {
+//       #pragma omp single
+//       {
+//           stevx2_bisection(&control, ...etc...);
+//       }
+//   }
+//
+/// @param[in] *control
+///          A pointer to the global variables needed.
+///
+/// @param[in] control->n
+///          int number of rows in the matrix.
+///
+/// @param[in] control->diag
+///          real array of [N] diagonal elements of the matrix.
+///
+/// @param[in] control->offd
+///          real array of [N-1] sub-diagonal elements of the matrix.
+///
+/// @param[in] control->range
+///          lapack::Range.
+///              Index if user is finding eigenvalues by index range.
+///              Value if user is finding eigenvuales by value range.
+///
+/// @param[in] control->jobtype
+///          lapack::Job.
+///              NoVec  if user does not want eigenvectors computed.
+///              Vec    if user desires eigenvectors computed.
+///
+/// @param[in] control->il
+///          int enum. The lower_bound of an index range if range is Index.
+///
+/// @param[in] control->iu
+///          int enum. The upper_bound of an index range, if range is Index.
+///
+/// @param[in] control->stein_arrays
+///          array of [max_threads], type stevx2_Stein_Array_t, contains work
+///          areas per thread for invoking _stein (inverse iteration to find
+///          eigenvectors).
+///
+/// @param[in] control->base_idx
+///          The index of the least eigenvalue to be found in the bracket,
+///          used to calculate the offset into the return vectors/arrays.
+///
+/// @param[out] control->error
+///          If non-zero, the first error we encountered in the operation.
+///
+/// @param[out] control->pval
+///          real vector of [eigenvaues] to store the eigenvalues discovered,
+///          these are returned in ascending sorted order.
+///
+/// @param[out] control->pvec
+///          Matrix of [N x eigenvalues] to store the eigenvectors, not
+///          referenced unless jobtype==Vec. Stored in the same order as
+///          their corresponding eigenvalue.
+///
+/// @param[out] control->pmul
+///          int vector of [eigenvalues], the corresponding multiplicity of
+///          each eigenvalue, typically == 1.
+///
+/// @param[in] lower_bound
+///          Real lower_bound (inclusive) for range of eigenvalues to find.
+///
+/// @param[in] upper_bound
+///          Real upper_bound (non-inclusive) of range of eigenvalues to find.
+///
+/// @param[in] nlt_low
+///          int number of eigenvalues less than lower_bound. Computed if < 0.
+///
+/// @param[in] nlt_hi
+///          int number of eigevalues less than upper_bound. Computed if < 0.
+///
+/// @param[in] num_ev
+///          int number of eigenvalues in [lower_bound, upper_bound). Computed
+///          if either nlt_low or nlt_hi were computed.
+//
+//  A 'bracket' is a range of either real eigenvalues, or eigenvalue indices,
+//  that this code is given to discover. It is provided in the arguments.  Upon
+//  entry, the number of theoretical eigenvalues in this range has already been
+//  determined, but the actual number may be less, due to ULP-multiplicity. (ULP
+//  is the Unit of Least Precision, the magnitude of the smallest change
+//  possible to a given real number). To explain: A real symmetric matrix in NxN
+//  should have N distinct real eigenvalues; however, if eigenvalues are closely
+//  packed either absolutely (their difference is close to zero) or relatively
+//  (their ratio is close to 1.0) then in real arithmetic two such eigenvalues
+//  may be within ULP of each other, and thus represented by the same real
+//  number. Thus we have ULP-multiplicity, two theoretically distinct
+//  eigenvalues represented by the same real number.
+//
+//
+//  This algorithm uses Bisection by the Scaled Sturm Sequence, implemented in
+//  stevx2_bisection, followed by the LAPACK routine _STEIN, which uses inverse
+//  iteration to find the eigenvalue.  The initial 'bracket' parameters should
+//  contain the full range for the eigenvalues we are to discover. The algorithm
+//  is recursively task based, at each division the bracket is divided into two
+//  brackets. If either is empty (no eigenvalues) we discard it, otherwise a new
+//  task is created to further subdivide the right-hand bracket while the
+//  current task continues dividing the left-hand side, until it can no longer
+//  divide it, and proceeds to store the eigenvalue and compute the eigenvector
+//  if needed. Thus the discovery process is complete when all tasks are
+//  completed. We then proceed to orthogonalizing any eigenvectors discovered;
+//  because inverse iteration does not inherently ensure orthogonal
+//  eigenvectors.
+//
+//  The most comparable serial LAPACK routine is DLAEBZ.
+//
+//  Once all thread work is complete, the code will condense these arrays to
+//  just the actual number of unique eigenvalues found, if any ULP-multiplicity
+//  is present.
+//
+//  For an MPI implementation; if we have K total eigenvalues to find, assign
+//  each of the P nodes (K/P) eigenpairs to find. Provide each the entire range
+//  and their index; they can compute their sub-range to find.
+
+//------------------------------------------------------------------------------
+// Use LAPACK stein to find a single eigenvector.  We may use this routine
+// multiple times, so instead of allocating/freeing the work spaces repeatedly,
+// we have an array of pointers, per thread, to workspaces we allocate if not
+// already allocated for this thread. So we don't allocate more than once per
+// thread. These are freed by the main program before exit.  Returns INFO.
+// 0=success. <0, |INFO| is invalid argument index. >0, if eigenvector failed
+// to converge.
+// These cannot be a template, they need to call different lapack routines
+// based on precision.
+//------------------------------------------------------------------------------
+int64_t LAPACK_stevx2_stein(
+        const int64_t n, const float* diag, const float* offd, const float u,
+        float* v, stevx2_stein_array_t<float>* my_arrays)
+{
+    lapack_int m = 1; // number of eigenvectors to find.
+    lapack_int ldz = n;
+    lapack_int info;
+    lapack_int my_n = n;
+    int64_t thread = omp_get_thread_num();
+    // ensure all arrays are the right size; does nothing if already correct.
+    my_arrays[thread].iblock.resize(n);
+    my_arrays[thread].iblock[0] = 1;
+    my_arrays[thread].isplit.resize(n);
+    my_arrays[thread].isplit[0] = n;
+    my_arrays[thread].work.resize(5*n);
+    my_arrays[thread].iwork.resize(n);
+    my_arrays[thread].ifail.resize(n);
+
+    float w = u; // our eigenvalue.
+
+    LAPACK_sstein(&my_n, diag, offd, &m, &w, &my_arrays[thread].iblock[0],
+                  &my_arrays[thread].isplit[0], v, &ldz,
+                  &my_arrays[thread].work[0], &my_arrays[thread].iwork[0],
+                  &my_arrays[thread].ifail[0], &info);
+    return info;
+}
+
+lapack_int LAPACK_stevx2_stein(
+        const int64_t n, const double* diag, const double* offd, const double u,
+        double* v, stevx2_stein_array_t<double>* my_arrays)
+{
+    lapack_int m = 1; // number of eigenvectors to find.
+    lapack_int ldz = n;
+    lapack_int info;
+    lapack_int my_n = n;
+    int64_t thread = omp_get_thread_num();
+    // ensure all arrays are the right size; does nothing if already correct.
+    my_arrays[thread].iblock.resize(n);
+    my_arrays[thread].iblock[0] = 1;
+    my_arrays[thread].isplit.resize(n);
+    my_arrays[thread].isplit[0] = n;
+    my_arrays[thread].work.resize(5*n);
+    my_arrays[thread].iwork.resize(n);
+    my_arrays[thread].ifail.resize(n);
+
+    blas::real_type<double> w = u; // our eigenvalue.
+
+    LAPACK_dstein(&my_n, diag, offd, &m, &w, &my_arrays[thread].iblock[0],
+                  &my_arrays[thread].isplit[0], v, &ldz,
+                  &my_arrays[thread].work[0], &my_arrays[thread].iwork[0],
+                  &my_arrays[thread].ifail[0], &info);
+    return info;
+}
+
+//-----------------------------------------------------------------------------
+// This a task that subdivides a bracket, throwing off other tasks like this
+// one if necessary, until the bracket zeroes in on a single eigenvalue, which
+// it then stores, and possibly finds the corresponding eigenvector.
+// Parameters:
+//      control:    Global variables.
+//      lower_bound: of bracket to subdivide.
+//      upper_bound: of bracket to subdivide.
+//      nlt_low:    number of eigenvalues less than lower bound.
+//                  -1 if it needs to be found.
+//      nlt_hi:     number of eigenvalues less than the upper bound.
+//                  -1 if it needs to be found.
+//      num_ev:      number of eigenvalues within bracket. Computed if either
+//                  nlt_Low or nlt_hi is computed.
+//-----------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+/// computes a range of eigenvalues/eigenvectors of a symmetric tridiagonal
+/// matrix, using the Sturm sequence and Bisection; followed by Inverse
+/// Iteration if eigenvectors are desired.
+/// Generic implementation for any target.
+/// @ingroup svd_specialization
+///
+// ATTENTION: only host computation supported for now
+//
+template <typename scalar_t>
+void stevx2_bisection(
+           stevx2_control_t<scalar_t>* control, scalar_t lower_bound,
+           scalar_t upper_bound, int64_t nlt_low, int64_t nlt_hi, int64_t num_ev)
+{
+    trace::Block trace_block("slate::stevx2_bisection");
+
+    using blas::max;
+
+    const scalar_t* diag = control->diag;
+    const scalar_t* offd = control->offd;
+    int64_t n = control->n;
+
+    scalar_t cp;
+    int64_t flag=0;
+    int64_t ev_less;
+
+    if (nlt_low < 0) {
+        nlt_low = lapack::sturm(n, diag, offd, lower_bound);
+        flag=1;
+    }
+
+    if (nlt_hi < 0) {
+        nlt_hi =  lapack::sturm(n, diag, offd, upper_bound);
+        flag=1;
+    }
+
+    if (flag) {
+        num_ev = (nlt_hi - nlt_low);
+    }
+
+    // If there are no eigenvalues in the supplied range, we are done.
+    if (num_ev < 1) return;
+
+    if (control->range == lapack::Range::Index) {
+        if (nlt_hi  < control->il ||    // e.g if il=500, and nlt_hi=499, this
+                                        // bracket is under range of interest.
+            nlt_low > control->iu) {    // e.g if iu=1000, and nlt_low=1001,
+                                        // bracket is above range of interest.
+            return;
+        }
+    }
+
+    // Bisect the bracket until we can't anymore.
+
+    flag = 0;
+    for (;;) {
+        cp = (lower_bound+upper_bound)*0.5;
+        if (cp == lower_bound
+            || cp == upper_bound) {
+            // Our bracket has been narrowed to machine epsilon for this
+            // magnitude (=ulp).  We are done; the bracket is always
+            // [low,high). 'high' is not included, so we have num_ev eigenvalues
+            // at low, whether it == 1 or is > 1. We find the eigenvector. (We
+            // can test multiplicity with a GluedWilkinson matrix).
+
+            break; // exit for(;;).
+        } else {
+            // we have a new cutpoint.
+            ev_less = lapack::sturm(n, diag, offd, cp);
+            if (ev_less < 0) {
+                // We could not compute the Sturm sequence for it.
+                flag = -1; // indicate an error.
+                break; // exit for (;;).
+            }
+
+            // Discard empty halves in both Range Value and Index.
+            // If #EV < cutpoint is the same as the #EV < high, it means
+            // no EV are in [cutpoint, hi]. We can discard that range.
+
+            if (ev_less == nlt_hi) {
+                upper_bound = cp;
+                continue;
+            }
+
+            // If #EV < cutpoint is the same as #EV < low, it means no
+            // EV are in [low, cutpoint]. We can discard that range.
+
+            if (ev_less == nlt_low) {
+                lower_bound = cp;
+                continue;
+            }
+
+            // Note: If we were Range Value the initial bounds given by the
+            // user are the ranges, so we have nothing further to do. In Range
+            // Index; the initial bounds are Gerschgorin limits and not enough:
+            // We must further narrow to the desired indices.
+
+            if (control->range == lapack::Range::Index) {
+                // For Range Index: Recall that il, iu are 1-relative; while
+                // ev_less is zero-relative; i.e.  if [il,iu]=[1,2], evless must
+                // be 0, or 1.  when ev_less<cp == il-1, or just <il, cp is a
+                // good boundary and we can discard the lower half.
+                //
+                // To judge the upper half, the cutpoint must be < iu, so if it
+                // is >= iu, cannot contain eigenvalue[iu-1].  if ev_less >= iu,
+                // we can discard upper half.
+
+                if (ev_less < control->il) {
+                    // The lower half [lower_bound, cp) is not needed, it has no
+                    // indices >= il.
+
+                    lower_bound = cp;
+                    nlt_low    = ev_less;
+                    num_ev = (nlt_hi-nlt_low);
+                    continue;
+                }
+
+                if (ev_less >= control->iu) {
+                    // The upper half [cp, upper_bound) is not needed, it has no
+                    // indices > iu;
+
+                    upper_bound = cp;
+                    nlt_hi     = ev_less;
+                    num_ev = (nlt_hi-nlt_low);
+                    continue;
+                }
+            }
+
+            // Here, the cutpoint has EV on both left right. We push off the
+            // right bracket.  The new lower_bound is the cp, the upper_bound is
+            // unchanged, the number of eigenvalues changes.
+
+            #pragma omp task
+                stevx2_bisection(control, cp, upper_bound, ev_less, nlt_hi,
+                                 (int64_t) (nlt_hi-ev_less));
+
+            // Update the Left side I kept. The new number of EV less than
+            // upper_bound is ev_less, recompute number of EV in the bracket.
+
+            upper_bound = cp;
+            nlt_hi = ev_less;
+            num_ev =( ev_less - nlt_low);
+            continue;
+         }
+    }
+
+    // Okay, count this eigenpair done, add to the Done list.
+    // NOTE: nlt_low is the global zero-relative index of
+    //       this set of mpcity eigenvalues.
+    //       No other brackets can change our entry, so we
+    //       don't need any thread block or atomicity.
+
+    int my_idx;
+    if (control->range == lapack::Range::Index) {
+        my_idx = nlt_low - (control->il-1);
+    } else { // range == Value
+        my_idx = nlt_low - control->base_idx;
+    }
+
+    if (control->jobtype == lapack::Job::Vec) {
+        // get the eigenvector.
+        int ret=LAPACK_stevx2_stein(
+            control->n, diag, offd, lower_bound,
+            &(control->pvec[my_idx*control->n]), control->stein_arrays);
+        if (ret != 0) {
+            #pragma omp critical (UpdateStack)
+            {
+                // Only store first error we encounter
+                if (control->error == 0)
+                    control->error = ret;
+            }
+        }
+    }
+
+    // Add eigenvalue and multiplicity.
+    control->pval[my_idx]=lower_bound;
+    control->pmul[my_idx]=num_ev;
+} // end stevx_bisection()
+
+//------------------------------------------------------------------------------
+template
+void stevx2_bisection<float>(
+            stevx2_control_t<float>* control, float lower_bound,
+            float upper_bound, int64_t nlt_low, int64_t nlt_hi, int64_t num_ev);
+
+template
+void stevx2_bisection<double>(
+            stevx2_control_t<double>* control, double lower_bound,
+            double upper_bound, int64_t nlt_low, int64_t nlt_hi, int64_t num_ev);
+} // end namespace slate.
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 932ae1ebf..5e27f1a4f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -73,7 +73,13 @@ add_executable(
 )
 
 # C++17 is inherited from slate, but disabling extensions is not.
-set_target_properties( ${tester} PROPERTIES CXX_EXTENSIONS false )
+set_target_properties(
+    slate PROPERTIES
+    CXX_STANDARD 17
+    CUDA_STANDARD 11
+    CXX_STANDARD_REQUIRED true
+    CXX_EXTENSIONS false
+)
 
 target_link_libraries(
     ${tester}
diff --git a/test/test.cc b/test/test.cc
index 40a0e0410..2dcfc534d 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -199,6 +199,7 @@ std::vector< testsweeper::routines_t > routines = {
     // generalized symmetric/Hermitian eigenvalues
     { "hegv",               test_hegv,         Section::sygv },
     { "hegst",              test_hegst,        Section::sygv },
+    { "stevx2",             test_stevx2,       Section::sygv },
     { "",                   nullptr,           Section::newline },
 
     // -----
diff --git a/test/test.hh b/test/test.hh
index 22644d237..636cf4f98 100644
--- a/test/test.hh
+++ b/test/test.hh
@@ -219,6 +219,7 @@ void test_unmtr_hb2st(Params& params, bool run);
 // generalized symmetric/Hermitian eigenvalues
 void test_hegv   (Params& params, bool run);
 void test_hegst  (Params& params, bool run);
+void test_stevx2 (Params& params, bool run);
 
 // SVD
 void test_gesvd  (Params& params, bool run);
diff --git a/test/test_stevx2.cc b/test/test_stevx2.cc
new file mode 100644
index 000000000..f5e244f20
--- /dev/null
+++ b/test/test_stevx2.cc
@@ -0,0 +1,309 @@
+// Copyright (c) 2017-2020, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "test.hh"
+#include "blas/flops.hh"
+#include "print_matrix.hh"
+#include "grid_utils.hh"
+
+#include "scalapack_wrappers.hh"
+#include "scalapack_support_routines.hh"
+#include "scalapack_copy.hh"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <utility>
+
+#define SLATE_HAVE_SCALAPACK
+
+//------------------------------------------------------------------------------
+// Matrix detailed in Kahan; et al. 
+// Matrix Test: diag=[+x,-x,+x,-x,...+x,-x] for any real x, but Kahan chooses
+//                                          a tiny x.
+//              offd=[1,1,...1]
+// Dimension: n. 
+// Computed eigenvalues:
+// evalue[k] = [ x*x + 4*cos(k/(n+1))^2 ] ^(1/2), 
+// evalue[n+1-k] = -evalue[k], for k=1,[n/2],
+// evalue[(n+1)/2] = 0 if n is odd.
+// Note k is 1-relative in these formulations.
+// The eigenvalues range from (-2,+2).
+// Note: This routine verified to match documentation for n=4,8,12,24.
+// Note: This code is a template, it is not intended to work in complex
+//       arithmetic, it is only to be translated to either single or double.
+//------------------------------------------------------------------------------
+template<typename scalar_t>
+void stevx2_test_matrix_kahan(
+    std::vector<scalar_t>& diag, std::vector<scalar_t>& offd,
+    std::vector< scalar_t>& evalue, int64_t n, scalar_t my_diag_value)
+{
+    int64_t i,k;
+    for (k = 1; k <= (n/2); ++k) {
+        scalar_t ev;
+        ev = (M_PI*k+0.)/(n+1.0);           // angle in radians...
+        ev = cos(ev);                       // cos(angle)...
+        ev *= 4.*ev;                        // 4*cos^2(angle)...
+        ev += my_diag_value*my_diag_value;  // x^2 + 4*cos^2(angle)...
+        ev = sqrt(ev);                      // (x^2 + 4*cos^2(angle))^(0.5).
+        // we reverse the -ev and ev here, to get in ascending sorted order.
+        evalue[k-1] = -ev;
+        evalue[n+1-k-1] = ev;
+    }
+
+    for (i = 0; i < n-1; ++i) {
+        offd[i] = 1.0;
+        k=(i&1);    // 0=even, 1=odd.
+
+        if (k) 
+            diag[i]=-my_diag_value;
+        else
+            diag[i]=my_diag_value;
+    }
+
+    // No offd for final diagonal entry.
+    k=(i&1);
+    if (k) 
+        diag[i]=-my_diag_value;
+    else
+        diag[i]=my_diag_value;
+}
+
+//------------------------------------------------------------------------------
+// @brief Tests slate version of stevx2.
+//
+// @param[in,out] param - array of parameters
+// @param[in]     run - whether to run test
+//
+// Sets used flags in param indicating parameters that are used.
+// If run is true, also runs test and stores output parameters.
+//------------------------------------------------------------------------------
+template<typename scalar_t>
+void test_stevx2_work(Params& params, bool run)
+{
+    using real_t = blas::real_type<scalar_t>;
+
+    // Constants
+    const scalar_t zero = 0.0, one = 1.0;
+
+    // get & mark input values
+    int64_t m = params.dim.m();
+    int64_t nb = params.nb();
+    int64_t p = params.grid.m();
+    int64_t q = params.grid.n();
+    bool check = params.check() == 'y';
+    bool trace = params.trace() == 'y';
+    int verbose = params.verbose();
+    slate::Origin origin = params.origin();
+    slate::Target target = params.target();
+
+    // mark non-standard output values
+    params.time();
+
+    if (! run)
+        return;
+
+    // skip invalid or unimplemented options
+    if (target != slate::Target::HostTask) {
+        params.msg() = "skipping: stevx2 is only implemented for HostTask";
+        return;
+    }
+
+    slate::Options const opts =  {
+        {slate::Option::Target, target}
+    };
+
+    int64_t i,j;
+
+    std::vector<scalar_t> diag;
+    std::vector<scalar_t> offd;
+    std::vector<scalar_t> kahan_eigenvalues;
+    std::vector<scalar_t> found_eigenvalues;
+    slate::Matrix<scalar_t> found_eigenvectors;
+    std::vector<int64_t> found_mult;
+
+    diag.resize(m);
+    offd.resize(m-1);
+    kahan_eigenvalues.resize(m);
+    found_eigenvalues.resize(m);
+    found_mult.resize(m);
+
+    //--------------------------------------------------------------------------
+    // Kahan has eigenvalues from [-2.0 to +2.0]. However, eigenvalues are 
+    // dense near -2.0 and +2.0, so for large matrices, the density may cause
+    // eigenvalues separated by less than machine precision, which causes us
+    // multiplicity (eigenvalues are identical at machine precision). We first
+    // see this in single precision at m=14734, with a multiplicity of 2. 
+    //--------------------------------------------------------------------------
+
+    scalar_t my_diag=1.e-5;
+    stevx2_test_matrix_kahan(diag, offd, kahan_eigenvalues, m, my_diag);
+    double min_abs_ev=__DBL_MAX__, max_abs_ev=0., kond;
+    for (i = 0; i < m; ++i) {
+        if (fabs(kahan_eigenvalues[i]) < min_abs_ev) 
+            min_abs_ev=fabs(kahan_eigenvalues[i]);
+        if (fabs(kahan_eigenvalues[i]) > max_abs_ev)
+            max_abs_ev=fabs(kahan_eigenvalues[i]);
+    }
+
+    kond = max_abs_ev / min_abs_ev;
+
+    int64_t n_eig_vals=0;
+    int64_t il=0, iu=500;
+    int64_t ev_idx;
+    scalar_t vl=1.5, vu=2.01;
+
+    // we find the index in kahan_eigenvalues of first >=vl.
+    for (ev_idx = 0; ev_idx < m; ++ev_idx)
+        if (kahan_eigenvalues[ev_idx] >= vl) break;
+
+    // Run and time stevx2, range based on values.
+    if (trace)
+        slate::trace::Trace::on();
+    else
+        slate::trace::Trace::off();
+
+    double time = barrier_get_wtime(MPI_COMM_WORLD);
+
+    slate::stevx2(lapack::Job::Vec, lapack::Range::Value, diag, offd, vl, vu,
+                  il, iu, found_eigenvalues, found_mult, found_eigenvectors,
+                  MPI_COMM_WORLD);
+
+    n_eig_vals = found_eigenvalues.size();  // unique eigenvalues found.
+
+    time = barrier_get_wtime(MPI_COMM_WORLD) - time;
+    if (trace)
+        slate::trace::Trace::finish();
+    params.time() = time;
+
+    // Test results directly. Check eigenvalues discovered by vl, vu.
+
+    if (check) {
+        //----------------------------------------------------------------------
+        // Find worst eigenvalue error. However, we must worry about
+        // multiplicity. In single precision this first occurs at m=14734, with
+        // vl=1.5, vu=2.01; mpcity=2. At m=75000, vl=1.5, vu=2.01, mpcity=10.
+        // We must also worry about the magnitude of eigenvalues; machine 
+        // epsilon for large eigenvalues is much greater than for small ones.
+        //----------------------------------------------------------------------
+
+        scalar_t worst_eigvalue_error = zero;
+        int64_t worst_eigvalue_found_index = 0, worst_eigvalue_global_index = 0,
+                worst_eigvalue_mpcty = 0;
+        int64_t max_mpcty = 0;
+        scalar_t worst_eigvector_error = zero;
+        int64_t worst_eigvector_index = 0;
+        i=0;
+        while (i < n_eig_vals
+               && ev_idx < n_eig_vals) {
+            if (found_mult[i] > max_mpcty) 
+                max_mpcty = found_mult[i];
+
+            for (j = 0; j < found_mult[i]; ++j) {
+                double ev_eps = nexttoward(fabs(found_eigenvalues[i]),
+                     __DBL_MAX__) - fabs(found_eigenvalues[i]);
+                scalar_t error;
+                error = fabs(found_eigenvalues[i]-kahan_eigenvalues[ev_idx]);
+                error /= ev_eps;
+                if (error > worst_eigvalue_error) {
+                    worst_eigvalue_found_index = i;
+                    worst_eigvalue_global_index = ev_idx;
+                    worst_eigvalue_error = error;
+                    worst_eigvalue_mpcty = found_mult[i];
+}
+
+                ++ev_idx; // advance known eigenvalue index for a multiplicity.
+            }
+           
+            ++i; // advance to next discovered eigenvalue.
+        }
+
+        //----------------------------------------------------------------------
+        // Worth reporting for debug: worst_eigvalue_index,
+        // worst_eigvalue_error, max_mpcty.
+        //----------------------------------------------------------------------
+
+        params.ref_time() = time;
+        params.error() = worst_eigvalue_error;
+        params.okay() = (worst_eigvalue_error < (scalar_t) 3.);
+
+        //----------------------------------------------------------------------
+        // If we have no eigenvalue errors, We need to test the eigenvectors.
+        // stevx2_stepe returns fabs(||(A*pvec)/pval - pvec||_maxabs) for
+        // each eigenvalue and eigenvector. We track the largest value.
+        // Empirically; the error grows slowly with m. We divide by epsilon,
+        // and 2*ceil(log_2(m)) epsilons seems a reasonable threshold without
+        // being too liberal. Obviously this is related to the number of bits
+        // of error in the result. The condition number (kond) of the Kahan
+        // matrix also grows nearly linearly with m; kond is computed above.
+        //----------------------------------------------------------------------
+
+        if (params.okay()) { // only test eigenvectors if no eigenvalue errors.
+            for (i = 0; i < n_eig_vals; ++i) {
+                double verr;
+                std::vector<scalar_t> y( m ); 
+                int64_t ii=0;
+                slate::stevx2_get_col_vector(found_eigenvectors, y, i);
+
+                verr=slate::stevx2_stepe<scalar_t>(&diag[0], &offd[0],
+                        diag.size(), found_eigenvalues[i], y);
+                if (verr > worst_eigvector_error) {
+                    worst_eigvector_error = verr; 
+                    worst_eigvector_index = i;
+                }
+            }
+
+            // Find one norm of the tridiagonal matrix (max abs(column sum)).
+            scalar_t test, one_norm = 0;
+            one_norm = fabs(diag[0])+fabs(offd[0]);
+            test = fabs(offd[m-2])+fabs(diag[m-1]);
+            if (test > one_norm)
+                one_norm = test;
+            for (i = 1; i < (m-1); ++i) {
+                test = fabs(offd[i-1]) + fabs(diag[i]) + fabs(offd[i]);
+                if (test > one_norm)
+                    one_norm = test;
+            }
+
+            // Find unit of least precision on 1-Norm.
+            scalar_t ulp = nexttoward(one_norm, __DBL_MAX__) - one_norm;
+            params.error() = worst_eigvector_error;
+            params.okay() = ( (worst_eigvector_error / (m * ulp) ) < 1.0);
+        }
+    } // end if (check)
+}
+
+// -----------------------------------------------------------------------------
+void test_stevx2(Params& params, bool run)
+{
+    switch (params.datatype()) {
+        case testsweeper::DataType::Single:
+            test_stevx2_work<float> (params, run);
+            break;
+
+        case testsweeper::DataType::Double:
+            test_stevx2_work<double> (params, run);
+            break;
+
+        case testsweeper::DataType::Integer:
+        case testsweeper::DataType::SingleComplex:
+        case testsweeper::DataType::DoubleComplex:
+            throw std::exception();
+            break;
+    }
+}
+
+
+// -----------------------------------------------------------------------------
+template
+void stevx2_test_matrix_kahan<float>(
+    std::vector<float>& diag, std::vector<float>& offd,
+    std::vector<float>& evalue, int64_t n, float my_diag_value);
+
+template
+void stevx2_test_matrix_kahan<double>(
+    std::vector<double>& diag, std::vector<double>& offd,
+    std::vector<double>& evalue, int64_t n, double my_diag_value);

From b2c369c3db350921287bcb6faebe18e29662abe3 Mon Sep 17 00:00:00 2001
From: Anthony Castaldo <TonyCastaldo@icl.utk.edu>
Date: Wed, 11 May 2022 17:28:36 -0400
Subject: [PATCH 2/5] Resolve update conflict with updated test/CMakeLists.txt

---
 test/CMakeLists.txt | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 5e27f1a4f..95de0b856 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -72,13 +72,14 @@ add_executable(
     ${tester_src}
 )
 
-# C++17 is inherited from slate, but disabling extensions is not.
+# Use and export -std=c++17.
+# CMake inexplicably allows gnu++17 to "decay" to c++11 or 14; prohibit those.
+target_compile_features( ${tester} PUBLIC cxx_std_17 )
 set_target_properties(
-    slate PROPERTIES
-    CXX_STANDARD 17
+    ${tester} PROPERTIES
     CUDA_STANDARD 11
-    CXX_STANDARD_REQUIRED true
-    CXX_EXTENSIONS false
+    CXX_STANDARD_REQUIRED true  # prohibit < c++17
+    CXX_EXTENSIONS false        # prohibit gnu++17
 )
 
 target_link_libraries(

From 885575c29ef240d18de3426e1ec31b256fde238a Mon Sep 17 00:00:00 2001
From: Anthony Castaldo <TonyCastaldo@icl.utk.edu>
Date: Wed, 11 May 2022 17:38:56 -0400
Subject: [PATCH 3/5] Resolve conflicted updates on test/CMakeLists.txt

---
 test/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 95de0b856..78e348568 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -77,7 +77,6 @@ add_executable(
 target_compile_features( ${tester} PUBLIC cxx_std_17 )
 set_target_properties(
     ${tester} PROPERTIES
-    CUDA_STANDARD 11
     CXX_STANDARD_REQUIRED true  # prohibit < c++17
     CXX_EXTENSIONS false        # prohibit gnu++17
 )

From 8d1fef2eabd0cc7ea7b1dfdce2221c8436936f63 Mon Sep 17 00:00:00 2001
From: Anthony Castaldo <TonyCastaldo@icl.utk.edu>
Date: Wed, 11 May 2022 17:41:13 -0400
Subject: [PATCH 4/5] Reverting to original test/CMakeLists.txt.

---
 test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 78e348568..49d91a940 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -73,7 +73,7 @@ add_executable(
 )
 
 # Use and export -std=c++17.
-# CMake inexplicably allows gnu++17 to "decay" to c++11 or 14; prohibit those.
+# CMake inexplicably allows gnu++17 or "decay" to c++11 or 14; prohibit those.
 target_compile_features( ${tester} PUBLIC cxx_std_17 )
 set_target_properties(
     ${tester} PROPERTIES

From e9ccc08dc5e667163bfbf7c9a5a40f71aa6bfe12 Mon Sep 17 00:00:00 2001
From: Anthony Castaldo <TonyCastaldo@icl.utk.edu>
Date: Fri, 13 May 2022 15:54:02 -0400
Subject: [PATCH 5/5] added a routine, stevx2_mpi_apportion(), that will
 provide the scalar_t ranges to divide an eigenvalue range into panels that
 all contain the same number of eigenvalues (except the last panel, of
 course). This is tested. The purpose is the first step in MPI
 parallelization, presuming stevx2() will then be called recursively process
 each panel. Extensive notes are included for a plan to do the rest of MPI;
 they can be deleted after implementation. The new routine was extensively
 tested.

---
 src/stevx2.cc | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)

diff --git a/src/stevx2.cc b/src/stevx2.cc
index b66b06e3e..cf130c260 100644
--- a/src/stevx2.cc
+++ b/src/stevx2.cc
@@ -181,6 +181,138 @@ void stevx2_put_col_vector(
     return;
 }
 
+//------------------------------------------------------------------------------
+// mpi_apportion: Given a caller's value range and panel_width, this produces a
+// vector of "dividing values", such that each 'slice' of the eigenvalue range
+// contains no more than panel_width eigenvalues. Note the values themselves are
+// not necessarily eigenvalues; we just want to distribute the work equally in
+// panel_width chunks. The size of the vector will be L=slice_count + 1, the
+// first and last values will be the caller's min and max value range. (If the
+// caller provides an index range, this must be converted to min and max values
+// before calling).
+
+// One issue we must deal with is eigenvalue multiplicity. In any precision,
+// theoretically distinct eigenvalues can be so close arithmetically that they
+// are represented by the same floating point number. In such cases, we might
+// not be able to find a perfect slice value; for example if sturm() reports
+// there are 400 eigenvalues in the range [caller_min, caller_max), and
+// panel_width = 100, it is possible that at one value X, sturm(X) reports 99
+// values < X. and sturm(X+ulp) reports 101 values < X+ulp. There is no
+// midpoint between sturm(X) and sturm(X+ulp) to make a better slice value; we
+// have a multiplicity and must include it in one slice or the other; we cannot
+// divide it. The choice we make in such circumstances is to NOT create a slice
+// with MORE than panel_width eigenvalues.
+
+// Note as well, given a slice to process, an MPI rank may find that although
+// there are X theoretical eigenvalues contained in the range, due to
+// multiplicity, there are less than X *distinct* eigenvalues in the range, and
+// it must return with a panel-wide slice with fewer than panel_width columns
+// populated. This will slightly complicate stitching these together for
+// orthogonalization.
+//
+// After eigenvalue discovery, each rank can compress their slice and return
+// the width.
+//
+// The returns are the vectors count<int>; count.size() = #panels found; and
+// boundary<scalar_t>.size() = #panels + 1.
+//
+// The caller can use the boundary[] vector as:
+// [v[0],v[1]) theoretically contains count[0] eigenvalues.
+// [v[1],v[2]) theoretically contains count[1] eigenvalues, etc.
+//
+// The panel_width provided can be anything > 0; probably the desired matrix
+// tile width, or a multiple thereof.
+//
+// This has been tested successfully with many values of panel_width and n.
+//------------------------------------------------------------------------------
+template <typename scalar_t>
+void stevx2_mpi_apportion(
+    const lapack_int n, const std::vector<scalar_t>& diag,
+    const std::vector<scalar_t>offd, const int panel_width,
+    const scalar_t& caller_min, const scalar_t& caller_max,
+    std::vector<scalar_t>& boundary, std::vector<int>& count)
+{
+    // error if min >= max.
+    int v_less_than_min = lapack::sturm(n, &diag[0], &offd[0], caller_min);
+    int v_less_than_max = lapack::sturm(n, &diag[0], &offd[0], caller_max);
+    int v_found = (v_less_than_max - v_less_than_min);
+
+    if (v_found < 1) {
+        slate_error("Illegal values of max and min in stevx2_mpi_apportion().");
+        return;
+    }
+
+    // compute number of panels required.
+    boundary.resize(2);             // resized as panels are found.
+    std::vector<int> b_sturm(2);    // resized as panels are found.
+    int panels=1;
+    boundary[0] = caller_min;
+    boundary[1] = caller_max;
+    b_sturm[0] = lapack::sturm(n, &diag[0], &offd[0], boundary[0]);
+    b_sturm[1] = lapack::sturm(n, &diag[0], &offd[0], boundary[1]);
+
+    // Until we come up short. Find new boundary for panel=panels-1.
+    for (;;) {
+        int cnt = b_sturm[panels]-b_sturm[panels-1];
+        if (cnt <= panel_width)
+            break; // exit the loop.
+
+        scalar_t cp_low = boundary[panels-1];
+        scalar_t cp_hi = boundary[panels];
+        scalar_t cp;
+        int cp_sturm;
+        // Zero in on max(cp) that yields <= panel_width.
+        for (;;) {
+            cp = (cp_low + cp_hi) * 0.5;
+            if (cp == cp_low
+                || cp == cp_hi) {
+                // We reached max resolution without landing on panel_width. We
+                // must have a multiplicity. cp_low is a range with less than
+                // panel_width eigenvalues, cp_hi has more than panel_width
+                // eigenvalues, so we choose cp_low.
+                ++panels;
+                boundary.resize(panels+1, boundary[panels-1]);
+                boundary[panels-1] = cp_low;
+                b_sturm.resize(panels+1, b_sturm[panels-1]);
+                b_sturm[panels-1] = lapack::sturm(n, &diag[0], &offd[0], cp_low);
+                break;
+            }
+
+            cp_sturm = lapack::sturm(n, &diag[0], &offd[0], cp);
+            int cp_cnt = cp_sturm - b_sturm[panels-1];
+            // If too much, reduce upper bound of search range.
+            if (cp_cnt > panel_width) {
+                cp_hi = cp;
+                continue;
+            }
+
+            // If too little, it is safe to change the lower bound.
+            if (cp_cnt < panel_width) {
+                cp_low = cp;
+                continue;
+            }
+
+            // We found a cutpoint that gives exactly panel_width.
+            ++panels;
+            boundary.resize(panels+1, boundary[panels-1]);
+            boundary[panels-1] = cp;
+            b_sturm.resize(panels+1, b_sturm[panels-1]);
+            b_sturm[panels-1] = cp_sturm;
+            break;
+        } // end bisection to cut the range.
+    } // continue until final panel <= panel_width.
+
+    // Now produce the counts per panel.
+    count.resize(panels);
+    for (int i = 0; i < panels; ++i) {
+        count[i] = b_sturm[i+1]-b_sturm[i];
+        if (0) { // DEBUG for testing; can be deleted.
+            fprintf(stderr, "%s:%i Panel %i=[%.8e,%.8e] count=%i.\n",
+            __func__, __LINE__, i, boundary[i], boundary[i+1], count[i]);
+        }
+    }
+}
+
 //------------------------------------------------------------------------------
 // STELG: Symmetric Tridiagonal Eigenvalue Least Greatest (Min and Max).
 // Finds the least and largest signed eigenvalues (not least magnitude).
@@ -436,6 +568,27 @@ void stevx2(
     control.pvec = &pvec[0];
     int64_t int_one=1;
 
+    // NOTES: stevx2_mpi_apportion divides the range into tile-sized chunks.
+    // this is a step in getting ready for MPI operation. Nothing is currently
+    // done with these results; I only coded this first step.  To continue,
+    // this should be conditional on the number of MPI ranks available; each
+    // rank would also call stevx2() to process their panel, find eigenvalues
+    // and eigenvectors. After that, the panels of each rank must be stitched
+    // together, but don't forget that due to multiplicity each may have fewer
+    // columns than we set in panel_width. We also need a single eigenvalue
+    // vector. Then we need a geqrf/unmqr to orthogonalize the full eigenvector
+    // matrix. After that we must do checks for vector_swapping operation on
+    // the full matrix.
+
+    std::vector<int> count;
+    std::vector<scalar_t> boundary;
+    if (0) { // DEBUG for testing, can be deleted.
+        fprintf(stderr, "%s:%i n_eig_vals=%ld, Range[%.8e,%.8e] \n",
+                __func__, __LINE__, n_eig_vals, vl, vu);
+    }
+
+    stevx2_mpi_apportion(n, diag, offd, 256, vl, vu, boundary, count);
+
     // We launch the root task: The full range to subdivide.
     #pragma omp parallel
     {
@@ -602,6 +755,14 @@ void stevx2(
     return;
 }
 
+template
+void stevx2_get_col_vector<float>(
+        Matrix<float>& source, std::vector<float>& v, int col);
+
+template
+void stevx2_get_col_vector<double>(
+        Matrix<double>& source, std::vector<double>& v, int col);
+
 template
 void stevx2_stelg<float>(
     const float* diag,  const float* offd, const lapack_int n,