From 37a6768da7d1b8b21783e7402297f651a022ea7c Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 27 Oct 2022 14:56:17 -0400
Subject: [PATCH 01/49] Tow-norm estimation

---
 src/normest.cc | 241 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 241 insertions(+)
 create mode 100644 src/normest.cc
diff --git a/src/normest.cc b/src/normest.cc
new file mode 100644
index 000000000..e0ed58fad
--- /dev/null
+++ b/src/normest.cc
@@ -0,0 +1,241 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "internal/internal.hh"
+#include "internal/internal_util.hh"
+#include "slate/internal/mpi.hh"
+
+#include <list>
+#include <tuple>
+
+namespace slate {
+
+// specialization namespace differentiates, e.g.,
+// internal::norm from internal::specialization::norm
+namespace internal {
+namespace specialization {
+
+//------------------------------------------------------------------------------
+/// @internal
+/// Distributed parallel general matrix norm.
+/// Generic implementation for any target.
+/// @ingroup norm_specialization
+///
+template <Target target, typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+normest(slate::internal::TargetType<target>,
+     Norm in_norm, matrix_type A,
+     Options const& opts)
+{
+    using scalar_t = typename matrix_type::value_type;
+    using real_t = blas::real_type<scalar_t>;
+
+    // Undo any transpose, which switches one <=> inf norms.
+    if (A.op() == Op::ConjTrans)
+        A = conj_transpose(A);
+    else if (A.op() == Op::Trans)
+        A = transpose(A);
+
+    //---------
+    // Two norm estimation
+    // First: let's compute the x vector such that
+    // x_j = sum_i abs( A_{i,j} ), x here is global_sums.data
+    if (in_norm == Norm::One) {
+
+        using blas::min;
+
+        int p, q;
+        int myrow, mycol;
+        GridOrder order;
+        A.gridinfo(&order, &p, &q, &myrow, &mycol);
+
+        int64_t n = A.n();
+        int64_t m = A.m();
+        int64_t nb = A.tileNb(0);
+
+        scalar_t one = 1.;
+        scalar_t zero  = 0.;
+        real_t alpha;
+
+        int64_t cnt = 0;
+        int64_t maxiter = min( 100, n );
+
+        real_t e  = 0.;
+        real_t e0  = 0.;
+        real_t normX = 0;
+        real_t normAX = 0;
+        real_t tol = 1.e-1;
+
+        std::vector<real_t> local_sums(n);
+
+        if (target == Target::Devices)
+            A.reserveDeviceWorkspace();
+
+        std::vector<scalar_t> global_sums(n);
+        std::vector<scalar_t> W1(n);
+        std::vector<scalar_t> W2(n);
+
+        auto XL = slate::Matrix<scalar_t>::fromLAPACK(
+            n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm());
+        XL.insertLocalTiles();
+        auto AX = slate::Matrix<scalar_t>::fromLAPACK(
+            m, 1, &W1[0], m, nb, 1, p, q, A.mpiComm());
+        AX.insertLocalTiles();
+        auto X = slate::Matrix<scalar_t>::fromLAPACK(
+            n, 1, &W2[0], n, nb, 1, p, q, A.mpiComm());
+        X.insertLocalTiles();
+
+
+        #pragma omp parallel
+        #pragma omp master
+        {
+            internal::norm<target>(in_norm, NormScope::Matrix, std::move(A), local_sums.data());
+        }
+
+        #pragma omp critical(slate_mpi)
+        {
+            trace::Block trace_block("MPI_Allreduce");
+            slate_mpi_call(
+                MPI_Allreduce(local_sums.data(), global_sums.data(),
+                              n, mpi_type<real_t>::value,
+                              MPI_SUM, A.mpiComm()));
+        }
+
+        // global_sums.data() will have the sum of each column.
+        // First step is done
+
+        // Second: compute the ||x||_Fro
+        //e = lapack::lange(
+        //    Norm::Fro, 1, n,
+        //    global_sums.data(), 1);
+        e  = slate::norm(Norm::Fro, XL, opts);
+        if (e == 0.) {
+            return 0.;
+        }
+        else {
+            normX = e;
+        }
+
+        // Third: start the while-loop X = X / ||X||
+        while ( (cnt < maxiter) && (fabs((e - e0)) > (tol * (e))) ) {
+            e0 = e;
+
+            // Scale X = X / ||X||
+            alpha = 1.0/normX;
+            add(scalar_t(alpha), XL, zero, X, opts);
+
+            // Compute Ax = A * sx
+            gemm(one, A, X, zero, AX, opts);
+
+            // Compute x = A' * A * x = A' * Ax
+            auto AT = conj_transpose(A);
+            gemm(one, AT, AX, zero, XL, opts);
+
+            // Compute ||X||, ||AX||
+            normX  = norm(Norm::Fro, XL, opts);
+            normAX = norm(Norm::Fro, AX, opts);
+
+            // Uodate e
+            e = normX / normAX;
+            cnt++;
+        }
+        //if ( (cnt >= maxiter) && (fabs((e - e0)) > (tol * e)) ) {
+        //}
+
+        A.clearWorkspace();
+
+        return e;
+
+    }
+    else {
+        slate_error("invalid norm.");
+    }
+}
+
+} // namespace specialization
+} // namespace internal
+
+//------------------------------------------------------------------------------
+/// Version with target as template parameter.
+/// @ingroup norm_specialization
+///
+template <Target target, typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+normest(Norm norm, matrix_type& A,
+     const std::map<Option, Value>& opts)
+{
+    return internal::specialization::normest(internal::TargetType<target>(),
+                                          norm, A, opts);
+}
+
+//------------------------------------------------------------------------------
+/// Distributed parallel general matrix norm.
+///
+//------------------------------------------------------------------------------
+/// @tparam matrix_type
+///     Any SLATE matrix type: Matrix, SymmetricMatrix, HermitianMatrix,
+///     TriangularMatrix, etc.
+//------------------------------------------------------------------------------
+/// @param[in] in_norm
+///     Norm to compute:
+///     - Norm::Second: second norm estimation of the matrix$
+///
+/// @param[in] A
+///     The matrix A.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup norm
+///
+template <typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+normest(Norm in_norm, matrix_type& A,
+     const std::map<Option, Value>& opts)
+{
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    switch (target) {
+        case Target::Host:
+        case Target::HostTask:
+            return normest<Target::HostTask>(in_norm, A, opts);
+            break;
+        case Target::HostBatch:
+        case Target::HostNest:
+        case Target::Devices:
+            break;
+    }
+    throw std::exception();  // todo: invalid target
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+float normest(
+    Norm in_norm, Matrix<float>& A,
+    Options const& opts);
+
+template
+double normest(
+    Norm in_norm, Matrix<double>& A,
+    Options const& opts);
+
+template
+float normest(
+    Norm in_norm, Matrix< std::complex<float> >& A,
+    Options const& opts);
+
+template
+double normest(
+    Norm in_norm, Matrix< std::complex<double> >& A,
+    Options const& opts);
+
+} // namespace slate

From 9b47c710d8bdd7e33759c9fbf553085bfe0bf4a2 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 27 Oct 2022 15:05:33 -0400
Subject: [PATCH 02/49] Added the polar decomposition QDWH

---
 src/qdwh.cc | 388 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 388 insertions(+)
 create mode 100644 src/qdwh.cc

diff --git a/src/qdwh.cc b/src/qdwh.cc
new file mode 100644
index 000000000..1789db558
--- /dev/null
+++ b/src/qdwh.cc
@@ -0,0 +1,388 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "auxiliary/Debug.hh"
+#include "slate/Matrix.hh"
+#include "internal/internal.hh"
+
+namespace slate {
+
+// specialization namespace differentiates, e.g.,
+// todo: is it ok?
+// internal::geqrf from internal::specialization::geqrf
+namespace impl {
+
+//------------------------------------------------------------------------------
+/// Distributed parallel polar decomposition based on QDWH algorithm.
+/// Generic implementation for any target.
+// todo: is it ok?
+/// @ingroup geqrf_specialization
+///
+template <Target target, typename scalar_t>
+void qdwh(slate::internal::TargetType<target>,
+          Matrix<scalar_t>& A,
+          Matrix<scalar_t>& H, // this matrix will be hermition
+          Options const& opts)
+{
+    using real_t = blas::real_type<scalar_t>;
+    using blas::real;
+
+    int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );
+    int64_t ib = get_option<int64_t>( opts, Option::InnerBlocking, 16 );
+    int64_t max_panel_threads  = std::max(omp_get_max_threads()/2, 1);
+    max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads, max_panel_threads );
+
+    int64_t mt = A.mt();
+    int64_t nt = A.nt();
+
+    int64_t m    = A.m();
+    int64_t n    = A.n();
+    int64_t nb   = A.tileMb(0);
+    int64_t m2   = m + n;
+
+    int nprow, npcol;
+    int myrow, mycol;
+    GridOrder order;
+    A.gridinfo(&order, &nprow, &npcol, &myrow, &mycol);
+
+    bool optqr = 1;
+    int itconv, it;
+    int itqr = 0, itpo = 0;
+
+    real_t eps  = 0.5 * std::numeric_limits<real_t>::epsilon();
+    real_t tol1 = 5. * eps;
+    real_t tol3 = pow(tol1, 1./3.);
+
+    double L2, sqd, dd, a1, a, b, c;
+    real_t Li, Liconv;
+    real_t conv = 100.;
+    scalar_t zero = 0.0, one = 1.0, minusone = -1.0;
+
+    scalar_t alpha, beta;
+
+    real_t normA;
+    real_t norminvR;
+
+    int64_t mt2 = 2*mt - 1;
+
+    // allocate m2*n work space required for the qr-based iterations
+    // this allocation can be avoided if we change the qr iterations to
+    // work on two matrices on top of each other
+
+    // if doing QR([A;Id])
+    slate::Matrix<scalar_t> W(m2, n, nb, nprow, npcol, MPI_COMM_WORLD);
+    slate::Matrix<scalar_t> Q(m2, n, nb, nprow, npcol, MPI_COMM_WORLD);
+
+    if (target == Target::Devices) {
+        const int64_t batch_size_zero = 0; // use default batch size
+        const int64_t num_queues = 3 + lookahead;
+        A.allocateBatchArrays(batch_size_zero, num_queues);
+        A.reserveDeviceWorkspace();
+        W.allocateBatchArrays(batch_size_zero, num_queues);
+        W.reserveDeviceWorkspace();
+        Q.allocateBatchArrays(batch_size_zero, num_queues);
+        Q.reserveDeviceWorkspace();
+    }
+
+    W.insertLocalTiles();
+    Q.insertLocalTiles();
+    auto W1 = W.sub(0, mt-1, 0, nt-1); // First mxn block of W
+    auto W2 = W.sub(mt, mt2, 0, nt-1); // Second nxn block of W
+    auto Q1 = Q.sub(0, mt-1, 0, nt-1); // First mxn block of W
+    auto Q2 = Q.sub(mt, mt2, 0, nt-1); // Second nxn block of W
+
+    // if doing QR([A]) QR([A;Id])
+    //slate::Matrix<scalar_t> W1(m, n, nb, nprow, npcol, MPI_COMM_WORLD);
+    //slate::Matrix<scalar_t> W2(m, n, nb, nprow, npcol, MPI_COMM_WORLD);
+    //W1.insertLocalTiles();
+    //W2.insertLocalTiles();
+
+    slate::TriangularFactors<scalar_t> T1;
+
+    // backup A in H
+    copy(A, H);
+
+    // two norm estimation (largest singular value of A)
+    // todo: fix norm_est the call it
+    real_t norm_est = 1.;
+    //real_t norm_est = normest(slate::Norm::One, A);
+    alpha = 1.0;
+
+    // scale the original matrix A to form A0 of the iterative loop
+    add(alpha/(scalar_t)norm_est, H, zero, A, opts);
+
+    // Calculate Li: reciprocal of condition number estimation
+    // Either 1) use LU followed by gecon
+    // Or     2) QR followed by trtri
+    // If used the QR, use the Q factor in the first QR-based iteration
+    normA = norm(slate::Norm::One, A);
+    if (optqr) {
+        // Estimate the condition number using QR
+        // This Q factor can be used in the first QR-based iteration
+        copy(A, W1);
+        slate::geqrf(W1, T1, opts);
+        auto R  = TriangularMatrix<scalar_t>(
+            Uplo::Upper, slate::Diag::NonUnit, W1 );
+        auto Rh = HermitianMatrix<scalar_t>(
+            Uplo::Upper, W1 );
+        trtri(R, opts);
+        //tzset(scalar_t(0.0), L);
+        norminvR = norm(slate::Norm::One, Rh);
+        Li = (1.0 / norminvR) / normA;
+        Li = norm_est / 1.1 * Li;
+        // *flops += FLOPS_DGEQRF( M, N )
+        //       + FLOPS_DTRTRI( N );
+    }
+    else {
+        // todo
+        // Estimate the condition number using LU
+    }
+
+    itconv = 0; Liconv = Li;
+    while (itconv == 0 || fabs(1-Liconv) > tol1) {
+        // To find the minimum number of iterations to converge.
+        // itconv = number of iterations needed until |Li - 1| < tol1
+        // This should have converged in less than 50 iterations
+        if (itconv > 100) {
+            exit(-1);
+            break;
+        }
+        itconv++;
+
+        L2  = double (Liconv * Liconv);
+        dd  = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 );
+        sqd = sqrt(1.0 + dd);
+        a1  = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0;
+        a   = real(a1);
+        b   = (a - 1.0) * (a - 1.0) / 4.0;
+        c   = a + b - 1.0;
+        // Update Liconv
+        Liconv  = Liconv * real_t((a + b * L2) / (1.0 + c * L2));
+    }
+
+    it = 0;
+    while (conv > tol3 || it < itconv) {
+        // This should have converged in less than 50 iterations
+        if (it > 100) {
+            exit(-1);
+            break;
+        }
+        it++;
+
+        // Compute parameters L,a,b,c
+        L2  = double (Li * Li);
+        dd  = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 );
+        sqd = sqrt(1.0 + dd);
+        a1  = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0;
+        a   = real(a1);
+        b   = (a - 1.0) * (a - 1.0) / 4.0;
+        c   = a + b - 1.0;
+        // Update Li
+        Li  = Li * real_t ((a + b * L2) / (1.0 + c * L2));
+
+        if (real(c) > 100.) {
+            //int doqr = !optqr || it > 1;
+
+            // Generate the matrix B = [ B1 ] = [ sqrt(c) * U ]
+            //                         [ B2 ] = [ Id          ]
+            alpha = scalar_t(sqrt(c));
+            set(zero, one, W2);
+
+            // todo: have a customized splitted QR
+            //if( doqr ) {
+            //    geadd(alpha, A, zero, W1);
+            //    geqrf_qdwh_full(W, T1, opts);
+            //}
+            //else {
+            //    geadd(alpha, W1, zero, W1);
+            //    geqrf_qdwh_full2(W, T1, opts);
+            //}
+
+            // Factorize B = QR, and generate the associated Q
+            add(alpha, A, zero, W1, opts);
+            //geqrf(W, T1, opts); // naive impl
+            //geqrf_qdwh_full(W, T1, opts);
+
+            set(zero, one, Q1);
+            set(zero, zero, Q2);
+
+            //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts); //naive impl
+            //unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts);
+
+            // A = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * A
+            auto Q2T = conj_transpose(Q2);
+            alpha = scalar_t( (a-b/c)/sqrt(c) );
+            beta  = scalar_t( b / c );
+            // Copy U into C to check the convergence of QDWH
+            if (it >= itconv ) {
+                copy(A, W1);
+            }
+            gemm(alpha, Q1, Q2T, beta, A, opts);
+
+            // Main flops used in this step/
+            //flops_dgeqrf = FLOPS_DGEQRF( 2*m, n );
+            //flops_dorgqr = FLOPS_DORGQR( 2*m, n, n );
+            //flops_dgemm  = FLOPS_DGEMM( m, n, n );
+            //flops += flops_dgeqrf + flops_dorgqr + flops_dgemm;
+
+            itqr += 1;
+            //facto = 0;
+        }
+        else {
+            // Copy A into H to check the convergence of QDWH
+            if (it >= itconv) {
+                copy(A, W1);
+            }
+
+            // Make Q1 into an identity matrix
+            set(zero, one, W2);
+
+            // Compute Q1 = c * A' * A + I
+            ///////////////
+            copy(A, Q1);
+            auto AT = conj_transpose(Q1);
+            gemm(scalar_t(c), AT, A, one, W2, opts);
+
+            // Solve R x = AT
+            auto R = slate::HermitianMatrix<scalar_t>(
+                    slate::Uplo::Upper, W2 );
+            posv(R, AT, opts);
+
+            // Update A =  (a-b/c) * Q1T' + (b/c) * A
+            alpha = (a-b/c); beta = (b/c);
+            AT = conj_transpose(AT);
+            add(scalar_t(alpha), AT, scalar_t(beta), A, opts);
+
+            // Main flops used in this step
+            //flops_dgemm  = FLOPS_DGEMM( m, n, n );
+            //flops_dpotrf = FLOPS_DPOTRF( m );
+            //flops_dtrsm  = FLOPS_DTRSM( 'L', m, n );
+            //flops += flops_dgemm + flops_dpotrf + 2. * flops_dtrsm;
+
+            itpo += 1;
+            //facto = 1;
+        }
+
+        // Compute the norm of the symmetric matrix U - B1
+        conv = 10.0;
+        if (it >= itconv) {
+            add(one, A, minusone, W1, opts);
+            conv = norm(slate::Norm::Fro, W1);
+        }
+    }
+
+    // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H)
+    copy(H, W1);
+    auto AT = conj_transpose(A);
+    gemm(one, AT, W1, zero, H, opts);
+}
+
+} // namespace impl
+
+//------------------------------------------------------------------------------
+/// Distributed parallel polar decomposition based on QDWH algorithm.
+///
+/// Computes a polar decomposition of an m-by-n matrix $A$.
+/// The factorization has the form
+/// \[
+///     A = UH,
+/// \]
+/// where $U$ is a matrix with orthonormal columns and $H$ is a Hermition
+/// matrix.
+///
+/// Complexity (in real):
+/// - for $m \ge n$, $\approx 2 m n^{2} - \frac{2}{3} n^{3}$ flops;
+/// - for $m \le n$, $\approx 2 m^{2} n - \frac{2}{3} m^{3}$ flops;
+/// - for $m = n$,   $\approx \frac{4}{3} n^{3}$ flops.
+/// .
+//------------------------------------------------------------------------------
+/// @tparam scalar_t
+///     One of float, double, std::complex<float>, std::complex<double>.
+//------------------------------------------------------------------------------
+/// @param[in,out] A
+///     On entry, the m-by-n matrix $A$.
+///     On exit, if return value = 0, the orthogonal polar factor $U$
+///     from the decomposition $A = U H$.
+///
+/// @param[out] H
+///     On exit, if return value = 0, the hermitian polar factor matrix $H$
+///     from the decomposition $A = U H$.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Lookahead:
+///       Number of panels to overlap with matrix updates.
+///       lookahead >= 0. Default 1.
+///     - Option::InnerBlocking:
+///       Inner blocking to use for panel. Default 16.
+///     - Option::MaxPanelThreads:
+///       Number of threads to use for panel. Default omp_get_max_threads()/2.
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - HostBatch: batched BLAS on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup geqrf_computational
+///
+template <typename scalar_t>
+void qdwh(Matrix<scalar_t>& A,
+          Matrix<scalar_t>& H,
+          Options const& opts)
+{
+    using internal::TargetType;
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    switch (target) {
+        case Target::Host:
+        case Target::HostTask:
+            impl::qdwh<Target::HostTask>( TargetType<Target::HostTask>(),
+                       A, H, opts);
+            break;
+        case Target::HostNest:
+            impl::qdwh<Target::HostNest>( TargetType<Target::HostNest>(),
+                       A, H, opts);
+            break;
+        case Target::HostBatch:
+            impl::qdwh<Target::HostBatch>( TargetType<Target::HostBatch>(),
+                       A, H, opts);
+            break;
+        case Target::Devices:
+            impl::qdwh<Target::Devices>( TargetType<Target::Devices>(),
+                       A, H, opts);
+            break;
+    }
+    // todo: return value for errors?
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void qdwh<float>(
+    Matrix<float>& A,
+    Matrix<float>& H,
+    Options const& opts);
+
+template
+void qdwh<double>(
+    Matrix<double>& A,
+    Matrix<double>& H,
+    Options const& opts);
+
+template
+void qdwh< std::complex<float> >(
+    Matrix< std::complex<float> >& A,
+    Matrix< std::complex<float> >& H,
+    Options const& opts);
+
+template
+void qdwh< std::complex<double> >(
+    Matrix< std::complex<double> >& A,
+    Matrix< std::complex<double> >& H,
+    Options const& opts);
+
+} // namespace slate

From ea1f5d48ee503d63f9e733dcf8d464d30702b049 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 31 Oct 2022 16:03:52 -0400
Subject: [PATCH 03/49] Added costumized geqrf and unmqr

---
 src/geqrf_qdwh_full.cc | 489 +++++++++++++++++++++++++++++++++++++++++
 src/unmqr_qdwh_full.cc | 403 +++++++++++++++++++++++++++++++++
 2 files changed, 892 insertions(+)
 create mode 100644 src/geqrf_qdwh_full.cc
 create mode 100644 src/unmqr_qdwh_full.cc

diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc
new file mode 100644
index 000000000..66752f7f8
--- /dev/null
+++ b/src/geqrf_qdwh_full.cc
@@ -0,0 +1,489 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "auxiliary/Debug.hh"
+#include "slate/Matrix.hh"
+#include "internal/internal.hh"
+
+namespace slate {
+
+// specialization namespace differentiates, e.g.,
+// internal::geqrf_qdwh_full from internal::specialization::geqrf
+namespace internal {
+namespace specialization {
+
+//------------------------------------------------------------------------------
+/// An auxiliary routine to find each rank's first (top-most) row
+/// in panel k.
+///
+/// @param[in] A_panel
+///     Current panel, which is a sub of the input matrix $A$.
+///
+/// @param[in] k
+///     Index of the current panel in the input matrix $A$.
+///
+/// @param[out] first_indices
+///     The array of computed indices.
+///
+/// @ingroup geqrf_qdwh_full_specialization
+///
+template <typename scalar_t>
+void geqrf_compute_first_indices(Matrix<scalar_t>& A_panel, int64_t k,
+                                 std::vector< int64_t >& first_indices)
+{
+    // Find ranks in this column.
+    std::set<int> ranks_set;
+    A_panel.getRanks(&ranks_set);
+    assert(ranks_set.size() > 0);
+
+    // Find each rank's first (top-most) row in this panel,
+    // where the triangular tile resulting from local geqrf panel
+    // will reside.
+    first_indices.reserve(ranks_set.size());
+    for (int r: ranks_set) {
+        for (int64_t i = 0; i < A_panel.mt(); ++i) {
+            if (A_panel.tileRank(i, 0) == r) {
+                first_indices.push_back(i+k);
+                break;
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Distributed parallel QR factorization.
+/// Generic implementation for any target.
+/// Panel and lookahead computed on host using Host OpenMP task.
+///
+/// ColMajor layout is assumed
+///
+/// @ingroup geqrf_specialization
+///
+template <Target target, typename scalar_t>
+void geqrf_qdwh_full(slate::internal::TargetType<target>,
+           Matrix<scalar_t>& A,
+           TriangularFactors<scalar_t>& T,
+           int64_t ib, int max_panel_threads, int64_t lookahead)
+{
+    using BcastList = typename Matrix<scalar_t>::BcastList;
+    using device_info_t = lapack::device_info_int;
+    using blas::real;
+
+    // Assumes column major
+    const Layout layout = Layout::ColMajor;
+
+    const int priority_zero = 0;
+    const int priority_one = 1;
+    const int life_factor_one = 1;
+    const bool set_hold = lookahead > 0;  // Do tileGetAndHold in the bcast
+
+    int64_t A_mt = A.mt();
+    int64_t A_nt = A.nt();
+    int64_t A_min_mtnt = std::min(A_mt, A_nt);
+
+    T.clear();
+    T.push_back(A.emptyLike());
+    T.push_back(A.emptyLike(ib, 0));
+    auto Tlocal  = T[0];
+    auto Treduce = T[1];
+
+    // workspace
+    auto W = A.emptyLike();
+
+    // setting up dummy variables for case the when target == host
+    int64_t num_devices  = A.num_devices();
+    int     panel_device = -1;
+    size_t  work_size    = 0;
+
+    std::vector< scalar_t* > dwork_array( num_devices, nullptr );
+
+    if (target == Target::Devices) {
+        const int64_t batch_size_zero = 0; // use default batch size
+        const int64_t num_queues = 3 + lookahead;
+        A.allocateBatchArrays(batch_size_zero, num_queues);
+        A.reserveDeviceWorkspace();
+        W.allocateBatchArrays(batch_size_zero, num_queues);
+        // todo: this is demanding too much device workspace memory
+        // only one tile-row of matrix W per MPI process is going to be used,
+        // but W with size of whole A is being allocated
+        // thus limiting the matrix size that can be processed
+        // For now, allocate workspace tiles 1-by-1.
+        //W.reserveDeviceWorkspace();
+
+        // Find largest panel size and device for copying to
+        // contiguous memory within internal geqrf routine
+        int64_t mlocal = 0;
+        int64_t first_panel_seen = -1;
+        for (int64_t j = 0; j < A.nt(); ++j) {
+            for (int64_t i = j; i < A.mt(); ++i) {
+                if (A.tileIsLocal(i, j)) {
+                    if (first_panel_seen < 0) {
+                        first_panel_seen = j;
+                    }
+                    if (first_panel_seen == j) {
+                        if (panel_device < 0) {
+                            panel_device = A.tileDevice(i, j);
+                        }
+                        mlocal += A.tileMb(i);
+                        // Asserting 1-D distribution for device
+                        assert( panel_device == A.tileDevice(i, j) );
+                    }
+                }
+            }
+            if (first_panel_seen >= 0) {
+                break;
+            }
+        }
+
+        if (panel_device >= 0) {
+
+            blas::set_device( panel_device );
+
+            lapack::Queue* queue = A.compute_queue(panel_device, 0);
+
+            int64_t nb       = A.tileNb(0);
+            size_t  size_tau = (size_t) std::min( mlocal, nb );
+            size_t  size_A   = (size_t) blas::max( 1, mlocal ) * nb;
+            size_t  hsize, dsize;
+
+            // Find size of the workspace needed
+            lapack::geqrf_work_size_bytes( mlocal, nb, dwork_array[0], mlocal,
+                                           &dsize, &hsize, *queue );
+
+            // Size of dA, dtau, dwork and dinfo
+            work_size = size_A + size_tau + ceildiv(dsize, sizeof(scalar_t))
+                        + ceildiv(sizeof(device_info_t), sizeof(scalar_t));
+
+            for (int64_t dev = 0; dev < num_devices; ++dev) {
+                blas::set_device(dev);
+                dwork_array[dev] = blas::device_malloc<scalar_t>(work_size);
+            }
+        }
+    }
+
+    // QR tracks dependencies by block-column.
+    // OpenMP needs pointer types, but vectors are exception safe
+    std::vector< uint8_t > block_vector(A_nt);
+    uint8_t* block = block_vector.data();
+
+    // set min number for omp nested active parallel regions
+    slate::OmpSetMaxActiveLevels set_active_levels( MinOmpActiveLevels );
+
+    #pragma omp parallel
+    #pragma omp master
+    {
+        for (int64_t k = 0; k < A_min_mtnt; ++k) {
+            int64_t i_end = std::min( A_mt-1, (A_mt-A_nt) + k);
+
+            auto  A_panel =       A.sub(k, i_end, k, k);
+            auto Tl_panel =  Tlocal.sub(k, i_end, k, k);
+            auto Tr_panel = Treduce.sub(k, i_end, k, k);
+
+            std::vector< int64_t > first_indices;
+            geqrf_compute_first_indices(A_panel, k, first_indices);
+            // todo: pass first_indices into internal geqrf or ttqrt?
+
+            // panel, high priority
+            #pragma omp task depend(inout:block[k]) priority(priority_one)
+            {
+                // local panel factorization
+                internal::geqrf<target>(
+                                std::move(A_panel),
+                                std::move(Tl_panel),
+                                dwork_array, work_size,
+                                ib, max_panel_threads, priority_one);
+
+                // triangle-triangle reductions
+                // ttqrt handles tile transfers internally
+                internal::ttqrt<Target::HostTask>(
+                                std::move(A_panel),
+                                std::move(Tr_panel));
+
+                // if a trailing matrix exists
+                if (k < A_nt-1) {
+
+                    // bcast V across row for trailing matrix update
+                    if (k < i_end+1) {
+                        BcastList bcast_list_V_first;
+                        BcastList bcast_list_V;
+                        for (int64_t i = k; i < i_end+1; ++i) {
+                            // send A(i, k) across row A(i, k+1:nt-1)
+                            // Vs in first_indices (except the main diagonal one) need three lives
+                            if ((std::find(first_indices.begin(), first_indices.end(), i) != first_indices.end()) && (i > k))
+                                bcast_list_V_first.push_back({i, k, {A.sub(i, i, k+1, A_nt-1)}});
+                            else
+                                bcast_list_V.push_back({i, k, {A.sub(i, i, k+1, A_nt-1)}});
+                        }
+                        A.template listBcast<target>(bcast_list_V_first, layout, 0, 3, set_hold);
+                        A.template listBcast<target>(bcast_list_V, layout, 0, 2, set_hold);
+                    }
+
+                    // bcast Tlocal across row for trailing matrix update
+                    if (first_indices.size() > 0) {
+                        BcastList bcast_list_T;
+                        for (int64_t row : first_indices) {
+                            bcast_list_T.push_back({row, k, {Tlocal.sub(row, row, k+1, A_nt-1)}});
+                        }
+                        Tlocal.template listBcast<target>(bcast_list_T, layout, k, life_factor_one, set_hold);
+                    }
+
+                    // bcast Treduce across row for trailing matrix update
+                    if (first_indices.size() > 1) {
+                        BcastList bcast_list_T;
+                        for (int64_t row : first_indices) {
+                            if (row > k) // exclude the first row of this panel that has no Treduce tile
+                                bcast_list_T.push_back({row, k, {Treduce.sub(row, row, k+1, A_nt-1)}});
+                        }
+                        Treduce.template listBcast(bcast_list_T, layout);
+                    }
+                }
+            }
+
+            // update lookahead column(s) on CPU, high priority
+            for (int64_t j = k+1; j < (k+1+lookahead) && j < A_nt; ++j) {
+
+                auto A_trail_j = A.sub(k, i_end, j, j);
+
+                #pragma omp task depend(in:block[k]) \
+                                 depend(inout:block[j]) \
+                                 priority(priority_one)
+                {
+                    // Apply local reflectors
+                    internal::unmqr<target>(
+                                    Side::Left, Op::ConjTrans,
+                                    std::move(A_panel),
+                                    std::move(Tl_panel),
+                                    std::move(A_trail_j),
+                                    W.sub(k, i_end, j, j),
+                                    priority_one, j-k+1);
+
+                    // Apply triangle-triangle reduction reflectors
+                    // ttmqr handles the tile broadcasting internally
+                    internal::ttmqr<Target::HostTask>(
+                                    Side::Left, Op::ConjTrans,
+                                    std::move(A_panel),
+                                    std::move(Tr_panel),
+                                    std::move(A_trail_j),
+                                    j);
+                }
+            }
+
+            // update trailing submatrix, normal priority
+            if (k+1+lookahead < A_nt) {
+                int64_t j = k+1+lookahead;
+                auto A_trail_j = A.sub(k, i_end, j, A_nt-1);
+
+                #pragma omp task depend(in:block[k]) \
+                                 depend(inout:block[k+1+lookahead]) \
+                                 depend(inout:block[A_nt-1])
+                {
+                    // Apply local reflectors.
+                    internal::unmqr<target>(
+                                    Side::Left, Op::ConjTrans,
+                                    std::move(A_panel),
+                                    std::move(Tl_panel),
+                                    std::move(A_trail_j),
+                                    W.sub(k, i_end, j, A_nt-1),
+                                    priority_zero, j-k+1);
+
+                    // Apply triangle-triangle reduction reflectors.
+                    // ttmqr handles the tile broadcasting internally.
+                    internal::ttmqr<Target::HostTask>(
+                                    Side::Left, Op::ConjTrans,
+                                    std::move(A_panel),
+                                    std::move(Tr_panel),
+                                    std::move(A_trail_j),
+                                    j);
+                }
+            }
+            if (target == Target::Devices) {
+                // Update the status of the on-hold tiles held by the invocation of
+                // the tileBcast routine, and then release them to free up memory.
+                // The origin must be updated with the latest modified copy
+                // for memory consistency.
+                // TODO: Find better solution to handle tile release, and
+                //       investigate the correctness of the task dependency
+                if (k >= lookahead && k < A_nt-1) {
+                    #pragma omp task depend(in:block[k]) \
+                                     depend(inout:block[k+1])
+                    {
+                        int64_t k_la = k-lookahead;
+                        for (int64_t i = k_la; i < A_mt; ++i) {
+                            if (A.tileIsLocal(i, k_la)) {
+                                A.tileUpdateOrigin(i, k_la);
+
+                                std::set<int> dev_set;
+                                A.sub(i, i, k_la+1, A_nt-1).getLocalDevices(&dev_set);
+
+                                for (auto device : dev_set) {
+                                    A.tileUnsetHold(i, k_la, device);
+                                    A.tileRelease(i, k_la, device);
+                                }
+                            }
+                        }
+
+                        auto A_panel_k_la = A.sub(k_la, A_mt-1, k_la, k_la);
+                        std::vector< int64_t > first_indices_k_la;
+                        geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
+                        if (first_indices.size() > 0) {
+                            for (int64_t row : first_indices_k_la) {
+                                if (Tlocal.tileIsLocal(row, k_la)) {
+                                    Tlocal.tileUpdateOrigin(row, k_la);
+
+                                    std::set<int> dev_set;
+                                    Tlocal.sub(row, row, k_la+1, A_nt-1).getLocalDevices(&dev_set);
+
+                                    for (auto device : dev_set) {
+                                        Tlocal.tileUnsetHold(row, k_la, device);
+                                        Tlocal.tileRelease(row, k_la, device);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        #pragma omp taskwait
+        A.tileUpdateAllOrigin();
+    }
+
+    A.releaseWorkspace();
+
+    if (target == Target::Devices) {
+        for (int64_t dev = 0; dev < num_devices; ++dev) {
+            blas::set_device(dev);
+            blas::device_free( dwork_array[dev] );
+            dwork_array[dev] = nullptr;
+        }
+    }
+}
+
+} // namespace specialization
+} // namespace internal
+
+//------------------------------------------------------------------------------
+/// Version with target as template parameter.
+/// @ingroup geqrf_qdwh_full_specialization
+///
+template <Target target, typename scalar_t>
+void geqrf_qdwh_full(Matrix<scalar_t>& A,
+           TriangularFactors<scalar_t>& T,
+           Options const& opts)
+{
+    int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );
+
+    int64_t ib = get_option<int64_t>( opts, Option::InnerBlocking, 16 );
+
+    int64_t max_panel_threads  = std::max(omp_get_max_threads()/2, 1);
+    max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads, max_panel_threads );
+
+    internal::specialization::geqrf_qdwh_full(internal::TargetType<target>(),
+                                    A, T,
+                                    ib, max_panel_threads, lookahead);
+}
+
+//------------------------------------------------------------------------------
+/// Distributed parallel QR factorization.
+///
+/// Computes a QR factorization of an m-by-n matrix $A$.
+/// The factorization has the form
+/// \[
+///     A = QR,
+/// \]
+/// where $Q$ is a matrix with orthonormal columns and $R$ is upper triangular
+/// (or upper trapezoidal if m < n).
+///
+/// Complexity (in real):
+/// - for $m \ge n$, $\approx 2 m n^{2} - \frac{2}{3} n^{3}$ flops;
+/// - for $m \le n$, $\approx 2 m^{2} n - \frac{2}{3} m^{3}$ flops;
+/// - for $m = n$,   $\approx \frac{4}{3} n^{3}$ flops.
+/// .
+//------------------------------------------------------------------------------
+/// @tparam scalar_t
+///     One of float, double, std::complex<float>, std::complex<double>.
+//------------------------------------------------------------------------------
+/// @param[in,out] A
+///     On entry, the m-by-n matrix $A$.
+///     On exit, the elements on and above the diagonal of the array contain
+///     the min(m,n)-by-n upper trapezoidal matrix $R$ (upper triangular
+///     if m >= n); the elements below the diagonal represent the unitary
+///     matrix $Q$ as a product of elementary reflectors.
+///
+/// @param[out] T
+///     On exit, triangular matrices of the block reflectors.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Lookahead:
+///       Number of panels to overlap with matrix updates.
+///       lookahead >= 0. Default 1.
+///     - Option::InnerBlocking:
+///       Inner blocking to use for panel. Default 16.
+///     - Option::MaxPanelThreads:
+///       Number of threads to use for panel. Default omp_get_max_threads()/2.
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - HostBatch: batched BLAS on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup geqrf_computational
+///
+template <typename scalar_t>
+void geqrf_qdwh_full(Matrix<scalar_t>& A,
+           TriangularFactors<scalar_t>& T,
+           Options const& opts)
+{
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    switch (target) {
+        case Target::Host:
+        case Target::HostTask:
+            geqrf_qdwh_full<Target::HostTask>(A, T, opts);
+            break;
+        case Target::HostNest:
+            geqrf_qdwh_full<Target::HostNest>(A, T, opts);
+            break;
+        case Target::HostBatch:
+            geqrf_qdwh_full<Target::HostBatch>(A, T, opts);
+            break;
+        case Target::Devices:
+            geqrf_qdwh_full<Target::Devices>(A, T, opts);
+            break;
+    }
+    // todo: return value for errors?
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void geqrf_qdwh_full<float>(
+    Matrix<float>& A,
+    TriangularFactors<float>& T,
+    Options const& opts);
+
+template
+void geqrf_qdwh_full<double>(
+    Matrix<double>& A,
+    TriangularFactors<double>& T,
+    Options const& opts);
+
+template
+void geqrf_qdwh_full< std::complex<float> >(
+    Matrix< std::complex<float> >& A,
+    TriangularFactors< std::complex<float> >& T,
+    Options const& opts);
+
+template
+void geqrf_qdwh_full< std::complex<double> >(
+    Matrix< std::complex<double> >& A,
+    TriangularFactors< std::complex<double> >& T,
+    Options const& opts);
+
+} // namespace slate
diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc
new file mode 100644
index 000000000..563ac796b
--- /dev/null
+++ b/src/unmqr_qdwh_full.cc
@@ -0,0 +1,403 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "auxiliary/Debug.hh"
+#include "slate/Matrix.hh"
+#include "internal/Tile_tpmqrt.hh"
+#include "internal/internal.hh"
+
+namespace slate {
+
+// specialization namespace differentiates, e.g.,
+// internal::unmqr_qdwh_full from internal::specialization::unmqr_qdwh_full
+namespace internal {
+namespace specialization {
+
+//------------------------------------------------------------------------------
+/// Distributed parallel multiply by Q from QR factorization.
+/// Generic implementation for any target.
+/// @ingroup geqrf_specialization
+///
+template <Target target, typename scalar_t>
+void unmqr_qdwh_full(
+    slate::internal::TargetType<target>,
+    Side side, Op op,
+    Matrix<scalar_t>& A,
+    TriangularFactors<scalar_t>& T,
+    Matrix<scalar_t>& C)
+{
+    // trace::Block trace_block("unmqr");
+    // const int priority_one = 1;
+    using BcastList = typename Matrix<scalar_t>::BcastList;
+
+    // Assumes column major
+    const Layout layout = Layout::ColMajor;
+
+    int64_t A_mt = A.mt();
+    int64_t A_nt = A.nt();
+    int64_t A_min_mtnt = std::min(A_mt, A_nt);
+    int64_t C_mt = C.mt();
+    int64_t C_nt = C.nt();
+
+    if (is_complex<scalar_t>::value && op == Op::Trans) {
+        throw Exception("Complex numbers uses Op::ConjTrans, not Op::Trans.");
+    }
+
+    if (target == Target::Devices) {
+        C.allocateBatchArrays();
+        C.reserveDeviceWorkspace();
+    }
+
+    // Reserve workspace
+    auto W = C.emptyLike();
+
+    if (target == Target::Devices) {
+        W.allocateBatchArrays();
+        // todo: this is demanding too much device workspace memory
+        // only one tile-row of matrix W per MPI process is going to be used,
+        // but W with size of whole C is being allocated
+        // thus limiting the matrix size that can be processed
+        //W.reserveDeviceWorkspace();
+    }
+
+    assert(T.size() == 2);
+    auto Tlocal  = T[0];
+    auto Treduce = T[1];
+
+    // QR tracks dependencies by block-column.
+    // OpenMP needs pointer types, but vectors are exception safe
+    std::vector< uint8_t > block_vector(A_nt);
+    uint8_t* block = block_vector.data();
+
+    // set min number for omp nested active parallel regions
+    slate::OmpSetMaxActiveLevels set_active_levels( MinOmpActiveLevels );
+
+    #pragma omp parallel
+    #pragma omp master
+    {
+        int64_t k_begin, k_end, k_step;
+        if ((side == Side::Left) == (op == Op::NoTrans)) {
+            // Left,  NoTrans:     multiply Q C   = Q1 ... QK C, or
+            // Right, (Conj)Trans: multiply C Q^H = C QK^H ... Q1^H,
+            // i.e., in reverse order of how Qk's were created.
+            k_begin = A_min_mtnt-1;
+            k_end   = -1;
+            k_step  = -1;
+        }
+        else {
+            // Left, (Conj)Trans: multiply Q^H C = QK^H ... Q1^H C, or
+            // Right, NoTrans:    multiply C Q   = C Q1 ... QK,
+            // i.e., in same order as Qk's were created.
+            k_begin = 0;
+            k_end   = A_min_mtnt;
+            k_step  = 1;
+        }
+
+        // for k = k_begin, lastk = k_begin (no previous col to depend on);
+        // otherwise,       lastk = k - k_step (previous col).
+        int64_t lastk = k_begin;
+        // OpenMP uses lastk; compiler doesn't, so warns it is unused.
+        SLATE_UNUSED(lastk);
+        for (int64_t k = k_begin; k != k_end; k += k_step) {
+
+            int64_t i_end = std::min( A_mt-1, (A_mt-A_nt) + k);
+
+            auto A_panel = A.sub(k, i_end, k, k);
+
+            // Find ranks in this column.
+            std::set<int> ranks_set;
+            A_panel.getRanks(&ranks_set);
+            assert(ranks_set.size() > 0);
+
+            // Find each rank's first (top-most) row in this panel,
+            // where the triangular tile resulting from local geqrf
+            // panel will reside.
+            std::vector< int64_t > first_indices;
+            first_indices.reserve(ranks_set.size());
+            for (int r: ranks_set) {
+                for (int64_t i = 0; i < A_panel.mt(); ++i) {
+                    if (A_panel.tileRank(i, 0) == r) {
+                        first_indices.push_back(i+k);
+                        break;
+                    }
+                }
+            }
+
+            #pragma omp task depend(inout:block[k]) \
+                             depend(in:block[lastk])
+            {
+                // Indices for row or col of C.
+                int64_t i0 = -1, i1 = -1, j0 = -1, j1 = -1;
+                if (side == Side::Left) {
+                    j0 = 0;
+                    j1 = C_nt-1;
+                }
+                else {
+                    i0 = 0;
+                    i1 = C_mt-1;
+                }
+
+                // Send V(i) across row C(i, 0:nt-1) or col C(0:mt-1, i),
+                // for side = left or right, respectively.
+                BcastList bcast_list_V_top;
+                BcastList bcast_list_V;
+                for (int64_t i = k; i < i_end+1; ++i) {
+                    if (side == Side::Left) {
+                        i0 = i;
+                        i1 = i;
+                    }
+                    else {
+                        j0 = i;
+                        j1 = i;
+                    }
+                    if (std::find(first_indices.begin(), first_indices.end(), i) != first_indices.end()) {
+                        bcast_list_V_top.push_back(
+                            {i, k, {C.sub(i0, i1, j0, j1)}});
+                    }
+                    else {
+                        bcast_list_V.push_back(
+                            {i, k, {C.sub(i0, i1, j0, j1)}});
+                    }
+                }
+                // V tiles in first_indices need up to 5 lives: 1 for ttmqr,
+                // 2 + extra 2 if mb > nb (trapezoid) for Vs in unmqr I-VTV^T.
+                // This may leak a few tiles that A.clearWorkspace will cleanup.
+                A.template listBcast(bcast_list_V_top, layout, 0, 5);
+                A.template listBcast(bcast_list_V, layout, 0, 2);
+
+                // Send Tlocal(i) across row C(i, 0:nt-1) or col C(0:mt-1, i).
+                if (first_indices.size() > 0) {
+                    BcastList bcast_list_T;
+                    for (int64_t i : first_indices) {
+                        if (side == Side::Left) {
+                            i0 = i;
+                            i1 = i;
+                        }
+                        else {
+                            j0 = i;
+                            j1 = i;
+                        }
+                        bcast_list_T.push_back(
+                            {i, k, {C.sub(i0, i1, j0, j1)}});
+                    }
+                    Tlocal.template listBcast(bcast_list_T, layout);
+                }
+
+                // Send Treduce(i) across row C(i, 0:nt-1) or col C(0:mt-1, i).
+                if (first_indices.size() > 1) {
+                    BcastList bcast_list_T;
+                    for (int64_t i : first_indices) {
+                        if (side == Side::Left) {
+                            i0 = i;
+                            i1 = i;
+                        }
+                        else {
+                            j0 = i;
+                            j1 = i;
+                        }
+                        // Exclude first row of this panel,
+                        // which doesn't have Treduce tile.
+                        if (i > k) {
+                            bcast_list_T.push_back(
+                                {i, k, {C.sub(i0, i1, j0, j1)}});
+                        }
+                    }
+                    Treduce.template listBcast(bcast_list_T, layout);
+                }
+
+                Matrix<scalar_t> C_trail, W_trail;
+                if (side == Side::Left) {
+                    C_trail = C.sub(k, i_end, 0, C_nt-1);
+                    W_trail = W.sub(k, i_end, 0, C_nt-1);
+                }
+                else {
+                    C_trail = C.sub(0, i_end, k, C_nt-1);
+                    W_trail = W.sub(0, i_end, k, C_nt-1);
+                }
+
+                // Left,  NoTrans:     Qi C   = Qi_local Qi_reduce C, or
+                // Right, (Conj)Trans: C Qi^H = C Qi_reduce^H Qi_local^H,
+                // do ttmqr then unmqr.
+                if ((side == Side::Left) == (op == Op::NoTrans)) {
+                    // Apply triangle-triangle reduction reflectors.
+                    internal::ttmqr<Target::HostTask>(
+                                    side, op,
+                                    std::move(A_panel),
+                                    Treduce.sub(k, i_end, k, k),
+                                    std::move(C_trail));
+                }
+
+                // Apply local reflectors.
+                internal::unmqr<target>(
+                                side, op,
+                                std::move(A_panel),
+                                Tlocal.sub(k, i_end, k, k),
+                                std::move(C_trail),
+                                std::move(W_trail));
+
+                // Left,  (Conj)Trans: Qi^H C = Qi_reduce^H Qi_local^H C, or
+                // Right, NoTrans:     C Qi   = C Qi_local Qi_reduce,
+                // do unmqr then ttmqr.
+                if ((side == Side::Left) != (op == Op::NoTrans)) {
+                    // Apply triangle-triangle reduction reflectors.
+                    internal::ttmqr<Target::HostTask>(
+                                    side, op,
+                                    std::move(A_panel),
+                                    Treduce.sub(k, A_mt-1, k, k),
+                                    std::move(C_trail));
+                }
+            }
+
+            lastk = k;
+        }
+
+        #pragma omp taskwait
+        C.tileUpdateAllOrigin();
+    }
+
+    A.clearWorkspace();
+    C.clearWorkspace();
+}
+
+} // namespace specialization
+} // namespace internal
+
+//------------------------------------------------------------------------------
+/// Version with target as template parameter.
+/// @ingroup geqrf_specialization
+///
+template <Target target, typename scalar_t>
+void unmqr_qdwh_full(
+    Side side, Op op,
+    Matrix<scalar_t>& A,
+    TriangularFactors<scalar_t>& T,
+    Matrix<scalar_t>& C,
+    Options const& opts)
+{
+    internal::specialization::unmqr_qdwh_full(internal::TargetType<target>(),
+                                    side, op, A, T, C);
+}
+
+//------------------------------------------------------------------------------
+/// Distributed parallel multiply by $Q$ from QR factorization.
+///
+/// Multiplies the general m-by-n matrix $C$ by $Q$ from QR factorization,
+/// according to:
+///
+/// op              |  side = Left  |  side = Right
+/// --------------- | ------------- | --------------
+/// op = NoTrans    |  $Q C  $      |  $C Q  $
+/// op = ConjTrans  |  $Q^H C$      |  $C Q^H$
+///
+/// where $Q$ is a unitary matrix defined as the product of k
+/// elementary reflectors
+/// \[
+///     Q = H(1) H(2) . . . H(k)
+/// \]
+/// as returned by geqrf. $Q$ is of order m if side = Left
+/// and of order n if side = Right.
+///
+//------------------------------------------------------------------------------
+/// @tparam scalar_t
+///     One of float, double, std::complex<float>, std::complex<double>.
+//------------------------------------------------------------------------------
+/// @param[in] side
+///     - Side::Left:  apply $Q$ or $Q^H$ from the left;
+///     - Side::Right: apply $Q$ or $Q^H$ from the right.
+///
+/// @param[in] op
+///     - Op::NoTrans    apply $Q$;
+///     - Op::ConjTrans: apply $Q^H$;
+///     - Op::Trans:     apply $Q^T$ (only if real).
+///       In the real case, Op::Trans is equivalent to Op::ConjTrans.
+///       In the complex case, Op::Trans is not allowed.
+///
+/// @param[in] A
+///     Details of the QR factorization of the original matrix $A$ as returned
+///     by geqrf.
+///
+/// @param[in] T
+///     Triangular matrices of the block reflectors as returned by geqrf.
+///
+/// @param[in,out] C
+///     On entry, the m-by-n matrix $C$.
+///     On exit, $C$ is overwritten by $Q C$, $Q^H C$, $C Q$, or $C Q^H$.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - HostBatch: batched BLAS on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup geqrf_computational
+///
+template <typename scalar_t>
+void unmqr_qdwh_full(
+    Side side, Op op,
+    Matrix<scalar_t>& A,
+    TriangularFactors<scalar_t>& T,
+    Matrix<scalar_t>& C,
+    Options const& opts)
+{
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    switch (target) {
+        case Target::Host:
+        case Target::HostTask:
+        default:
+            unmqr_qdwh_full<Target::HostTask>(side, op, A, T, C, opts);
+            break;
+        case Target::HostNest:
+            unmqr_qdwh_full<Target::HostNest>(side, op, A, T, C, opts);
+            break;
+        case Target::HostBatch:
+            unmqr_qdwh_full<Target::HostBatch>(side, op, A, T, C, opts);
+            break;
+        case Target::Devices:
+            unmqr_qdwh_full<Target::Devices>(side, op, A, T, C, opts);
+            break;
+    }
+    // todo: return value for errors?
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void unmqr_qdwh_full<float>(
+    Side side, Op op,
+    Matrix<float>& A,
+    TriangularFactors<float>& T,
+    Matrix<float>& C,
+    Options const& opts);
+
+template
+void unmqr_qdwh_full<double>(
+    Side side, Op op,
+    Matrix<double>& A,
+    TriangularFactors<double>& T,
+    Matrix<double>& C,
+    Options const& opts);
+
+template
+void unmqr_qdwh_full< std::complex<float> >(
+    Side side, Op op,
+    Matrix< std::complex<float> >& A,
+    TriangularFactors< std::complex<float> >& T,
+    Matrix< std::complex<float> >& C,
+    Options const& opts);
+
+template
+void unmqr_qdwh_full< std::complex<double> >(
+    Side side, Op op,
+    Matrix< std::complex<double> >& A,
+    TriangularFactors< std::complex<double> >& T,
+    Matrix< std::complex<double> >& C,
+    Options const& opts);
+
+} // namespace slate

From 554769f8fa067d52c42ae033809363bc48d11679 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 31 Oct 2022 16:06:50 -0400
Subject: [PATCH 04/49] Test qdwh

---
 test/test.cc      |   2 +
 test/test.hh      |   3 +
 test/test_qdwh.cc | 216 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 221 insertions(+)
 create mode 100644 test/test_qdwh.cc

diff --git a/test/test.cc b/test/test.cc
index 0fac169a8..d75dfdfbb 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -277,6 +277,8 @@ std::vector< testsweeper::routines_t > routines = {
     { "syset",              test_set,          Section::aux },
     { "heset",              test_set,          Section::aux },
     { "",                   nullptr,           Section::newline },
+
+    { "qdwh",              test_qdwh,          Section::qr },
 };
 
 // -----------------------------------------------------------------------------
diff --git a/test/test.hh b/test/test.hh
index cbe314bf2..592333fc3 100644
--- a/test/test.hh
+++ b/test/test.hh
@@ -256,6 +256,9 @@ void test_scale  (Params& params, bool run);
 void test_scale_row_col(Params& params, bool run);
 void test_set    (Params& params, bool run);
 
+// polar decomposition
+void test_qdwh  (Params& params, bool run);
+
 // -----------------------------------------------------------------------------
 inline slate::Dist str2dist(const char* dist)
 {
diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
new file mode 100644
index 000000000..1281a1e05
--- /dev/null
+++ b/test/test_qdwh.cc
@@ -0,0 +1,216 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "test.hh"
+#include "blas/flops.hh"
+#include "lapack/flops.hh"
+#include "print_matrix.hh"
+#include "grid_utils.hh"
+
+#include "scalapack_wrappers.hh"
+#include "scalapack_support_routines.hh"
+#include "scalapack_copy.hh"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <utility>
+#define SLATE_HAVE_SCALAPACK
+//------------------------------------------------------------------------------
+template <typename scalar_t>
+void test_qdwh_work(Params& params, bool run)
+{
+    using real_t = blas::real_type<scalar_t>;
+    using blas::real;
+
+    // Constants
+    const scalar_t zero = 0;
+    const scalar_t one = 1;
+
+    // get & mark input values
+    int64_t m = params.dim.m();
+    int64_t n = params.dim.n();
+    int p = params.grid.m();
+    int q = params.grid.n();
+    int64_t nb = params.nb();
+    int64_t ib = params.ib();
+    int64_t lookahead = params.lookahead();
+    int64_t panel_threads = params.panel_threads();
+    bool ref_only = params.ref() == 'o';
+    bool ref = params.ref() == 'y' || ref_only;
+    bool check = params.check() == 'y' && ! ref_only;
+    bool trace = params.trace() == 'y';
+    slate::Origin origin = params.origin();
+    slate::Target target = params.target();
+    params.matrix.mark();
+
+    // mark non-standard output values
+    params.time();
+    params.gflops();
+    params.ref_time();
+    params.ref_gflops();
+    params.ortho();
+
+    if (! run)
+        return;
+
+    slate::Options const opts =  {
+        {slate::Option::Lookahead, lookahead},
+        {slate::Option::Target, target},
+        {slate::Option::MaxPanelThreads, panel_threads},
+        {slate::Option::InnerBlocking, ib}
+    };
+
+    // MPI variables
+    int mpi_rank, myrow, mycol;
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+    gridinfo(mpi_rank, p, q, &myrow, &mycol);
+
+
+    // BLACS/MPI variables
+    // matrix A, figure out local size, allocate, create descriptor, initialize
+    int64_t mlocA = num_local_rows_cols(m, nb, myrow, p);
+    int64_t nlocA = num_local_rows_cols(n, nb, mycol, q);
+    int64_t lldA  = blas::max(1, mlocA); // local leading dimension of A
+    std::vector<scalar_t> A_data;
+
+    int64_t mlocH = num_local_rows_cols(n, nb, myrow, p);
+    int64_t nlocH = num_local_rows_cols(n, nb, mycol, q);
+    int64_t lldH  = blas::max(1, mlocH); // local leading dimension of A
+    std::vector<scalar_t> H_data;
+
+    // matrix QR, for checking result
+    slate::Matrix<scalar_t> Id;
+    Id = slate::Matrix<scalar_t>(n, n, nb, p, q, MPI_COMM_WORLD);
+    Id.insertLocalTiles();
+
+    slate::Matrix<scalar_t> A;
+    slate::Matrix<scalar_t> H;
+    if (origin != slate::Origin::ScaLAPACK) {
+        // Copy local ScaLAPACK data to GPU or CPU tiles.
+        slate::Target origin_target = origin2target(origin);
+        A = slate::Matrix<scalar_t>(m, n, nb, p, q, MPI_COMM_WORLD);
+        A.insertLocalTiles(origin_target);
+        //copy(&A_tst[0], descA_tst, A);
+        H = slate::Matrix<scalar_t>(n, n, nb, p, q, MPI_COMM_WORLD);
+        H.insertLocalTiles(origin_target);
+        //copy(&H_tst[0], descH_tst, H);
+    }
+    else {
+        // create SLATE matrices from the ScaLAPACK layouts
+        A_data.resize( lldA * nlocA );
+        H_data.resize( lldH * nlocH );
+        A = slate::Matrix<scalar_t>::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
+        H = slate::Matrix<scalar_t>::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
+    }
+
+    real_t cond = 1 / std::numeric_limits<real_t>::epsilon();
+    params.matrix.kind.set_default("svd");
+    params.matrix.cond.set_default(cond);
+
+    slate::generate_matrix( params.matrix, A);
+
+    // if check is required, copy test data
+    slate::Matrix<scalar_t> Aref;
+    std::vector<scalar_t> Aref_data;
+    if (check || ref) {
+        Aref_data.resize( lldA * nlocA );
+        Aref = slate::Matrix<scalar_t>::fromScaLAPACK(
+            m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
+        slate::copy(A, Aref);
+    }
+
+    print_matrix("A", A, params);
+
+    // todo: how to compute gflops?
+    double gflop = lapack::Gflop<scalar_t>::geqrf(m, n);
+
+    if (! ref_only) {
+        if (trace) slate::trace::Trace::on();
+        else slate::trace::Trace::off();
+
+        double time = barrier_get_wtime(MPI_COMM_WORLD);
+
+        //==================================================
+        // Run SLATE test.
+        //==================================================
+        // A = AH,
+        // A will be written by the orthogonal polar factor
+        // H is the symmetric positive semidefinite polar factor
+        slate::qdwh(A, H, opts);
+
+        time = barrier_get_wtime(MPI_COMM_WORLD) - time;
+
+        if (trace) slate::trace::Trace::finish();
+
+        // compute and save timing/performance
+        params.time() = time;
+        params.gflops() = gflop / time;
+
+        print_matrix("U: orthogonal polar factor", A, params);
+        print_matrix("H: symmetric postive semi definite factor", H, params);
+    }
+
+    if (check) {
+        //==================================================
+        // Test results by checking the orthogonality of U factor
+        //
+        //      || A'A - I ||_f
+        //     ---------------- < tol * epsilon
+        //            n
+        //
+        //==================================================
+        auto AT = conj_transpose(A);
+        set(zero, one, Id);
+        slate::gemm(one, AT, A, -one, Id, opts);
+        real_t orth = slate::norm(slate::Norm::Fro, Id);
+        params.ortho() = orth / sqrt(n);
+
+        //==================================================
+        // Test results by checking backwards error
+        //
+        //      || A'H - Aref ||_f
+        //     -------------------    < tol * epsilon
+        //          ||Aref||_f
+        //
+        //==================================================
+        real_t normA = slate::norm(slate::Norm::Fro, Aref);
+        slate::gemm(one, A, H, -one, Aref, opts);
+        real_t berr = slate::norm(slate::Norm::Fro, Aref);
+        params.error() = berr / normA;
+
+        real_t tol = params.tol() * 0.5 * std::numeric_limits<real_t>::epsilon();
+        params.okay() = ((params.error() <= tol) && (params.ortho() <= tol));
+
+
+    }
+}
+
+// -----------------------------------------------------------------------------
+void test_qdwh(Params& params, bool run)
+{
+    switch (params.datatype()) {
+        case testsweeper::DataType::Integer:
+            throw std::exception();
+            break;
+
+        case testsweeper::DataType::Single:
+            test_qdwh_work<float> (params, run);
+            break;
+
+        case testsweeper::DataType::Double:
+            test_qdwh_work<double> (params, run);
+            break;
+
+        case testsweeper::DataType::SingleComplex:
+            test_qdwh_work<std::complex<float>> (params, run);
+            break;
+
+        case testsweeper::DataType::DoubleComplex:
+            test_qdwh_work<std::complex<double>> (params, run);
+            break;
+    }
+}

From 3fa51a1970dd1183aa9292d4312dc0c22ef10243 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 31 Oct 2022 16:07:42 -0400
Subject: [PATCH 05/49] call geqrf/unmqr_qdwh_full

---
 src/qdwh.cc | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index 1789db558..4296cc9f5 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -22,7 +22,7 @@ namespace impl {
 /// @ingroup geqrf_specialization
 ///
 template <Target target, typename scalar_t>
-void qdwh(slate::internal::TargetType<target>,
+void qdwh(
           Matrix<scalar_t>& A,
           Matrix<scalar_t>& H, // this matrix will be hermition
           Options const& opts)
@@ -31,7 +31,6 @@ void qdwh(slate::internal::TargetType<target>,
     using blas::real;
 
     int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );
-    int64_t ib = get_option<int64_t>( opts, Option::InnerBlocking, 16 );
     int64_t max_panel_threads  = std::max(omp_get_max_threads()/2, 1);
     max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads, max_panel_threads );
 
@@ -56,7 +55,8 @@ void qdwh(slate::internal::TargetType<target>,
     real_t tol1 = 5. * eps;
     real_t tol3 = pow(tol1, 1./3.);
 
-    double L2, sqd, dd, a1, a, b, c;
+    real_t L2, sqd, dd, a1, a, b, c;
+    //double L2, sqd, dd, a1, a, b, c;
     real_t Li, Liconv;
     real_t conv = 100.;
     scalar_t zero = 0.0, one = 1.0, minusone = -1.0;
@@ -152,7 +152,8 @@ void qdwh(slate::internal::TargetType<target>,
         }
         itconv++;
 
-        L2  = double (Liconv * Liconv);
+        L2  =  Liconv * Liconv;
+        //L2  = double (Liconv * Liconv);
         dd  = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 );
         sqd = sqrt(1.0 + dd);
         a1  = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0;
@@ -173,7 +174,8 @@ void qdwh(slate::internal::TargetType<target>,
         it++;
 
         // Compute parameters L,a,b,c
-        L2  = double (Li * Li);
+        //L2  = double (Li * Li);
+        L2  = Li * Li;
         dd  = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 );
         sqd = sqrt(1.0 + dd);
         a1  = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0;
@@ -204,13 +206,13 @@ void qdwh(slate::internal::TargetType<target>,
             // Factorize B = QR, and generate the associated Q
             add(alpha, A, zero, W1, opts);
             //geqrf(W, T1, opts); // naive impl
-            //geqrf_qdwh_full(W, T1, opts);
+            geqrf_qdwh_full(W, T1, opts);
 
             set(zero, one, Q1);
             set(zero, zero, Q2);
 
             //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts); //naive impl
-            //unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts);
+            unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts);
 
             // A = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * A
             auto Q2T = conj_transpose(Q2);
@@ -330,7 +332,8 @@ void qdwh(slate::internal::TargetType<target>,
 /// @ingroup geqrf_computational
 ///
 template <typename scalar_t>
-void qdwh(Matrix<scalar_t>& A,
+void qdwh(
+          Matrix<scalar_t>& A,
           Matrix<scalar_t>& H,
           Options const& opts)
 {
@@ -340,20 +343,16 @@ void qdwh(Matrix<scalar_t>& A,
     switch (target) {
         case Target::Host:
         case Target::HostTask:
-            impl::qdwh<Target::HostTask>( TargetType<Target::HostTask>(),
-                       A, H, opts);
+            impl::qdwh<Target::HostTask>( A, H, opts );
             break;
         case Target::HostNest:
-            impl::qdwh<Target::HostNest>( TargetType<Target::HostNest>(),
-                       A, H, opts);
+            impl::qdwh<Target::HostNest>( A, H, opts );
             break;
         case Target::HostBatch:
-            impl::qdwh<Target::HostBatch>( TargetType<Target::HostBatch>(),
-                       A, H, opts);
+            impl::qdwh<Target::HostBatch>( A, H, opts );
             break;
         case Target::Devices:
-            impl::qdwh<Target::Devices>( TargetType<Target::Devices>(),
-                       A, H, opts);
+            impl::qdwh<Target::Devices>( A, H, opts );
             break;
     }
     // todo: return value for errors?

From ad698cc32081e45d12459201586c823afb4c6ad4 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 31 Oct 2022 16:09:21 -0400
Subject: [PATCH 06/49] Added qdwh codes to slate.hh and to makefile

---
 GNUmakefile            |  5 +++++
 include/slate/slate.hh | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/GNUmakefile b/GNUmakefile
index b96c6ffa6..7021560f5 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -561,6 +561,10 @@ ifneq ($(only_unit),1)
         src/work/work_trmm.cc \
         src/work/work_trsm.cc \
         src/work/work_trsmA.cc \
+		src/qdwh.cc \
+		src/normest.cc \
+		src/geqrf_qdwh_full.cc \
+		src/unmqr_qdwh_full.cc \
         # End. Add alphabetically.
 endif
 
@@ -662,6 +666,7 @@ ifneq ($(have_fortran),)
         test/pdlantr.f \
         test/pclantr.f \
         test/pzlantr.f \
+        test/test_qdwh.cc \
         # End. Add alphabetically, by base name after precision.
 endif
 
diff --git a/include/slate/slate.hh b/include/slate/slate.hh
index 16d36329f..22d249fc8 100644
--- a/include/slate/slate.hh
+++ b/include/slate/slate.hh
@@ -1085,6 +1085,42 @@ void steqr2(
     Matrix<scalar_t>& Z,
     Options const& opts = Options());
 
+//-----------------------------------------
+// Polar Decomposition
+
+//-----------------------------------------
+// qdwh()
+template <typename scalar_t>
+void qdwh(
+    Matrix<scalar_t>& A,
+    Matrix<scalar_t>& H, // this matrix will be hermition
+    Options const& opts = Options());
+
+//-----------------------------------------
+// normest()
+template <typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+normest(
+    Norm in_norm,
+    matrix_type &A,
+    Options const& opts = Options());
+
+//-----------------------------------------
+// geqrf_qdwh_full()
+template <typename scalar_t>
+void geqrf_qdwh_full(
+    Matrix<scalar_t>& A, TriangularFactors<scalar_t>& T,
+    Options const& opts = Options());
+
+//-----------------------------------------
+// unmqr_qdwh_full()
+template <typename scalar_t>
+void unmqr_qdwh_full(
+    Side side, Op op,
+    Matrix<scalar_t>& A, TriangularFactors<scalar_t>& T,
+    Matrix<scalar_t>& C,
+    Options const& opts = Options());
+
 } // namespace slate
 
 //-----------------------------------------

From d9abb8fd535334891e2b525b6f443fe07335d6f5 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 10 Nov 2022 08:38:31 -0500
Subject: [PATCH 07/49] fixes on i_end and on releasing tiles

---
 src/geqrf_qdwh_full.cc | 9 +++++----
 src/unmqr_qdwh_full.cc | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc
index 66752f7f8..9178b0d84 100644
--- a/src/geqrf_qdwh_full.cc
+++ b/src/geqrf_qdwh_full.cc
@@ -176,7 +176,7 @@ void geqrf_qdwh_full(slate::internal::TargetType<target>,
     #pragma omp master
     {
         for (int64_t k = 0; k < A_min_mtnt; ++k) {
-            int64_t i_end = std::min( A_mt-1, (A_mt-A_nt) + k);
+            int64_t i_end = A_min_mtnt + k;
 
             auto  A_panel =       A.sub(k, i_end, k, k);
             auto Tl_panel =  Tlocal.sub(k, i_end, k, k);
@@ -206,7 +206,7 @@ void geqrf_qdwh_full(slate::internal::TargetType<target>,
                 if (k < A_nt-1) {
 
                     // bcast V across row for trailing matrix update
-                    if (k < i_end+1) {
+                    if (k < i_end) {
                         BcastList bcast_list_V_first;
                         BcastList bcast_list_V;
                         for (int64_t i = k; i < i_end+1; ++i) {
@@ -311,7 +311,8 @@ void geqrf_qdwh_full(slate::internal::TargetType<target>,
                                      depend(inout:block[k+1])
                     {
                         int64_t k_la = k-lookahead;
-                        for (int64_t i = k_la; i < A_mt; ++i) {
+                        int64_t i_end_la = A_min_mtnt + k_la;
+                        for (int64_t i = k_la; i < i_end_la; ++i) {
                             if (A.tileIsLocal(i, k_la)) {
                                 A.tileUpdateOrigin(i, k_la);
 
@@ -325,7 +326,7 @@ void geqrf_qdwh_full(slate::internal::TargetType<target>,
                             }
                         }
 
-                        auto A_panel_k_la = A.sub(k_la, A_mt-1, k_la, k_la);
+                        auto A_panel_k_la = A.sub(k_la, i_end_la, k_la, k_la);
                         std::vector< int64_t > first_indices_k_la;
                         geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
                         if (first_indices.size() > 0) {
diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc
index 563ac796b..9c4987aa0 100644
--- a/src/unmqr_qdwh_full.cc
+++ b/src/unmqr_qdwh_full.cc
@@ -103,7 +103,7 @@ void unmqr_qdwh_full(
         SLATE_UNUSED(lastk);
         for (int64_t k = k_begin; k != k_end; k += k_step) {
 
-            int64_t i_end = std::min( A_mt-1, (A_mt-A_nt) + k);
+            int64_t i_end = A_min_mtnt + k;
 
             auto A_panel = A.sub(k, i_end, k, k);
 

From 8dcff0284d4b421c7a4fe30aee92a98eb8d0e625 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 10 Nov 2022 08:40:18 -0500
Subject: [PATCH 08/49] Set C to zero will make gemmA work

---
 src/normest.cc | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/src/normest.cc b/src/normest.cc
index e0ed58fad..79ff32350 100644
--- a/src/normest.cc
+++ b/src/normest.cc
@@ -43,7 +43,8 @@ normest(slate::internal::TargetType<target>,
     // Two norm estimation
     // First: let's compute the x vector such that
     // x_j = sum_i abs( A_{i,j} ), x here is global_sums.data
-    if (in_norm == Norm::One) {
+    // todo: should we add this to norm?
+    //if (in_norm == Norm::One_est) {
 
         using blas::min;
 
@@ -71,6 +72,7 @@ normest(slate::internal::TargetType<target>,
 
         std::vector<real_t> local_sums(n);
 
+        // todo: do we still need reserveDevice here?
         if (target == Target::Devices)
             A.reserveDeviceWorkspace();
 
@@ -92,7 +94,7 @@ normest(slate::internal::TargetType<target>,
         #pragma omp parallel
         #pragma omp master
         {
-            internal::norm<target>(in_norm, NormScope::Matrix, std::move(A), local_sums.data());
+            internal::norm<target>(slate::Norm::One, NormScope::Matrix, std::move(A), local_sums.data());
         }
 
         #pragma omp critical(slate_mpi)
@@ -120,7 +122,7 @@ normest(slate::internal::TargetType<target>,
         }
 
         // Third: start the while-loop X = X / ||X||
-        while ( (cnt < maxiter) && (fabs((e - e0)) > (tol * (e))) ) {
+        while ((cnt < maxiter) && (fabs((e - e0)) > (tol * (e)))) {
             e0 = e;
 
             // Scale X = X / ||X||
@@ -130,9 +132,18 @@ normest(slate::internal::TargetType<target>,
             // Compute Ax = A * sx
             gemm(one, A, X, zero, AX, opts);
 
+            // todo: still need to add the following
+            //if nnz(Sx) == 0
+            //    Sx = rand(size(Sx),class(Sx));
+            //end
+
             // Compute x = A' * A * x = A' * Ax
-            auto AT = conj_transpose(A);
+            auto AT = conjTranspose(A);
+            // todo: why this set is needed when using multiple mpi rank
+            // todo: send to Sebastien
+            set(zero, zero, XL);
             gemm(one, AT, AX, zero, XL, opts);
+            //gemmC(one, AT, AX, zero, XL, opts);
 
             // Compute ||X||, ||AX||
             normX  = norm(Norm::Fro, XL, opts);
@@ -142,17 +153,15 @@ normest(slate::internal::TargetType<target>,
             e = normX / normAX;
             cnt++;
         }
-        //if ( (cnt >= maxiter) && (fabs((e - e0)) > (tol * e)) ) {
-        //}
 
         A.clearWorkspace();
 
         return e;
 
-    }
-    else {
-        slate_error("invalid norm.");
-    }
+    //}
+    //else {
+    //    slate_error("invalid norm.");
+    //}
 }
 
 } // namespace specialization

From f379a015e6df8add5641ae9c8987a6ab6e9ebc57 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 10 Nov 2022 08:42:07 -0500
Subject: [PATCH 09/49] minor cleaning and add todo

---
 src/qdwh.cc | 54 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index 4296cc9f5..27e77547f 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -12,14 +12,16 @@ namespace slate {
 
 // specialization namespace differentiates, e.g.,
 // todo: is it ok?
-// internal::geqrf from internal::specialization::geqrf
+// internal::qdwh from impl::qdwh
+// todo: works only if m % nb = 0
+// todo: works only for square matrices
 namespace impl {
 
 //------------------------------------------------------------------------------
 /// Distributed parallel polar decomposition based on QDWH algorithm.
 /// Generic implementation for any target.
-// todo: is it ok?
-/// @ingroup geqrf_specialization
+// todo: add to docs/doxygen
+/// @ingroup qdwh_specialization
 ///
 template <Target target, typename scalar_t>
 void qdwh(
@@ -56,7 +58,6 @@ void qdwh(
     real_t tol3 = pow(tol1, 1./3.);
 
     real_t L2, sqd, dd, a1, a, b, c;
-    //double L2, sqd, dd, a1, a, b, c;
     real_t Li, Liconv;
     real_t conv = 100.;
     scalar_t zero = 0.0, one = 1.0, minusone = -1.0;
@@ -66,19 +67,25 @@ void qdwh(
     real_t normA;
     real_t norminvR;
 
-    int64_t mt2 = 2*mt - 1;
+    int64_t mt2 = mt + nt - 1;
 
     // allocate m2*n work space required for the qr-based iterations
     // this allocation can be avoided if we change the qr iterations to
     // work on two matrices on top of each other
 
-    // if doing QR([A;Id])
+    // allocation needed for doing QR([A;Id])
     slate::Matrix<scalar_t> W(m2, n, nb, nprow, npcol, MPI_COMM_WORLD);
     slate::Matrix<scalar_t> Q(m2, n, nb, nprow, npcol, MPI_COMM_WORLD);
+    slate::TriangularFactors<scalar_t> T;
 
     if (target == Target::Devices) {
         const int64_t batch_size_zero = 0; // use default batch size
         const int64_t num_queues = 3 + lookahead;
+        // todo: I have workspace here and in other call as geqrf_qdwh_full
+        // todo: in geqrf, it should use the allocated ones, it should not
+        // keep allocating
+        // todo: check traces to confirm
+        // or in printf in blaspp/device_malloc and here
         A.allocateBatchArrays(batch_size_zero, num_queues);
         A.reserveDeviceWorkspace();
         W.allocateBatchArrays(batch_size_zero, num_queues);
@@ -100,20 +107,18 @@ void qdwh(
     //W1.insertLocalTiles();
     //W2.insertLocalTiles();
 
-    slate::TriangularFactors<scalar_t> T1;
 
     // backup A in H
     copy(A, H);
 
     // two norm estimation (largest singular value of A)
-    // todo: fix norm_est the call it
-    real_t norm_est = 1.;
-    //real_t norm_est = normest(slate::Norm::One, A);
+    real_t norm_est = normest(slate::Norm::One, A);
     alpha = 1.0;
 
     // scale the original matrix A to form A0 of the iterative loop
     add(alpha/(scalar_t)norm_est, H, zero, A, opts);
 
+    // todo: look at Lapack impl
     // Calculate Li: reciprocal of condition number estimation
     // Either 1) use LU followed by gecon
     // Or     2) QR followed by trtri
@@ -123,11 +128,14 @@ void qdwh(
         // Estimate the condition number using QR
         // This Q factor can be used in the first QR-based iteration
         copy(A, W1);
-        slate::geqrf(W1, T1, opts);
+        // todo: this QR can be used in the first QR iteration
+        // todo: put bound of 1-norm est compared to 2-norm
+        slate::geqrf(W1, T, opts);
         auto R  = TriangularMatrix<scalar_t>(
             Uplo::Upper, slate::Diag::NonUnit, W1 );
         auto Rh = HermitianMatrix<scalar_t>(
             Uplo::Upper, W1 );
+        // todo: cheaper to do triangular solve than calling trtri
         trtri(R, opts);
         //tzset(scalar_t(0.0), L);
         norminvR = norm(slate::Norm::One, Rh);
@@ -204,15 +212,22 @@ void qdwh(
             //}
 
             // Factorize B = QR, and generate the associated Q
+            // todo: replace following add by copy and scale, but why scale only scale by a real numbers?
             add(alpha, A, zero, W1, opts);
-            //geqrf(W, T1, opts); // naive impl
-            geqrf_qdwh_full(W, T1, opts);
+            //copy(A, W1);
+            //scale(alpha, one, W1, opts);
+            //geqrf(W, T, opts); // naive impl
+            // todo: how to avoid allocating workspace for each iteration (3 QR)?
+            // todo: how to make it work for any matrix size and any nb?
+            // todo: check time for allocationn in geqrf
+            geqrf_qdwh_full(W, T, opts);
 
             set(zero, one, Q1);
             set(zero, zero, Q2);
 
-            //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts); //naive impl
-            unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts);
+            // todo: do I need unmqr?
+            //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); //naive impl
+            unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts);
 
             // A = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * A
             auto Q2T = conj_transpose(Q2);
@@ -276,10 +291,19 @@ void qdwh(
         }
     }
 
+    if (A.mpiRank() == 0) {
+        printf("%-7s  %-7s\n", "QR", "PO");
+        printf("%-7d  %-7d\n", itqr, itpo);
+    }
+
     // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H)
     copy(H, W1);
     auto AT = conj_transpose(A);
     gemm(one, AT, W1, zero, H, opts);
+
+    A.releaseWorkspace();
+    W.releaseWorkspace();
+    Q.releaseWorkspace();
 }
 
 } // namespace impl

From 63df04af4378209317ac0619dc35a8277d7094fc Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 12 Dec 2022 11:45:11 -0500
Subject: [PATCH 10/49] Allocate W with number of rows is roundup(number of A
 rows) + n to avoid having a clean up tile in the middle of the W matrix

---
 src/qdwh.cc | 105 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 59 insertions(+), 46 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index 27e77547f..aa79d9ea9 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -34,29 +34,17 @@ void qdwh(
 
     int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );
     int64_t max_panel_threads  = std::max(omp_get_max_threads()/2, 1);
-    max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads, max_panel_threads );
 
-    int64_t mt = A.mt();
-    int64_t nt = A.nt();
-
-    int64_t m    = A.m();
-    int64_t n    = A.n();
-    int64_t nb   = A.tileMb(0);
-    int64_t m2   = m + n;
+    max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads, max_panel_threads );
 
-    int nprow, npcol;
-    int myrow, mycol;
-    GridOrder order;
-    A.gridinfo(&order, &nprow, &npcol, &myrow, &mycol);
+    real_t eps  = 0.5 * std::numeric_limits<real_t>::epsilon();
+    real_t tol1 = 5. * eps;
+    real_t tol3 = pow(tol1, 1./3.);
 
     bool optqr = 1;
     int itconv, it;
     int itqr = 0, itpo = 0;
 
-    real_t eps  = 0.5 * std::numeric_limits<real_t>::epsilon();
-    real_t tol1 = 5. * eps;
-    real_t tol3 = pow(tol1, 1./3.);
-
     real_t L2, sqd, dd, a1, a, b, c;
     real_t Li, Liconv;
     real_t conv = 100.;
@@ -67,15 +55,30 @@ void qdwh(
     real_t normA;
     real_t norminvR;
 
-    int64_t mt2 = mt + nt - 1;
+    int64_t mt = A.mt();
+    int64_t nt = A.nt();
+
+    int64_t m    = A.m();
+    int64_t n    = A.n();
+    int64_t nb   = A.tileMb(0);
+
+    // The QR-based iterations requires QR[A; Id],
+    // W = [A; Id], where size of A is mxn, size of Id is nxn
+    // To avoid having a clean up tile in the middle of the W matrix,
+    // we round up the number of A rows (m),
+    // so that size(W) = m_roundup + n
+    int64_t m_roundup   = ceil( (m + nb)/ nb )*nb;
+    int64_t m_W   = m_roundup + n;
+    int64_t mt_W = mt + nt;
 
-    // allocate m2*n work space required for the qr-based iterations
-    // this allocation can be avoided if we change the qr iterations to
-    // work on two matrices on top of each other
+    int nprow, npcol;
+    int myrow, mycol;
+    GridOrder order;
+    A.gridinfo(&order, &nprow, &npcol, &myrow, &mycol);
 
-    // allocation needed for doing QR([A;Id])
-    slate::Matrix<scalar_t> W(m2, n, nb, nprow, npcol, MPI_COMM_WORLD);
-    slate::Matrix<scalar_t> Q(m2, n, nb, nprow, npcol, MPI_COMM_WORLD);
+    // allocate m_W*n work space required for the qr-based iterations QR([A;Id])
+    slate::Matrix<scalar_t> W(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD);
+    slate::Matrix<scalar_t> Q(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD);
     slate::TriangularFactors<scalar_t> T;
 
     if (target == Target::Devices) {
@@ -96,17 +99,27 @@ void qdwh(
 
     W.insertLocalTiles();
     Q.insertLocalTiles();
-    auto W1 = W.sub(0, mt-1, 0, nt-1); // First mxn block of W
-    auto W2 = W.sub(mt, mt2, 0, nt-1); // Second nxn block of W
-    auto Q1 = Q.sub(0, mt-1, 0, nt-1); // First mxn block of W
-    auto Q2 = Q.sub(mt, mt2, 0, nt-1); // Second nxn block of W
 
-    // if doing QR([A]) QR([A;Id])
-    //slate::Matrix<scalar_t> W1(m, n, nb, nprow, npcol, MPI_COMM_WORLD);
-    //slate::Matrix<scalar_t> W2(m, n, nb, nprow, npcol, MPI_COMM_WORLD);
-    //W1.insertLocalTiles();
-    //W2.insertLocalTiles();
+    // To compute QR[A; Id]
+    // First mt tiles of W contains A matrix in [0:m-1, 0:n-1]
+    // W1 have more rows than A, since we round up the number of rows of W1
+    // W10 have A matrix
+    auto W1 = W.sub(0, mt-1, 0, nt-1);
+    auto W10 = W1.slice(0, m-1, 0, n-1);
+
+    // Second nt tiles of W, contain the identity matrix
+    auto W2 = W.sub(mt, mt_W-1, 0, nt-1);
+
+    // W11 is the extra padded rows [m:m_round_up, 0:n-1]
+    auto W11 = W1.slice(m, m_roundup-1, 0, n-1);
+    set(zero, zero, W11);
+
+    auto Q1 = Q.sub(0, mt-1, 0, nt-1);
+    auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1);
 
+    auto Q10 = Q1.slice(0, m-1, 0, n-1);
+    auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1);
+    set(zero, zero, Q11);
 
     // backup A in H
     copy(A, H);
@@ -127,14 +140,14 @@ void qdwh(
     if (optqr) {
         // Estimate the condition number using QR
         // This Q factor can be used in the first QR-based iteration
-        copy(A, W1);
+        copy(A, W10);
         // todo: this QR can be used in the first QR iteration
         // todo: put bound of 1-norm est compared to 2-norm
         slate::geqrf(W1, T, opts);
         auto R  = TriangularMatrix<scalar_t>(
-            Uplo::Upper, slate::Diag::NonUnit, W1 );
+            Uplo::Upper, slate::Diag::NonUnit, W10 );
         auto Rh = HermitianMatrix<scalar_t>(
-            Uplo::Upper, W1 );
+            Uplo::Upper, W10 );
         // todo: cheaper to do triangular solve than calling trtri
         trtri(R, opts);
         //tzset(scalar_t(0.0), L);
@@ -213,7 +226,7 @@ void qdwh(
 
             // Factorize B = QR, and generate the associated Q
             // todo: replace following add by copy and scale, but why scale only scale by a real numbers?
-            add(alpha, A, zero, W1, opts);
+            add(alpha, A, zero, W10, opts);
             //copy(A, W1);
             //scale(alpha, one, W1, opts);
             //geqrf(W, T, opts); // naive impl
@@ -222,7 +235,7 @@ void qdwh(
             // todo: check time for allocationn in geqrf
             geqrf_qdwh_full(W, T, opts);
 
-            set(zero, one, Q1);
+            set(zero, one, Q10);
             set(zero, zero, Q2);
 
             // todo: do I need unmqr?
@@ -235,9 +248,9 @@ void qdwh(
             beta  = scalar_t( b / c );
             // Copy U into C to check the convergence of QDWH
             if (it >= itconv ) {
-                copy(A, W1);
+                copy(A, W10);
             }
-            gemm(alpha, Q1, Q2T, beta, A, opts);
+            gemm(alpha, Q10, Q2T, beta, A, opts);
 
             // Main flops used in this step/
             //flops_dgeqrf = FLOPS_DGEQRF( 2*m, n );
@@ -251,7 +264,7 @@ void qdwh(
         else {
             // Copy A into H to check the convergence of QDWH
             if (it >= itconv) {
-                copy(A, W1);
+                copy(A, W10);
             }
 
             // Make Q1 into an identity matrix
@@ -259,8 +272,8 @@ void qdwh(
 
             // Compute Q1 = c * A' * A + I
             ///////////////
-            copy(A, Q1);
-            auto AT = conj_transpose(Q1);
+            copy(A, Q10);
+            auto AT = conj_transpose(Q10);
             gemm(scalar_t(c), AT, A, one, W2, opts);
 
             // Solve R x = AT
@@ -286,8 +299,8 @@ void qdwh(
         // Compute the norm of the symmetric matrix U - B1
         conv = 10.0;
         if (it >= itconv) {
-            add(one, A, minusone, W1, opts);
-            conv = norm(slate::Norm::Fro, W1);
+            add(one, A, minusone, W10, opts);
+            conv = norm(slate::Norm::Fro, W10);
         }
     }
 
@@ -297,9 +310,9 @@ void qdwh(
     }
 
     // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H)
-    copy(H, W1);
+    copy(H, W10);
     auto AT = conj_transpose(A);
-    gemm(one, AT, W1, zero, H, opts);
+    gemm(one, AT, W10, zero, H, opts);
 
     A.releaseWorkspace();
     W.releaseWorkspace();

From 0278c6935d4261fadcf0478a9bf958515e5c4064 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Wed, 14 Dec 2022 12:12:36 -0500
Subject: [PATCH 11/49] Fixes to have qdwh work for matrix with m > n

---
 src/geqrf_qdwh_full.cc |  2 +-
 src/normest.cc         |  2 +-
 src/qdwh.cc            | 71 ++++++++++++++++++++++++++----------------
 src/unmqr_qdwh_full.cc |  2 +-
 4 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc
index 9178b0d84..f7bc10c74 100644
--- a/src/geqrf_qdwh_full.cc
+++ b/src/geqrf_qdwh_full.cc
@@ -176,7 +176,7 @@ void geqrf_qdwh_full(slate::internal::TargetType<target>,
     #pragma omp master
     {
         for (int64_t k = 0; k < A_min_mtnt; ++k) {
-            int64_t i_end = A_min_mtnt + k;
+            int64_t i_end = A_mt - A_nt + k;
 
             auto  A_panel =       A.sub(k, i_end, k, k);
             auto Tl_panel =  Tlocal.sub(k, i_end, k, k);
diff --git a/src/normest.cc b/src/normest.cc
index 79ff32350..c73632cb4 100644
--- a/src/normest.cc
+++ b/src/normest.cc
@@ -77,7 +77,7 @@ normest(slate::internal::TargetType<target>,
             A.reserveDeviceWorkspace();
 
         std::vector<scalar_t> global_sums(n);
-        std::vector<scalar_t> W1(n);
+        std::vector<scalar_t> W1(m);
         std::vector<scalar_t> W2(n);
 
         auto XL = slate::Matrix<scalar_t>::fromLAPACK(
diff --git a/src/qdwh.cc b/src/qdwh.cc
index aa79d9ea9..c377f8af1 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -58,8 +58,13 @@ void qdwh(
     int64_t mt = A.mt();
     int64_t nt = A.nt();
 
-    int64_t m    = A.m();
-    int64_t n    = A.n();
+    int64_t m  = A.m();
+    int64_t n  = A.n();
+
+    if (m < n) {
+        return;
+    }
+
     int64_t nb   = A.tileMb(0);
 
     // The QR-based iterations requires QR[A; Id],
@@ -67,7 +72,7 @@ void qdwh(
     // To avoid having a clean up tile in the middle of the W matrix,
     // we round up the number of A rows (m),
     // so that size(W) = m_roundup + n
-    int64_t m_roundup   = ceil( (m + nb)/ nb )*nb;
+    int64_t m_roundup =  ((m + nb - 1) / nb ) * nb;
     int64_t m_W   = m_roundup + n;
     int64_t mt_W = mt + nt;
 
@@ -81,6 +86,16 @@ void qdwh(
     slate::Matrix<scalar_t> Q(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD);
     slate::TriangularFactors<scalar_t> T;
 
+    // todo: anyway to avoid Acpy allocation?
+    slate::Matrix<scalar_t> Acpy;
+    if (m != n) {
+        Acpy = slate::Matrix<scalar_t>(m, n, nb, nprow, npcol, MPI_COMM_WORLD);
+        Acpy.insertLocalTiles();
+    }
+    else {
+        Acpy = H;
+    }
+
     if (target == Target::Devices) {
         const int64_t batch_size_zero = 0; // use default batch size
         const int64_t num_queues = 3 + lookahead;
@@ -95,6 +110,8 @@ void qdwh(
         W.reserveDeviceWorkspace();
         Q.allocateBatchArrays(batch_size_zero, num_queues);
         Q.reserveDeviceWorkspace();
+        Acpy.allocateBatchArrays(batch_size_zero, num_queues);
+        Acpy.reserveDeviceWorkspace();
     }
 
     W.insertLocalTiles();
@@ -110,26 +127,29 @@ void qdwh(
     // Second nt tiles of W, contain the identity matrix
     auto W2 = W.sub(mt, mt_W-1, 0, nt-1);
 
-    // W11 is the extra padded rows [m:m_round_up, 0:n-1]
-    auto W11 = W1.slice(m, m_roundup-1, 0, n-1);
-    set(zero, zero, W11);
-
     auto Q1 = Q.sub(0, mt-1, 0, nt-1);
+    auto Q10 = Q1.slice(0, m-1, 0, n-1);
+
     auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1);
 
-    auto Q10 = Q1.slice(0, m-1, 0, n-1);
-    auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1);
-    set(zero, zero, Q11);
 
-    // backup A in H
-    copy(A, H);
+    if (m_roundup != m) {
+        // W11 and Q11 is the extra padded rows [m:m_round_up, 0:n-1]
+        auto W11 = W1.slice(m, m_roundup-1, 0, n-1);
+        set(zero, zero, W11);
+        auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1);
+        set(zero, zero, Q11);
+    }
+
+    // backup A in Acpy to compute H
+    copy(A, Acpy);
 
     // two norm estimation (largest singular value of A)
     real_t norm_est = normest(slate::Norm::One, A);
     alpha = 1.0;
 
     // scale the original matrix A to form A0 of the iterative loop
-    add(alpha/(scalar_t)norm_est, H, zero, A, opts);
+    add(alpha/scalar_t(norm_est), Acpy, zero, A, opts);
 
     // todo: look at Lapack impl
     // Calculate Li: reciprocal of condition number estimation
@@ -144,13 +164,15 @@ void qdwh(
         // todo: this QR can be used in the first QR iteration
         // todo: put bound of 1-norm est compared to 2-norm
         slate::geqrf(W1, T, opts);
+        //auto R  = TrapezoidMatrix<scalar_t>(
+        //    Uplo::Upper, slate::Diag::NonUnit, W10 );
+        auto R1 =  W10.slice(0, n-1, 0, n-1);
         auto R  = TriangularMatrix<scalar_t>(
-            Uplo::Upper, slate::Diag::NonUnit, W10 );
+            Uplo::Upper, slate::Diag::NonUnit, R1 );
         auto Rh = HermitianMatrix<scalar_t>(
-            Uplo::Upper, W10 );
+            Uplo::Upper, R1 );
         // todo: cheaper to do triangular solve than calling trtri
         trtri(R, opts);
-        //tzset(scalar_t(0.0), L);
         norminvR = norm(slate::Norm::One, Rh);
         Li = (1.0 / norminvR) / normA;
         Li = norm_est / 1.1 * Li;
@@ -162,6 +184,7 @@ void qdwh(
         // Estimate the condition number using LU
     }
 
+    // Compute the number of iterations to converge
     itconv = 0; Liconv = Li;
     while (itconv == 0 || fabs(1-Liconv) > tol1) {
         // To find the minimum number of iterations to converge.
@@ -174,7 +197,6 @@ void qdwh(
         itconv++;
 
         L2  =  Liconv * Liconv;
-        //L2  = double (Liconv * Liconv);
         dd  = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 );
         sqd = sqrt(1.0 + dd);
         a1  = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0;
@@ -207,20 +229,19 @@ void qdwh(
         Li  = Li * real_t ((a + b * L2) / (1.0 + c * L2));
 
         if (real(c) > 100.) {
-            //int doqr = !optqr || it > 1;
+            //int64_t doqr = !optqr || it > 1;
 
             // Generate the matrix B = [ B1 ] = [ sqrt(c) * U ]
             //                         [ B2 ] = [ Id          ]
             alpha = scalar_t(sqrt(c));
             set(zero, one, W2);
 
-            // todo: have a customized splitted QR
             //if( doqr ) {
-            //    geadd(alpha, A, zero, W1);
+            //    geadd(alpha, A, zero, W10, opts);
             //    geqrf_qdwh_full(W, T1, opts);
             //}
             //else {
-            //    geadd(alpha, W1, zero, W1);
+            //    geadd(alpha, W1, zero, W10, opts);
             //    geqrf_qdwh_full2(W, T1, opts);
             //}
 
@@ -229,16 +250,14 @@ void qdwh(
             add(alpha, A, zero, W10, opts);
             //copy(A, W1);
             //scale(alpha, one, W1, opts);
+
             //geqrf(W, T, opts); // naive impl
             // todo: how to avoid allocating workspace for each iteration (3 QR)?
             // todo: how to make it work for any matrix size and any nb?
             // todo: check time for allocationn in geqrf
             geqrf_qdwh_full(W, T, opts);
 
-            set(zero, one, Q10);
-            set(zero, zero, Q2);
-
-            // todo: do I need unmqr?
+            set(zero, one, Q);
             //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); //naive impl
             unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts);
 
@@ -310,7 +329,7 @@ void qdwh(
     }
 
     // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H)
-    copy(H, W10);
+    copy(Acpy, W10);
     auto AT = conj_transpose(A);
     gemm(one, AT, W10, zero, H, opts);
 
diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc
index 9c4987aa0..3b1277103 100644
--- a/src/unmqr_qdwh_full.cc
+++ b/src/unmqr_qdwh_full.cc
@@ -103,7 +103,7 @@ void unmqr_qdwh_full(
         SLATE_UNUSED(lastk);
         for (int64_t k = k_begin; k != k_end; k += k_step) {
 
-            int64_t i_end = A_min_mtnt + k;
+            int64_t i_end = A_mt - A_nt + k;
 
             auto A_panel = A.sub(k, i_end, k, k);
 

From 458b8b953bb70792ea5fd1ed45a4f20acc3dd877 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Wed, 21 Dec 2022 13:53:08 -0500
Subject: [PATCH 12/49] add itqr and itpo to qdwh interace, and change H matrix
 to Hermitian

---
 include/slate/slate.hh |  3 +-
 src/qdwh.cc            | 77 ++++++++++++++++++++++++------------------
 test/test_qdwh.cc      | 36 ++++++++++++++++----
 3 files changed, 75 insertions(+), 41 deletions(-)

diff --git a/include/slate/slate.hh b/include/slate/slate.hh
index 22d249fc8..a249d8af9 100644
--- a/include/slate/slate.hh
+++ b/include/slate/slate.hh
@@ -1093,7 +1093,8 @@ void steqr2(
 template <typename scalar_t>
 void qdwh(
     Matrix<scalar_t>& A,
-    Matrix<scalar_t>& H, // this matrix will be hermition
+    HermitianMatrix<scalar_t>& H,
+    int& itqr, int& itpo,
     Options const& opts = Options());
 
 //-----------------------------------------
diff --git a/src/qdwh.cc b/src/qdwh.cc
index c377f8af1..15c4d4975 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -26,7 +26,8 @@ namespace impl {
 template <Target target, typename scalar_t>
 void qdwh(
           Matrix<scalar_t>& A,
-          Matrix<scalar_t>& H, // this matrix will be hermition
+          HermitianMatrix<scalar_t>& H, // this matrix will be hermition
+          int& itqr, int& itpo,
           Options const& opts)
 {
     using real_t = blas::real_type<scalar_t>;
@@ -43,12 +44,12 @@ void qdwh(
 
     bool optqr = 1;
     int itconv, it;
-    int itqr = 0, itpo = 0;
 
     real_t L2, sqd, dd, a1, a, b, c;
     real_t Li, Liconv;
     real_t conv = 100.;
-    scalar_t zero = 0.0, one = 1.0, minusone = -1.0;
+    scalar_t zero = 0.0, one = 1.0;
+    real_t rzero = 0.0;
 
     scalar_t alpha, beta;
 
@@ -88,13 +89,8 @@ void qdwh(
 
     // todo: anyway to avoid Acpy allocation?
     slate::Matrix<scalar_t> Acpy;
-    if (m != n) {
-        Acpy = slate::Matrix<scalar_t>(m, n, nb, nprow, npcol, MPI_COMM_WORLD);
-        Acpy.insertLocalTiles();
-    }
-    else {
-        Acpy = H;
-    }
+    Acpy = slate::Matrix<scalar_t>(m, n, nb, nprow, npcol, MPI_COMM_WORLD);
+    Acpy.insertLocalTiles(target);
 
     if (target == Target::Devices) {
         const int64_t batch_size_zero = 0; // use default batch size
@@ -104,18 +100,22 @@ void qdwh(
         // keep allocating
         // todo: check traces to confirm
         // or in printf in blaspp/device_malloc and here
-        A.allocateBatchArrays(batch_size_zero, num_queues);
-        A.reserveDeviceWorkspace();
-        W.allocateBatchArrays(batch_size_zero, num_queues);
-        W.reserveDeviceWorkspace();
-        Q.allocateBatchArrays(batch_size_zero, num_queues);
-        Q.reserveDeviceWorkspace();
-        Acpy.allocateBatchArrays(batch_size_zero, num_queues);
-        Acpy.reserveDeviceWorkspace();
+        // todo: is it enough to allocate for one matrix??
+        // todo: we don't need these here because each slate call will do it.
+        // todo: check if after any call it deallocate, for example after gemm, does it throw away the gpu allocation?
+        //A.allocateBatchArrays(batch_size_zero, num_queues);
+        //A.reserveDeviceWorkspace();
+        //W.allocateBatchArrays(batch_size_zero, num_queues);
+        //W.reserveDeviceWorkspace();
+        //Q.allocateBatchArrays(batch_size_zero, num_queues);
+        //Q.reserveDeviceWorkspace();
+        //Acpy.allocateBatchArrays(batch_size_zero, num_queues);
+        //Acpy.reserveDeviceWorkspace();
     }
 
-    W.insertLocalTiles();
-    Q.insertLocalTiles();
+    // todo: do this on GPUs
+    W.insertLocalTiles(target);
+    Q.insertLocalTiles(target);
 
     // To compute QR[A; Id]
     // First mt tiles of W contains A matrix in [0:m-1, 0:n-1]
@@ -151,7 +151,6 @@ void qdwh(
     // scale the original matrix A to form A0 of the iterative loop
     add(alpha/scalar_t(norm_est), Acpy, zero, A, opts);
 
-    // todo: look at Lapack impl
     // Calculate Li: reciprocal of condition number estimation
     // Either 1) use LU followed by gecon
     // Or     2) QR followed by trtri
@@ -161,7 +160,7 @@ void qdwh(
         // Estimate the condition number using QR
         // This Q factor can be used in the first QR-based iteration
         copy(A, W10);
-        // todo: this QR can be used in the first QR iteration
+        // todo: the QR used to estimate the condition number can be used in the first QR iteration
         // todo: put bound of 1-norm est compared to 2-norm
         slate::geqrf(W1, T, opts);
         //auto R  = TrapezoidMatrix<scalar_t>(
@@ -255,6 +254,8 @@ void qdwh(
             // todo: how to avoid allocating workspace for each iteration (3 QR)?
             // todo: how to make it work for any matrix size and any nb?
             // todo: check time for allocationn in geqrf
+            // W = [W1]
+            //     [I ]
             geqrf_qdwh_full(W, T, opts);
 
             set(zero, one, Q);
@@ -318,7 +319,7 @@ void qdwh(
         // Compute the norm of the symmetric matrix U - B1
         conv = 10.0;
         if (it >= itconv) {
-            add(one, A, minusone, W10, opts);
+            add(one, A, -one, W10, opts);
             conv = norm(slate::Norm::Fro, W10);
         }
     }
@@ -331,7 +332,12 @@ void qdwh(
     // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H)
     copy(Acpy, W10);
     auto AT = conj_transpose(A);
-    gemm(one, AT, W10, zero, H, opts);
+    // todo: try something like her2k to compute H
+    //her2k(one, A, W10, rzero, H, opts);
+    gemm(one, AT, W10, zero, Q10, opts);
+    auto AL = HermitianMatrix<scalar_t>(
+            Uplo::Lower, Q10 );
+    copy(AL, H);
 
     A.releaseWorkspace();
     W.releaseWorkspace();
@@ -390,7 +396,8 @@ void qdwh(
 template <typename scalar_t>
 void qdwh(
           Matrix<scalar_t>& A,
-          Matrix<scalar_t>& H,
+          HermitianMatrix<scalar_t>& H,
+          int& itqr, int& itpo,
           Options const& opts)
 {
     using internal::TargetType;
@@ -399,16 +406,16 @@ void qdwh(
     switch (target) {
         case Target::Host:
         case Target::HostTask:
-            impl::qdwh<Target::HostTask>( A, H, opts );
+            impl::qdwh<Target::HostTask>( A, H, itqr, itpo, opts );
             break;
         case Target::HostNest:
-            impl::qdwh<Target::HostNest>( A, H, opts );
+            impl::qdwh<Target::HostNest>( A, H, itqr, itpo, opts );
             break;
         case Target::HostBatch:
-            impl::qdwh<Target::HostBatch>( A, H, opts );
+            impl::qdwh<Target::HostBatch>( A, H, itqr, itpo, opts );
             break;
         case Target::Devices:
-            impl::qdwh<Target::Devices>( A, H, opts );
+            impl::qdwh<Target::Devices>( A, H, itqr, itpo, opts );
             break;
     }
     // todo: return value for errors?
@@ -419,25 +426,29 @@ void qdwh(
 template
 void qdwh<float>(
     Matrix<float>& A,
-    Matrix<float>& H,
+    HermitianMatrix<float>& H,
+    int& itqr, int& itpo,
     Options const& opts);
 
 template
 void qdwh<double>(
     Matrix<double>& A,
-    Matrix<double>& H,
+    HermitianMatrix<double>& H,
+    int& itqr, int& itpo,
     Options const& opts);
 
 template
 void qdwh< std::complex<float> >(
     Matrix< std::complex<float> >& A,
-    Matrix< std::complex<float> >& H,
+    HermitianMatrix< std::complex<float> >& H,
+    int& itqr, int& itpo,
     Options const& opts);
 
 template
 void qdwh< std::complex<double> >(
     Matrix< std::complex<double> >& A,
-    Matrix< std::complex<double> >& H,
+    HermitianMatrix< std::complex<double> >& H,
+    int& itqr, int& itpo,
     Options const& opts);
 
 } // namespace slate
diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index 1281a1e05..019e19417 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -88,23 +88,22 @@ void test_qdwh_work(Params& params, bool run)
     Id.insertLocalTiles();
 
     slate::Matrix<scalar_t> A;
-    slate::Matrix<scalar_t> H;
+    slate::HermitianMatrix<scalar_t> H;
     if (origin != slate::Origin::ScaLAPACK) {
         // Copy local ScaLAPACK data to GPU or CPU tiles.
         slate::Target origin_target = origin2target(origin);
         A = slate::Matrix<scalar_t>(m, n, nb, p, q, MPI_COMM_WORLD);
         A.insertLocalTiles(origin_target);
         //copy(&A_tst[0], descA_tst, A);
-        H = slate::Matrix<scalar_t>(n, n, nb, p, q, MPI_COMM_WORLD);
+        H = slate::HermitianMatrix<scalar_t>(slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD);
         H.insertLocalTiles(origin_target);
-        //copy(&H_tst[0], descH_tst, H);
     }
     else {
         // create SLATE matrices from the ScaLAPACK layouts
         A_data.resize( lldA * nlocA );
         H_data.resize( lldH * nlocH );
         A = slate::Matrix<scalar_t>::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
-        H = slate::Matrix<scalar_t>::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
+        H = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
     }
 
     real_t cond = 1 / std::numeric_limits<real_t>::epsilon();
@@ -126,12 +125,13 @@ void test_qdwh_work(Params& params, bool run)
     print_matrix("A", A, params);
 
     // todo: how to compute gflops?
-    double gflop = lapack::Gflop<scalar_t>::geqrf(m, n);
 
     if (! ref_only) {
         if (trace) slate::trace::Trace::on();
         else slate::trace::Trace::off();
 
+        int itqr = 0, itpo = 0;
+
         double time = barrier_get_wtime(MPI_COMM_WORLD);
 
         //==================================================
@@ -140,10 +140,31 @@ void test_qdwh_work(Params& params, bool run)
         // A = AH,
         // A will be written by the orthogonal polar factor
         // H is the symmetric positive semidefinite polar factor
-        slate::qdwh(A, H, opts);
+        slate::qdwh(A, H, itqr, itpo, opts);
 
         time = barrier_get_wtime(MPI_COMM_WORLD) - time;
 
+        // gflop include the flops to estimate the condition
+        // number, the flops for the qr_based iterations the
+        // flops for the po_based iterations and the flops
+        // to compute H (hermition polar factor).
+        // gflop = trcon + itqr * (flop_per_qr_based_iter) +
+        //         itpo * (flop_per_po_based_iter) +
+        //         flop_compute_H
+        double gflop_trcon = lapack::Gflop<scalar_t>::geqrf(m, n);
+
+        double gflop_itqr  = itqr * ( lapack::Gflop<scalar_t>::geqrf(m+n, n) +
+                                      lapack::Gflop<scalar_t>::unmqr(slate::Side::Left, m+n, n, n) +
+                                      lapack::Gflop<scalar_t>::gemm(m, n, n) );
+
+        double gflop_itpo  = itpo * ( lapack::Gflop<scalar_t>::gemm(m, n, n) +
+                                      lapack::Gflop<scalar_t>::potrf(m)      +
+                                      blas::Gflop<scalar_t>::trsm(slate::Side::Left, m, n) );
+
+        double gflop_compute_H = blas::Gflop<scalar_t>::her2k(n, m);
+
+        double gflop = gflop_trcon + gflop_itqr + gflop_itpo + gflop_compute_H;
+
         if (trace) slate::trace::Trace::finish();
 
         // compute and save timing/performance
@@ -178,7 +199,8 @@ void test_qdwh_work(Params& params, bool run)
         //
         //==================================================
         real_t normA = slate::norm(slate::Norm::Fro, Aref);
-        slate::gemm(one, A, H, -one, Aref, opts);
+        //slate::gemm(one, A, H, -one, Aref, opts);
+        slate::hemm(slate::Side::Right, one, H, A, -one, Aref, opts);
         real_t berr = slate::norm(slate::Norm::Fro, Aref);
         params.error() = berr / normA;
 

From 44e7c65d305ca7b278f299e99f9503be0f5199d2 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 5 Jan 2023 14:19:49 -0500
Subject: [PATCH 13/49] Save the hermitian polar factor into a general matrix
 for now. Add opts to slate calls

---
 src/qdwh.cc       | 42 +++++++++++++++++++-----------------------
 test/test_qdwh.cc | 29 +++++++++++++++++------------
 2 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index 15c4d4975..405e68a45 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -26,7 +26,7 @@ namespace impl {
 template <Target target, typename scalar_t>
 void qdwh(
           Matrix<scalar_t>& A,
-          HermitianMatrix<scalar_t>& H, // this matrix will be hermition
+          Matrix<scalar_t>& B,
           int& itqr, int& itpo,
           Options const& opts)
 {
@@ -136,9 +136,9 @@ void qdwh(
     if (m_roundup != m) {
         // W11 and Q11 is the extra padded rows [m:m_round_up, 0:n-1]
         auto W11 = W1.slice(m, m_roundup-1, 0, n-1);
-        set(zero, zero, W11);
+        set(zero, zero, W11, opts);
         auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1);
-        set(zero, zero, Q11);
+        set(zero, zero, Q11, opts);
     }
 
     // backup A in Acpy to compute H
@@ -155,7 +155,7 @@ void qdwh(
     // Either 1) use LU followed by gecon
     // Or     2) QR followed by trtri
     // If used the QR, use the Q factor in the first QR-based iteration
-    normA = norm(slate::Norm::One, A);
+    normA = norm(slate::Norm::One, A, opts);
     if (optqr) {
         // Estimate the condition number using QR
         // This Q factor can be used in the first QR-based iteration
@@ -172,7 +172,7 @@ void qdwh(
             Uplo::Upper, R1 );
         // todo: cheaper to do triangular solve than calling trtri
         trtri(R, opts);
-        norminvR = norm(slate::Norm::One, Rh);
+        norminvR = norm(slate::Norm::One, Rh, opts);
         Li = (1.0 / norminvR) / normA;
         Li = norm_est / 1.1 * Li;
         // *flops += FLOPS_DGEQRF( M, N )
@@ -233,7 +233,7 @@ void qdwh(
             // Generate the matrix B = [ B1 ] = [ sqrt(c) * U ]
             //                         [ B2 ] = [ Id          ]
             alpha = scalar_t(sqrt(c));
-            set(zero, one, W2);
+            set(zero, one, W2, opts);
 
             //if( doqr ) {
             //    geadd(alpha, A, zero, W10, opts);
@@ -258,7 +258,7 @@ void qdwh(
             //     [I ]
             geqrf_qdwh_full(W, T, opts);
 
-            set(zero, one, Q);
+            set(zero, one, Q, opts);
             //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); //naive impl
             unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts);
 
@@ -288,7 +288,7 @@ void qdwh(
             }
 
             // Make Q1 into an identity matrix
-            set(zero, one, W2);
+            set(zero, one, W2, opts);
 
             // Compute Q1 = c * A' * A + I
             ///////////////
@@ -320,7 +320,7 @@ void qdwh(
         conv = 10.0;
         if (it >= itconv) {
             add(one, A, -one, W10, opts);
-            conv = norm(slate::Norm::Fro, W10);
+            conv = norm(slate::Norm::Fro, W10, opts);
         }
     }
 
@@ -330,14 +330,10 @@ void qdwh(
     }
 
     // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H)
-    copy(Acpy, W10);
     auto AT = conj_transpose(A);
+    gemm(one, AT, Acpy, zero, B, opts);
     // todo: try something like her2k to compute H
     //her2k(one, A, W10, rzero, H, opts);
-    gemm(one, AT, W10, zero, Q10, opts);
-    auto AL = HermitianMatrix<scalar_t>(
-            Uplo::Lower, Q10 );
-    copy(AL, H);
 
     A.releaseWorkspace();
     W.releaseWorkspace();
@@ -396,7 +392,7 @@ void qdwh(
 template <typename scalar_t>
 void qdwh(
           Matrix<scalar_t>& A,
-          HermitianMatrix<scalar_t>& H,
+          Matrix<scalar_t>& B,
           int& itqr, int& itpo,
           Options const& opts)
 {
@@ -406,16 +402,16 @@ void qdwh(
     switch (target) {
         case Target::Host:
         case Target::HostTask:
-            impl::qdwh<Target::HostTask>( A, H, itqr, itpo, opts );
+            impl::qdwh<Target::HostTask>( A, B, itqr, itpo, opts );
             break;
         case Target::HostNest:
-            impl::qdwh<Target::HostNest>( A, H, itqr, itpo, opts );
+            impl::qdwh<Target::HostNest>( A, B, itqr, itpo, opts );
             break;
         case Target::HostBatch:
-            impl::qdwh<Target::HostBatch>( A, H, itqr, itpo, opts );
+            impl::qdwh<Target::HostBatch>( A, B, itqr, itpo, opts );
             break;
         case Target::Devices:
-            impl::qdwh<Target::Devices>( A, H, itqr, itpo, opts );
+            impl::qdwh<Target::Devices>( A, B, itqr, itpo, opts );
             break;
     }
     // todo: return value for errors?
@@ -426,28 +422,28 @@ void qdwh(
 template
 void qdwh<float>(
     Matrix<float>& A,
-    HermitianMatrix<float>& H,
+    Matrix<float>& B,
     int& itqr, int& itpo,
     Options const& opts);
 
 template
 void qdwh<double>(
     Matrix<double>& A,
-    HermitianMatrix<double>& H,
+    Matrix<double>& B,
     int& itqr, int& itpo,
     Options const& opts);
 
 template
 void qdwh< std::complex<float> >(
     Matrix< std::complex<float> >& A,
-    HermitianMatrix< std::complex<float> >& H,
+    Matrix< std::complex<float> >& B,
     int& itqr, int& itpo,
     Options const& opts);
 
 template
 void qdwh< std::complex<double> >(
     Matrix< std::complex<double> >& A,
-    HermitianMatrix< std::complex<double> >& H,
+    Matrix< std::complex<double> >& B,
     int& itqr, int& itpo,
     Options const& opts);
 
diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index 019e19417..b02a264a8 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -76,6 +76,7 @@ void test_qdwh_work(Params& params, bool run)
     int64_t nlocA = num_local_rows_cols(n, nb, mycol, q);
     int64_t lldA  = blas::max(1, mlocA); // local leading dimension of A
     std::vector<scalar_t> A_data;
+    std::vector<scalar_t> B_data;
 
     int64_t mlocH = num_local_rows_cols(n, nb, myrow, p);
     int64_t nlocH = num_local_rows_cols(n, nb, mycol, q);
@@ -88,22 +89,26 @@ void test_qdwh_work(Params& params, bool run)
     Id.insertLocalTiles();
 
     slate::Matrix<scalar_t> A;
-    slate::HermitianMatrix<scalar_t> H;
+    slate::Matrix<scalar_t> B;
+    //slate::HermitianMatrix<scalar_t> H;
     if (origin != slate::Origin::ScaLAPACK) {
         // Copy local ScaLAPACK data to GPU or CPU tiles.
         slate::Target origin_target = origin2target(origin);
         A = slate::Matrix<scalar_t>(m, n, nb, p, q, MPI_COMM_WORLD);
         A.insertLocalTiles(origin_target);
-        //copy(&A_tst[0], descA_tst, A);
-        H = slate::HermitianMatrix<scalar_t>(slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD);
-        H.insertLocalTiles(origin_target);
+        //H = slate::HermitianMatrix<scalar_t>(slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD);
+        //H.insertLocalTiles(origin_target);
+        B = slate::Matrix<scalar_t>(m, n, nb, p, q, MPI_COMM_WORLD);
+        B.insertLocalTiles(origin_target);
     }
     else {
         // create SLATE matrices from the ScaLAPACK layouts
         A_data.resize( lldA * nlocA );
-        H_data.resize( lldH * nlocH );
         A = slate::Matrix<scalar_t>::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
-        H = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
+        //H_data.resize( lldH * nlocH );
+        //H = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
+        B_data.resize( lldA * nlocA );
+        B = slate::Matrix<scalar_t>::fromScaLAPACK(m, n, &B_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
     }
 
     real_t cond = 1 / std::numeric_limits<real_t>::epsilon();
@@ -140,7 +145,7 @@ void test_qdwh_work(Params& params, bool run)
         // A = AH,
         // A will be written by the orthogonal polar factor
         // H is the symmetric positive semidefinite polar factor
-        slate::qdwh(A, H, itqr, itpo, opts);
+        slate::qdwh(A, B, itqr, itpo, opts);
 
         time = barrier_get_wtime(MPI_COMM_WORLD) - time;
 
@@ -172,7 +177,7 @@ void test_qdwh_work(Params& params, bool run)
         params.gflops() = gflop / time;
 
         print_matrix("U: orthogonal polar factor", A, params);
-        print_matrix("H: symmetric postive semi definite factor", H, params);
+        //print_matrix("H: symmetric postive semi definite factor", H, params);
     }
 
     if (check) {
@@ -198,10 +203,10 @@ void test_qdwh_work(Params& params, bool run)
         //          ||Aref||_f
         //
         //==================================================
-        real_t normA = slate::norm(slate::Norm::Fro, Aref);
-        //slate::gemm(one, A, H, -one, Aref, opts);
-        slate::hemm(slate::Side::Right, one, H, A, -one, Aref, opts);
-        real_t berr = slate::norm(slate::Norm::Fro, Aref);
+        real_t normA = slate::norm(slate::Norm::Fro, Aref, opts);
+        slate::gemm(one, A, B, -one, Aref, opts);
+        //slate::hemm(slate::Side::Right, one, H, A, -one, Aref, opts);
+        real_t berr = slate::norm(slate::Norm::Fro, Aref, opts);
         params.error() = berr / normA;
 
         real_t tol = params.tol() * 0.5 * std::numeric_limits<real_t>::epsilon();

From 1e5e0348d3704bd1c092c909b08b9afc1c3d40c7 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Fri, 6 Jan 2023 10:41:53 -0500
Subject: [PATCH 14/49] rename normest and cleaning it

---
 src/norm2est.cc | 211 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 211 insertions(+)
 create mode 100644 src/norm2est.cc

diff --git a/src/norm2est.cc b/src/norm2est.cc
new file mode 100644
index 000000000..b826f1f2a
--- /dev/null
+++ b/src/norm2est.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "internal/internal.hh"
+#include "internal/internal_util.hh"
+#include "slate/internal/mpi.hh"
+
+#include <list>
+#include <tuple>
+
+namespace slate {
+
+namespace impl {
+
+//------------------------------------------------------------------------------
+/// @internal
+/// Distributed parallel general matrix norm.
+/// Generic implementation for any target.
+/// @ingroup norm_specialization
+///
+template <Target target, typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+norm2est(
+    matrix_type A,
+    Options const& opts)
+{
+    using scalar_t = typename matrix_type::value_type;
+    using real_t = blas::real_type<scalar_t>;
+    using blas::min;
+
+    int p, q;
+    int myrow, mycol;
+    int izero = 0;
+    GridOrder order;
+    A.gridinfo(&order, &p, &q, &myrow, &mycol);
+
+    int64_t n = A.n();
+    int64_t m = A.m();
+    int64_t mb = A.tileMb(0);
+    int64_t nb = A.tileNb(0);
+
+    scalar_t one = 1.;
+    scalar_t zero  = 0.;
+    real_t alpha;
+
+    int64_t cnt = 0;
+    int64_t maxiter = min( 100, n );
+
+    real_t e  = 0.;
+    real_t e0  = 0.;
+    real_t normX = 0;
+    real_t normAX = 0;
+    real_t tol = 1.e-1;
+
+    int64_t mloc   = numberLocalRowOrCol(m, mb, myrow, izero, p);
+    int64_t nloc   = numberLocalRowOrCol(n, nb, myrow, izero, q);
+    int64_t lldA   = blas::max(1, mloc);
+    int64_t lldA_n = blas::max(1, nloc);
+
+    // todo: do we still need reserveDevice here?
+    if (target == Target::Devices)
+        A.reserveDeviceWorkspace();
+
+    std::vector<real_t> local_sums(n);
+    std::vector<scalar_t> global_sums(n);
+    std::vector<scalar_t> W1(lldA);
+    std::vector<scalar_t> W2(lldA_n);
+
+    auto XL = slate::Matrix<scalar_t>::fromScaLAPACK(
+            n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm());
+    auto AX = slate::Matrix<scalar_t>::fromScaLAPACK(
+            m, 1, &W1[0], lldA, mb, 1, p, q, A.mpiComm());
+    auto X = slate::Matrix<scalar_t>::fromScaLAPACK(
+            n, 1, &W2[0], lldA_n, nb, 1, p, q, A.mpiComm());
+
+    // Two norm estimation
+    // First: let's compute the x vector such that
+    // x_j = sum_i abs( A_{i,j} ), x here is global_sums.data
+    #pragma omp parallel
+    #pragma omp master
+    {
+        internal::norm<target>(slate::Norm::One, NormScope::Matrix, std::move(A), local_sums.data());
+    }
+
+    // global_sums.data() will have the sum of each column.
+    #pragma omp critical(slate_mpi)
+    {
+        trace::Block trace_block("MPI_Allreduce");
+        slate_mpi_call(
+                MPI_Allreduce(local_sums.data(), global_sums.data(),
+                    n, mpi_type<real_t>::value,
+                    MPI_SUM, A.mpiComm()));
+    }
+
+    // Second: compute the ||x||_Fro
+    e  = slate::norm(Norm::Fro, XL, opts);
+    if (e == 0.) {
+        return 0.;
+    }
+    else {
+        normX = e;
+    }
+
+    // Third: start the while-loop X = X / ||X||
+    while ((cnt < maxiter) && (fabs((e - e0)) > (tol * (e)))) {
+        e0 = e;
+
+        // Scale X = X / ||X||
+        alpha = 1.0/normX;
+        add(scalar_t(alpha), XL, zero, X, opts);
+
+        // Compute Ax = A * sx
+        gemm(one, A, X, zero, AX, opts);
+
+        // todo: still need to add the following
+        //if nnz(Sx) == 0
+        //    Sx = rand(size(Sx),class(Sx));
+        //end
+
+        // Compute x = A' * A * x = A' * Ax
+        auto AT = conjTranspose(A);
+        // todo: why this set is needed when using multiple mpi rank
+        // todo: send to Sebastien
+        set(zero, zero, XL);
+        gemm(one, AT, AX, zero, XL, opts);
+        //gemmC(one, AT, AX, zero, XL, opts);
+
+        // Compute ||X||, ||AX||
+        normX  = norm(Norm::Fro, XL, opts);
+        normAX = norm(Norm::Fro, AX, opts);
+
+        // Uodate e
+        e = normX / normAX;
+        cnt++;
+    }
+
+    A.clearWorkspace();
+
+    return e;
+
+}
+
+} // namespace impl
+
+//------------------------------------------------------------------------------
+/// Distributed parallel general matrix norm.
+///
+//------------------------------------------------------------------------------
+/// @tparam matrix_type
+///     Any SLATE matrix type: Matrix, SymmetricMatrix, HermitianMatrix,
+///     TriangularMatrix, etc.
+//------------------------------------------------------------------------------
+/// @param[in] A
+///     The matrix A.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup norm
+///
+template <typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+norm2est(matrix_type& A,
+        Options const& opts)
+{
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    switch (target) {
+        case Target::Host:
+        case Target::HostTask:
+        case Target::HostNest:
+        case Target::HostBatch:
+            return impl::norm2est<Target::HostTask>( A, opts );
+            break;
+        case Target::Devices:
+            return impl::norm2est<Target::Devices>( A, opts );
+            break;
+    }
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+float norm2est(
+    Matrix<float>& A,
+    Options const& opts);
+
+template
+double norm2est(
+    Matrix<double>& A,
+    Options const& opts);
+
+template
+float norm2est(
+    Matrix< std::complex<float> >& A,
+    Options const& opts);
+
+template
+double norm2est(
+    Matrix< std::complex<double> >& A,
+    Options const& opts);
+
+} // namespace slate

From 12c151cde340fb3a2108f63857caf75b504af369 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Fri, 6 Jan 2023 10:43:58 -0500
Subject: [PATCH 15/49] change header of norm2est and call it in qdwh

---
 include/slate/slate.hh | 5 ++---
 src/qdwh.cc            | 9 +++------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/include/slate/slate.hh b/include/slate/slate.hh
index a249d8af9..dbcae4781 100644
--- a/include/slate/slate.hh
+++ b/include/slate/slate.hh
@@ -1093,7 +1093,7 @@ void steqr2(
 template <typename scalar_t>
 void qdwh(
     Matrix<scalar_t>& A,
-    HermitianMatrix<scalar_t>& H,
+    Matrix<scalar_t>& B,
     int& itqr, int& itpo,
     Options const& opts = Options());
 
@@ -1101,8 +1101,7 @@ void qdwh(
 // normest()
 template <typename matrix_type>
 blas::real_type<typename matrix_type::value_type>
-normest(
-    Norm in_norm,
+norm2est(
     matrix_type &A,
     Options const& opts = Options());
 
diff --git a/src/qdwh.cc b/src/qdwh.cc
index 405e68a45..99a1e0980 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -145,8 +145,9 @@ void qdwh(
     copy(A, Acpy);
 
     // two norm estimation (largest singular value of A)
-    real_t norm_est = normest(slate::Norm::One, A);
+    real_t norm_est = norm2est( A, opts);
     alpha = 1.0;
+    norm_est = 1.;
 
     // scale the original matrix A to form A0 of the iterative loop
     add(alpha/scalar_t(norm_est), Acpy, zero, A, opts);
@@ -402,13 +403,9 @@ void qdwh(
     switch (target) {
         case Target::Host:
         case Target::HostTask:
-            impl::qdwh<Target::HostTask>( A, B, itqr, itpo, opts );
-            break;
         case Target::HostNest:
-            impl::qdwh<Target::HostNest>( A, B, itqr, itpo, opts );
-            break;
         case Target::HostBatch:
-            impl::qdwh<Target::HostBatch>( A, B, itqr, itpo, opts );
+            impl::qdwh<Target::HostTask>( A, B, itqr, itpo, opts );
             break;
         case Target::Devices:
             impl::qdwh<Target::Devices>( A, B, itqr, itpo, opts );

From c2e487033f2b30a5b06b2649711fcc12937980dd Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Fri, 6 Jan 2023 10:48:10 -0500
Subject: [PATCH 16/49] rename normest and minor change

---
 GNUmakefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/GNUmakefile b/GNUmakefile
index 7021560f5..94a581bf9 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -502,6 +502,7 @@ ifneq ($(only_unit),1)
         src/gemmA.cc \
         src/gemmC.cc \
         src/geqrf.cc \
+        src/geqrf_qdwh_full.cc \
         src/gesv.cc \
         src/gesvMixed.cc \
         src/gesv_nopiv.cc \
@@ -528,6 +529,7 @@ ifneq ($(only_unit),1)
         src/hetrf.cc \
         src/hetrs.cc \
         src/norm.cc \
+        src/norm2est.cc \
         src/pbsv.cc \
         src/pbtrf.cc \
         src/pbtrs.cc \
@@ -537,6 +539,7 @@ ifneq ($(only_unit),1)
         src/potri.cc \
         src/potrs.cc \
         src/print.cc \
+        src/qdwh.cc \
         src/scale.cc \
         src/scale_row_col.cc \
         src/set.cc \
@@ -558,13 +561,10 @@ ifneq ($(only_unit),1)
         src/unmqr.cc \
         src/unmtr_hb2st.cc \
         src/unmtr_he2hb.cc \
+        src/unmqr_qdwh_full.cc \
         src/work/work_trmm.cc \
         src/work/work_trsm.cc \
         src/work/work_trsmA.cc \
-		src/qdwh.cc \
-		src/normest.cc \
-		src/geqrf_qdwh_full.cc \
-		src/unmqr_qdwh_full.cc \
         # End. Add alphabetically.
 endif
 

From a0aa6052b3a9018cbc79a71de7dc6e8a968b8d9c Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Fri, 6 Jan 2023 10:52:27 -0500
Subject: [PATCH 17/49] delete

---
 src/normest.cc | 250 -------------------------------------------------
 1 file changed, 250 deletions(-)
 delete mode 100644 src/normest.cc

diff --git a/src/normest.cc b/src/normest.cc
deleted file mode 100644
index c73632cb4..000000000
--- a/src/normest.cc
+++ /dev/null
@@ -1,250 +0,0 @@
-// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
-// SPDX-License-Identifier: BSD-3-Clause
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
-
-#include "slate/slate.hh"
-#include "internal/internal.hh"
-#include "internal/internal_util.hh"
-#include "slate/internal/mpi.hh"
-
-#include <list>
-#include <tuple>
-
-namespace slate {
-
-// specialization namespace differentiates, e.g.,
-// internal::norm from internal::specialization::norm
-namespace internal {
-namespace specialization {
-
-//------------------------------------------------------------------------------
-/// @internal
-/// Distributed parallel general matrix norm.
-/// Generic implementation for any target.
-/// @ingroup norm_specialization
-///
-template <Target target, typename matrix_type>
-blas::real_type<typename matrix_type::value_type>
-normest(slate::internal::TargetType<target>,
-     Norm in_norm, matrix_type A,
-     Options const& opts)
-{
-    using scalar_t = typename matrix_type::value_type;
-    using real_t = blas::real_type<scalar_t>;
-
-    // Undo any transpose, which switches one <=> inf norms.
-    if (A.op() == Op::ConjTrans)
-        A = conj_transpose(A);
-    else if (A.op() == Op::Trans)
-        A = transpose(A);
-
-    //---------
-    // Two norm estimation
-    // First: let's compute the x vector such that
-    // x_j = sum_i abs( A_{i,j} ), x here is global_sums.data
-    // todo: should we add this to norm?
-    //if (in_norm == Norm::One_est) {
-
-        using blas::min;
-
-        int p, q;
-        int myrow, mycol;
-        GridOrder order;
-        A.gridinfo(&order, &p, &q, &myrow, &mycol);
-
-        int64_t n = A.n();
-        int64_t m = A.m();
-        int64_t nb = A.tileNb(0);
-
-        scalar_t one = 1.;
-        scalar_t zero  = 0.;
-        real_t alpha;
-
-        int64_t cnt = 0;
-        int64_t maxiter = min( 100, n );
-
-        real_t e  = 0.;
-        real_t e0  = 0.;
-        real_t normX = 0;
-        real_t normAX = 0;
-        real_t tol = 1.e-1;
-
-        std::vector<real_t> local_sums(n);
-
-        // todo: do we still need reserveDevice here?
-        if (target == Target::Devices)
-            A.reserveDeviceWorkspace();
-
-        std::vector<scalar_t> global_sums(n);
-        std::vector<scalar_t> W1(m);
-        std::vector<scalar_t> W2(n);
-
-        auto XL = slate::Matrix<scalar_t>::fromLAPACK(
-            n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm());
-        XL.insertLocalTiles();
-        auto AX = slate::Matrix<scalar_t>::fromLAPACK(
-            m, 1, &W1[0], m, nb, 1, p, q, A.mpiComm());
-        AX.insertLocalTiles();
-        auto X = slate::Matrix<scalar_t>::fromLAPACK(
-            n, 1, &W2[0], n, nb, 1, p, q, A.mpiComm());
-        X.insertLocalTiles();
-
-
-        #pragma omp parallel
-        #pragma omp master
-        {
-            internal::norm<target>(slate::Norm::One, NormScope::Matrix, std::move(A), local_sums.data());
-        }
-
-        #pragma omp critical(slate_mpi)
-        {
-            trace::Block trace_block("MPI_Allreduce");
-            slate_mpi_call(
-                MPI_Allreduce(local_sums.data(), global_sums.data(),
-                              n, mpi_type<real_t>::value,
-                              MPI_SUM, A.mpiComm()));
-        }
-
-        // global_sums.data() will have the sum of each column.
-        // First step is done
-
-        // Second: compute the ||x||_Fro
-        //e = lapack::lange(
-        //    Norm::Fro, 1, n,
-        //    global_sums.data(), 1);
-        e  = slate::norm(Norm::Fro, XL, opts);
-        if (e == 0.) {
-            return 0.;
-        }
-        else {
-            normX = e;
-        }
-
-        // Third: start the while-loop X = X / ||X||
-        while ((cnt < maxiter) && (fabs((e - e0)) > (tol * (e)))) {
-            e0 = e;
-
-            // Scale X = X / ||X||
-            alpha = 1.0/normX;
-            add(scalar_t(alpha), XL, zero, X, opts);
-
-            // Compute Ax = A * sx
-            gemm(one, A, X, zero, AX, opts);
-
-            // todo: still need to add the following
-            //if nnz(Sx) == 0
-            //    Sx = rand(size(Sx),class(Sx));
-            //end
-
-            // Compute x = A' * A * x = A' * Ax
-            auto AT = conjTranspose(A);
-            // todo: why this set is needed when using multiple mpi rank
-            // todo: send to Sebastien
-            set(zero, zero, XL);
-            gemm(one, AT, AX, zero, XL, opts);
-            //gemmC(one, AT, AX, zero, XL, opts);
-
-            // Compute ||X||, ||AX||
-            normX  = norm(Norm::Fro, XL, opts);
-            normAX = norm(Norm::Fro, AX, opts);
-
-            // Uodate e
-            e = normX / normAX;
-            cnt++;
-        }
-
-        A.clearWorkspace();
-
-        return e;
-
-    //}
-    //else {
-    //    slate_error("invalid norm.");
-    //}
-}
-
-} // namespace specialization
-} // namespace internal
-
-//------------------------------------------------------------------------------
-/// Version with target as template parameter.
-/// @ingroup norm_specialization
-///
-template <Target target, typename matrix_type>
-blas::real_type<typename matrix_type::value_type>
-normest(Norm norm, matrix_type& A,
-     const std::map<Option, Value>& opts)
-{
-    return internal::specialization::normest(internal::TargetType<target>(),
-                                          norm, A, opts);
-}
-
-//------------------------------------------------------------------------------
-/// Distributed parallel general matrix norm.
-///
-//------------------------------------------------------------------------------
-/// @tparam matrix_type
-///     Any SLATE matrix type: Matrix, SymmetricMatrix, HermitianMatrix,
-///     TriangularMatrix, etc.
-//------------------------------------------------------------------------------
-/// @param[in] in_norm
-///     Norm to compute:
-///     - Norm::Second: second norm estimation of the matrix$
-///
-/// @param[in] A
-///     The matrix A.
-///
-/// @param[in] opts
-///     Additional options, as map of name = value pairs. Possible options:
-///     - Option::Target:
-///       Implementation to target. Possible values:
-///       - HostTask:  OpenMP tasks on CPU host [default].
-///       - HostNest:  nested OpenMP parallel for loop on CPU host.
-///       - Devices:   batched BLAS on GPU device.
-///
-/// @ingroup norm
-///
-template <typename matrix_type>
-blas::real_type<typename matrix_type::value_type>
-normest(Norm in_norm, matrix_type& A,
-     const std::map<Option, Value>& opts)
-{
-    Target target = get_option( opts, Option::Target, Target::HostTask );
-
-    switch (target) {
-        case Target::Host:
-        case Target::HostTask:
-            return normest<Target::HostTask>(in_norm, A, opts);
-            break;
-        case Target::HostBatch:
-        case Target::HostNest:
-        case Target::Devices:
-            break;
-    }
-    throw std::exception();  // todo: invalid target
-}
-
-//------------------------------------------------------------------------------
-// Explicit instantiations.
-template
-float normest(
-    Norm in_norm, Matrix<float>& A,
-    Options const& opts);
-
-template
-double normest(
-    Norm in_norm, Matrix<double>& A,
-    Options const& opts);
-
-template
-float normest(
-    Norm in_norm, Matrix< std::complex<float> >& A,
-    Options const& opts);
-
-template
-double normest(
-    Norm in_norm, Matrix< std::complex<double> >& A,
-    Options const& opts);
-
-} // namespace slate

From c8516a901f59d92bf2d0882adfc9357c3bb9bda8 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Fri, 6 Jan 2023 11:17:37 -0500
Subject: [PATCH 18/49] minor cleaning

---
 src/qdwh.cc | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index 99a1e0980..d58badd96 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -10,17 +10,11 @@
 
 namespace slate {
 
-// specialization namespace differentiates, e.g.,
-// todo: is it ok?
-// internal::qdwh from impl::qdwh
-// todo: works only if m % nb = 0
-// todo: works only for square matrices
 namespace impl {
 
 //------------------------------------------------------------------------------
 /// Distributed parallel polar decomposition based on QDWH algorithm.
 /// Generic implementation for any target.
-// todo: add to docs/doxygen
 /// @ingroup qdwh_specialization
 ///
 template <Target target, typename scalar_t>
@@ -92,9 +86,9 @@ void qdwh(
     Acpy = slate::Matrix<scalar_t>(m, n, nb, nprow, npcol, MPI_COMM_WORLD);
     Acpy.insertLocalTiles(target);
 
-    if (target == Target::Devices) {
-        const int64_t batch_size_zero = 0; // use default batch size
-        const int64_t num_queues = 3 + lookahead;
+    //if (target == Target::Devices) {
+        //const int64_t batch_size_zero = 0; // use default batch size
+        //const int64_t num_queues = 3 + lookahead;
         // todo: I have workspace here and in other call as geqrf_qdwh_full
         // todo: in geqrf, it should use the allocated ones, it should not
         // keep allocating
@@ -111,7 +105,7 @@ void qdwh(
         //Q.reserveDeviceWorkspace();
         //Acpy.allocateBatchArrays(batch_size_zero, num_queues);
         //Acpy.reserveDeviceWorkspace();
-    }
+    //}
 
     // todo: do this on GPUs
     W.insertLocalTiles(target);
@@ -136,9 +130,7 @@ void qdwh(
     if (m_roundup != m) {
         // W11 and Q11 is the extra padded rows [m:m_round_up, 0:n-1]
         auto W11 = W1.slice(m, m_roundup-1, 0, n-1);
-        set(zero, zero, W11, opts);
         auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1);
-        set(zero, zero, Q11, opts);
     }
 
     // backup A in Acpy to compute H
@@ -252,11 +244,6 @@ void qdwh(
             //scale(alpha, one, W1, opts);
 
             //geqrf(W, T, opts); // naive impl
-            // todo: how to avoid allocating workspace for each iteration (3 QR)?
-            // todo: how to make it work for any matrix size and any nb?
-            // todo: check time for allocationn in geqrf
-            // W = [W1]
-            //     [I ]
             geqrf_qdwh_full(W, T, opts);
 
             set(zero, one, Q, opts);
@@ -292,7 +279,6 @@ void qdwh(
             set(zero, one, W2, opts);
 
             // Compute Q1 = c * A' * A + I
-            ///////////////
             copy(A, Q10);
             auto AT = conj_transpose(Q10);
             gemm(scalar_t(c), AT, A, one, W2, opts);
@@ -317,7 +303,7 @@ void qdwh(
             //facto = 1;
         }
 
-        // Compute the norm of the symmetric matrix U - B1
+        // Check if it converge, compute the norm of matrix U - B1
         conv = 10.0;
         if (it >= itconv) {
             add(one, A, -one, W10, opts);
@@ -335,6 +321,9 @@ void qdwh(
     gemm(one, AT, Acpy, zero, B, opts);
     // todo: try something like her2k to compute H
     //her2k(one, A, W10, rzero, H, opts);
+    //auto AL = HermitianMatrix<scalar_t>(
+    //        slate::Uplo::Lower, B );
+    //slate::copy(AL, H, opts);
 
     A.releaseWorkspace();
     W.releaseWorkspace();

From c21a9933786f7d80057f4ed24b5a0c543c135601 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Fri, 6 Jan 2023 11:26:02 -0500
Subject: [PATCH 19/49] pass opts to calls

---
 src/qdwh.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index d58badd96..1dbfc47a9 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -134,7 +134,7 @@ void qdwh(
     }
 
     // backup A in Acpy to compute H
-    copy(A, Acpy);
+    slate::copy(A, Acpy, opts);
 
     // two norm estimation (largest singular value of A)
     real_t norm_est = norm2est( A, opts);
@@ -152,7 +152,7 @@ void qdwh(
     if (optqr) {
         // Estimate the condition number using QR
         // This Q factor can be used in the first QR-based iteration
-        copy(A, W10);
+        slate::copy(A, W10, opts);
         // todo: the QR used to estimate the condition number can be used in the first QR iteration
         // todo: put bound of 1-norm est compared to 2-norm
         slate::geqrf(W1, T, opts);
@@ -256,7 +256,7 @@ void qdwh(
             beta  = scalar_t( b / c );
             // Copy U into C to check the convergence of QDWH
             if (it >= itconv ) {
-                copy(A, W10);
+                slate::copy(A, W10, opts);
             }
             gemm(alpha, Q10, Q2T, beta, A, opts);
 
@@ -272,14 +272,14 @@ void qdwh(
         else {
             // Copy A into H to check the convergence of QDWH
             if (it >= itconv) {
-                copy(A, W10);
+                slate::copy(A, W10, opts);
             }
 
             // Make Q1 into an identity matrix
             set(zero, one, W2, opts);
 
             // Compute Q1 = c * A' * A + I
-            copy(A, Q10);
+            slate::copy(A, Q10, opts);
             auto AT = conj_transpose(Q10);
             gemm(scalar_t(c), AT, A, one, W2, opts);
 

From 7e9c00e346e57c6191ae091d939e24d392d85665 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 9 Jan 2023 14:01:35 -0500
Subject: [PATCH 20/49] Use H innstead of new allocation in case of square
 matrix. Rename B to H

---
 src/qdwh.cc       | 69 +++++++++++++++++------------------------------
 test/test_qdwh.cc | 22 ++++++++-------
 2 files changed, 37 insertions(+), 54 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index 1dbfc47a9..796cc05cd 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -20,7 +20,7 @@ namespace impl {
 template <Target target, typename scalar_t>
 void qdwh(
           Matrix<scalar_t>& A,
-          Matrix<scalar_t>& B,
+          Matrix<scalar_t>& H,
           int& itqr, int& itpo,
           Options const& opts)
 {
@@ -81,31 +81,16 @@ void qdwh(
     slate::Matrix<scalar_t> Q(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD);
     slate::TriangularFactors<scalar_t> T;
 
-    // todo: anyway to avoid Acpy allocation?
     slate::Matrix<scalar_t> Acpy;
-    Acpy = slate::Matrix<scalar_t>(m, n, nb, nprow, npcol, MPI_COMM_WORLD);
-    Acpy.insertLocalTiles(target);
-
-    //if (target == Target::Devices) {
-        //const int64_t batch_size_zero = 0; // use default batch size
-        //const int64_t num_queues = 3 + lookahead;
-        // todo: I have workspace here and in other call as geqrf_qdwh_full
-        // todo: in geqrf, it should use the allocated ones, it should not
-        // keep allocating
-        // todo: check traces to confirm
-        // or in printf in blaspp/device_malloc and here
-        // todo: is it enough to allocate for one matrix??
-        // todo: we don't need these here because each slate call will do it.
-        // todo: check if after any call it deallocate, for example after gemm, does it throw away the gpu allocation?
-        //A.allocateBatchArrays(batch_size_zero, num_queues);
-        //A.reserveDeviceWorkspace();
-        //W.allocateBatchArrays(batch_size_zero, num_queues);
-        //W.reserveDeviceWorkspace();
-        //Q.allocateBatchArrays(batch_size_zero, num_queues);
-        //Q.reserveDeviceWorkspace();
-        //Acpy.allocateBatchArrays(batch_size_zero, num_queues);
-        //Acpy.reserveDeviceWorkspace();
-    //}
+    if (m == n) {
+        // if A is square, we can use H to save a copy of A
+        // this is will not work if H is stored as Hermitian matrix
+        Acpy = H;
+    }
+    else {
+        Acpy = slate::Matrix<scalar_t>(m, n, nb, nprow, npcol, MPI_COMM_WORLD);
+        Acpy.insertLocalTiles(target);
+    }
 
     // todo: do this on GPUs
     W.insertLocalTiles(target);
@@ -126,13 +111,6 @@ void qdwh(
 
     auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1);
 
-
-    if (m_roundup != m) {
-        // W11 and Q11 is the extra padded rows [m:m_round_up, 0:n-1]
-        auto W11 = W1.slice(m, m_roundup-1, 0, n-1);
-        auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1);
-    }
-
     // backup A in Acpy to compute H
     slate::copy(A, Acpy, opts);
 
@@ -145,9 +123,7 @@ void qdwh(
     add(alpha/scalar_t(norm_est), Acpy, zero, A, opts);
 
     // Calculate Li: reciprocal of condition number estimation
-    // Either 1) use LU followed by gecon
-    // Or     2) QR followed by trtri
-    // If used the QR, use the Q factor in the first QR-based iteration
+    // todo: use the Q factor in the first QR-based iteration
     normA = norm(slate::Norm::One, A, opts);
     if (optqr) {
         // Estimate the condition number using QR
@@ -223,8 +199,8 @@ void qdwh(
         if (real(c) > 100.) {
             //int64_t doqr = !optqr || it > 1;
 
-            // Generate the matrix B = [ B1 ] = [ sqrt(c) * U ]
-            //                         [ B2 ] = [ Id          ]
+            // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ]
+            //                         [ W2 ] = [ Id          ]
             alpha = scalar_t(sqrt(c));
             set(zero, one, W2, opts);
 
@@ -237,7 +213,7 @@ void qdwh(
             //    geqrf_qdwh_full2(W, T1, opts);
             //}
 
-            // Factorize B = QR, and generate the associated Q
+            // Factorize W = QR, and generate the associated Q
             // todo: replace following add by copy and scale, but why scale only scale by a real numbers?
             add(alpha, A, zero, W10, opts);
             //copy(A, W1);
@@ -303,7 +279,7 @@ void qdwh(
             //facto = 1;
         }
 
-        // Check if it converge, compute the norm of matrix U - B1
+        // Check if it converge, compute the norm of matrix A - W1
         conv = 10.0;
         if (it >= itconv) {
             add(one, A, -one, W10, opts);
@@ -318,16 +294,19 @@ void qdwh(
 
     // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H)
     auto AT = conj_transpose(A);
-    gemm(one, AT, Acpy, zero, B, opts);
+    if (m == n) {
+        auto W10_n = W1.slice(0, n-1, 0, n-1);
+        gemm(one, AT, Acpy, zero, W10_n, opts);
+        slate::copy(W10_n, H, opts);
+    }
+    else {
+        gemm(one, AT, Acpy, zero, H, opts);
+    }
     // todo: try something like her2k to compute H
     //her2k(one, A, W10, rzero, H, opts);
     //auto AL = HermitianMatrix<scalar_t>(
-    //        slate::Uplo::Lower, B );
+    //        slate::Uplo::Lower, H );
     //slate::copy(AL, H, opts);
-
-    A.releaseWorkspace();
-    W.releaseWorkspace();
-    Q.releaseWorkspace();
 }
 
 } // namespace impl
diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index b02a264a8..339ec5ff4 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -13,6 +13,7 @@
 #include "scalapack_wrappers.hh"
 #include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
+#include "nvToolsExt.h"
 
 #include <cmath>
 #include <cstdio>
@@ -76,11 +77,10 @@ void test_qdwh_work(Params& params, bool run)
     int64_t nlocA = num_local_rows_cols(n, nb, mycol, q);
     int64_t lldA  = blas::max(1, mlocA); // local leading dimension of A
     std::vector<scalar_t> A_data;
-    std::vector<scalar_t> B_data;
 
     int64_t mlocH = num_local_rows_cols(n, nb, myrow, p);
     int64_t nlocH = num_local_rows_cols(n, nb, mycol, q);
-    int64_t lldH  = blas::max(1, mlocH); // local leading dimension of A
+    int64_t lldH  = blas::max(1, mlocH); // local leading dimension of H
     std::vector<scalar_t> H_data;
 
     // matrix QR, for checking result
@@ -89,7 +89,7 @@ void test_qdwh_work(Params& params, bool run)
     Id.insertLocalTiles();
 
     slate::Matrix<scalar_t> A;
-    slate::Matrix<scalar_t> B;
+    slate::Matrix<scalar_t> H;
     //slate::HermitianMatrix<scalar_t> H;
     if (origin != slate::Origin::ScaLAPACK) {
         // Copy local ScaLAPACK data to GPU or CPU tiles.
@@ -98,8 +98,8 @@ void test_qdwh_work(Params& params, bool run)
         A.insertLocalTiles(origin_target);
         //H = slate::HermitianMatrix<scalar_t>(slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD);
         //H.insertLocalTiles(origin_target);
-        B = slate::Matrix<scalar_t>(m, n, nb, p, q, MPI_COMM_WORLD);
-        B.insertLocalTiles(origin_target);
+        H = slate::Matrix<scalar_t>(n, n, nb, p, q, MPI_COMM_WORLD);
+        H.insertLocalTiles(origin_target);
     }
     else {
         // create SLATE matrices from the ScaLAPACK layouts
@@ -107,8 +107,8 @@ void test_qdwh_work(Params& params, bool run)
         A = slate::Matrix<scalar_t>::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
         //H_data.resize( lldH * nlocH );
         //H = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
-        B_data.resize( lldA * nlocA );
-        B = slate::Matrix<scalar_t>::fromScaLAPACK(m, n, &B_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
+        H_data.resize( lldH * nlocA );
+        H = slate::Matrix<scalar_t>::fromScaLAPACK(n, n, &H_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
     }
 
     real_t cond = 1 / std::numeric_limits<real_t>::epsilon();
@@ -145,7 +145,9 @@ void test_qdwh_work(Params& params, bool run)
         // A = AH,
         // A will be written by the orthogonal polar factor
         // H is the symmetric positive semidefinite polar factor
-        slate::qdwh(A, B, itqr, itpo, opts);
+        nvtxRangePush("qdwh");
+        slate::qdwh(A, H, itqr, itpo, opts);
+        nvtxRangePop();
 
         time = barrier_get_wtime(MPI_COMM_WORLD) - time;
 
@@ -204,7 +206,9 @@ void test_qdwh_work(Params& params, bool run)
         //
         //==================================================
         real_t normA = slate::norm(slate::Norm::Fro, Aref, opts);
-        slate::gemm(one, A, B, -one, Aref, opts);
+        slate::gemm(one, A, H, -one, Aref, opts);
+        //auto H = HermitianMatrix<scalar_t>(
+        //        slate::Uplo::Lower, B );
         //slate::hemm(slate::Side::Right, one, H, A, -one, Aref, opts);
         real_t berr = slate::norm(slate::Norm::Fro, Aref, opts);
         params.error() = berr / normA;

From 04f06d933e5f41f9d160a5736f2fca3487f19180 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Tue, 10 Jan 2023 09:51:19 -0500
Subject: [PATCH 21/49] Added exit with error if #itr > 100 in qdwh and minor
 cleaning

---
 src/qdwh.cc       | 76 +++++++++++++++++++----------------------------
 test/test_qdwh.cc |  4 +--
 2 files changed, 33 insertions(+), 47 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index 796cc05cd..0e22a3a4a 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -27,12 +27,11 @@ void qdwh(
     using real_t = blas::real_type<scalar_t>;
     using blas::real;
 
-    int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );
     int64_t max_panel_threads  = std::max(omp_get_max_threads()/2, 1);
 
     max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads, max_panel_threads );
 
-    real_t eps  = 0.5 * std::numeric_limits<real_t>::epsilon();
+    real_t eps  = std::numeric_limits<real_t>::epsilon();
     real_t tol1 = 5. * eps;
     real_t tol3 = pow(tol1, 1./3.);
 
@@ -43,7 +42,6 @@ void qdwh(
     real_t Li, Liconv;
     real_t conv = 100.;
     scalar_t zero = 0.0, one = 1.0;
-    real_t rzero = 0.0;
 
     scalar_t alpha, beta;
 
@@ -77,8 +75,8 @@ void qdwh(
     A.gridinfo(&order, &nprow, &npcol, &myrow, &mycol);
 
     // allocate m_W*n work space required for the qr-based iterations QR([A;Id])
-    slate::Matrix<scalar_t> W(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD);
-    slate::Matrix<scalar_t> Q(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD);
+    slate::Matrix<scalar_t> W(m_W, n, nb, nprow, npcol, A.mpiComm());
+    slate::Matrix<scalar_t> Q(m_W, n, nb, nprow, npcol, A.mpiComm());
     slate::TriangularFactors<scalar_t> T;
 
     slate::Matrix<scalar_t> Acpy;
@@ -88,7 +86,7 @@ void qdwh(
         Acpy = H;
     }
     else {
-        Acpy = slate::Matrix<scalar_t>(m, n, nb, nprow, npcol, MPI_COMM_WORLD);
+        Acpy = slate::Matrix<scalar_t>(m, n, nb, nprow, npcol, A.mpiComm());
         Acpy.insertLocalTiles(target);
     }
 
@@ -102,13 +100,14 @@ void qdwh(
     // W10 have A matrix
     auto W1 = W.sub(0, mt-1, 0, nt-1);
     auto W10 = W1.slice(0, m-1, 0, n-1);
-
-    // Second nt tiles of W, contain the identity matrix
-    auto W2 = W.sub(mt, mt_W-1, 0, nt-1);
+    auto W11 = W.sub(mt-1, mt-1, 0, nt-1);
 
     auto Q1 = Q.sub(0, mt-1, 0, nt-1);
     auto Q10 = Q1.slice(0, m-1, 0, n-1);
 
+    // Second nt tiles of W, contain the identity matrix
+    auto W2 = W.sub(mt, mt_W-1, 0, nt-1);
+
     auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1);
 
     // backup A in Acpy to compute H
@@ -117,7 +116,6 @@ void qdwh(
     // two norm estimation (largest singular value of A)
     real_t norm_est = norm2est( A, opts);
     alpha = 1.0;
-    norm_est = 1.;
 
     // scale the original matrix A to form A0 of the iterative loop
     add(alpha/scalar_t(norm_est), Acpy, zero, A, opts);
@@ -139,7 +137,6 @@ void qdwh(
             Uplo::Upper, slate::Diag::NonUnit, R1 );
         auto Rh = HermitianMatrix<scalar_t>(
             Uplo::Upper, R1 );
-        // todo: cheaper to do triangular solve than calling trtri
         trtri(R, opts);
         norminvR = norm(slate::Norm::One, Rh, opts);
         Li = (1.0 / norminvR) / normA;
@@ -159,8 +156,7 @@ void qdwh(
         // itconv = number of iterations needed until |Li - 1| < tol1
         // This should have converged in less than 50 iterations
         if (itconv > 100) {
-            exit(-1);
-            break;
+            slate_error("Failed to converge.");
         }
         itconv++;
 
@@ -179,8 +175,7 @@ void qdwh(
     while (conv > tol3 || it < itconv) {
         // This should have converged in less than 50 iterations
         if (it > 100) {
-            exit(-1);
-            break;
+            slate_error("Failed to converge.");
         }
         it++;
 
@@ -201,62 +196,53 @@ void qdwh(
 
             // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ]
             //                         [ W2 ] = [ Id          ]
-            alpha = scalar_t(sqrt(c));
-            set(zero, one, W2, opts);
-
-            //if( doqr ) {
-            //    geadd(alpha, A, zero, W10, opts);
-            //    geqrf_qdwh_full(W, T1, opts);
-            //}
-            //else {
-            //    geadd(alpha, W1, zero, W10, opts);
-            //    geqrf_qdwh_full2(W, T1, opts);
-            //}
 
             // Factorize W = QR, and generate the associated Q
             // todo: replace following add by copy and scale, but why scale only scale by a real numbers?
+            alpha = scalar_t(sqrt(c));
+            set(zero, zero, W11, opts);
             add(alpha, A, zero, W10, opts);
-            //copy(A, W1);
-            //scale(alpha, one, W1, opts);
+            set(zero, one, W2, opts);
 
-            //geqrf(W, T, opts); // naive impl
+            // Call a variant of geqrf annd unmqr that avoid the tiles of zeros,
+            // which are the tiles below the diagonal of W2 (identity)
             geqrf_qdwh_full(W, T, opts);
 
             set(zero, one, Q, opts);
-            //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); //naive impl
             unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts);
 
             // A = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * A
             auto Q2T = conj_transpose(Q2);
             alpha = scalar_t( (a-b/c)/sqrt(c) );
             beta  = scalar_t( b / c );
-            // Copy U into C to check the convergence of QDWH
+            // Save a copy of A to check the convergence of QDWH
             if (it >= itconv ) {
                 slate::copy(A, W10, opts);
             }
             gemm(alpha, Q10, Q2T, beta, A, opts);
 
+            itqr += 1;
+
+            //facto = 0;
             // Main flops used in this step/
             //flops_dgeqrf = FLOPS_DGEQRF( 2*m, n );
             //flops_dorgqr = FLOPS_DORGQR( 2*m, n, n );
             //flops_dgemm  = FLOPS_DGEMM( m, n, n );
             //flops += flops_dgeqrf + flops_dorgqr + flops_dgemm;
-
-            itqr += 1;
-            //facto = 0;
         }
         else {
-            // Copy A into H to check the convergence of QDWH
+            // A = (b/c) * A + (a-b/c) * ( linsolve( C, linsolve( C, A') ) )';
+            // Save a copy of A to check the convergence of QDWH
             if (it >= itconv) {
                 slate::copy(A, W10, opts);
             }
 
-            // Make Q1 into an identity matrix
-            set(zero, one, W2, opts);
-
-            // Compute Q1 = c * A' * A + I
+            // Save a copy of A
             slate::copy(A, Q10, opts);
+
+            // Compute c * A' * A + I
             auto AT = conj_transpose(Q10);
+            set(zero, one, W2, opts);
             gemm(scalar_t(c), AT, A, one, W2, opts);
 
             // Solve R x = AT
@@ -264,19 +250,19 @@ void qdwh(
                     slate::Uplo::Upper, W2 );
             posv(R, AT, opts);
 
-            // Update A =  (a-b/c) * Q1T' + (b/c) * A
+            // Update A = (b/c) * A + (a-b/c) * ( linsolve( C, linsolve( C, A') ) )';
             alpha = (a-b/c); beta = (b/c);
-            AT = conj_transpose(AT);
-            add(scalar_t(alpha), AT, scalar_t(beta), A, opts);
+            auto AT2 = conj_transpose(AT);
+            add(alpha, AT2, beta, A, opts);
+
+            itpo += 1;
 
+            //facto = 1;
             // Main flops used in this step
             //flops_dgemm  = FLOPS_DGEMM( m, n, n );
             //flops_dpotrf = FLOPS_DPOTRF( m );
             //flops_dtrsm  = FLOPS_DTRSM( 'L', m, n );
             //flops += flops_dgemm + flops_dpotrf + 2. * flops_dtrsm;
-
-            itpo += 1;
-            //facto = 1;
         }
 
         // Check if it converge, compute the norm of matrix A - W1
diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index 339ec5ff4..ff584d506 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -107,8 +107,8 @@ void test_qdwh_work(Params& params, bool run)
         A = slate::Matrix<scalar_t>::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
         //H_data.resize( lldH * nlocH );
         //H = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
-        H_data.resize( lldH * nlocA );
-        H = slate::Matrix<scalar_t>::fromScaLAPACK(n, n, &H_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
+        H_data.resize( lldH * nlocH );
+        H = slate::Matrix<scalar_t>::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
     }
 
     real_t cond = 1 / std::numeric_limits<real_t>::epsilon();

From ce9ef1c5f19a576ee90bd1d3dc24fd8f410b11bf Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Tue, 17 Jan 2023 10:42:38 -0500
Subject: [PATCH 22/49] Used real_t to cast numbers inn while-loop and minor
 cleaning

---
 src/qdwh.cc | 126 +++++++++++++++++++++++++++-------------------------
 1 file changed, 66 insertions(+), 60 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index 0e22a3a4a..5fe0bd4fb 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -24,6 +24,14 @@ void qdwh(
           int& itqr, int& itpo,
           Options const& opts)
 {
+    // todo:
+    // optimizations:
+    // 1. reuse Q from first geqrf
+    // 2. avoid rounding m if m is not divisible by nb, because then geqrf/unmqr
+    // have extra zero rows
+    // in norm2est, when allocate vector mx1, need to allocate on row-processes only
+    // limitations:
+    // 1. works only when have square grid
     using real_t = blas::real_type<scalar_t>;
     using blas::real;
 
@@ -31,23 +39,23 @@ void qdwh(
 
     max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads, max_panel_threads );
 
-    real_t eps  = std::numeric_limits<real_t>::epsilon();
-    real_t tol1 = 5. * eps;
-    real_t tol3 = pow(tol1, 1./3.);
-
-    bool optqr = 1;
     int itconv, it;
 
     real_t L2, sqd, dd, a1, a, b, c;
     real_t Li, Liconv;
     real_t conv = 100.;
-    scalar_t zero = 0.0, one = 1.0;
+    real_t rone = 1.;
 
+    scalar_t zero = 0.0, one = 1.0;
     scalar_t alpha, beta;
 
     real_t normA;
     real_t norminvR;
 
+    real_t eps  = std::numeric_limits<real_t>::epsilon();
+    real_t tol1 = 5. * eps;
+    real_t tol3 = pow(tol1, rone/real_t(3.));
+
     int64_t mt = A.mt();
     int64_t nt = A.nt();
 
@@ -90,24 +98,22 @@ void qdwh(
         Acpy.insertLocalTiles(target);
     }
 
-    // todo: do this on GPUs
     W.insertLocalTiles(target);
     Q.insertLocalTiles(target);
 
-    // To compute QR[A; Id]
+    // To compute QR[A; Id], allocate W and Q of size m_Wxn
     // First mt tiles of W contains A matrix in [0:m-1, 0:n-1]
     // W1 have more rows than A, since we round up the number of rows of W1
-    // W10 have A matrix
     auto W1 = W.sub(0, mt-1, 0, nt-1);
-    auto W10 = W1.slice(0, m-1, 0, n-1);
-    auto W11 = W.sub(mt-1, mt-1, 0, nt-1);
-
     auto Q1 = Q.sub(0, mt-1, 0, nt-1);
+    // W10 have A matrix
+    auto W10 = W1.slice(0, m-1, 0, n-1);
     auto Q10 = Q1.slice(0, m-1, 0, n-1);
+    // W11 is the last tile which has more rows in case we round up m
+    auto W11 = W.sub(mt-1, mt-1, 0, nt-1);
 
     // Second nt tiles of W, contain the identity matrix
     auto W2 = W.sub(mt, mt_W-1, 0, nt-1);
-
     auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1);
 
     // backup A in Acpy to compute H
@@ -117,37 +123,33 @@ void qdwh(
     real_t norm_est = norm2est( A, opts);
     alpha = 1.0;
 
-    // scale the original matrix A to form A0 of the iterative loop
+    // scale the original matrix A to form A0 which is the initial matrix
+    // for the first iteration
     add(alpha/scalar_t(norm_est), Acpy, zero, A, opts);
 
-    // Calculate Li: reciprocal of condition number estimation
+    // Estimate the condition number
     // todo: use the Q factor in the first QR-based iteration
-    normA = norm(slate::Norm::One, A, opts);
-    if (optqr) {
-        // Estimate the condition number using QR
-        // This Q factor can be used in the first QR-based iteration
-        slate::copy(A, W10, opts);
-        // todo: the QR used to estimate the condition number can be used in the first QR iteration
-        // todo: put bound of 1-norm est compared to 2-norm
-        slate::geqrf(W1, T, opts);
-        //auto R  = TrapezoidMatrix<scalar_t>(
-        //    Uplo::Upper, slate::Diag::NonUnit, W10 );
-        auto R1 =  W10.slice(0, n-1, 0, n-1);
-        auto R  = TriangularMatrix<scalar_t>(
+
+    // Estimate the condition number using QR
+    // This Q factor can be used in the first QR-based iteration
+    slate::copy(A, W10, opts);
+    // todo: put bound of 1-norm est compared to 2-norm
+    slate::geqrf(W10, T, opts);
+    //auto R  = TrapezoidMatrix<scalar_t>(
+    //    Uplo::Upper, slate::Diag::NonUnit, W10 );
+    auto R1 =  W10.slice(0, n-1, 0, n-1);
+    auto R  = TriangularMatrix<scalar_t>(
             Uplo::Upper, slate::Diag::NonUnit, R1 );
-        auto Rh = HermitianMatrix<scalar_t>(
-            Uplo::Upper, R1 );
-        trtri(R, opts);
-        norminvR = norm(slate::Norm::One, Rh, opts);
-        Li = (1.0 / norminvR) / normA;
-        Li = norm_est / 1.1 * Li;
-        // *flops += FLOPS_DGEQRF( M, N )
-        //       + FLOPS_DTRTRI( N );
-    }
-    else {
-        // todo
-        // Estimate the condition number using LU
-    }
+    // For now, compute the exact condition number using trtri
+    normA = norm(slate::Norm::One, A, opts);
+    trtri(R, opts);
+    norminvR = norm(slate::Norm::One, R, opts);
+    Li = (1.0 / norminvR) / normA;
+    // todo: will call trcondest instead
+    //slate::trcondest(slate::Norm::One, R, &Li, opts);
+    Li = norm_est / 1.1 * Li;
+    // *flops += FLOPS_DGEQRF( M, N )
+    //       + FLOPS_DTRTRI( N );
 
     // Compute the number of iterations to converge
     itconv = 0; Liconv = Li;
@@ -161,14 +163,15 @@ void qdwh(
         itconv++;
 
         L2  =  Liconv * Liconv;
-        dd  = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 );
-        sqd = sqrt(1.0 + dd);
-        a1  = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0;
+        dd  = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
+        sqd = sqrt( rone + dd );
+        a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd +
+              real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
         a   = real(a1);
-        b   = (a - 1.0) * (a - 1.0) / 4.0;
-        c   = a + b - 1.0;
+        b   = ( a - rone ) * ( a - rone ) / real_t(4.0);
+        c   = a + b - rone;
         // Update Liconv
-        Liconv  = Liconv * real_t((a + b * L2) / (1.0 + c * L2));
+        Liconv  = Liconv * ( a + b * L2 ) / ( rone + c * L2 );
     }
 
     it = 0;
@@ -180,31 +183,28 @@ void qdwh(
         it++;
 
         // Compute parameters L,a,b,c
-        //L2  = double (Li * Li);
         L2  = Li * Li;
-        dd  = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 );
-        sqd = sqrt(1.0 + dd);
-        a1  = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0;
+        dd  = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
+        sqd = sqrt( rone + dd );
+        a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd +
+              real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
         a   = real(a1);
-        b   = (a - 1.0) * (a - 1.0) / 4.0;
-        c   = a + b - 1.0;
+        b   = ( a - rone ) * ( a - rone ) / real_t(4.0);
+        c   = a + b - rone;
         // Update Li
-        Li  = Li * real_t ((a + b * L2) / (1.0 + c * L2));
-
-        if (real(c) > 100.) {
-            //int64_t doqr = !optqr || it > 1;
+        Li  = Li * ( a + b * L2 ) / ( rone + c * L2 );
 
+        if (c > real_t(100.)) {
             // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ]
             //                         [ W2 ] = [ Id          ]
 
             // Factorize W = QR, and generate the associated Q
-            // todo: replace following add by copy and scale, but why scale only scale by a real numbers?
             alpha = scalar_t(sqrt(c));
             set(zero, zero, W11, opts);
             add(alpha, A, zero, W10, opts);
             set(zero, one, W2, opts);
 
-            // Call a variant of geqrf annd unmqr that avoid the tiles of zeros,
+            // Call a variant of geqrf and unmqr that avoid the tiles of zeros,
             // which are the tiles below the diagonal of W2 (identity)
             geqrf_qdwh_full(W, T, opts);
 
@@ -246,9 +246,9 @@ void qdwh(
             gemm(scalar_t(c), AT, A, one, W2, opts);
 
             // Solve R x = AT
-            auto R = slate::HermitianMatrix<scalar_t>(
+            auto U = slate::HermitianMatrix<scalar_t>(
                     slate::Uplo::Upper, W2 );
-            posv(R, AT, opts);
+            posv(U, AT, opts);
 
             // Update A = (b/c) * A + (a-b/c) * ( linsolve( C, linsolve( C, A') ) )';
             alpha = (a-b/c); beta = (b/c);
@@ -278,6 +278,12 @@ void qdwh(
         printf("%-7d  %-7d\n", itqr, itpo);
     }
 
+    if (itqr + itpo > 6) {
+        printf("\n Converged after %d. Check what is the issue,"
+                   "because QDWH needs <= 6 iterations \n",
+                   itqr+itpo);
+    }
+
     // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H)
     auto AT = conj_transpose(A);
     if (m == n) {

From 7c688311876918644e922dfe3dbb726255f6a214 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 23 Jan 2023 09:46:38 -0500
Subject: [PATCH 23/49] Fixed W2 allocation in norm2est to have it work with
 grid pxq where p != q. Changed dd computing in qdwh for-now and minor changes

---
 src/norm2est.cc |  7 +++++--
 src/qdwh.cc     | 21 +++++++++++----------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/norm2est.cc b/src/norm2est.cc
index b826f1f2a..9dabe4d23 100644
--- a/src/norm2est.cc
+++ b/src/norm2est.cc
@@ -56,7 +56,7 @@ norm2est(
     real_t tol = 1.e-1;
 
     int64_t mloc   = numberLocalRowOrCol(m, mb, myrow, izero, p);
-    int64_t nloc   = numberLocalRowOrCol(n, nb, myrow, izero, q);
+    int64_t nloc   = numberLocalRowOrCol(n, nb, mycol, izero, q);
     int64_t lldA   = blas::max(1, mloc);
     int64_t lldA_n = blas::max(1, nloc);
 
@@ -67,7 +67,10 @@ norm2est(
     std::vector<real_t> local_sums(n);
     std::vector<scalar_t> global_sums(n);
     std::vector<scalar_t> W1(lldA);
-    std::vector<scalar_t> W2(lldA_n);
+    std::vector<scalar_t> W2(n);
+    // todo: W2 should needs only lldA_n, but it does not work with grid pxq,
+    // where p!=q
+    //std::vector<scalar_t> W2(lldA_n);
 
     auto XL = slate::Matrix<scalar_t>::fromScaLAPACK(
             n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm());
diff --git a/src/qdwh.cc b/src/qdwh.cc
index 5fe0bd4fb..ef37dd5a5 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -30,8 +30,6 @@ void qdwh(
     // 2. avoid rounding m if m is not divisible by nb, because then geqrf/unmqr
     // have extra zero rows
     // in norm2est, when allocate vector mx1, need to allocate on row-processes only
-    // limitations:
-    // 1. works only when have square grid
     using real_t = blas::real_type<scalar_t>;
     using blas::real;
 
@@ -44,12 +42,11 @@ void qdwh(
     real_t L2, sqd, dd, a1, a, b, c;
     real_t Li, Liconv;
     real_t conv = 100.;
-    real_t rone = 1.;
+    real_t rzero = 0.0, rone = 1.0;
 
     scalar_t zero = 0.0, one = 1.0;
     scalar_t alpha, beta;
 
-    real_t normA;
     real_t norminvR;
 
     real_t eps  = std::numeric_limits<real_t>::epsilon();
@@ -141,13 +138,13 @@ void qdwh(
     auto R  = TriangularMatrix<scalar_t>(
             Uplo::Upper, slate::Diag::NonUnit, R1 );
     // For now, compute the exact condition number using trtri
-    normA = norm(slate::Norm::One, A, opts);
     trtri(R, opts);
     norminvR = norm(slate::Norm::One, R, opts);
-    Li = (1.0 / norminvR) / normA;
+    real_t smin_est = 1./norminvR;
+    Li = smin_est / sqrt(n);
+
     // todo: will call trcondest instead
     //slate::trcondest(slate::Norm::One, R, &Li, opts);
-    Li = norm_est / 1.1 * Li;
     // *flops += FLOPS_DGEQRF( M, N )
     //       + FLOPS_DTRTRI( N );
 
@@ -184,7 +181,11 @@ void qdwh(
 
         // Compute parameters L,a,b,c
         L2  = Li * Li;
-        dd  = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
+        //dd  = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
+        if (abs( L2 - one) <= 10*eps)
+            dd = rzero;
+        else
+            dd  = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
         sqd = sqrt( rone + dd );
         a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd +
               real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
@@ -194,7 +195,7 @@ void qdwh(
         // Update Li
         Li  = Li * ( a + b * L2 ) / ( rone + c * L2 );
 
-        if (c > real_t(100.)) {
+        if (c > 100.) {
             // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ]
             //                         [ W2 ] = [ Id          ]
 
@@ -280,7 +281,7 @@ void qdwh(
 
     if (itqr + itpo > 6) {
         printf("\n Converged after %d. Check what is the issue,"
-                   "because QDWH needs <= 6 iterations \n",
+                   "because QDWH needs <= 6 iterations. \n",
                    itqr+itpo);
     }
 

From ca1db42966677535e15e22ffaca3ff36da09b94c Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 23 Jan 2023 10:50:53 -0500
Subject: [PATCH 24/49] minor on printf

---
 src/qdwh.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index ef37dd5a5..80f4376bf 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -280,8 +280,8 @@ void qdwh(
     }
 
     if (itqr + itpo > 6) {
-        printf("\n Converged after %d. Check what is the issue,"
-                   "because QDWH needs <= 6 iterations. \n",
+        printf("\nConverged after %d. Check what is the issue, "
+                   "because QDWH needs <= 6 iterations.\n",
                    itqr+itpo);
     }
 

From 3afc4cbe59d83d9da32c8750a1583fa3c883c67f Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 30 Jan 2023 10:24:17 -0500
Subject: [PATCH 25/49] throw err if invalid target

---
 src/norm2est.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/norm2est.cc b/src/norm2est.cc
index 9dabe4d23..8c2a822af 100644
--- a/src/norm2est.cc
+++ b/src/norm2est.cc
@@ -187,6 +187,7 @@ norm2est(matrix_type& A,
             return impl::norm2est<Target::Devices>( A, opts );
             break;
     }
+    throw std::exception();  // todo: invalid target
 }
 
 //------------------------------------------------------------------------------

From 5b19e0a7be7b892ef3f76a5d44b0cbe152b71b21 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 30 Jan 2023 12:49:30 -0500
Subject: [PATCH 26/49] minor on doc

---
 src/norm2est.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/norm2est.cc b/src/norm2est.cc
index 8c2a822af..2cb10bb1f 100644
--- a/src/norm2est.cc
+++ b/src/norm2est.cc
@@ -17,7 +17,7 @@ namespace impl {
 
 //------------------------------------------------------------------------------
 /// @internal
-/// Distributed parallel general matrix norm.
+/// Distributed parallel general matrix 2-norm estimate.
 /// Generic implementation for any target.
 /// @ingroup norm_specialization
 ///

From 952b11bdc17d211b31376512f1295fd239fc73d4 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 30 Jan 2023 12:57:43 -0500
Subject: [PATCH 27/49] remove nvToolsExt.h

---
 test/test_qdwh.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index ff584d506..b09055284 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -13,7 +13,6 @@
 #include "scalapack_wrappers.hh"
 #include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
-#include "nvToolsExt.h"
 
 #include <cmath>
 #include <cstdio>
@@ -145,9 +144,7 @@ void test_qdwh_work(Params& params, bool run)
         // A = AH,
         // A will be written by the orthogonal polar factor
         // H is the symmetric positive semidefinite polar factor
-        nvtxRangePush("qdwh");
         slate::qdwh(A, H, itqr, itpo, opts);
-        nvtxRangePop();
 
         time = barrier_get_wtime(MPI_COMM_WORLD) - time;
 

From 398703d19a2b3a6051600f9f5d5838494ea52686 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 30 Jan 2023 13:26:25 -0500
Subject: [PATCH 28/49] minor

---
 test/test_qdwh.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index b09055284..002720f3e 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -176,7 +176,7 @@ void test_qdwh_work(Params& params, bool run)
         params.gflops() = gflop / time;
 
         print_matrix("U: orthogonal polar factor", A, params);
-        //print_matrix("H: symmetric postive semi definite factor", H, params);
+        print_matrix("H: symmetric postive semi definite factor", H, params);
     }
 
     if (check) {

From d546c84114a12eaf4e215ed6393cfdd30ed118ff Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 30 Jan 2023 13:48:49 -0500
Subject: [PATCH 29/49] remove redifned SLATE_HAVE_SCALAPACK

---
 test/test_qdwh.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index 002720f3e..ceea1f1ac 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -18,7 +18,6 @@
 #include <cstdio>
 #include <cstdlib>
 #include <utility>
-#define SLATE_HAVE_SCALAPACK
 //------------------------------------------------------------------------------
 template <typename scalar_t>
 void test_qdwh_work(Params& params, bool run)

From 6d48ec2ff0ae9070c54622594f113ce7516c7a3d Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 30 Jan 2023 22:27:56 -0500
Subject: [PATCH 30/49] Added qdwh to run_tests. Trying to fix dd computing in
 qdwh and used trcondest. Reduced the condition number of the tested matrix

---
 src/qdwh.cc       | 26 ++++++++++++++------------
 test/run_tests.py |  7 +++++++
 test/test_qdwh.cc |  2 +-
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index 80f4376bf..b47d34ec1 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -47,7 +47,7 @@ void qdwh(
     scalar_t zero = 0.0, one = 1.0;
     scalar_t alpha, beta;
 
-    real_t norminvR;
+    real_t normR;
 
     real_t eps  = std::numeric_limits<real_t>::epsilon();
     real_t tol1 = 5. * eps;
@@ -137,16 +137,13 @@ void qdwh(
     auto R1 =  W10.slice(0, n-1, 0, n-1);
     auto R  = TriangularMatrix<scalar_t>(
             Uplo::Upper, slate::Diag::NonUnit, R1 );
-    // For now, compute the exact condition number using trtri
-    trtri(R, opts);
-    norminvR = norm(slate::Norm::One, R, opts);
-    real_t smin_est = 1./norminvR;
+    normR = norm(slate::Norm::One, R, opts);
+    slate::trcondest(slate::Norm::One, R, &Li, opts);
+    real_t smin_est = normR*Li;
     Li = smin_est / sqrt(n);
 
-    // todo: will call trcondest instead
     //slate::trcondest(slate::Norm::One, R, &Li, opts);
-    // *flops += FLOPS_DGEQRF( M, N )
-    //       + FLOPS_DTRTRI( N );
+    // *flops += FLOPS_DGEQRF( M, N );
 
     // Compute the number of iterations to converge
     itconv = 0; Liconv = Li;
@@ -160,7 +157,10 @@ void qdwh(
         itconv++;
 
         L2  =  Liconv * Liconv;
-        dd  = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
+        if (abs( L2 - rone) <= 10*eps)
+            dd = rzero;
+        else
+            dd  = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
         sqd = sqrt( rone + dd );
         a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd +
               real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
@@ -182,10 +182,12 @@ void qdwh(
         // Compute parameters L,a,b,c
         L2  = Li * Li;
         //dd  = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
-        if (abs( L2 - one) <= 10*eps)
+        if (abs( L2 - rone) <= 10*eps) {
             dd = rzero;
-        else
-            dd  = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
+        }
+        else {
+            dd  = std::pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
+        }
         sqd = sqrt( rone + dd );
         a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd +
               real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
diff --git a/test/run_tests.py b/test/run_tests.py
index 77dee5bed..a907b54da 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -75,6 +75,7 @@
     group_cat.add_argument( '--aux',           action='store_true', help='run auxiliary routine tests' ),
     group_cat.add_argument( '--norms',         action='store_true', help='run norm tests' ),
     group_cat.add_argument( '--cond',          action='store_true', help='run condition number estimate tests' ),
+    group_cat.add_argument( '--qdwh',          action='store_true', help='run the polar decomposition qdwh' ),
 ]
 # map category objects to category names: ['lu', 'chol', ...]
 categories = list( map( lambda x: x.dest, categories ) )
@@ -574,6 +575,12 @@ def filter_csv( values, csv ):
     #[ 'pbcon', gen + dtype + la + n + kd + uplo ],
     ]
 
+# polar
+if (opts.qdwh):
+    cmds += [
+    [ 'qdwh', gen + dtype + n ],
+    ]
+
 # aux
 if (opts.aux):
     cmds += [
diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index ceea1f1ac..f51763f9d 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -109,7 +109,7 @@ void test_qdwh_work(Params& params, bool run)
         H = slate::Matrix<scalar_t>::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
     }
 
-    real_t cond = 1 / std::numeric_limits<real_t>::epsilon();
+    real_t cond = 1 / (100 * std::numeric_limits<real_t>::epsilon() );
     params.matrix.kind.set_default("svd");
     params.matrix.cond.set_default(cond);
 

From 2acf39601029f974259c0a8c42ae0479a57b5545 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Wed, 1 Feb 2023 12:06:58 -0500
Subject: [PATCH 31/49] Removed an extra allocation

---
 src/norm2est.cc | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/norm2est.cc b/src/norm2est.cc
index 2cb10bb1f..e670b7e3b 100644
--- a/src/norm2est.cc
+++ b/src/norm2est.cc
@@ -44,6 +44,7 @@ norm2est(
 
     scalar_t one = 1.;
     scalar_t zero  = 0.;
+    real_t rone = 1.;
     real_t alpha;
 
     int64_t cnt = 0;
@@ -67,17 +68,11 @@ norm2est(
     std::vector<real_t> local_sums(n);
     std::vector<scalar_t> global_sums(n);
     std::vector<scalar_t> W1(lldA);
-    std::vector<scalar_t> W2(n);
-    // todo: W2 should needs only lldA_n, but it does not work with grid pxq,
-    // where p!=q
-    //std::vector<scalar_t> W2(lldA_n);
 
     auto XL = slate::Matrix<scalar_t>::fromScaLAPACK(
             n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm());
     auto AX = slate::Matrix<scalar_t>::fromScaLAPACK(
             m, 1, &W1[0], lldA, mb, 1, p, q, A.mpiComm());
-    auto X = slate::Matrix<scalar_t>::fromScaLAPACK(
-            n, 1, &W2[0], lldA_n, nb, 1, p, q, A.mpiComm());
 
     // Two norm estimation
     // First: let's compute the x vector such that
@@ -112,11 +107,10 @@ norm2est(
         e0 = e;
 
         // Scale X = X / ||X||
-        alpha = 1.0/normX;
-        add(scalar_t(alpha), XL, zero, X, opts);
+        scale(rone, normX, XL, opts);
 
         // Compute Ax = A * sx
-        gemm(one, A, X, zero, AX, opts);
+        gemm(one, A, XL, zero, AX, opts);
 
         // todo: still need to add the following
         //if nnz(Sx) == 0
@@ -125,11 +119,9 @@ norm2est(
 
         // Compute x = A' * A * x = A' * Ax
         auto AT = conjTranspose(A);
-        // todo: why this set is needed when using multiple mpi rank
-        // todo: send to Sebastien
+        // todo: why this set is needed when using multiple mpi rank and using gemmA
         set(zero, zero, XL);
         gemm(one, AT, AX, zero, XL, opts);
-        //gemmC(one, AT, AX, zero, XL, opts);
 
         // Compute ||X||, ||AX||
         normX  = norm(Norm::Fro, XL, opts);

From 516cae64fb290e0c2953c510d432e30c362ca8ce Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 2 Feb 2023 09:53:43 -0500
Subject: [PATCH 32/49] Changed dd to avoid overflow

---
 src/qdwh.cc | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index b47d34ec1..1d0a77c35 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -130,7 +130,6 @@ void qdwh(
     // Estimate the condition number using QR
     // This Q factor can be used in the first QR-based iteration
     slate::copy(A, W10, opts);
-    // todo: put bound of 1-norm est compared to 2-norm
     slate::geqrf(W10, T, opts);
     //auto R  = TrapezoidMatrix<scalar_t>(
     //    Uplo::Upper, slate::Diag::NonUnit, W10 );
@@ -157,10 +156,8 @@ void qdwh(
         itconv++;
 
         L2  =  Liconv * Liconv;
-        if (abs( L2 - rone) <= 10*eps)
-            dd = rzero;
-        else
-            dd  = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
+        dd  = std::pow( real_t(4.0) * ( rone - L2 ), rone / real_t(3.0) ) *
+            std::pow( L2, real_t(-2.0) / real_t(3.0) );
         sqd = sqrt( rone + dd );
         a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd +
               real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
@@ -181,13 +178,8 @@ void qdwh(
 
         // Compute parameters L,a,b,c
         L2  = Li * Li;
-        //dd  = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
-        if (abs( L2 - rone) <= 10*eps) {
-            dd = rzero;
-        }
-        else {
-            dd  = std::pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) );
-        }
+        dd  = std::pow( real_t(4.0) * ( rone - L2 ), rone / real_t(3.0) ) *
+            std::pow( L2, real_t(-2.0) / real_t(3.0) );
         sqd = sqrt( rone + dd );
         a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd +
               real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);

From 22e050af11d51cfacf931aa6741b1ec03d8a48fa Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 2 Feb 2023 09:57:10 -0500
Subject: [PATCH 33/49] =?UTF-8?q?test=20ill-cond=C3=83?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/test_qdwh.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index f51763f9d..f033b2c67 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -109,7 +109,7 @@ void test_qdwh_work(Params& params, bool run)
         H = slate::Matrix<scalar_t>::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
     }
 
-    real_t cond = 1 / (100 * std::numeric_limits<real_t>::epsilon() );
+    real_t cond = 1 / (std::numeric_limits<real_t>::epsilon() );
     params.matrix.kind.set_default("svd");
     params.matrix.cond.set_default(cond);
 

From 6ea76e69ebfe97176862f4ccd1a046dc2ee2a061 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 2 Feb 2023 11:09:55 -0500
Subject: [PATCH 34/49] Fix in makefil, used impl namesapce and minor to the
 docs in geqrf_qdwh_full

---
 GNUmakefile            |  2 +-
 src/geqrf_qdwh_full.cc | 80 +++++++++++++++++++-----------------------
 2 files changed, 38 insertions(+), 44 deletions(-)

diff --git a/GNUmakefile b/GNUmakefile
index 782b35d00..7773cc72e 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -639,6 +639,7 @@ tester_src += \
         test/test_pbsv.cc \
         test/test_posv.cc \
         test/test_potri.cc \
+        test/test_qdwh.cc \
         test/test_scale.cc \
         test/test_scale_row_col.cc \
         test/test_set.cc \
@@ -676,7 +677,6 @@ ifneq ($(have_fortran),)
         test/pdlantr.f \
         test/pclantr.f \
         test/pzlantr.f \
-        test/test_qdwh.cc \
         # End. Add alphabetically, by base name after precision.
 endif
 endif
diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc
index f7bc10c74..ac3ae303e 100644
--- a/src/geqrf_qdwh_full.cc
+++ b/src/geqrf_qdwh_full.cc
@@ -10,10 +10,7 @@
 
 namespace slate {
 
-// specialization namespace differentiates, e.g.,
-// internal::geqrf_qdwh_full from internal::specialization::geqrf
-namespace internal {
-namespace specialization {
+namespace impl {
 
 //------------------------------------------------------------------------------
 /// An auxiliary routine to find each rank's first (top-most) row
@@ -31,8 +28,9 @@ namespace specialization {
 /// @ingroup geqrf_qdwh_full_specialization
 ///
 template <typename scalar_t>
-void geqrf_compute_first_indices(Matrix<scalar_t>& A_panel, int64_t k,
-                                 std::vector< int64_t >& first_indices)
+void geqrf_compute_first_indices(
+     Matrix<scalar_t>& A_panel, int64_t k,
+     std::vector< int64_t >& first_indices)
 {
     // Find ranks in this column.
     std::set<int> ranks_set;
@@ -60,13 +58,13 @@ void geqrf_compute_first_indices(Matrix<scalar_t>& A_panel, int64_t k,
 ///
 /// ColMajor layout is assumed
 ///
-/// @ingroup geqrf_specialization
+/// @ingroup geqrf_impl
 ///
 template <Target target, typename scalar_t>
-void geqrf_qdwh_full(slate::internal::TargetType<target>,
+void geqrf_qdwh_full(
            Matrix<scalar_t>& A,
            TriangularFactors<scalar_t>& T,
-           int64_t ib, int max_panel_threads, int64_t lookahead)
+           Options const& opts )
 {
     using BcastList = typename Matrix<scalar_t>::BcastList;
     using device_info_t = lapack::device_info_int;
@@ -78,7 +76,15 @@ void geqrf_qdwh_full(slate::internal::TargetType<target>,
     const int priority_zero = 0;
     const int priority_one = 1;
     const int life_factor_one = 1;
-    const bool set_hold = lookahead > 0;  // Do tileGetAndHold in the bcast
+
+    // Options
+    int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );
+    int64_t ib = get_option<int64_t>( opts, Option::InnerBlocking, 16 );
+    int64_t max_panel_threads  = std::max(omp_get_max_threads()/2, 1);
+    max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads,
+                                             max_panel_threads );
+
+    bool set_hold = lookahead > 0;  // Do tileGetAndHold in the bcast
 
     int64_t A_mt = A.mt();
     int64_t A_nt = A.nt();
@@ -364,34 +370,17 @@ void geqrf_qdwh_full(slate::internal::TargetType<target>,
     }
 }
 
-} // namespace specialization
-} // namespace internal
-
-//------------------------------------------------------------------------------
-/// Version with target as template parameter.
-/// @ingroup geqrf_qdwh_full_specialization
-///
-template <Target target, typename scalar_t>
-void geqrf_qdwh_full(Matrix<scalar_t>& A,
-           TriangularFactors<scalar_t>& T,
-           Options const& opts)
-{
-    int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );
-
-    int64_t ib = get_option<int64_t>( opts, Option::InnerBlocking, 16 );
-
-    int64_t max_panel_threads  = std::max(omp_get_max_threads()/2, 1);
-    max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads, max_panel_threads );
-
-    internal::specialization::geqrf_qdwh_full(internal::TargetType<target>(),
-                                    A, T,
-                                    ib, max_panel_threads, lookahead);
-}
+} // namespace impl
 
 //------------------------------------------------------------------------------
-/// Distributed parallel QR factorization.
+/// Distributed parallel customized QR factorization.
+/// Required for the QR-based iterations in the polar decomposition QDWH.
 ///
-/// Computes a QR factorization of an m-by-n matrix $A$.
+/// Computes a QR factorization of m-by-n matrix $A$, m \ge 2n,
+/// and takes advantage of the trailing identity matrix structure.
+/// A = [ A0 ]  full matrix ( m0-by-n, where m0 = m - n)
+///     [ A1 ]  identity matrix (n-by-n)
+/// Avoids doing computaions on the zero tiles below the diagonal of $A1$.
 /// The factorization has the form
 /// \[
 ///     A = QR,
@@ -401,15 +390,16 @@ void geqrf_qdwh_full(Matrix<scalar_t>& A,
 ///
 /// Complexity (in real):
 /// - for $m \ge n$, $\approx 2 m n^{2} - \frac{2}{3} n^{3}$ flops;
-/// - for $m \le n$, $\approx 2 m^{2} n - \frac{2}{3} m^{3}$ flops;
-/// - for $m = n$,   $\approx \frac{4}{3} n^{3}$ flops.
 /// .
 //------------------------------------------------------------------------------
 /// @tparam scalar_t
 ///     One of float, double, std::complex<float>, std::complex<double>.
 //------------------------------------------------------------------------------
 /// @param[in,out] A
-///     On entry, the m-by-n matrix $A$.
+///     On entry, the m-by-n matrix $A$, m \ge 2n,
+///     A = [ A0 ]  full matrix ( m0-by-n, where m0 = m - n)
+///         [ A1 ]  identity matrix (n-by-n)
+///
 ///     On exit, the elements on and above the diagonal of the array contain
 ///     the min(m,n)-by-n upper trapezoidal matrix $R$ (upper triangular
 ///     if m >= n); the elements below the diagonal represent the unitary
@@ -437,7 +427,8 @@ void geqrf_qdwh_full(Matrix<scalar_t>& A,
 /// @ingroup geqrf_computational
 ///
 template <typename scalar_t>
-void geqrf_qdwh_full(Matrix<scalar_t>& A,
+void geqrf_qdwh_full(
+           Matrix<scalar_t>& A,
            TriangularFactors<scalar_t>& T,
            Options const& opts)
 {
@@ -446,16 +437,19 @@ void geqrf_qdwh_full(Matrix<scalar_t>& A,
     switch (target) {
         case Target::Host:
         case Target::HostTask:
-            geqrf_qdwh_full<Target::HostTask>(A, T, opts);
+            impl::geqrf_qdwh_full<Target::HostTask>( A, T, opts );
             break;
+
         case Target::HostNest:
-            geqrf_qdwh_full<Target::HostNest>(A, T, opts);
+            impl::geqrf_qdwh_full<Target::HostNest>( A, T, opts );
             break;
+
         case Target::HostBatch:
-            geqrf_qdwh_full<Target::HostBatch>(A, T, opts);
+            impl::geqrf_qdwh_full<Target::HostBatch>( A, T, opts );
             break;
+
         case Target::Devices:
-            geqrf_qdwh_full<Target::Devices>(A, T, opts);
+            impl::geqrf_qdwh_full<Target::Devices>( A, T, opts );
             break;
     }
     // todo: return value for errors?

From a7d423ff6cb260292a5f087404063af5872d7d13 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 2 Feb 2023 11:24:14 -0500
Subject: [PATCH 35/49] minor fixes

---
 src/norm2est.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/norm2est.cc b/src/norm2est.cc
index e670b7e3b..7f69b90cf 100644
--- a/src/norm2est.cc
+++ b/src/norm2est.cc
@@ -103,7 +103,7 @@ norm2est(
     }
 
     // Third: start the while-loop X = X / ||X||
-    while ((cnt < maxiter) && (fabs((e - e0)) > (tol * (e)))) {
+    while ((cnt < maxiter) && (std::abs(e - e0) > (tol * e))) {
         e0 = e;
 
         // Scale X = X / ||X||
@@ -127,7 +127,7 @@ norm2est(
         normX  = norm(Norm::Fro, XL, opts);
         normAX = norm(Norm::Fro, AX, opts);
 
-        // Uodate e
+        // Update e
         e = normX / normAX;
         cnt++;
     }

From 3bcf929bedb875bc33ff72e86f3f496cc4254840 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 2 Feb 2023 11:38:30 -0500
Subject: [PATCH 36/49] --amend

---
 src/norm2est.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/norm2est.cc b/src/norm2est.cc
index 7f69b90cf..463435e31 100644
--- a/src/norm2est.cc
+++ b/src/norm2est.cc
@@ -45,7 +45,6 @@ norm2est(
     scalar_t one = 1.;
     scalar_t zero  = 0.;
     real_t rone = 1.;
-    real_t alpha;
 
     int64_t cnt = 0;
     int64_t maxiter = min( 100, n );
@@ -57,9 +56,7 @@ norm2est(
     real_t tol = 1.e-1;
 
     int64_t mloc   = numberLocalRowOrCol(m, mb, myrow, izero, p);
-    int64_t nloc   = numberLocalRowOrCol(n, nb, mycol, izero, q);
     int64_t lldA   = blas::max(1, mloc);
-    int64_t lldA_n = blas::max(1, nloc);
 
     // todo: do we still need reserveDevice here?
     if (target == Target::Devices)

From a09c7582ad2846360e67bbdd6458709ccd4ac01e Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 2 Feb 2023 13:04:16 -0500
Subject: [PATCH 37/49] Applied Mark's comments

---
 docs/doxygen/groups.dox |   8 ++
 src/qdwh.cc             | 161 ++++++++++++++++------------------------
 2 files changed, 71 insertions(+), 98 deletions(-)

diff --git a/docs/doxygen/groups.dox b/docs/doxygen/groups.dox
index 6cb84a95e..dcbc88538 100644
--- a/docs/doxygen/groups.dox
+++ b/docs/doxygen/groups.dox
@@ -145,6 +145,14 @@
         @defgroup svd_specialization    Target implementations
     @}
 
+    ------------------------------------------------------------
+    @defgroup group_qdwh Polar Decomposition
+    @{
+        @defgroup qdwh                      Driver
+        @brief                              $A = U H$
+
+    @}
+
     ------------------------------------------------------------
     @defgroup group_blas2_top     Level 2 BLAS and Auxiliary: O(n^2) work
     @brief    Matrix and Matrix-vector operations that perform $O(n^2)$ work on $O(n^2)$ data.
diff --git a/src/qdwh.cc b/src/qdwh.cc
index 1d0a77c35..83820728f 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -10,14 +10,55 @@
 
 namespace slate {
 
-namespace impl {
-
 //------------------------------------------------------------------------------
 /// Distributed parallel polar decomposition based on QDWH algorithm.
-/// Generic implementation for any target.
-/// @ingroup qdwh_specialization
 ///
-template <Target target, typename scalar_t>
+/// Computes a polar decomposition of an m-by-n matrix $A$.
+/// The factorization has the form
+/// \[
+///     A = UH,
+/// \]
+/// where $U$ is a matrix with orthonormal columns and $H$ is a Hermition
+/// matrix.
+///
+//------------------------------------------------------------------------------
+/// @tparam scalar_t
+///     One of float, double, std::complex<float>, std::complex<double>.
+//------------------------------------------------------------------------------
+/// @param[in,out] A
+///     On entry, the m-by-n matrix $A$.
+///     On exit, if return value = 0, the orthogonal polar factor $U$
+///     from the decomposition $A = U H$.
+///
+/// @param[out] H
+///     On exit, if return value = 0, the hermitian polar factor matrix $H$
+///     from the decomposition $A = U H$.
+///
+/// @param[out] itqr
+///     The number of the QR-based iterations.
+///
+/// @param[out] itpo
+///     The number of the Cholesky-based iterations.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Lookahead:
+///       Number of panels to overlap with matrix updates.
+///       lookahead >= 0. Default 1.
+///     - Option::InnerBlocking:
+///       Inner blocking to use for panel. Default 16.
+///     - Option::MaxPanelThreads:
+///       Number of threads to use for panel. Default omp_get_max_threads()/2.
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - HostBatch: batched BLAS on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup qdwh
+///
+template <typename scalar_t>
 void qdwh(
           Matrix<scalar_t>& A,
           Matrix<scalar_t>& H,
@@ -33,21 +74,14 @@ void qdwh(
     using real_t = blas::real_type<scalar_t>;
     using blas::real;
 
-    int64_t max_panel_threads  = std::max(omp_get_max_threads()/2, 1);
-
-    max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads, max_panel_threads );
-
-    int itconv, it;
-
-    real_t L2, sqd, dd, a1, a, b, c;
-    real_t Li, Liconv;
+    // Constants
     real_t conv = 100.;
-    real_t rzero = 0.0, rone = 1.0;
-
+    real_t rone = 1.0;
     scalar_t zero = 0.0, one = 1.0;
     scalar_t alpha, beta;
 
-    real_t normR;
+    // Options
+    Target target = get_option( opts, Option::Target, Target::HostTask );
 
     real_t eps  = std::numeric_limits<real_t>::epsilon();
     real_t tol1 = 5. * eps;
@@ -65,15 +99,21 @@ void qdwh(
 
     int64_t nb   = A.tileMb(0);
 
+    int itconv, it;
+    real_t L2, sqd, dd, a1, a, b, c;
+    real_t Li, Liconv;
+    real_t normR;
     // The QR-based iterations requires QR[A; Id],
     // W = [A; Id], where size of A is mxn, size of Id is nxn
     // To avoid having a clean up tile in the middle of the W matrix,
     // we round up the number of A rows (m),
     // so that size(W) = m_roundup + n
-    int64_t m_roundup =  ((m + nb - 1) / nb ) * nb;
+    //int64_t m_roundup =  ((m + nb - 1) / nb ) * nb;
+    int64_t m_roundup = roundup( m, nb );
     int64_t m_W   = m_roundup + n;
     int64_t mt_W = mt + nt;
 
+
     int nprow, npcol;
     int myrow, mycol;
     GridOrder order;
@@ -122,13 +162,11 @@ void qdwh(
 
     // scale the original matrix A to form A0 which is the initial matrix
     // for the first iteration
-    add(alpha/scalar_t(norm_est), Acpy, zero, A, opts);
-
-    // Estimate the condition number
-    // todo: use the Q factor in the first QR-based iteration
+    scale(rone, norm_est, A, opts);
 
     // Estimate the condition number using QR
     // This Q factor can be used in the first QR-based iteration
+    // todo: use the Q factor in the first QR-based iteration
     slate::copy(A, W10, opts);
     slate::geqrf(W10, T, opts);
     //auto R  = TrapezoidMatrix<scalar_t>(
@@ -296,106 +334,33 @@ void qdwh(
     //slate::copy(AL, H, opts);
 }
 
-} // namespace impl
-
-//------------------------------------------------------------------------------
-/// Distributed parallel polar decomposition based on QDWH algorithm.
-///
-/// Computes a polar decomposition of an m-by-n matrix $A$.
-/// The factorization has the form
-/// \[
-///     A = UH,
-/// \]
-/// where $U$ is a matrix with orthonormal columns and $H$ is a Hermition
-/// matrix.
-///
-/// Complexity (in real):
-/// - for $m \ge n$, $\approx 2 m n^{2} - \frac{2}{3} n^{3}$ flops;
-/// - for $m \le n$, $\approx 2 m^{2} n - \frac{2}{3} m^{3}$ flops;
-/// - for $m = n$,   $\approx \frac{4}{3} n^{3}$ flops.
-/// .
-//------------------------------------------------------------------------------
-/// @tparam scalar_t
-///     One of float, double, std::complex<float>, std::complex<double>.
-//------------------------------------------------------------------------------
-/// @param[in,out] A
-///     On entry, the m-by-n matrix $A$.
-///     On exit, if return value = 0, the orthogonal polar factor $U$
-///     from the decomposition $A = U H$.
-///
-/// @param[out] H
-///     On exit, if return value = 0, the hermitian polar factor matrix $H$
-///     from the decomposition $A = U H$.
-///
-/// @param[in] opts
-///     Additional options, as map of name = value pairs. Possible options:
-///     - Option::Lookahead:
-///       Number of panels to overlap with matrix updates.
-///       lookahead >= 0. Default 1.
-///     - Option::InnerBlocking:
-///       Inner blocking to use for panel. Default 16.
-///     - Option::MaxPanelThreads:
-///       Number of threads to use for panel. Default omp_get_max_threads()/2.
-///     - Option::Target:
-///       Implementation to target. Possible values:
-///       - HostTask:  OpenMP tasks on CPU host [default].
-///       - HostNest:  nested OpenMP parallel for loop on CPU host.
-///       - HostBatch: batched BLAS on CPU host.
-///       - Devices:   batched BLAS on GPU device.
-///
-/// @ingroup geqrf_computational
-///
-template <typename scalar_t>
-void qdwh(
-          Matrix<scalar_t>& A,
-          Matrix<scalar_t>& B,
-          int& itqr, int& itpo,
-          Options const& opts)
-{
-    using internal::TargetType;
-    Target target = get_option( opts, Option::Target, Target::HostTask );
-
-    switch (target) {
-        case Target::Host:
-        case Target::HostTask:
-        case Target::HostNest:
-        case Target::HostBatch:
-            impl::qdwh<Target::HostTask>( A, B, itqr, itpo, opts );
-            break;
-        case Target::Devices:
-            impl::qdwh<Target::Devices>( A, B, itqr, itpo, opts );
-            break;
-    }
-    // todo: return value for errors?
-}
-
 //------------------------------------------------------------------------------
 // Explicit instantiations.
 template
 void qdwh<float>(
     Matrix<float>& A,
-    Matrix<float>& B,
+    Matrix<float>& H,
     int& itqr, int& itpo,
     Options const& opts);
 
 template
 void qdwh<double>(
     Matrix<double>& A,
-    Matrix<double>& B,
+    Matrix<double>& H,
     int& itqr, int& itpo,
     Options const& opts);
 
 template
 void qdwh< std::complex<float> >(
     Matrix< std::complex<float> >& A,
-    Matrix< std::complex<float> >& B,
+    Matrix< std::complex<float> >& H,
     int& itqr, int& itpo,
     Options const& opts);
 
 template
 void qdwh< std::complex<double> >(
     Matrix< std::complex<double> >& A,
-    Matrix< std::complex<double> >& B,
+    Matrix< std::complex<double> >& H,
     int& itqr, int& itpo,
     Options const& opts);
 

From cdfc10ca0a521d782938eb43ff5397f0d1e7b8d7 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 2 Feb 2023 13:42:03 -0500
Subject: [PATCH 38/49] Replaced namespace by impl in umnqr_qdwh_full. Changed
 n to mn in run_test.py

---
 src/unmqr_qdwh_full.cc | 50 ++++++++++++++++++------------------------
 test/run_tests.py      |  2 +-
 2 files changed, 22 insertions(+), 30 deletions(-)

diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc
index 3b1277103..b13b8b502 100644
--- a/src/unmqr_qdwh_full.cc
+++ b/src/unmqr_qdwh_full.cc
@@ -11,23 +11,20 @@
 
 namespace slate {
 
-// specialization namespace differentiates, e.g.,
-// internal::unmqr_qdwh_full from internal::specialization::unmqr_qdwh_full
-namespace internal {
-namespace specialization {
+namespace impl {
 
 //------------------------------------------------------------------------------
 /// Distributed parallel multiply by Q from QR factorization.
 /// Generic implementation for any target.
-/// @ingroup geqrf_specialization
+/// @ingroup geqrf_impl
 ///
 template <Target target, typename scalar_t>
 void unmqr_qdwh_full(
-    slate::internal::TargetType<target>,
     Side side, Op op,
     Matrix<scalar_t>& A,
     TriangularFactors<scalar_t>& T,
-    Matrix<scalar_t>& C)
+    Matrix<scalar_t>& C,
+    Options const& opts )
 {
     // trace::Block trace_block("unmqr");
     // const int priority_one = 1;
@@ -262,29 +259,21 @@ void unmqr_qdwh_full(
     C.clearWorkspace();
 }
 
-} // namespace specialization
-} // namespace internal
+} // namespace impl
 
 //------------------------------------------------------------------------------
-/// Version with target as template parameter.
-/// @ingroup geqrf_specialization
+/// Distributed parallel customized multiply by $Q$ from QR factorization.
+/// Required for the QR-based iterations in the polar decomposition QDWH.
 ///
-template <Target target, typename scalar_t>
-void unmqr_qdwh_full(
-    Side side, Op op,
-    Matrix<scalar_t>& A,
-    TriangularFactors<scalar_t>& T,
-    Matrix<scalar_t>& C,
-    Options const& opts)
-{
-    internal::specialization::unmqr_qdwh_full(internal::TargetType<target>(),
-                                    side, op, A, T, C);
-}
-
-//------------------------------------------------------------------------------
-/// Distributed parallel multiply by $Q$ from QR factorization.
+/// The $Q$ matrix is the output of geqrf_qdwh_full. Since the input matrix
+/// to geqrf_qdwh_full has identity trailing matrix, the $Q$ matrix has
+/// zero tiles below the diagonal.
+/// Q = [ Q0 ]  full matrix ( m0-by-n, where m0 = m - n)
+///     [ Q1 ]  zero tiles below the diagonal (n-by-n)
 ///
 /// Multiplies the general m-by-n matrix $C$ by $Q$ from QR factorization,
+/// and takes advantage of the structure of the matrix by avoid operating
+/// on the tiles of zeros,
 /// according to:
 ///
 /// op              |  side = Left  |  side = Right
@@ -351,16 +340,19 @@ void unmqr_qdwh_full(
         case Target::Host:
         case Target::HostTask:
         default:
-            unmqr_qdwh_full<Target::HostTask>(side, op, A, T, C, opts);
+            impl::unmqr_qdwh_full<Target::HostTask>( side, op, A, T, C, opts );
             break;
+
         case Target::HostNest:
-            unmqr_qdwh_full<Target::HostNest>(side, op, A, T, C, opts);
+            impl::unmqr_qdwh_full<Target::HostNest>( side, op, A, T, C, opts );
             break;
+
         case Target::HostBatch:
-            unmqr_qdwh_full<Target::HostBatch>(side, op, A, T, C, opts);
+            impl::unmqr_qdwh_full<Target::HostBatch>( side, op, A, T, C, opts );
             break;
+
         case Target::Devices:
-            unmqr_qdwh_full<Target::Devices>(side, op, A, T, C, opts);
+            impl::unmqr_qdwh_full<Target::Devices>( side, op, A, T, C, opts );
             break;
     }
     // todo: return value for errors?
diff --git a/test/run_tests.py b/test/run_tests.py
index a907b54da..2e27ef4fd 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -578,7 +578,7 @@ def filter_csv( values, csv ):
 # polar
 if (opts.qdwh):
     cmds += [
-    [ 'qdwh', gen + dtype + n ],
+    [ 'qdwh', gen + dtype + mn ],
     ]
 
 # aux

From fab06a19dd9ea69aa5972e8f9cfc80dc3a5080ef Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 2 Feb 2023 14:20:01 -0500
Subject: [PATCH 39/49] qdwh works m >= n

---
 test/run_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/run_tests.py b/test/run_tests.py
index 2e27ef4fd..d67480af1 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -578,7 +578,7 @@ def filter_csv( values, csv ):
 # polar
 if (opts.qdwh):
     cmds += [
-    [ 'qdwh', gen + dtype + mn ],
+    [ 'qdwh', gen + dtype + n + tall ],
     ]
 
 # aux

From deec809ab7249a913181c2790f42fe8fe1759c4c Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Fri, 3 Feb 2023 10:41:43 -0500
Subject: [PATCH 40/49] fix flops count

---
 src/geqrf_qdwh_full.cc |  9 +++++++--
 test/test_qdwh.cc      | 10 ++++++++--
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc
index ac3ae303e..d1c4141c0 100644
--- a/src/geqrf_qdwh_full.cc
+++ b/src/geqrf_qdwh_full.cc
@@ -378,7 +378,8 @@ void geqrf_qdwh_full(
 ///
 /// Computes a QR factorization of m-by-n matrix $A$, m \ge 2n,
 /// and takes advantage of the trailing identity matrix structure.
-/// A = [ A0 ]  full matrix ( m0-by-n, where m0 = m - n)
+/// A = [ A0 ]  full matrix ( m0-by-n, where m0 = m - n, which the size of the
+/// original matrix in qdwh)
 ///     [ A1 ]  identity matrix (n-by-n)
 /// Avoids doing computaions on the zero tiles below the diagonal of $A1$.
 /// The factorization has the form
@@ -389,7 +390,11 @@ void geqrf_qdwh_full(
 /// (or upper trapezoidal if m < n).
 ///
 /// Complexity (in real):
-/// - for $m \ge n$, $\approx 2 m n^{2} - \frac{2}{3} n^{3}$ flops;
+/// The size of matrix $A$ is $m = m0 + n$;
+/// Since the lower n-by-n of the matrix $A$ is the identity, then the
+/// k householder vectors has m0 - k + k + 1 = m0 + 1 rows.
+/// - for $m \ge n$, \Sigma_{k=1}^{n-1} 4 (n - k) (m0 + 1) =
+/// $\approx 2 m0 n^{2} + 2 m0 n +  n^{2}$ flops;
 /// .
 //------------------------------------------------------------------------------
 /// @tparam scalar_t
diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index f033b2c67..29553f22a 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -156,8 +156,14 @@ void test_qdwh_work(Params& params, bool run)
         //         flop_compute_H
         double gflop_trcon = lapack::Gflop<scalar_t>::geqrf(m, n);
 
-        double gflop_itqr  = itqr * ( lapack::Gflop<scalar_t>::geqrf(m+n, n) +
-                                      lapack::Gflop<scalar_t>::unmqr(slate::Side::Left, m+n, n, n) +
+        // this flop count of the QR-based iteration is in case we are not taking advantage
+        // of the identity trailing submatrix:
+        //double gflop_itqr  = itqr * ( lapack::Gflop<scalar_t>::geqrf(m+n, n) +
+        //                              lapack::Gflop<scalar_t>::unmqr(slate::Side::Left, m+n, n, n) +
+        //                              lapack::Gflop<scalar_t>::gemm(m, n, n) );
+        // when take advantage of the matrix structure, the flops count will be reduced:
+        double gflop_itqr  = itqr * ( 2 * m * n^2 +
+                                      2 * m * n^2 +
                                       lapack::Gflop<scalar_t>::gemm(m, n, n) );
 
         double gflop_itpo  = itpo * ( lapack::Gflop<scalar_t>::gemm(m, n, n) +

From 169a63d5df35de61d0c853ec5067bd7b76590878 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 6 Feb 2023 09:38:58 -0500
Subject: [PATCH 41/49] minor

---
 test/test_qdwh.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index 29553f22a..c636a7f92 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -162,8 +162,8 @@ void test_qdwh_work(Params& params, bool run)
         //                              lapack::Gflop<scalar_t>::unmqr(slate::Side::Left, m+n, n, n) +
         //                              lapack::Gflop<scalar_t>::gemm(m, n, n) );
         // when take advantage of the matrix structure, the flops count will be reduced:
-        double gflop_itqr  = itqr * ( 2 * m * n^2 +
-                                      2 * m * n^2 +
+        double gflop_itqr  = itqr * ( 2 * m * std::pow(n, 2) +
+                                      2 * m * std::pow(n, 2) +
                                       lapack::Gflop<scalar_t>::gemm(m, n, n) );
 
         double gflop_itpo  = itpo * ( lapack::Gflop<scalar_t>::gemm(m, n, n) +

From fc35842e454a30264a1f739c90bfec67b34be2dd Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 6 Feb 2023 13:56:48 -0500
Subject: [PATCH 42/49] add const and delete dead code

---
 src/qdwh.cc            | 30 +++++++++++++++---------------
 src/unmqr_qdwh_full.cc |  1 -
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index 83820728f..469a71897 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -75,9 +75,9 @@ void qdwh(
     using blas::real;
 
     // Constants
-    real_t conv = 100.;
-    real_t rone = 1.0;
-    scalar_t zero = 0.0, one = 1.0;
+    const real_t conv = 100.;
+    const real_t r_one = 1.0;
+    const scalar_t zero = 0.0, one = 1.0;
     scalar_t alpha, beta;
 
     // Options
@@ -85,7 +85,7 @@ void qdwh(
 
     real_t eps  = std::numeric_limits<real_t>::epsilon();
     real_t tol1 = 5. * eps;
-    real_t tol3 = pow(tol1, rone/real_t(3.));
+    real_t tol3 = pow(tol1, r_one/real_t(3.));
 
     int64_t mt = A.mt();
     int64_t nt = A.nt();
@@ -162,7 +162,7 @@ void qdwh(
 
     // scale the original matrix A to form A0 which is the initial matrix
     // for the first iteration
-    scale(rone, norm_est, A, opts);
+    scale(r_one, norm_est, A, opts);
 
     // Estimate the condition number using QR
     // This Q factor can be used in the first QR-based iteration
@@ -194,16 +194,16 @@ void qdwh(
         itconv++;
 
         L2  =  Liconv * Liconv;
-        dd  = std::pow( real_t(4.0) * ( rone - L2 ), rone / real_t(3.0) ) *
+        dd  = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) *
             std::pow( L2, real_t(-2.0) / real_t(3.0) );
-        sqd = sqrt( rone + dd );
+        sqd = sqrt( r_one + dd );
         a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd +
               real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
         a   = real(a1);
-        b   = ( a - rone ) * ( a - rone ) / real_t(4.0);
-        c   = a + b - rone;
+        b   = ( a - r_one ) * ( a - r_one ) / real_t(4.0);
+        c   = a + b - r_one;
         // Update Liconv
-        Liconv  = Liconv * ( a + b * L2 ) / ( rone + c * L2 );
+        Liconv  = Liconv * ( a + b * L2 ) / ( r_one + c * L2 );
     }
 
     it = 0;
@@ -216,16 +216,16 @@ void qdwh(
 
         // Compute parameters L,a,b,c
         L2  = Li * Li;
-        dd  = std::pow( real_t(4.0) * ( rone - L2 ), rone / real_t(3.0) ) *
+        dd  = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) *
             std::pow( L2, real_t(-2.0) / real_t(3.0) );
-        sqd = sqrt( rone + dd );
+        sqd = sqrt( r_one + dd );
         a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd +
               real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
         a   = real(a1);
-        b   = ( a - rone ) * ( a - rone ) / real_t(4.0);
-        c   = a + b - rone;
+        b   = ( a - r_one ) * ( a - r_one ) / real_t(4.0);
+        c   = a + b - r_one;
         // Update Li
-        Li  = Li * ( a + b * L2 ) / ( rone + c * L2 );
+        Li  = Li * ( a + b * L2 ) / ( r_one + c * L2 );
 
         if (c > 100.) {
             // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ]
diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc
index b13b8b502..a1ace1725 100644
--- a/src/unmqr_qdwh_full.cc
+++ b/src/unmqr_qdwh_full.cc
@@ -27,7 +27,6 @@ void unmqr_qdwh_full(
     Options const& opts )
 {
     // trace::Block trace_block("unmqr");
-    // const int priority_one = 1;
     using BcastList = typename Matrix<scalar_t>::BcastList;
 
     // Assumes column major

From 2be83cd6f6b4850796bce8cca457470b0b504321 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 6 Feb 2023 14:17:01 -0500
Subject: [PATCH 43/49] moving const up and minor changes

---
 src/norm2est.cc | 26 +++++++++++++-------------
 src/qdwh.cc     | 21 ++++++++++-----------
 2 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/src/norm2est.cc b/src/norm2est.cc
index 463435e31..9d6cca8c7 100644
--- a/src/norm2est.cc
+++ b/src/norm2est.cc
@@ -31,23 +31,26 @@ norm2est(
     using real_t = blas::real_type<scalar_t>;
     using blas::min;
 
-    int p, q;
-    int myrow, mycol;
-    int izero = 0;
-    GridOrder order;
-    A.gridinfo(&order, &p, &q, &myrow, &mycol);
+    // Constants
+    const scalar_t one = 1.;
+    const scalar_t zero  = 0.;
+    const real_t r_one = 1.;
+    const int izero = 0;
 
     int64_t n = A.n();
     int64_t m = A.m();
     int64_t mb = A.tileMb(0);
     int64_t nb = A.tileNb(0);
 
-    scalar_t one = 1.;
-    scalar_t zero  = 0.;
-    real_t rone = 1.;
+    int p, q;
+    int myrow, mycol;
+    GridOrder order;
+    A.gridinfo(&order, &p, &q, &myrow, &mycol);
 
     int64_t cnt = 0;
     int64_t maxiter = min( 100, n );
+    int64_t mloc   = numberLocalRowOrCol(m, mb, myrow, izero, p);
+    int64_t lldA   = blas::max(1, mloc);
 
     real_t e  = 0.;
     real_t e0  = 0.;
@@ -55,10 +58,6 @@ norm2est(
     real_t normAX = 0;
     real_t tol = 1.e-1;
 
-    int64_t mloc   = numberLocalRowOrCol(m, mb, myrow, izero, p);
-    int64_t lldA   = blas::max(1, mloc);
-
-    // todo: do we still need reserveDevice here?
     if (target == Target::Devices)
         A.reserveDeviceWorkspace();
 
@@ -104,7 +103,7 @@ norm2est(
         e0 = e;
 
         // Scale X = X / ||X||
-        scale(rone, normX, XL, opts);
+        scale(r_one, normX, XL, opts);
 
         // Compute Ax = A * sx
         gemm(one, A, XL, zero, AX, opts);
@@ -116,6 +115,7 @@ norm2est(
 
         // Compute x = A' * A * x = A' * Ax
         auto AT = conjTranspose(A);
+
         // todo: why this set is needed when using multiple mpi rank and using gemmA
         set(zero, zero, XL);
         gemm(one, AT, AX, zero, XL, opts);
diff --git a/src/qdwh.cc b/src/qdwh.cc
index 469a71897..86bdb6809 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -70,25 +70,21 @@ void qdwh(
     // 1. reuse Q from first geqrf
     // 2. avoid rounding m if m is not divisible by nb, because then geqrf/unmqr
     // have extra zero rows
-    // in norm2est, when allocate vector mx1, need to allocate on row-processes only
+
     using real_t = blas::real_type<scalar_t>;
     using blas::real;
 
     // Constants
-    const real_t conv = 100.;
+    const scalar_t zero = 0.0;
+    const scalar_t one = 1.0;
     const real_t r_one = 1.0;
-    const scalar_t zero = 0.0, one = 1.0;
-    scalar_t alpha, beta;
 
     // Options
     Target target = get_option( opts, Option::Target, Target::HostTask );
 
-    real_t eps  = std::numeric_limits<real_t>::epsilon();
-    real_t tol1 = 5. * eps;
-    real_t tol3 = pow(tol1, r_one/real_t(3.));
-
     int64_t mt = A.mt();
     int64_t nt = A.nt();
+    int64_t nb   = A.tileMb(0);
 
     int64_t m  = A.m();
     int64_t n  = A.n();
@@ -97,23 +93,26 @@ void qdwh(
         return;
     }
 
-    int64_t nb   = A.tileMb(0);
+    real_t eps  = std::numeric_limits<real_t>::epsilon();
+    real_t tol1 = 5. * eps;
+    real_t tol3 = pow(tol1, r_one/real_t(3.));
 
     int itconv, it;
     real_t L2, sqd, dd, a1, a, b, c;
     real_t Li, Liconv;
     real_t normR;
+    real_t conv = 10.0;
+    scalar_t alpha, beta;
+
     // The QR-based iterations requires QR[A; Id],
     // W = [A; Id], where size of A is mxn, size of Id is nxn
     // To avoid having a clean up tile in the middle of the W matrix,
     // we round up the number of A rows (m),
     // so that size(W) = m_roundup + n
-    //int64_t m_roundup =  ((m + nb - 1) / nb ) * nb;
     int64_t m_roundup = roundup( m, nb );
     int64_t m_W   = m_roundup + n;
     int64_t mt_W = mt + nt;
 
-
     int nprow, npcol;
     int myrow, mycol;
     GridOrder order;

From 23bc075a6f1dcf63dbc2e586d2bcaaea59cd4549 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 6 Feb 2023 15:19:02 -0500
Subject: [PATCH 44/49] test matrices with a smaller cond

---
 test/test_qdwh.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index c636a7f92..b189035fd 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -109,7 +109,7 @@ void test_qdwh_work(Params& params, bool run)
         H = slate::Matrix<scalar_t>::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
     }
 
-    real_t cond = 1 / (std::numeric_limits<real_t>::epsilon() );
+    real_t cond = 1 / (100 * std::numeric_limits<real_t>::epsilon() );
     params.matrix.kind.set_default("svd");
     params.matrix.cond.set_default(cond);
 

From c8c88bb5633bff7ef551b7f6e0f685441faa5a94 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Tue, 7 Feb 2023 15:12:29 -0500
Subject: [PATCH 45/49] Computed parameters in double to avoid overflow in
 complex-float. Minor update on gflops count

---
 src/qdwh.cc       | 21 +++++++++++----------
 test/test_qdwh.cc | 12 ++++++------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/qdwh.cc b/src/qdwh.cc
index 86bdb6809..4582840ac 100644
--- a/src/qdwh.cc
+++ b/src/qdwh.cc
@@ -98,11 +98,11 @@ void qdwh(
     real_t tol3 = pow(tol1, r_one/real_t(3.));
 
     int itconv, it;
-    real_t L2, sqd, dd, a1, a, b, c;
-    real_t Li, Liconv;
     real_t normR;
     real_t conv = 10.0;
     scalar_t alpha, beta;
+    double L2, sqd, a1, a, b, c, dd;
+    real_t Li, Liconv;
 
     // The QR-based iterations requires QR[A; Id],
     // W = [A; Id], where size of A is mxn, size of Id is nxn
@@ -192,17 +192,17 @@ void qdwh(
         }
         itconv++;
 
-        L2  =  Liconv * Liconv;
+        L2  =  double(Liconv) * double(Liconv);
         dd  = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) *
             std::pow( L2, real_t(-2.0) / real_t(3.0) );
-        sqd = sqrt( r_one + dd );
-        a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd +
+        sqd = sqrt( r_one + real_t(dd) );
+        a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * real_t(dd) +
               real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
         a   = real(a1);
         b   = ( a - r_one ) * ( a - r_one ) / real_t(4.0);
         c   = a + b - r_one;
         // Update Liconv
-        Liconv  = Liconv * ( a + b * L2 ) / ( r_one + c * L2 );
+        Liconv  = Liconv * real_t(( a + b * L2 ) / ( r_one + c * L2 ));
     }
 
     it = 0;
@@ -214,17 +214,17 @@ void qdwh(
         it++;
 
         // Compute parameters L,a,b,c
-        L2  = Li * Li;
+        L2  = double(Li) * double(Li);
         dd  = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) *
             std::pow( L2, real_t(-2.0) / real_t(3.0) );
-        sqd = sqrt( r_one + dd );
-        a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd +
+        sqd = sqrt( r_one + real_t(dd) );
+        a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * real_t(dd) +
               real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
         a   = real(a1);
         b   = ( a - r_one ) * ( a - r_one ) / real_t(4.0);
         c   = a + b - r_one;
         // Update Li
-        Li  = Li * ( a + b * L2 ) / ( r_one + c * L2 );
+        Li  = Li * real_t(( a + b * L2 ) / ( r_one + c * L2 ));
 
         if (c > 100.) {
             // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ]
@@ -326,6 +326,7 @@ void qdwh(
     else {
         gemm(one, AT, Acpy, zero, H, opts);
     }
+
     // todo: try something like her2k to compute H
     //her2k(one, A, W10, rzero, H, opts);
     //auto AL = HermitianMatrix<scalar_t>(
diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index b189035fd..7cdb460f9 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -109,7 +109,7 @@ void test_qdwh_work(Params& params, bool run)
         H = slate::Matrix<scalar_t>::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
     }
 
-    real_t cond = 1 / (100 * std::numeric_limits<real_t>::epsilon() );
+    real_t cond = 1 / std::numeric_limits<real_t>::epsilon();
     params.matrix.kind.set_default("svd");
     params.matrix.cond.set_default(cond);
 
@@ -160,13 +160,13 @@ void test_qdwh_work(Params& params, bool run)
         // of the identity trailing submatrix:
         //double gflop_itqr  = itqr * ( lapack::Gflop<scalar_t>::geqrf(m+n, n) +
         //                              lapack::Gflop<scalar_t>::unmqr(slate::Side::Left, m+n, n, n) +
-        //                              lapack::Gflop<scalar_t>::gemm(m, n, n) );
+        //                              blas::Gflop<scalar_t>::gemm(m, n, n) );
         // when take advantage of the matrix structure, the flops count will be reduced:
-        double gflop_itqr  = itqr * ( 2 * m * std::pow(n, 2) +
-                                      2 * m * std::pow(n, 2) +
-                                      lapack::Gflop<scalar_t>::gemm(m, n, n) );
+        double gflop_itqr  = itqr * ( 2. * double(m*n*n) +
+                                      2. * double(m*n*n) +
+                                      blas::Gflop<scalar_t>::gemm(m, n, n) );
 
-        double gflop_itpo  = itpo * ( lapack::Gflop<scalar_t>::gemm(m, n, n) +
+        double gflop_itpo  = itpo * ( blas::Gflop<scalar_t>::gemm(m, n, n) +
                                       lapack::Gflop<scalar_t>::potrf(m)      +
                                       blas::Gflop<scalar_t>::trsm(slate::Side::Left, m, n) );
 

From 531ca62ea2f251d0b762a6ad073e1af13cb38952 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Tue, 7 Feb 2023 16:01:07 -0500
Subject: [PATCH 46/49] Added geqrf_compute_first_indices to internal_util.hh
 and removed it from geqrf and geqrf_qdwh_full

---
 src/geqrf.cc                  | 44 +++--------------------------------
 src/geqrf_qdwh_full.cc        | 44 +++--------------------------------
 src/internal/internal_util.hh | 40 +++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 82 deletions(-)

diff --git a/src/geqrf.cc b/src/geqrf.cc
index 399202201..4fe6328c8 100644
--- a/src/geqrf.cc
+++ b/src/geqrf.cc
@@ -7,50 +7,12 @@
 #include "auxiliary/Debug.hh"
 #include "slate/Matrix.hh"
 #include "internal/internal.hh"
+#include "internal/internal_util.hh"
 
 namespace slate {
 
 namespace impl {
 
-//------------------------------------------------------------------------------
-/// An auxiliary routine to find each rank's first (top-most) row
-/// in panel k.
-///
-/// @param[in] A_panel
-///     Current panel, which is a sub of the input matrix $A$.
-///
-/// @param[in] k
-///     Index of the current panel in the input matrix $A$.
-///
-/// @param[out] first_indices
-///     The array of computed indices.
-///
-/// @ingroup geqrf_impl
-///
-template <typename scalar_t>
-void geqrf_compute_first_indices(
-    Matrix<scalar_t>& A_panel, int64_t k,
-    std::vector< int64_t >& first_indices )
-{
-    // Find ranks in this column.
-    std::set<int> ranks_set;
-    A_panel.getRanks(&ranks_set);
-    assert(ranks_set.size() > 0);
-
-    // Find each rank's first (top-most) row in this panel,
-    // where the triangular tile resulting from local geqrf panel
-    // will reside.
-    first_indices.reserve(ranks_set.size());
-    for (int r: ranks_set) {
-        for (int64_t i = 0; i < A_panel.mt(); ++i) {
-            if (A_panel.tileRank(i, 0) == r) {
-                first_indices.push_back(i+k);
-                break;
-            }
-        }
-    }
-}
-
 //------------------------------------------------------------------------------
 /// Distributed parallel QR factorization.
 /// Generic implementation for any target.
@@ -185,7 +147,7 @@ void geqrf(
             auto Tr_panel = Treduce.sub(k, A_mt-1, k, k);
 
             std::vector< int64_t > first_indices;
-            geqrf_compute_first_indices(A_panel, k, first_indices);
+            internal::geqrf_compute_first_indices(A_panel, k, first_indices);
             // todo: pass first_indices into internal geqrf or ttqrt?
 
             // panel, high priority
@@ -328,7 +290,7 @@ void geqrf(
 
                         auto A_panel_k_la = A.sub(k_la, A_mt-1, k_la, k_la);
                         std::vector< int64_t > first_indices_k_la;
-                        geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
+                        internal::geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
                         if (first_indices.size() > 0) {
                             for (int64_t row : first_indices_k_la) {
                                 if (Tlocal.tileIsLocal(row, k_la)) {
diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc
index d1c4141c0..1dafed1a7 100644
--- a/src/geqrf_qdwh_full.cc
+++ b/src/geqrf_qdwh_full.cc
@@ -7,50 +7,12 @@
 #include "auxiliary/Debug.hh"
 #include "slate/Matrix.hh"
 #include "internal/internal.hh"
+#include "internal/internal_util.hh"
 
 namespace slate {
 
 namespace impl {
 
-//------------------------------------------------------------------------------
-/// An auxiliary routine to find each rank's first (top-most) row
-/// in panel k.
-///
-/// @param[in] A_panel
-///     Current panel, which is a sub of the input matrix $A$.
-///
-/// @param[in] k
-///     Index of the current panel in the input matrix $A$.
-///
-/// @param[out] first_indices
-///     The array of computed indices.
-///
-/// @ingroup geqrf_qdwh_full_specialization
-///
-template <typename scalar_t>
-void geqrf_compute_first_indices(
-     Matrix<scalar_t>& A_panel, int64_t k,
-     std::vector< int64_t >& first_indices)
-{
-    // Find ranks in this column.
-    std::set<int> ranks_set;
-    A_panel.getRanks(&ranks_set);
-    assert(ranks_set.size() > 0);
-
-    // Find each rank's first (top-most) row in this panel,
-    // where the triangular tile resulting from local geqrf panel
-    // will reside.
-    first_indices.reserve(ranks_set.size());
-    for (int r: ranks_set) {
-        for (int64_t i = 0; i < A_panel.mt(); ++i) {
-            if (A_panel.tileRank(i, 0) == r) {
-                first_indices.push_back(i+k);
-                break;
-            }
-        }
-    }
-}
-
 //------------------------------------------------------------------------------
 /// Distributed parallel QR factorization.
 /// Generic implementation for any target.
@@ -189,7 +151,7 @@ void geqrf_qdwh_full(
             auto Tr_panel = Treduce.sub(k, i_end, k, k);
 
             std::vector< int64_t > first_indices;
-            geqrf_compute_first_indices(A_panel, k, first_indices);
+            internal::geqrf_compute_first_indices(A_panel, k, first_indices);
             // todo: pass first_indices into internal geqrf or ttqrt?
 
             // panel, high priority
@@ -334,7 +296,7 @@ void geqrf_qdwh_full(
 
                         auto A_panel_k_la = A.sub(k_la, i_end_la, k_la, k_la);
                         std::vector< int64_t > first_indices_k_la;
-                        geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
+                        internal::geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
                         if (first_indices.size() > 0) {
                             for (int64_t row : first_indices_k_la) {
                                 if (Tlocal.tileIsLocal(row, k_la)) {
diff --git a/src/internal/internal_util.hh b/src/internal/internal_util.hh
index e7b8241d5..ba2671dd3 100644
--- a/src/internal/internal_util.hh
+++ b/src/internal/internal_util.hh
@@ -10,6 +10,7 @@
 #define SLATE_INTERNAL_UTIL_HH
 
 #include "slate/internal/mpi.hh"
+#include "slate/Matrix.hh"
 
 #include <cmath>
 #include <complex>
@@ -69,6 +70,45 @@ inline bool compareSecond(
     return a.second < b.second;
 }
 
+//------------------------------------------------------------------------------
+/// An auxiliary routine to find each rank's first (top-most) row
+/// in panel k.
+///
+/// @param[in] A_panel
+///     Current panel, which is a sub of the input matrix $A$.
+///
+/// @param[in] k
+///     Index of the current panel in the input matrix $A$.
+///
+/// @param[out] first_indices
+///     The array of computed indices.
+///
+/// @ingroup geqrf_impl
+///
+template <typename scalar_t>
+void geqrf_compute_first_indices(
+    Matrix<scalar_t>& A_panel, int64_t k,
+    std::vector< int64_t >& first_indices )
+{
+    // Find ranks in this column.
+    std::set<int> ranks_set;
+    A_panel.getRanks(&ranks_set);
+    assert(ranks_set.size() > 0);
+
+    // Find each rank's first (top-most) row in this panel,
+    // where the triangular tile resulting from local geqrf panel
+    // will reside.
+    first_indices.reserve(ranks_set.size());
+    for (int r: ranks_set) {
+        for (int64_t i = 0; i < A_panel.mt(); ++i) {
+            if (A_panel.tileRank(i, 0) == r) {
+                first_indices.push_back(i+k);
+                break;
+            }
+        }
+    }
+}
+
 } // namespace internal
 } // namespace slate
 

From 8ce87db66fb78b65b9187ceb951d00d0a32736ab Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Thu, 9 Feb 2023 09:58:53 -0500
Subject: [PATCH 47/49] fix flops count

---
 test/test_qdwh.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index 7cdb460f9..f12f7fd44 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -162,8 +162,8 @@ void test_qdwh_work(Params& params, bool run)
         //                              lapack::Gflop<scalar_t>::unmqr(slate::Side::Left, m+n, n, n) +
         //                              blas::Gflop<scalar_t>::gemm(m, n, n) );
         // when take advantage of the matrix structure, the flops count will be reduced:
-        double gflop_itqr  = itqr * ( 2. * double(m*n*n) +
-                                      2. * double(m*n*n) +
+        double gflop_itqr  = itqr * ( 1e-9 * 2. * double(m*n*n) +
+                                      1e-9 * 2. * double(m*n*n) +
                                       blas::Gflop<scalar_t>::gemm(m, n, n) );
 
         double gflop_itpo  = itpo * ( blas::Gflop<scalar_t>::gemm(m, n, n) +

From 34e0afa0071075d7b3d280faa2f3ca83db85f9a7 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Fri, 31 Mar 2023 12:43:42 -0400
Subject: [PATCH 48/49] Delete un needed matrix allocation

---
 test/test_qdwh.cc | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
index f12f7fd44..7e8d3a214 100644
--- a/test/test_qdwh.cc
+++ b/test/test_qdwh.cc
@@ -81,11 +81,6 @@ void test_qdwh_work(Params& params, bool run)
     int64_t lldH  = blas::max(1, mlocH); // local leading dimension of H
     std::vector<scalar_t> H_data;
 
-    // matrix QR, for checking result
-    slate::Matrix<scalar_t> Id;
-    Id = slate::Matrix<scalar_t>(n, n, nb, p, q, MPI_COMM_WORLD);
-    Id.insertLocalTiles();
-
     slate::Matrix<scalar_t> A;
     slate::Matrix<scalar_t> H;
     //slate::HermitianMatrix<scalar_t> H;
@@ -185,20 +180,6 @@ void test_qdwh_work(Params& params, bool run)
     }
 
     if (check) {
-        //==================================================
-        // Test results by checking the orthogonality of U factor
-        //
-        //      || A'A - I ||_f
-        //     ---------------- < tol * epsilon
-        //            n
-        //
-        //==================================================
-        auto AT = conj_transpose(A);
-        set(zero, one, Id);
-        slate::gemm(one, AT, A, -one, Id, opts);
-        real_t orth = slate::norm(slate::Norm::Fro, Id);
-        params.ortho() = orth / sqrt(n);
-
         //==================================================
         // Test results by checking backwards error
         //
@@ -215,6 +196,20 @@ void test_qdwh_work(Params& params, bool run)
         real_t berr = slate::norm(slate::Norm::Fro, Aref, opts);
         params.error() = berr / normA;
 
+        //==================================================
+        // Test results by checking the orthogonality of U factor
+        //
+        //      || A'A - I ||_f
+        //     ---------------- < tol * epsilon
+        //            n
+        //
+        //==================================================
+        auto AT = conj_transpose(A);
+        set(zero, one, Aref);
+        slate::gemm(one, AT, A, -one, Aref, opts);
+        real_t orth = slate::norm(slate::Norm::Fro, Aref);
+        params.ortho() = orth / sqrt(n);
+
         real_t tol = params.tol() * 0.5 * std::numeric_limits<real_t>::epsilon();
         params.okay() = ((params.error() <= tol) && (params.ortho() <= tol));
 

From d25e4198d7cb7e8122cfa49844d659986aa48800 Mon Sep 17 00:00:00 2001
From: Dalal Sukkari <sukkari@icl.utk.edu>
Date: Mon, 10 Apr 2023 10:00:48 -0400
Subject: [PATCH 49/49] Changed device_malloc and used gemmC in norm2est (for
 now)

---
 src/geqrf_qdwh_full.cc | 14 ++++++--------
 src/norm2est.cc        |  6 +++---
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc
index 1dafed1a7..c341104a0 100644
--- a/src/geqrf_qdwh_full.cc
+++ b/src/geqrf_qdwh_full.cc
@@ -108,9 +108,7 @@ void geqrf_qdwh_full(
 
         if (panel_device >= 0) {
 
-            blas::set_device( panel_device );
-
-            lapack::Queue* queue = A.compute_queue(panel_device, 0);
+            lapack::Queue* comm_queue = A.comm_queue(panel_device);
 
             int64_t nb       = A.tileNb(0);
             size_t  size_tau = (size_t) std::min( mlocal, nb );
@@ -119,15 +117,15 @@ void geqrf_qdwh_full(
 
             // Find size of the workspace needed
             lapack::geqrf_work_size_bytes( mlocal, nb, dwork_array[0], mlocal,
-                                           &dsize, &hsize, *queue );
+                                           &dsize, &hsize, *comm_queue );
 
             // Size of dA, dtau, dwork and dinfo
             work_size = size_A + size_tau + ceildiv(dsize, sizeof(scalar_t))
                         + ceildiv(sizeof(device_info_t), sizeof(scalar_t));
 
             for (int64_t dev = 0; dev < num_devices; ++dev) {
-                blas::set_device(dev);
-                dwork_array[dev] = blas::device_malloc<scalar_t>(work_size);
+                lapack::Queue* queue = A.comm_queue( dev );
+                dwork_array[dev] = blas::device_malloc<scalar_t>(work_size, *queue);
             }
         }
     }
@@ -325,8 +323,8 @@ void geqrf_qdwh_full(
 
     if (target == Target::Devices) {
         for (int64_t dev = 0; dev < num_devices; ++dev) {
-            blas::set_device(dev);
-            blas::device_free( dwork_array[dev] );
+            blas::Queue* queue = A.comm_queue( dev );
+            blas::device_free( dwork_array[dev], *queue );
             dwork_array[dev] = nullptr;
         }
     }
diff --git a/src/norm2est.cc b/src/norm2est.cc
index 9d6cca8c7..f7b253ac2 100644
--- a/src/norm2est.cc
+++ b/src/norm2est.cc
@@ -106,7 +106,7 @@ norm2est(
         scale(r_one, normX, XL, opts);
 
         // Compute Ax = A * sx
-        gemm(one, A, XL, zero, AX, opts);
+        gemmC(one, A, XL, zero, AX, opts);
 
         // todo: still need to add the following
         //if nnz(Sx) == 0
@@ -114,11 +114,11 @@ norm2est(
         //end
 
         // Compute x = A' * A * x = A' * Ax
-        auto AT = conjTranspose(A);
+        auto AT = conj_transpose(A);
 
         // todo: why this set is needed when using multiple mpi rank and using gemmA
         set(zero, zero, XL);
-        gemm(one, AT, AX, zero, XL, opts);
+        gemmC(one, AT, AX, zero, XL, opts);
 
         // Compute ||X||, ||AX||
         normX  = norm(Norm::Fro, XL, opts);