diff --git a/GNUmakefile b/GNUmakefile
index 8649f065f..7773cc72e 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -508,6 +508,7 @@ ifneq ($(only_unit),1)
 		src/gemmA.cc \
 		src/gemmC.cc \
 		src/geqrf.cc \
+		src/geqrf_qdwh_full.cc \
 		src/gesv.cc \
 		src/gesvMixed.cc \
 		src/gesv_nopiv.cc \
@@ -534,6 +535,7 @@ ifneq ($(only_unit),1)
 		src/hetrf.cc \
 		src/hetrs.cc \
 		src/norm.cc \
+		src/norm2est.cc \
 		src/pbsv.cc \
 		src/pbtrf.cc \
 		src/pbtrs.cc \
@@ -543,6 +545,7 @@ ifneq ($(only_unit),1)
 		src/potri.cc \
 		src/potrs.cc \
 		src/print.cc \
+		src/qdwh.cc \
 		src/scale.cc \
 		src/scale_row_col.cc \
 		src/set.cc \
@@ -565,6 +568,7 @@ ifneq ($(only_unit),1)
 		src/unmqr.cc \
 		src/unmtr_hb2st.cc \
 		src/unmtr_he2hb.cc \
+		src/unmqr_qdwh_full.cc \
 		src/work/work_trmm.cc \
 		src/work/work_trsm.cc \
 		src/work/work_trsmA.cc \
@@ -635,6 +639,7 @@ tester_src += \
 		test/test_pbsv.cc \
 		test/test_posv.cc \
 		test/test_potri.cc \
+		test/test_qdwh.cc \
 		test/test_scale.cc \
 		test/test_scale_row_col.cc \
 		test/test_set.cc \
diff --git a/docs/doxygen/groups.dox b/docs/doxygen/groups.dox
index 6cb84a95e..dcbc88538 100644
--- a/docs/doxygen/groups.dox
+++ b/docs/doxygen/groups.dox
@@ -145,6 +145,14 @@
         @defgroup svd_specialization Target implementations
     @}
 
+    ------------------------------------------------------------
+    @defgroup group_qdwh Polar Decomposition
+    @{
+        @defgroup qdwh Driver
+        @brief $A = U H$
+
+    @}
+
     ------------------------------------------------------------
     @defgroup group_blas2_top Level 2 BLAS and Auxiliary: O(n^2) work
     @brief Matrix and Matrix-vector operations that perform $O(n^2)$ work on $O(n^2)$ data.
diff --git a/include/slate/slate.hh b/include/slate/slate.hh
index d44a17562..d437f849d 100644
--- a/include/slate/slate.hh
+++ b/include/slate/slate.hh
@@ -1107,6 +1107,42 @@ void trcondest(
     blas::real_type<scalar_t>* rcond,
     Options const& opts = Options());
 
+//-----------------------------------------
+// Polar Decomposition
+
+//-----------------------------------------
+// qdwh()
+template <typename scalar_t>
+void qdwh(
+    Matrix<scalar_t>& A,
+    Matrix<scalar_t>& H,
+    int& itqr, int& itpo,
+    Options const& opts = Options());
+
+//-----------------------------------------
+// norm2est()
+template <typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+norm2est(
+    matrix_type& A,
+    Options const& opts = Options());
+
+//-----------------------------------------
+// geqrf_qdwh_full()
+template <typename scalar_t>
+void geqrf_qdwh_full(
+    Matrix<scalar_t>& A, TriangularFactors<scalar_t>& T,
+    Options const& opts = Options());
+
+//-----------------------------------------
+// unmqr_qdwh_full()
+template <typename scalar_t>
+void unmqr_qdwh_full(
+    Side side, Op op,
+    Matrix<scalar_t>& A, TriangularFactors<scalar_t>& T,
+    Matrix<scalar_t>& C,
+    Options const& opts = Options());
+
 } // namespace slate
 
 //-----------------------------------------
diff --git a/src/geqrf.cc b/src/geqrf.cc
index 399202201..4fe6328c8 100644
--- a/src/geqrf.cc
+++ b/src/geqrf.cc
@@ -7,50 +7,12 @@
 #include "auxiliary/Debug.hh"
 #include "slate/Matrix.hh"
 #include "internal/internal.hh"
+#include "internal/internal_util.hh"
 
 namespace slate {
 
 namespace impl {
 
-//------------------------------------------------------------------------------
-/// An auxiliary routine to find each rank's first (top-most) row
-/// in panel k.
-///
-/// @param[in] A_panel
-///     Current panel, which is a sub of the input matrix $A$.
-///
-/// @param[in] k
-///     Index of the current panel in the input matrix $A$.
-///
-/// @param[out] first_indices
-///     The array of computed indices.
-///
-/// @ingroup geqrf_impl
-///
-template <typename scalar_t>
-void geqrf_compute_first_indices(
-    Matrix<scalar_t>& A_panel, int64_t k,
-    std::vector< int64_t >& first_indices )
-{
-    // Find ranks in this column.
-    std::set<int> ranks_set;
-    A_panel.getRanks(&ranks_set);
-    assert(ranks_set.size() > 0);
-
-    // Find each rank's first (top-most) row in this panel,
-    // where the triangular tile resulting from local geqrf panel
-    // will reside.
-    first_indices.reserve(ranks_set.size());
-    for (int r: ranks_set) {
-        for (int64_t i = 0; i < A_panel.mt(); ++i) {
-            if (A_panel.tileRank(i, 0) == r) {
-                first_indices.push_back(i+k);
-                break;
-            }
-        }
-    }
-}
-
 //------------------------------------------------------------------------------
 /// Distributed parallel QR factorization.
 /// Generic implementation for any target.
@@ -185,7 +147,7 @@ void geqrf(
             auto Tr_panel = Treduce.sub(k, A_mt-1, k, k);
 
             std::vector< int64_t > first_indices;
-            geqrf_compute_first_indices(A_panel, k, first_indices);
+            internal::geqrf_compute_first_indices(A_panel, k, first_indices);
             // todo: pass first_indices into internal geqrf or ttqrt?
 
             // panel, high priority
@@ -328,7 +290,7 @@ void geqrf(
 
                         auto A_panel_k_la = A.sub(k_la, A_mt-1, k_la, k_la);
                         std::vector< int64_t > first_indices_k_la;
-                        geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
+                        internal::geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
                         if (first_indices.size() > 0) {
                             for (int64_t row : first_indices_k_la) {
                                 if (Tlocal.tileIsLocal(row, k_la)) {
diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc
new file mode 100644
index 000000000..c341104a0
--- /dev/null
+++ b/src/geqrf_qdwh_full.cc
@@ -0,0 +1,449 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "auxiliary/Debug.hh"
+#include "slate/Matrix.hh"
+#include "internal/internal.hh"
+#include "internal/internal_util.hh"
+
+namespace slate {
+
+namespace impl {
+
+//------------------------------------------------------------------------------
+/// Distributed parallel QR factorization.
+/// Generic implementation for any target.
+/// Panel and lookahead computed on host using Host OpenMP task.
+///
+/// ColMajor layout is assumed.
+///
+/// @ingroup geqrf_impl
+///
+template <Target target, typename scalar_t>
+void geqrf_qdwh_full(
+    Matrix<scalar_t>& A,
+    TriangularFactors<scalar_t>& T,
+    Options const& opts )
+{
+    using BcastList = typename Matrix<scalar_t>::BcastList;
+    using device_info_t = lapack::device_info_int;
+    using blas::real;
+
+    // Assumes column major
+    const Layout layout = Layout::ColMajor;
+
+    const int priority_zero = 0;
+    const int priority_one = 1;
+    const int life_factor_one = 1;
+
+    // Options
+    int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );
+    int64_t ib = get_option<int64_t>( opts, Option::InnerBlocking, 16 );
+    int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1);
+    max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads,
+                                             max_panel_threads );
+
+    bool set_hold = lookahead > 0;  // Do tileGetAndHold in the bcast
+
+    int64_t A_mt = A.mt();
+    int64_t A_nt = A.nt();
+    int64_t A_min_mtnt = std::min(A_mt, A_nt);
+
+    T.clear();
+    T.push_back(A.emptyLike());
+    T.push_back(A.emptyLike(ib, 0));
+    auto Tlocal  = T[0];
+    auto Treduce = T[1];
+
+    // workspace
+    auto W = A.emptyLike();
+
+    // setting up dummy variables for the case when target == host
+    int64_t num_devices = A.num_devices();
+    int     panel_device = -1;
+    size_t  work_size = 0;
+
+    std::vector< scalar_t* > dwork_array( num_devices, nullptr );
+
+    if (target == Target::Devices) {
+        const int64_t batch_size_zero = 0;  // use default batch size
+        const int64_t num_queues = 3 + lookahead;
+        A.allocateBatchArrays(batch_size_zero, num_queues);
+        A.reserveDeviceWorkspace();
+        W.allocateBatchArrays(batch_size_zero, num_queues);
+        // todo: this is demanding too much device workspace memory;
+        // only one tile-row of matrix W per MPI process is going to be used,
+        // but W with the size of the whole A is being allocated,
+        // thus limiting the matrix size that can be processed.
+        // For now, allocate workspace tiles 1-by-1.
+        //W.reserveDeviceWorkspace();
+
+        // Find largest panel size and device for copying to
+        // contiguous memory within internal geqrf routine
+        int64_t mlocal = 0;
+        int64_t first_panel_seen = -1;
+        for (int64_t j = 0; j < A.nt(); ++j) {
+            for (int64_t i = j; i < A.mt(); ++i) {
+                if (A.tileIsLocal(i, j)) {
+                    if (first_panel_seen < 0) {
+                        first_panel_seen = j;
+                    }
+                    if (first_panel_seen == j) {
+                        if (panel_device < 0) {
+                            panel_device = A.tileDevice(i, j);
+                        }
+                        mlocal += A.tileMb(i);
+                        // Asserting 1-D distribution for device
+                        assert( panel_device == A.tileDevice(i, j) );
+                    }
+                }
+            }
+            if (first_panel_seen >= 0) {
+                break;
+            }
+        }
+
+        if (panel_device >= 0) {
+
+            lapack::Queue* comm_queue = A.comm_queue(panel_device);
+
+            int64_t nb       = A.tileNb(0);
+            size_t  size_tau = (size_t) std::min( mlocal, nb );
+            size_t  size_A   = (size_t) blas::max( 1, mlocal ) * nb;
+            size_t  hsize, dsize;
+
+            // Find size of the workspace needed
+            lapack::geqrf_work_size_bytes( mlocal, nb, dwork_array[0], mlocal,
+                                           &dsize, &hsize, *comm_queue );
+
+            // Size of dA, dtau, dwork and dinfo
+            work_size = size_A + size_tau + ceildiv(dsize, sizeof(scalar_t))
+                        + ceildiv(sizeof(device_info_t), sizeof(scalar_t));
+
+            for (int64_t dev = 0; dev < num_devices; ++dev) {
+                lapack::Queue* queue = A.comm_queue( dev );
+                dwork_array[dev] =
+                    blas::device_malloc<scalar_t>(work_size, *queue);
+            }
+        }
+    }
+
+    // QR tracks dependencies by block-column.
+    // OpenMP needs pointer types, but vectors are exception safe.
+    std::vector< uint8_t > block_vector(A_nt);
+    uint8_t* block = block_vector.data();
+
+    // set min number for omp nested active parallel regions
+    slate::OmpSetMaxActiveLevels set_active_levels( MinOmpActiveLevels );
+
+    #pragma omp parallel
+    #pragma omp master
+    {
+        for (int64_t k = 0; k < A_min_mtnt; ++k) {
+            int64_t i_end = A_mt - A_nt + k;
+
+            auto A_panel  = A.sub(k, i_end, k, k);
+            auto Tl_panel = Tlocal.sub(k, i_end, k, k);
+            auto Tr_panel = Treduce.sub(k, i_end, k, k);
+
+            std::vector< int64_t > first_indices;
+            internal::geqrf_compute_first_indices(A_panel, k, first_indices);
+            // todo: pass first_indices into internal geqrf or ttqrt?
+
+            // panel, high priority
+            #pragma omp task depend(inout:block[k]) priority(priority_one)
+            {
+                // local panel factorization
+                internal::geqrf<target>(
+                    std::move(A_panel),
+                    std::move(Tl_panel),
+                    dwork_array, work_size,
+                    ib, max_panel_threads, priority_one);
+
+                // triangle-triangle reductions
+                // ttqrt handles tile transfers internally
+                internal::ttqrt<Target::HostTask>(
+                    std::move(A_panel),
+                    std::move(Tr_panel));
+
+                // if a trailing matrix exists
+                if (k < A_nt-1) {
+
+                    // bcast V across row for trailing matrix update
+                    if (k < i_end) {
+                        BcastList bcast_list_V_first;
+                        BcastList bcast_list_V;
+                        for (int64_t i = k; i < i_end+1; ++i) {
+                            // send A(i, k) across row A(i, k+1:nt-1);
+                            // Vs in first_indices (except the main diagonal one)
+                            // need three lives
+                            if ((std::find(first_indices.begin(), first_indices.end(), i) != first_indices.end()) && (i > k))
+                                bcast_list_V_first.push_back(
+                                    {i, k, {A.sub(i, i, k+1, A_nt-1)}});
+                            else
+                                bcast_list_V.push_back(
+                                    {i, k, {A.sub(i, i, k+1, A_nt-1)}});
+                        }
+                        A.template listBcast<target>(bcast_list_V_first, layout, 0, 3, set_hold);
+                        A.template listBcast<target>(bcast_list_V, layout, 0, 2, set_hold);
+                    }
+
+                    // bcast Tlocal across row for trailing matrix update
+                    if (first_indices.size() > 0) {
+                        BcastList bcast_list_T;
+                        for (int64_t row : first_indices) {
+                            bcast_list_T.push_back(
+                                {row, k, {Tlocal.sub(row, row, k+1, A_nt-1)}});
+                        }
+                        Tlocal.template listBcast<target>(bcast_list_T, layout, k, life_factor_one, set_hold);
+                    }
+
+                    // bcast Treduce across row for trailing matrix update
+                    if (first_indices.size() > 1) {
+                        BcastList bcast_list_T;
+                        for (int64_t row : first_indices) {
+                            // exclude the first row of this panel, which has no Treduce tile
+                            if (row > k)
+                                bcast_list_T.push_back(
+                                    {row, k, {Treduce.sub(row, row, k+1, A_nt-1)}});
+                        }
+                        Treduce.template listBcast<target>(bcast_list_T, layout);
+                    }
+                }
+            }
+
+            // update lookahead column(s) on CPU, high priority
+            for (int64_t j = k+1; j < (k+1+lookahead) && j < A_nt; ++j) {
+
+                auto A_trail_j = A.sub(k, i_end, j, j);
+
+                #pragma omp task depend(in:block[k]) \
+                                 depend(inout:block[j]) \
+                                 priority(priority_one)
+                {
+                    // Apply local reflectors
+                    internal::unmqr<target>(
+                        Side::Left, Op::ConjTrans,
+                        std::move(A_panel),
+                        std::move(Tl_panel),
+                        std::move(A_trail_j),
+                        W.sub(k, i_end, j, j),
+                        priority_one, j-k+1);
+
+                    // Apply triangle-triangle reduction reflectors;
+                    // ttmqr handles the tile broadcasting internally
+                    internal::ttmqr<Target::HostTask>(
+                        Side::Left, Op::ConjTrans,
+                        std::move(A_panel),
+                        std::move(Tr_panel),
+                        std::move(A_trail_j),
+                        j);
+                }
+            }
+
+            // update trailing submatrix, normal priority
+            if (k+1+lookahead < A_nt) {
+                int64_t j = k+1+lookahead;
+                auto A_trail_j = A.sub(k, i_end, j, A_nt-1);
+
+                #pragma omp task depend(in:block[k]) \
+                                 depend(inout:block[k+1+lookahead]) \
+                                 depend(inout:block[A_nt-1])
+                {
+                    // Apply local reflectors.
+                    internal::unmqr<target>(
+                        Side::Left, Op::ConjTrans,
+                        std::move(A_panel),
+                        std::move(Tl_panel),
+                        std::move(A_trail_j),
+                        W.sub(k, i_end, j, A_nt-1),
+                        priority_zero, j-k+1);
+
+                    // Apply triangle-triangle reduction reflectors.
+                    // ttmqr handles the tile broadcasting internally.
+                    internal::ttmqr<Target::HostTask>(
+                        Side::Left, Op::ConjTrans,
+                        std::move(A_panel),
+                        std::move(Tr_panel),
+                        std::move(A_trail_j),
+                        j);
+                }
+            }
+            if (target == Target::Devices) {
+                // Update the status of the on-hold tiles held by the invocation of
+                // the tileBcast routine, and then release them to free up memory.
+                // The origin must be updated with the latest modified copy
+                // for memory consistency.
+                // TODO: Find better solution to handle tile release, and
+                // investigate the correctness of the task dependency
+                if (k >= lookahead && k < A_nt-1) {
+                    #pragma omp task depend(in:block[k]) \
+                                     depend(inout:block[k+1])
+                    {
+                        int64_t k_la = k-lookahead;
+                        int64_t i_end_la = A_min_mtnt + k_la;
+                        for (int64_t i = k_la; i < i_end_la; ++i) {
+                            if (A.tileIsLocal(i, k_la)) {
+                                A.tileUpdateOrigin(i, k_la);
+
+                                std::set<int> dev_set;
+                                A.sub(i, i, k_la+1, A_nt-1).getLocalDevices(&dev_set);
+
+                                for (auto device : dev_set) {
+                                    A.tileUnsetHold(i, k_la, device);
+                                    A.tileRelease(i, k_la, device);
+                                }
+                            }
+                        }
+
+                        auto A_panel_k_la = A.sub(k_la, i_end_la, k_la, k_la);
+                        std::vector< int64_t > first_indices_k_la;
+                        internal::geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
+                        if (first_indices.size() > 0) {
+                            for (int64_t row : first_indices_k_la) {
+                                if (Tlocal.tileIsLocal(row, k_la)) {
+                                    Tlocal.tileUpdateOrigin(row, k_la);
+
+                                    std::set<int> dev_set;
+                                    Tlocal.sub(row, row, k_la+1, A_nt-1).getLocalDevices(&dev_set);
+
+                                    for (auto device : dev_set) {
+                                        Tlocal.tileUnsetHold(row, k_la, device);
+                                        Tlocal.tileRelease(row, k_la, device);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        #pragma omp taskwait
+        A.tileUpdateAllOrigin();
+    }
+
+    A.releaseWorkspace();
+
+    if (target == Target::Devices) {
+        for (int64_t dev = 0; dev < num_devices; ++dev) {
+            blas::Queue* queue = A.comm_queue( dev );
+            blas::device_free( dwork_array[dev], *queue );
+            dwork_array[dev] = nullptr;
+        }
+    }
+}
+
+} // namespace impl
+
+//------------------------------------------------------------------------------
+/// Distributed parallel customized QR factorization.
+/// Required for the QR-based iterations in the polar decomposition QDWH.
+///
+/// Computes a QR factorization of an m-by-n matrix $A$, $m \ge 2n$,
+/// and takes advantage of the trailing identity matrix structure:
+///     A = [ A0 ]  full matrix (m0-by-n, where m0 = m - n, which is the size
+///                 of the original matrix in qdwh)
+///         [ A1 ]  identity matrix (n-by-n)
+/// Avoids doing computations on the zero tiles below the diagonal of $A1$.
+/// The factorization has the form
+/// \[
+///     A = QR,
+/// \]
+/// where $Q$ is a matrix with orthonormal columns and $R$ is upper triangular
+/// (or upper trapezoidal if m < n).
+///
+/// Complexity (in real arithmetic):
+/// The matrix $A$ has $m = m0 + n$ rows.
+/// Since the lower n-by-n block of $A$ is the identity, the $k$-th
+/// Householder vector has $(m0 + k) - k + 1 = m0 + 1$ rows.
+/// - for $m \ge n$:
+///   $\sum_{k=1}^{n-1} 4 (n - k) (m0 + 1) \approx 2 m0 n^{2} + 2 m0 n + n^{2}$ flops;
+/// .
+//------------------------------------------------------------------------------
+/// @tparam scalar_t
+///     One of float, double, std::complex<float>, std::complex<double>.
+//------------------------------------------------------------------------------
+/// @param[in,out] A
+///     On entry, the m-by-n matrix $A$, $m \ge 2n$, where
+///         A = [ A0 ]  full matrix (m0-by-n, where m0 = m - n)
+///             [ A1 ]  identity matrix (n-by-n)
+///
+///     On exit, the elements on and above the diagonal of the array contain
+///     the min(m,n)-by-n upper trapezoidal matrix $R$ (upper triangular
+///     if m >= n); the elements below the diagonal represent the unitary
+///     matrix $Q$ as a product of elementary reflectors.
+///
+/// @param[out] T
+///     On exit, triangular matrices of the block reflectors.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Lookahead:
+///       Number of panels to overlap with matrix updates.
+///       lookahead >= 0. Default 1.
+///     - Option::InnerBlocking:
+///       Inner blocking to use for panel. Default 16.
+///     - Option::MaxPanelThreads:
+///       Number of threads to use for panel. Default omp_get_max_threads()/2.
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - HostBatch: batched BLAS on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup geqrf_computational
+///
+template <typename scalar_t>
+void geqrf_qdwh_full(
+    Matrix<scalar_t>& A,
+    TriangularFactors<scalar_t>& T,
+    Options const& opts)
+{
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    switch (target) {
+        case Target::Host:
+        case Target::HostTask:
+            impl::geqrf_qdwh_full<Target::HostTask>( A, T, opts );
+            break;
+
+        case Target::HostNest:
+            impl::geqrf_qdwh_full<Target::HostNest>( A, T, opts );
+            break;
+
+        case Target::HostBatch:
+            impl::geqrf_qdwh_full<Target::HostBatch>( A, T, opts );
+            break;
+
+        case Target::Devices:
+            impl::geqrf_qdwh_full<Target::Devices>( A, T, opts );
+            break;
+    }
+    // todo: return value for errors?
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void geqrf_qdwh_full<float>(
+    Matrix<float>& A,
+    TriangularFactors<float>& T,
+    Options const& opts);
+
+template
+void geqrf_qdwh_full<double>(
+    Matrix<double>& A,
+    TriangularFactors<double>& T,
+    Options const& opts);
+
+template
+void geqrf_qdwh_full< std::complex<float> >(
+    Matrix< std::complex<float> >& A,
+    TriangularFactors< std::complex<float> >& T,
+    Options const& opts);
+
+template
+void geqrf_qdwh_full< std::complex<double> >(
+    Matrix< std::complex<double> >& A,
+    TriangularFactors< std::complex<double> >& T,
+    Options const& opts);
+
+} // namespace slate
diff --git a/src/internal/internal_util.hh b/src/internal/internal_util.hh
index e7b8241d5..ba2671dd3 100644
--- a/src/internal/internal_util.hh
+++ b/src/internal/internal_util.hh
@@ -10,6 +10,7 @@
 #define SLATE_INTERNAL_UTIL_HH
 
 #include "slate/internal/mpi.hh"
+#include "slate/Matrix.hh"
 
 #include <cmath>
 #include <complex>
@@ -69,6 +70,45 @@ inline bool compareSecond(
     return a.second < b.second;
 }
 
+//------------------------------------------------------------------------------
+/// An auxiliary routine to find each rank's first (top-most) row
+/// in panel k.
+///
+/// @param[in] A_panel
+///     Current panel, which is a sub of the input matrix $A$.
+///
+/// @param[in] k
+///     Index of the current panel in the input matrix $A$.
+///
+/// @param[out] first_indices
+///     The array of computed indices.
+///
+/// @ingroup geqrf_impl
+///
+template <typename scalar_t>
+void geqrf_compute_first_indices(
+    Matrix<scalar_t>& A_panel, int64_t k,
+    std::vector< int64_t >& first_indices )
+{
+    // Find ranks in this column.
+    std::set<int> ranks_set;
+    A_panel.getRanks(&ranks_set);
+    assert(ranks_set.size() > 0);
+
+    // Find each rank's first (top-most) row in this panel,
+    // where the triangular tile resulting from local geqrf panel
+    // will reside.
+    first_indices.reserve(ranks_set.size());
+    for (int r: ranks_set) {
+        for (int64_t i = 0; i < A_panel.mt(); ++i) {
+            if (A_panel.tileRank(i, 0) == r) {
+                first_indices.push_back(i+k);
+                break;
+            }
+        }
+    }
+}
+
 } // namespace internal
 } // namespace slate
diff --git a/src/norm2est.cc b/src/norm2est.cc
new file mode 100644
index 000000000..f7b253ac2
--- /dev/null
+++ b/src/norm2est.cc
@@ -0,0 +1,204 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "internal/internal.hh"
+#include "internal/internal_util.hh"
+#include "slate/internal/mpi.hh"
+
+#include <cmath>
+#include <complex>
+
+namespace slate {
+
+namespace impl {
+
+//------------------------------------------------------------------------------
+/// @internal
+/// Distributed parallel general matrix 2-norm estimate.
+/// Generic implementation for any target.
+/// @ingroup norm_specialization
+///
+template <Target target, typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+norm2est(
+    matrix_type A,
+    Options const& opts)
+{
+    using scalar_t = typename matrix_type::value_type;
+    using real_t = blas::real_type<scalar_t>;
+    using blas::min;
+
+    // Constants
+    const scalar_t one = 1.;
+    const scalar_t zero = 0.;
+    const real_t r_one = 1.;
+    const int izero = 0;
+
+    int64_t n = A.n();
+    int64_t m = A.m();
+    int64_t mb = A.tileMb(0);
+    int64_t nb = A.tileNb(0);
+
+    int p, q;
+    int myrow, mycol;
+    GridOrder order;
+    A.gridinfo(&order, &p, &q, &myrow, &mycol);
+
+    int64_t cnt = 0;
+    int64_t maxiter = min( 100, n );
+    int64_t mloc = numberLocalRowOrCol(m, mb, myrow, izero, p);
+    int64_t lldA = blas::max(1, mloc);
+
+    real_t e  = 0.;
+    real_t e0 = 0.;
+    real_t normX  = 0;
+    real_t normAX = 0;
+    real_t tol = 1.e-1;
+
+    if (target == Target::Devices)
+        A.reserveDeviceWorkspace();
+
+    std::vector<real_t> local_sums(n);
+    std::vector<scalar_t> global_sums(n);
+    std::vector<scalar_t> W1(lldA);
+
+    auto XL = slate::Matrix<scalar_t>::fromScaLAPACK(
+        n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm());
+    auto AX = slate::Matrix<scalar_t>::fromScaLAPACK(
+        m, 1, &W1[0], lldA, mb, 1, p, q, A.mpiComm());
+
+    // Two-norm estimation.
+    // First: compute the x vector such that
+    // x_j = sum_i abs( A_{i,j} ); x here is global_sums.data
+    #pragma omp parallel
+    #pragma omp master
+    {
+        internal::norm<target>(slate::Norm::One, NormScope::Matrix, std::move(A), local_sums.data());
+    }
+
+    // global_sums.data() will have the sum of each column.
+    #pragma omp critical(slate_mpi)
+    {
+        trace::Block trace_block("MPI_Allreduce");
+        slate_mpi_call(
+            MPI_Allreduce(local_sums.data(), global_sums.data(),
+                          n, mpi_type<real_t>::value,
+                          MPI_SUM, A.mpiComm()));
+    }
+
+    // Second: compute ||x||_Fro
+    e = slate::norm(Norm::Fro, XL, opts);
+    if (e == 0.) {
+        return 0.;
+    }
+    else {
+        normX = e;
+    }
+
+    // Third: start the while-loop X = X / ||X||
+    while ((cnt < maxiter) && (std::abs(e - e0) > (tol * e))) {
+        e0 = e;
+
+        // Scale X = X / ||X||
+        scale(r_one, normX, XL, opts);
+
+        // Compute AX = A * X
+        gemmC(one, A, XL, zero, AX, opts);
+
+        // todo: still need to add the following
+        //if nnz(Sx) == 0
+        //    Sx = rand(size(Sx),class(Sx));
+        //end
+
+        // Compute x = A' * A * x = A' * AX
+        auto AT = conj_transpose(A);
+
+        // todo: why is this set needed when using multiple MPI ranks and gemmA?
+        set(zero, zero, XL);
+        gemmC(one, AT, AX, zero, XL, opts);
+
+        // Compute ||X||, ||AX||
+        normX  = norm(Norm::Fro, XL, opts);
+        normAX = norm(Norm::Fro, AX, opts);
+
+        // Update e
+        e = normX / normAX;
+        cnt++;
+    }
+
+    A.clearWorkspace();
+
+    return e;
+}
+
+} // namespace impl
+
+//------------------------------------------------------------------------------
+/// Distributed parallel general matrix 2-norm estimate.
+///
+//------------------------------------------------------------------------------
+/// @tparam matrix_type
+///     Any SLATE matrix type: Matrix, SymmetricMatrix, HermitianMatrix,
+///     TriangularMatrix, etc.
+//------------------------------------------------------------------------------
+/// @param[in] A
+///     The matrix A.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup norm
+///
+template <typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+norm2est(matrix_type& A,
+         Options const& opts)
+{
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    switch (target) {
+        case Target::Host:
+        case Target::HostTask:
+        case Target::HostNest:
+        case Target::HostBatch:
+            return impl::norm2est<Target::HostTask>( A, opts );
+            break;
+        case Target::Devices:
+            return impl::norm2est<Target::Devices>( A, opts );
+            break;
+    }
+    throw std::exception();  // todo: invalid target
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+float norm2est(
+    Matrix<float>& A,
+    Options const& opts);
+
+template
+double norm2est(
+    Matrix<double>& A,
+    Options const& opts);
+
+template
+float norm2est(
+    Matrix< std::complex<float> >& A,
+    Options const& opts);
+
+template
+double norm2est(
+    Matrix< std::complex<double> >& A,
+    Options const& opts);
+
+} // namespace slate
diff --git a/src/qdwh.cc b/src/qdwh.cc
new file mode 100644
index 000000000..4582840ac
--- /dev/null
+++ b/src/qdwh.cc
@@ -0,0 +1,367 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "auxiliary/Debug.hh"
+#include "slate/Matrix.hh"
+#include "internal/internal.hh"
+
+namespace slate {
+
+//------------------------------------------------------------------------------
+/// Distributed parallel polar decomposition based on the QDWH algorithm.
+///
+/// Computes a polar decomposition of an m-by-n matrix $A$.
+/// The factorization has the form
+/// \[
+///     A = UH,
+/// \]
+/// where $U$ is a matrix with orthonormal columns and $H$ is a Hermitian
+/// matrix.
+///
+//------------------------------------------------------------------------------
+/// @tparam scalar_t
+///     One of float, double, std::complex<float>, std::complex<double>.
+//------------------------------------------------------------------------------
+/// @param[in,out] A
+///     On entry, the m-by-n matrix $A$.
+///     On exit, if return value = 0, the orthogonal polar factor $U$
+///     from the decomposition $A = U H$.
+///
+/// @param[out] H
+///     On exit, if return value = 0, the Hermitian polar factor matrix $H$
+///     from the decomposition $A = U H$.
+///
+/// @param[out] itqr
+///     The number of QR-based iterations.
+///
+/// @param[out] itpo
+///     The number of Cholesky-based iterations.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Lookahead:
+///       Number of panels to overlap with matrix updates.
+///       lookahead >= 0. Default 1.
+///     - Option::InnerBlocking:
+///       Inner blocking to use for panel. Default 16.
+///     - Option::MaxPanelThreads:
+///       Number of threads to use for panel. Default omp_get_max_threads()/2.
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - HostBatch: batched BLAS on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup qdwh
+///
+template <typename scalar_t>
+void qdwh(
+    Matrix<scalar_t>& A,
+    Matrix<scalar_t>& H,
+    int& itqr, int& itpo,
+    Options const& opts)
+{
+    // todo:
+    // optimizations:
+    // 1. reuse Q from first geqrf
+    // 2. avoid rounding m if m is not divisible by nb, because then geqrf/unmqr
+    //    have extra zero rows
+
+    using real_t = blas::real_type<scalar_t>;
+    using blas::real;
+
+    // Constants
+    const scalar_t zero = 0.0;
+    const scalar_t one  = 1.0;
+    const real_t r_one  = 1.0;
+
+    // Options
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    int64_t mt = A.mt();
+    int64_t nt = A.nt();
+    int64_t nb = A.tileMb(0);
+
+    int64_t m = A.m();
+    int64_t n = A.n();
+
+    if (m < n) {
+        return;
+    }
+
+    real_t eps  = std::numeric_limits<real_t>::epsilon();
+    real_t tol1 = 5. * eps;
+    real_t tol3 = pow(tol1, r_one/real_t(3.));
+
+    int itconv, it;
+    real_t normR;
+    real_t conv = 10.0;
+    scalar_t alpha, beta;
+    double L2, sqd, a1, a, b, c, dd;
+    real_t Li, Liconv;
+
+    // The QR-based iteration requires QR[A; Id],
+    // W = [A; Id], where the size of A is m-by-n and the size of Id is n-by-n.
+    // To avoid having a clean-up tile in the middle of the W matrix,
+    // we round up the number of A rows (m),
+    // so that size(W) = m_roundup + n
+    int64_t m_roundup = roundup( m, nb );
+    int64_t m_W  = m_roundup + n;
+    int64_t mt_W = mt + nt;
+
+    int nprow, npcol;
+    int myrow, mycol;
+    GridOrder order;
+    A.gridinfo(&order, &nprow, &npcol, &myrow, &mycol);
+
+    // allocate m_W*n workspace required for the QR-based iterations QR([A;Id])
+    slate::Matrix<scalar_t> W(m_W, n, nb, nprow, npcol, A.mpiComm());
+    slate::Matrix<scalar_t> Q(m_W, n, nb, nprow, npcol, A.mpiComm());
+    slate::TriangularFactors<scalar_t> T;
+
+    slate::Matrix<scalar_t> Acpy;
+    if (m == n) {
+        // if A is square, we can use H to save a copy of A;
+        // this will not work if H is stored as a Hermitian matrix
+        Acpy = H;
+    }
+    else {
+        Acpy = slate::Matrix<scalar_t>(m, n, nb, nprow, npcol, A.mpiComm());
+        Acpy.insertLocalTiles(target);
+    }
+
+    W.insertLocalTiles(target);
+    Q.insertLocalTiles(target);
+
+    // To compute QR[A; Id], allocate W and Q of size m_W-by-n.
+    // The first mt tiles of W contain the A matrix in [0:m-1, 0:n-1].
+    // W1 has more rows than A, since we round up the number of rows of W1.
+    auto W1 = W.sub(0, mt-1, 0, nt-1);
+    auto Q1 = Q.sub(0, mt-1, 0, nt-1);
+    // W10 has the A matrix
+    auto W10 = W1.slice(0, m-1, 0, n-1);
+    auto Q10 = Q1.slice(0, m-1, 0, n-1);
+    // W11 is the last tile row, which has more rows in case we round up m
+    auto W11 = W.sub(mt-1, mt-1, 0, nt-1);
+
+    // The second nt tiles of W contain the identity matrix
+    auto W2 = W.sub(mt, mt_W-1, 0, nt-1);
+    auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1);
+
+    // backup A in Acpy to compute H
+    slate::copy(A, Acpy, opts);
+
+    // two-norm estimation (largest singular value of A)
+    real_t norm_est = norm2est( A, opts );
+    alpha = 1.0;
+
+    // scale the original matrix A to form A0, which is the initial matrix
+    // for the first iteration
+    scale(r_one, norm_est, A, opts);
+
+    // Estimate the condition number using QR.
+    // This Q factor can be used in the first QR-based iteration.
+    // todo: use the Q factor in the first QR-based iteration
+    slate::copy(A, W10, opts);
+    slate::geqrf(W10, T, opts);
+    //auto R = TrapezoidMatrix<scalar_t>(
+    //    Uplo::Upper, slate::Diag::NonUnit, W10 );
+    auto R1 = W10.slice(0, n-1, 0, n-1);
+    auto R  = TriangularMatrix<scalar_t>(
+        Uplo::Upper, slate::Diag::NonUnit, R1 );
+    normR = norm(slate::Norm::One, R, opts);
+    slate::trcondest(slate::Norm::One, R, &Li, opts);
+    real_t smin_est = normR*Li;
+    Li = smin_est / sqrt(n);
+
+    //slate::trcondest(slate::Norm::One, R, &Li, opts);
+    // *flops += FLOPS_DGEQRF( M, N );
+
+    // Compute the number of iterations to converge
+    itconv = 0; Liconv = Li;
+    while (itconv == 0 || fabs(1-Liconv) > tol1) {
+        // To find the minimum number of iterations to converge:
+        // itconv = number of iterations needed until |Li - 1| < tol1.
+        // This should converge in fewer than 50 iterations.
+        if (itconv > 100) {
+            slate_error("Failed to converge.");
+        }
+        itconv++;
+
+        L2  = double(Liconv) * double(Liconv);
+        dd  = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) *
+              std::pow( L2, real_t(-2.0) / real_t(3.0) );
+        sqd = sqrt( r_one + real_t(dd) );
+        a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * real_t(dd)
+                          + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
+        a   = real(a1);
+        b   = ( a - r_one ) * ( a - r_one ) / real_t(4.0);
+        c   = a + b - r_one;
+        // Update Liconv
+        Liconv = Liconv * real_t(( a + b * L2 ) / ( r_one + c * L2 ));
+    }
+
+    it = 0;
+    while (conv > tol3 || it < itconv) {
+        // This should converge in fewer than 50 iterations.
+        if (it > 100) {
+            slate_error("Failed to converge.");
+        }
+        it++;
+
+        // Compute parameters L, a, b, c
+        L2  = double(Li) * double(Li);
+        dd  = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) *
+              std::pow( L2, real_t(-2.0) / real_t(3.0) );
+        sqd = sqrt( r_one + real_t(dd) );
+        a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * real_t(dd)
+                          + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
+        a   = real(a1);
+        b   = ( a - r_one ) * ( a - r_one ) / real_t(4.0);
+        c   = a + b - r_one;
+        // Update Li
+        Li = Li * real_t(( a + b * L2 ) / ( r_one + c * L2 ));
+
+        if (c > 100.) {
+            // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ]
+            //                         [ W2 ]   [ Id          ]
+
+            // Factorize W = QR, and generate the associated Q
+            alpha = scalar_t(sqrt(c));
+            set(zero, zero, W11, opts);
+            add(alpha, A, zero, W10, opts);
+            set(zero, one, W2, opts);
+
+            // Call a variant of geqrf and unmqr that avoids the tiles of zeros,
+            // which are the tiles below the diagonal of W2 (identity)
+            geqrf_qdwh_full(W, T, opts);
+
+            set(zero, one, Q, opts);
+            unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts);
+
+            // A = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * A
+            auto Q2T = conj_transpose(Q2);
+            alpha = scalar_t( (a-b/c)/sqrt(c) );
+            beta  = scalar_t( b / c );
+            // Save a copy of A to check the convergence of QDWH
+            if (it >= itconv) {
+                slate::copy(A, W10, opts);
+            }
+            gemm(alpha, Q10, Q2T, beta, A, opts);
+
+            itqr += 1;
+
+            //facto = 0;
+            // Main flops used in this step:
+            //flops_dgeqrf = FLOPS_DGEQRF( 2*m, n );
+            //flops_dorgqr = FLOPS_DORGQR( 2*m, n, n );
+            //flops_dgemm  = FLOPS_DGEMM( m, n, n );
+            //flops += flops_dgeqrf + flops_dorgqr + flops_dgemm;
+        }
+        else {
+            // A = (b/c) * A + (a-b/c) * ( linsolve( C, linsolve( C, A') ) )';
+            // Save a copy of A to check the convergence of QDWH
+            if (it >= itconv) {
+                slate::copy(A, W10, opts);
+            }
+
+            // Save a copy of A
+            slate::copy(A, Q10, opts);
+
+            // Compute c * A' * A + I
+            auto AT = conj_transpose(Q10);
+            set(zero, one, W2, opts);
+            gemm(scalar_t(c), AT, A, one, W2, opts);
+
+            // Solve (c * A' * A + I) X = A' via Cholesky
+            auto U = slate::HermitianMatrix<scalar_t>(
+                slate::Uplo::Upper, W2 );
+            posv(U, AT, opts);
+
+            // Update A = (b/c) * A + (a-b/c) * X'
+            alpha = (a-b/c); beta = (b/c);
+            auto AT2 = conj_transpose(AT);
+            add(alpha, AT2, beta, A, opts);
+
+            itpo += 1;
+
+            //facto = 1;
+            // Main flops used in this step:
+            //flops_dgemm  = FLOPS_DGEMM( m, n, n );
+            //flops_dpotrf = FLOPS_DPOTRF( m );
+            //flops_dtrsm  = FLOPS_DTRSM( 'L', m, n );
+            //flops += flops_dgemm + flops_dpotrf + 2. * flops_dtrsm;
+        }
+
+        // Check convergence: compute the norm of A - W1.
+        conv = 10.0;
+        if (it >= itconv) {
+            add(one, A, -one, W10, opts);
+            conv = norm(slate::Norm::Fro, W10, opts);
+        }
+    }
+
+    if (A.mpiRank() == 0) {
+        printf("%-7s %-7s\n", "QR", "PO");
+        printf("%-7d %-7d\n", itqr, itpo);
+    }
+
+    if (itqr + itpo > 6) {
+        printf("\nConverged after %d iterations. Check what the issue is, "
+               "because QDWH needs <= 6 iterations.\n",
+               itqr+itpo);
+    }
+
+    // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H)
+    auto AT = conj_transpose(A);
+    if (m == n) {
+        auto W10_n = W1.slice(0, n-1, 0, n-1);
+        gemm(one, AT, Acpy, zero, W10_n, opts);
+        slate::copy(W10_n, H, opts);
+    }
+    else {
+        gemm(one, AT, Acpy, zero, H, opts);
+    }
+
+    // todo: try something like her2k to compute H
+    //her2k(one, A, W10, rzero, H, opts);
+    //auto AL = HermitianMatrix<scalar_t>(
+    //    slate::Uplo::Lower, H );
+    //slate::copy(AL, H, opts);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void qdwh<float>(
+    Matrix<float>& A,
+    Matrix<float>& H,
+    int& itqr, int& itpo,
+    Options const& opts);
+
+template
+void qdwh<double>(
+    Matrix<double>& A,
+    Matrix<double>& H,
+    int& itqr, int& itpo,
+    Options const& opts);
+
+template
+void qdwh< std::complex<float> >(
+    Matrix< std::complex<float> >& A,
+    Matrix< std::complex<float> >& H,
+    int& itqr, int& itpo,
+    Options const& opts);
+
+template
+void qdwh< std::complex<double> >(
+    Matrix< std::complex<double> >& A,
+    Matrix< std::complex<double> >& H,
+    int& itqr, int& itpo,
+    Options const& opts);
+
+} // namespace slate
diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc
new file mode 100644
index 000000000..a1ace1725
--- /dev/null
+++ b/src/unmqr_qdwh_full.cc
@@ -0,0 +1,394 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "auxiliary/Debug.hh"
+#include "slate/Matrix.hh"
+#include "internal/Tile_tpmqrt.hh"
+#include "internal/internal.hh"
+
+namespace slate {
+
+namespace impl {
+
+//------------------------------------------------------------------------------
+/// Distributed parallel multiply by Q from QR factorization.
+/// Generic implementation for any target.
+/// @ingroup geqrf_impl
+///
+template <Target target, typename scalar_t>
+void unmqr_qdwh_full(
+    Side side, Op op,
+    Matrix<scalar_t>& A,
+    TriangularFactors<scalar_t>& T,
+    Matrix<scalar_t>& C,
+    Options const& opts )
+{
+    // trace::Block trace_block("unmqr");
+    using BcastList = typename Matrix<scalar_t>::BcastList;
+
+    // Assumes column major
+    const Layout layout = Layout::ColMajor;
+
+    int64_t A_mt = A.mt();
+    int64_t A_nt = A.nt();
+    int64_t A_min_mtnt = std::min(A_mt, A_nt);
+    int64_t C_mt = C.mt();
+    int64_t C_nt = C.nt();
+
+    if (is_complex<scalar_t>::value && op == Op::Trans) {
+        throw Exception("Complex numbers use Op::ConjTrans, not Op::Trans.");
+    }
+
+    if (target == Target::Devices) {
+        C.allocateBatchArrays();
+        C.reserveDeviceWorkspace();
+    }
+
+    // Reserve workspace
+    auto W = C.emptyLike();
+
+    if (target == Target::Devices) {
+        W.allocateBatchArrays();
+        // todo: this is demanding too much device workspace memory;
+        // only one tile-row of matrix W per MPI process is going to be used,
+        // but W with the size of the whole C is being allocated,
+        // thus limiting the matrix size that can be processed
+        //W.reserveDeviceWorkspace();
+    }
+
+    assert(T.size() == 2);
+    auto Tlocal  = T[0];
+    auto Treduce = T[1];
+
+    // QR tracks dependencies by block-column.
+    // OpenMP needs pointer types, but vectors are exception safe.
+    std::vector< uint8_t > block_vector(A_nt);
+    uint8_t* block = block_vector.data();
+
+    // set min number for omp nested active parallel regions
+    slate::OmpSetMaxActiveLevels set_active_levels( MinOmpActiveLevels );
+
+    #pragma omp parallel
+    #pragma omp master
+    {
+        int64_t k_begin, k_end, k_step;
+        if ((side == Side::Left) == (op == Op::NoTrans)) {
+            // Left,  NoTrans:     multiply Q C   = Q1 ... QK C, or
+            // Right, (Conj)Trans: multiply C Q^H = C QK^H ... Q1^H,
+            // i.e., in reverse order of how Qk's were created.
+            k_begin = A_min_mtnt-1;
+            k_end   = -1;
+            k_step  = -1;
+        }
+        else {
+            // Left,  (Conj)Trans: multiply Q^H C = QK^H ... Q1^H C, or
+            // Right, NoTrans:     multiply C Q   = C Q1 ... QK,
+            // i.e., in same order as Qk's were created.
+            k_begin = 0;
+            k_end   = A_min_mtnt;
+            k_step  = 1;
+        }
+
+        // for k = k_begin, lastk = k_begin (no previous col to depend on);
+        // otherwise, lastk = k - k_step (previous col).
+        int64_t lastk = k_begin;
+        // OpenMP uses lastk; compiler doesn't, so warns it is unused.
+        SLATE_UNUSED(lastk);
+        for (int64_t k = k_begin; k != k_end; k += k_step) {
+
+            int64_t i_end = A_mt - A_nt + k;
+
+            auto A_panel = A.sub(k, i_end, k, k);
+
+            // Find ranks in this column.
+            std::set<int> ranks_set;
+            A_panel.getRanks(&ranks_set);
+            assert(ranks_set.size() > 0);
+
+            // Find each rank's first (top-most) row in this panel,
+            // where the triangular tile resulting from local geqrf
+            // panel will reside.
+            std::vector< int64_t > first_indices;
+            first_indices.reserve(ranks_set.size());
+            for (int r: ranks_set) {
+                for (int64_t i = 0; i < A_panel.mt(); ++i) {
+                    if (A_panel.tileRank(i, 0) == r) {
+                        first_indices.push_back(i+k);
+                        break;
+                    }
+                }
+            }
+
+            #pragma omp task depend(inout:block[k]) \
+                             depend(in:block[lastk])
+            {
+                // Indices for row or col of C.
+                int64_t i0 = -1, i1 = -1, j0 = -1, j1 = -1;
+                if (side == Side::Left) {
+                    j0 = 0;
+                    j1 = C_nt-1;
+                }
+                else {
+                    i0 = 0;
+                    i1 = C_mt-1;
+                }
+
+                // Send V(i) across row C(i, 0:nt-1) or col C(0:mt-1, i),
+                // for side = left or right, respectively.
+                BcastList bcast_list_V_top;
+                BcastList bcast_list_V;
+                for (int64_t i = k; i < i_end+1; ++i) {
+                    if (side == Side::Left) {
+                        i0 = i;
+                        i1 = i;
+                    }
+                    else {
+                        j0 = i;
+                        j1 = i;
+                    }
+                    if (std::find(first_indices.begin(), first_indices.end(), i) != first_indices.end()) {
+                        bcast_list_V_top.push_back(
+                            {i, k, {C.sub(i0, i1, j0, j1)}});
+                    }
+                    else {
+                        bcast_list_V.push_back(
+                            {i, k, {C.sub(i0, i1, j0, j1)}});
+                    }
+                }
+                // V tiles in first_indices need up to 5 lives: 1 for ttmqr,
+                // 2 + extra 2 if mb > nb (trapezoid) for Vs in unmqr I-VTV^T.
+                // This may leak a few tiles that A.clearWorkspace will cleanup.
+                A.template listBcast<target>(bcast_list_V_top, layout, 0, 5);
+                A.template listBcast<target>(bcast_list_V, layout, 0, 2);
+
+                // Send Tlocal(i) across row C(i, 0:nt-1) or col C(0:mt-1, i).
+                if (first_indices.size() > 0) {
+                    BcastList bcast_list_T;
+                    for (int64_t i : first_indices) {
+                        if (side == Side::Left) {
+                            i0 = i;
+                            i1 = i;
+                        }
+                        else {
+                            j0 = i;
+                            j1 = i;
+                        }
+                        bcast_list_T.push_back(
+                            {i, k, {C.sub(i0, i1, j0, j1)}});
+                    }
+                    Tlocal.template listBcast<target>(bcast_list_T, layout);
+                }
+
+                // Send Treduce(i) across row C(i, 0:nt-1) or col C(0:mt-1, i).
+                if (first_indices.size() > 1) {
+                    BcastList bcast_list_T;
+                    for (int64_t i : first_indices) {
+                        if (side == Side::Left) {
+                            i0 = i;
+                            i1 = i;
+                        }
+                        else {
+                            j0 = i;
+                            j1 = i;
+                        }
+                        // Exclude first row of this panel,
+                        // which doesn't have Treduce tile.
+                        if (i > k) {
+                            bcast_list_T.push_back(
+                                {i, k, {C.sub(i0, i1, j0, j1)}});
+                        }
+                    }
+                    Treduce.template listBcast<target>(bcast_list_T, layout);
+                }
+
+                Matrix<scalar_t> C_trail, W_trail;
+                if (side == Side::Left) {
+                    C_trail = C.sub(k, i_end, 0, C_nt-1);
+                    W_trail = W.sub(k, i_end, 0, C_nt-1);
+                }
+                else {
+                    C_trail = C.sub(0, i_end, k, C_nt-1);
+                    W_trail = W.sub(0, i_end, k, C_nt-1);
+                }
+
+                // Left,  NoTrans:     Qi C   = Qi_local Qi_reduce C, or
+                // Right, (Conj)Trans: C Qi^H = C Qi_reduce^H Qi_local^H,
+                // do ttmqr then unmqr.
+                if ((side == Side::Left) == (op == Op::NoTrans)) {
+                    // Apply triangle-triangle reduction reflectors.
+                    internal::ttmqr<Target::HostTask>(
+                        side, op,
+                        std::move(A_panel),
+                        Treduce.sub(k, i_end, k, k),
+                        std::move(C_trail));
+                }
+
+                // Apply local reflectors.
+                internal::unmqr<target>(
+                    side, op,
+                    std::move(A_panel),
+                    Tlocal.sub(k, i_end, k, k),
+                    std::move(C_trail),
+                    std::move(W_trail));
+
+                // Left,  (Conj)Trans: Qi^H C = Qi_reduce^H Qi_local^H C, or
+                // Right, NoTrans:     C Qi   = C Qi_local Qi_reduce,
+                // do unmqr then ttmqr.
+                if ((side == Side::Left) != (op == Op::NoTrans)) {
+                    // Apply triangle-triangle reduction reflectors.
+                    internal::ttmqr<Target::HostTask>(
+                        side, op,
+                        std::move(A_panel),
+                        Treduce.sub(k, A_mt-1, k, k),
+                        std::move(C_trail));
+                }
+            }
+
+            lastk = k;
+        }
+
+        #pragma omp taskwait
+        C.tileUpdateAllOrigin();
+    }
+
+    A.clearWorkspace();
+    C.clearWorkspace();
+}
+
+} // namespace impl
+
+//------------------------------------------------------------------------------
+/// Distributed parallel customized multiply by $Q$ from QR factorization.
+/// Required for the QR-based iterations in the polar decomposition QDWH.
+///
+/// The $Q$ matrix is the output of geqrf_qdwh_full. Since the input matrix
+/// to geqrf_qdwh_full has an identity trailing matrix, the $Q$ matrix has
+/// zero tiles below the diagonal.
+///     Q = [ Q0 ]  full matrix (m0-by-n, where m0 = m - n)
+///         [ Q1 ]  zero tiles below the diagonal (n-by-n)
+///
+/// Multiplies the general m-by-n matrix $C$ by $Q$ from the QR factorization,
+/// taking advantage of the structure of the matrix by avoiding operations
+/// on the tiles of zeros, according to:
+///
+///     op              | side = Left   | side = Right
+///     --------------- | ------------- | --------------
+///     op = NoTrans    | $Q C  $       | $C Q  $
+///     op = ConjTrans  | $Q^H C$       | $C Q^H$
+///
+/// where $Q$ is a unitary matrix defined as the product of k
+/// elementary reflectors
+/// \[
+///     Q = H(1) H(2) . . . H(k)
+/// \]
+/// as returned by geqrf_qdwh_full. $Q$ is of order m if side = Left
+/// and of order n if side = Right.
+///
+//------------------------------------------------------------------------------
+/// @tparam scalar_t
+///     One of float, double, std::complex<float>, std::complex<double>.
+//------------------------------------------------------------------------------
+/// @param[in] side
+///     - Side::Left:  apply $Q$ or $Q^H$ from the left;
+///     - Side::Right: apply $Q$ or $Q^H$ from the right.
+///
+/// @param[in] op
+///     - Op::NoTrans:   apply $Q$;
+///     - Op::ConjTrans: apply $Q^H$;
+///     - Op::Trans:     apply $Q^T$ (only if real).
+///       In the real case, Op::Trans is equivalent to Op::ConjTrans.
+///       In the complex case, Op::Trans is not allowed.
+///
+/// @param[in] A
+///     Details of the QR factorization of the original matrix $A$ as returned
+///     by geqrf_qdwh_full.
+///
+/// @param[in] T
+///     Triangular matrices of the block reflectors as returned by
+///     geqrf_qdwh_full.
+///
+/// @param[in,out] C
+///     On entry, the m-by-n matrix $C$.
+///     On exit, $C$ is overwritten by $Q C$, $Q^H C$, $C Q$, or $C Q^H$.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - HostBatch: batched BLAS on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup geqrf_computational
+///
+template <typename scalar_t>
+void unmqr_qdwh_full(
+    Side side, Op op,
+    Matrix<scalar_t>& A,
+    TriangularFactors<scalar_t>& T,
+    Matrix<scalar_t>& C,
+    Options const& opts)
+{
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    switch (target) {
+        case Target::Host:
+        case Target::HostTask:
+        default:
+            impl::unmqr_qdwh_full<Target::HostTask>( side, op, A, T, C, opts );
+            break;
+
+        case Target::HostNest:
+            impl::unmqr_qdwh_full<Target::HostNest>( side, op, A, T, C, opts );
+            break;
+
+        case Target::HostBatch:
+            impl::unmqr_qdwh_full<Target::HostBatch>( side, op, A, T, C, opts );
+            break;
+
+        case Target::Devices:
+            impl::unmqr_qdwh_full<Target::Devices>( side, op, A, T, C, opts );
+            break;
+    }
+    // todo: return value for errors?
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void unmqr_qdwh_full<float>(
+    Side side, Op op,
+    Matrix<float>& A,
+    TriangularFactors<float>& T,
+    Matrix<float>& C,
+    Options const& opts);
+
+template
+void unmqr_qdwh_full<double>(
+    Side side, Op op,
+    Matrix<double>& A,
+    TriangularFactors<double>& T,
+    Matrix<double>& C,
+    Options const& opts);
+
+template
+void unmqr_qdwh_full< std::complex<float> >(
+    Side side, Op op,
+    Matrix< std::complex<float> >& A,
+    TriangularFactors< std::complex<float> >& T,
+    Matrix< std::complex<float> >& C,
+    Options const& opts);
+
+template
+void unmqr_qdwh_full< std::complex<double> >(
+    Side side, Op op,
+    Matrix< std::complex<double> >& A,
+    TriangularFactors< std::complex<double> >& T,
+    Matrix< std::complex<double> >& C,
+    Options const& opts);
+
+} // namespace slate
diff --git a/test/run_tests.py b/test/run_tests.py
index 77dee5bed..d67480af1 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -75,6 +75,7 @@
     group_cat.add_argument( '--aux',    action='store_true', help='run auxiliary routine tests' ),
     group_cat.add_argument( '--norms',  action='store_true', help='run norm tests' ),
     group_cat.add_argument( '--cond',   action='store_true', help='run condition number estimate tests' ),
+    group_cat.add_argument( '--qdwh',   action='store_true', help='run polar decomposition (qdwh) tests' ),
 ]
 # map category objects to category names: ['lu', 'chol', ...]
 categories = list( map( lambda x: x.dest, categories ) )
@@ -574,6 +575,12 @@ def filter_csv( values, csv ):
     #[ 'pbcon',  gen + dtype + la + n + kd + uplo ],
     ]
 
+# polar
+if (opts.qdwh):
+    cmds += [
+    [ 'qdwh',  gen + dtype + n + tall ],
+    ]
+
 # aux
 if (opts.aux):
     cmds += [
diff --git a/test/test.cc b/test/test.cc
index 03cadbfb7..df6bc8247 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -279,6 +279,8 @@ std::vector< testsweeper::routines_t > routines = {
     { "syset",  test_set,  Section::aux },
     { "heset",  test_set,  Section::aux },
     { "",       nullptr,   Section::newline },
+
+    { "qdwh",   test_qdwh,  Section::qr },
 };
 
 // -----------------------------------------------------------------------------
diff --git a/test/test.hh b/test/test.hh
index e38871038..8743a533e 100644
--- a/test/test.hh
+++ b/test/test.hh
@@ -261,6 +261,9 @@ void test_scale        (Params& params, bool run);
 void test_scale_row_col(Params& params, bool run);
 void test_set          (Params& params, bool run);
 
+// polar decomposition
+void test_qdwh         (Params& params, bool run);
+
 // -----------------------------------------------------------------------------
 inline slate::Dist str2dist(const char* dist)
 {
diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
new file mode 100644
index 000000000..7e8d3a214
--- /dev/null
+++ b/test/test_qdwh.cc
@@ -0,0 +1,244 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "test.hh"
+#include "blas/flops.hh"
+#include "lapack/flops.hh"
+#include "print_matrix.hh"
+#include "grid_utils.hh"
+
+#include "scalapack_wrappers.hh"
+#include "scalapack_support_routines.hh"
+#include "scalapack_copy.hh"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <utility>
+
+//------------------------------------------------------------------------------
+template <typename scalar_t>
+void test_qdwh_work(Params& params, bool run)
+{
+    using real_t = blas::real_type<scalar_t>;
+    using blas::real;
+
+    // Constants
+    const scalar_t zero = 0;
+    const scalar_t one  = 1;
+
+    // get & mark input values
+    int64_t m = params.dim.m();
+    int64_t n = params.dim.n();
+    int p = params.grid.m();
+    int q = params.grid.n();
+    int64_t nb = params.nb();
+    int64_t ib = params.ib();
+    int64_t lookahead = params.lookahead();
+    int64_t panel_threads = params.panel_threads();
+    bool ref_only = params.ref() == 'o';
+    bool ref = params.ref() == 'y' || ref_only;
+    bool check = params.check() == 'y' && ! ref_only;
+    bool trace = params.trace() == 'y';
+    slate::Origin origin = params.origin();
+    slate::Target target = params.target();
+    params.matrix.mark();
+
+    // mark non-standard output values
+    params.time();
+    params.gflops();
+    params.ref_time();
+    params.ref_gflops();
+    params.ortho();
+
+    if (! run)
+        return;
+
+    slate::Options const opts = {
+        {slate::Option::Lookahead, lookahead},
+        {slate::Option::Target, target},
+        {slate::Option::MaxPanelThreads, panel_threads},
+        {slate::Option::InnerBlocking, ib}
+    };
+
+    // MPI variables
+    int mpi_rank, myrow, mycol;
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+    gridinfo(mpi_rank, p, q, &myrow, &mycol);
+
+    // BLACS/MPI variables
+    // matrix A: figure out local size, allocate, create descriptor, initialize
+    int64_t mlocA = num_local_rows_cols(m, nb, myrow, p);
+    int64_t nlocA = num_local_rows_cols(n, nb, mycol, q);
+    int64_t lldA  = blas::max(1, mlocA);  // local leading dimension of A
+    std::vector<scalar_t> A_data;
+
+    int64_t mlocH = num_local_rows_cols(n, nb, myrow, p);
+    int64_t nlocH = num_local_rows_cols(n, nb, mycol, q);
+    int64_t lldH  = blas::max(1, mlocH);  // local leading dimension of H
+    std::vector<scalar_t> H_data;
+
+    slate::Matrix<scalar_t> A;
+    slate::Matrix<scalar_t> H;
+    //slate::HermitianMatrix<scalar_t> H;
+    if (origin != slate::Origin::ScaLAPACK) {
+        // Copy local ScaLAPACK data to GPU or CPU tiles.
+        slate::Target origin_target = origin2target(origin);
+        A = slate::Matrix<scalar_t>(m, n, nb, p, q, MPI_COMM_WORLD);
+        A.insertLocalTiles(origin_target);
+        //H = slate::HermitianMatrix<scalar_t>(slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD);
+        //H.insertLocalTiles(origin_target);
+        H = slate::Matrix<scalar_t>(n, n, nb, p, q, MPI_COMM_WORLD);
+        H.insertLocalTiles(origin_target);
+    }
+    else {
+        // create SLATE matrices from the ScaLAPACK layouts
+        A_data.resize( lldA * nlocA );
+        A = slate::Matrix<scalar_t>::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
+        //H_data.resize( lldH * nlocH );
+        //H = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
+        H_data.resize( lldH * nlocH );
+        H = slate::Matrix<scalar_t>::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
+    }
+
+    real_t cond = 1 / std::numeric_limits<real_t>::epsilon();
+    params.matrix.kind.set_default("svd");
+    params.matrix.cond.set_default(cond);
+
+    slate::generate_matrix( params.matrix, A);
+
+    // if check is required, copy test data
+    slate::Matrix<scalar_t> Aref;
+    std::vector<scalar_t> Aref_data;
+    if (check || ref) {
+        Aref_data.resize( lldA * nlocA );
+        Aref = slate::Matrix<scalar_t>::fromScaLAPACK(
+                   m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
+        slate::copy(A, Aref);
+    }
+
+    print_matrix("A", A, params);
+
+    // todo: how to compute gflops?
+
+    if (! ref_only) {
+        if (trace) slate::trace::Trace::on();
+        else slate::trace::Trace::off();
+
+        int itqr = 0, itpo = 0;
+
+        double time = barrier_get_wtime(MPI_COMM_WORLD);
+
+        //==================================================
+        // Run SLATE test.
+        //==================================================
+        // A = UH, where
+        // A is overwritten by the orthogonal polar factor U, and
+        // H is the symmetric positive semidefinite polar factor.
+        slate::qdwh(A, H, itqr, itpo, opts);
+
+        time = barrier_get_wtime(MPI_COMM_WORLD) - time;
+
+        // gflop includes the flops to estimate the condition
+        // number, the flops for the QR-based iterations, the
+        // flops for the PO-based iterations, and the flops
+        // to compute H (the Hermitian polar factor).
+        // gflop = trcon + itqr * (flop_per_qr_based_iter)
+        //       + itpo * (flop_per_po_based_iter)
+        //       + flop_compute_H
+        double gflop_trcon = lapack::Gflop<scalar_t>::geqrf(m, n);
+
+        // This flop count of the QR-based iteration is for the case where
+        // we do not take advantage of the identity trailing submatrix:
+        //double gflop_itqr = itqr * ( lapack::Gflop<scalar_t>::geqrf(m+n, n)
+        //                    + lapack::Gflop<scalar_t>::unmqr(slate::Side::Left, m+n, n, n)
+        //                    + blas::Gflop<scalar_t>::gemm(m, n, n) );
+        // When taking advantage of the matrix structure, the flop count is reduced:
+        double gflop_itqr = itqr * ( 1e-9 * 2. * double(m*n*n)
+                                     + 1e-9 * 2. * double(m*n*n)
+                                     + blas::Gflop<scalar_t>::gemm(m, n, n) );
+        double gflop_itpo = itpo * ( blas::Gflop<scalar_t>::gemm(m, n, n)
+                                     + lapack::Gflop<scalar_t>::potrf(m)
+                                     + blas::Gflop<scalar_t>::trsm(slate::Side::Left, m, n) );
+
+        double gflop_compute_H = blas::Gflop<scalar_t>::her2k(n, m);
+
+        double gflop = gflop_trcon + gflop_itqr + gflop_itpo + gflop_compute_H;
+
+        if (trace) slate::trace::Trace::finish();
+
+        // compute and save timing/performance
+        params.time() = time;
+        params.gflops() = gflop / time;
+
+        print_matrix("U: orthogonal polar factor", A, params);
+        print_matrix("H: symmetric positive semidefinite factor", H, params);
+    }
+
+    if (check) {
+        //==================================================
+        // Test results by checking backward error
+        //
+        //      || U H - Aref ||_f
+        //      ------------------ < tol * epsilon
+        //        || Aref ||_f
+        //
+        //==================================================
+        real_t normA = slate::norm(slate::Norm::Fro, Aref, opts);
+        slate::gemm(one, A, H, -one, Aref, opts);
+        //auto H = HermitianMatrix<scalar_t>(
+        //    slate::Uplo::Lower, B );
+        //slate::hemm(slate::Side::Right, one, H, A, -one, Aref, opts);
+        real_t berr = slate::norm(slate::Norm::Fro, Aref, opts);
+        params.error() = berr / normA;
+
+        //==================================================
+        // Test results by checking the orthogonality of the U factor
+        //
+        //      || U' U - I ||_f
+        //      ---------------- < tol * epsilon
+        //          sqrt(n)
+        //
+        //==================================================
+        auto AT = conj_transpose(A);
+        set(zero, one, Aref);
+        slate::gemm(one, AT, A, -one, Aref, opts);
+        real_t orth = slate::norm(slate::Norm::Fro, Aref);
+        params.ortho() = orth / sqrt(n);
+
+        real_t tol = params.tol() * 0.5 * std::numeric_limits<real_t>::epsilon();
+        params.okay() = ((params.error() <= tol) && (params.ortho() <= tol));
+    }
+}
+
+// -----------------------------------------------------------------------------
+void test_qdwh(Params& params, bool run)
+{
+    switch (params.datatype()) {
+        case testsweeper::DataType::Integer:
+            throw std::exception();
+            break;
+
+        case testsweeper::DataType::Single:
+            test_qdwh_work<float> (params, run);
+            break;
+
+        case testsweeper::DataType::Double:
+            test_qdwh_work<double> (params, run);
+            break;
+
+        case testsweeper::DataType::SingleComplex:
+            test_qdwh_work<std::complex<float>> (params, run);
+            break;
+
+        case testsweeper::DataType::DoubleComplex:
+            test_qdwh_work<std::complex<double>> (params, run);
+            break;
+    }
+}
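For context, a minimal sketch of how the new slate::qdwh driver is meant to be called, mirroring test_qdwh.cc above. It assumes a working SLATE + MPI build; the matrix sizes, block size, and 2x2 process grid are illustrative placeholders, and error handling is omitted.

```cpp
// Minimal usage sketch for the new slate::qdwh driver (illustrative only).
#include <slate/slate.hh>
#include <mpi.h>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    {
        int64_t m = 2000, n = 1000, nb = 256;  // tall matrix, m >= n
        int p = 2, q = 2;                      // 2x2 MPI process grid

        // A is overwritten by the orthogonal polar factor U;
        // H receives the Hermitian positive semidefinite factor.
        slate::Matrix<double> A(m, n, nb, p, q, MPI_COMM_WORLD);
        slate::Matrix<double> H(n, n, nb, p, q, MPI_COMM_WORLD);
        A.insertLocalTiles();
        H.insertLocalTiles();
        // ... fill A with application data, e.g., via slate::generate_matrix ...

        int itqr = 0, itpo = 0;  // counts of QR- and Cholesky-based iterations
        slate::qdwh(A, H, itqr, itpo);
    }  // matrices destruct before MPI_Finalize
    MPI_Finalize();
    return 0;
}
```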
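The two branches of the main while-loop in qdwh.cc implement the standard QDWH update from the literature (Nakatsukasa, Bai, and Gygi). As a reference sketch in that notation (not taken verbatim from the source), with the parameters $a_k, b_k, c_k$ computed from the lower bound $\ell_k$ as in the code, one iteration maps $X_k$ to $X_{k+1}$ as follows:

```latex
% QR-based variant (the c > 100 branch, via geqrf_qdwh_full/unmqr_qdwh_full):
\[
\begin{bmatrix} \sqrt{c_k}\, X_k \\ I \end{bmatrix}
  = \begin{bmatrix} Q_1 \\ Q_2 \end{bmatrix} R,
\qquad
X_{k+1} = \frac{b_k}{c_k}\, X_k
        + \frac{1}{\sqrt{c_k}} \left( a_k - \frac{b_k}{c_k} \right) Q_1 Q_2^H .
\]

% Cholesky-based variant (the c <= 100 branch, when forming the Gram
% matrix is numerically safe; Z_k^{-1} is applied via posv above):
\[
Z_k = I + c_k X_k^H X_k , \qquad
X_{k+1} = \frac{b_k}{c_k}\, X_k
        + \left( a_k - \frac{b_k}{c_k} \right) X_k Z_k^{-1} .
\]
```

Both variants evaluate the same rational update; the QR form avoids explicitly forming $X_k^H X_k$ while the iterate is still far from orthogonal.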
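The itconv loop in qdwh.cc predicts the iteration count by running the same update on the scalar lower bound alone. Below is a self-contained sketch of that recurrence in plain C++ (double precision only; the starting value 1e-8 is a placeholder standing in for the condition estimate Li):

```cpp
// Standalone sketch of the scalar recurrence qdwh() uses to predict the
// iteration count: l tracks a lower bound on the singular values of the
// scaled iterate and is driven to 1 by the same (a, b, c) update.
#include <cmath>
#include <cstdio>

int main()
{
    double l = 1e-8;               // plays the role of Li (placeholder value)
    double tol1 = 5.0 * 2.2e-16;   // 5 * eps, as in qdwh.cc
    int it = 0;
    while (std::fabs(1.0 - l) > tol1 && it < 100) {
        double L2  = l * l;
        double dd  = std::cbrt( 4.0 * (1.0 - L2) / (L2 * L2) );
        double sqd = std::sqrt( 1.0 + dd );
        double a   = sqd
                   + std::sqrt( 8.0 - 4.0*dd + 8.0*(2.0 - L2)/(L2*sqd) ) / 2.0;
        double b   = (a - 1.0) * (a - 1.0) / 4.0;
        double c   = a + b - 1.0;
        l = l * (a + b * L2) / (1.0 + c * L2);  // update the lower bound
        ++it;
    }
    std::printf("converged in %d iterations (typically <= 6)\n", it);
    return 0;
}
```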
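norm2est.cc estimates $\|A\|_2$ by a power iteration on $A^H A$, started from the vector of column sums of $|A|$. A serial, dense sketch of the same steps (a hypothetical helper, not the SLATE API; real scalars and column-major storage assumed):

```cpp
// Serial sketch of the estimator in norm2est.cc: x starts as the column
// sums of |A|, then power-iterate x <- A^T (A x); e -> ||A||_2.
#include <cmath>
#include <vector>

double norm2est_dense(const std::vector<double>& A, int m, int n)
{
    std::vector<double> x(n, 0.0), ax(m);
    for (int j = 0; j < n; ++j)             // x_j = sum_i |a_ij|
        for (int i = 0; i < m; ++i)
            x[j] += std::fabs(A[i + (size_t)j*m]);

    double e = 0.0;
    for (double xj : x) e += xj*xj;
    e = std::sqrt(e);                       // e = ||x||
    if (e == 0.0) return 0.0;

    double normX = e, e0 = 0.0, tol = 0.1;  // same tolerance as norm2est.cc
    for (int cnt = 0; cnt < 100 && std::fabs(e - e0) > tol*e; ++cnt) {
        e0 = e;
        for (double& xj : x) xj /= normX;   // x = x / ||x||

        double normAX = 0.0;
        for (int i = 0; i < m; ++i) {       // ax = A x
            double s = 0.0;
            for (int j = 0; j < n; ++j) s += A[i + (size_t)j*m] * x[j];
            ax[i] = s;
            normAX += s*s;
        }
        normAX = std::sqrt(normAX);

        normX = 0.0;
        for (int j = 0; j < n; ++j) {       // x = A^T ax
            double s = 0.0;
            for (int i = 0; i < m; ++i) s += A[i + (size_t)j*m] * ax[i];
            x[j] = s;
            normX += s*s;
        }
        normX = std::sqrt(normX);

        e = normX / normAX;                 // ||A^T A x|| / ||A x||
    }
    return e;
}
```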