diff --git a/GNUmakefile b/GNUmakefile
index 8649f065f..7773cc72e 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -508,6 +508,7 @@ ifneq ($(only_unit),1)
 		src/gemmA.cc \
 		src/gemmC.cc \
 		src/geqrf.cc \
+		src/geqrf_qdwh_full.cc \
 		src/gesv.cc \
 		src/gesvMixed.cc \
 		src/gesv_nopiv.cc \
@@ -534,6 +535,7 @@ ifneq ($(only_unit),1)
 		src/hetrf.cc \
 		src/hetrs.cc \
 		src/norm.cc \
+		src/norm2est.cc \
 		src/pbsv.cc \
 		src/pbtrf.cc \
 		src/pbtrs.cc \
@@ -543,6 +545,7 @@ ifneq ($(only_unit),1)
 		src/potri.cc \
 		src/potrs.cc \
 		src/print.cc \
+		src/qdwh.cc \
 		src/scale.cc \
 		src/scale_row_col.cc \
 		src/set.cc \
@@ -565,6 +568,7 @@ ifneq ($(only_unit),1)
 		src/unmqr.cc \
 		src/unmtr_hb2st.cc \
 		src/unmtr_he2hb.cc \
+		src/unmqr_qdwh_full.cc \
 		src/work/work_trmm.cc \
 		src/work/work_trsm.cc \
 		src/work/work_trsmA.cc \
@@ -635,6 +639,7 @@ tester_src += \
 		test/test_pbsv.cc \
 		test/test_posv.cc \
 		test/test_potri.cc \
+		test/test_qdwh.cc \
 		test/test_scale.cc \
 		test/test_scale_row_col.cc \
 		test/test_set.cc \
diff --git a/docs/doxygen/groups.dox b/docs/doxygen/groups.dox
index 6cb84a95e..dcbc88538 100644
--- a/docs/doxygen/groups.dox
+++ b/docs/doxygen/groups.dox
@@ -145,6 +145,14 @@
         @defgroup svd_specialization Target implementations
     @}
 
+    ------------------------------------------------------------
+    @defgroup group_qdwh Polar Decomposition
+    @{
+        @defgroup qdwh Driver
+        @brief $A = U H$
+
+    @}
+
     ------------------------------------------------------------
     @defgroup group_blas2_top Level 2 BLAS and Auxiliary: O(n^2) work
     @brief Matrix and Matrix-vector operations that perform $O(n^2)$ work on $O(n^2)$ data.
diff --git a/include/slate/slate.hh b/include/slate/slate.hh
index d44a17562..d437f849d 100644
--- a/include/slate/slate.hh
+++ b/include/slate/slate.hh
@@ -1107,6 +1107,42 @@ void trcondest(
     blas::real_type<scalar_t>* rcond,
     Options const& opts = Options());
 
+//-----------------------------------------
+// Polar Decomposition
+
+//-----------------------------------------
+// qdwh()
+template <typename scalar_t>
+void qdwh(
+    Matrix<scalar_t>& A,
+    Matrix<scalar_t>& H,
+    int& itqr, int& itpo,
+    Options const& opts = Options());
+
+//-----------------------------------------
+// norm2est()
+template <typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+norm2est(
+    matrix_type& A,
+    Options const& opts = Options());
+
+//-----------------------------------------
+// geqrf_qdwh_full()
+template <typename scalar_t>
+void geqrf_qdwh_full(
+    Matrix<scalar_t>& A, TriangularFactors<scalar_t>& T,
+    Options const& opts = Options());
+
+//-----------------------------------------
+// unmqr_qdwh_full()
+template <typename scalar_t>
+void unmqr_qdwh_full(
+    Side side, Op op,
+    Matrix<scalar_t>& A, TriangularFactors<scalar_t>& T,
+    Matrix<scalar_t>& C,
+    Options const& opts = Options());
+
 } // namespace slate
 
 //-----------------------------------------
diff --git a/src/geqrf.cc b/src/geqrf.cc
index 399202201..4fe6328c8 100644
--- a/src/geqrf.cc
+++ b/src/geqrf.cc
@@ -7,50 +7,12 @@
 #include "auxiliary/Debug.hh"
 #include "slate/Matrix.hh"
 #include "internal/internal.hh"
+#include "internal/internal_util.hh"
 
 namespace slate {
 
 namespace impl {
 
-//------------------------------------------------------------------------------
-/// An auxiliary routine to find each rank's first (top-most) row
-/// in panel k.
-///
-/// @param[in] A_panel
-///     Current panel, which is a sub of the input matrix $A$.
-///
-/// @param[in] k
-///     Index of the current panel in the input matrix $A$.
-///
-/// @param[out] first_indices
-///     The array of computed indices.
-///
-/// @ingroup geqrf_impl
-///
-template <typename scalar_t>
-void geqrf_compute_first_indices(
-    Matrix<scalar_t>& A_panel, int64_t k,
-    std::vector< int64_t >& first_indices )
-{
-    // Find ranks in this column.
-    std::set<int> ranks_set;
-    A_panel.getRanks(&ranks_set);
-    assert(ranks_set.size() > 0);
-
-    // Find each rank's first (top-most) row in this panel,
-    // where the triangular tile resulting from local geqrf panel
-    // will reside.
-    first_indices.reserve(ranks_set.size());
-    for (int r: ranks_set) {
-        for (int64_t i = 0; i < A_panel.mt(); ++i) {
-            if (A_panel.tileRank(i, 0) == r) {
-                first_indices.push_back(i+k);
-                break;
-            }
-        }
-    }
-}
-
 //------------------------------------------------------------------------------
 /// Distributed parallel QR factorization.
 /// Generic implementation for any target.
@@ -185,7 +147,7 @@ void geqrf(
             auto Tr_panel = Treduce.sub(k, A_mt-1, k, k);
 
             std::vector< int64_t > first_indices;
-            geqrf_compute_first_indices(A_panel, k, first_indices);
+            internal::geqrf_compute_first_indices(A_panel, k, first_indices);
             // todo: pass first_indices into internal geqrf or ttqrt?
 
             // panel, high priority
@@ -328,7 +290,7 @@ void geqrf(
 
                         auto A_panel_k_la = A.sub(k_la, A_mt-1, k_la, k_la);
                         std::vector< int64_t > first_indices_k_la;
-                        geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
+                        internal::geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
                         if (first_indices.size() > 0) {
                             for (int64_t row : first_indices_k_la) {
                                 if (Tlocal.tileIsLocal(row, k_la)) {
diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc
new file mode 100644
index 000000000..c341104a0
--- /dev/null
+++ b/src/geqrf_qdwh_full.cc
@@ -0,0 +1,449 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "auxiliary/Debug.hh"
+#include "slate/Matrix.hh"
+#include "internal/internal.hh"
+#include "internal/internal_util.hh"
+
+namespace slate {
+
+namespace impl {
+
+//------------------------------------------------------------------------------
+/// Distributed parallel QR factorization.
+/// Generic implementation for any target.
+/// Panel and lookahead computed on host using Host OpenMP task.
+///
+/// ColMajor layout is assumed.
+///
+/// @ingroup geqrf_impl
+///
+template <Target target, typename scalar_t>
+void geqrf_qdwh_full(
+    Matrix<scalar_t>& A,
+    TriangularFactors<scalar_t>& T,
+    Options const& opts )
+{
+    using BcastList = typename Matrix<scalar_t>::BcastList;
+    using device_info_t = lapack::device_info_int;
+    using blas::real;
+
+    // Assumes column major
+    const Layout layout = Layout::ColMajor;
+
+    const int priority_zero = 0;
+    const int priority_one = 1;
+    const int life_factor_one = 1;
+
+    // Options
+    int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );
+    int64_t ib = get_option<int64_t>( opts, Option::InnerBlocking, 16 );
+    int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1);
+    max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads,
+                                             max_panel_threads );
+
+    bool set_hold = lookahead > 0;  // Do tileGetAndHold in the bcast
+
+    int64_t A_mt = A.mt();
+    int64_t A_nt = A.nt();
+    int64_t A_min_mtnt = std::min(A_mt, A_nt);
+
+    T.clear();
+    T.push_back(A.emptyLike());
+    T.push_back(A.emptyLike(ib, 0));
+    auto Tlocal  = T[0];
+    auto Treduce = T[1];
+
+    // workspace
+    auto W = A.emptyLike();
+
+    // setting up dummy variables for the case when target == host
+    int64_t num_devices = A.num_devices();
+    int     panel_device = -1;
+    size_t  work_size = 0;
+
+    std::vector< scalar_t* > dwork_array( num_devices, nullptr );
+
+    if (target == Target::Devices) {
+        const int64_t batch_size_zero = 0;  // use default batch size
+        const int64_t num_queues = 3 + lookahead;
+        A.allocateBatchArrays(batch_size_zero, num_queues);
+        A.reserveDeviceWorkspace();
+        W.allocateBatchArrays(batch_size_zero, num_queues);
+        // todo: this is demanding too much device workspace memory;
+        // only one tile-row of matrix W per MPI process is going to be used,
+        // but W with the size of the whole A is being allocated,
+        // thus limiting the matrix size that can be processed.
+        // For now, allocate workspace tiles 1-by-1.
+        //W.reserveDeviceWorkspace();
+
+        // Find largest panel size and device for copying to
+        // contiguous memory within internal geqrf routine
+        int64_t mlocal = 0;
+        int64_t first_panel_seen = -1;
+        for (int64_t j = 0; j < A.nt(); ++j) {
+            for (int64_t i = j; i < A.mt(); ++i) {
+                if (A.tileIsLocal(i, j)) {
+                    if (first_panel_seen < 0) {
+                        first_panel_seen = j;
+                    }
+                    if (first_panel_seen == j) {
+                        if (panel_device < 0) {
+                            panel_device = A.tileDevice(i, j);
+                        }
+                        mlocal += A.tileMb(i);
+                        // Asserting 1-D distribution for device
+                        assert( panel_device == A.tileDevice(i, j) );
+                    }
+                }
+            }
+            if (first_panel_seen >= 0) {
+                break;
+            }
+        }
+
+        if (panel_device >= 0) {
+
+            lapack::Queue* comm_queue = A.comm_queue(panel_device);
+
+            int64_t nb       = A.tileNb(0);
+            size_t  size_tau = (size_t) std::min( mlocal, nb );
+            size_t  size_A   = (size_t) blas::max( 1, mlocal ) * nb;
+            size_t  hsize, dsize;
+
+            // Find size of the workspace needed
+            lapack::geqrf_work_size_bytes( mlocal, nb, dwork_array[0], mlocal,
+                                           &dsize, &hsize, *comm_queue );
+
+            // Size of dA, dtau, dwork and dinfo
+            work_size = size_A + size_tau + ceildiv(dsize, sizeof(scalar_t))
+                        + ceildiv(sizeof(device_info_t), sizeof(scalar_t));
+
+            for (int64_t dev = 0; dev < num_devices; ++dev) {
+                lapack::Queue* queue = A.comm_queue( dev );
+                dwork_array[dev] =
+                    blas::device_malloc<scalar_t>(work_size, *queue);
+            }
+        }
+    }
+
+    // QR tracks dependencies by block-column.
+    // OpenMP needs pointer types, but vectors are exception safe.
+    std::vector< uint8_t > block_vector(A_nt);
+    uint8_t* block = block_vector.data();
+
+    // set min number for omp nested active parallel regions
+    slate::OmpSetMaxActiveLevels set_active_levels( MinOmpActiveLevels );
+
+    #pragma omp parallel
+    #pragma omp master
+    {
+        for (int64_t k = 0; k < A_min_mtnt; ++k) {
+            int64_t i_end = A_mt - A_nt + k;
+
+            auto A_panel  = A.sub(k, i_end, k, k);
+            auto Tl_panel = Tlocal.sub(k, i_end, k, k);
+            auto Tr_panel = Treduce.sub(k, i_end, k, k);
+
+            std::vector< int64_t > first_indices;
+            internal::geqrf_compute_first_indices(A_panel, k, first_indices);
+            // todo: pass first_indices into internal geqrf or ttqrt?
+
+            // panel, high priority
+            #pragma omp task depend(inout:block[k]) priority(priority_one)
+            {
+                // local panel factorization
+                internal::geqrf<target>(
+                    std::move(A_panel),
+                    std::move(Tl_panel),
+                    dwork_array, work_size,
+                    ib, max_panel_threads, priority_one);
+
+                // triangle-triangle reductions
+                // ttqrt handles tile transfers internally
+                internal::ttqrt<Target::HostTask>(
+                    std::move(A_panel),
+                    std::move(Tr_panel));
+
+                // if a trailing matrix exists
+                if (k < A_nt-1) {
+
+                    // bcast V across row for trailing matrix update
+                    if (k < i_end) {
+                        BcastList bcast_list_V_first;
+                        BcastList bcast_list_V;
+                        for (int64_t i = k; i < i_end+1; ++i) {
+                            // send A(i, k) across row A(i, k+1:nt-1);
+                            // Vs in first_indices (except the main diagonal one)
+                            // need three lives
+                            if ((std::find(first_indices.begin(), first_indices.end(), i) != first_indices.end()) && (i > k))
+                                bcast_list_V_first.push_back(
+                                    {i, k, {A.sub(i, i, k+1, A_nt-1)}});
+                            else
+                                bcast_list_V.push_back(
+                                    {i, k, {A.sub(i, i, k+1, A_nt-1)}});
+                        }
+                        A.template listBcast<target>(bcast_list_V_first, layout, 0, 3, set_hold);
+                        A.template listBcast<target>(bcast_list_V, layout, 0, 2, set_hold);
+                    }
+
+                    // bcast Tlocal across row for trailing matrix update
+                    if (first_indices.size() > 0) {
+                        BcastList bcast_list_T;
+                        for (int64_t row : first_indices) {
+                            bcast_list_T.push_back(
+                                {row, k, {Tlocal.sub(row, row, k+1, A_nt-1)}});
+                        }
+                        Tlocal.template listBcast<target>(bcast_list_T, layout, k, life_factor_one, set_hold);
+                    }
+
+                    // bcast Treduce across row for trailing matrix update
+                    if (first_indices.size() > 1) {
+                        BcastList bcast_list_T;
+                        for (int64_t row : first_indices) {
+                            // exclude the first row of this panel, which has no Treduce tile
+                            if (row > k)
+                                bcast_list_T.push_back(
+                                    {row, k, {Treduce.sub(row, row, k+1, A_nt-1)}});
+                        }
+                        Treduce.template listBcast<target>(bcast_list_T, layout);
+                    }
+                }
+            }
+
+            // update lookahead column(s) on CPU, high priority
+            for (int64_t j = k+1; j < (k+1+lookahead) && j < A_nt; ++j) {
+
+                auto A_trail_j = A.sub(k, i_end, j, j);
+
+                #pragma omp task depend(in:block[k]) \
+                                 depend(inout:block[j]) \
+                                 priority(priority_one)
+                {
+                    // Apply local reflectors
+                    internal::unmqr<target>(
+                        Side::Left, Op::ConjTrans,
+                        std::move(A_panel),
+                        std::move(Tl_panel),
+                        std::move(A_trail_j),
+                        W.sub(k, i_end, j, j),
+                        priority_one, j-k+1);
+
+                    // Apply triangle-triangle reduction reflectors;
+                    // ttmqr handles the tile broadcasting internally
+                    internal::ttmqr<Target::HostTask>(
+                        Side::Left, Op::ConjTrans,
+                        std::move(A_panel),
+                        std::move(Tr_panel),
+                        std::move(A_trail_j),
+                        j);
+                }
+            }
+
+            // update trailing submatrix, normal priority
+            if (k+1+lookahead < A_nt) {
+                int64_t j = k+1+lookahead;
+                auto A_trail_j = A.sub(k, i_end, j, A_nt-1);
+
+                #pragma omp task depend(in:block[k]) \
+                                 depend(inout:block[k+1+lookahead]) \
+                                 depend(inout:block[A_nt-1])
+                {
+                    // Apply local reflectors.
+                    internal::unmqr<target>(
+                        Side::Left, Op::ConjTrans,
+                        std::move(A_panel),
+                        std::move(Tl_panel),
+                        std::move(A_trail_j),
+                        W.sub(k, i_end, j, A_nt-1),
+                        priority_zero, j-k+1);
+
+                    // Apply triangle-triangle reduction reflectors.
+                    // ttmqr handles the tile broadcasting internally.
+                    internal::ttmqr<Target::HostTask>(
+                        Side::Left, Op::ConjTrans,
+                        std::move(A_panel),
+                        std::move(Tr_panel),
+                        std::move(A_trail_j),
+                        j);
+                }
+            }
+            if (target == Target::Devices) {
+                // Update the status of the on-hold tiles held by the invocation of
+                // the tileBcast routine, and then release them to free up memory.
+                // The origin must be updated with the latest modified copy
+                // for memory consistency.
+                // TODO: Find better solution to handle tile release, and
+                // investigate the correctness of the task dependency
+                if (k >= lookahead && k < A_nt-1) {
+                    #pragma omp task depend(in:block[k]) \
+                                     depend(inout:block[k+1])
+                    {
+                        int64_t k_la = k-lookahead;
+                        int64_t i_end_la = A_min_mtnt + k_la;
+                        for (int64_t i = k_la; i < i_end_la; ++i) {
+                            if (A.tileIsLocal(i, k_la)) {
+                                A.tileUpdateOrigin(i, k_la);
+
+                                std::set<int> dev_set;
+                                A.sub(i, i, k_la+1, A_nt-1).getLocalDevices(&dev_set);
+
+                                for (auto device : dev_set) {
+                                    A.tileUnsetHold(i, k_la, device);
+                                    A.tileRelease(i, k_la, device);
+                                }
+                            }
+                        }
+
+                        auto A_panel_k_la = A.sub(k_la, i_end_la, k_la, k_la);
+                        std::vector< int64_t > first_indices_k_la;
+                        internal::geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la);
+                        if (first_indices.size() > 0) {
+                            for (int64_t row : first_indices_k_la) {
+                                if (Tlocal.tileIsLocal(row, k_la)) {
+                                    Tlocal.tileUpdateOrigin(row, k_la);
+
+                                    std::set<int> dev_set;
+                                    Tlocal.sub(row, row, k_la+1, A_nt-1).getLocalDevices(&dev_set);
+
+                                    for (auto device : dev_set) {
+                                        Tlocal.tileUnsetHold(row, k_la, device);
+                                        Tlocal.tileRelease(row, k_la, device);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        #pragma omp taskwait
+        A.tileUpdateAllOrigin();
+    }
+
+    A.releaseWorkspace();
+
+    if (target == Target::Devices) {
+        for (int64_t dev = 0; dev < num_devices; ++dev) {
+            blas::Queue* queue = A.comm_queue( dev );
+            blas::device_free( dwork_array[dev], *queue );
+            dwork_array[dev] = nullptr;
+        }
+    }
+}
+
+} // namespace impl
+
+//------------------------------------------------------------------------------
+/// Distributed parallel customized QR factorization.
+/// Required for the QR-based iterations in the polar decomposition QDWH.
+///
+/// Computes a QR factorization of an m-by-n matrix $A$, $m \ge 2n$,
+/// and takes advantage of the trailing identity matrix structure:
+///     A = [ A0 ]  full matrix (m0-by-n, where m0 = m - n, which is the size
+///                 of the original matrix in qdwh)
+///         [ A1 ]  identity matrix (n-by-n)
+/// Avoids doing computations on the zero tiles below the diagonal of $A1$.
+/// The factorization has the form
+/// \[
+///     A = QR,
+/// \]
+/// where $Q$ is a matrix with orthonormal columns and $R$ is upper triangular
+/// (or upper trapezoidal if m < n).
+///
+/// Complexity (in real arithmetic):
+/// The matrix $A$ has $m = m0 + n$ rows.
+/// Since the lower n-by-n block of $A$ is the identity, the $k$-th
+/// Householder vector has $(m0 + k) - k + 1 = m0 + 1$ rows.
+/// - for $m \ge n$:
+///   $\sum_{k=1}^{n-1} 4 (n - k) (m0 + 1) \approx 2 m0 n^{2} + 2 m0 n + n^{2}$ flops;
+/// .
+//------------------------------------------------------------------------------
+/// @tparam scalar_t
+///     One of float, double, std::complex<float>, std::complex<double>.
+//------------------------------------------------------------------------------
+/// @param[in,out] A
+///     On entry, the m-by-n matrix $A$, $m \ge 2n$, where
+///         A = [ A0 ]  full matrix (m0-by-n, where m0 = m - n)
+///             [ A1 ]  identity matrix (n-by-n)
+///
+///     On exit, the elements on and above the diagonal of the array contain
+///     the min(m,n)-by-n upper trapezoidal matrix $R$ (upper triangular
+///     if m >= n); the elements below the diagonal represent the unitary
+///     matrix $Q$ as a product of elementary reflectors.
+///
+/// @param[out] T
+///     On exit, triangular matrices of the block reflectors.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Lookahead:
+///       Number of panels to overlap with matrix updates.
+///       lookahead >= 0. Default 1.
+///     - Option::InnerBlocking:
+///       Inner blocking to use for panel. Default 16.
+///     - Option::MaxPanelThreads:
+///       Number of threads to use for panel. Default omp_get_max_threads()/2.
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - HostBatch: batched BLAS on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup geqrf_computational
+///
+template <typename scalar_t>
+void geqrf_qdwh_full(
+    Matrix<scalar_t>& A,
+    TriangularFactors<scalar_t>& T,
+    Options const& opts)
+{
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    switch (target) {
+        case Target::Host:
+        case Target::HostTask:
+            impl::geqrf_qdwh_full<Target::HostTask>( A, T, opts );
+            break;
+
+        case Target::HostNest:
+            impl::geqrf_qdwh_full<Target::HostNest>( A, T, opts );
+            break;
+
+        case Target::HostBatch:
+            impl::geqrf_qdwh_full<Target::HostBatch>( A, T, opts );
+            break;
+
+        case Target::Devices:
+            impl::geqrf_qdwh_full<Target::Devices>( A, T, opts );
+            break;
+    }
+    // todo: return value for errors?
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void geqrf_qdwh_full<float>(
+    Matrix<float>& A,
+    TriangularFactors<float>& T,
+    Options const& opts);
+
+template
+void geqrf_qdwh_full<double>(
+    Matrix<double>& A,
+    TriangularFactors<double>& T,
+    Options const& opts);
+
+template
+void geqrf_qdwh_full< std::complex<float> >(
+    Matrix< std::complex<float> >& A,
+    TriangularFactors< std::complex<float> >& T,
+    Options const& opts);
+
+template
+void geqrf_qdwh_full< std::complex<double> >(
+    Matrix< std::complex<double> >& A,
+    TriangularFactors< std::complex<double> >& T,
+    Options const& opts);
+
+} // namespace slate
diff --git a/src/internal/internal_util.hh b/src/internal/internal_util.hh
index e7b8241d5..ba2671dd3 100644
--- a/src/internal/internal_util.hh
+++ b/src/internal/internal_util.hh
@@ -10,6 +10,7 @@
 #define SLATE_INTERNAL_UTIL_HH
 
 #include "slate/internal/mpi.hh"
+#include "slate/Matrix.hh"
 
 #include <cmath>
 #include <complex>
@@ -69,6 +70,45 @@ inline bool compareSecond(
     return a.second < b.second;
 }
 
+//------------------------------------------------------------------------------
+/// An auxiliary routine to find each rank's first (top-most) row
+/// in panel k.
+///
+/// @param[in] A_panel
+///     Current panel, which is a sub of the input matrix $A$.
+///
+/// @param[in] k
+///     Index of the current panel in the input matrix $A$.
+///
+/// @param[out] first_indices
+///     The array of computed indices.
+///
+/// @ingroup geqrf_impl
+///
+template <typename scalar_t>
+void geqrf_compute_first_indices(
+    Matrix<scalar_t>& A_panel, int64_t k,
+    std::vector< int64_t >& first_indices )
+{
+    // Find ranks in this column.
+    std::set<int> ranks_set;
+    A_panel.getRanks(&ranks_set);
+    assert(ranks_set.size() > 0);
+
+    // Find each rank's first (top-most) row in this panel,
+    // where the triangular tile resulting from local geqrf panel
+    // will reside.
+    first_indices.reserve(ranks_set.size());
+    for (int r: ranks_set) {
+        for (int64_t i = 0; i < A_panel.mt(); ++i) {
+            if (A_panel.tileRank(i, 0) == r) {
+                first_indices.push_back(i+k);
+                break;
+            }
+        }
+    }
+}
+
 } // namespace internal
 } // namespace slate
diff --git a/src/norm2est.cc b/src/norm2est.cc
new file mode 100644
index 000000000..f7b253ac2
--- /dev/null
+++ b/src/norm2est.cc
@@ -0,0 +1,204 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "internal/internal.hh"
+#include "internal/internal_util.hh"
+#include "slate/internal/mpi.hh"
+
+#include <cmath>
+#include <complex>
+
+namespace slate {
+
+namespace impl {
+
+//------------------------------------------------------------------------------
+/// @internal
+/// Distributed parallel general matrix 2-norm estimate.
+/// Generic implementation for any target.
+/// @ingroup norm_specialization
+///
+template <Target target, typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+norm2est(
+    matrix_type A,
+    Options const& opts)
+{
+    using scalar_t = typename matrix_type::value_type;
+    using real_t = blas::real_type<scalar_t>;
+    using blas::min;
+
+    // Constants
+    const scalar_t one = 1.;
+    const scalar_t zero = 0.;
+    const real_t r_one = 1.;
+    const int izero = 0;
+
+    int64_t n = A.n();
+    int64_t m = A.m();
+    int64_t mb = A.tileMb(0);
+    int64_t nb = A.tileNb(0);
+
+    int p, q;
+    int myrow, mycol;
+    GridOrder order;
+    A.gridinfo(&order, &p, &q, &myrow, &mycol);
+
+    int64_t cnt = 0;
+    int64_t maxiter = min( 100, n );
+    int64_t mloc = numberLocalRowOrCol(m, mb, myrow, izero, p);
+    int64_t lldA = blas::max(1, mloc);
+
+    real_t e  = 0.;
+    real_t e0 = 0.;
+    real_t normX  = 0;
+    real_t normAX = 0;
+    real_t tol = 1.e-1;
+
+    if (target == Target::Devices)
+        A.reserveDeviceWorkspace();
+
+    std::vector<real_t> local_sums(n);
+    std::vector<scalar_t> global_sums(n);
+    std::vector<scalar_t> W1(lldA);
+
+    auto XL = slate::Matrix<scalar_t>::fromScaLAPACK(
+        n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm());
+    auto AX = slate::Matrix<scalar_t>::fromScaLAPACK(
+        m, 1, &W1[0], lldA, mb, 1, p, q, A.mpiComm());
+
+    // Two-norm estimation.
+    // First: compute the x vector such that
+    // x_j = sum_i abs( A_{i,j} ); x here is global_sums.data
+    #pragma omp parallel
+    #pragma omp master
+    {
+        internal::norm<target>(slate::Norm::One, NormScope::Matrix, std::move(A), local_sums.data());
+    }
+
+    // global_sums.data() will have the sum of each column.
+    #pragma omp critical(slate_mpi)
+    {
+        trace::Block trace_block("MPI_Allreduce");
+        slate_mpi_call(
+            MPI_Allreduce(local_sums.data(), global_sums.data(),
+                          n, mpi_type<real_t>::value,
+                          MPI_SUM, A.mpiComm()));
+    }
+
+    // Second: compute ||x||_Fro
+    e = slate::norm(Norm::Fro, XL, opts);
+    if (e == 0.) {
+        return 0.;
+    }
+    else {
+        normX = e;
+    }
+
+    // Third: start the while-loop X = X / ||X||
+    while ((cnt < maxiter) && (std::abs(e - e0) > (tol * e))) {
+        e0 = e;
+
+        // Scale X = X / ||X||
+        scale(r_one, normX, XL, opts);
+
+        // Compute AX = A * X
+        gemmC(one, A, XL, zero, AX, opts);
+
+        // todo: still need to add the following
+        //if nnz(Sx) == 0
+        //    Sx = rand(size(Sx),class(Sx));
+        //end
+
+        // Compute x = A' * A * x = A' * AX
+        auto AT = conj_transpose(A);
+
+        // todo: why is this set needed when using multiple MPI ranks and gemmA?
+        set(zero, zero, XL);
+        gemmC(one, AT, AX, zero, XL, opts);
+
+        // Compute ||X||, ||AX||
+        normX  = norm(Norm::Fro, XL, opts);
+        normAX = norm(Norm::Fro, AX, opts);
+
+        // Update e
+        e = normX / normAX;
+        cnt++;
+    }
+
+    A.clearWorkspace();
+
+    return e;
+}
+
+} // namespace impl
+
+//------------------------------------------------------------------------------
+/// Distributed parallel general matrix 2-norm estimate.
+///
+//------------------------------------------------------------------------------
+/// @tparam matrix_type
+///     Any SLATE matrix type: Matrix, SymmetricMatrix, HermitianMatrix,
+///     TriangularMatrix, etc.
+//------------------------------------------------------------------------------
+/// @param[in] A
+///     The matrix A.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup norm
+///
+template <typename matrix_type>
+blas::real_type<typename matrix_type::value_type>
+norm2est(matrix_type& A,
+         Options const& opts)
+{
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    switch (target) {
+        case Target::Host:
+        case Target::HostTask:
+        case Target::HostNest:
+        case Target::HostBatch:
+            return impl::norm2est<Target::HostTask>( A, opts );
+            break;
+        case Target::Devices:
+            return impl::norm2est<Target::Devices>( A, opts );
+            break;
+    }
+    throw std::exception();  // todo: invalid target
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+float norm2est(
+    Matrix<float>& A,
+    Options const& opts);
+
+template
+double norm2est(
+    Matrix<double>& A,
+    Options const& opts);
+
+template
+float norm2est(
+    Matrix< std::complex<float> >& A,
+    Options const& opts);
+
+template
+double norm2est(
+    Matrix< std::complex<double> >& A,
+    Options const& opts);
+
+} // namespace slate
diff --git a/src/qdwh.cc b/src/qdwh.cc
new file mode 100644
index 000000000..4582840ac
--- /dev/null
+++ b/src/qdwh.cc
@@ -0,0 +1,367 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "auxiliary/Debug.hh"
+#include "slate/Matrix.hh"
+#include "internal/internal.hh"
+
+namespace slate {
+
+//------------------------------------------------------------------------------
+/// Distributed parallel polar decomposition based on the QDWH algorithm.
+///
+/// Computes a polar decomposition of an m-by-n matrix $A$.
+/// The factorization has the form
+/// \[
+///     A = UH,
+/// \]
+/// where $U$ is a matrix with orthonormal columns and $H$ is a Hermitian
+/// matrix.
+///
+//------------------------------------------------------------------------------
+/// @tparam scalar_t
+///     One of float, double, std::complex<float>, std::complex<double>.
+//------------------------------------------------------------------------------
+/// @param[in,out] A
+///     On entry, the m-by-n matrix $A$.
+///     On exit, if return value = 0, the orthogonal polar factor $U$
+///     from the decomposition $A = U H$.
+///
+/// @param[out] H
+///     On exit, if return value = 0, the Hermitian polar factor matrix $H$
+///     from the decomposition $A = U H$.
+///
+/// @param[out] itqr
+///     The number of QR-based iterations.
+///
+/// @param[out] itpo
+///     The number of Cholesky-based iterations.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Lookahead:
+///       Number of panels to overlap with matrix updates.
+///       lookahead >= 0. Default 1.
+///     - Option::InnerBlocking:
+///       Inner blocking to use for panel. Default 16.
+///     - Option::MaxPanelThreads:
+///       Number of threads to use for panel. Default omp_get_max_threads()/2.
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - HostBatch: batched BLAS on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup qdwh
+///
+template <typename scalar_t>
+void qdwh(
+    Matrix<scalar_t>& A,
+    Matrix<scalar_t>& H,
+    int& itqr, int& itpo,
+    Options const& opts)
+{
+    // todo:
+    // optimizations:
+    // 1. reuse Q from first geqrf
+    // 2. avoid rounding m if m is not divisible by nb, because then geqrf/unmqr
+    //    have extra zero rows
+
+    using real_t = blas::real_type<scalar_t>;
+    using blas::real;
+
+    // Constants
+    const scalar_t zero = 0.0;
+    const scalar_t one  = 1.0;
+    const real_t r_one  = 1.0;
+
+    // Options
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    int64_t mt = A.mt();
+    int64_t nt = A.nt();
+    int64_t nb = A.tileMb(0);
+
+    int64_t m = A.m();
+    int64_t n = A.n();
+
+    if (m < n) {
+        return;
+    }
+
+    real_t eps  = std::numeric_limits<real_t>::epsilon();
+    real_t tol1 = 5. * eps;
+    real_t tol3 = pow(tol1, r_one/real_t(3.));
+
+    int itconv, it;
+    real_t normR;
+    real_t conv = 10.0;
+    scalar_t alpha, beta;
+    double L2, sqd, a1, a, b, c, dd;
+    real_t Li, Liconv;
+
+    // The QR-based iteration requires QR[A; Id],
+    // W = [A; Id], where the size of A is m-by-n and the size of Id is n-by-n.
+    // To avoid having a clean-up tile in the middle of the W matrix,
+    // we round up the number of A rows (m),
+    // so that size(W) = m_roundup + n
+    int64_t m_roundup = roundup( m, nb );
+    int64_t m_W  = m_roundup + n;
+    int64_t mt_W = mt + nt;
+
+    int nprow, npcol;
+    int myrow, mycol;
+    GridOrder order;
+    A.gridinfo(&order, &nprow, &npcol, &myrow, &mycol);
+
+    // allocate m_W*n workspace required for the QR-based iterations QR([A;Id])
+    slate::Matrix<scalar_t> W(m_W, n, nb, nprow, npcol, A.mpiComm());
+    slate::Matrix<scalar_t> Q(m_W, n, nb, nprow, npcol, A.mpiComm());
+    slate::TriangularFactors<scalar_t> T;
+
+    slate::Matrix<scalar_t> Acpy;
+    if (m == n) {
+        // if A is square, we can use H to save a copy of A;
+        // this will not work if H is stored as a Hermitian matrix
+        Acpy = H;
+    }
+    else {
+        Acpy = slate::Matrix<scalar_t>(m, n, nb, nprow, npcol, A.mpiComm());
+        Acpy.insertLocalTiles(target);
+    }
+
+    W.insertLocalTiles(target);
+    Q.insertLocalTiles(target);
+
+    // To compute QR[A; Id], allocate W and Q of size m_W-by-n.
+    // The first mt tiles of W contain the A matrix in [0:m-1, 0:n-1].
+    // W1 has more rows than A, since we round up the number of rows of W1.
+    auto W1 = W.sub(0, mt-1, 0, nt-1);
+    auto Q1 = Q.sub(0, mt-1, 0, nt-1);
+    // W10 has the A matrix
+    auto W10 = W1.slice(0, m-1, 0, n-1);
+    auto Q10 = Q1.slice(0, m-1, 0, n-1);
+    // W11 is the last tile row, which has more rows in case we round up m
+    auto W11 = W.sub(mt-1, mt-1, 0, nt-1);
+
+    // The second nt tiles of W contain the identity matrix
+    auto W2 = W.sub(mt, mt_W-1, 0, nt-1);
+    auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1);
+
+    // backup A in Acpy to compute H
+    slate::copy(A, Acpy, opts);
+
+    // two-norm estimation (largest singular value of A)
+    real_t norm_est = norm2est( A, opts );
+    alpha = 1.0;
+
+    // scale the original matrix A to form A0, which is the initial matrix
+    // for the first iteration
+    scale(r_one, norm_est, A, opts);
+
+    // Estimate the condition number using QR.
+    // This Q factor can be used in the first QR-based iteration.
+    // todo: use the Q factor in the first QR-based iteration
+    slate::copy(A, W10, opts);
+    slate::geqrf(W10, T, opts);
+    //auto R = TrapezoidMatrix<scalar_t>(
+    //    Uplo::Upper, slate::Diag::NonUnit, W10 );
+    auto R1 = W10.slice(0, n-1, 0, n-1);
+    auto R  = TriangularMatrix<scalar_t>(
+        Uplo::Upper, slate::Diag::NonUnit, R1 );
+    normR = norm(slate::Norm::One, R, opts);
+    slate::trcondest(slate::Norm::One, R, &Li, opts);
+    real_t smin_est = normR*Li;
+    Li = smin_est / sqrt(n);
+
+    //slate::trcondest(slate::Norm::One, R, &Li, opts);
+    // *flops += FLOPS_DGEQRF( M, N );
+
+    // Compute the number of iterations to converge
+    itconv = 0; Liconv = Li;
+    while (itconv == 0 || fabs(1-Liconv) > tol1) {
+        // To find the minimum number of iterations to converge:
+        // itconv = number of iterations needed until |Li - 1| < tol1.
+        // This should converge in fewer than 50 iterations.
+        if (itconv > 100) {
+            slate_error("Failed to converge.");
+        }
+        itconv++;
+
+        L2  = double(Liconv) * double(Liconv);
+        dd  = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) *
+              std::pow( L2, real_t(-2.0) / real_t(3.0) );
+        sqd = sqrt( r_one + real_t(dd) );
+        a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * real_t(dd)
+                          + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
+        a   = real(a1);
+        b   = ( a - r_one ) * ( a - r_one ) / real_t(4.0);
+        c   = a + b - r_one;
+        // Update Liconv
+        Liconv = Liconv * real_t(( a + b * L2 ) / ( r_one + c * L2 ));
+    }
+
+    it = 0;
+    while (conv > tol3 || it < itconv) {
+        // This should converge in fewer than 50 iterations.
+        if (it > 100) {
+            slate_error("Failed to converge.");
+        }
+        it++;
+
+        // Compute parameters L, a, b, c
+        L2  = double(Li) * double(Li);
+        dd  = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) *
+              std::pow( L2, real_t(-2.0) / real_t(3.0) );
+        sqd = sqrt( r_one + real_t(dd) );
+        a1  = sqd + sqrt( real_t(8.0) - real_t(4.0) * real_t(dd)
+                          + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0);
+        a   = real(a1);
+        b   = ( a - r_one ) * ( a - r_one ) / real_t(4.0);
+        c   = a + b - r_one;
+        // Update Li
+        Li = Li * real_t(( a + b * L2 ) / ( r_one + c * L2 ));
+
+        if (c > 100.) {
+            // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ]
+            //                         [ W2 ]   [ Id          ]
+
+            // Factorize W = QR, and generate the associated Q
+            alpha = scalar_t(sqrt(c));
+            set(zero, zero, W11, opts);
+            add(alpha, A, zero, W10, opts);
+            set(zero, one, W2, opts);
+
+            // Call a variant of geqrf and unmqr that avoids the tiles of zeros,
+            // which are the tiles below the diagonal of W2 (identity)
+            geqrf_qdwh_full(W, T, opts);
+
+            set(zero, one, Q, opts);
+            unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts);
+
+            // A = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * A
+            auto Q2T = conj_transpose(Q2);
+            alpha = scalar_t( (a-b/c)/sqrt(c) );
+            beta  = scalar_t( b / c );
+            // Save a copy of A to check the convergence of QDWH
+            if (it >= itconv) {
+                slate::copy(A, W10, opts);
+            }
+            gemm(alpha, Q10, Q2T, beta, A, opts);
+
+            itqr += 1;
+
+            //facto = 0;
+            // Main flops used in this step:
+            //flops_dgeqrf = FLOPS_DGEQRF( 2*m, n );
+            //flops_dorgqr = FLOPS_DORGQR( 2*m, n, n );
+            //flops_dgemm  = FLOPS_DGEMM( m, n, n );
+            //flops += flops_dgeqrf + flops_dorgqr + flops_dgemm;
+        }
+        else {
+            // A = (b/c) * A + (a-b/c) * ( linsolve( C, linsolve( C, A') ) )';
+            // Save a copy of A to check the convergence of QDWH
+            if (it >= itconv) {
+                slate::copy(A, W10, opts);
+            }
+
+            // Save a copy of A
+            slate::copy(A, Q10, opts);
+
+            // Compute c * A' * A + I
+            auto AT = conj_transpose(Q10);
+            set(zero, one, W2, opts);
+            gemm(scalar_t(c), AT, A, one, W2, opts);
+
+            // Solve (c * A' * A + I) X = A' via Cholesky
+            auto U = slate::HermitianMatrix<scalar_t>(
+                slate::Uplo::Upper, W2 );
+            posv(U, AT, opts);
+
+            // Update A = (b/c) * A + (a-b/c) * X'
+            alpha = (a-b/c); beta = (b/c);
+            auto AT2 = conj_transpose(AT);
+            add(alpha, AT2, beta, A, opts);
+
+            itpo += 1;
+
+            //facto = 1;
+            // Main flops used in this step:
+            //flops_dgemm  = FLOPS_DGEMM( m, n, n );
+            //flops_dpotrf = FLOPS_DPOTRF( m );
+            //flops_dtrsm  = FLOPS_DTRSM( 'L', m, n );
+            //flops += flops_dgemm + flops_dpotrf + 2. * flops_dtrsm;
+        }
+
+        // Check convergence: compute the norm of A - W1.
+        conv = 10.0;
+        if (it >= itconv) {
+            add(one, A, -one, W10, opts);
+            conv = norm(slate::Norm::Fro, W10, opts);
+        }
+    }
+
+    if (A.mpiRank() == 0) {
+        printf("%-7s %-7s\n", "QR", "PO");
+        printf("%-7d %-7d\n", itqr, itpo);
+    }
+
+    if (itqr + itpo > 6) {
+        printf("\nConverged after %d iterations. Check what the issue is, "
+               "because QDWH needs <= 6 iterations.\n",
+               itqr+itpo);
+    }
+
+    // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H)
+    auto AT = conj_transpose(A);
+    if (m == n) {
+        auto W10_n = W1.slice(0, n-1, 0, n-1);
+        gemm(one, AT, Acpy, zero, W10_n, opts);
+        slate::copy(W10_n, H, opts);
+    }
+    else {
+        gemm(one, AT, Acpy, zero, H, opts);
+    }
+
+    // todo: try something like her2k to compute H
+    //her2k(one, A, W10, rzero, H, opts);
+    //auto AL = HermitianMatrix<scalar_t>(
+    //    slate::Uplo::Lower, H );
+    //slate::copy(AL, H, opts);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void qdwh<float>(
+    Matrix<float>& A,
+    Matrix<float>& H,
+    int& itqr, int& itpo,
+    Options const& opts);
+
+template
+void qdwh<double>(
+    Matrix<double>& A,
+    Matrix<double>& H,
+    int& itqr, int& itpo,
+    Options const& opts);
+
+template
+void qdwh< std::complex<float> >(
+    Matrix< std::complex<float> >& A,
+    Matrix< std::complex<float> >& H,
+    int& itqr, int& itpo,
+    Options const& opts);
+
+template
+void qdwh< std::complex<double> >(
+    Matrix< std::complex<double> >& A,
+    Matrix< std::complex<double> >& H,
+    int& itqr, int& itpo,
+    Options const& opts);
+
+} // namespace slate
diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc
new file mode 100644
index 000000000..a1ace1725
--- /dev/null
+++ b/src/unmqr_qdwh_full.cc
@@ -0,0 +1,394 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "auxiliary/Debug.hh"
+#include "slate/Matrix.hh"
+#include "internal/Tile_tpmqrt.hh"
+#include "internal/internal.hh"
+
+namespace slate {
+
+namespace impl {
+
+//------------------------------------------------------------------------------
+/// Distributed parallel multiply by Q from QR factorization.
+/// Generic implementation for any target.
+/// @ingroup geqrf_impl
+///
+template <Target target, typename scalar_t>
+void unmqr_qdwh_full(
+    Side side, Op op,
+    Matrix<scalar_t>& A,
+    TriangularFactors<scalar_t>& T,
+    Matrix<scalar_t>& C,
+    Options const& opts )
+{
+    // trace::Block trace_block("unmqr");
+    using BcastList = typename Matrix<scalar_t>::BcastList;
+
+    // Assumes column major
+    const Layout layout = Layout::ColMajor;
+
+    int64_t A_mt = A.mt();
+    int64_t A_nt = A.nt();
+    int64_t A_min_mtnt = std::min(A_mt, A_nt);
+    int64_t C_mt = C.mt();
+    int64_t C_nt = C.nt();
+
+    if (is_complex<scalar_t>::value && op == Op::Trans) {
+        throw Exception("Complex numbers use Op::ConjTrans, not Op::Trans.");
+    }
+
+    if (target == Target::Devices) {
+        C.allocateBatchArrays();
+        C.reserveDeviceWorkspace();
+    }
+
+    // Reserve workspace
+    auto W = C.emptyLike();
+
+    if (target == Target::Devices) {
+        W.allocateBatchArrays();
+        // todo: this is demanding too much device workspace memory;
+        // only one tile-row of matrix W per MPI process is going to be used,
+        // but W with the size of the whole C is being allocated,
+        // thus limiting the matrix size that can be processed
+        //W.reserveDeviceWorkspace();
+    }
+
+    assert(T.size() == 2);
+    auto Tlocal  = T[0];
+    auto Treduce = T[1];
+
+    // QR tracks dependencies by block-column.
+    // OpenMP needs pointer types, but vectors are exception safe.
+    std::vector< uint8_t > block_vector(A_nt);
+    uint8_t* block = block_vector.data();
+
+    // set min number for omp nested active parallel regions
+    slate::OmpSetMaxActiveLevels set_active_levels( MinOmpActiveLevels );
+
+    #pragma omp parallel
+    #pragma omp master
+    {
+        int64_t k_begin, k_end, k_step;
+        if ((side == Side::Left) == (op == Op::NoTrans)) {
+            // Left,  NoTrans:     multiply Q C   = Q1 ... QK C, or
+            // Right, (Conj)Trans: multiply C Q^H = C QK^H ... Q1^H,
+            // i.e., in reverse order of how Qk's were created.
+            k_begin = A_min_mtnt-1;
+            k_end   = -1;
+            k_step  = -1;
+        }
+        else {
+            // Left,  (Conj)Trans: multiply Q^H C = QK^H ... Q1^H C, or
+            // Right, NoTrans:     multiply C Q   = C Q1 ... QK,
+            // i.e., in same order as Qk's were created.
+            k_begin = 0;
+            k_end   = A_min_mtnt;
+            k_step  = 1;
+        }
+
+        // for k = k_begin, lastk = k_begin (no previous col to depend on);
+        // otherwise, lastk = k - k_step (previous col).
+        int64_t lastk = k_begin;
+        // OpenMP uses lastk; compiler doesn't, so warns it is unused.
+        SLATE_UNUSED(lastk);
+        for (int64_t k = k_begin; k != k_end; k += k_step) {
+
+            int64_t i_end = A_mt - A_nt + k;
+
+            auto A_panel = A.sub(k, i_end, k, k);
+
+            // Find ranks in this column.
+            std::set<int> ranks_set;
+            A_panel.getRanks(&ranks_set);
+            assert(ranks_set.size() > 0);
+
+            // Find each rank's first (top-most) row in this panel,
+            // where the triangular tile resulting from local geqrf
+            // panel will reside.
+            std::vector< int64_t > first_indices;
+            first_indices.reserve(ranks_set.size());
+            for (int r: ranks_set) {
+                for (int64_t i = 0; i < A_panel.mt(); ++i) {
+                    if (A_panel.tileRank(i, 0) == r) {
+                        first_indices.push_back(i+k);
+                        break;
+                    }
+                }
+            }
+
+            #pragma omp task depend(inout:block[k]) \
+                             depend(in:block[lastk])
+            {
+                // Indices for row or col of C.
+                int64_t i0 = -1, i1 = -1, j0 = -1, j1 = -1;
+                if (side == Side::Left) {
+                    j0 = 0;
+                    j1 = C_nt-1;
+                }
+                else {
+                    i0 = 0;
+                    i1 = C_mt-1;
+                }
+
+                // Send V(i) across row C(i, 0:nt-1) or col C(0:mt-1, i),
+                // for side = left or right, respectively.
+                BcastList bcast_list_V_top;
+                BcastList bcast_list_V;
+                for (int64_t i = k; i < i_end+1; ++i) {
+                    if (side == Side::Left) {
+                        i0 = i;
+                        i1 = i;
+                    }
+                    else {
+                        j0 = i;
+                        j1 = i;
+                    }
+                    if (std::find(first_indices.begin(), first_indices.end(), i) != first_indices.end()) {
+                        bcast_list_V_top.push_back(
+                            {i, k, {C.sub(i0, i1, j0, j1)}});
+                    }
+                    else {
+                        bcast_list_V.push_back(
+                            {i, k, {C.sub(i0, i1, j0, j1)}});
+                    }
+                }
+                // V tiles in first_indices need up to 5 lives: 1 for ttmqr,
+                // 2 + extra 2 if mb > nb (trapezoid) for Vs in unmqr I-VTV^T.
+                // This may leak a few tiles that A.clearWorkspace will cleanup.
+                A.template listBcast<target>(bcast_list_V_top, layout, 0, 5);
+                A.template listBcast<target>(bcast_list_V, layout, 0, 2);
+
+                // Send Tlocal(i) across row C(i, 0:nt-1) or col C(0:mt-1, i).
+                if (first_indices.size() > 0) {
+                    BcastList bcast_list_T;
+                    for (int64_t i : first_indices) {
+                        if (side == Side::Left) {
+                            i0 = i;
+                            i1 = i;
+                        }
+                        else {
+                            j0 = i;
+                            j1 = i;
+                        }
+                        bcast_list_T.push_back(
+                            {i, k, {C.sub(i0, i1, j0, j1)}});
+                    }
+                    Tlocal.template listBcast<target>(bcast_list_T, layout);
+                }
+
+                // Send Treduce(i) across row C(i, 0:nt-1) or col C(0:mt-1, i).
+                if (first_indices.size() > 1) {
+                    BcastList bcast_list_T;
+                    for (int64_t i : first_indices) {
+                        if (side == Side::Left) {
+                            i0 = i;
+                            i1 = i;
+                        }
+                        else {
+                            j0 = i;
+                            j1 = i;
+                        }
+                        // Exclude first row of this panel,
+                        // which doesn't have Treduce tile.
+                        if (i > k) {
+                            bcast_list_T.push_back(
+                                {i, k, {C.sub(i0, i1, j0, j1)}});
+                        }
+                    }
+                    Treduce.template listBcast<target>(bcast_list_T, layout);
+                }
+
+                Matrix<scalar_t> C_trail, W_trail;
+                if (side == Side::Left) {
+                    C_trail = C.sub(k, i_end, 0, C_nt-1);
+                    W_trail = W.sub(k, i_end, 0, C_nt-1);
+                }
+                else {
+                    C_trail = C.sub(0, i_end, k, C_nt-1);
+                    W_trail = W.sub(0, i_end, k, C_nt-1);
+                }
+
+                // Left,  NoTrans:     Qi C   = Qi_local Qi_reduce C, or
+                // Right, (Conj)Trans: C Qi^H = C Qi_reduce^H Qi_local^H,
+                // do ttmqr then unmqr.
+                if ((side == Side::Left) == (op == Op::NoTrans)) {
+                    // Apply triangle-triangle reduction reflectors.
+                    internal::ttmqr<Target::HostTask>(
+                        side, op,
+                        std::move(A_panel),
+                        Treduce.sub(k, i_end, k, k),
+                        std::move(C_trail));
+                }
+
+                // Apply local reflectors.
+                internal::unmqr<target>(
+                    side, op,
+                    std::move(A_panel),
+                    Tlocal.sub(k, i_end, k, k),
+                    std::move(C_trail),
+                    std::move(W_trail));
+
+                // Left,  (Conj)Trans: Qi^H C = Qi_reduce^H Qi_local^H C, or
+                // Right, NoTrans:     C Qi   = C Qi_local Qi_reduce,
+                // do unmqr then ttmqr.
+                if ((side == Side::Left) != (op == Op::NoTrans)) {
+                    // Apply triangle-triangle reduction reflectors.
+                    internal::ttmqr<Target::HostTask>(
+                        side, op,
+                        std::move(A_panel),
+                        Treduce.sub(k, A_mt-1, k, k),
+                        std::move(C_trail));
+                }
+            }
+
+            lastk = k;
+        }
+
+        #pragma omp taskwait
+        C.tileUpdateAllOrigin();
+    }
+
+    A.clearWorkspace();
+    C.clearWorkspace();
+}
+
+} // namespace impl
+
+//------------------------------------------------------------------------------
+/// Distributed parallel customized multiply by $Q$ from QR factorization.
+/// Required for the QR-based iterations in the polar decomposition QDWH.
+///
+/// The $Q$ matrix is the output of geqrf_qdwh_full. Since the input matrix
+/// to geqrf_qdwh_full has an identity trailing matrix, the $Q$ matrix has
+/// zero tiles below the diagonal.
+///     Q = [ Q0 ]  full matrix (m0-by-n, where m0 = m - n)
+///         [ Q1 ]  zero tiles below the diagonal (n-by-n)
+///
+/// Multiplies the general m-by-n matrix $C$ by $Q$ from the QR factorization,
+/// taking advantage of the structure of the matrix by avoiding operations
+/// on the tiles of zeros, according to:
+///
+///     op              | side = Left   | side = Right
+///     --------------- | ------------- | --------------
+///     op = NoTrans    | $Q C  $       | $C Q  $
+///     op = ConjTrans  | $Q^H C$       | $C Q^H$
+///
+/// where $Q$ is a unitary matrix defined as the product of k
+/// elementary reflectors
+/// \[
+///     Q = H(1) H(2) . . . H(k)
+/// \]
+/// as returned by geqrf_qdwh_full. $Q$ is of order m if side = Left
+/// and of order n if side = Right.
+///
+//------------------------------------------------------------------------------
+/// @tparam scalar_t
+///     One of float, double, std::complex<float>, std::complex<double>.
+//------------------------------------------------------------------------------
+/// @param[in] side
+///     - Side::Left:  apply $Q$ or $Q^H$ from the left;
+///     - Side::Right: apply $Q$ or $Q^H$ from the right.
+///
+/// @param[in] op
+///     - Op::NoTrans:   apply $Q$;
+///     - Op::ConjTrans: apply $Q^H$;
+///     - Op::Trans:     apply $Q^T$ (only if real).
+///       In the real case, Op::Trans is equivalent to Op::ConjTrans.
+///       In the complex case, Op::Trans is not allowed.
+///
+/// @param[in] A
+///     Details of the QR factorization of the original matrix $A$ as returned
+///     by geqrf_qdwh_full.
+///
+/// @param[in] T
+///     Triangular matrices of the block reflectors as returned by
+///     geqrf_qdwh_full.
+///
+/// @param[in,out] C
+///     On entry, the m-by-n matrix $C$.
+///     On exit, $C$ is overwritten by $Q C$, $Q^H C$, $C Q$, or $C Q^H$.
+///
+/// @param[in] opts
+///     Additional options, as map of name = value pairs. Possible options:
+///     - Option::Target:
+///       Implementation to target. Possible values:
+///       - HostTask:  OpenMP tasks on CPU host [default].
+///       - HostNest:  nested OpenMP parallel for loop on CPU host.
+///       - HostBatch: batched BLAS on CPU host.
+///       - Devices:   batched BLAS on GPU device.
+///
+/// @ingroup geqrf_computational
+///
+template <typename scalar_t>
+void unmqr_qdwh_full(
+    Side side, Op op,
+    Matrix<scalar_t>& A,
+    TriangularFactors<scalar_t>& T,
+    Matrix<scalar_t>& C,
+    Options const& opts)
+{
+    Target target = get_option( opts, Option::Target, Target::HostTask );
+
+    switch (target) {
+        case Target::Host:
+        case Target::HostTask:
+        default:
+            impl::unmqr_qdwh_full<Target::HostTask>( side, op, A, T, C, opts );
+            break;
+
+        case Target::HostNest:
+            impl::unmqr_qdwh_full<Target::HostNest>( side, op, A, T, C, opts );
+            break;
+
+        case Target::HostBatch:
+            impl::unmqr_qdwh_full<Target::HostBatch>( side, op, A, T, C, opts );
+            break;
+
+        case Target::Devices:
+            impl::unmqr_qdwh_full<Target::Devices>( side, op, A, T, C, opts );
+            break;
+    }
+    // todo: return value for errors?
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void unmqr_qdwh_full<float>(
+    Side side, Op op,
+    Matrix<float>& A,
+    TriangularFactors<float>& T,
+    Matrix<float>& C,
+    Options const& opts);
+
+template
+void unmqr_qdwh_full<double>(
+    Side side, Op op,
+    Matrix<double>& A,
+    TriangularFactors<double>& T,
+    Matrix<double>& C,
+    Options const& opts);
+
+template
+void unmqr_qdwh_full< std::complex<float> >(
+    Side side, Op op,
+    Matrix< std::complex<float> >& A,
+    TriangularFactors< std::complex<float> >& T,
+    Matrix< std::complex<float> >& C,
+    Options const& opts);
+
+template
+void unmqr_qdwh_full< std::complex<double> >(
+    Side side, Op op,
+    Matrix< std::complex<double> >& A,
+    TriangularFactors< std::complex<double> >& T,
+    Matrix< std::complex<double> >& C,
+    Options const& opts);
+
+} // namespace slate
diff --git a/test/run_tests.py b/test/run_tests.py
index 77dee5bed..d67480af1 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -75,6 +75,7 @@
     group_cat.add_argument( '--aux',    action='store_true', help='run auxiliary routine tests' ),
     group_cat.add_argument( '--norms',  action='store_true', help='run norm tests' ),
     group_cat.add_argument( '--cond',   action='store_true', help='run condition number estimate tests' ),
+    group_cat.add_argument( '--qdwh',   action='store_true', help='run polar decomposition (qdwh) tests' ),
 ]
 # map category objects to category names: ['lu', 'chol', ...]
 categories = list( map( lambda x: x.dest, categories ) )
@@ -574,6 +575,12 @@ def filter_csv( values, csv ):
     #[ 'pbcon',  gen + dtype + la + n + kd + uplo ],
     ]
 
+# polar
+if (opts.qdwh):
+    cmds += [
+    [ 'qdwh',  gen + dtype + n + tall ],
+    ]
+
 # aux
 if (opts.aux):
     cmds += [
diff --git a/test/test.cc b/test/test.cc
index 03cadbfb7..df6bc8247 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -279,6 +279,8 @@ std::vector< testsweeper::routines_t > routines = {
     { "syset",  test_set,  Section::aux },
     { "heset",  test_set,  Section::aux },
     { "",       nullptr,   Section::newline },
+
+    { "qdwh",   test_qdwh,  Section::qr },
 };
 
 // -----------------------------------------------------------------------------
diff --git a/test/test.hh b/test/test.hh
index e38871038..8743a533e 100644
--- a/test/test.hh
+++ b/test/test.hh
@@ -261,6 +261,9 @@ void test_scale        (Params& params, bool run);
 void test_scale_row_col(Params& params, bool run);
 void test_set          (Params& params, bool run);
 
+// polar decomposition
+void test_qdwh         (Params& params, bool run);
+
 // -----------------------------------------------------------------------------
 inline slate::Dist str2dist(const char* dist)
 {
diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc
new file mode 100644
index 000000000..7e8d3a214
--- /dev/null
+++ b/test/test_qdwh.cc
@@ -0,0 +1,244 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include "slate/slate.hh"
+#include "test.hh"
+#include "blas/flops.hh"
+#include "lapack/flops.hh"
+#include "print_matrix.hh"
+#include "grid_utils.hh"
+
+#include "scalapack_wrappers.hh"
+#include "scalapack_support_routines.hh"
+#include "scalapack_copy.hh"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <utility>
+
+//------------------------------------------------------------------------------
+template <typename scalar_t>
+void test_qdwh_work(Params& params, bool run)
+{
+    using real_t = blas::real_type<scalar_t>;
+    using blas::real;
+
+    // Constants
+    const scalar_t zero = 0;
+    const scalar_t one  = 1;
+
+    // get & mark input values
+    int64_t m = params.dim.m();
+    int64_t n = params.dim.n();
+    int p = params.grid.m();
+    int q = params.grid.n();
+    int64_t nb = params.nb();
+    int64_t ib = params.ib();
+    int64_t lookahead = params.lookahead();
+    int64_t panel_threads = params.panel_threads();
+    bool ref_only = params.ref() == 'o';
+    bool ref = params.ref() == 'y' || ref_only;
+    bool check = params.check() == 'y' && ! ref_only;
+    bool trace = params.trace() == 'y';
+    slate::Origin origin = params.origin();
+    slate::Target target = params.target();
+    params.matrix.mark();
+
+    // mark non-standard output values
+    params.time();
+    params.gflops();
+    params.ref_time();
+    params.ref_gflops();
+    params.ortho();
+
+    if (! run)
+        return;
+
+    slate::Options const opts = {
+        {slate::Option::Lookahead, lookahead},
+        {slate::Option::Target, target},
+        {slate::Option::MaxPanelThreads, panel_threads},
+        {slate::Option::InnerBlocking, ib}
+    };
+
+    // MPI variables
+    int mpi_rank, myrow, mycol;
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+    gridinfo(mpi_rank, p, q, &myrow, &mycol);
+
+    // BLACS/MPI variables
+    // matrix A: figure out local size, allocate, create descriptor, initialize
+    int64_t mlocA = num_local_rows_cols(m, nb, myrow, p);
+    int64_t nlocA = num_local_rows_cols(n, nb, mycol, q);
+    int64_t lldA  = blas::max(1, mlocA);  // local leading dimension of A
+    std::vector<scalar_t> A_data;
+
+    int64_t mlocH = num_local_rows_cols(n, nb, myrow, p);
+    int64_t nlocH = num_local_rows_cols(n, nb, mycol, q);
+    int64_t lldH  = blas::max(1, mlocH);  // local leading dimension of H
+    std::vector<scalar_t> H_data;
+
+    slate::Matrix<scalar_t> A;
+    slate::Matrix<scalar_t> H;
+    //slate::HermitianMatrix<scalar_t> H;
+    if (origin != slate::Origin::ScaLAPACK) {
+        // Copy local ScaLAPACK data to GPU or CPU tiles.
+        slate::Target origin_target = origin2target(origin);
+        A = slate::Matrix<scalar_t>(m, n, nb, p, q, MPI_COMM_WORLD);
+        A.insertLocalTiles(origin_target);
+        //H = slate::HermitianMatrix<scalar_t>(slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD);
+        //H.insertLocalTiles(origin_target);
+        H = slate::Matrix<scalar_t>(n, n, nb, p, q, MPI_COMM_WORLD);
+        H.insertLocalTiles(origin_target);
+    }
+    else {
+        // create SLATE matrices from the ScaLAPACK layouts
+        A_data.resize( lldA * nlocA );
+        A = slate::Matrix<scalar_t>::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
+        //H_data.resize( lldH * nlocH );
+        //H = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
+        H_data.resize( lldH * nlocH );
+        H = slate::Matrix<scalar_t>::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD);
+    }
+
+    real_t cond = 1 / std::numeric_limits<real_t>::epsilon();
+    params.matrix.kind.set_default("svd");
+    params.matrix.cond.set_default(cond);
+
+    slate::generate_matrix( params.matrix, A);
+
+    // if check is required, copy test data
+    slate::Matrix<scalar_t> Aref;
+    std::vector<scalar_t> Aref_data;
+    if (check || ref) {
+        Aref_data.resize( lldA * nlocA );
+        Aref = slate::Matrix<scalar_t>::fromScaLAPACK(
+                   m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
+        slate::copy(A, Aref);
+    }
+
+    print_matrix("A", A, params);
+
+    // todo: how to compute gflops?
+
+    if (! ref_only) {
+        if (trace) slate::trace::Trace::on();
+        else slate::trace::Trace::off();
+
+        int itqr = 0, itpo = 0;
+
+        double time = barrier_get_wtime(MPI_COMM_WORLD);
+
+        //==================================================
+        // Run SLATE test.
+        //==================================================
+        // A = UH, where
+        // A is overwritten by the orthogonal polar factor U, and
+        // H is the symmetric positive semidefinite polar factor.
+        slate::qdwh(A, H, itqr, itpo, opts);
+
+        time = barrier_get_wtime(MPI_COMM_WORLD) - time;
+
+        // gflop includes the flops to estimate the condition
+        // number, the flops for the QR-based iterations, the
+        // flops for the PO-based iterations, and the flops
+        // to compute H (the Hermitian polar factor).
+        // gflop = trcon + itqr * (flop_per_qr_based_iter)
+        //       + itpo * (flop_per_po_based_iter)
+        //       + flop_compute_H
+        double gflop_trcon = lapack::Gflop<scalar_t>::geqrf(m, n);
+
+        // This flop count of the QR-based iteration is for the case where
+        // we do not take advantage of the identity trailing submatrix:
+        //double gflop_itqr = itqr * ( lapack::Gflop<scalar_t>::geqrf(m+n, n)
+        //                    + lapack::Gflop<scalar_t>::unmqr(slate::Side::Left, m+n, n, n)
+        //                    + blas::Gflop<scalar_t>::gemm(m, n, n) );
+        // When taking advantage of the matrix structure, the flop count is reduced:
+        double gflop_itqr = itqr * ( 1e-9 * 2. * double(m*n*n)
+                                     + 1e-9 * 2. * double(m*n*n)
+                                     + blas::Gflop<scalar_t>::gemm(m, n, n) );
+        double gflop_itpo = itpo * ( blas::Gflop<scalar_t>::gemm(m, n, n)
+                                     + lapack::Gflop<scalar_t>::potrf(m)
+                                     + blas::Gflop<scalar_t>::trsm(slate::Side::Left, m, n) );
+
+        double gflop_compute_H = blas::Gflop<scalar_t>::her2k(n, m);
+
+        double gflop = gflop_trcon + gflop_itqr + gflop_itpo + gflop_compute_H;
+
+        if (trace) slate::trace::Trace::finish();
+
+        // compute and save timing/performance
+        params.time() = time;
+        params.gflops() = gflop / time;
+
+        print_matrix("U: orthogonal polar factor", A, params);
+        print_matrix("H: symmetric positive semidefinite factor", H, params);
+    }
+
+    if (check) {
+        //==================================================
+        // Test results by checking backward error
+        //
+        //      || U H - Aref ||_f
+        //      ------------------ < tol * epsilon
+        //        || Aref ||_f
+        //
+        //==================================================
+        real_t normA = slate::norm(slate::Norm::Fro, Aref, opts);
+        slate::gemm(one, A, H, -one, Aref, opts);
+        //auto H = HermitianMatrix<scalar_t>(
+        //    slate::Uplo::Lower, B );
+        //slate::hemm(slate::Side::Right, one, H, A, -one, Aref, opts);
+        real_t berr = slate::norm(slate::Norm::Fro, Aref, opts);
+        params.error() = berr / normA;
+
+        //==================================================
+        // Test results by checking the orthogonality of the U factor
+        //
+        //      || U' U - I ||_f
+        //      ---------------- < tol * epsilon
+        //          sqrt(n)
+        //
+        //==================================================
+        auto AT = conj_transpose(A);
+        set(zero, one, Aref);
+        slate::gemm(one, AT, A, -one, Aref, opts);
+        real_t orth = slate::norm(slate::Norm::Fro, Aref);
+        params.ortho() = orth / sqrt(n);
+
+        real_t tol = params.tol() * 0.5 * std::numeric_limits<real_t>::epsilon();
+        params.okay() = ((params.error() <= tol) && (params.ortho() <= tol));
+    }
+}
+
+// -----------------------------------------------------------------------------
+void test_qdwh(Params& params, bool run)
+{
+    switch (params.datatype()) {
+        case testsweeper::DataType::Integer:
+            throw std::exception();
+            break;
+
+        case testsweeper::DataType::Single:
+            test_qdwh_work<float> (params, run);
+            break;
+
+        case testsweeper::DataType::Double:
+            test_qdwh_work<double> (params, run);
+            break;
+
+        case testsweeper::DataType::SingleComplex:
+            test_qdwh_work<std::complex<float>> (params, run);
+            break;
+
+        case testsweeper::DataType::DoubleComplex:
+            test_qdwh_work<std::complex<double>> (params, run);
+            break;
+    }
+}
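For context, a minimal sketch of how the new slate::qdwh driver is meant to be called, mirroring test_qdwh.cc above. It assumes a working SLATE + MPI build; the matrix sizes, block size, and 2x2 process grid are illustrative placeholders, and error handling is omitted.

```cpp
// Minimal usage sketch for the new slate::qdwh driver (illustrative only).
#include <slate/slate.hh>
#include <mpi.h>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    {
        int64_t m = 2000, n = 1000, nb = 256;  // tall matrix, m >= n
        int p = 2, q = 2;                      // 2x2 MPI process grid

        // A is overwritten by the orthogonal polar factor U;
        // H receives the Hermitian positive semidefinite factor.
        slate::Matrix<double> A(m, n, nb, p, q, MPI_COMM_WORLD);
        slate::Matrix<double> H(n, n, nb, p, q, MPI_COMM_WORLD);
        A.insertLocalTiles();
        H.insertLocalTiles();
        // ... fill A with application data, e.g., via slate::generate_matrix ...

        int itqr = 0, itpo = 0;  // counts of QR- and Cholesky-based iterations
        slate::qdwh(A, H, itqr, itpo);
    }  // matrices destruct before MPI_Finalize
    MPI_Finalize();
    return 0;
}
```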
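The two branches of the main while-loop in qdwh.cc implement the standard QDWH update from the literature (Nakatsukasa, Bai, and Gygi). As a reference sketch in that notation (not taken verbatim from the source), with the parameters $a_k, b_k, c_k$ computed from the lower bound $\ell_k$ as in the code, one iteration maps $X_k$ to $X_{k+1}$ as follows:

```latex
% QR-based variant (the c > 100 branch, via geqrf_qdwh_full/unmqr_qdwh_full):
\[
\begin{bmatrix} \sqrt{c_k}\, X_k \\ I \end{bmatrix}
  = \begin{bmatrix} Q_1 \\ Q_2 \end{bmatrix} R,
\qquad
X_{k+1} = \frac{b_k}{c_k}\, X_k
        + \frac{1}{\sqrt{c_k}} \left( a_k - \frac{b_k}{c_k} \right) Q_1 Q_2^H .
\]

% Cholesky-based variant (the c <= 100 branch, when forming the Gram
% matrix is numerically safe; Z_k^{-1} is applied via posv above):
\[
Z_k = I + c_k X_k^H X_k , \qquad
X_{k+1} = \frac{b_k}{c_k}\, X_k
        + \left( a_k - \frac{b_k}{c_k} \right) X_k Z_k^{-1} .
\]
```

Both variants evaluate the same rational update; the QR form avoids explicitly forming $X_k^H X_k$ while the iterate is still far from orthogonal.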
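The itconv loop in qdwh.cc predicts the iteration count by running the same update on the scalar lower bound alone. Below is a self-contained sketch of that recurrence in plain C++ (double precision only; the starting value 1e-8 is a placeholder standing in for the condition estimate Li):

```cpp
// Standalone sketch of the scalar recurrence qdwh() uses to predict the
// iteration count: l tracks a lower bound on the singular values of the
// scaled iterate and is driven to 1 by the same (a, b, c) update.
#include <cmath>
#include <cstdio>

int main()
{
    double l = 1e-8;               // plays the role of Li (placeholder value)
    double tol1 = 5.0 * 2.2e-16;   // 5 * eps, as in qdwh.cc
    int it = 0;
    while (std::fabs(1.0 - l) > tol1 && it < 100) {
        double L2  = l * l;
        double dd  = std::cbrt( 4.0 * (1.0 - L2) / (L2 * L2) );
        double sqd = std::sqrt( 1.0 + dd );
        double a   = sqd
                   + std::sqrt( 8.0 - 4.0*dd + 8.0*(2.0 - L2)/(L2*sqd) ) / 2.0;
        double b   = (a - 1.0) * (a - 1.0) / 4.0;
        double c   = a + b - 1.0;
        l = l * (a + b * L2) / (1.0 + c * L2);  // update the lower bound
        ++it;
    }
    std::printf("converged in %d iterations (typically <= 6)\n", it);
    return 0;
}
```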
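norm2est.cc estimates $\|A\|_2$ by a power iteration on $A^H A$, started from the vector of column sums of $|A|$. A serial, dense sketch of the same steps (a hypothetical helper, not the SLATE API; real scalars and column-major storage assumed):

```cpp
// Serial sketch of the estimator in norm2est.cc: x starts as the column
// sums of |A|, then power-iterate x <- A^T (A x); e -> ||A||_2.
#include <cmath>
#include <vector>

double norm2est_dense(const std::vector<double>& A, int m, int n)
{
    std::vector<double> x(n, 0.0), ax(m);
    for (int j = 0; j < n; ++j)             // x_j = sum_i |a_ij|
        for (int i = 0; i < m; ++i)
            x[j] += std::fabs(A[i + (size_t)j*m]);

    double e = 0.0;
    for (double xj : x) e += xj*xj;
    e = std::sqrt(e);                       // e = ||x||
    if (e == 0.0) return 0.0;

    double normX = e, e0 = 0.0, tol = 0.1;  // same tolerance as norm2est.cc
    for (int cnt = 0; cnt < 100 && std::fabs(e - e0) > tol*e; ++cnt) {
        e0 = e;
        for (double& xj : x) xj /= normX;   // x = x / ||x||

        double normAX = 0.0;
        for (int i = 0; i < m; ++i) {       // ax = A x
            double s = 0.0;
            for (int j = 0; j < n; ++j) s += A[i + (size_t)j*m] * x[j];
            ax[i] = s;
            normAX += s*s;
        }
        normAX = std::sqrt(normAX);

        normX = 0.0;
        for (int j = 0; j < n; ++j) {       // x = A^T ax
            double s = 0.0;
            for (int i = 0; i < m; ++i) s += A[i + (size_t)j*m] * ax[i];
            x[j] = s;
            normX += s*s;
        }
        normX = std::sqrt(normX);

        e = normX / normAX;                 // ||A^T A x|| / ||A x||
    }
    return e;
}
```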