From 37a6768da7d1b8b21783e7402297f651a022ea7c Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 27 Oct 2022 14:56:17 -0400 Subject: [PATCH 01/49] Tow-norm estimation --- src/normest.cc | 241 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 241 insertions(+) create mode 100644 src/normest.cc diff --git a/src/normest.cc b/src/normest.cc new file mode 100644 index 000000000..e0ed58fad --- /dev/null +++ b/src/normest.cc @@ -0,0 +1,241 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include "slate/slate.hh" +#include "internal/internal.hh" +#include "internal/internal_util.hh" +#include "slate/internal/mpi.hh" + +#include +#include + +namespace slate { + +// specialization namespace differentiates, e.g., +// internal::norm from internal::specialization::norm +namespace internal { +namespace specialization { + +//------------------------------------------------------------------------------ +/// @internal +/// Distributed parallel general matrix norm. +/// Generic implementation for any target. +/// @ingroup norm_specialization +/// +template +blas::real_type +normest(slate::internal::TargetType, + Norm in_norm, matrix_type A, + Options const& opts) +{ + using scalar_t = typename matrix_type::value_type; + using real_t = blas::real_type; + + // Undo any transpose, which switches one <=> inf norms. + if (A.op() == Op::ConjTrans) + A = conj_transpose(A); + else if (A.op() == Op::Trans) + A = transpose(A); + + //--------- + // Two norm estimation + // First: let's compute the x vector such that + // x_j = sum_i abs( A_{i,j} ), x here is global_sums.data + if (in_norm == Norm::One) { + + using blas::min; + + int p, q; + int myrow, mycol; + GridOrder order; + A.gridinfo(&order, &p, &q, &myrow, &mycol); + + int64_t n = A.n(); + int64_t m = A.m(); + int64_t nb = A.tileNb(0); + + scalar_t one = 1.; + scalar_t zero = 0.; + real_t alpha; + + int64_t cnt = 0; + int64_t maxiter = min( 100, n ); + + real_t e = 0.; + real_t e0 = 0.; + real_t normX = 0; + real_t normAX = 0; + real_t tol = 1.e-1; + + std::vector local_sums(n); + + if (target == Target::Devices) + A.reserveDeviceWorkspace(); + + std::vector global_sums(n); + std::vector W1(n); + std::vector W2(n); + + auto XL = slate::Matrix::fromLAPACK( + n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm()); + XL.insertLocalTiles(); + auto AX = slate::Matrix::fromLAPACK( + m, 1, &W1[0], m, nb, 1, p, q, A.mpiComm()); + AX.insertLocalTiles(); + auto X = slate::Matrix::fromLAPACK( + n, 1, &W2[0], n, nb, 1, p, q, A.mpiComm()); + X.insertLocalTiles(); + + + #pragma omp parallel + #pragma omp master + { + internal::norm(in_norm, NormScope::Matrix, std::move(A), local_sums.data()); + } + + #pragma omp critical(slate_mpi) + { + trace::Block trace_block("MPI_Allreduce"); + slate_mpi_call( + MPI_Allreduce(local_sums.data(), global_sums.data(), + n, mpi_type::value, + MPI_SUM, A.mpiComm())); + } + + // global_sums.data() will have the sum of each column. + // First step is done + + // Second: compute the ||x||_Fro + //e = lapack::lange( + // Norm::Fro, 1, n, + // global_sums.data(), 1); + e = slate::norm(Norm::Fro, XL, opts); + if (e == 0.) { + return 0.; + } + else { + normX = e; + } + + // Third: start the while-loop X = X / ||X|| + while ( (cnt < maxiter) && (fabs((e - e0)) > (tol * (e))) ) { + e0 = e; + + // Scale X = X / ||X|| + alpha = 1.0/normX; + add(scalar_t(alpha), XL, zero, X, opts); + + // Compute Ax = A * sx + gemm(one, A, X, zero, AX, opts); + + // Compute x = A' * A * x = A' * Ax + auto AT = conj_transpose(A); + gemm(one, AT, AX, zero, XL, opts); + + // Compute ||X||, ||AX|| + normX = norm(Norm::Fro, XL, opts); + normAX = norm(Norm::Fro, AX, opts); + + // Uodate e + e = normX / normAX; + cnt++; + } + //if ( (cnt >= maxiter) && (fabs((e - e0)) > (tol * e)) ) { + //} + + A.clearWorkspace(); + + return e; + + } + else { + slate_error("invalid norm."); + } +} + +} // namespace specialization +} // namespace internal + +//------------------------------------------------------------------------------ +/// Version with target as template parameter. +/// @ingroup norm_specialization +/// +template +blas::real_type +normest(Norm norm, matrix_type& A, + const std::map& opts) +{ + return internal::specialization::normest(internal::TargetType(), + norm, A, opts); +} + +//------------------------------------------------------------------------------ +/// Distributed parallel general matrix norm. +/// +//------------------------------------------------------------------------------ +/// @tparam matrix_type +/// Any SLATE matrix type: Matrix, SymmetricMatrix, HermitianMatrix, +/// TriangularMatrix, etc. +//------------------------------------------------------------------------------ +/// @param[in] in_norm +/// Norm to compute: +/// - Norm::Second: second norm estimation of the matrix$ +/// +/// @param[in] A +/// The matrix A. +/// +/// @param[in] opts +/// Additional options, as map of name = value pairs. Possible options: +/// - Option::Target: +/// Implementation to target. Possible values: +/// - HostTask: OpenMP tasks on CPU host [default]. +/// - HostNest: nested OpenMP parallel for loop on CPU host. +/// - Devices: batched BLAS on GPU device. +/// +/// @ingroup norm +/// +template +blas::real_type +normest(Norm in_norm, matrix_type& A, + const std::map& opts) +{ + Target target = get_option( opts, Option::Target, Target::HostTask ); + + switch (target) { + case Target::Host: + case Target::HostTask: + return normest(in_norm, A, opts); + break; + case Target::HostBatch: + case Target::HostNest: + case Target::Devices: + break; + } + throw std::exception(); // todo: invalid target +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +float normest( + Norm in_norm, Matrix& A, + Options const& opts); + +template +double normest( + Norm in_norm, Matrix& A, + Options const& opts); + +template +float normest( + Norm in_norm, Matrix< std::complex >& A, + Options const& opts); + +template +double normest( + Norm in_norm, Matrix< std::complex >& A, + Options const& opts); + +} // namespace slate From 9b47c710d8bdd7e33759c9fbf553085bfe0bf4a2 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 27 Oct 2022 15:05:33 -0400 Subject: [PATCH 02/49] Added the polar decomposition QDWH --- src/qdwh.cc | 388 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 388 insertions(+) create mode 100644 src/qdwh.cc diff --git a/src/qdwh.cc b/src/qdwh.cc new file mode 100644 index 000000000..1789db558 --- /dev/null +++ b/src/qdwh.cc @@ -0,0 +1,388 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include "slate/slate.hh" +#include "auxiliary/Debug.hh" +#include "slate/Matrix.hh" +#include "internal/internal.hh" + +namespace slate { + +// specialization namespace differentiates, e.g., +// todo: is it ok? +// internal::geqrf from internal::specialization::geqrf +namespace impl { + +//------------------------------------------------------------------------------ +/// Distributed parallel polar decomposition based on QDWH algorithm. +/// Generic implementation for any target. +// todo: is it ok? +/// @ingroup geqrf_specialization +/// +template +void qdwh(slate::internal::TargetType, + Matrix& A, + Matrix& H, // this matrix will be hermition + Options const& opts) +{ + using real_t = blas::real_type; + using blas::real; + + int64_t lookahead = get_option( opts, Option::Lookahead, 1 ); + int64_t ib = get_option( opts, Option::InnerBlocking, 16 ); + int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1); + max_panel_threads = get_option( opts, Option::MaxPanelThreads, max_panel_threads ); + + int64_t mt = A.mt(); + int64_t nt = A.nt(); + + int64_t m = A.m(); + int64_t n = A.n(); + int64_t nb = A.tileMb(0); + int64_t m2 = m + n; + + int nprow, npcol; + int myrow, mycol; + GridOrder order; + A.gridinfo(&order, &nprow, &npcol, &myrow, &mycol); + + bool optqr = 1; + int itconv, it; + int itqr = 0, itpo = 0; + + real_t eps = 0.5 * std::numeric_limits::epsilon(); + real_t tol1 = 5. * eps; + real_t tol3 = pow(tol1, 1./3.); + + double L2, sqd, dd, a1, a, b, c; + real_t Li, Liconv; + real_t conv = 100.; + scalar_t zero = 0.0, one = 1.0, minusone = -1.0; + + scalar_t alpha, beta; + + real_t normA; + real_t norminvR; + + int64_t mt2 = 2*mt - 1; + + // allocate m2*n work space required for the qr-based iterations + // this allocation can be avoided if we change the qr iterations to + // work on two matrices on top of each other + + // if doing QR([A;Id]) + slate::Matrix W(m2, n, nb, nprow, npcol, MPI_COMM_WORLD); + slate::Matrix Q(m2, n, nb, nprow, npcol, MPI_COMM_WORLD); + + if (target == Target::Devices) { + const int64_t batch_size_zero = 0; // use default batch size + const int64_t num_queues = 3 + lookahead; + A.allocateBatchArrays(batch_size_zero, num_queues); + A.reserveDeviceWorkspace(); + W.allocateBatchArrays(batch_size_zero, num_queues); + W.reserveDeviceWorkspace(); + Q.allocateBatchArrays(batch_size_zero, num_queues); + Q.reserveDeviceWorkspace(); + } + + W.insertLocalTiles(); + Q.insertLocalTiles(); + auto W1 = W.sub(0, mt-1, 0, nt-1); // First mxn block of W + auto W2 = W.sub(mt, mt2, 0, nt-1); // Second nxn block of W + auto Q1 = Q.sub(0, mt-1, 0, nt-1); // First mxn block of W + auto Q2 = Q.sub(mt, mt2, 0, nt-1); // Second nxn block of W + + // if doing QR([A]) QR([A;Id]) + //slate::Matrix W1(m, n, nb, nprow, npcol, MPI_COMM_WORLD); + //slate::Matrix W2(m, n, nb, nprow, npcol, MPI_COMM_WORLD); + //W1.insertLocalTiles(); + //W2.insertLocalTiles(); + + slate::TriangularFactors T1; + + // backup A in H + copy(A, H); + + // two norm estimation (largest singular value of A) + // todo: fix norm_est the call it + real_t norm_est = 1.; + //real_t norm_est = normest(slate::Norm::One, A); + alpha = 1.0; + + // scale the original matrix A to form A0 of the iterative loop + add(alpha/(scalar_t)norm_est, H, zero, A, opts); + + // Calculate Li: reciprocal of condition number estimation + // Either 1) use LU followed by gecon + // Or 2) QR followed by trtri + // If used the QR, use the Q factor in the first QR-based iteration + normA = norm(slate::Norm::One, A); + if (optqr) { + // Estimate the condition number using QR + // This Q factor can be used in the first QR-based iteration + copy(A, W1); + slate::geqrf(W1, T1, opts); + auto R = TriangularMatrix( + Uplo::Upper, slate::Diag::NonUnit, W1 ); + auto Rh = HermitianMatrix( + Uplo::Upper, W1 ); + trtri(R, opts); + //tzset(scalar_t(0.0), L); + norminvR = norm(slate::Norm::One, Rh); + Li = (1.0 / norminvR) / normA; + Li = norm_est / 1.1 * Li; + // *flops += FLOPS_DGEQRF( M, N ) + // + FLOPS_DTRTRI( N ); + } + else { + // todo + // Estimate the condition number using LU + } + + itconv = 0; Liconv = Li; + while (itconv == 0 || fabs(1-Liconv) > tol1) { + // To find the minimum number of iterations to converge. + // itconv = number of iterations needed until |Li - 1| < tol1 + // This should have converged in less than 50 iterations + if (itconv > 100) { + exit(-1); + break; + } + itconv++; + + L2 = double (Liconv * Liconv); + dd = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 ); + sqd = sqrt(1.0 + dd); + a1 = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0; + a = real(a1); + b = (a - 1.0) * (a - 1.0) / 4.0; + c = a + b - 1.0; + // Update Liconv + Liconv = Liconv * real_t((a + b * L2) / (1.0 + c * L2)); + } + + it = 0; + while (conv > tol3 || it < itconv) { + // This should have converged in less than 50 iterations + if (it > 100) { + exit(-1); + break; + } + it++; + + // Compute parameters L,a,b,c + L2 = double (Li * Li); + dd = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 ); + sqd = sqrt(1.0 + dd); + a1 = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0; + a = real(a1); + b = (a - 1.0) * (a - 1.0) / 4.0; + c = a + b - 1.0; + // Update Li + Li = Li * real_t ((a + b * L2) / (1.0 + c * L2)); + + if (real(c) > 100.) { + //int doqr = !optqr || it > 1; + + // Generate the matrix B = [ B1 ] = [ sqrt(c) * U ] + // [ B2 ] = [ Id ] + alpha = scalar_t(sqrt(c)); + set(zero, one, W2); + + // todo: have a customized splitted QR + //if( doqr ) { + // geadd(alpha, A, zero, W1); + // geqrf_qdwh_full(W, T1, opts); + //} + //else { + // geadd(alpha, W1, zero, W1); + // geqrf_qdwh_full2(W, T1, opts); + //} + + // Factorize B = QR, and generate the associated Q + add(alpha, A, zero, W1, opts); + //geqrf(W, T1, opts); // naive impl + //geqrf_qdwh_full(W, T1, opts); + + set(zero, one, Q1); + set(zero, zero, Q2); + + //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts); //naive impl + //unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts); + + // A = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * A + auto Q2T = conj_transpose(Q2); + alpha = scalar_t( (a-b/c)/sqrt(c) ); + beta = scalar_t( b / c ); + // Copy U into C to check the convergence of QDWH + if (it >= itconv ) { + copy(A, W1); + } + gemm(alpha, Q1, Q2T, beta, A, opts); + + // Main flops used in this step/ + //flops_dgeqrf = FLOPS_DGEQRF( 2*m, n ); + //flops_dorgqr = FLOPS_DORGQR( 2*m, n, n ); + //flops_dgemm = FLOPS_DGEMM( m, n, n ); + //flops += flops_dgeqrf + flops_dorgqr + flops_dgemm; + + itqr += 1; + //facto = 0; + } + else { + // Copy A into H to check the convergence of QDWH + if (it >= itconv) { + copy(A, W1); + } + + // Make Q1 into an identity matrix + set(zero, one, W2); + + // Compute Q1 = c * A' * A + I + /////////////// + copy(A, Q1); + auto AT = conj_transpose(Q1); + gemm(scalar_t(c), AT, A, one, W2, opts); + + // Solve R x = AT + auto R = slate::HermitianMatrix( + slate::Uplo::Upper, W2 ); + posv(R, AT, opts); + + // Update A = (a-b/c) * Q1T' + (b/c) * A + alpha = (a-b/c); beta = (b/c); + AT = conj_transpose(AT); + add(scalar_t(alpha), AT, scalar_t(beta), A, opts); + + // Main flops used in this step + //flops_dgemm = FLOPS_DGEMM( m, n, n ); + //flops_dpotrf = FLOPS_DPOTRF( m ); + //flops_dtrsm = FLOPS_DTRSM( 'L', m, n ); + //flops += flops_dgemm + flops_dpotrf + 2. * flops_dtrsm; + + itpo += 1; + //facto = 1; + } + + // Compute the norm of the symmetric matrix U - B1 + conv = 10.0; + if (it >= itconv) { + add(one, A, minusone, W1, opts); + conv = norm(slate::Norm::Fro, W1); + } + } + + // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H) + copy(H, W1); + auto AT = conj_transpose(A); + gemm(one, AT, W1, zero, H, opts); +} + +} // namespace impl + +//------------------------------------------------------------------------------ +/// Distributed parallel polar decomposition based on QDWH algorithm. +/// +/// Computes a polar decomposition of an m-by-n matrix $A$. +/// The factorization has the form +/// \[ +/// A = UH, +/// \] +/// where $U$ is a matrix with orthonormal columns and $H$ is a Hermition +/// matrix. +/// +/// Complexity (in real): +/// - for $m \ge n$, $\approx 2 m n^{2} - \frac{2}{3} n^{3}$ flops; +/// - for $m \le n$, $\approx 2 m^{2} n - \frac{2}{3} m^{3}$ flops; +/// - for $m = n$, $\approx \frac{4}{3} n^{3}$ flops. +/// . +//------------------------------------------------------------------------------ +/// @tparam scalar_t +/// One of float, double, std::complex, std::complex. +//------------------------------------------------------------------------------ +/// @param[in,out] A +/// On entry, the m-by-n matrix $A$. +/// On exit, if return value = 0, the orthogonal polar factor $U$ +/// from the decomposition $A = U H$. +/// +/// @param[out] H +/// On exit, if return value = 0, the hermitian polar factor matrix $H$ +/// from the decomposition $A = U H$. +/// +/// @param[in] opts +/// Additional options, as map of name = value pairs. Possible options: +/// - Option::Lookahead: +/// Number of panels to overlap with matrix updates. +/// lookahead >= 0. Default 1. +/// - Option::InnerBlocking: +/// Inner blocking to use for panel. Default 16. +/// - Option::MaxPanelThreads: +/// Number of threads to use for panel. Default omp_get_max_threads()/2. +/// - Option::Target: +/// Implementation to target. Possible values: +/// - HostTask: OpenMP tasks on CPU host [default]. +/// - HostNest: nested OpenMP parallel for loop on CPU host. +/// - HostBatch: batched BLAS on CPU host. +/// - Devices: batched BLAS on GPU device. +/// +/// @ingroup geqrf_computational +/// +template +void qdwh(Matrix& A, + Matrix& H, + Options const& opts) +{ + using internal::TargetType; + Target target = get_option( opts, Option::Target, Target::HostTask ); + + switch (target) { + case Target::Host: + case Target::HostTask: + impl::qdwh( TargetType(), + A, H, opts); + break; + case Target::HostNest: + impl::qdwh( TargetType(), + A, H, opts); + break; + case Target::HostBatch: + impl::qdwh( TargetType(), + A, H, opts); + break; + case Target::Devices: + impl::qdwh( TargetType(), + A, H, opts); + break; + } + // todo: return value for errors? +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void qdwh( + Matrix& A, + Matrix& H, + Options const& opts); + +template +void qdwh( + Matrix& A, + Matrix& H, + Options const& opts); + +template +void qdwh< std::complex >( + Matrix< std::complex >& A, + Matrix< std::complex >& H, + Options const& opts); + +template +void qdwh< std::complex >( + Matrix< std::complex >& A, + Matrix< std::complex >& H, + Options const& opts); + +} // namespace slate From ea1f5d48ee503d63f9e733dcf8d464d30702b049 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 31 Oct 2022 16:03:52 -0400 Subject: [PATCH 03/49] Added costumized geqrf and unmqr --- src/geqrf_qdwh_full.cc | 489 +++++++++++++++++++++++++++++++++++++++++ src/unmqr_qdwh_full.cc | 403 +++++++++++++++++++++++++++++++++ 2 files changed, 892 insertions(+) create mode 100644 src/geqrf_qdwh_full.cc create mode 100644 src/unmqr_qdwh_full.cc diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc new file mode 100644 index 000000000..66752f7f8 --- /dev/null +++ b/src/geqrf_qdwh_full.cc @@ -0,0 +1,489 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include "slate/slate.hh" +#include "auxiliary/Debug.hh" +#include "slate/Matrix.hh" +#include "internal/internal.hh" + +namespace slate { + +// specialization namespace differentiates, e.g., +// internal::geqrf_qdwh_full from internal::specialization::geqrf +namespace internal { +namespace specialization { + +//------------------------------------------------------------------------------ +/// An auxiliary routine to find each rank's first (top-most) row +/// in panel k. +/// +/// @param[in] A_panel +/// Current panel, which is a sub of the input matrix $A$. +/// +/// @param[in] k +/// Index of the current panel in the input matrix $A$. +/// +/// @param[out] first_indices +/// The array of computed indices. +/// +/// @ingroup geqrf_qdwh_full_specialization +/// +template +void geqrf_compute_first_indices(Matrix& A_panel, int64_t k, + std::vector< int64_t >& first_indices) +{ + // Find ranks in this column. + std::set ranks_set; + A_panel.getRanks(&ranks_set); + assert(ranks_set.size() > 0); + + // Find each rank's first (top-most) row in this panel, + // where the triangular tile resulting from local geqrf panel + // will reside. + first_indices.reserve(ranks_set.size()); + for (int r: ranks_set) { + for (int64_t i = 0; i < A_panel.mt(); ++i) { + if (A_panel.tileRank(i, 0) == r) { + first_indices.push_back(i+k); + break; + } + } + } +} + +//------------------------------------------------------------------------------ +/// Distributed parallel QR factorization. +/// Generic implementation for any target. +/// Panel and lookahead computed on host using Host OpenMP task. +/// +/// ColMajor layout is assumed +/// +/// @ingroup geqrf_specialization +/// +template +void geqrf_qdwh_full(slate::internal::TargetType, + Matrix& A, + TriangularFactors& T, + int64_t ib, int max_panel_threads, int64_t lookahead) +{ + using BcastList = typename Matrix::BcastList; + using device_info_t = lapack::device_info_int; + using blas::real; + + // Assumes column major + const Layout layout = Layout::ColMajor; + + const int priority_zero = 0; + const int priority_one = 1; + const int life_factor_one = 1; + const bool set_hold = lookahead > 0; // Do tileGetAndHold in the bcast + + int64_t A_mt = A.mt(); + int64_t A_nt = A.nt(); + int64_t A_min_mtnt = std::min(A_mt, A_nt); + + T.clear(); + T.push_back(A.emptyLike()); + T.push_back(A.emptyLike(ib, 0)); + auto Tlocal = T[0]; + auto Treduce = T[1]; + + // workspace + auto W = A.emptyLike(); + + // setting up dummy variables for case the when target == host + int64_t num_devices = A.num_devices(); + int panel_device = -1; + size_t work_size = 0; + + std::vector< scalar_t* > dwork_array( num_devices, nullptr ); + + if (target == Target::Devices) { + const int64_t batch_size_zero = 0; // use default batch size + const int64_t num_queues = 3 + lookahead; + A.allocateBatchArrays(batch_size_zero, num_queues); + A.reserveDeviceWorkspace(); + W.allocateBatchArrays(batch_size_zero, num_queues); + // todo: this is demanding too much device workspace memory + // only one tile-row of matrix W per MPI process is going to be used, + // but W with size of whole A is being allocated + // thus limiting the matrix size that can be processed + // For now, allocate workspace tiles 1-by-1. + //W.reserveDeviceWorkspace(); + + // Find largest panel size and device for copying to + // contiguous memory within internal geqrf routine + int64_t mlocal = 0; + int64_t first_panel_seen = -1; + for (int64_t j = 0; j < A.nt(); ++j) { + for (int64_t i = j; i < A.mt(); ++i) { + if (A.tileIsLocal(i, j)) { + if (first_panel_seen < 0) { + first_panel_seen = j; + } + if (first_panel_seen == j) { + if (panel_device < 0) { + panel_device = A.tileDevice(i, j); + } + mlocal += A.tileMb(i); + // Asserting 1-D distribution for device + assert( panel_device == A.tileDevice(i, j) ); + } + } + } + if (first_panel_seen >= 0) { + break; + } + } + + if (panel_device >= 0) { + + blas::set_device( panel_device ); + + lapack::Queue* queue = A.compute_queue(panel_device, 0); + + int64_t nb = A.tileNb(0); + size_t size_tau = (size_t) std::min( mlocal, nb ); + size_t size_A = (size_t) blas::max( 1, mlocal ) * nb; + size_t hsize, dsize; + + // Find size of the workspace needed + lapack::geqrf_work_size_bytes( mlocal, nb, dwork_array[0], mlocal, + &dsize, &hsize, *queue ); + + // Size of dA, dtau, dwork and dinfo + work_size = size_A + size_tau + ceildiv(dsize, sizeof(scalar_t)) + + ceildiv(sizeof(device_info_t), sizeof(scalar_t)); + + for (int64_t dev = 0; dev < num_devices; ++dev) { + blas::set_device(dev); + dwork_array[dev] = blas::device_malloc(work_size); + } + } + } + + // QR tracks dependencies by block-column. + // OpenMP needs pointer types, but vectors are exception safe + std::vector< uint8_t > block_vector(A_nt); + uint8_t* block = block_vector.data(); + + // set min number for omp nested active parallel regions + slate::OmpSetMaxActiveLevels set_active_levels( MinOmpActiveLevels ); + + #pragma omp parallel + #pragma omp master + { + for (int64_t k = 0; k < A_min_mtnt; ++k) { + int64_t i_end = std::min( A_mt-1, (A_mt-A_nt) + k); + + auto A_panel = A.sub(k, i_end, k, k); + auto Tl_panel = Tlocal.sub(k, i_end, k, k); + auto Tr_panel = Treduce.sub(k, i_end, k, k); + + std::vector< int64_t > first_indices; + geqrf_compute_first_indices(A_panel, k, first_indices); + // todo: pass first_indices into internal geqrf or ttqrt? + + // panel, high priority + #pragma omp task depend(inout:block[k]) priority(priority_one) + { + // local panel factorization + internal::geqrf( + std::move(A_panel), + std::move(Tl_panel), + dwork_array, work_size, + ib, max_panel_threads, priority_one); + + // triangle-triangle reductions + // ttqrt handles tile transfers internally + internal::ttqrt( + std::move(A_panel), + std::move(Tr_panel)); + + // if a trailing matrix exists + if (k < A_nt-1) { + + // bcast V across row for trailing matrix update + if (k < i_end+1) { + BcastList bcast_list_V_first; + BcastList bcast_list_V; + for (int64_t i = k; i < i_end+1; ++i) { + // send A(i, k) across row A(i, k+1:nt-1) + // Vs in first_indices (except the main diagonal one) need three lives + if ((std::find(first_indices.begin(), first_indices.end(), i) != first_indices.end()) && (i > k)) + bcast_list_V_first.push_back({i, k, {A.sub(i, i, k+1, A_nt-1)}}); + else + bcast_list_V.push_back({i, k, {A.sub(i, i, k+1, A_nt-1)}}); + } + A.template listBcast(bcast_list_V_first, layout, 0, 3, set_hold); + A.template listBcast(bcast_list_V, layout, 0, 2, set_hold); + } + + // bcast Tlocal across row for trailing matrix update + if (first_indices.size() > 0) { + BcastList bcast_list_T; + for (int64_t row : first_indices) { + bcast_list_T.push_back({row, k, {Tlocal.sub(row, row, k+1, A_nt-1)}}); + } + Tlocal.template listBcast(bcast_list_T, layout, k, life_factor_one, set_hold); + } + + // bcast Treduce across row for trailing matrix update + if (first_indices.size() > 1) { + BcastList bcast_list_T; + for (int64_t row : first_indices) { + if (row > k) // exclude the first row of this panel that has no Treduce tile + bcast_list_T.push_back({row, k, {Treduce.sub(row, row, k+1, A_nt-1)}}); + } + Treduce.template listBcast(bcast_list_T, layout); + } + } + } + + // update lookahead column(s) on CPU, high priority + for (int64_t j = k+1; j < (k+1+lookahead) && j < A_nt; ++j) { + + auto A_trail_j = A.sub(k, i_end, j, j); + + #pragma omp task depend(in:block[k]) \ + depend(inout:block[j]) \ + priority(priority_one) + { + // Apply local reflectors + internal::unmqr( + Side::Left, Op::ConjTrans, + std::move(A_panel), + std::move(Tl_panel), + std::move(A_trail_j), + W.sub(k, i_end, j, j), + priority_one, j-k+1); + + // Apply triangle-triangle reduction reflectors + // ttmqr handles the tile broadcasting internally + internal::ttmqr( + Side::Left, Op::ConjTrans, + std::move(A_panel), + std::move(Tr_panel), + std::move(A_trail_j), + j); + } + } + + // update trailing submatrix, normal priority + if (k+1+lookahead < A_nt) { + int64_t j = k+1+lookahead; + auto A_trail_j = A.sub(k, i_end, j, A_nt-1); + + #pragma omp task depend(in:block[k]) \ + depend(inout:block[k+1+lookahead]) \ + depend(inout:block[A_nt-1]) + { + // Apply local reflectors. + internal::unmqr( + Side::Left, Op::ConjTrans, + std::move(A_panel), + std::move(Tl_panel), + std::move(A_trail_j), + W.sub(k, i_end, j, A_nt-1), + priority_zero, j-k+1); + + // Apply triangle-triangle reduction reflectors. + // ttmqr handles the tile broadcasting internally. + internal::ttmqr( + Side::Left, Op::ConjTrans, + std::move(A_panel), + std::move(Tr_panel), + std::move(A_trail_j), + j); + } + } + if (target == Target::Devices) { + // Update the status of the on-hold tiles held by the invocation of + // the tileBcast routine, and then release them to free up memory. + // The origin must be updated with the latest modified copy + // for memory consistency. + // TODO: Find better solution to handle tile release, and + // investigate the correctness of the task dependency + if (k >= lookahead && k < A_nt-1) { + #pragma omp task depend(in:block[k]) \ + depend(inout:block[k+1]) + { + int64_t k_la = k-lookahead; + for (int64_t i = k_la; i < A_mt; ++i) { + if (A.tileIsLocal(i, k_la)) { + A.tileUpdateOrigin(i, k_la); + + std::set dev_set; + A.sub(i, i, k_la+1, A_nt-1).getLocalDevices(&dev_set); + + for (auto device : dev_set) { + A.tileUnsetHold(i, k_la, device); + A.tileRelease(i, k_la, device); + } + } + } + + auto A_panel_k_la = A.sub(k_la, A_mt-1, k_la, k_la); + std::vector< int64_t > first_indices_k_la; + geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la); + if (first_indices.size() > 0) { + for (int64_t row : first_indices_k_la) { + if (Tlocal.tileIsLocal(row, k_la)) { + Tlocal.tileUpdateOrigin(row, k_la); + + std::set dev_set; + Tlocal.sub(row, row, k_la+1, A_nt-1).getLocalDevices(&dev_set); + + for (auto device : dev_set) { + Tlocal.tileUnsetHold(row, k_la, device); + Tlocal.tileRelease(row, k_la, device); + } + } + } + } + } + } + } + } + + #pragma omp taskwait + A.tileUpdateAllOrigin(); + } + + A.releaseWorkspace(); + + if (target == Target::Devices) { + for (int64_t dev = 0; dev < num_devices; ++dev) { + blas::set_device(dev); + blas::device_free( dwork_array[dev] ); + dwork_array[dev] = nullptr; + } + } +} + +} // namespace specialization +} // namespace internal + +//------------------------------------------------------------------------------ +/// Version with target as template parameter. +/// @ingroup geqrf_qdwh_full_specialization +/// +template +void geqrf_qdwh_full(Matrix& A, + TriangularFactors& T, + Options const& opts) +{ + int64_t lookahead = get_option( opts, Option::Lookahead, 1 ); + + int64_t ib = get_option( opts, Option::InnerBlocking, 16 ); + + int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1); + max_panel_threads = get_option( opts, Option::MaxPanelThreads, max_panel_threads ); + + internal::specialization::geqrf_qdwh_full(internal::TargetType(), + A, T, + ib, max_panel_threads, lookahead); +} + +//------------------------------------------------------------------------------ +/// Distributed parallel QR factorization. +/// +/// Computes a QR factorization of an m-by-n matrix $A$. +/// The factorization has the form +/// \[ +/// A = QR, +/// \] +/// where $Q$ is a matrix with orthonormal columns and $R$ is upper triangular +/// (or upper trapezoidal if m < n). +/// +/// Complexity (in real): +/// - for $m \ge n$, $\approx 2 m n^{2} - \frac{2}{3} n^{3}$ flops; +/// - for $m \le n$, $\approx 2 m^{2} n - \frac{2}{3} m^{3}$ flops; +/// - for $m = n$, $\approx \frac{4}{3} n^{3}$ flops. +/// . +//------------------------------------------------------------------------------ +/// @tparam scalar_t +/// One of float, double, std::complex, std::complex. +//------------------------------------------------------------------------------ +/// @param[in,out] A +/// On entry, the m-by-n matrix $A$. +/// On exit, the elements on and above the diagonal of the array contain +/// the min(m,n)-by-n upper trapezoidal matrix $R$ (upper triangular +/// if m >= n); the elements below the diagonal represent the unitary +/// matrix $Q$ as a product of elementary reflectors. +/// +/// @param[out] T +/// On exit, triangular matrices of the block reflectors. +/// +/// @param[in] opts +/// Additional options, as map of name = value pairs. Possible options: +/// - Option::Lookahead: +/// Number of panels to overlap with matrix updates. +/// lookahead >= 0. Default 1. +/// - Option::InnerBlocking: +/// Inner blocking to use for panel. Default 16. +/// - Option::MaxPanelThreads: +/// Number of threads to use for panel. Default omp_get_max_threads()/2. +/// - Option::Target: +/// Implementation to target. Possible values: +/// - HostTask: OpenMP tasks on CPU host [default]. +/// - HostNest: nested OpenMP parallel for loop on CPU host. +/// - HostBatch: batched BLAS on CPU host. +/// - Devices: batched BLAS on GPU device. +/// +/// @ingroup geqrf_computational +/// +template +void geqrf_qdwh_full(Matrix& A, + TriangularFactors& T, + Options const& opts) +{ + Target target = get_option( opts, Option::Target, Target::HostTask ); + + switch (target) { + case Target::Host: + case Target::HostTask: + geqrf_qdwh_full(A, T, opts); + break; + case Target::HostNest: + geqrf_qdwh_full(A, T, opts); + break; + case Target::HostBatch: + geqrf_qdwh_full(A, T, opts); + break; + case Target::Devices: + geqrf_qdwh_full(A, T, opts); + break; + } + // todo: return value for errors? +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void geqrf_qdwh_full( + Matrix& A, + TriangularFactors& T, + Options const& opts); + +template +void geqrf_qdwh_full( + Matrix& A, + TriangularFactors& T, + Options const& opts); + +template +void geqrf_qdwh_full< std::complex >( + Matrix< std::complex >& A, + TriangularFactors< std::complex >& T, + Options const& opts); + +template +void geqrf_qdwh_full< std::complex >( + Matrix< std::complex >& A, + TriangularFactors< std::complex >& T, + Options const& opts); + +} // namespace slate diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc new file mode 100644 index 000000000..563ac796b --- /dev/null +++ b/src/unmqr_qdwh_full.cc @@ -0,0 +1,403 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include "slate/slate.hh" +#include "auxiliary/Debug.hh" +#include "slate/Matrix.hh" +#include "internal/Tile_tpmqrt.hh" +#include "internal/internal.hh" + +namespace slate { + +// specialization namespace differentiates, e.g., +// internal::unmqr_qdwh_full from internal::specialization::unmqr_qdwh_full +namespace internal { +namespace specialization { + +//------------------------------------------------------------------------------ +/// Distributed parallel multiply by Q from QR factorization. +/// Generic implementation for any target. +/// @ingroup geqrf_specialization +/// +template +void unmqr_qdwh_full( + slate::internal::TargetType, + Side side, Op op, + Matrix& A, + TriangularFactors& T, + Matrix& C) +{ + // trace::Block trace_block("unmqr"); + // const int priority_one = 1; + using BcastList = typename Matrix::BcastList; + + // Assumes column major + const Layout layout = Layout::ColMajor; + + int64_t A_mt = A.mt(); + int64_t A_nt = A.nt(); + int64_t A_min_mtnt = std::min(A_mt, A_nt); + int64_t C_mt = C.mt(); + int64_t C_nt = C.nt(); + + if (is_complex::value && op == Op::Trans) { + throw Exception("Complex numbers uses Op::ConjTrans, not Op::Trans."); + } + + if (target == Target::Devices) { + C.allocateBatchArrays(); + C.reserveDeviceWorkspace(); + } + + // Reserve workspace + auto W = C.emptyLike(); + + if (target == Target::Devices) { + W.allocateBatchArrays(); + // todo: this is demanding too much device workspace memory + // only one tile-row of matrix W per MPI process is going to be used, + // but W with size of whole C is being allocated + // thus limiting the matrix size that can be processed + //W.reserveDeviceWorkspace(); + } + + assert(T.size() == 2); + auto Tlocal = T[0]; + auto Treduce = T[1]; + + // QR tracks dependencies by block-column. + // OpenMP needs pointer types, but vectors are exception safe + std::vector< uint8_t > block_vector(A_nt); + uint8_t* block = block_vector.data(); + + // set min number for omp nested active parallel regions + slate::OmpSetMaxActiveLevels set_active_levels( MinOmpActiveLevels ); + + #pragma omp parallel + #pragma omp master + { + int64_t k_begin, k_end, k_step; + if ((side == Side::Left) == (op == Op::NoTrans)) { + // Left, NoTrans: multiply Q C = Q1 ... QK C, or + // Right, (Conj)Trans: multiply C Q^H = C QK^H ... Q1^H, + // i.e., in reverse order of how Qk's were created. + k_begin = A_min_mtnt-1; + k_end = -1; + k_step = -1; + } + else { + // Left, (Conj)Trans: multiply Q^H C = QK^H ... Q1^H C, or + // Right, NoTrans: multiply C Q = C Q1 ... QK, + // i.e., in same order as Qk's were created. + k_begin = 0; + k_end = A_min_mtnt; + k_step = 1; + } + + // for k = k_begin, lastk = k_begin (no previous col to depend on); + // otherwise, lastk = k - k_step (previous col). + int64_t lastk = k_begin; + // OpenMP uses lastk; compiler doesn't, so warns it is unused. + SLATE_UNUSED(lastk); + for (int64_t k = k_begin; k != k_end; k += k_step) { + + int64_t i_end = std::min( A_mt-1, (A_mt-A_nt) + k); + + auto A_panel = A.sub(k, i_end, k, k); + + // Find ranks in this column. + std::set ranks_set; + A_panel.getRanks(&ranks_set); + assert(ranks_set.size() > 0); + + // Find each rank's first (top-most) row in this panel, + // where the triangular tile resulting from local geqrf + // panel will reside. + std::vector< int64_t > first_indices; + first_indices.reserve(ranks_set.size()); + for (int r: ranks_set) { + for (int64_t i = 0; i < A_panel.mt(); ++i) { + if (A_panel.tileRank(i, 0) == r) { + first_indices.push_back(i+k); + break; + } + } + } + + #pragma omp task depend(inout:block[k]) \ + depend(in:block[lastk]) + { + // Indices for row or col of C. + int64_t i0 = -1, i1 = -1, j0 = -1, j1 = -1; + if (side == Side::Left) { + j0 = 0; + j1 = C_nt-1; + } + else { + i0 = 0; + i1 = C_mt-1; + } + + // Send V(i) across row C(i, 0:nt-1) or col C(0:mt-1, i), + // for side = left or right, respectively. + BcastList bcast_list_V_top; + BcastList bcast_list_V; + for (int64_t i = k; i < i_end+1; ++i) { + if (side == Side::Left) { + i0 = i; + i1 = i; + } + else { + j0 = i; + j1 = i; + } + if (std::find(first_indices.begin(), first_indices.end(), i) != first_indices.end()) { + bcast_list_V_top.push_back( + {i, k, {C.sub(i0, i1, j0, j1)}}); + } + else { + bcast_list_V.push_back( + {i, k, {C.sub(i0, i1, j0, j1)}}); + } + } + // V tiles in first_indices need up to 5 lives: 1 for ttmqr, + // 2 + extra 2 if mb > nb (trapezoid) for Vs in unmqr I-VTV^T. + // This may leak a few tiles that A.clearWorkspace will cleanup. + A.template listBcast(bcast_list_V_top, layout, 0, 5); + A.template listBcast(bcast_list_V, layout, 0, 2); + + // Send Tlocal(i) across row C(i, 0:nt-1) or col C(0:mt-1, i). + if (first_indices.size() > 0) { + BcastList bcast_list_T; + for (int64_t i : first_indices) { + if (side == Side::Left) { + i0 = i; + i1 = i; + } + else { + j0 = i; + j1 = i; + } + bcast_list_T.push_back( + {i, k, {C.sub(i0, i1, j0, j1)}}); + } + Tlocal.template listBcast(bcast_list_T, layout); + } + + // Send Treduce(i) across row C(i, 0:nt-1) or col C(0:mt-1, i). + if (first_indices.size() > 1) { + BcastList bcast_list_T; + for (int64_t i : first_indices) { + if (side == Side::Left) { + i0 = i; + i1 = i; + } + else { + j0 = i; + j1 = i; + } + // Exclude first row of this panel, + // which doesn't have Treduce tile. + if (i > k) { + bcast_list_T.push_back( + {i, k, {C.sub(i0, i1, j0, j1)}}); + } + } + Treduce.template listBcast(bcast_list_T, layout); + } + + Matrix C_trail, W_trail; + if (side == Side::Left) { + C_trail = C.sub(k, i_end, 0, C_nt-1); + W_trail = W.sub(k, i_end, 0, C_nt-1); + } + else { + C_trail = C.sub(0, i_end, k, C_nt-1); + W_trail = W.sub(0, i_end, k, C_nt-1); + } + + // Left, NoTrans: Qi C = Qi_local Qi_reduce C, or + // Right, (Conj)Trans: C Qi^H = C Qi_reduce^H Qi_local^H, + // do ttmqr then unmqr. + if ((side == Side::Left) == (op == Op::NoTrans)) { + // Apply triangle-triangle reduction reflectors. + internal::ttmqr( + side, op, + std::move(A_panel), + Treduce.sub(k, i_end, k, k), + std::move(C_trail)); + } + + // Apply local reflectors. + internal::unmqr( + side, op, + std::move(A_panel), + Tlocal.sub(k, i_end, k, k), + std::move(C_trail), + std::move(W_trail)); + + // Left, (Conj)Trans: Qi^H C = Qi_reduce^H Qi_local^H C, or + // Right, NoTrans: C Qi = C Qi_local Qi_reduce, + // do unmqr then ttmqr. + if ((side == Side::Left) != (op == Op::NoTrans)) { + // Apply triangle-triangle reduction reflectors. + internal::ttmqr( + side, op, + std::move(A_panel), + Treduce.sub(k, A_mt-1, k, k), + std::move(C_trail)); + } + } + + lastk = k; + } + + #pragma omp taskwait + C.tileUpdateAllOrigin(); + } + + A.clearWorkspace(); + C.clearWorkspace(); +} + +} // namespace specialization +} // namespace internal + +//------------------------------------------------------------------------------ +/// Version with target as template parameter. +/// @ingroup geqrf_specialization +/// +template +void unmqr_qdwh_full( + Side side, Op op, + Matrix& A, + TriangularFactors& T, + Matrix& C, + Options const& opts) +{ + internal::specialization::unmqr_qdwh_full(internal::TargetType(), + side, op, A, T, C); +} + +//------------------------------------------------------------------------------ +/// Distributed parallel multiply by $Q$ from QR factorization. +/// +/// Multiplies the general m-by-n matrix $C$ by $Q$ from QR factorization, +/// according to: +/// +/// op | side = Left | side = Right +/// --------------- | ------------- | -------------- +/// op = NoTrans | $Q C $ | $C Q $ +/// op = ConjTrans | $Q^H C$ | $C Q^H$ +/// +/// where $Q$ is a unitary matrix defined as the product of k +/// elementary reflectors +/// \[ +/// Q = H(1) H(2) . . . H(k) +/// \] +/// as returned by geqrf. $Q$ is of order m if side = Left +/// and of order n if side = Right. +/// +//------------------------------------------------------------------------------ +/// @tparam scalar_t +/// One of float, double, std::complex, std::complex. +//------------------------------------------------------------------------------ +/// @param[in] side +/// - Side::Left: apply $Q$ or $Q^H$ from the left; +/// - Side::Right: apply $Q$ or $Q^H$ from the right. +/// +/// @param[in] op +/// - Op::NoTrans apply $Q$; +/// - Op::ConjTrans: apply $Q^H$; +/// - Op::Trans: apply $Q^T$ (only if real). +/// In the real case, Op::Trans is equivalent to Op::ConjTrans. +/// In the complex case, Op::Trans is not allowed. +/// +/// @param[in] A +/// Details of the QR factorization of the original matrix $A$ as returned +/// by geqrf. +/// +/// @param[in] T +/// Triangular matrices of the block reflectors as returned by geqrf. +/// +/// @param[in,out] C +/// On entry, the m-by-n matrix $C$. +/// On exit, $C$ is overwritten by $Q C$, $Q^H C$, $C Q$, or $C Q^H$. +/// +/// @param[in] opts +/// Additional options, as map of name = value pairs. Possible options: +/// - Option::Target: +/// Implementation to target. Possible values: +/// - HostTask: OpenMP tasks on CPU host [default]. +/// - HostNest: nested OpenMP parallel for loop on CPU host. +/// - HostBatch: batched BLAS on CPU host. +/// - Devices: batched BLAS on GPU device. +/// +/// @ingroup geqrf_computational +/// +template +void unmqr_qdwh_full( + Side side, Op op, + Matrix& A, + TriangularFactors& T, + Matrix& C, + Options const& opts) +{ + Target target = get_option( opts, Option::Target, Target::HostTask ); + + switch (target) { + case Target::Host: + case Target::HostTask: + default: + unmqr_qdwh_full(side, op, A, T, C, opts); + break; + case Target::HostNest: + unmqr_qdwh_full(side, op, A, T, C, opts); + break; + case Target::HostBatch: + unmqr_qdwh_full(side, op, A, T, C, opts); + break; + case Target::Devices: + unmqr_qdwh_full(side, op, A, T, C, opts); + break; + } + // todo: return value for errors? +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void unmqr_qdwh_full( + Side side, Op op, + Matrix& A, + TriangularFactors& T, + Matrix& C, + Options const& opts); + +template +void unmqr_qdwh_full( + Side side, Op op, + Matrix& A, + TriangularFactors& T, + Matrix& C, + Options const& opts); + +template +void unmqr_qdwh_full< std::complex >( + Side side, Op op, + Matrix< std::complex >& A, + TriangularFactors< std::complex >& T, + Matrix< std::complex >& C, + Options const& opts); + +template +void unmqr_qdwh_full< std::complex >( + Side side, Op op, + Matrix< std::complex >& A, + TriangularFactors< std::complex >& T, + Matrix< std::complex >& C, + Options const& opts); + +} // namespace slate From 554769f8fa067d52c42ae033809363bc48d11679 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 31 Oct 2022 16:06:50 -0400 Subject: [PATCH 04/49] Test qdwh --- test/test.cc | 2 + test/test.hh | 3 + test/test_qdwh.cc | 216 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 221 insertions(+) create mode 100644 test/test_qdwh.cc diff --git a/test/test.cc b/test/test.cc index 0fac169a8..d75dfdfbb 100644 --- a/test/test.cc +++ b/test/test.cc @@ -277,6 +277,8 @@ std::vector< testsweeper::routines_t > routines = { { "syset", test_set, Section::aux }, { "heset", test_set, Section::aux }, { "", nullptr, Section::newline }, + + { "qdwh", test_qdwh, Section::qr }, }; // ----------------------------------------------------------------------------- diff --git a/test/test.hh b/test/test.hh index cbe314bf2..592333fc3 100644 --- a/test/test.hh +++ b/test/test.hh @@ -256,6 +256,9 @@ void test_scale (Params& params, bool run); void test_scale_row_col(Params& params, bool run); void test_set (Params& params, bool run); +// polar decomposition +void test_qdwh (Params& params, bool run); + // ----------------------------------------------------------------------------- inline slate::Dist str2dist(const char* dist) { diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc new file mode 100644 index 000000000..1281a1e05 --- /dev/null +++ b/test/test_qdwh.cc @@ -0,0 +1,216 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include "slate/slate.hh" +#include "test.hh" +#include "blas/flops.hh" +#include "lapack/flops.hh" +#include "print_matrix.hh" +#include "grid_utils.hh" + +#include "scalapack_wrappers.hh" +#include "scalapack_support_routines.hh" +#include "scalapack_copy.hh" + +#include +#include +#include +#include +#define SLATE_HAVE_SCALAPACK +//------------------------------------------------------------------------------ +template +void test_qdwh_work(Params& params, bool run) +{ + using real_t = blas::real_type; + using blas::real; + + // Constants + const scalar_t zero = 0; + const scalar_t one = 1; + + // get & mark input values + int64_t m = params.dim.m(); + int64_t n = params.dim.n(); + int p = params.grid.m(); + int q = params.grid.n(); + int64_t nb = params.nb(); + int64_t ib = params.ib(); + int64_t lookahead = params.lookahead(); + int64_t panel_threads = params.panel_threads(); + bool ref_only = params.ref() == 'o'; + bool ref = params.ref() == 'y' || ref_only; + bool check = params.check() == 'y' && ! ref_only; + bool trace = params.trace() == 'y'; + slate::Origin origin = params.origin(); + slate::Target target = params.target(); + params.matrix.mark(); + + // mark non-standard output values + params.time(); + params.gflops(); + params.ref_time(); + params.ref_gflops(); + params.ortho(); + + if (! run) + return; + + slate::Options const opts = { + {slate::Option::Lookahead, lookahead}, + {slate::Option::Target, target}, + {slate::Option::MaxPanelThreads, panel_threads}, + {slate::Option::InnerBlocking, ib} + }; + + // MPI variables + int mpi_rank, myrow, mycol; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + gridinfo(mpi_rank, p, q, &myrow, &mycol); + + + // BLACS/MPI variables + // matrix A, figure out local size, allocate, create descriptor, initialize + int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); + int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); + int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + std::vector A_data; + + int64_t mlocH = num_local_rows_cols(n, nb, myrow, p); + int64_t nlocH = num_local_rows_cols(n, nb, mycol, q); + int64_t lldH = blas::max(1, mlocH); // local leading dimension of A + std::vector H_data; + + // matrix QR, for checking result + slate::Matrix Id; + Id = slate::Matrix(n, n, nb, p, q, MPI_COMM_WORLD); + Id.insertLocalTiles(); + + slate::Matrix A; + slate::Matrix H; + if (origin != slate::Origin::ScaLAPACK) { + // Copy local ScaLAPACK data to GPU or CPU tiles. + slate::Target origin_target = origin2target(origin); + A = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); + A.insertLocalTiles(origin_target); + //copy(&A_tst[0], descA_tst, A); + H = slate::Matrix(n, n, nb, p, q, MPI_COMM_WORLD); + H.insertLocalTiles(origin_target); + //copy(&H_tst[0], descH_tst, H); + } + else { + // create SLATE matrices from the ScaLAPACK layouts + A_data.resize( lldA * nlocA ); + H_data.resize( lldH * nlocH ); + A = slate::Matrix::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + H = slate::Matrix::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); + } + + real_t cond = 1 / std::numeric_limits::epsilon(); + params.matrix.kind.set_default("svd"); + params.matrix.cond.set_default(cond); + + slate::generate_matrix( params.matrix, A); + + // if check is required, copy test data + slate::Matrix Aref; + std::vector Aref_data; + if (check || ref) { + Aref_data.resize( lldA * nlocA ); + Aref = slate::Matrix::fromScaLAPACK( + m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + slate::copy(A, Aref); + } + + print_matrix("A", A, params); + + // todo: how to compute gflops? + double gflop = lapack::Gflop::geqrf(m, n); + + if (! ref_only) { + if (trace) slate::trace::Trace::on(); + else slate::trace::Trace::off(); + + double time = barrier_get_wtime(MPI_COMM_WORLD); + + //================================================== + // Run SLATE test. + //================================================== + // A = AH, + // A will be written by the orthogonal polar factor + // H is the symmetric positive semidefinite polar factor + slate::qdwh(A, H, opts); + + time = barrier_get_wtime(MPI_COMM_WORLD) - time; + + if (trace) slate::trace::Trace::finish(); + + // compute and save timing/performance + params.time() = time; + params.gflops() = gflop / time; + + print_matrix("U: orthogonal polar factor", A, params); + print_matrix("H: symmetric postive semi definite factor", H, params); + } + + if (check) { + //================================================== + // Test results by checking the orthogonality of U factor + // + // || A'A - I ||_f + // ---------------- < tol * epsilon + // n + // + //================================================== + auto AT = conj_transpose(A); + set(zero, one, Id); + slate::gemm(one, AT, A, -one, Id, opts); + real_t orth = slate::norm(slate::Norm::Fro, Id); + params.ortho() = orth / sqrt(n); + + //================================================== + // Test results by checking backwards error + // + // || A'H - Aref ||_f + // ------------------- < tol * epsilon + // ||Aref||_f + // + //================================================== + real_t normA = slate::norm(slate::Norm::Fro, Aref); + slate::gemm(one, A, H, -one, Aref, opts); + real_t berr = slate::norm(slate::Norm::Fro, Aref); + params.error() = berr / normA; + + real_t tol = params.tol() * 0.5 * std::numeric_limits::epsilon(); + params.okay() = ((params.error() <= tol) && (params.ortho() <= tol)); + + + } +} + +// ----------------------------------------------------------------------------- +void test_qdwh(Params& params, bool run) +{ + switch (params.datatype()) { + case testsweeper::DataType::Integer: + throw std::exception(); + break; + + case testsweeper::DataType::Single: + test_qdwh_work (params, run); + break; + + case testsweeper::DataType::Double: + test_qdwh_work (params, run); + break; + + case testsweeper::DataType::SingleComplex: + test_qdwh_work> (params, run); + break; + + case testsweeper::DataType::DoubleComplex: + test_qdwh_work> (params, run); + break; + } +} From 3fa51a1970dd1183aa9292d4312dc0c22ef10243 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 31 Oct 2022 16:07:42 -0400 Subject: [PATCH 05/49] call geqrf/unmqr_qdwh_full --- src/qdwh.cc | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index 1789db558..4296cc9f5 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -22,7 +22,7 @@ namespace impl { /// @ingroup geqrf_specialization /// template -void qdwh(slate::internal::TargetType, +void qdwh( Matrix& A, Matrix& H, // this matrix will be hermition Options const& opts) @@ -31,7 +31,6 @@ void qdwh(slate::internal::TargetType, using blas::real; int64_t lookahead = get_option( opts, Option::Lookahead, 1 ); - int64_t ib = get_option( opts, Option::InnerBlocking, 16 ); int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1); max_panel_threads = get_option( opts, Option::MaxPanelThreads, max_panel_threads ); @@ -56,7 +55,8 @@ void qdwh(slate::internal::TargetType, real_t tol1 = 5. * eps; real_t tol3 = pow(tol1, 1./3.); - double L2, sqd, dd, a1, a, b, c; + real_t L2, sqd, dd, a1, a, b, c; + //double L2, sqd, dd, a1, a, b, c; real_t Li, Liconv; real_t conv = 100.; scalar_t zero = 0.0, one = 1.0, minusone = -1.0; @@ -152,7 +152,8 @@ void qdwh(slate::internal::TargetType, } itconv++; - L2 = double (Liconv * Liconv); + L2 = Liconv * Liconv; + //L2 = double (Liconv * Liconv); dd = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 ); sqd = sqrt(1.0 + dd); a1 = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0; @@ -173,7 +174,8 @@ void qdwh(slate::internal::TargetType, it++; // Compute parameters L,a,b,c - L2 = double (Li * Li); + //L2 = double (Li * Li); + L2 = Li * Li; dd = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 ); sqd = sqrt(1.0 + dd); a1 = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0; @@ -204,13 +206,13 @@ void qdwh(slate::internal::TargetType, // Factorize B = QR, and generate the associated Q add(alpha, A, zero, W1, opts); //geqrf(W, T1, opts); // naive impl - //geqrf_qdwh_full(W, T1, opts); + geqrf_qdwh_full(W, T1, opts); set(zero, one, Q1); set(zero, zero, Q2); //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts); //naive impl - //unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts); + unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts); // A = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * A auto Q2T = conj_transpose(Q2); @@ -330,7 +332,8 @@ void qdwh(slate::internal::TargetType, /// @ingroup geqrf_computational /// template -void qdwh(Matrix& A, +void qdwh( + Matrix& A, Matrix& H, Options const& opts) { @@ -340,20 +343,16 @@ void qdwh(Matrix& A, switch (target) { case Target::Host: case Target::HostTask: - impl::qdwh( TargetType(), - A, H, opts); + impl::qdwh( A, H, opts ); break; case Target::HostNest: - impl::qdwh( TargetType(), - A, H, opts); + impl::qdwh( A, H, opts ); break; case Target::HostBatch: - impl::qdwh( TargetType(), - A, H, opts); + impl::qdwh( A, H, opts ); break; case Target::Devices: - impl::qdwh( TargetType(), - A, H, opts); + impl::qdwh( A, H, opts ); break; } // todo: return value for errors? From ad698cc32081e45d12459201586c823afb4c6ad4 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 31 Oct 2022 16:09:21 -0400 Subject: [PATCH 06/49] Added qdwh codes to slate.hh and to makefile --- GNUmakefile | 5 +++++ include/slate/slate.hh | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/GNUmakefile b/GNUmakefile index b96c6ffa6..7021560f5 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -561,6 +561,10 @@ ifneq ($(only_unit),1) src/work/work_trmm.cc \ src/work/work_trsm.cc \ src/work/work_trsmA.cc \ + src/qdwh.cc \ + src/normest.cc \ + src/geqrf_qdwh_full.cc \ + src/unmqr_qdwh_full.cc \ # End. Add alphabetically. endif @@ -662,6 +666,7 @@ ifneq ($(have_fortran),) test/pdlantr.f \ test/pclantr.f \ test/pzlantr.f \ + test/test_qdwh.cc \ # End. Add alphabetically, by base name after precision. endif diff --git a/include/slate/slate.hh b/include/slate/slate.hh index 16d36329f..22d249fc8 100644 --- a/include/slate/slate.hh +++ b/include/slate/slate.hh @@ -1085,6 +1085,42 @@ void steqr2( Matrix& Z, Options const& opts = Options()); +//----------------------------------------- +// Polar Decomposition + +//----------------------------------------- +// qdwh() +template +void qdwh( + Matrix& A, + Matrix& H, // this matrix will be hermition + Options const& opts = Options()); + +//----------------------------------------- +// normest() +template +blas::real_type +normest( + Norm in_norm, + matrix_type &A, + Options const& opts = Options()); + +//----------------------------------------- +// geqrf_qdwh_full() +template +void geqrf_qdwh_full( + Matrix& A, TriangularFactors& T, + Options const& opts = Options()); + +//----------------------------------------- +// unmqr_qdwh_full() +template +void unmqr_qdwh_full( + Side side, Op op, + Matrix& A, TriangularFactors& T, + Matrix& C, + Options const& opts = Options()); + } // namespace slate //----------------------------------------- From d9abb8fd535334891e2b525b6f443fe07335d6f5 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 10 Nov 2022 08:38:31 -0500 Subject: [PATCH 07/49] fixes on i_end and on releasing tiles --- src/geqrf_qdwh_full.cc | 9 +++++---- src/unmqr_qdwh_full.cc | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc index 66752f7f8..9178b0d84 100644 --- a/src/geqrf_qdwh_full.cc +++ b/src/geqrf_qdwh_full.cc @@ -176,7 +176,7 @@ void geqrf_qdwh_full(slate::internal::TargetType, #pragma omp master { for (int64_t k = 0; k < A_min_mtnt; ++k) { - int64_t i_end = std::min( A_mt-1, (A_mt-A_nt) + k); + int64_t i_end = A_min_mtnt + k; auto A_panel = A.sub(k, i_end, k, k); auto Tl_panel = Tlocal.sub(k, i_end, k, k); @@ -206,7 +206,7 @@ void geqrf_qdwh_full(slate::internal::TargetType, if (k < A_nt-1) { // bcast V across row for trailing matrix update - if (k < i_end+1) { + if (k < i_end) { BcastList bcast_list_V_first; BcastList bcast_list_V; for (int64_t i = k; i < i_end+1; ++i) { @@ -311,7 +311,8 @@ void geqrf_qdwh_full(slate::internal::TargetType, depend(inout:block[k+1]) { int64_t k_la = k-lookahead; - for (int64_t i = k_la; i < A_mt; ++i) { + int64_t i_end_la = A_min_mtnt + k_la; + for (int64_t i = k_la; i < i_end_la; ++i) { if (A.tileIsLocal(i, k_la)) { A.tileUpdateOrigin(i, k_la); @@ -325,7 +326,7 @@ void geqrf_qdwh_full(slate::internal::TargetType, } } - auto A_panel_k_la = A.sub(k_la, A_mt-1, k_la, k_la); + auto A_panel_k_la = A.sub(k_la, i_end_la, k_la, k_la); std::vector< int64_t > first_indices_k_la; geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la); if (first_indices.size() > 0) { diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc index 563ac796b..9c4987aa0 100644 --- a/src/unmqr_qdwh_full.cc +++ b/src/unmqr_qdwh_full.cc @@ -103,7 +103,7 @@ void unmqr_qdwh_full( SLATE_UNUSED(lastk); for (int64_t k = k_begin; k != k_end; k += k_step) { - int64_t i_end = std::min( A_mt-1, (A_mt-A_nt) + k); + int64_t i_end = A_min_mtnt + k; auto A_panel = A.sub(k, i_end, k, k); From 8dcff0284d4b421c7a4fe30aee92a98eb8d0e625 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 10 Nov 2022 08:40:18 -0500 Subject: [PATCH 08/49] Set C to zero will make gemmA work --- src/normest.cc | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/normest.cc b/src/normest.cc index e0ed58fad..79ff32350 100644 --- a/src/normest.cc +++ b/src/normest.cc @@ -43,7 +43,8 @@ normest(slate::internal::TargetType, // Two norm estimation // First: let's compute the x vector such that // x_j = sum_i abs( A_{i,j} ), x here is global_sums.data - if (in_norm == Norm::One) { + // todo: should we add this to norm? + //if (in_norm == Norm::One_est) { using blas::min; @@ -71,6 +72,7 @@ normest(slate::internal::TargetType, std::vector local_sums(n); + // todo: do we still need reserveDevice here? if (target == Target::Devices) A.reserveDeviceWorkspace(); @@ -92,7 +94,7 @@ normest(slate::internal::TargetType, #pragma omp parallel #pragma omp master { - internal::norm(in_norm, NormScope::Matrix, std::move(A), local_sums.data()); + internal::norm(slate::Norm::One, NormScope::Matrix, std::move(A), local_sums.data()); } #pragma omp critical(slate_mpi) @@ -120,7 +122,7 @@ normest(slate::internal::TargetType, } // Third: start the while-loop X = X / ||X|| - while ( (cnt < maxiter) && (fabs((e - e0)) > (tol * (e))) ) { + while ((cnt < maxiter) && (fabs((e - e0)) > (tol * (e)))) { e0 = e; // Scale X = X / ||X|| @@ -130,9 +132,18 @@ normest(slate::internal::TargetType, // Compute Ax = A * sx gemm(one, A, X, zero, AX, opts); + // todo: still need to add the following + //if nnz(Sx) == 0 + // Sx = rand(size(Sx),class(Sx)); + //end + // Compute x = A' * A * x = A' * Ax - auto AT = conj_transpose(A); + auto AT = conjTranspose(A); + // todo: why this set is needed when using multiple mpi rank + // todo: send to Sebastien + set(zero, zero, XL); gemm(one, AT, AX, zero, XL, opts); + //gemmC(one, AT, AX, zero, XL, opts); // Compute ||X||, ||AX|| normX = norm(Norm::Fro, XL, opts); @@ -142,17 +153,15 @@ normest(slate::internal::TargetType, e = normX / normAX; cnt++; } - //if ( (cnt >= maxiter) && (fabs((e - e0)) > (tol * e)) ) { - //} A.clearWorkspace(); return e; - } - else { - slate_error("invalid norm."); - } + //} + //else { + // slate_error("invalid norm."); + //} } } // namespace specialization From f379a015e6df8add5641ae9c8987a6ab6e9ebc57 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 10 Nov 2022 08:42:07 -0500 Subject: [PATCH 09/49] minor cleaning and add todo --- src/qdwh.cc | 54 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index 4296cc9f5..27e77547f 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -12,14 +12,16 @@ namespace slate { // specialization namespace differentiates, e.g., // todo: is it ok? -// internal::geqrf from internal::specialization::geqrf +// internal::qdwh from impl::qdwh +// todo: works only if m % nb = 0 +// todo: works only for square matrices namespace impl { //------------------------------------------------------------------------------ /// Distributed parallel polar decomposition based on QDWH algorithm. /// Generic implementation for any target. -// todo: is it ok? -/// @ingroup geqrf_specialization +// todo: add to docs/doxygen +/// @ingroup qdwh_specialization /// template void qdwh( @@ -56,7 +58,6 @@ void qdwh( real_t tol3 = pow(tol1, 1./3.); real_t L2, sqd, dd, a1, a, b, c; - //double L2, sqd, dd, a1, a, b, c; real_t Li, Liconv; real_t conv = 100.; scalar_t zero = 0.0, one = 1.0, minusone = -1.0; @@ -66,19 +67,25 @@ void qdwh( real_t normA; real_t norminvR; - int64_t mt2 = 2*mt - 1; + int64_t mt2 = mt + nt - 1; // allocate m2*n work space required for the qr-based iterations // this allocation can be avoided if we change the qr iterations to // work on two matrices on top of each other - // if doing QR([A;Id]) + // allocation needed for doing QR([A;Id]) slate::Matrix W(m2, n, nb, nprow, npcol, MPI_COMM_WORLD); slate::Matrix Q(m2, n, nb, nprow, npcol, MPI_COMM_WORLD); + slate::TriangularFactors T; if (target == Target::Devices) { const int64_t batch_size_zero = 0; // use default batch size const int64_t num_queues = 3 + lookahead; + // todo: I have workspace here and in other call as geqrf_qdwh_full + // todo: in geqrf, it should use the allocated ones, it should not + // keep allocating + // todo: check traces to confirm + // or in printf in blaspp/device_malloc and here A.allocateBatchArrays(batch_size_zero, num_queues); A.reserveDeviceWorkspace(); W.allocateBatchArrays(batch_size_zero, num_queues); @@ -100,20 +107,18 @@ void qdwh( //W1.insertLocalTiles(); //W2.insertLocalTiles(); - slate::TriangularFactors T1; // backup A in H copy(A, H); // two norm estimation (largest singular value of A) - // todo: fix norm_est the call it - real_t norm_est = 1.; - //real_t norm_est = normest(slate::Norm::One, A); + real_t norm_est = normest(slate::Norm::One, A); alpha = 1.0; // scale the original matrix A to form A0 of the iterative loop add(alpha/(scalar_t)norm_est, H, zero, A, opts); + // todo: look at Lapack impl // Calculate Li: reciprocal of condition number estimation // Either 1) use LU followed by gecon // Or 2) QR followed by trtri @@ -123,11 +128,14 @@ void qdwh( // Estimate the condition number using QR // This Q factor can be used in the first QR-based iteration copy(A, W1); - slate::geqrf(W1, T1, opts); + // todo: this QR can be used in the first QR iteration + // todo: put bound of 1-norm est compared to 2-norm + slate::geqrf(W1, T, opts); auto R = TriangularMatrix( Uplo::Upper, slate::Diag::NonUnit, W1 ); auto Rh = HermitianMatrix( Uplo::Upper, W1 ); + // todo: cheaper to do triangular solve than calling trtri trtri(R, opts); //tzset(scalar_t(0.0), L); norminvR = norm(slate::Norm::One, Rh); @@ -204,15 +212,22 @@ void qdwh( //} // Factorize B = QR, and generate the associated Q + // todo: replace following add by copy and scale, but why scale only scale by a real numbers? add(alpha, A, zero, W1, opts); - //geqrf(W, T1, opts); // naive impl - geqrf_qdwh_full(W, T1, opts); + //copy(A, W1); + //scale(alpha, one, W1, opts); + //geqrf(W, T, opts); // naive impl + // todo: how to avoid allocating workspace for each iteration (3 QR)? + // todo: how to make it work for any matrix size and any nb? + // todo: check time for allocationn in geqrf + geqrf_qdwh_full(W, T, opts); set(zero, one, Q1); set(zero, zero, Q2); - //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts); //naive impl - unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T1, Q, opts); + // todo: do I need unmqr? + //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); //naive impl + unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); // A = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * A auto Q2T = conj_transpose(Q2); @@ -276,10 +291,19 @@ void qdwh( } } + if (A.mpiRank() == 0) { + printf("%-7s %-7s\n", "QR", "PO"); + printf("%-7d %-7d\n", itqr, itpo); + } + // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H) copy(H, W1); auto AT = conj_transpose(A); gemm(one, AT, W1, zero, H, opts); + + A.releaseWorkspace(); + W.releaseWorkspace(); + Q.releaseWorkspace(); } } // namespace impl From 63df04af4378209317ac0619dc35a8277d7094fc Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 12 Dec 2022 11:45:11 -0500 Subject: [PATCH 10/49] Allocate W with number of rows is roundup(number of A rows) + n to avoid having a clean up tile in the middle of the W matrix --- src/qdwh.cc | 105 +++++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 46 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index 27e77547f..aa79d9ea9 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -34,29 +34,17 @@ void qdwh( int64_t lookahead = get_option( opts, Option::Lookahead, 1 ); int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1); - max_panel_threads = get_option( opts, Option::MaxPanelThreads, max_panel_threads ); - int64_t mt = A.mt(); - int64_t nt = A.nt(); - - int64_t m = A.m(); - int64_t n = A.n(); - int64_t nb = A.tileMb(0); - int64_t m2 = m + n; + max_panel_threads = get_option( opts, Option::MaxPanelThreads, max_panel_threads ); - int nprow, npcol; - int myrow, mycol; - GridOrder order; - A.gridinfo(&order, &nprow, &npcol, &myrow, &mycol); + real_t eps = 0.5 * std::numeric_limits::epsilon(); + real_t tol1 = 5. * eps; + real_t tol3 = pow(tol1, 1./3.); bool optqr = 1; int itconv, it; int itqr = 0, itpo = 0; - real_t eps = 0.5 * std::numeric_limits::epsilon(); - real_t tol1 = 5. * eps; - real_t tol3 = pow(tol1, 1./3.); - real_t L2, sqd, dd, a1, a, b, c; real_t Li, Liconv; real_t conv = 100.; @@ -67,15 +55,30 @@ void qdwh( real_t normA; real_t norminvR; - int64_t mt2 = mt + nt - 1; + int64_t mt = A.mt(); + int64_t nt = A.nt(); + + int64_t m = A.m(); + int64_t n = A.n(); + int64_t nb = A.tileMb(0); + + // The QR-based iterations requires QR[A; Id], + // W = [A; Id], where size of A is mxn, size of Id is nxn + // To avoid having a clean up tile in the middle of the W matrix, + // we round up the number of A rows (m), + // so that size(W) = m_roundup + n + int64_t m_roundup = ceil( (m + nb)/ nb )*nb; + int64_t m_W = m_roundup + n; + int64_t mt_W = mt + nt; - // allocate m2*n work space required for the qr-based iterations - // this allocation can be avoided if we change the qr iterations to - // work on two matrices on top of each other + int nprow, npcol; + int myrow, mycol; + GridOrder order; + A.gridinfo(&order, &nprow, &npcol, &myrow, &mycol); - // allocation needed for doing QR([A;Id]) - slate::Matrix W(m2, n, nb, nprow, npcol, MPI_COMM_WORLD); - slate::Matrix Q(m2, n, nb, nprow, npcol, MPI_COMM_WORLD); + // allocate m_W*n work space required for the qr-based iterations QR([A;Id]) + slate::Matrix W(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD); + slate::Matrix Q(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD); slate::TriangularFactors T; if (target == Target::Devices) { @@ -96,17 +99,27 @@ void qdwh( W.insertLocalTiles(); Q.insertLocalTiles(); - auto W1 = W.sub(0, mt-1, 0, nt-1); // First mxn block of W - auto W2 = W.sub(mt, mt2, 0, nt-1); // Second nxn block of W - auto Q1 = Q.sub(0, mt-1, 0, nt-1); // First mxn block of W - auto Q2 = Q.sub(mt, mt2, 0, nt-1); // Second nxn block of W - // if doing QR([A]) QR([A;Id]) - //slate::Matrix W1(m, n, nb, nprow, npcol, MPI_COMM_WORLD); - //slate::Matrix W2(m, n, nb, nprow, npcol, MPI_COMM_WORLD); - //W1.insertLocalTiles(); - //W2.insertLocalTiles(); + // To compute QR[A; Id] + // First mt tiles of W contains A matrix in [0:m-1, 0:n-1] + // W1 have more rows than A, since we round up the number of rows of W1 + // W10 have A matrix + auto W1 = W.sub(0, mt-1, 0, nt-1); + auto W10 = W1.slice(0, m-1, 0, n-1); + + // Second nt tiles of W, contain the identity matrix + auto W2 = W.sub(mt, mt_W-1, 0, nt-1); + + // W11 is the extra padded rows [m:m_round_up, 0:n-1] + auto W11 = W1.slice(m, m_roundup-1, 0, n-1); + set(zero, zero, W11); + + auto Q1 = Q.sub(0, mt-1, 0, nt-1); + auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1); + auto Q10 = Q1.slice(0, m-1, 0, n-1); + auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1); + set(zero, zero, Q11); // backup A in H copy(A, H); @@ -127,14 +140,14 @@ void qdwh( if (optqr) { // Estimate the condition number using QR // This Q factor can be used in the first QR-based iteration - copy(A, W1); + copy(A, W10); // todo: this QR can be used in the first QR iteration // todo: put bound of 1-norm est compared to 2-norm slate::geqrf(W1, T, opts); auto R = TriangularMatrix( - Uplo::Upper, slate::Diag::NonUnit, W1 ); + Uplo::Upper, slate::Diag::NonUnit, W10 ); auto Rh = HermitianMatrix( - Uplo::Upper, W1 ); + Uplo::Upper, W10 ); // todo: cheaper to do triangular solve than calling trtri trtri(R, opts); //tzset(scalar_t(0.0), L); @@ -213,7 +226,7 @@ void qdwh( // Factorize B = QR, and generate the associated Q // todo: replace following add by copy and scale, but why scale only scale by a real numbers? - add(alpha, A, zero, W1, opts); + add(alpha, A, zero, W10, opts); //copy(A, W1); //scale(alpha, one, W1, opts); //geqrf(W, T, opts); // naive impl @@ -222,7 +235,7 @@ void qdwh( // todo: check time for allocationn in geqrf geqrf_qdwh_full(W, T, opts); - set(zero, one, Q1); + set(zero, one, Q10); set(zero, zero, Q2); // todo: do I need unmqr? @@ -235,9 +248,9 @@ void qdwh( beta = scalar_t( b / c ); // Copy U into C to check the convergence of QDWH if (it >= itconv ) { - copy(A, W1); + copy(A, W10); } - gemm(alpha, Q1, Q2T, beta, A, opts); + gemm(alpha, Q10, Q2T, beta, A, opts); // Main flops used in this step/ //flops_dgeqrf = FLOPS_DGEQRF( 2*m, n ); @@ -251,7 +264,7 @@ void qdwh( else { // Copy A into H to check the convergence of QDWH if (it >= itconv) { - copy(A, W1); + copy(A, W10); } // Make Q1 into an identity matrix @@ -259,8 +272,8 @@ void qdwh( // Compute Q1 = c * A' * A + I /////////////// - copy(A, Q1); - auto AT = conj_transpose(Q1); + copy(A, Q10); + auto AT = conj_transpose(Q10); gemm(scalar_t(c), AT, A, one, W2, opts); // Solve R x = AT @@ -286,8 +299,8 @@ void qdwh( // Compute the norm of the symmetric matrix U - B1 conv = 10.0; if (it >= itconv) { - add(one, A, minusone, W1, opts); - conv = norm(slate::Norm::Fro, W1); + add(one, A, minusone, W10, opts); + conv = norm(slate::Norm::Fro, W10); } } @@ -297,9 +310,9 @@ void qdwh( } // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H) - copy(H, W1); + copy(H, W10); auto AT = conj_transpose(A); - gemm(one, AT, W1, zero, H, opts); + gemm(one, AT, W10, zero, H, opts); A.releaseWorkspace(); W.releaseWorkspace(); From 0278c6935d4261fadcf0478a9bf958515e5c4064 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Wed, 14 Dec 2022 12:12:36 -0500 Subject: [PATCH 11/49] Fixes to have qdwh work for matrix with m > n --- src/geqrf_qdwh_full.cc | 2 +- src/normest.cc | 2 +- src/qdwh.cc | 71 ++++++++++++++++++++++++++---------------- src/unmqr_qdwh_full.cc | 2 +- 4 files changed, 48 insertions(+), 29 deletions(-) diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc index 9178b0d84..f7bc10c74 100644 --- a/src/geqrf_qdwh_full.cc +++ b/src/geqrf_qdwh_full.cc @@ -176,7 +176,7 @@ void geqrf_qdwh_full(slate::internal::TargetType, #pragma omp master { for (int64_t k = 0; k < A_min_mtnt; ++k) { - int64_t i_end = A_min_mtnt + k; + int64_t i_end = A_mt - A_nt + k; auto A_panel = A.sub(k, i_end, k, k); auto Tl_panel = Tlocal.sub(k, i_end, k, k); diff --git a/src/normest.cc b/src/normest.cc index 79ff32350..c73632cb4 100644 --- a/src/normest.cc +++ b/src/normest.cc @@ -77,7 +77,7 @@ normest(slate::internal::TargetType, A.reserveDeviceWorkspace(); std::vector global_sums(n); - std::vector W1(n); + std::vector W1(m); std::vector W2(n); auto XL = slate::Matrix::fromLAPACK( diff --git a/src/qdwh.cc b/src/qdwh.cc index aa79d9ea9..c377f8af1 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -58,8 +58,13 @@ void qdwh( int64_t mt = A.mt(); int64_t nt = A.nt(); - int64_t m = A.m(); - int64_t n = A.n(); + int64_t m = A.m(); + int64_t n = A.n(); + + if (m < n) { + return; + } + int64_t nb = A.tileMb(0); // The QR-based iterations requires QR[A; Id], @@ -67,7 +72,7 @@ void qdwh( // To avoid having a clean up tile in the middle of the W matrix, // we round up the number of A rows (m), // so that size(W) = m_roundup + n - int64_t m_roundup = ceil( (m + nb)/ nb )*nb; + int64_t m_roundup = ((m + nb - 1) / nb ) * nb; int64_t m_W = m_roundup + n; int64_t mt_W = mt + nt; @@ -81,6 +86,16 @@ void qdwh( slate::Matrix Q(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD); slate::TriangularFactors T; + // todo: anyway to avoid Acpy allocation? + slate::Matrix Acpy; + if (m != n) { + Acpy = slate::Matrix(m, n, nb, nprow, npcol, MPI_COMM_WORLD); + Acpy.insertLocalTiles(); + } + else { + Acpy = H; + } + if (target == Target::Devices) { const int64_t batch_size_zero = 0; // use default batch size const int64_t num_queues = 3 + lookahead; @@ -95,6 +110,8 @@ void qdwh( W.reserveDeviceWorkspace(); Q.allocateBatchArrays(batch_size_zero, num_queues); Q.reserveDeviceWorkspace(); + Acpy.allocateBatchArrays(batch_size_zero, num_queues); + Acpy.reserveDeviceWorkspace(); } W.insertLocalTiles(); @@ -110,26 +127,29 @@ void qdwh( // Second nt tiles of W, contain the identity matrix auto W2 = W.sub(mt, mt_W-1, 0, nt-1); - // W11 is the extra padded rows [m:m_round_up, 0:n-1] - auto W11 = W1.slice(m, m_roundup-1, 0, n-1); - set(zero, zero, W11); - auto Q1 = Q.sub(0, mt-1, 0, nt-1); + auto Q10 = Q1.slice(0, m-1, 0, n-1); + auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1); - auto Q10 = Q1.slice(0, m-1, 0, n-1); - auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1); - set(zero, zero, Q11); - // backup A in H - copy(A, H); + if (m_roundup != m) { + // W11 and Q11 is the extra padded rows [m:m_round_up, 0:n-1] + auto W11 = W1.slice(m, m_roundup-1, 0, n-1); + set(zero, zero, W11); + auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1); + set(zero, zero, Q11); + } + + // backup A in Acpy to compute H + copy(A, Acpy); // two norm estimation (largest singular value of A) real_t norm_est = normest(slate::Norm::One, A); alpha = 1.0; // scale the original matrix A to form A0 of the iterative loop - add(alpha/(scalar_t)norm_est, H, zero, A, opts); + add(alpha/scalar_t(norm_est), Acpy, zero, A, opts); // todo: look at Lapack impl // Calculate Li: reciprocal of condition number estimation @@ -144,13 +164,15 @@ void qdwh( // todo: this QR can be used in the first QR iteration // todo: put bound of 1-norm est compared to 2-norm slate::geqrf(W1, T, opts); + //auto R = TrapezoidMatrix( + // Uplo::Upper, slate::Diag::NonUnit, W10 ); + auto R1 = W10.slice(0, n-1, 0, n-1); auto R = TriangularMatrix( - Uplo::Upper, slate::Diag::NonUnit, W10 ); + Uplo::Upper, slate::Diag::NonUnit, R1 ); auto Rh = HermitianMatrix( - Uplo::Upper, W10 ); + Uplo::Upper, R1 ); // todo: cheaper to do triangular solve than calling trtri trtri(R, opts); - //tzset(scalar_t(0.0), L); norminvR = norm(slate::Norm::One, Rh); Li = (1.0 / norminvR) / normA; Li = norm_est / 1.1 * Li; @@ -162,6 +184,7 @@ void qdwh( // Estimate the condition number using LU } + // Compute the number of iterations to converge itconv = 0; Liconv = Li; while (itconv == 0 || fabs(1-Liconv) > tol1) { // To find the minimum number of iterations to converge. @@ -174,7 +197,6 @@ void qdwh( itconv++; L2 = Liconv * Liconv; - //L2 = double (Liconv * Liconv); dd = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 ); sqd = sqrt(1.0 + dd); a1 = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0; @@ -207,20 +229,19 @@ void qdwh( Li = Li * real_t ((a + b * L2) / (1.0 + c * L2)); if (real(c) > 100.) { - //int doqr = !optqr || it > 1; + //int64_t doqr = !optqr || it > 1; // Generate the matrix B = [ B1 ] = [ sqrt(c) * U ] // [ B2 ] = [ Id ] alpha = scalar_t(sqrt(c)); set(zero, one, W2); - // todo: have a customized splitted QR //if( doqr ) { - // geadd(alpha, A, zero, W1); + // geadd(alpha, A, zero, W10, opts); // geqrf_qdwh_full(W, T1, opts); //} //else { - // geadd(alpha, W1, zero, W1); + // geadd(alpha, W1, zero, W10, opts); // geqrf_qdwh_full2(W, T1, opts); //} @@ -229,16 +250,14 @@ void qdwh( add(alpha, A, zero, W10, opts); //copy(A, W1); //scale(alpha, one, W1, opts); + //geqrf(W, T, opts); // naive impl // todo: how to avoid allocating workspace for each iteration (3 QR)? // todo: how to make it work for any matrix size and any nb? // todo: check time for allocationn in geqrf geqrf_qdwh_full(W, T, opts); - set(zero, one, Q10); - set(zero, zero, Q2); - - // todo: do I need unmqr? + set(zero, one, Q); //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); //naive impl unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); @@ -310,7 +329,7 @@ void qdwh( } // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H) - copy(H, W10); + copy(Acpy, W10); auto AT = conj_transpose(A); gemm(one, AT, W10, zero, H, opts); diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc index 9c4987aa0..3b1277103 100644 --- a/src/unmqr_qdwh_full.cc +++ b/src/unmqr_qdwh_full.cc @@ -103,7 +103,7 @@ void unmqr_qdwh_full( SLATE_UNUSED(lastk); for (int64_t k = k_begin; k != k_end; k += k_step) { - int64_t i_end = A_min_mtnt + k; + int64_t i_end = A_mt - A_nt + k; auto A_panel = A.sub(k, i_end, k, k); From 458b8b953bb70792ea5fd1ed45a4f20acc3dd877 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Wed, 21 Dec 2022 13:53:08 -0500 Subject: [PATCH 12/49] add itqr and itpo to qdwh interace, and change H matrix to Hermitian --- include/slate/slate.hh | 3 +- src/qdwh.cc | 77 ++++++++++++++++++++++++------------------ test/test_qdwh.cc | 36 ++++++++++++++++---- 3 files changed, 75 insertions(+), 41 deletions(-) diff --git a/include/slate/slate.hh b/include/slate/slate.hh index 22d249fc8..a249d8af9 100644 --- a/include/slate/slate.hh +++ b/include/slate/slate.hh @@ -1093,7 +1093,8 @@ void steqr2( template void qdwh( Matrix& A, - Matrix& H, // this matrix will be hermition + HermitianMatrix& H, + int& itqr, int& itpo, Options const& opts = Options()); //----------------------------------------- diff --git a/src/qdwh.cc b/src/qdwh.cc index c377f8af1..15c4d4975 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -26,7 +26,8 @@ namespace impl { template void qdwh( Matrix& A, - Matrix& H, // this matrix will be hermition + HermitianMatrix& H, // this matrix will be hermition + int& itqr, int& itpo, Options const& opts) { using real_t = blas::real_type; @@ -43,12 +44,12 @@ void qdwh( bool optqr = 1; int itconv, it; - int itqr = 0, itpo = 0; real_t L2, sqd, dd, a1, a, b, c; real_t Li, Liconv; real_t conv = 100.; - scalar_t zero = 0.0, one = 1.0, minusone = -1.0; + scalar_t zero = 0.0, one = 1.0; + real_t rzero = 0.0; scalar_t alpha, beta; @@ -88,13 +89,8 @@ void qdwh( // todo: anyway to avoid Acpy allocation? slate::Matrix Acpy; - if (m != n) { - Acpy = slate::Matrix(m, n, nb, nprow, npcol, MPI_COMM_WORLD); - Acpy.insertLocalTiles(); - } - else { - Acpy = H; - } + Acpy = slate::Matrix(m, n, nb, nprow, npcol, MPI_COMM_WORLD); + Acpy.insertLocalTiles(target); if (target == Target::Devices) { const int64_t batch_size_zero = 0; // use default batch size @@ -104,18 +100,22 @@ void qdwh( // keep allocating // todo: check traces to confirm // or in printf in blaspp/device_malloc and here - A.allocateBatchArrays(batch_size_zero, num_queues); - A.reserveDeviceWorkspace(); - W.allocateBatchArrays(batch_size_zero, num_queues); - W.reserveDeviceWorkspace(); - Q.allocateBatchArrays(batch_size_zero, num_queues); - Q.reserveDeviceWorkspace(); - Acpy.allocateBatchArrays(batch_size_zero, num_queues); - Acpy.reserveDeviceWorkspace(); + // todo: is it enough to allocate for one matrix?? + // todo: we don't need these here because each slate call will do it. + // todo: check if after any call it deallocate, for example after gemm, does it throw away the gpu allocation? + //A.allocateBatchArrays(batch_size_zero, num_queues); + //A.reserveDeviceWorkspace(); + //W.allocateBatchArrays(batch_size_zero, num_queues); + //W.reserveDeviceWorkspace(); + //Q.allocateBatchArrays(batch_size_zero, num_queues); + //Q.reserveDeviceWorkspace(); + //Acpy.allocateBatchArrays(batch_size_zero, num_queues); + //Acpy.reserveDeviceWorkspace(); } - W.insertLocalTiles(); - Q.insertLocalTiles(); + // todo: do this on GPUs + W.insertLocalTiles(target); + Q.insertLocalTiles(target); // To compute QR[A; Id] // First mt tiles of W contains A matrix in [0:m-1, 0:n-1] @@ -151,7 +151,6 @@ void qdwh( // scale the original matrix A to form A0 of the iterative loop add(alpha/scalar_t(norm_est), Acpy, zero, A, opts); - // todo: look at Lapack impl // Calculate Li: reciprocal of condition number estimation // Either 1) use LU followed by gecon // Or 2) QR followed by trtri @@ -161,7 +160,7 @@ void qdwh( // Estimate the condition number using QR // This Q factor can be used in the first QR-based iteration copy(A, W10); - // todo: this QR can be used in the first QR iteration + // todo: the QR used to estimate the condition number can be used in the first QR iteration // todo: put bound of 1-norm est compared to 2-norm slate::geqrf(W1, T, opts); //auto R = TrapezoidMatrix( @@ -255,6 +254,8 @@ void qdwh( // todo: how to avoid allocating workspace for each iteration (3 QR)? // todo: how to make it work for any matrix size and any nb? // todo: check time for allocationn in geqrf + // W = [W1] + // [I ] geqrf_qdwh_full(W, T, opts); set(zero, one, Q); @@ -318,7 +319,7 @@ void qdwh( // Compute the norm of the symmetric matrix U - B1 conv = 10.0; if (it >= itconv) { - add(one, A, minusone, W10, opts); + add(one, A, -one, W10, opts); conv = norm(slate::Norm::Fro, W10); } } @@ -331,7 +332,12 @@ void qdwh( // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H) copy(Acpy, W10); auto AT = conj_transpose(A); - gemm(one, AT, W10, zero, H, opts); + // todo: try something like her2k to compute H + //her2k(one, A, W10, rzero, H, opts); + gemm(one, AT, W10, zero, Q10, opts); + auto AL = HermitianMatrix( + Uplo::Lower, Q10 ); + copy(AL, H); A.releaseWorkspace(); W.releaseWorkspace(); @@ -390,7 +396,8 @@ void qdwh( template void qdwh( Matrix& A, - Matrix& H, + HermitianMatrix& H, + int& itqr, int& itpo, Options const& opts) { using internal::TargetType; @@ -399,16 +406,16 @@ void qdwh( switch (target) { case Target::Host: case Target::HostTask: - impl::qdwh( A, H, opts ); + impl::qdwh( A, H, itqr, itpo, opts ); break; case Target::HostNest: - impl::qdwh( A, H, opts ); + impl::qdwh( A, H, itqr, itpo, opts ); break; case Target::HostBatch: - impl::qdwh( A, H, opts ); + impl::qdwh( A, H, itqr, itpo, opts ); break; case Target::Devices: - impl::qdwh( A, H, opts ); + impl::qdwh( A, H, itqr, itpo, opts ); break; } // todo: return value for errors? @@ -419,25 +426,29 @@ void qdwh( template void qdwh( Matrix& A, - Matrix& H, + HermitianMatrix& H, + int& itqr, int& itpo, Options const& opts); template void qdwh( Matrix& A, - Matrix& H, + HermitianMatrix& H, + int& itqr, int& itpo, Options const& opts); template void qdwh< std::complex >( Matrix< std::complex >& A, - Matrix< std::complex >& H, + HermitianMatrix< std::complex >& H, + int& itqr, int& itpo, Options const& opts); template void qdwh< std::complex >( Matrix< std::complex >& A, - Matrix< std::complex >& H, + HermitianMatrix< std::complex >& H, + int& itqr, int& itpo, Options const& opts); } // namespace slate diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index 1281a1e05..019e19417 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -88,23 +88,22 @@ void test_qdwh_work(Params& params, bool run) Id.insertLocalTiles(); slate::Matrix A; - slate::Matrix H; + slate::HermitianMatrix H; if (origin != slate::Origin::ScaLAPACK) { // Copy local ScaLAPACK data to GPU or CPU tiles. slate::Target origin_target = origin2target(origin); A = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); A.insertLocalTiles(origin_target); //copy(&A_tst[0], descA_tst, A); - H = slate::Matrix(n, n, nb, p, q, MPI_COMM_WORLD); + H = slate::HermitianMatrix(slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD); H.insertLocalTiles(origin_target); - //copy(&H_tst[0], descH_tst, H); } else { // create SLATE matrices from the ScaLAPACK layouts A_data.resize( lldA * nlocA ); H_data.resize( lldH * nlocH ); A = slate::Matrix::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - H = slate::Matrix::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); + H = slate::HermitianMatrix::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); } real_t cond = 1 / std::numeric_limits::epsilon(); @@ -126,12 +125,13 @@ void test_qdwh_work(Params& params, bool run) print_matrix("A", A, params); // todo: how to compute gflops? - double gflop = lapack::Gflop::geqrf(m, n); if (! ref_only) { if (trace) slate::trace::Trace::on(); else slate::trace::Trace::off(); + int itqr = 0, itpo = 0; + double time = barrier_get_wtime(MPI_COMM_WORLD); //================================================== @@ -140,10 +140,31 @@ void test_qdwh_work(Params& params, bool run) // A = AH, // A will be written by the orthogonal polar factor // H is the symmetric positive semidefinite polar factor - slate::qdwh(A, H, opts); + slate::qdwh(A, H, itqr, itpo, opts); time = barrier_get_wtime(MPI_COMM_WORLD) - time; + // gflop include the flops to estimate the condition + // number, the flops for the qr_based iterations the + // flops for the po_based iterations and the flops + // to compute H (hermition polar factor). + // gflop = trcon + itqr * (flop_per_qr_based_iter) + + // itpo * (flop_per_po_based_iter) + + // flop_compute_H + double gflop_trcon = lapack::Gflop::geqrf(m, n); + + double gflop_itqr = itqr * ( lapack::Gflop::geqrf(m+n, n) + + lapack::Gflop::unmqr(slate::Side::Left, m+n, n, n) + + lapack::Gflop::gemm(m, n, n) ); + + double gflop_itpo = itpo * ( lapack::Gflop::gemm(m, n, n) + + lapack::Gflop::potrf(m) + + blas::Gflop::trsm(slate::Side::Left, m, n) ); + + double gflop_compute_H = blas::Gflop::her2k(n, m); + + double gflop = gflop_trcon + gflop_itqr + gflop_itpo + gflop_compute_H; + if (trace) slate::trace::Trace::finish(); // compute and save timing/performance @@ -178,7 +199,8 @@ void test_qdwh_work(Params& params, bool run) // //================================================== real_t normA = slate::norm(slate::Norm::Fro, Aref); - slate::gemm(one, A, H, -one, Aref, opts); + //slate::gemm(one, A, H, -one, Aref, opts); + slate::hemm(slate::Side::Right, one, H, A, -one, Aref, opts); real_t berr = slate::norm(slate::Norm::Fro, Aref); params.error() = berr / normA; From 44e7c65d305ca7b278f299e99f9503be0f5199d2 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 5 Jan 2023 14:19:49 -0500 Subject: [PATCH 13/49] Save the hermitian polar factor into a general matrix for now. Add opts to slate calls --- src/qdwh.cc | 42 +++++++++++++++++++----------------------- test/test_qdwh.cc | 29 +++++++++++++++++------------ 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index 15c4d4975..405e68a45 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -26,7 +26,7 @@ namespace impl { template void qdwh( Matrix& A, - HermitianMatrix& H, // this matrix will be hermition + Matrix& B, int& itqr, int& itpo, Options const& opts) { @@ -136,9 +136,9 @@ void qdwh( if (m_roundup != m) { // W11 and Q11 is the extra padded rows [m:m_round_up, 0:n-1] auto W11 = W1.slice(m, m_roundup-1, 0, n-1); - set(zero, zero, W11); + set(zero, zero, W11, opts); auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1); - set(zero, zero, Q11); + set(zero, zero, Q11, opts); } // backup A in Acpy to compute H @@ -155,7 +155,7 @@ void qdwh( // Either 1) use LU followed by gecon // Or 2) QR followed by trtri // If used the QR, use the Q factor in the first QR-based iteration - normA = norm(slate::Norm::One, A); + normA = norm(slate::Norm::One, A, opts); if (optqr) { // Estimate the condition number using QR // This Q factor can be used in the first QR-based iteration @@ -172,7 +172,7 @@ void qdwh( Uplo::Upper, R1 ); // todo: cheaper to do triangular solve than calling trtri trtri(R, opts); - norminvR = norm(slate::Norm::One, Rh); + norminvR = norm(slate::Norm::One, Rh, opts); Li = (1.0 / norminvR) / normA; Li = norm_est / 1.1 * Li; // *flops += FLOPS_DGEQRF( M, N ) @@ -233,7 +233,7 @@ void qdwh( // Generate the matrix B = [ B1 ] = [ sqrt(c) * U ] // [ B2 ] = [ Id ] alpha = scalar_t(sqrt(c)); - set(zero, one, W2); + set(zero, one, W2, opts); //if( doqr ) { // geadd(alpha, A, zero, W10, opts); @@ -258,7 +258,7 @@ void qdwh( // [I ] geqrf_qdwh_full(W, T, opts); - set(zero, one, Q); + set(zero, one, Q, opts); //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); //naive impl unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); @@ -288,7 +288,7 @@ void qdwh( } // Make Q1 into an identity matrix - set(zero, one, W2); + set(zero, one, W2, opts); // Compute Q1 = c * A' * A + I /////////////// @@ -320,7 +320,7 @@ void qdwh( conv = 10.0; if (it >= itconv) { add(one, A, -one, W10, opts); - conv = norm(slate::Norm::Fro, W10); + conv = norm(slate::Norm::Fro, W10, opts); } } @@ -330,14 +330,10 @@ void qdwh( } // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H) - copy(Acpy, W10); auto AT = conj_transpose(A); + gemm(one, AT, Acpy, zero, B, opts); // todo: try something like her2k to compute H //her2k(one, A, W10, rzero, H, opts); - gemm(one, AT, W10, zero, Q10, opts); - auto AL = HermitianMatrix( - Uplo::Lower, Q10 ); - copy(AL, H); A.releaseWorkspace(); W.releaseWorkspace(); @@ -396,7 +392,7 @@ void qdwh( template void qdwh( Matrix& A, - HermitianMatrix& H, + Matrix& B, int& itqr, int& itpo, Options const& opts) { @@ -406,16 +402,16 @@ void qdwh( switch (target) { case Target::Host: case Target::HostTask: - impl::qdwh( A, H, itqr, itpo, opts ); + impl::qdwh( A, B, itqr, itpo, opts ); break; case Target::HostNest: - impl::qdwh( A, H, itqr, itpo, opts ); + impl::qdwh( A, B, itqr, itpo, opts ); break; case Target::HostBatch: - impl::qdwh( A, H, itqr, itpo, opts ); + impl::qdwh( A, B, itqr, itpo, opts ); break; case Target::Devices: - impl::qdwh( A, H, itqr, itpo, opts ); + impl::qdwh( A, B, itqr, itpo, opts ); break; } // todo: return value for errors? @@ -426,28 +422,28 @@ void qdwh( template void qdwh( Matrix& A, - HermitianMatrix& H, + Matrix& B, int& itqr, int& itpo, Options const& opts); template void qdwh( Matrix& A, - HermitianMatrix& H, + Matrix& B, int& itqr, int& itpo, Options const& opts); template void qdwh< std::complex >( Matrix< std::complex >& A, - HermitianMatrix< std::complex >& H, + Matrix< std::complex >& B, int& itqr, int& itpo, Options const& opts); template void qdwh< std::complex >( Matrix< std::complex >& A, - HermitianMatrix< std::complex >& H, + Matrix< std::complex >& B, int& itqr, int& itpo, Options const& opts); diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index 019e19417..b02a264a8 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -76,6 +76,7 @@ void test_qdwh_work(Params& params, bool run) int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); int64_t lldA = blas::max(1, mlocA); // local leading dimension of A std::vector A_data; + std::vector B_data; int64_t mlocH = num_local_rows_cols(n, nb, myrow, p); int64_t nlocH = num_local_rows_cols(n, nb, mycol, q); @@ -88,22 +89,26 @@ void test_qdwh_work(Params& params, bool run) Id.insertLocalTiles(); slate::Matrix A; - slate::HermitianMatrix H; + slate::Matrix B; + //slate::HermitianMatrix H; if (origin != slate::Origin::ScaLAPACK) { // Copy local ScaLAPACK data to GPU or CPU tiles. slate::Target origin_target = origin2target(origin); A = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); A.insertLocalTiles(origin_target); - //copy(&A_tst[0], descA_tst, A); - H = slate::HermitianMatrix(slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD); - H.insertLocalTiles(origin_target); + //H = slate::HermitianMatrix(slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD); + //H.insertLocalTiles(origin_target); + B = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); + B.insertLocalTiles(origin_target); } else { // create SLATE matrices from the ScaLAPACK layouts A_data.resize( lldA * nlocA ); - H_data.resize( lldH * nlocH ); A = slate::Matrix::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - H = slate::HermitianMatrix::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); + //H_data.resize( lldH * nlocH ); + //H = slate::HermitianMatrix::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); + B_data.resize( lldA * nlocA ); + B = slate::Matrix::fromScaLAPACK(m, n, &B_data[0], lldA, nb, p, q, MPI_COMM_WORLD); } real_t cond = 1 / std::numeric_limits::epsilon(); @@ -140,7 +145,7 @@ void test_qdwh_work(Params& params, bool run) // A = AH, // A will be written by the orthogonal polar factor // H is the symmetric positive semidefinite polar factor - slate::qdwh(A, H, itqr, itpo, opts); + slate::qdwh(A, B, itqr, itpo, opts); time = barrier_get_wtime(MPI_COMM_WORLD) - time; @@ -172,7 +177,7 @@ void test_qdwh_work(Params& params, bool run) params.gflops() = gflop / time; print_matrix("U: orthogonal polar factor", A, params); - print_matrix("H: symmetric postive semi definite factor", H, params); + //print_matrix("H: symmetric postive semi definite factor", H, params); } if (check) { @@ -198,10 +203,10 @@ void test_qdwh_work(Params& params, bool run) // ||Aref||_f // //================================================== - real_t normA = slate::norm(slate::Norm::Fro, Aref); - //slate::gemm(one, A, H, -one, Aref, opts); - slate::hemm(slate::Side::Right, one, H, A, -one, Aref, opts); - real_t berr = slate::norm(slate::Norm::Fro, Aref); + real_t normA = slate::norm(slate::Norm::Fro, Aref, opts); + slate::gemm(one, A, B, -one, Aref, opts); + //slate::hemm(slate::Side::Right, one, H, A, -one, Aref, opts); + real_t berr = slate::norm(slate::Norm::Fro, Aref, opts); params.error() = berr / normA; real_t tol = params.tol() * 0.5 * std::numeric_limits::epsilon(); From 1e5e0348d3704bd1c092c909b08b9afc1c3d40c7 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Fri, 6 Jan 2023 10:41:53 -0500 Subject: [PATCH 14/49] rename normest and cleaning it --- src/norm2est.cc | 211 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 src/norm2est.cc diff --git a/src/norm2est.cc b/src/norm2est.cc new file mode 100644 index 000000000..b826f1f2a --- /dev/null +++ b/src/norm2est.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include "slate/slate.hh" +#include "internal/internal.hh" +#include "internal/internal_util.hh" +#include "slate/internal/mpi.hh" + +#include +#include + +namespace slate { + +namespace impl { + +//------------------------------------------------------------------------------ +/// @internal +/// Distributed parallel general matrix norm. +/// Generic implementation for any target. +/// @ingroup norm_specialization +/// +template +blas::real_type +norm2est( + matrix_type A, + Options const& opts) +{ + using scalar_t = typename matrix_type::value_type; + using real_t = blas::real_type; + using blas::min; + + int p, q; + int myrow, mycol; + int izero = 0; + GridOrder order; + A.gridinfo(&order, &p, &q, &myrow, &mycol); + + int64_t n = A.n(); + int64_t m = A.m(); + int64_t mb = A.tileMb(0); + int64_t nb = A.tileNb(0); + + scalar_t one = 1.; + scalar_t zero = 0.; + real_t alpha; + + int64_t cnt = 0; + int64_t maxiter = min( 100, n ); + + real_t e = 0.; + real_t e0 = 0.; + real_t normX = 0; + real_t normAX = 0; + real_t tol = 1.e-1; + + int64_t mloc = numberLocalRowOrCol(m, mb, myrow, izero, p); + int64_t nloc = numberLocalRowOrCol(n, nb, myrow, izero, q); + int64_t lldA = blas::max(1, mloc); + int64_t lldA_n = blas::max(1, nloc); + + // todo: do we still need reserveDevice here? + if (target == Target::Devices) + A.reserveDeviceWorkspace(); + + std::vector local_sums(n); + std::vector global_sums(n); + std::vector W1(lldA); + std::vector W2(lldA_n); + + auto XL = slate::Matrix::fromScaLAPACK( + n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm()); + auto AX = slate::Matrix::fromScaLAPACK( + m, 1, &W1[0], lldA, mb, 1, p, q, A.mpiComm()); + auto X = slate::Matrix::fromScaLAPACK( + n, 1, &W2[0], lldA_n, nb, 1, p, q, A.mpiComm()); + + // Two norm estimation + // First: let's compute the x vector such that + // x_j = sum_i abs( A_{i,j} ), x here is global_sums.data + #pragma omp parallel + #pragma omp master + { + internal::norm(slate::Norm::One, NormScope::Matrix, std::move(A), local_sums.data()); + } + + // global_sums.data() will have the sum of each column. + #pragma omp critical(slate_mpi) + { + trace::Block trace_block("MPI_Allreduce"); + slate_mpi_call( + MPI_Allreduce(local_sums.data(), global_sums.data(), + n, mpi_type::value, + MPI_SUM, A.mpiComm())); + } + + // Second: compute the ||x||_Fro + e = slate::norm(Norm::Fro, XL, opts); + if (e == 0.) { + return 0.; + } + else { + normX = e; + } + + // Third: start the while-loop X = X / ||X|| + while ((cnt < maxiter) && (fabs((e - e0)) > (tol * (e)))) { + e0 = e; + + // Scale X = X / ||X|| + alpha = 1.0/normX; + add(scalar_t(alpha), XL, zero, X, opts); + + // Compute Ax = A * sx + gemm(one, A, X, zero, AX, opts); + + // todo: still need to add the following + //if nnz(Sx) == 0 + // Sx = rand(size(Sx),class(Sx)); + //end + + // Compute x = A' * A * x = A' * Ax + auto AT = conjTranspose(A); + // todo: why this set is needed when using multiple mpi rank + // todo: send to Sebastien + set(zero, zero, XL); + gemm(one, AT, AX, zero, XL, opts); + //gemmC(one, AT, AX, zero, XL, opts); + + // Compute ||X||, ||AX|| + normX = norm(Norm::Fro, XL, opts); + normAX = norm(Norm::Fro, AX, opts); + + // Uodate e + e = normX / normAX; + cnt++; + } + + A.clearWorkspace(); + + return e; + +} + +} // namespace impl + +//------------------------------------------------------------------------------ +/// Distributed parallel general matrix norm. +/// +//------------------------------------------------------------------------------ +/// @tparam matrix_type +/// Any SLATE matrix type: Matrix, SymmetricMatrix, HermitianMatrix, +/// TriangularMatrix, etc. +//------------------------------------------------------------------------------ +/// @param[in] A +/// The matrix A. +/// +/// @param[in] opts +/// Additional options, as map of name = value pairs. Possible options: +/// - Option::Target: +/// Implementation to target. Possible values: +/// - HostTask: OpenMP tasks on CPU host [default]. +/// - HostNest: nested OpenMP parallel for loop on CPU host. +/// - Devices: batched BLAS on GPU device. +/// +/// @ingroup norm +/// +template +blas::real_type +norm2est(matrix_type& A, + Options const& opts) +{ + Target target = get_option( opts, Option::Target, Target::HostTask ); + + switch (target) { + case Target::Host: + case Target::HostTask: + case Target::HostNest: + case Target::HostBatch: + return impl::norm2est( A, opts ); + break; + case Target::Devices: + return impl::norm2est( A, opts ); + break; + } +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +float norm2est( + Matrix& A, + Options const& opts); + +template +double norm2est( + Matrix& A, + Options const& opts); + +template +float norm2est( + Matrix< std::complex >& A, + Options const& opts); + +template +double norm2est( + Matrix< std::complex >& A, + Options const& opts); + +} // namespace slate From 12c151cde340fb3a2108f63857caf75b504af369 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Fri, 6 Jan 2023 10:43:58 -0500 Subject: [PATCH 15/49] change header of norm2est and call it in qdwh --- include/slate/slate.hh | 5 ++--- src/qdwh.cc | 9 +++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/include/slate/slate.hh b/include/slate/slate.hh index a249d8af9..dbcae4781 100644 --- a/include/slate/slate.hh +++ b/include/slate/slate.hh @@ -1093,7 +1093,7 @@ void steqr2( template void qdwh( Matrix& A, - HermitianMatrix& H, + Matrix& B, int& itqr, int& itpo, Options const& opts = Options()); @@ -1101,8 +1101,7 @@ void qdwh( // normest() template blas::real_type -normest( - Norm in_norm, +norm2est( matrix_type &A, Options const& opts = Options()); diff --git a/src/qdwh.cc b/src/qdwh.cc index 405e68a45..99a1e0980 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -145,8 +145,9 @@ void qdwh( copy(A, Acpy); // two norm estimation (largest singular value of A) - real_t norm_est = normest(slate::Norm::One, A); + real_t norm_est = norm2est( A, opts); alpha = 1.0; + norm_est = 1.; // scale the original matrix A to form A0 of the iterative loop add(alpha/scalar_t(norm_est), Acpy, zero, A, opts); @@ -402,13 +403,9 @@ void qdwh( switch (target) { case Target::Host: case Target::HostTask: - impl::qdwh( A, B, itqr, itpo, opts ); - break; case Target::HostNest: - impl::qdwh( A, B, itqr, itpo, opts ); - break; case Target::HostBatch: - impl::qdwh( A, B, itqr, itpo, opts ); + impl::qdwh( A, B, itqr, itpo, opts ); break; case Target::Devices: impl::qdwh( A, B, itqr, itpo, opts ); From c2e487033f2b30a5b06b2649711fcc12937980dd Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Fri, 6 Jan 2023 10:48:10 -0500 Subject: [PATCH 16/49] rename normest and minor change --- GNUmakefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 7021560f5..94a581bf9 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -502,6 +502,7 @@ ifneq ($(only_unit),1) src/gemmA.cc \ src/gemmC.cc \ src/geqrf.cc \ + src/geqrf_qdwh_full.cc \ src/gesv.cc \ src/gesvMixed.cc \ src/gesv_nopiv.cc \ @@ -528,6 +529,7 @@ ifneq ($(only_unit),1) src/hetrf.cc \ src/hetrs.cc \ src/norm.cc \ + src/norm2est.cc \ src/pbsv.cc \ src/pbtrf.cc \ src/pbtrs.cc \ @@ -537,6 +539,7 @@ ifneq ($(only_unit),1) src/potri.cc \ src/potrs.cc \ src/print.cc \ + src/qdwh.cc \ src/scale.cc \ src/scale_row_col.cc \ src/set.cc \ @@ -558,13 +561,10 @@ ifneq ($(only_unit),1) src/unmqr.cc \ src/unmtr_hb2st.cc \ src/unmtr_he2hb.cc \ + src/unmqr_qdwh_full.cc \ src/work/work_trmm.cc \ src/work/work_trsm.cc \ src/work/work_trsmA.cc \ - src/qdwh.cc \ - src/normest.cc \ - src/geqrf_qdwh_full.cc \ - src/unmqr_qdwh_full.cc \ # End. Add alphabetically. endif From a0aa6052b3a9018cbc79a71de7dc6e8a968b8d9c Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Fri, 6 Jan 2023 10:52:27 -0500 Subject: [PATCH 17/49] delete --- src/normest.cc | 250 ------------------------------------------------- 1 file changed, 250 deletions(-) delete mode 100644 src/normest.cc diff --git a/src/normest.cc b/src/normest.cc deleted file mode 100644 index c73632cb4..000000000 --- a/src/normest.cc +++ /dev/null @@ -1,250 +0,0 @@ -// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. -// SPDX-License-Identifier: BSD-3-Clause -// This program is free software: you can redistribute it and/or modify it under -// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. - -#include "slate/slate.hh" -#include "internal/internal.hh" -#include "internal/internal_util.hh" -#include "slate/internal/mpi.hh" - -#include -#include - -namespace slate { - -// specialization namespace differentiates, e.g., -// internal::norm from internal::specialization::norm -namespace internal { -namespace specialization { - -//------------------------------------------------------------------------------ -/// @internal -/// Distributed parallel general matrix norm. -/// Generic implementation for any target. -/// @ingroup norm_specialization -/// -template -blas::real_type -normest(slate::internal::TargetType, - Norm in_norm, matrix_type A, - Options const& opts) -{ - using scalar_t = typename matrix_type::value_type; - using real_t = blas::real_type; - - // Undo any transpose, which switches one <=> inf norms. - if (A.op() == Op::ConjTrans) - A = conj_transpose(A); - else if (A.op() == Op::Trans) - A = transpose(A); - - //--------- - // Two norm estimation - // First: let's compute the x vector such that - // x_j = sum_i abs( A_{i,j} ), x here is global_sums.data - // todo: should we add this to norm? - //if (in_norm == Norm::One_est) { - - using blas::min; - - int p, q; - int myrow, mycol; - GridOrder order; - A.gridinfo(&order, &p, &q, &myrow, &mycol); - - int64_t n = A.n(); - int64_t m = A.m(); - int64_t nb = A.tileNb(0); - - scalar_t one = 1.; - scalar_t zero = 0.; - real_t alpha; - - int64_t cnt = 0; - int64_t maxiter = min( 100, n ); - - real_t e = 0.; - real_t e0 = 0.; - real_t normX = 0; - real_t normAX = 0; - real_t tol = 1.e-1; - - std::vector local_sums(n); - - // todo: do we still need reserveDevice here? - if (target == Target::Devices) - A.reserveDeviceWorkspace(); - - std::vector global_sums(n); - std::vector W1(m); - std::vector W2(n); - - auto XL = slate::Matrix::fromLAPACK( - n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm()); - XL.insertLocalTiles(); - auto AX = slate::Matrix::fromLAPACK( - m, 1, &W1[0], m, nb, 1, p, q, A.mpiComm()); - AX.insertLocalTiles(); - auto X = slate::Matrix::fromLAPACK( - n, 1, &W2[0], n, nb, 1, p, q, A.mpiComm()); - X.insertLocalTiles(); - - - #pragma omp parallel - #pragma omp master - { - internal::norm(slate::Norm::One, NormScope::Matrix, std::move(A), local_sums.data()); - } - - #pragma omp critical(slate_mpi) - { - trace::Block trace_block("MPI_Allreduce"); - slate_mpi_call( - MPI_Allreduce(local_sums.data(), global_sums.data(), - n, mpi_type::value, - MPI_SUM, A.mpiComm())); - } - - // global_sums.data() will have the sum of each column. - // First step is done - - // Second: compute the ||x||_Fro - //e = lapack::lange( - // Norm::Fro, 1, n, - // global_sums.data(), 1); - e = slate::norm(Norm::Fro, XL, opts); - if (e == 0.) { - return 0.; - } - else { - normX = e; - } - - // Third: start the while-loop X = X / ||X|| - while ((cnt < maxiter) && (fabs((e - e0)) > (tol * (e)))) { - e0 = e; - - // Scale X = X / ||X|| - alpha = 1.0/normX; - add(scalar_t(alpha), XL, zero, X, opts); - - // Compute Ax = A * sx - gemm(one, A, X, zero, AX, opts); - - // todo: still need to add the following - //if nnz(Sx) == 0 - // Sx = rand(size(Sx),class(Sx)); - //end - - // Compute x = A' * A * x = A' * Ax - auto AT = conjTranspose(A); - // todo: why this set is needed when using multiple mpi rank - // todo: send to Sebastien - set(zero, zero, XL); - gemm(one, AT, AX, zero, XL, opts); - //gemmC(one, AT, AX, zero, XL, opts); - - // Compute ||X||, ||AX|| - normX = norm(Norm::Fro, XL, opts); - normAX = norm(Norm::Fro, AX, opts); - - // Uodate e - e = normX / normAX; - cnt++; - } - - A.clearWorkspace(); - - return e; - - //} - //else { - // slate_error("invalid norm."); - //} -} - -} // namespace specialization -} // namespace internal - -//------------------------------------------------------------------------------ -/// Version with target as template parameter. -/// @ingroup norm_specialization -/// -template -blas::real_type -normest(Norm norm, matrix_type& A, - const std::map& opts) -{ - return internal::specialization::normest(internal::TargetType(), - norm, A, opts); -} - -//------------------------------------------------------------------------------ -/// Distributed parallel general matrix norm. -/// -//------------------------------------------------------------------------------ -/// @tparam matrix_type -/// Any SLATE matrix type: Matrix, SymmetricMatrix, HermitianMatrix, -/// TriangularMatrix, etc. -//------------------------------------------------------------------------------ -/// @param[in] in_norm -/// Norm to compute: -/// - Norm::Second: second norm estimation of the matrix$ -/// -/// @param[in] A -/// The matrix A. -/// -/// @param[in] opts -/// Additional options, as map of name = value pairs. Possible options: -/// - Option::Target: -/// Implementation to target. Possible values: -/// - HostTask: OpenMP tasks on CPU host [default]. -/// - HostNest: nested OpenMP parallel for loop on CPU host. -/// - Devices: batched BLAS on GPU device. -/// -/// @ingroup norm -/// -template -blas::real_type -normest(Norm in_norm, matrix_type& A, - const std::map& opts) -{ - Target target = get_option( opts, Option::Target, Target::HostTask ); - - switch (target) { - case Target::Host: - case Target::HostTask: - return normest(in_norm, A, opts); - break; - case Target::HostBatch: - case Target::HostNest: - case Target::Devices: - break; - } - throw std::exception(); // todo: invalid target -} - -//------------------------------------------------------------------------------ -// Explicit instantiations. -template -float normest( - Norm in_norm, Matrix& A, - Options const& opts); - -template -double normest( - Norm in_norm, Matrix& A, - Options const& opts); - -template -float normest( - Norm in_norm, Matrix< std::complex >& A, - Options const& opts); - -template -double normest( - Norm in_norm, Matrix< std::complex >& A, - Options const& opts); - -} // namespace slate From c8516a901f59d92bf2d0882adfc9357c3bb9bda8 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Fri, 6 Jan 2023 11:17:37 -0500 Subject: [PATCH 18/49] minor cleaning --- src/qdwh.cc | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index 99a1e0980..d58badd96 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -10,17 +10,11 @@ namespace slate { -// specialization namespace differentiates, e.g., -// todo: is it ok? -// internal::qdwh from impl::qdwh -// todo: works only if m % nb = 0 -// todo: works only for square matrices namespace impl { //------------------------------------------------------------------------------ /// Distributed parallel polar decomposition based on QDWH algorithm. /// Generic implementation for any target. -// todo: add to docs/doxygen /// @ingroup qdwh_specialization /// template @@ -92,9 +86,9 @@ void qdwh( Acpy = slate::Matrix(m, n, nb, nprow, npcol, MPI_COMM_WORLD); Acpy.insertLocalTiles(target); - if (target == Target::Devices) { - const int64_t batch_size_zero = 0; // use default batch size - const int64_t num_queues = 3 + lookahead; + //if (target == Target::Devices) { + //const int64_t batch_size_zero = 0; // use default batch size + //const int64_t num_queues = 3 + lookahead; // todo: I have workspace here and in other call as geqrf_qdwh_full // todo: in geqrf, it should use the allocated ones, it should not // keep allocating @@ -111,7 +105,7 @@ void qdwh( //Q.reserveDeviceWorkspace(); //Acpy.allocateBatchArrays(batch_size_zero, num_queues); //Acpy.reserveDeviceWorkspace(); - } + //} // todo: do this on GPUs W.insertLocalTiles(target); @@ -136,9 +130,7 @@ void qdwh( if (m_roundup != m) { // W11 and Q11 is the extra padded rows [m:m_round_up, 0:n-1] auto W11 = W1.slice(m, m_roundup-1, 0, n-1); - set(zero, zero, W11, opts); auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1); - set(zero, zero, Q11, opts); } // backup A in Acpy to compute H @@ -252,11 +244,6 @@ void qdwh( //scale(alpha, one, W1, opts); //geqrf(W, T, opts); // naive impl - // todo: how to avoid allocating workspace for each iteration (3 QR)? - // todo: how to make it work for any matrix size and any nb? - // todo: check time for allocationn in geqrf - // W = [W1] - // [I ] geqrf_qdwh_full(W, T, opts); set(zero, one, Q, opts); @@ -292,7 +279,6 @@ void qdwh( set(zero, one, W2, opts); // Compute Q1 = c * A' * A + I - /////////////// copy(A, Q10); auto AT = conj_transpose(Q10); gemm(scalar_t(c), AT, A, one, W2, opts); @@ -317,7 +303,7 @@ void qdwh( //facto = 1; } - // Compute the norm of the symmetric matrix U - B1 + // Check if it converge, compute the norm of matrix U - B1 conv = 10.0; if (it >= itconv) { add(one, A, -one, W10, opts); @@ -335,6 +321,9 @@ void qdwh( gemm(one, AT, Acpy, zero, B, opts); // todo: try something like her2k to compute H //her2k(one, A, W10, rzero, H, opts); + //auto AL = HermitianMatrix( + // slate::Uplo::Lower, B ); + //slate::copy(AL, H, opts); A.releaseWorkspace(); W.releaseWorkspace(); From c21a9933786f7d80057f4ed24b5a0c543c135601 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Fri, 6 Jan 2023 11:26:02 -0500 Subject: [PATCH 19/49] pass opts to calls --- src/qdwh.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index d58badd96..1dbfc47a9 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -134,7 +134,7 @@ void qdwh( } // backup A in Acpy to compute H - copy(A, Acpy); + slate::copy(A, Acpy, opts); // two norm estimation (largest singular value of A) real_t norm_est = norm2est( A, opts); @@ -152,7 +152,7 @@ void qdwh( if (optqr) { // Estimate the condition number using QR // This Q factor can be used in the first QR-based iteration - copy(A, W10); + slate::copy(A, W10, opts); // todo: the QR used to estimate the condition number can be used in the first QR iteration // todo: put bound of 1-norm est compared to 2-norm slate::geqrf(W1, T, opts); @@ -256,7 +256,7 @@ void qdwh( beta = scalar_t( b / c ); // Copy U into C to check the convergence of QDWH if (it >= itconv ) { - copy(A, W10); + slate::copy(A, W10, opts); } gemm(alpha, Q10, Q2T, beta, A, opts); @@ -272,14 +272,14 @@ void qdwh( else { // Copy A into H to check the convergence of QDWH if (it >= itconv) { - copy(A, W10); + slate::copy(A, W10, opts); } // Make Q1 into an identity matrix set(zero, one, W2, opts); // Compute Q1 = c * A' * A + I - copy(A, Q10); + slate::copy(A, Q10, opts); auto AT = conj_transpose(Q10); gemm(scalar_t(c), AT, A, one, W2, opts); From 7e9c00e346e57c6191ae091d939e24d392d85665 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 9 Jan 2023 14:01:35 -0500 Subject: [PATCH 20/49] Use H innstead of new allocation in case of square matrix. Rename B to H --- src/qdwh.cc | 69 +++++++++++++++++------------------------------ test/test_qdwh.cc | 22 ++++++++------- 2 files changed, 37 insertions(+), 54 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index 1dbfc47a9..796cc05cd 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -20,7 +20,7 @@ namespace impl { template void qdwh( Matrix& A, - Matrix& B, + Matrix& H, int& itqr, int& itpo, Options const& opts) { @@ -81,31 +81,16 @@ void qdwh( slate::Matrix Q(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD); slate::TriangularFactors T; - // todo: anyway to avoid Acpy allocation? slate::Matrix Acpy; - Acpy = slate::Matrix(m, n, nb, nprow, npcol, MPI_COMM_WORLD); - Acpy.insertLocalTiles(target); - - //if (target == Target::Devices) { - //const int64_t batch_size_zero = 0; // use default batch size - //const int64_t num_queues = 3 + lookahead; - // todo: I have workspace here and in other call as geqrf_qdwh_full - // todo: in geqrf, it should use the allocated ones, it should not - // keep allocating - // todo: check traces to confirm - // or in printf in blaspp/device_malloc and here - // todo: is it enough to allocate for one matrix?? - // todo: we don't need these here because each slate call will do it. - // todo: check if after any call it deallocate, for example after gemm, does it throw away the gpu allocation? - //A.allocateBatchArrays(batch_size_zero, num_queues); - //A.reserveDeviceWorkspace(); - //W.allocateBatchArrays(batch_size_zero, num_queues); - //W.reserveDeviceWorkspace(); - //Q.allocateBatchArrays(batch_size_zero, num_queues); - //Q.reserveDeviceWorkspace(); - //Acpy.allocateBatchArrays(batch_size_zero, num_queues); - //Acpy.reserveDeviceWorkspace(); - //} + if (m == n) { + // if A is square, we can use H to save a copy of A + // this is will not work if H is stored as Hermitian matrix + Acpy = H; + } + else { + Acpy = slate::Matrix(m, n, nb, nprow, npcol, MPI_COMM_WORLD); + Acpy.insertLocalTiles(target); + } // todo: do this on GPUs W.insertLocalTiles(target); @@ -126,13 +111,6 @@ void qdwh( auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1); - - if (m_roundup != m) { - // W11 and Q11 is the extra padded rows [m:m_round_up, 0:n-1] - auto W11 = W1.slice(m, m_roundup-1, 0, n-1); - auto Q11 = Q1.slice(m, m_roundup-1, 0, n-1); - } - // backup A in Acpy to compute H slate::copy(A, Acpy, opts); @@ -145,9 +123,7 @@ void qdwh( add(alpha/scalar_t(norm_est), Acpy, zero, A, opts); // Calculate Li: reciprocal of condition number estimation - // Either 1) use LU followed by gecon - // Or 2) QR followed by trtri - // If used the QR, use the Q factor in the first QR-based iteration + // todo: use the Q factor in the first QR-based iteration normA = norm(slate::Norm::One, A, opts); if (optqr) { // Estimate the condition number using QR @@ -223,8 +199,8 @@ void qdwh( if (real(c) > 100.) { //int64_t doqr = !optqr || it > 1; - // Generate the matrix B = [ B1 ] = [ sqrt(c) * U ] - // [ B2 ] = [ Id ] + // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ] + // [ W2 ] = [ Id ] alpha = scalar_t(sqrt(c)); set(zero, one, W2, opts); @@ -237,7 +213,7 @@ void qdwh( // geqrf_qdwh_full2(W, T1, opts); //} - // Factorize B = QR, and generate the associated Q + // Factorize W = QR, and generate the associated Q // todo: replace following add by copy and scale, but why scale only scale by a real numbers? add(alpha, A, zero, W10, opts); //copy(A, W1); @@ -303,7 +279,7 @@ void qdwh( //facto = 1; } - // Check if it converge, compute the norm of matrix U - B1 + // Check if it converge, compute the norm of matrix A - W1 conv = 10.0; if (it >= itconv) { add(one, A, -one, W10, opts); @@ -318,16 +294,19 @@ void qdwh( // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H) auto AT = conj_transpose(A); - gemm(one, AT, Acpy, zero, B, opts); + if (m == n) { + auto W10_n = W1.slice(0, n-1, 0, n-1); + gemm(one, AT, Acpy, zero, W10_n, opts); + slate::copy(W10_n, H, opts); + } + else { + gemm(one, AT, Acpy, zero, H, opts); + } // todo: try something like her2k to compute H //her2k(one, A, W10, rzero, H, opts); //auto AL = HermitianMatrix( - // slate::Uplo::Lower, B ); + // slate::Uplo::Lower, H ); //slate::copy(AL, H, opts); - - A.releaseWorkspace(); - W.releaseWorkspace(); - Q.releaseWorkspace(); } } // namespace impl diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index b02a264a8..339ec5ff4 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -13,6 +13,7 @@ #include "scalapack_wrappers.hh" #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" +#include "nvToolsExt.h" #include #include @@ -76,11 +77,10 @@ void test_qdwh_work(Params& params, bool run) int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); int64_t lldA = blas::max(1, mlocA); // local leading dimension of A std::vector A_data; - std::vector B_data; int64_t mlocH = num_local_rows_cols(n, nb, myrow, p); int64_t nlocH = num_local_rows_cols(n, nb, mycol, q); - int64_t lldH = blas::max(1, mlocH); // local leading dimension of A + int64_t lldH = blas::max(1, mlocH); // local leading dimension of H std::vector H_data; // matrix QR, for checking result @@ -89,7 +89,7 @@ void test_qdwh_work(Params& params, bool run) Id.insertLocalTiles(); slate::Matrix A; - slate::Matrix B; + slate::Matrix H; //slate::HermitianMatrix H; if (origin != slate::Origin::ScaLAPACK) { // Copy local ScaLAPACK data to GPU or CPU tiles. @@ -98,8 +98,8 @@ void test_qdwh_work(Params& params, bool run) A.insertLocalTiles(origin_target); //H = slate::HermitianMatrix(slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD); //H.insertLocalTiles(origin_target); - B = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); + H = slate::Matrix(n, n, nb, p, q, MPI_COMM_WORLD); + H.insertLocalTiles(origin_target); } else { // create SLATE matrices from the ScaLAPACK layouts @@ -107,8 +107,8 @@ void test_qdwh_work(Params& params, bool run) A = slate::Matrix::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); //H_data.resize( lldH * nlocH ); //H = slate::HermitianMatrix::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); - B_data.resize( lldA * nlocA ); - B = slate::Matrix::fromScaLAPACK(m, n, &B_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + H_data.resize( lldH * nlocA ); + H = slate::Matrix::fromScaLAPACK(n, n, &H_data[0], lldA, nb, p, q, MPI_COMM_WORLD); } real_t cond = 1 / std::numeric_limits::epsilon(); @@ -145,7 +145,9 @@ void test_qdwh_work(Params& params, bool run) // A = AH, // A will be written by the orthogonal polar factor // H is the symmetric positive semidefinite polar factor - slate::qdwh(A, B, itqr, itpo, opts); + nvtxRangePush("qdwh"); + slate::qdwh(A, H, itqr, itpo, opts); + nvtxRangePop(); time = barrier_get_wtime(MPI_COMM_WORLD) - time; @@ -204,7 +206,9 @@ void test_qdwh_work(Params& params, bool run) // //================================================== real_t normA = slate::norm(slate::Norm::Fro, Aref, opts); - slate::gemm(one, A, B, -one, Aref, opts); + slate::gemm(one, A, H, -one, Aref, opts); + //auto H = HermitianMatrix( + // slate::Uplo::Lower, B ); //slate::hemm(slate::Side::Right, one, H, A, -one, Aref, opts); real_t berr = slate::norm(slate::Norm::Fro, Aref, opts); params.error() = berr / normA; From 04f06d933e5f41f9d160a5736f2fca3487f19180 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Tue, 10 Jan 2023 09:51:19 -0500 Subject: [PATCH 21/49] Added exit with error if #itr > 100 in qdwh and minor cleaning --- src/qdwh.cc | 76 +++++++++++++++++++---------------------------- test/test_qdwh.cc | 4 +-- 2 files changed, 33 insertions(+), 47 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index 796cc05cd..0e22a3a4a 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -27,12 +27,11 @@ void qdwh( using real_t = blas::real_type; using blas::real; - int64_t lookahead = get_option( opts, Option::Lookahead, 1 ); int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1); max_panel_threads = get_option( opts, Option::MaxPanelThreads, max_panel_threads ); - real_t eps = 0.5 * std::numeric_limits::epsilon(); + real_t eps = std::numeric_limits::epsilon(); real_t tol1 = 5. * eps; real_t tol3 = pow(tol1, 1./3.); @@ -43,7 +42,6 @@ void qdwh( real_t Li, Liconv; real_t conv = 100.; scalar_t zero = 0.0, one = 1.0; - real_t rzero = 0.0; scalar_t alpha, beta; @@ -77,8 +75,8 @@ void qdwh( A.gridinfo(&order, &nprow, &npcol, &myrow, &mycol); // allocate m_W*n work space required for the qr-based iterations QR([A;Id]) - slate::Matrix W(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD); - slate::Matrix Q(m_W, n, nb, nprow, npcol, MPI_COMM_WORLD); + slate::Matrix W(m_W, n, nb, nprow, npcol, A.mpiComm()); + slate::Matrix Q(m_W, n, nb, nprow, npcol, A.mpiComm()); slate::TriangularFactors T; slate::Matrix Acpy; @@ -88,7 +86,7 @@ void qdwh( Acpy = H; } else { - Acpy = slate::Matrix(m, n, nb, nprow, npcol, MPI_COMM_WORLD); + Acpy = slate::Matrix(m, n, nb, nprow, npcol, A.mpiComm()); Acpy.insertLocalTiles(target); } @@ -102,13 +100,14 @@ void qdwh( // W10 have A matrix auto W1 = W.sub(0, mt-1, 0, nt-1); auto W10 = W1.slice(0, m-1, 0, n-1); - - // Second nt tiles of W, contain the identity matrix - auto W2 = W.sub(mt, mt_W-1, 0, nt-1); + auto W11 = W.sub(mt-1, mt-1, 0, nt-1); auto Q1 = Q.sub(0, mt-1, 0, nt-1); auto Q10 = Q1.slice(0, m-1, 0, n-1); + // Second nt tiles of W, contain the identity matrix + auto W2 = W.sub(mt, mt_W-1, 0, nt-1); + auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1); // backup A in Acpy to compute H @@ -117,7 +116,6 @@ void qdwh( // two norm estimation (largest singular value of A) real_t norm_est = norm2est( A, opts); alpha = 1.0; - norm_est = 1.; // scale the original matrix A to form A0 of the iterative loop add(alpha/scalar_t(norm_est), Acpy, zero, A, opts); @@ -139,7 +137,6 @@ void qdwh( Uplo::Upper, slate::Diag::NonUnit, R1 ); auto Rh = HermitianMatrix( Uplo::Upper, R1 ); - // todo: cheaper to do triangular solve than calling trtri trtri(R, opts); norminvR = norm(slate::Norm::One, Rh, opts); Li = (1.0 / norminvR) / normA; @@ -159,8 +156,7 @@ void qdwh( // itconv = number of iterations needed until |Li - 1| < tol1 // This should have converged in less than 50 iterations if (itconv > 100) { - exit(-1); - break; + slate_error("Failed to converge."); } itconv++; @@ -179,8 +175,7 @@ void qdwh( while (conv > tol3 || it < itconv) { // This should have converged in less than 50 iterations if (it > 100) { - exit(-1); - break; + slate_error("Failed to converge."); } it++; @@ -201,62 +196,53 @@ void qdwh( // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ] // [ W2 ] = [ Id ] - alpha = scalar_t(sqrt(c)); - set(zero, one, W2, opts); - - //if( doqr ) { - // geadd(alpha, A, zero, W10, opts); - // geqrf_qdwh_full(W, T1, opts); - //} - //else { - // geadd(alpha, W1, zero, W10, opts); - // geqrf_qdwh_full2(W, T1, opts); - //} // Factorize W = QR, and generate the associated Q // todo: replace following add by copy and scale, but why scale only scale by a real numbers? + alpha = scalar_t(sqrt(c)); + set(zero, zero, W11, opts); add(alpha, A, zero, W10, opts); - //copy(A, W1); - //scale(alpha, one, W1, opts); + set(zero, one, W2, opts); - //geqrf(W, T, opts); // naive impl + // Call a variant of geqrf annd unmqr that avoid the tiles of zeros, + // which are the tiles below the diagonal of W2 (identity) geqrf_qdwh_full(W, T, opts); set(zero, one, Q, opts); - //unmqr(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); //naive impl unmqr_qdwh_full(slate::Side::Left, slate::Op::NoTrans, W, T, Q, opts); // A = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * A auto Q2T = conj_transpose(Q2); alpha = scalar_t( (a-b/c)/sqrt(c) ); beta = scalar_t( b / c ); - // Copy U into C to check the convergence of QDWH + // Save a copy of A to check the convergence of QDWH if (it >= itconv ) { slate::copy(A, W10, opts); } gemm(alpha, Q10, Q2T, beta, A, opts); + itqr += 1; + + //facto = 0; // Main flops used in this step/ //flops_dgeqrf = FLOPS_DGEQRF( 2*m, n ); //flops_dorgqr = FLOPS_DORGQR( 2*m, n, n ); //flops_dgemm = FLOPS_DGEMM( m, n, n ); //flops += flops_dgeqrf + flops_dorgqr + flops_dgemm; - - itqr += 1; - //facto = 0; } else { - // Copy A into H to check the convergence of QDWH + // A = (b/c) * A + (a-b/c) * ( linsolve( C, linsolve( C, A') ) )'; + // Save a copy of A to check the convergence of QDWH if (it >= itconv) { slate::copy(A, W10, opts); } - // Make Q1 into an identity matrix - set(zero, one, W2, opts); - - // Compute Q1 = c * A' * A + I + // Save a copy of A slate::copy(A, Q10, opts); + + // Compute c * A' * A + I auto AT = conj_transpose(Q10); + set(zero, one, W2, opts); gemm(scalar_t(c), AT, A, one, W2, opts); // Solve R x = AT @@ -264,19 +250,19 @@ void qdwh( slate::Uplo::Upper, W2 ); posv(R, AT, opts); - // Update A = (a-b/c) * Q1T' + (b/c) * A + // Update A = (b/c) * A + (a-b/c) * ( linsolve( C, linsolve( C, A') ) )'; alpha = (a-b/c); beta = (b/c); - AT = conj_transpose(AT); - add(scalar_t(alpha), AT, scalar_t(beta), A, opts); + auto AT2 = conj_transpose(AT); + add(alpha, AT2, beta, A, opts); + + itpo += 1; + //facto = 1; // Main flops used in this step //flops_dgemm = FLOPS_DGEMM( m, n, n ); //flops_dpotrf = FLOPS_DPOTRF( m ); //flops_dtrsm = FLOPS_DTRSM( 'L', m, n ); //flops += flops_dgemm + flops_dpotrf + 2. * flops_dtrsm; - - itpo += 1; - //facto = 1; } // Check if it converge, compute the norm of matrix A - W1 diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index 339ec5ff4..ff584d506 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -107,8 +107,8 @@ void test_qdwh_work(Params& params, bool run) A = slate::Matrix::fromScaLAPACK(m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); //H_data.resize( lldH * nlocH ); //H = slate::HermitianMatrix::fromScaLAPACK(slate::Uplo::Lower, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); - H_data.resize( lldH * nlocA ); - H = slate::Matrix::fromScaLAPACK(n, n, &H_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + H_data.resize( lldH * nlocH ); + H = slate::Matrix::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); } real_t cond = 1 / std::numeric_limits::epsilon(); From ce9ef1c5f19a576ee90bd1d3dc24fd8f410b11bf Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Tue, 17 Jan 2023 10:42:38 -0500 Subject: [PATCH 22/49] Used real_t to cast numbers inn while-loop and minor cleaning --- src/qdwh.cc | 126 +++++++++++++++++++++++++++------------------------- 1 file changed, 66 insertions(+), 60 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index 0e22a3a4a..5fe0bd4fb 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -24,6 +24,14 @@ void qdwh( int& itqr, int& itpo, Options const& opts) { + // todo: + // optimizations: + // 1. reuse Q from first geqrf + // 2. avoid rounding m if m is not divisible by nb, because then geqrf/unmqr + // have extra zero rows + // in norm2est, when allocate vector mx1, need to allocate on row-processes only + // limitations: + // 1. works only when have square grid using real_t = blas::real_type; using blas::real; @@ -31,23 +39,23 @@ void qdwh( max_panel_threads = get_option( opts, Option::MaxPanelThreads, max_panel_threads ); - real_t eps = std::numeric_limits::epsilon(); - real_t tol1 = 5. * eps; - real_t tol3 = pow(tol1, 1./3.); - - bool optqr = 1; int itconv, it; real_t L2, sqd, dd, a1, a, b, c; real_t Li, Liconv; real_t conv = 100.; - scalar_t zero = 0.0, one = 1.0; + real_t rone = 1.; + scalar_t zero = 0.0, one = 1.0; scalar_t alpha, beta; real_t normA; real_t norminvR; + real_t eps = std::numeric_limits::epsilon(); + real_t tol1 = 5. * eps; + real_t tol3 = pow(tol1, rone/real_t(3.)); + int64_t mt = A.mt(); int64_t nt = A.nt(); @@ -90,24 +98,22 @@ void qdwh( Acpy.insertLocalTiles(target); } - // todo: do this on GPUs W.insertLocalTiles(target); Q.insertLocalTiles(target); - // To compute QR[A; Id] + // To compute QR[A; Id], allocate W and Q of size m_Wxn // First mt tiles of W contains A matrix in [0:m-1, 0:n-1] // W1 have more rows than A, since we round up the number of rows of W1 - // W10 have A matrix auto W1 = W.sub(0, mt-1, 0, nt-1); - auto W10 = W1.slice(0, m-1, 0, n-1); - auto W11 = W.sub(mt-1, mt-1, 0, nt-1); - auto Q1 = Q.sub(0, mt-1, 0, nt-1); + // W10 have A matrix + auto W10 = W1.slice(0, m-1, 0, n-1); auto Q10 = Q1.slice(0, m-1, 0, n-1); + // W11 is the last tile which has more rows in case we round up m + auto W11 = W.sub(mt-1, mt-1, 0, nt-1); // Second nt tiles of W, contain the identity matrix auto W2 = W.sub(mt, mt_W-1, 0, nt-1); - auto Q2 = Q.sub(mt, mt_W-1, 0, nt-1); // backup A in Acpy to compute H @@ -117,37 +123,33 @@ void qdwh( real_t norm_est = norm2est( A, opts); alpha = 1.0; - // scale the original matrix A to form A0 of the iterative loop + // scale the original matrix A to form A0 which is the initial matrix + // for the first iteration add(alpha/scalar_t(norm_est), Acpy, zero, A, opts); - // Calculate Li: reciprocal of condition number estimation + // Estimate the condition number // todo: use the Q factor in the first QR-based iteration - normA = norm(slate::Norm::One, A, opts); - if (optqr) { - // Estimate the condition number using QR - // This Q factor can be used in the first QR-based iteration - slate::copy(A, W10, opts); - // todo: the QR used to estimate the condition number can be used in the first QR iteration - // todo: put bound of 1-norm est compared to 2-norm - slate::geqrf(W1, T, opts); - //auto R = TrapezoidMatrix( - // Uplo::Upper, slate::Diag::NonUnit, W10 ); - auto R1 = W10.slice(0, n-1, 0, n-1); - auto R = TriangularMatrix( + + // Estimate the condition number using QR + // This Q factor can be used in the first QR-based iteration + slate::copy(A, W10, opts); + // todo: put bound of 1-norm est compared to 2-norm + slate::geqrf(W10, T, opts); + //auto R = TrapezoidMatrix( + // Uplo::Upper, slate::Diag::NonUnit, W10 ); + auto R1 = W10.slice(0, n-1, 0, n-1); + auto R = TriangularMatrix( Uplo::Upper, slate::Diag::NonUnit, R1 ); - auto Rh = HermitianMatrix( - Uplo::Upper, R1 ); - trtri(R, opts); - norminvR = norm(slate::Norm::One, Rh, opts); - Li = (1.0 / norminvR) / normA; - Li = norm_est / 1.1 * Li; - // *flops += FLOPS_DGEQRF( M, N ) - // + FLOPS_DTRTRI( N ); - } - else { - // todo - // Estimate the condition number using LU - } + // For now, compute the exact condition number using trtri + normA = norm(slate::Norm::One, A, opts); + trtri(R, opts); + norminvR = norm(slate::Norm::One, R, opts); + Li = (1.0 / norminvR) / normA; + // todo: will call trcondest instead + //slate::trcondest(slate::Norm::One, R, &Li, opts); + Li = norm_est / 1.1 * Li; + // *flops += FLOPS_DGEQRF( M, N ) + // + FLOPS_DTRTRI( N ); // Compute the number of iterations to converge itconv = 0; Liconv = Li; @@ -161,14 +163,15 @@ void qdwh( itconv++; L2 = Liconv * Liconv; - dd = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 ); - sqd = sqrt(1.0 + dd); - a1 = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0; + dd = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); + sqd = sqrt( rone + dd ); + a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd + + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0); a = real(a1); - b = (a - 1.0) * (a - 1.0) / 4.0; - c = a + b - 1.0; + b = ( a - rone ) * ( a - rone ) / real_t(4.0); + c = a + b - rone; // Update Liconv - Liconv = Liconv * real_t((a + b * L2) / (1.0 + c * L2)); + Liconv = Liconv * ( a + b * L2 ) / ( rone + c * L2 ); } it = 0; @@ -180,31 +183,28 @@ void qdwh( it++; // Compute parameters L,a,b,c - //L2 = double (Li * Li); L2 = Li * Li; - dd = pow( 4.0 * (1.0 - L2 ) / (L2 * L2), 1.0/3.0 ); - sqd = sqrt(1.0 + dd); - a1 = sqd + sqrt( 8.0 - 4.0 * dd + 8.0 * (2.0 - L2) / (L2 * sqd) ) / 2.0; + dd = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); + sqd = sqrt( rone + dd ); + a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd + + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0); a = real(a1); - b = (a - 1.0) * (a - 1.0) / 4.0; - c = a + b - 1.0; + b = ( a - rone ) * ( a - rone ) / real_t(4.0); + c = a + b - rone; // Update Li - Li = Li * real_t ((a + b * L2) / (1.0 + c * L2)); - - if (real(c) > 100.) { - //int64_t doqr = !optqr || it > 1; + Li = Li * ( a + b * L2 ) / ( rone + c * L2 ); + if (c > real_t(100.)) { // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ] // [ W2 ] = [ Id ] // Factorize W = QR, and generate the associated Q - // todo: replace following add by copy and scale, but why scale only scale by a real numbers? alpha = scalar_t(sqrt(c)); set(zero, zero, W11, opts); add(alpha, A, zero, W10, opts); set(zero, one, W2, opts); - // Call a variant of geqrf annd unmqr that avoid the tiles of zeros, + // Call a variant of geqrf and unmqr that avoid the tiles of zeros, // which are the tiles below the diagonal of W2 (identity) geqrf_qdwh_full(W, T, opts); @@ -246,9 +246,9 @@ void qdwh( gemm(scalar_t(c), AT, A, one, W2, opts); // Solve R x = AT - auto R = slate::HermitianMatrix( + auto U = slate::HermitianMatrix( slate::Uplo::Upper, W2 ); - posv(R, AT, opts); + posv(U, AT, opts); // Update A = (b/c) * A + (a-b/c) * ( linsolve( C, linsolve( C, A') ) )'; alpha = (a-b/c); beta = (b/c); @@ -278,6 +278,12 @@ void qdwh( printf("%-7d %-7d\n", itqr, itpo); } + if (itqr + itpo > 6) { + printf("\n Converged after %d. Check what is the issue," + "because QDWH needs <= 6 iterations \n", + itqr+itpo); + } + // A = U*H ==> H = U'*A ==> H = 0.5*(H'+H) auto AT = conj_transpose(A); if (m == n) { From 7c688311876918644e922dfe3dbb726255f6a214 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 23 Jan 2023 09:46:38 -0500 Subject: [PATCH 23/49] Fixed W2 allocation in norm2est to have it work with grid pxq where p != q. Changed dd computing in qdwh for-now and minor changes --- src/norm2est.cc | 7 +++++-- src/qdwh.cc | 21 +++++++++++---------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/norm2est.cc b/src/norm2est.cc index b826f1f2a..9dabe4d23 100644 --- a/src/norm2est.cc +++ b/src/norm2est.cc @@ -56,7 +56,7 @@ norm2est( real_t tol = 1.e-1; int64_t mloc = numberLocalRowOrCol(m, mb, myrow, izero, p); - int64_t nloc = numberLocalRowOrCol(n, nb, myrow, izero, q); + int64_t nloc = numberLocalRowOrCol(n, nb, mycol, izero, q); int64_t lldA = blas::max(1, mloc); int64_t lldA_n = blas::max(1, nloc); @@ -67,7 +67,10 @@ norm2est( std::vector local_sums(n); std::vector global_sums(n); std::vector W1(lldA); - std::vector W2(lldA_n); + std::vector W2(n); + // todo: W2 should needs only lldA_n, but it does not work with grid pxq, + // where p!=q + //std::vector W2(lldA_n); auto XL = slate::Matrix::fromScaLAPACK( n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm()); diff --git a/src/qdwh.cc b/src/qdwh.cc index 5fe0bd4fb..ef37dd5a5 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -30,8 +30,6 @@ void qdwh( // 2. avoid rounding m if m is not divisible by nb, because then geqrf/unmqr // have extra zero rows // in norm2est, when allocate vector mx1, need to allocate on row-processes only - // limitations: - // 1. works only when have square grid using real_t = blas::real_type; using blas::real; @@ -44,12 +42,11 @@ void qdwh( real_t L2, sqd, dd, a1, a, b, c; real_t Li, Liconv; real_t conv = 100.; - real_t rone = 1.; + real_t rzero = 0.0, rone = 1.0; scalar_t zero = 0.0, one = 1.0; scalar_t alpha, beta; - real_t normA; real_t norminvR; real_t eps = std::numeric_limits::epsilon(); @@ -141,13 +138,13 @@ void qdwh( auto R = TriangularMatrix( Uplo::Upper, slate::Diag::NonUnit, R1 ); // For now, compute the exact condition number using trtri - normA = norm(slate::Norm::One, A, opts); trtri(R, opts); norminvR = norm(slate::Norm::One, R, opts); - Li = (1.0 / norminvR) / normA; + real_t smin_est = 1./norminvR; + Li = smin_est / sqrt(n); + // todo: will call trcondest instead //slate::trcondest(slate::Norm::One, R, &Li, opts); - Li = norm_est / 1.1 * Li; // *flops += FLOPS_DGEQRF( M, N ) // + FLOPS_DTRTRI( N ); @@ -184,7 +181,11 @@ void qdwh( // Compute parameters L,a,b,c L2 = Li * Li; - dd = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); + //dd = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); + if (abs( L2 - one) <= 10*eps) + dd = rzero; + else + dd = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); sqd = sqrt( rone + dd ); a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0); @@ -194,7 +195,7 @@ void qdwh( // Update Li Li = Li * ( a + b * L2 ) / ( rone + c * L2 ); - if (c > real_t(100.)) { + if (c > 100.) { // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ] // [ W2 ] = [ Id ] @@ -280,7 +281,7 @@ void qdwh( if (itqr + itpo > 6) { printf("\n Converged after %d. Check what is the issue," - "because QDWH needs <= 6 iterations \n", + "because QDWH needs <= 6 iterations. \n", itqr+itpo); } From ca1db42966677535e15e22ffaca3ff36da09b94c Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 23 Jan 2023 10:50:53 -0500 Subject: [PATCH 24/49] minor on printf --- src/qdwh.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index ef37dd5a5..80f4376bf 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -280,8 +280,8 @@ void qdwh( } if (itqr + itpo > 6) { - printf("\n Converged after %d. Check what is the issue," - "because QDWH needs <= 6 iterations. \n", + printf("\nConverged after %d. Check what is the issue, " + "because QDWH needs <= 6 iterations.\n", itqr+itpo); } From 3afc4cbe59d83d9da32c8750a1583fa3c883c67f Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 30 Jan 2023 10:24:17 -0500 Subject: [PATCH 25/49] throw err if invalid target --- src/norm2est.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/norm2est.cc b/src/norm2est.cc index 9dabe4d23..8c2a822af 100644 --- a/src/norm2est.cc +++ b/src/norm2est.cc @@ -187,6 +187,7 @@ norm2est(matrix_type& A, return impl::norm2est( A, opts ); break; } + throw std::exception(); // todo: invalid target } //------------------------------------------------------------------------------ From 5b19e0a7be7b892ef3f76a5d44b0cbe152b71b21 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 30 Jan 2023 12:49:30 -0500 Subject: [PATCH 26/49] minor on doc --- src/norm2est.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/norm2est.cc b/src/norm2est.cc index 8c2a822af..2cb10bb1f 100644 --- a/src/norm2est.cc +++ b/src/norm2est.cc @@ -17,7 +17,7 @@ namespace impl { //------------------------------------------------------------------------------ /// @internal -/// Distributed parallel general matrix norm. +/// Distributed parallel general matrix 2-norm estimate. /// Generic implementation for any target. /// @ingroup norm_specialization /// From 952b11bdc17d211b31376512f1295fd239fc73d4 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 30 Jan 2023 12:57:43 -0500 Subject: [PATCH 27/49] remove nvToolsExt.h --- test/test_qdwh.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index ff584d506..b09055284 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -13,7 +13,6 @@ #include "scalapack_wrappers.hh" #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "nvToolsExt.h" #include #include @@ -145,9 +144,7 @@ void test_qdwh_work(Params& params, bool run) // A = AH, // A will be written by the orthogonal polar factor // H is the symmetric positive semidefinite polar factor - nvtxRangePush("qdwh"); slate::qdwh(A, H, itqr, itpo, opts); - nvtxRangePop(); time = barrier_get_wtime(MPI_COMM_WORLD) - time; From 398703d19a2b3a6051600f9f5d5838494ea52686 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 30 Jan 2023 13:26:25 -0500 Subject: [PATCH 28/49] minor --- test/test_qdwh.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index b09055284..002720f3e 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -176,7 +176,7 @@ void test_qdwh_work(Params& params, bool run) params.gflops() = gflop / time; print_matrix("U: orthogonal polar factor", A, params); - //print_matrix("H: symmetric postive semi definite factor", H, params); + print_matrix("H: symmetric postive semi definite factor", H, params); } if (check) { From d546c84114a12eaf4e215ed6393cfdd30ed118ff Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 30 Jan 2023 13:48:49 -0500 Subject: [PATCH 29/49] remove redifned SLATE_HAVE_SCALAPACK --- test/test_qdwh.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index 002720f3e..ceea1f1ac 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -18,7 +18,6 @@ #include #include #include -#define SLATE_HAVE_SCALAPACK //------------------------------------------------------------------------------ template void test_qdwh_work(Params& params, bool run) From 6d48ec2ff0ae9070c54622594f113ce7516c7a3d Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 30 Jan 2023 22:27:56 -0500 Subject: [PATCH 30/49] Added qdwh to run_tests. Trying to fix dd computing in qdwh and used trcondest. Reduced the condition number of the tested matrix --- src/qdwh.cc | 26 ++++++++++++++------------ test/run_tests.py | 7 +++++++ test/test_qdwh.cc | 2 +- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index 80f4376bf..b47d34ec1 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -47,7 +47,7 @@ void qdwh( scalar_t zero = 0.0, one = 1.0; scalar_t alpha, beta; - real_t norminvR; + real_t normR; real_t eps = std::numeric_limits::epsilon(); real_t tol1 = 5. * eps; @@ -137,16 +137,13 @@ void qdwh( auto R1 = W10.slice(0, n-1, 0, n-1); auto R = TriangularMatrix( Uplo::Upper, slate::Diag::NonUnit, R1 ); - // For now, compute the exact condition number using trtri - trtri(R, opts); - norminvR = norm(slate::Norm::One, R, opts); - real_t smin_est = 1./norminvR; + normR = norm(slate::Norm::One, R, opts); + slate::trcondest(slate::Norm::One, R, &Li, opts); + real_t smin_est = normR*Li; Li = smin_est / sqrt(n); - // todo: will call trcondest instead //slate::trcondest(slate::Norm::One, R, &Li, opts); - // *flops += FLOPS_DGEQRF( M, N ) - // + FLOPS_DTRTRI( N ); + // *flops += FLOPS_DGEQRF( M, N ); // Compute the number of iterations to converge itconv = 0; Liconv = Li; @@ -160,7 +157,10 @@ void qdwh( itconv++; L2 = Liconv * Liconv; - dd = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); + if (abs( L2 - rone) <= 10*eps) + dd = rzero; + else + dd = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); sqd = sqrt( rone + dd ); a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0); @@ -182,10 +182,12 @@ void qdwh( // Compute parameters L,a,b,c L2 = Li * Li; //dd = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); - if (abs( L2 - one) <= 10*eps) + if (abs( L2 - rone) <= 10*eps) { dd = rzero; - else - dd = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); + } + else { + dd = std::pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); + } sqd = sqrt( rone + dd ); a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0); diff --git a/test/run_tests.py b/test/run_tests.py index 77dee5bed..a907b54da 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -75,6 +75,7 @@ group_cat.add_argument( '--aux', action='store_true', help='run auxiliary routine tests' ), group_cat.add_argument( '--norms', action='store_true', help='run norm tests' ), group_cat.add_argument( '--cond', action='store_true', help='run condition number estimate tests' ), + group_cat.add_argument( '--qdwh', action='store_true', help='run the polar decomposition qdwh' ), ] # map category objects to category names: ['lu', 'chol', ...] categories = list( map( lambda x: x.dest, categories ) ) @@ -574,6 +575,12 @@ def filter_csv( values, csv ): #[ 'pbcon', gen + dtype + la + n + kd + uplo ], ] +# polar +if (opts.qdwh): + cmds += [ + [ 'qdwh', gen + dtype + n ], + ] + # aux if (opts.aux): cmds += [ diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index ceea1f1ac..f51763f9d 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -109,7 +109,7 @@ void test_qdwh_work(Params& params, bool run) H = slate::Matrix::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); } - real_t cond = 1 / std::numeric_limits::epsilon(); + real_t cond = 1 / (100 * std::numeric_limits::epsilon() ); params.matrix.kind.set_default("svd"); params.matrix.cond.set_default(cond); From 2acf39601029f974259c0a8c42ae0479a57b5545 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Wed, 1 Feb 2023 12:06:58 -0500 Subject: [PATCH 31/49] Removed an extra allocation --- src/norm2est.cc | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/norm2est.cc b/src/norm2est.cc index 2cb10bb1f..e670b7e3b 100644 --- a/src/norm2est.cc +++ b/src/norm2est.cc @@ -44,6 +44,7 @@ norm2est( scalar_t one = 1.; scalar_t zero = 0.; + real_t rone = 1.; real_t alpha; int64_t cnt = 0; @@ -67,17 +68,11 @@ norm2est( std::vector local_sums(n); std::vector global_sums(n); std::vector W1(lldA); - std::vector W2(n); - // todo: W2 should needs only lldA_n, but it does not work with grid pxq, - // where p!=q - //std::vector W2(lldA_n); auto XL = slate::Matrix::fromScaLAPACK( n, 1, &global_sums[0], n, nb, 1, p, q, A.mpiComm()); auto AX = slate::Matrix::fromScaLAPACK( m, 1, &W1[0], lldA, mb, 1, p, q, A.mpiComm()); - auto X = slate::Matrix::fromScaLAPACK( - n, 1, &W2[0], lldA_n, nb, 1, p, q, A.mpiComm()); // Two norm estimation // First: let's compute the x vector such that @@ -112,11 +107,10 @@ norm2est( e0 = e; // Scale X = X / ||X|| - alpha = 1.0/normX; - add(scalar_t(alpha), XL, zero, X, opts); + scale(rone, normX, XL, opts); // Compute Ax = A * sx - gemm(one, A, X, zero, AX, opts); + gemm(one, A, XL, zero, AX, opts); // todo: still need to add the following //if nnz(Sx) == 0 @@ -125,11 +119,9 @@ norm2est( // Compute x = A' * A * x = A' * Ax auto AT = conjTranspose(A); - // todo: why this set is needed when using multiple mpi rank - // todo: send to Sebastien + // todo: why this set is needed when using multiple mpi rank and using gemmA set(zero, zero, XL); gemm(one, AT, AX, zero, XL, opts); - //gemmC(one, AT, AX, zero, XL, opts); // Compute ||X||, ||AX|| normX = norm(Norm::Fro, XL, opts); From 516cae64fb290e0c2953c510d432e30c362ca8ce Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 2 Feb 2023 09:53:43 -0500 Subject: [PATCH 32/49] Changed dd to avoid overflow --- src/qdwh.cc | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index b47d34ec1..1d0a77c35 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -130,7 +130,6 @@ void qdwh( // Estimate the condition number using QR // This Q factor can be used in the first QR-based iteration slate::copy(A, W10, opts); - // todo: put bound of 1-norm est compared to 2-norm slate::geqrf(W10, T, opts); //auto R = TrapezoidMatrix( // Uplo::Upper, slate::Diag::NonUnit, W10 ); @@ -157,10 +156,8 @@ void qdwh( itconv++; L2 = Liconv * Liconv; - if (abs( L2 - rone) <= 10*eps) - dd = rzero; - else - dd = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); + dd = std::pow( real_t(4.0) * ( rone - L2 ), rone / real_t(3.0) ) * + std::pow( L2, real_t(-2.0) / real_t(3.0) ); sqd = sqrt( rone + dd ); a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0); @@ -181,13 +178,8 @@ void qdwh( // Compute parameters L,a,b,c L2 = Li * Li; - //dd = pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); - if (abs( L2 - rone) <= 10*eps) { - dd = rzero; - } - else { - dd = std::pow( real_t(4.0) * ( rone - L2 ) / ( L2 * L2 ), rone / real_t(3.0) ); - } + dd = std::pow( real_t(4.0) * ( rone - L2 ), rone / real_t(3.0) ) * + std::pow( L2, real_t(-2.0) / real_t(3.0) ); sqd = sqrt( rone + dd ); a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0); From 22e050af11d51cfacf931aa6741b1ec03d8a48fa Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 2 Feb 2023 09:57:10 -0500 Subject: [PATCH 33/49] =?UTF-8?q?test=20ill-cond=C3=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_qdwh.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index f51763f9d..f033b2c67 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -109,7 +109,7 @@ void test_qdwh_work(Params& params, bool run) H = slate::Matrix::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); } - real_t cond = 1 / (100 * std::numeric_limits::epsilon() ); + real_t cond = 1 / (std::numeric_limits::epsilon() ); params.matrix.kind.set_default("svd"); params.matrix.cond.set_default(cond); From 6ea76e69ebfe97176862f4ccd1a046dc2ee2a061 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 2 Feb 2023 11:09:55 -0500 Subject: [PATCH 34/49] Fix in makefil, used impl namesapce and minor to the docs in geqrf_qdwh_full --- GNUmakefile | 2 +- src/geqrf_qdwh_full.cc | 80 +++++++++++++++++++----------------------- 2 files changed, 38 insertions(+), 44 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 782b35d00..7773cc72e 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -639,6 +639,7 @@ tester_src += \ test/test_pbsv.cc \ test/test_posv.cc \ test/test_potri.cc \ + test/test_qdwh.cc \ test/test_scale.cc \ test/test_scale_row_col.cc \ test/test_set.cc \ @@ -676,7 +677,6 @@ ifneq ($(have_fortran),) test/pdlantr.f \ test/pclantr.f \ test/pzlantr.f \ - test/test_qdwh.cc \ # End. Add alphabetically, by base name after precision. endif endif diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc index f7bc10c74..ac3ae303e 100644 --- a/src/geqrf_qdwh_full.cc +++ b/src/geqrf_qdwh_full.cc @@ -10,10 +10,7 @@ namespace slate { -// specialization namespace differentiates, e.g., -// internal::geqrf_qdwh_full from internal::specialization::geqrf -namespace internal { -namespace specialization { +namespace impl { //------------------------------------------------------------------------------ /// An auxiliary routine to find each rank's first (top-most) row @@ -31,8 +28,9 @@ namespace specialization { /// @ingroup geqrf_qdwh_full_specialization /// template -void geqrf_compute_first_indices(Matrix& A_panel, int64_t k, - std::vector< int64_t >& first_indices) +void geqrf_compute_first_indices( + Matrix& A_panel, int64_t k, + std::vector< int64_t >& first_indices) { // Find ranks in this column. std::set ranks_set; @@ -60,13 +58,13 @@ void geqrf_compute_first_indices(Matrix& A_panel, int64_t k, /// /// ColMajor layout is assumed /// -/// @ingroup geqrf_specialization +/// @ingroup geqrf_impl /// template -void geqrf_qdwh_full(slate::internal::TargetType, +void geqrf_qdwh_full( Matrix& A, TriangularFactors& T, - int64_t ib, int max_panel_threads, int64_t lookahead) + Options const& opts ) { using BcastList = typename Matrix::BcastList; using device_info_t = lapack::device_info_int; @@ -78,7 +76,15 @@ void geqrf_qdwh_full(slate::internal::TargetType, const int priority_zero = 0; const int priority_one = 1; const int life_factor_one = 1; - const bool set_hold = lookahead > 0; // Do tileGetAndHold in the bcast + + // Options + int64_t lookahead = get_option( opts, Option::Lookahead, 1 ); + int64_t ib = get_option( opts, Option::InnerBlocking, 16 ); + int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1); + max_panel_threads = get_option( opts, Option::MaxPanelThreads, + max_panel_threads ); + + bool set_hold = lookahead > 0; // Do tileGetAndHold in the bcast int64_t A_mt = A.mt(); int64_t A_nt = A.nt(); @@ -364,34 +370,17 @@ void geqrf_qdwh_full(slate::internal::TargetType, } } -} // namespace specialization -} // namespace internal - -//------------------------------------------------------------------------------ -/// Version with target as template parameter. -/// @ingroup geqrf_qdwh_full_specialization -/// -template -void geqrf_qdwh_full(Matrix& A, - TriangularFactors& T, - Options const& opts) -{ - int64_t lookahead = get_option( opts, Option::Lookahead, 1 ); - - int64_t ib = get_option( opts, Option::InnerBlocking, 16 ); - - int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1); - max_panel_threads = get_option( opts, Option::MaxPanelThreads, max_panel_threads ); - - internal::specialization::geqrf_qdwh_full(internal::TargetType(), - A, T, - ib, max_panel_threads, lookahead); -} +} // namespace impl //------------------------------------------------------------------------------ -/// Distributed parallel QR factorization. +/// Distributed parallel customized QR factorization. +/// Required for the QR-based iterations in the polar decomposition QDWH. /// -/// Computes a QR factorization of an m-by-n matrix $A$. +/// Computes a QR factorization of m-by-n matrix $A$, m \ge 2n, +/// and takes advantage of the trailing identity matrix structure. +/// A = [ A0 ] full matrix ( m0-by-n, where m0 = m - n) +/// [ A1 ] identity matrix (n-by-n) +/// Avoids doing computaions on the zero tiles below the diagonal of $A1$. /// The factorization has the form /// \[ /// A = QR, @@ -401,15 +390,16 @@ void geqrf_qdwh_full(Matrix& A, /// /// Complexity (in real): /// - for $m \ge n$, $\approx 2 m n^{2} - \frac{2}{3} n^{3}$ flops; -/// - for $m \le n$, $\approx 2 m^{2} n - \frac{2}{3} m^{3}$ flops; -/// - for $m = n$, $\approx \frac{4}{3} n^{3}$ flops. /// . //------------------------------------------------------------------------------ /// @tparam scalar_t /// One of float, double, std::complex, std::complex. //------------------------------------------------------------------------------ /// @param[in,out] A -/// On entry, the m-by-n matrix $A$. +/// On entry, the m-by-n matrix $A$, m \ge 2n, +/// A = [ A0 ] full matrix ( m0-by-n, where m0 = m - n) +/// [ A1 ] identity matrix (n-by-n) +/// /// On exit, the elements on and above the diagonal of the array contain /// the min(m,n)-by-n upper trapezoidal matrix $R$ (upper triangular /// if m >= n); the elements below the diagonal represent the unitary @@ -437,7 +427,8 @@ void geqrf_qdwh_full(Matrix& A, /// @ingroup geqrf_computational /// template -void geqrf_qdwh_full(Matrix& A, +void geqrf_qdwh_full( + Matrix& A, TriangularFactors& T, Options const& opts) { @@ -446,16 +437,19 @@ void geqrf_qdwh_full(Matrix& A, switch (target) { case Target::Host: case Target::HostTask: - geqrf_qdwh_full(A, T, opts); + impl::geqrf_qdwh_full( A, T, opts ); break; + case Target::HostNest: - geqrf_qdwh_full(A, T, opts); + impl::geqrf_qdwh_full( A, T, opts ); break; + case Target::HostBatch: - geqrf_qdwh_full(A, T, opts); + impl::geqrf_qdwh_full( A, T, opts ); break; + case Target::Devices: - geqrf_qdwh_full(A, T, opts); + impl::geqrf_qdwh_full( A, T, opts ); break; } // todo: return value for errors? From a7d423ff6cb260292a5f087404063af5872d7d13 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 2 Feb 2023 11:24:14 -0500 Subject: [PATCH 35/49] minor fixes --- src/norm2est.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/norm2est.cc b/src/norm2est.cc index e670b7e3b..7f69b90cf 100644 --- a/src/norm2est.cc +++ b/src/norm2est.cc @@ -103,7 +103,7 @@ norm2est( } // Third: start the while-loop X = X / ||X|| - while ((cnt < maxiter) && (fabs((e - e0)) > (tol * (e)))) { + while ((cnt < maxiter) && (std::abs(e - e0) > (tol * e))) { e0 = e; // Scale X = X / ||X|| @@ -127,7 +127,7 @@ norm2est( normX = norm(Norm::Fro, XL, opts); normAX = norm(Norm::Fro, AX, opts); - // Uodate e + // Update e e = normX / normAX; cnt++; } From 3bcf929bedb875bc33ff72e86f3f496cc4254840 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 2 Feb 2023 11:38:30 -0500 Subject: [PATCH 36/49] --amend --- src/norm2est.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/norm2est.cc b/src/norm2est.cc index 7f69b90cf..463435e31 100644 --- a/src/norm2est.cc +++ b/src/norm2est.cc @@ -45,7 +45,6 @@ norm2est( scalar_t one = 1.; scalar_t zero = 0.; real_t rone = 1.; - real_t alpha; int64_t cnt = 0; int64_t maxiter = min( 100, n ); @@ -57,9 +56,7 @@ norm2est( real_t tol = 1.e-1; int64_t mloc = numberLocalRowOrCol(m, mb, myrow, izero, p); - int64_t nloc = numberLocalRowOrCol(n, nb, mycol, izero, q); int64_t lldA = blas::max(1, mloc); - int64_t lldA_n = blas::max(1, nloc); // todo: do we still need reserveDevice here? if (target == Target::Devices) From a09c7582ad2846360e67bbdd6458709ccd4ac01e Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 2 Feb 2023 13:04:16 -0500 Subject: [PATCH 37/49] Applied Mark's comments --- docs/doxygen/groups.dox | 8 ++ src/qdwh.cc | 161 ++++++++++++++++------------------------ 2 files changed, 71 insertions(+), 98 deletions(-) diff --git a/docs/doxygen/groups.dox b/docs/doxygen/groups.dox index 6cb84a95e..dcbc88538 100644 --- a/docs/doxygen/groups.dox +++ b/docs/doxygen/groups.dox @@ -145,6 +145,14 @@ @defgroup svd_specialization Target implementations @} + ------------------------------------------------------------ + @defgroup group_qdwh Polar Decomposition + @{ + @defgroup qdwh Driver + @brief $A = U H$ + + @} + ------------------------------------------------------------ @defgroup group_blas2_top Level 2 BLAS and Auxiliary: O(n^2) work @brief Matrix and Matrix-vector operations that perform $O(n^2)$ work on $O(n^2)$ data. diff --git a/src/qdwh.cc b/src/qdwh.cc index 1d0a77c35..83820728f 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -10,14 +10,55 @@ namespace slate { -namespace impl { - //------------------------------------------------------------------------------ /// Distributed parallel polar decomposition based on QDWH algorithm. -/// Generic implementation for any target. -/// @ingroup qdwh_specialization /// -template +/// Computes a polar decomposition of an m-by-n matrix $A$. +/// The factorization has the form +/// \[ +/// A = UH, +/// \] +/// where $U$ is a matrix with orthonormal columns and $H$ is a Hermition +/// matrix. +/// +//------------------------------------------------------------------------------ +/// @tparam scalar_t +/// One of float, double, std::complex, std::complex. +//------------------------------------------------------------------------------ +/// @param[in,out] A +/// On entry, the m-by-n matrix $A$. +/// On exit, if return value = 0, the orthogonal polar factor $U$ +/// from the decomposition $A = U H$. +/// +/// @param[out] H +/// On exit, if return value = 0, the hermitian polar factor matrix $H$ +/// from the decomposition $A = U H$. +/// +/// @param[out] itqr +/// The number of the QR-based iterations. +/// +/// @param[out] itpo +/// The number of the Cholesky-based iterations. +/// +/// @param[in] opts +/// Additional options, as map of name = value pairs. Possible options: +/// - Option::Lookahead: +/// Number of panels to overlap with matrix updates. +/// lookahead >= 0. Default 1. +/// - Option::InnerBlocking: +/// Inner blocking to use for panel. Default 16. +/// - Option::MaxPanelThreads: +/// Number of threads to use for panel. Default omp_get_max_threads()/2. +/// - Option::Target: +/// Implementation to target. Possible values: +/// - HostTask: OpenMP tasks on CPU host [default]. +/// - HostNest: nested OpenMP parallel for loop on CPU host. +/// - HostBatch: batched BLAS on CPU host. +/// - Devices: batched BLAS on GPU device. +/// +/// @ingroup qdwh +/// +template void qdwh( Matrix& A, Matrix& H, @@ -33,21 +74,14 @@ void qdwh( using real_t = blas::real_type; using blas::real; - int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1); - - max_panel_threads = get_option( opts, Option::MaxPanelThreads, max_panel_threads ); - - int itconv, it; - - real_t L2, sqd, dd, a1, a, b, c; - real_t Li, Liconv; + // Constants real_t conv = 100.; - real_t rzero = 0.0, rone = 1.0; - + real_t rone = 1.0; scalar_t zero = 0.0, one = 1.0; scalar_t alpha, beta; - real_t normR; + // Options + Target target = get_option( opts, Option::Target, Target::HostTask ); real_t eps = std::numeric_limits::epsilon(); real_t tol1 = 5. * eps; @@ -65,15 +99,21 @@ void qdwh( int64_t nb = A.tileMb(0); + int itconv, it; + real_t L2, sqd, dd, a1, a, b, c; + real_t Li, Liconv; + real_t normR; // The QR-based iterations requires QR[A; Id], // W = [A; Id], where size of A is mxn, size of Id is nxn // To avoid having a clean up tile in the middle of the W matrix, // we round up the number of A rows (m), // so that size(W) = m_roundup + n - int64_t m_roundup = ((m + nb - 1) / nb ) * nb; + //int64_t m_roundup = ((m + nb - 1) / nb ) * nb; + int64_t m_roundup = roundup( m, nb ); int64_t m_W = m_roundup + n; int64_t mt_W = mt + nt; + int nprow, npcol; int myrow, mycol; GridOrder order; @@ -122,13 +162,11 @@ void qdwh( // scale the original matrix A to form A0 which is the initial matrix // for the first iteration - add(alpha/scalar_t(norm_est), Acpy, zero, A, opts); - - // Estimate the condition number - // todo: use the Q factor in the first QR-based iteration + scale(rone, norm_est, A, opts); // Estimate the condition number using QR // This Q factor can be used in the first QR-based iteration + // todo: use the Q factor in the first QR-based iteration slate::copy(A, W10, opts); slate::geqrf(W10, T, opts); //auto R = TrapezoidMatrix( @@ -296,106 +334,33 @@ void qdwh( //slate::copy(AL, H, opts); } -} // namespace impl - -//------------------------------------------------------------------------------ -/// Distributed parallel polar decomposition based on QDWH algorithm. -/// -/// Computes a polar decomposition of an m-by-n matrix $A$. -/// The factorization has the form -/// \[ -/// A = UH, -/// \] -/// where $U$ is a matrix with orthonormal columns and $H$ is a Hermition -/// matrix. -/// -/// Complexity (in real): -/// - for $m \ge n$, $\approx 2 m n^{2} - \frac{2}{3} n^{3}$ flops; -/// - for $m \le n$, $\approx 2 m^{2} n - \frac{2}{3} m^{3}$ flops; -/// - for $m = n$, $\approx \frac{4}{3} n^{3}$ flops. -/// . -//------------------------------------------------------------------------------ -/// @tparam scalar_t -/// One of float, double, std::complex, std::complex. -//------------------------------------------------------------------------------ -/// @param[in,out] A -/// On entry, the m-by-n matrix $A$. -/// On exit, if return value = 0, the orthogonal polar factor $U$ -/// from the decomposition $A = U H$. -/// -/// @param[out] H -/// On exit, if return value = 0, the hermitian polar factor matrix $H$ -/// from the decomposition $A = U H$. -/// -/// @param[in] opts -/// Additional options, as map of name = value pairs. Possible options: -/// - Option::Lookahead: -/// Number of panels to overlap with matrix updates. -/// lookahead >= 0. Default 1. -/// - Option::InnerBlocking: -/// Inner blocking to use for panel. Default 16. -/// - Option::MaxPanelThreads: -/// Number of threads to use for panel. Default omp_get_max_threads()/2. -/// - Option::Target: -/// Implementation to target. Possible values: -/// - HostTask: OpenMP tasks on CPU host [default]. -/// - HostNest: nested OpenMP parallel for loop on CPU host. -/// - HostBatch: batched BLAS on CPU host. -/// - Devices: batched BLAS on GPU device. -/// -/// @ingroup geqrf_computational -/// -template -void qdwh( - Matrix& A, - Matrix& B, - int& itqr, int& itpo, - Options const& opts) -{ - using internal::TargetType; - Target target = get_option( opts, Option::Target, Target::HostTask ); - - switch (target) { - case Target::Host: - case Target::HostTask: - case Target::HostNest: - case Target::HostBatch: - impl::qdwh( A, B, itqr, itpo, opts ); - break; - case Target::Devices: - impl::qdwh( A, B, itqr, itpo, opts ); - break; - } - // todo: return value for errors? -} - //------------------------------------------------------------------------------ // Explicit instantiations. template void qdwh( Matrix& A, - Matrix& B, + Matrix& H, int& itqr, int& itpo, Options const& opts); template void qdwh( Matrix& A, - Matrix& B, + Matrix& H, int& itqr, int& itpo, Options const& opts); template void qdwh< std::complex >( Matrix< std::complex >& A, - Matrix< std::complex >& B, + Matrix< std::complex >& H, int& itqr, int& itpo, Options const& opts); template void qdwh< std::complex >( Matrix< std::complex >& A, - Matrix< std::complex >& B, + Matrix< std::complex >& H, int& itqr, int& itpo, Options const& opts); From cdfc10ca0a521d782938eb43ff5397f0d1e7b8d7 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 2 Feb 2023 13:42:03 -0500 Subject: [PATCH 38/49] Replaced namespace by impl in umnqr_qdwh_full. Changed n to mn in run_test.py --- src/unmqr_qdwh_full.cc | 50 ++++++++++++++++++------------------------ test/run_tests.py | 2 +- 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc index 3b1277103..b13b8b502 100644 --- a/src/unmqr_qdwh_full.cc +++ b/src/unmqr_qdwh_full.cc @@ -11,23 +11,20 @@ namespace slate { -// specialization namespace differentiates, e.g., -// internal::unmqr_qdwh_full from internal::specialization::unmqr_qdwh_full -namespace internal { -namespace specialization { +namespace impl { //------------------------------------------------------------------------------ /// Distributed parallel multiply by Q from QR factorization. /// Generic implementation for any target. -/// @ingroup geqrf_specialization +/// @ingroup geqrf_impl /// template void unmqr_qdwh_full( - slate::internal::TargetType, Side side, Op op, Matrix& A, TriangularFactors& T, - Matrix& C) + Matrix& C, + Options const& opts ) { // trace::Block trace_block("unmqr"); // const int priority_one = 1; @@ -262,29 +259,21 @@ void unmqr_qdwh_full( C.clearWorkspace(); } -} // namespace specialization -} // namespace internal +} // namespace impl //------------------------------------------------------------------------------ -/// Version with target as template parameter. -/// @ingroup geqrf_specialization +/// Distributed parallel customized multiply by $Q$ from QR factorization. +/// Required for the QR-based iterations in the polar decomposition QDWH. /// -template -void unmqr_qdwh_full( - Side side, Op op, - Matrix& A, - TriangularFactors& T, - Matrix& C, - Options const& opts) -{ - internal::specialization::unmqr_qdwh_full(internal::TargetType(), - side, op, A, T, C); -} - -//------------------------------------------------------------------------------ -/// Distributed parallel multiply by $Q$ from QR factorization. +/// The $Q$ matrix is the output of geqrf_qdwh_full. Since the input matrix +/// to geqrf_qdwh_full has identity trailing matrix, the $Q$ matrix has +/// zero tiles below the diagonal. +/// Q = [ Q0 ] full matrix ( m0-by-n, where m0 = m - n) +/// [ Q1 ] zero tiles below the diagonal (n-by-n) /// /// Multiplies the general m-by-n matrix $C$ by $Q$ from QR factorization, +/// and takes advantage of the structure of the matrix by avoid operating +/// on the tiles of zeros, /// according to: /// /// op | side = Left | side = Right @@ -351,16 +340,19 @@ void unmqr_qdwh_full( case Target::Host: case Target::HostTask: default: - unmqr_qdwh_full(side, op, A, T, C, opts); + impl::unmqr_qdwh_full( side, op, A, T, C, opts ); break; + case Target::HostNest: - unmqr_qdwh_full(side, op, A, T, C, opts); + impl::unmqr_qdwh_full( side, op, A, T, C, opts ); break; + case Target::HostBatch: - unmqr_qdwh_full(side, op, A, T, C, opts); + impl::unmqr_qdwh_full( side, op, A, T, C, opts ); break; + case Target::Devices: - unmqr_qdwh_full(side, op, A, T, C, opts); + impl::unmqr_qdwh_full( side, op, A, T, C, opts ); break; } // todo: return value for errors? diff --git a/test/run_tests.py b/test/run_tests.py index a907b54da..2e27ef4fd 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -578,7 +578,7 @@ def filter_csv( values, csv ): # polar if (opts.qdwh): cmds += [ - [ 'qdwh', gen + dtype + n ], + [ 'qdwh', gen + dtype + mn ], ] # aux From fab06a19dd9ea69aa5972e8f9cfc80dc3a5080ef Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 2 Feb 2023 14:20:01 -0500 Subject: [PATCH 39/49] qdwh works m >= n --- test/run_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/run_tests.py b/test/run_tests.py index 2e27ef4fd..d67480af1 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -578,7 +578,7 @@ def filter_csv( values, csv ): # polar if (opts.qdwh): cmds += [ - [ 'qdwh', gen + dtype + mn ], + [ 'qdwh', gen + dtype + n + tall ], ] # aux From deec809ab7249a913181c2790f42fe8fe1759c4c Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Fri, 3 Feb 2023 10:41:43 -0500 Subject: [PATCH 40/49] fix flops count --- src/geqrf_qdwh_full.cc | 9 +++++++-- test/test_qdwh.cc | 10 ++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc index ac3ae303e..d1c4141c0 100644 --- a/src/geqrf_qdwh_full.cc +++ b/src/geqrf_qdwh_full.cc @@ -378,7 +378,8 @@ void geqrf_qdwh_full( /// /// Computes a QR factorization of m-by-n matrix $A$, m \ge 2n, /// and takes advantage of the trailing identity matrix structure. -/// A = [ A0 ] full matrix ( m0-by-n, where m0 = m - n) +/// A = [ A0 ] full matrix ( m0-by-n, where m0 = m - n, which the size of the +/// original matrix in qdwh) /// [ A1 ] identity matrix (n-by-n) /// Avoids doing computaions on the zero tiles below the diagonal of $A1$. /// The factorization has the form @@ -389,7 +390,11 @@ void geqrf_qdwh_full( /// (or upper trapezoidal if m < n). /// /// Complexity (in real): -/// - for $m \ge n$, $\approx 2 m n^{2} - \frac{2}{3} n^{3}$ flops; +/// The size of matrix $A$ is $m = m0 + n$; +/// Since the lower n-by-n of the matrix $A$ is the identity, then the +/// k householder vectors has m0 - k + k + 1 = m0 + 1 rows. +/// - for $m \ge n$, \Sigma_{k=1}^{n-1} 4 (n - k) (m0 + 1) = +/// $\approx 2 m0 n^{2} + 2 m0 n + n^{2}$ flops; /// . //------------------------------------------------------------------------------ /// @tparam scalar_t diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index f033b2c67..29553f22a 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -156,8 +156,14 @@ void test_qdwh_work(Params& params, bool run) // flop_compute_H double gflop_trcon = lapack::Gflop::geqrf(m, n); - double gflop_itqr = itqr * ( lapack::Gflop::geqrf(m+n, n) + - lapack::Gflop::unmqr(slate::Side::Left, m+n, n, n) + + // this flop count of the QR-based iteration is in case we are not taking advantage + // of the identity trailing submatrix: + //double gflop_itqr = itqr * ( lapack::Gflop::geqrf(m+n, n) + + // lapack::Gflop::unmqr(slate::Side::Left, m+n, n, n) + + // lapack::Gflop::gemm(m, n, n) ); + // when take advantage of the matrix structure, the flops count will be reduced: + double gflop_itqr = itqr * ( 2 * m * n^2 + + 2 * m * n^2 + lapack::Gflop::gemm(m, n, n) ); double gflop_itpo = itpo * ( lapack::Gflop::gemm(m, n, n) + From 169a63d5df35de61d0c853ec5067bd7b76590878 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 6 Feb 2023 09:38:58 -0500 Subject: [PATCH 41/49] minor --- test/test_qdwh.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index 29553f22a..c636a7f92 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -162,8 +162,8 @@ void test_qdwh_work(Params& params, bool run) // lapack::Gflop::unmqr(slate::Side::Left, m+n, n, n) + // lapack::Gflop::gemm(m, n, n) ); // when take advantage of the matrix structure, the flops count will be reduced: - double gflop_itqr = itqr * ( 2 * m * n^2 + - 2 * m * n^2 + + double gflop_itqr = itqr * ( 2 * m * std::pow(n, 2) + + 2 * m * std::pow(n, 2) + lapack::Gflop::gemm(m, n, n) ); double gflop_itpo = itpo * ( lapack::Gflop::gemm(m, n, n) + From fc35842e454a30264a1f739c90bfec67b34be2dd Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 6 Feb 2023 13:56:48 -0500 Subject: [PATCH 42/49] add const and delete dead code --- src/qdwh.cc | 30 +++++++++++++++--------------- src/unmqr_qdwh_full.cc | 1 - 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index 83820728f..469a71897 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -75,9 +75,9 @@ void qdwh( using blas::real; // Constants - real_t conv = 100.; - real_t rone = 1.0; - scalar_t zero = 0.0, one = 1.0; + const real_t conv = 100.; + const real_t r_one = 1.0; + const scalar_t zero = 0.0, one = 1.0; scalar_t alpha, beta; // Options @@ -85,7 +85,7 @@ void qdwh( real_t eps = std::numeric_limits::epsilon(); real_t tol1 = 5. * eps; - real_t tol3 = pow(tol1, rone/real_t(3.)); + real_t tol3 = pow(tol1, r_one/real_t(3.)); int64_t mt = A.mt(); int64_t nt = A.nt(); @@ -162,7 +162,7 @@ void qdwh( // scale the original matrix A to form A0 which is the initial matrix // for the first iteration - scale(rone, norm_est, A, opts); + scale(r_one, norm_est, A, opts); // Estimate the condition number using QR // This Q factor can be used in the first QR-based iteration @@ -194,16 +194,16 @@ void qdwh( itconv++; L2 = Liconv * Liconv; - dd = std::pow( real_t(4.0) * ( rone - L2 ), rone / real_t(3.0) ) * + dd = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) * std::pow( L2, real_t(-2.0) / real_t(3.0) ); - sqd = sqrt( rone + dd ); + sqd = sqrt( r_one + dd ); a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0); a = real(a1); - b = ( a - rone ) * ( a - rone ) / real_t(4.0); - c = a + b - rone; + b = ( a - r_one ) * ( a - r_one ) / real_t(4.0); + c = a + b - r_one; // Update Liconv - Liconv = Liconv * ( a + b * L2 ) / ( rone + c * L2 ); + Liconv = Liconv * ( a + b * L2 ) / ( r_one + c * L2 ); } it = 0; @@ -216,16 +216,16 @@ void qdwh( // Compute parameters L,a,b,c L2 = Li * Li; - dd = std::pow( real_t(4.0) * ( rone - L2 ), rone / real_t(3.0) ) * + dd = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) * std::pow( L2, real_t(-2.0) / real_t(3.0) ); - sqd = sqrt( rone + dd ); + sqd = sqrt( r_one + dd ); a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0); a = real(a1); - b = ( a - rone ) * ( a - rone ) / real_t(4.0); - c = a + b - rone; + b = ( a - r_one ) * ( a - r_one ) / real_t(4.0); + c = a + b - r_one; // Update Li - Li = Li * ( a + b * L2 ) / ( rone + c * L2 ); + Li = Li * ( a + b * L2 ) / ( r_one + c * L2 ); if (c > 100.) { // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ] diff --git a/src/unmqr_qdwh_full.cc b/src/unmqr_qdwh_full.cc index b13b8b502..a1ace1725 100644 --- a/src/unmqr_qdwh_full.cc +++ b/src/unmqr_qdwh_full.cc @@ -27,7 +27,6 @@ void unmqr_qdwh_full( Options const& opts ) { // trace::Block trace_block("unmqr"); - // const int priority_one = 1; using BcastList = typename Matrix::BcastList; // Assumes column major From 2be83cd6f6b4850796bce8cca457470b0b504321 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 6 Feb 2023 14:17:01 -0500 Subject: [PATCH 43/49] moving const up and minor changes --- src/norm2est.cc | 26 +++++++++++++------------- src/qdwh.cc | 21 ++++++++++----------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/src/norm2est.cc b/src/norm2est.cc index 463435e31..9d6cca8c7 100644 --- a/src/norm2est.cc +++ b/src/norm2est.cc @@ -31,23 +31,26 @@ norm2est( using real_t = blas::real_type; using blas::min; - int p, q; - int myrow, mycol; - int izero = 0; - GridOrder order; - A.gridinfo(&order, &p, &q, &myrow, &mycol); + // Constants + const scalar_t one = 1.; + const scalar_t zero = 0.; + const real_t r_one = 1.; + const int izero = 0; int64_t n = A.n(); int64_t m = A.m(); int64_t mb = A.tileMb(0); int64_t nb = A.tileNb(0); - scalar_t one = 1.; - scalar_t zero = 0.; - real_t rone = 1.; + int p, q; + int myrow, mycol; + GridOrder order; + A.gridinfo(&order, &p, &q, &myrow, &mycol); int64_t cnt = 0; int64_t maxiter = min( 100, n ); + int64_t mloc = numberLocalRowOrCol(m, mb, myrow, izero, p); + int64_t lldA = blas::max(1, mloc); real_t e = 0.; real_t e0 = 0.; @@ -55,10 +58,6 @@ norm2est( real_t normAX = 0; real_t tol = 1.e-1; - int64_t mloc = numberLocalRowOrCol(m, mb, myrow, izero, p); - int64_t lldA = blas::max(1, mloc); - - // todo: do we still need reserveDevice here? if (target == Target::Devices) A.reserveDeviceWorkspace(); @@ -104,7 +103,7 @@ norm2est( e0 = e; // Scale X = X / ||X|| - scale(rone, normX, XL, opts); + scale(r_one, normX, XL, opts); // Compute Ax = A * sx gemm(one, A, XL, zero, AX, opts); @@ -116,6 +115,7 @@ norm2est( // Compute x = A' * A * x = A' * Ax auto AT = conjTranspose(A); + // todo: why this set is needed when using multiple mpi rank and using gemmA set(zero, zero, XL); gemm(one, AT, AX, zero, XL, opts); diff --git a/src/qdwh.cc b/src/qdwh.cc index 469a71897..86bdb6809 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -70,25 +70,21 @@ void qdwh( // 1. reuse Q from first geqrf // 2. avoid rounding m if m is not divisible by nb, because then geqrf/unmqr // have extra zero rows - // in norm2est, when allocate vector mx1, need to allocate on row-processes only + using real_t = blas::real_type; using blas::real; // Constants - const real_t conv = 100.; + const scalar_t zero = 0.0; + const scalar_t one = 1.0; const real_t r_one = 1.0; - const scalar_t zero = 0.0, one = 1.0; - scalar_t alpha, beta; // Options Target target = get_option( opts, Option::Target, Target::HostTask ); - real_t eps = std::numeric_limits::epsilon(); - real_t tol1 = 5. * eps; - real_t tol3 = pow(tol1, r_one/real_t(3.)); - int64_t mt = A.mt(); int64_t nt = A.nt(); + int64_t nb = A.tileMb(0); int64_t m = A.m(); int64_t n = A.n(); @@ -97,23 +93,26 @@ void qdwh( return; } - int64_t nb = A.tileMb(0); + real_t eps = std::numeric_limits::epsilon(); + real_t tol1 = 5. * eps; + real_t tol3 = pow(tol1, r_one/real_t(3.)); int itconv, it; real_t L2, sqd, dd, a1, a, b, c; real_t Li, Liconv; real_t normR; + real_t conv = 10.0; + scalar_t alpha, beta; + // The QR-based iterations requires QR[A; Id], // W = [A; Id], where size of A is mxn, size of Id is nxn // To avoid having a clean up tile in the middle of the W matrix, // we round up the number of A rows (m), // so that size(W) = m_roundup + n - //int64_t m_roundup = ((m + nb - 1) / nb ) * nb; int64_t m_roundup = roundup( m, nb ); int64_t m_W = m_roundup + n; int64_t mt_W = mt + nt; - int nprow, npcol; int myrow, mycol; GridOrder order; From 23bc075a6f1dcf63dbc2e586d2bcaaea59cd4549 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 6 Feb 2023 15:19:02 -0500 Subject: [PATCH 44/49] test matrices with a smaller cond --- test/test_qdwh.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index c636a7f92..b189035fd 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -109,7 +109,7 @@ void test_qdwh_work(Params& params, bool run) H = slate::Matrix::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); } - real_t cond = 1 / (std::numeric_limits::epsilon() ); + real_t cond = 1 / (100 * std::numeric_limits::epsilon() ); params.matrix.kind.set_default("svd"); params.matrix.cond.set_default(cond); From c8c88bb5633bff7ef551b7f6e0f685441faa5a94 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Tue, 7 Feb 2023 15:12:29 -0500 Subject: [PATCH 45/49] Computed parameters in double to avoid overflow in complex-float. Minor update on gflops count --- src/qdwh.cc | 21 +++++++++++---------- test/test_qdwh.cc | 12 ++++++------ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/qdwh.cc b/src/qdwh.cc index 86bdb6809..4582840ac 100644 --- a/src/qdwh.cc +++ b/src/qdwh.cc @@ -98,11 +98,11 @@ void qdwh( real_t tol3 = pow(tol1, r_one/real_t(3.)); int itconv, it; - real_t L2, sqd, dd, a1, a, b, c; - real_t Li, Liconv; real_t normR; real_t conv = 10.0; scalar_t alpha, beta; + double L2, sqd, a1, a, b, c, dd; + real_t Li, Liconv; // The QR-based iterations requires QR[A; Id], // W = [A; Id], where size of A is mxn, size of Id is nxn @@ -192,17 +192,17 @@ void qdwh( } itconv++; - L2 = Liconv * Liconv; + L2 = double(Liconv) * double(Liconv); dd = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) * std::pow( L2, real_t(-2.0) / real_t(3.0) ); - sqd = sqrt( r_one + dd ); - a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd + + sqd = sqrt( r_one + real_t(dd) ); + a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * real_t(dd) + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0); a = real(a1); b = ( a - r_one ) * ( a - r_one ) / real_t(4.0); c = a + b - r_one; // Update Liconv - Liconv = Liconv * ( a + b * L2 ) / ( r_one + c * L2 ); + Liconv = Liconv * real_t(( a + b * L2 ) / ( r_one + c * L2 )); } it = 0; @@ -214,17 +214,17 @@ void qdwh( it++; // Compute parameters L,a,b,c - L2 = Li * Li; + L2 = double(Li) * double(Li); dd = std::pow( real_t(4.0) * ( r_one - L2 ), r_one / real_t(3.0) ) * std::pow( L2, real_t(-2.0) / real_t(3.0) ); - sqd = sqrt( r_one + dd ); - a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * dd + + sqd = sqrt( r_one + real_t(dd) ); + a1 = sqd + sqrt( real_t(8.0) - real_t(4.0) * real_t(dd) + real_t(8.0) * ( real_t(2.0) - L2 ) / ( L2 * sqd ) ) / real_t(2.0); a = real(a1); b = ( a - r_one ) * ( a - r_one ) / real_t(4.0); c = a + b - r_one; // Update Li - Li = Li * ( a + b * L2 ) / ( r_one + c * L2 ); + Li = Li * real_t(( a + b * L2 ) / ( r_one + c * L2 )); if (c > 100.) { // Generate the matrix W = [ W1 ] = [ sqrt(c) * A ] @@ -326,6 +326,7 @@ void qdwh( else { gemm(one, AT, Acpy, zero, H, opts); } + // todo: try something like her2k to compute H //her2k(one, A, W10, rzero, H, opts); //auto AL = HermitianMatrix( diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index b189035fd..7cdb460f9 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -109,7 +109,7 @@ void test_qdwh_work(Params& params, bool run) H = slate::Matrix::fromScaLAPACK(n, n, &H_data[0], lldH, nb, p, q, MPI_COMM_WORLD); } - real_t cond = 1 / (100 * std::numeric_limits::epsilon() ); + real_t cond = 1 / std::numeric_limits::epsilon(); params.matrix.kind.set_default("svd"); params.matrix.cond.set_default(cond); @@ -160,13 +160,13 @@ void test_qdwh_work(Params& params, bool run) // of the identity trailing submatrix: //double gflop_itqr = itqr * ( lapack::Gflop::geqrf(m+n, n) + // lapack::Gflop::unmqr(slate::Side::Left, m+n, n, n) + - // lapack::Gflop::gemm(m, n, n) ); + // blas::Gflop::gemm(m, n, n) ); // when take advantage of the matrix structure, the flops count will be reduced: - double gflop_itqr = itqr * ( 2 * m * std::pow(n, 2) + - 2 * m * std::pow(n, 2) + - lapack::Gflop::gemm(m, n, n) ); + double gflop_itqr = itqr * ( 2. * double(m*n*n) + + 2. * double(m*n*n) + + blas::Gflop::gemm(m, n, n) ); - double gflop_itpo = itpo * ( lapack::Gflop::gemm(m, n, n) + + double gflop_itpo = itpo * ( blas::Gflop::gemm(m, n, n) + lapack::Gflop::potrf(m) + blas::Gflop::trsm(slate::Side::Left, m, n) ); From 531ca62ea2f251d0b762a6ad073e1af13cb38952 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Tue, 7 Feb 2023 16:01:07 -0500 Subject: [PATCH 46/49] Added geqrf_compute_first_indices to internal_util.hh and removed it from geqrf and geqrf_qdwh_full --- src/geqrf.cc | 44 +++-------------------------------- src/geqrf_qdwh_full.cc | 44 +++-------------------------------- src/internal/internal_util.hh | 40 +++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 82 deletions(-) diff --git a/src/geqrf.cc b/src/geqrf.cc index 399202201..4fe6328c8 100644 --- a/src/geqrf.cc +++ b/src/geqrf.cc @@ -7,50 +7,12 @@ #include "auxiliary/Debug.hh" #include "slate/Matrix.hh" #include "internal/internal.hh" +#include "internal/internal_util.hh" namespace slate { namespace impl { -//------------------------------------------------------------------------------ -/// An auxiliary routine to find each rank's first (top-most) row -/// in panel k. -/// -/// @param[in] A_panel -/// Current panel, which is a sub of the input matrix $A$. -/// -/// @param[in] k -/// Index of the current panel in the input matrix $A$. -/// -/// @param[out] first_indices -/// The array of computed indices. -/// -/// @ingroup geqrf_impl -/// -template -void geqrf_compute_first_indices( - Matrix& A_panel, int64_t k, - std::vector< int64_t >& first_indices ) -{ - // Find ranks in this column. - std::set ranks_set; - A_panel.getRanks(&ranks_set); - assert(ranks_set.size() > 0); - - // Find each rank's first (top-most) row in this panel, - // where the triangular tile resulting from local geqrf panel - // will reside. - first_indices.reserve(ranks_set.size()); - for (int r: ranks_set) { - for (int64_t i = 0; i < A_panel.mt(); ++i) { - if (A_panel.tileRank(i, 0) == r) { - first_indices.push_back(i+k); - break; - } - } - } -} - //------------------------------------------------------------------------------ /// Distributed parallel QR factorization. /// Generic implementation for any target. @@ -185,7 +147,7 @@ void geqrf( auto Tr_panel = Treduce.sub(k, A_mt-1, k, k); std::vector< int64_t > first_indices; - geqrf_compute_first_indices(A_panel, k, first_indices); + internal::geqrf_compute_first_indices(A_panel, k, first_indices); // todo: pass first_indices into internal geqrf or ttqrt? // panel, high priority @@ -328,7 +290,7 @@ void geqrf( auto A_panel_k_la = A.sub(k_la, A_mt-1, k_la, k_la); std::vector< int64_t > first_indices_k_la; - geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la); + internal::geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la); if (first_indices.size() > 0) { for (int64_t row : first_indices_k_la) { if (Tlocal.tileIsLocal(row, k_la)) { diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc index d1c4141c0..1dafed1a7 100644 --- a/src/geqrf_qdwh_full.cc +++ b/src/geqrf_qdwh_full.cc @@ -7,50 +7,12 @@ #include "auxiliary/Debug.hh" #include "slate/Matrix.hh" #include "internal/internal.hh" +#include "internal/internal_util.hh" namespace slate { namespace impl { -//------------------------------------------------------------------------------ -/// An auxiliary routine to find each rank's first (top-most) row -/// in panel k. -/// -/// @param[in] A_panel -/// Current panel, which is a sub of the input matrix $A$. -/// -/// @param[in] k -/// Index of the current panel in the input matrix $A$. -/// -/// @param[out] first_indices -/// The array of computed indices. -/// -/// @ingroup geqrf_qdwh_full_specialization -/// -template -void geqrf_compute_first_indices( - Matrix& A_panel, int64_t k, - std::vector< int64_t >& first_indices) -{ - // Find ranks in this column. - std::set ranks_set; - A_panel.getRanks(&ranks_set); - assert(ranks_set.size() > 0); - - // Find each rank's first (top-most) row in this panel, - // where the triangular tile resulting from local geqrf panel - // will reside. - first_indices.reserve(ranks_set.size()); - for (int r: ranks_set) { - for (int64_t i = 0; i < A_panel.mt(); ++i) { - if (A_panel.tileRank(i, 0) == r) { - first_indices.push_back(i+k); - break; - } - } - } -} - //------------------------------------------------------------------------------ /// Distributed parallel QR factorization. /// Generic implementation for any target. @@ -189,7 +151,7 @@ void geqrf_qdwh_full( auto Tr_panel = Treduce.sub(k, i_end, k, k); std::vector< int64_t > first_indices; - geqrf_compute_first_indices(A_panel, k, first_indices); + internal::geqrf_compute_first_indices(A_panel, k, first_indices); // todo: pass first_indices into internal geqrf or ttqrt? // panel, high priority @@ -334,7 +296,7 @@ void geqrf_qdwh_full( auto A_panel_k_la = A.sub(k_la, i_end_la, k_la, k_la); std::vector< int64_t > first_indices_k_la; - geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la); + internal::geqrf_compute_first_indices(A_panel_k_la, k_la, first_indices_k_la); if (first_indices.size() > 0) { for (int64_t row : first_indices_k_la) { if (Tlocal.tileIsLocal(row, k_la)) { diff --git a/src/internal/internal_util.hh b/src/internal/internal_util.hh index e7b8241d5..ba2671dd3 100644 --- a/src/internal/internal_util.hh +++ b/src/internal/internal_util.hh @@ -10,6 +10,7 @@ #define SLATE_INTERNAL_UTIL_HH #include "slate/internal/mpi.hh" +#include "slate/Matrix.hh" #include #include @@ -69,6 +70,45 @@ inline bool compareSecond( return a.second < b.second; } +//------------------------------------------------------------------------------ +/// An auxiliary routine to find each rank's first (top-most) row +/// in panel k. +/// +/// @param[in] A_panel +/// Current panel, which is a sub of the input matrix $A$. +/// +/// @param[in] k +/// Index of the current panel in the input matrix $A$. +/// +/// @param[out] first_indices +/// The array of computed indices. +/// +/// @ingroup geqrf_impl +/// +template +void geqrf_compute_first_indices( + Matrix& A_panel, int64_t k, + std::vector< int64_t >& first_indices ) +{ + // Find ranks in this column. + std::set ranks_set; + A_panel.getRanks(&ranks_set); + assert(ranks_set.size() > 0); + + // Find each rank's first (top-most) row in this panel, + // where the triangular tile resulting from local geqrf panel + // will reside. + first_indices.reserve(ranks_set.size()); + for (int r: ranks_set) { + for (int64_t i = 0; i < A_panel.mt(); ++i) { + if (A_panel.tileRank(i, 0) == r) { + first_indices.push_back(i+k); + break; + } + } + } +} + } // namespace internal } // namespace slate From 8ce87db66fb78b65b9187ceb951d00d0a32736ab Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Thu, 9 Feb 2023 09:58:53 -0500 Subject: [PATCH 47/49] fix flops count --- test/test_qdwh.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index 7cdb460f9..f12f7fd44 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -162,8 +162,8 @@ void test_qdwh_work(Params& params, bool run) // lapack::Gflop::unmqr(slate::Side::Left, m+n, n, n) + // blas::Gflop::gemm(m, n, n) ); // when take advantage of the matrix structure, the flops count will be reduced: - double gflop_itqr = itqr * ( 2. * double(m*n*n) + - 2. * double(m*n*n) + + double gflop_itqr = itqr * ( 1e-9 * 2. * double(m*n*n) + + 1e-9 * 2. * double(m*n*n) + blas::Gflop::gemm(m, n, n) ); double gflop_itpo = itpo * ( blas::Gflop::gemm(m, n, n) + From 34e0afa0071075d7b3d280faa2f3ca83db85f9a7 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Fri, 31 Mar 2023 12:43:42 -0400 Subject: [PATCH 48/49] Delete un needed matrix allocation --- test/test_qdwh.cc | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/test/test_qdwh.cc b/test/test_qdwh.cc index f12f7fd44..7e8d3a214 100644 --- a/test/test_qdwh.cc +++ b/test/test_qdwh.cc @@ -81,11 +81,6 @@ void test_qdwh_work(Params& params, bool run) int64_t lldH = blas::max(1, mlocH); // local leading dimension of H std::vector H_data; - // matrix QR, for checking result - slate::Matrix Id; - Id = slate::Matrix(n, n, nb, p, q, MPI_COMM_WORLD); - Id.insertLocalTiles(); - slate::Matrix A; slate::Matrix H; //slate::HermitianMatrix H; @@ -185,20 +180,6 @@ void test_qdwh_work(Params& params, bool run) } if (check) { - //================================================== - // Test results by checking the orthogonality of U factor - // - // || A'A - I ||_f - // ---------------- < tol * epsilon - // n - // - //================================================== - auto AT = conj_transpose(A); - set(zero, one, Id); - slate::gemm(one, AT, A, -one, Id, opts); - real_t orth = slate::norm(slate::Norm::Fro, Id); - params.ortho() = orth / sqrt(n); - //================================================== // Test results by checking backwards error // @@ -215,6 +196,20 @@ void test_qdwh_work(Params& params, bool run) real_t berr = slate::norm(slate::Norm::Fro, Aref, opts); params.error() = berr / normA; + //================================================== + // Test results by checking the orthogonality of U factor + // + // || A'A - I ||_f + // ---------------- < tol * epsilon + // n + // + //================================================== + auto AT = conj_transpose(A); + set(zero, one, Aref); + slate::gemm(one, AT, A, -one, Aref, opts); + real_t orth = slate::norm(slate::Norm::Fro, Aref); + params.ortho() = orth / sqrt(n); + real_t tol = params.tol() * 0.5 * std::numeric_limits::epsilon(); params.okay() = ((params.error() <= tol) && (params.ortho() <= tol)); From d25e4198d7cb7e8122cfa49844d659986aa48800 Mon Sep 17 00:00:00 2001 From: Dalal Sukkari Date: Mon, 10 Apr 2023 10:00:48 -0400 Subject: [PATCH 49/49] Changed device_malloc and used gemmC in norm2est (for now) --- src/geqrf_qdwh_full.cc | 14 ++++++-------- src/norm2est.cc | 6 +++--- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/geqrf_qdwh_full.cc b/src/geqrf_qdwh_full.cc index 1dafed1a7..c341104a0 100644 --- a/src/geqrf_qdwh_full.cc +++ b/src/geqrf_qdwh_full.cc @@ -108,9 +108,7 @@ void geqrf_qdwh_full( if (panel_device >= 0) { - blas::set_device( panel_device ); - - lapack::Queue* queue = A.compute_queue(panel_device, 0); + lapack::Queue* comm_queue = A.comm_queue(panel_device); int64_t nb = A.tileNb(0); size_t size_tau = (size_t) std::min( mlocal, nb ); @@ -119,15 +117,15 @@ void geqrf_qdwh_full( // Find size of the workspace needed lapack::geqrf_work_size_bytes( mlocal, nb, dwork_array[0], mlocal, - &dsize, &hsize, *queue ); + &dsize, &hsize, *comm_queue ); // Size of dA, dtau, dwork and dinfo work_size = size_A + size_tau + ceildiv(dsize, sizeof(scalar_t)) + ceildiv(sizeof(device_info_t), sizeof(scalar_t)); for (int64_t dev = 0; dev < num_devices; ++dev) { - blas::set_device(dev); - dwork_array[dev] = blas::device_malloc(work_size); + lapack::Queue* queue = A.comm_queue( dev ); + dwork_array[dev] = blas::device_malloc(work_size, *queue); } } } @@ -325,8 +323,8 @@ void geqrf_qdwh_full( if (target == Target::Devices) { for (int64_t dev = 0; dev < num_devices; ++dev) { - blas::set_device(dev); - blas::device_free( dwork_array[dev] ); + blas::Queue* queue = A.comm_queue( dev ); + blas::device_free( dwork_array[dev], *queue ); dwork_array[dev] = nullptr; } } diff --git a/src/norm2est.cc b/src/norm2est.cc index 9d6cca8c7..f7b253ac2 100644 --- a/src/norm2est.cc +++ b/src/norm2est.cc @@ -106,7 +106,7 @@ norm2est( scale(r_one, normX, XL, opts); // Compute Ax = A * sx - gemm(one, A, XL, zero, AX, opts); + gemmC(one, A, XL, zero, AX, opts); // todo: still need to add the following //if nnz(Sx) == 0 @@ -114,11 +114,11 @@ norm2est( //end // Compute x = A' * A * x = A' * Ax - auto AT = conjTranspose(A); + auto AT = conj_transpose(A); // todo: why this set is needed when using multiple mpi rank and using gemmA set(zero, zero, XL); - gemm(one, AT, AX, zero, XL, opts); + gemmC(one, AT, AX, zero, XL, opts); // Compute ||X||, ||AX|| normX = norm(Norm::Fro, XL, opts);