`dgejsv` — Jacobi SVD of a real M-by-N matrix (LAPACK DGEJSV port)

void dgejsv(
  1. String JOBA,
  2. String JOBU,
  3. String JOBV,
  4. String JOBR,
  5. String JOBT,
  6. String JOBP,
  7. int M,
  8. int N,
  9. Matrix<double> A_,
  10. int LDA,
  11. Array<double> SVA_,
  12. Matrix<double> U_,
  13. int LDU,
  14. Matrix<double> V_,
  15. int LDV,
  16. Array<double> WORK_,
  17. int LWORK,
  18. Array<int> IWORK_,
  19. Box<int> INFO,
)

Implementation

void dgejsv(
  final String JOBA,
  final String JOBU,
  final String JOBV,
  final String JOBR,
  final String JOBT,
  final String JOBP,
  final int M,
  final int N,
  final Matrix<double> A_,
  final int LDA,
  final Array<double> SVA_,
  final Matrix<double> U_,
  final int LDU,
  final Matrix<double> V_,
  final int LDV,
  final Array<double> WORK_,
  final int LWORK,
  final Array<int> IWORK_,
  final Box<int> INFO,
) {
  final A = A_.having(ld: LDA);
  final SVA = SVA_.having();
  final U = U_.having(ld: LDU);
  final V = V_.having(ld: LDV);
  final WORK = WORK_.having();
  final IWORK = IWORK_.having();
  const ZERO = 0.0, ONE = 1.0;
  double AATMAX,
      AATMIN,
      BIG,
      BIG1,
      COND_OK,
      CONDR1,
      CONDR2,
      ENTRA,
      ENTRAT,
      EPSLN,
      MAXPRJ,
      SCALEM,
      SCONDA,
      SFMIN,
      SMALL,
      USCAL1,
      USCAL2;
  int N1 = 0, NR, NUMRANK, p, q, WARNING;
  bool ALMORT,
      DEFR,
      ERREST,
      GOSCAL,
      JRACC,
      KILL,
      LSVEC,
      L2ABER,
      L2KILL,
      L2PERT,
      L2RANK,
      L2TRAN,
      NOSCAL,
      ROWPIV,
      RSVEC,
      TRANSP;
  final AAPP = Box(0.0), AAQQ = Box(0.0), XSC = Box(0.0), TEMP1 = Box(0.0);
  final IERR = Box(0);

  // Test the input arguments

  LSVEC = lsame(JOBU, 'U') || lsame(JOBU, 'F');
  JRACC = lsame(JOBV, 'J');
  RSVEC = lsame(JOBV, 'V') || JRACC;
  ROWPIV = lsame(JOBA, 'F') || lsame(JOBA, 'G');
  L2RANK = lsame(JOBA, 'R');
  L2ABER = lsame(JOBA, 'A');
  ERREST = lsame(JOBA, 'E') || lsame(JOBA, 'G');
  L2TRAN = lsame(JOBT, 'T');
  L2KILL = lsame(JOBR, 'R');
  DEFR = lsame(JOBR, 'N');
  L2PERT = lsame(JOBP, 'P');

  if (!(ROWPIV || L2RANK || L2ABER || ERREST || lsame(JOBA, 'C'))) {
    INFO.value = -1;
  } else if (!(LSVEC || lsame(JOBU, 'N') || lsame(JOBU, 'W'))) {
    INFO.value = -2;
  } else if (!(RSVEC || lsame(JOBV, 'N') || lsame(JOBV, 'W')) ||
      (JRACC && !LSVEC)) {
    INFO.value = -3;
  } else if (!(L2KILL || DEFR)) {
    INFO.value = -4;
  } else if (!(L2TRAN || lsame(JOBT, 'N'))) {
    INFO.value = -5;
  } else if (!(L2PERT || lsame(JOBP, 'N'))) {
    INFO.value = -6;
  } else if (M < 0) {
    INFO.value = -7;
  } else if ((N < 0) || (N > M)) {
    INFO.value = -8;
  } else if (LDA < M) {
    INFO.value = -10;
  } else if (LSVEC && (LDU < M)) {
    INFO.value = -13;
  } else if (RSVEC && (LDV < N)) {
    INFO.value = -15;
  } else if ((!(LSVEC || RSVEC || ERREST) &&
          (LWORK < max(7, max(4 * N + 1, 2 * M + N)))) ||
      (!(LSVEC || RSVEC) &&
          ERREST &&
          (LWORK < max(7, max(4 * N + N * N, 2 * M + N)))) ||
      (LSVEC && !RSVEC && (LWORK < max(7, max(2 * M + N, 4 * N + 1)))) ||
      (RSVEC && !LSVEC && (LWORK < max(7, max(2 * M + N, 4 * N + 1)))) ||
      (LSVEC &&
          RSVEC &&
          !JRACC &&
          (LWORK < max(2 * M + N, 6 * N + 2 * N * N))) ||
      (LSVEC &&
          RSVEC &&
          JRACC &&
          LWORK < max(2 * M + N, max(4 * N + N * N, 2 * N + N * N + 6)))) {
    INFO.value = -17;
  } else {
    INFO.value = 0;
  }

  if (INFO.value != 0) {
    xerbla('DGEJSV', -INFO.value);
    return;
  }

  // Quick return for void matrix (Y3K safe)
  if ((M == 0) || (N == 0)) {
    for (var i = 1; i <= 3; i++) {
      IWORK[i] = 0;
    }
    for (var i = 1; i <= 7; i++) {
      WORK[i] = 0;
    }
    return;
  }

  // Determine whether the matrix U should be M x N or M x M

  if (LSVEC) {
    N1 = N;
    if (lsame(JOBU, 'F')) N1 = M;
  }

  // Set numerical parameters

  // NOTE: Make sure dlamch() does not fail on the target architecture.

  EPSLN = dlamch('Epsilon');
  SFMIN = dlamch('SafeMinimum');
  SMALL = SFMIN / EPSLN;
  BIG = dlamch('O');
  // BIG   = ONE / SFMIN

  // Initialize SVA[1:N] = diag( ||A e_i||_2 )_1^N

  // If necessary, scale SVA() to protect the largest norm from
  // overflow. It is possible that this scaling pushes the smallest
  // column norm left from the underflow threshold (extreme case).

  SCALEM = ONE / sqrt(M * N);
  NOSCAL = true;
  GOSCAL = true;
  for (p = 1; p <= N; p++) {
    AAPP.value = ZERO;
    AAQQ.value = ONE;
    dlassq(M, A(1, p).asArray(), 1, AAPP, AAQQ);
    if (AAPP.value > BIG) {
      INFO.value = -9;
      xerbla('DGEJSV', -INFO.value);
      return;
    }
    AAQQ.value = sqrt(AAQQ.value);
    if ((AAPP.value < (BIG / AAQQ.value)) && NOSCAL) {
      SVA[p] = AAPP.value * AAQQ.value;
    } else {
      NOSCAL = false;
      SVA[p] = AAPP.value * (AAQQ.value * SCALEM);
      if (GOSCAL) {
        GOSCAL = false;
        dscal(p - 1, SCALEM, SVA, 1);
      }
    }
  }

  if (NOSCAL) SCALEM = ONE;

  AAPP.value = ZERO;
  AAQQ.value = BIG;
  for (p = 1; p <= N; p++) {
    AAPP.value = max(AAPP.value, SVA[p]);
    if (SVA[p] != ZERO) AAQQ.value = min(AAQQ.value, SVA[p]);
  }

  // Quick return for zero M x N matrix
  if (AAPP.value == ZERO) {
    if (LSVEC) dlaset('G', M, N1, ZERO, ONE, U, LDU);
    if (RSVEC) dlaset('G', N, N, ZERO, ONE, V, LDV);
    WORK[1] = ONE;
    WORK[2] = ONE;
    if (ERREST) WORK[3] = ONE;
    if (LSVEC && RSVEC) {
      WORK[4] = ONE;
      WORK[5] = ONE;
    }
    if (L2TRAN) {
      WORK[6] = ZERO;
      WORK[7] = ZERO;
    }
    IWORK[1] = 0;
    IWORK[2] = 0;
    IWORK[3] = 0;
    return;
  }

  // Issue warning if denormalized column norms detected. Override the
  // high relative accuracy request. Issue licence to kill columns
  // (set them to zero) whose norm is less than sigma_max / BIG (roughly).
  WARNING = 0;
  if (AAQQ.value <= SFMIN) {
    L2RANK = true;
    L2KILL = true;
    WARNING = 1;
  }

  // Quick return for one-column matrix
  if (N == 1) {
    if (LSVEC) {
      dlascl('G', 0, 0, SVA[1], SCALEM, M, 1, A(1, 1), LDA, IERR);
      dlacpy('A', M, 1, A, LDA, U, LDU);
      // computing all M left singular vectors of the M x 1 matrix
      if (N1 != N) {
        dgeqrf(M, N, U, LDU, WORK, WORK(N + 1), LWORK - N, IERR);
        dorgqr(M, N1, 1, U, LDU, WORK, WORK(N + 1), LWORK - N, IERR);
        dcopy(M, A(1, 1).asArray(), 1, U(1, 1).asArray(), 1);
      }
    }
    if (RSVEC) {
      V[1][1] = ONE;
    }
    if (SVA[1] < (BIG * SCALEM)) {
      SVA[1] /= SCALEM;
      SCALEM = ONE;
    }
    WORK[1] = ONE / SCALEM;
    WORK[2] = ONE;
    if (SVA[1] != ZERO) {
      IWORK[1] = 1;
      if ((SVA[1] / SCALEM) >= SFMIN) {
        IWORK[2] = 1;
      } else {
        IWORK[2] = 0;
      }
    } else {
      IWORK[1] = 0;
      IWORK[2] = 0;
    }
    IWORK[3] = 0;
    if (ERREST) WORK[3] = ONE;
    if (LSVEC && RSVEC) {
      WORK[4] = ONE;
      WORK[5] = ONE;
    }
    if (L2TRAN) {
      WORK[6] = ZERO;
      WORK[7] = ZERO;
    }
    return;
  }

  TRANSP = false;
  L2TRAN = L2TRAN && (M == N);

  AATMAX = -ONE;
  AATMIN = BIG;
  if (ROWPIV || L2TRAN) {
    // Compute the row norms, needed to determine row pivoting sequence
    // (in the case of heavily row weighted A, row pivoting is strongly
    // advised) and to collect information needed to compare the
    // structures of A * A^t and A^t * A (in the case L2TRAN == true ).

    if (L2TRAN) {
      for (p = 1; p <= M; p++) {
        XSC.value = ZERO;
        TEMP1.value = ONE;
        dlassq(N, A(p, 1).asArray(), LDA, XSC, TEMP1);
        // DLASSQ gets both the ell_2 and the ell_infinity norm
        // in one pass through the vector
        WORK[M + N + p] = XSC.value * SCALEM;
        WORK[N + p] = XSC.value * (SCALEM * sqrt(TEMP1.value));
        AATMAX = max(AATMAX, WORK[N + p]);
        if (WORK[N + p] != ZERO) AATMIN = min(AATMIN, WORK[N + p]);
      }
    } else {
      for (p = 1; p <= M; p++) {
        WORK[M + N + p] =
            SCALEM * A[p][idamax(N, A(p, 1).asArray(), LDA)].abs();
        AATMAX = max(AATMAX, WORK[M + N + p]);
        AATMIN = min(AATMIN, WORK[M + N + p]);
      }
    }
  }

  // For square matrix A try to determine whether A^t  would be  better
  // input for the preconditioned Jacobi SVD, with faster convergence.
  // The decision is based on an O(N) function of the vector of column
  // and row norms of A, based on the Shannon entropy. This should give
  // the right choice in most cases when the difference actually matters.
  // It may fail and pick the slower converging side.

  ENTRA = ZERO;
  ENTRAT = ZERO;
  if (L2TRAN) {
    XSC.value = ZERO;
    TEMP1.value = ONE;
    dlassq(N, SVA, 1, XSC, TEMP1);
    TEMP1.value = ONE / TEMP1.value;

    ENTRA = ZERO;
    for (p = 1; p <= N; p++) {
      BIG1 = (pow((SVA[p] / XSC.value), 2)) * TEMP1.value;
      if (BIG1 != ZERO) ENTRA += BIG1 * log(BIG1);
    }
    ENTRA = -ENTRA / log(N);

    // Now, SVA().^2/Trace(A^t * A) is a point in the probability simplex.
    // It is derived from the diagonal of  A^t * A.  Do the same with the
    // diagonal of A * A^t, compute the entropy of the corresponding
    // probability distribution. Note that A * A^t and A^t * A have the
    // same trace.

    ENTRAT = ZERO;
    for (p = N + 1; p <= N + M; p++) {
      BIG1 = (pow((WORK[p] / XSC.value), 2)) * TEMP1.value;
      if (BIG1 != ZERO) ENTRAT += BIG1 * log(BIG1);
    }
    ENTRAT = -ENTRAT / log(M);

    // Analyze the entropies and decide A or A^t. Smaller entropy
    // usually means better input for the algorithm.

    TRANSP = (ENTRAT < ENTRA);

    // If A^t is better than A, transpose A.

    if (TRANSP) {
      // In an optimal implementation, this trivial transpose
      // should be replaced with faster transpose.
      for (p = 1; p <= N - 1; p++) {
        for (q = p + 1; q <= N; q++) {
          TEMP1.value = A[q][p];
          A[q][p] = A[p][q];
          A[p][q] = TEMP1.value;
        }
      }
      for (p = 1; p <= N; p++) {
        WORK[M + N + p] = SVA[p];
        SVA[p] = WORK[N + p];
      }
      TEMP1.value = AAPP.value;
      AAPP.value = AATMAX;
      AATMAX = TEMP1.value;
      TEMP1.value = AAQQ.value;
      AAQQ.value = AATMIN;
      AATMIN = TEMP1.value;
      KILL = LSVEC;
      LSVEC = RSVEC;
      RSVEC = KILL;
      if (LSVEC) N1 = N;

      ROWPIV = true;
    }
  }
  // END IF L2TRAN

  // Scale the matrix so that its maximal singular value remains less
  // than sqrt(BIG) -- the matrix is scaled so that its maximal column
  // has Euclidean norm equal to sqrt(BIG/N). The only reason to keep
  // sqrt(BIG) instead of BIG is the fact that DGEJSV uses LAPACK and
  // BLAS routines that, in some implementations, are not capable of
  // working in the full interval [SFMIN,BIG] and that they may provoke
  // overflows in the intermediate results. If the singular values spread
  // from SFMIN to BIG, then DGESVJ will compute them. So, in that case,
  // one should use DGESVJ instead of DGEJSV.

  BIG1 = sqrt(BIG);
  TEMP1.value = sqrt(BIG / N);

  dlascl('G', 0, 0, AAPP.value, TEMP1.value, N, 1, SVA.asMatrix(N), N, IERR);
  if (AAQQ.value > (AAPP.value * SFMIN)) {
    AAQQ.value = (AAQQ.value / AAPP.value) * TEMP1.value;
  } else {
    AAQQ.value = (AAQQ.value * TEMP1.value) / AAPP.value;
  }
  TEMP1.value *= SCALEM;
  dlascl('G', 0, 0, AAPP.value, TEMP1.value, M, N, A, LDA, IERR);

  // To undo scaling at the end of this procedure, multiply the
  // computed singular values with USCAL2 / USCAL1.

  USCAL1 = TEMP1.value;
  USCAL2 = AAPP.value;

  if (L2KILL) {
    // L2KILL enforces computation of nonzero singular values in
    // the restricted range of condition number of the initial A,
    // sigma_max(A) / sigma_min(A) approx. sqrt(BIG)/sqrt(SFMIN).
    XSC.value = sqrt(SFMIN);
  } else {
    XSC.value = SMALL;

    // Now, if the condition number of A is too big,
    // sigma_max(A) / sigma_min(A) > sqrt(BIG/N) * EPSLN / SFMIN,
    // as a precaution measure, the full SVD is computed using DGESVJ
    // with accumulated Jacobi rotations. This provides numerically
    // more robust computation, at the cost of slightly increased run
    // time. Depending on the concrete implementation of BLAS and LAPACK
    // (i.e. how they behave in presence of extreme ill-conditioning) the
    // implementor may decide to remove this switch.
    if ((AAQQ.value < sqrt(SFMIN)) && LSVEC && RSVEC) {
      JRACC = true;
    }
  }
  if (AAQQ.value < XSC.value) {
    for (p = 1; p <= N; p++) {
      if (SVA[p] < XSC.value) {
        dlaset('A', M, 1, ZERO, ZERO, A(1, p), LDA);
        SVA[p] = ZERO;
      }
    }
  }

  // Preconditioning using QR factorization with pivoting

  if (ROWPIV) {
    // Optional row permutation (Bjoerck row pivoting):
    // A result by Cox and Higham shows that the Bjoerck's
    // row pivoting combined with standard column pivoting
    // has similar effect as Powell-Reid complete pivoting.
    // The ell-infinity norms of A are made nonincreasing.
    for (p = 1; p <= M - 1; p++) {
      q = idamax(M - p + 1, WORK(M + N + p), 1) + p - 1;
      IWORK[2 * N + p] = q;
      if (p != q) {
        TEMP1.value = WORK[M + N + p];
        WORK[M + N + p] = WORK[M + N + q];
        WORK[M + N + q] = TEMP1.value;
      }
    }
    dlaswp(N, A, LDA, 1, M - 1, IWORK(2 * N + 1), 1);
  }

  // End of the preparation phase (scaling, optional sorting and
  // transposing, optional flushing of small columns).

  // Preconditioning

  // If the full SVD is needed, the right singular vectors are computed
  // from a matrix equation, and for that we need theoretical analysis
  // of the Businger-Golub pivoting. So we use DGEQP3 as the first RR QRF.
  // In all other cases the first RR QRF can be chosen by other criteria
  // (eg speed by replacing global with restricted window pivoting, such
  // as in SGEQPX from TOMS # 782). Good results will be obtained using
  // SGEQPX with properly (!) chosen numerical parameters.
  // Any improvement of DGEQP3 improves overall performance of DGEJSV.

  // A * P1 = Q1 * [ R1^t 0]^t:
  for (p = 1; p <= N; p++) {
    // all columns are free columns
    IWORK[p] = 0;
  }
  dgeqp3(M, N, A, LDA, IWORK, WORK, WORK(N + 1), LWORK - N, IERR);

  // The upper triangular matrix R1 from the first QRF is inspected for
  // rank deficiency and possibilities for deflation, or possible
  // ill-conditioning. Depending on the user specified flag L2RANK,
  // the procedure explores possibilities to reduce the numerical
  // rank by inspecting the computed upper triangular factor. If
  // L2RANK or L2ABER are up, then DGEJSV will compute the SVD of
  // A + dA, where ||dA|| <= f(M,N)*EPSLN.

  NR = 1;
  if (L2ABER) {
    // Standard absolute error bound suffices. All sigma_i with
    // sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
    // aggressive enforcement of lower numerical rank by introducing a
    // backward error of the order of N*EPSLN*||A||.
    TEMP1.value = sqrt(N) * EPSLN;
    for (p = 2; p <= N; p++) {
      if (A[p][p].abs() >= (TEMP1.value * A[1][1].abs())) {
        NR++;
      } else {
        break;
      }
    }
  } else if (L2RANK) {
    // similarly as above, only slightly more gentle (less aggressive).
    // Sudden drop on the diagonal of R1 is used as the criterion for
    // close-to-rank-deficient.
    TEMP1.value = sqrt(SFMIN);
    for (p = 2; p <= N; p++) {
      if ((A[p][p].abs() < (EPSLN * A[p - 1][p - 1].abs())) ||
          (A[p][p].abs() < SMALL) ||
          (L2KILL && (A[p][p].abs() < TEMP1.value))) break;
      NR++;
    }
  } else {
    // The goal is high relative accuracy. However, if the matrix
    // has high scaled condition number the relative accuracy is in
    // general not feasible. Later on, a condition number estimator
    // will be deployed to estimate the scaled condition number.
    // Here we just remove the underflowed part of the triangular
    // factor. This prevents the situation in which the code is
    // working hard to get the accuracy not warranted by the data.
    TEMP1.value = sqrt(SFMIN);
    for (p = 2; p <= N; p++) {
      if ((A[p][p].abs() < SMALL) ||
          (L2KILL && (A[p][p].abs() < TEMP1.value))) {
        break;
      }
      NR++;
    }
  }

  ALMORT = false;
  if (NR == N) {
    MAXPRJ = ONE;
    for (p = 2; p <= N; p++) {
      TEMP1.value = A[p][p].abs() / SVA[IWORK[p]];
      MAXPRJ = min(MAXPRJ, TEMP1.value);
    }
    if (pow(MAXPRJ, 2) >= ONE - N * EPSLN) ALMORT = true;
  }

  SCONDA = -ONE;
  CONDR1 = -ONE;
  CONDR2 = -ONE;

  if (ERREST) {
    if (N == NR) {
      if (RSVEC) {
        // V is available as workspace
        dlacpy('U', N, N, A, LDA, V, LDV);
        for (p = 1; p <= N; p++) {
          TEMP1.value = SVA[IWORK[p]];
          dscal(p, ONE / TEMP1.value, V(1, p).asArray(), 1);
        }
        dpocon('U', N, V, LDV, ONE, TEMP1, WORK(N + 1), IWORK(2 * N + M + 1),
            IERR);
      } else if (LSVEC) {
        // U is available as workspace
        dlacpy('U', N, N, A, LDA, U, LDU);
        for (p = 1; p <= N; p++) {
          TEMP1.value = SVA[IWORK[p]];
          dscal(p, ONE / TEMP1.value, U(1, p).asArray(), 1);
        }
        dpocon('U', N, U, LDU, ONE, TEMP1, WORK(N + 1), IWORK(2 * N + M + 1),
            IERR);
      } else {
        dlacpy('U', N, N, A, LDA, WORK(N + 1).asMatrix(N), N);
        for (p = 1; p <= N; p++) {
          TEMP1.value = SVA[IWORK[p]];
          dscal(p, ONE / TEMP1.value, WORK(N + (p - 1) * N + 1), 1);
        }
        // the columns of R are scaled to have unit Euclidean lengths.
        dpocon('U', N, WORK(N + 1).asMatrix(N), N, ONE, TEMP1,
            WORK(N + N * N + 1), IWORK(2 * N + M + 1), IERR);
      }
      SCONDA = ONE / sqrt(TEMP1.value);
      // SCONDA is an estimate of sqrt(||(R^t * R)^(-1)||_1).
      // N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
    } else {
      SCONDA = -ONE;
    }
  }

  L2PERT = L2PERT && ((A[1][1] / A[NR][NR]).abs() > sqrt(BIG1));
  // If there is no violent scaling, artificial perturbation is not needed.

  // Phase 3:

  if (!(RSVEC || LSVEC)) {
    // Singular Values only

    // transpose A[1:NR][1:N]
    for (p = 1; p <= min(N - 1, NR); p++) {
      dcopy(N - p, A(p, p + 1).asArray(), LDA, A(p + 1, p).asArray(), 1);
    }

    // The following two DO-loops introduce small relative perturbation
    // into the strict upper triangle of the lower triangular matrix.
    // Small entries below the main diagonal are also changed.
    // This modification is useful if the computing environment does not
    // provide/allow FLUSH TO ZERO underflow, for it prevents many
    // annoying denormalized numbers in case of strongly scaled matrices.
    // The perturbation is structured so that it does not introduce any
    // new perturbation of the singular values, and it does not destroy
    // the job done by the preconditioner.
    // The licence for this perturbation is in the variable L2PERT, which
    // should be false if FLUSH TO ZERO underflow is active.

    if (!ALMORT) {
      if (L2PERT) {
        // XSC = sqrt(SMALL)
        XSC.value = EPSLN / N;
        for (q = 1; q <= NR; q++) {
          TEMP1.value = XSC.value * A[q][q].abs();
          for (p = 1; p <= N; p++) {
            if (((p > q) && (A[p][q].abs() <= TEMP1.value)) || (p < q)) {
              A[p][q] = sign(TEMP1.value, A[p][q]);
            }
          }
        }
      } else {
        dlaset('U', NR - 1, NR - 1, ZERO, ZERO, A(1, 2), LDA);
      }

      // second preconditioning using the QR factorization

      dgeqrf(N, NR, A, LDA, WORK, WORK(N + 1), LWORK - N, IERR);

      // and transpose upper to lower triangular
      for (p = 1; p <= NR - 1; p++) {
        dcopy(NR - p, A(p, p + 1).asArray(), LDA, A(p + 1, p).asArray(), 1);
      }
    }

    // Row-cyclic Jacobi SVD algorithm with column pivoting

    // again some perturbation (a "background noise") is added
    // to drown denormals
    if (L2PERT) {
      // XSC = sqrt(SMALL)
      XSC.value = EPSLN / N;
      for (q = 1; q <= NR; q++) {
        TEMP1.value = XSC.value * A[q][q].abs();
        for (p = 1; p <= NR; p++) {
          if (((p > q) && (A[p][q].abs() <= TEMP1.value)) || (p < q)) {
            A[p][q] = sign(TEMP1.value, A[p][q]);
          }
        }
      }
    } else {
      dlaset('U', NR - 1, NR - 1, ZERO, ZERO, A(1, 2), LDA);
    }

    // and one-sided Jacobi rotations are started on a lower
    // triangular matrix (plus perturbation which is ignored in
    // the part which destroys triangular form (confusing?!))

    dgesvj(
        'L', 'NoU', 'NoV', NR, NR, A, LDA, SVA, N, V, LDV, WORK, LWORK, INFO);

    SCALEM = WORK[1];
    NUMRANK = WORK[2].round();
  } else if (RSVEC && !LSVEC) {
    // -> Singular Values and Right Singular Vectors <-

    if (ALMORT) {
      // in this case NR equals N
      for (p = 1; p <= NR; p++) {
        dcopy(N - p + 1, A(p, p).asArray(), LDA, V(p, p).asArray(), 1);
      }
      dlaset('Upper', NR - 1, NR - 1, ZERO, ZERO, V(1, 2), LDV);

      dgesvj('L', 'U', 'N', N, NR, V, LDV, SVA, NR, A, LDA, WORK, LWORK, INFO);
      SCALEM = WORK[1];
      NUMRANK = WORK[2].round();
    } else {
      // two more QR factorizations ( one QRF is not enough, two require
      // accumulated product of Jacobi rotations, three are perfect )

      dlaset('Lower', NR - 1, NR - 1, ZERO, ZERO, A(2, 1), LDA);
      dgelqf(NR, N, A, LDA, WORK, WORK(N + 1), LWORK - N, IERR);
      dlacpy('Lower', NR, NR, A, LDA, V, LDV);
      dlaset('Upper', NR - 1, NR - 1, ZERO, ZERO, V(1, 2), LDV);
      dgeqrf(NR, NR, V, LDV, WORK(N + 1), WORK(2 * N + 1), LWORK - 2 * N, IERR);
      for (p = 1; p <= NR; p++) {
        dcopy(NR - p + 1, V(p, p).asArray(), LDV, V(p, p).asArray(), 1);
      }
      dlaset('Upper', NR - 1, NR - 1, ZERO, ZERO, V(1, 2), LDV);

      dgesvj('Lower', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, LDU, WORK(N + 1),
          LWORK, INFO);
      SCALEM = WORK[N + 1];
      NUMRANK = WORK[N + 2].round();
      if (NR < N) {
        dlaset('A', N - NR, NR, ZERO, ZERO, V(NR + 1, 1), LDV);
        dlaset('A', NR, N - NR, ZERO, ZERO, V(1, NR + 1), LDV);
        dlaset('A', N - NR, N - NR, ZERO, ONE, V(NR + 1, NR + 1), LDV);
      }

      dormlq('Left', 'Transpose', N, N, NR, A, LDA, WORK, V, LDV, WORK(N + 1),
          LWORK - N, IERR);
    }

    for (p = 1; p <= N; p++) {
      dcopy(N, V(p, 1).asArray(), LDV, A(IWORK[p], 1).asArray(), LDA);
    }
    dlacpy('All', N, N, A, LDA, V, LDV);

    if (TRANSP) {
      dlacpy('All', N, N, V, LDV, U, LDU);
    }
  } else if (LSVEC && !RSVEC) {
    // Singular Values and Left Singular Vectors                 ..

    // second preconditioning step to avoid need to accumulate
    // Jacobi rotations in the Jacobi iterations.
    for (p = 1; p <= NR; p++) {
      dcopy(N - p + 1, A(p, p).asArray(), LDA, U(p, p).asArray(), 1);
    }
    dlaset('Upper', NR - 1, NR - 1, ZERO, ZERO, U(1, 2), LDU);

    dgeqrf(N, NR, U, LDU, WORK(N + 1), WORK(2 * N + 1), LWORK - 2 * N, IERR);

    for (p = 1; p <= NR - 1; p++) {
      dcopy(NR - p, U(p, p + 1).asArray(), LDU, U(p + 1, p).asArray(), 1);
    }
    dlaset('Upper', NR - 1, NR - 1, ZERO, ZERO, U(1, 2), LDU);

    dgesvj('Lower', 'U', 'N', NR, NR, U, LDU, SVA, NR, A, LDA, WORK(N + 1),
        LWORK - N, INFO);
    SCALEM = WORK[N + 1];
    NUMRANK = WORK[N + 2].round();

    if (NR < M) {
      dlaset('A', M - NR, NR, ZERO, ZERO, U(NR + 1, 1), LDU);
      if (NR < N1) {
        dlaset('A', NR, N1 - NR, ZERO, ZERO, U(1, NR + 1), LDU);
        dlaset('A', M - NR, N1 - NR, ZERO, ONE, U(NR + 1, NR + 1), LDU);
      }
    }

    dormqr('Left', 'No Tr', M, N1, N, A, LDA, WORK, U, LDU, WORK(N + 1),
        LWORK - N, IERR);

    if (ROWPIV) dlaswp(N1, U, LDU, 1, M - 1, IWORK(2 * N + 1), -1);

    for (p = 1; p <= N1; p++) {
      XSC.value = ONE / dnrm2(M, U(1, p).asArray(), 1);
      dscal(M, XSC.value, U(1, p).asArray(), 1);
    }

    if (TRANSP) {
      dlacpy('All', N, N, U, LDU, V, LDV);
    }
  } else {
    // Full SVD ..

    if (!JRACC) {
      if (!ALMORT) {
        // Second Preconditioning Step (QRF [with pivoting])
        // Note that the composition of TRANSPOSE, QRF and TRANSPOSE is
        // equivalent to an LQF CALL. Since in many libraries the QRF
        // seems to be better optimized than the LQF, we do explicit
        // transpose and use the QRF. This is subject to changes in an
        // optimized implementation of DGEJSV.

        for (p = 1; p <= NR; p++) {
          dcopy(N - p + 1, A(p, p).asArray(), LDA, V(p, p).asArray(), 1);
        }

        // the following two loops perturb small entries to avoid
        // denormals in the second QR factorization, where they are
        // as good as zeros. This is done to avoid painfully slow
        // computation with denormals. The relative size of the perturbation
        // is a parameter that can be changed by the implementer.
        // This perturbation device will be obsolete on machines with
        // properly implemented arithmetic.
        // To switch it off, set L2PERT= false To remove it from  the
        // code, remove the action under L2PERT= true , leave the ELSE part.
        // The following two loops should be blocked and fused with the
        // transposed copy above.

        if (L2PERT) {
          XSC.value = sqrt(SMALL);
          for (q = 1; q <= NR; q++) {
            TEMP1.value = XSC.value * V[q][q].abs();
            for (p = 1; p <= N; p++) {
              if ((p > q) && (V[p][q].abs() <= TEMP1.value) || (p < q)) {
                V[p][q] = sign(TEMP1.value, V[p][q]);
              }
              if (p < q) V[p][q] = -V[p][q];
            }
          }
        } else {
          dlaset('U', NR - 1, NR - 1, ZERO, ZERO, V(1, 2), LDV);
        }

        // Estimate the row scaled condition number of R1
        // (If R1 is rectangular, N > NR, then the condition number
        // of the leading NR x NR submatrix is estimated.)

        dlacpy('L', NR, NR, V, LDV, WORK(2 * N + 1).asMatrix(NR), NR);
        for (p = 1; p <= NR; p++) {
          TEMP1.value = dnrm2(NR - p + 1, WORK(2 * N + (p - 1) * NR + p), 1);
          dscal(
              NR - p + 1, ONE / TEMP1.value, WORK(2 * N + (p - 1) * NR + p), 1);
        }
        dpocon('Lower', NR, WORK(2 * N + 1).asMatrix(NR), NR, ONE, TEMP1,
            WORK(2 * N + NR * NR + 1), IWORK(M + 2 * N + 1), IERR);
        CONDR1 = ONE / sqrt(TEMP1.value);
        // here need a second opinion on the condition number
        // then assume worst case scenario
        // R1 is OK for inverse <=> CONDR1 < N
        // more conservative    <=> CONDR1 < sqrt(N)

        COND_OK = sqrt(NR);
        // [TP] COND_OK is a tuning parameter.

        if (CONDR1 < COND_OK) {
          // the second QRF without pivoting. Note: in an optimized
          // implementation, this QRF should be implemented as the QRF
          // of a lower triangular matrix.
          // R1^t = Q2 * R2
          dgeqrf(
              N, NR, V, LDV, WORK(N + 1), WORK(2 * N + 1), LWORK - 2 * N, IERR);

          if (L2PERT) {
            XSC.value = sqrt(SMALL) / EPSLN;
            for (p = 2; p <= NR; p++) {
              for (q = 1; q <= p - 1; q++) {
                TEMP1.value = XSC.value * min(V[p][p].abs(), V[q][q].abs());
                if (V[q][p].abs() <= TEMP1.value) {
                  V[q][p] = sign(TEMP1.value, V[q][p]);
                }
              }
            }
          }

          if (NR != N) {
            dlacpy('A', N, NR, V, LDV, WORK(2 * N + 1).asMatrix(N), N);
          }
          // save ...

          // this transposed copy should be better than naive
          for (p = 1; p <= NR - 1; p++) {
            dcopy(NR - p, V(p, p + 1).asArray(), LDV, V(p + 1, p).asArray(), 1);
          }

          CONDR2 = CONDR1;
        } else {
          // ill-conditioned case: second QRF with pivoting
          // Note that windowed pivoting would be equally good
          // numerically, and more run-time efficient. So, in
          // an optimal implementation, the next call to DGEQP3
          // should be replaced with eg. CALL SGEQPX (ACM TOMS #782)
          // with properly (carefully) chosen parameters.

          // R1^t * P2 = Q2 * R2
          for (p = 1; p <= NR; p++) {
            IWORK[N + p] = 0;
          }
          dgeqp3(N, NR, V, LDV, IWORK(N + 1), WORK(N + 1), WORK(2 * N + 1),
              LWORK - 2 * N, IERR);
          // CALL DGEQRF( N, NR, V, LDV, WORK[N+1], WORK[2*N+1], LWORK-2*N, IERR )
          if (L2PERT) {
            XSC.value = sqrt(SMALL);
            for (p = 2; p <= NR; p++) {
              for (q = 1; q <= p - 1; q++) {
                TEMP1.value = XSC.value * min(V[p][p].abs(), V[q][q].abs());
                if (V[q][p].abs() <= TEMP1.value) {
                  V[q][p] = sign(TEMP1.value, V[q][p]);
                }
              }
            }
          }

          dlacpy('A', N, NR, V, LDV, WORK(2 * N + 1).asMatrix(N), N);

          if (L2PERT) {
            XSC.value = sqrt(SMALL);
            for (p = 2; p <= NR; p++) {
              for (q = 1; q <= p - 1; q++) {
                TEMP1.value = XSC.value * min(V[p][p].abs(), V[q][q].abs());
                V[p][q] = -sign(TEMP1.value, V[q][p]);
              }
            }
          } else {
            dlaset('L', NR - 1, NR - 1, ZERO, ZERO, V(2, 1), LDV);
          }
          // Now, compute R2 = L3 * Q3, the LQ factorization.
          dgelqf(NR, NR, V, LDV, WORK(2 * N + N * NR + 1),
              WORK(2 * N + N * NR + NR + 1), LWORK - 2 * N - N * NR - NR, IERR);
          // and estimate the condition number
          dlacpy('L', NR, NR, V, LDV,
              WORK(2 * N + N * NR + NR + 1).asMatrix(NR), NR);
          for (p = 1; p <= NR; p++) {
            TEMP1.value = dnrm2(p, WORK(2 * N + N * NR + NR + p), NR);
            dscal(p, ONE / TEMP1.value, WORK(2 * N + N * NR + NR + p), NR);
          }
          dpocon(
              'L',
              NR,
              WORK(2 * N + N * NR + NR + 1).asMatrix(NR),
              NR,
              ONE,
              TEMP1,
              WORK(2 * N + N * NR + NR + NR * NR + 1),
              IWORK(M + 2 * N + 1),
              IERR);
          CONDR2 = ONE / sqrt(TEMP1.value);

          if (CONDR2 >= COND_OK) {
            // save the Householder vectors used for Q3
            // (this overwrites the copy of R2, as it will not be
            // needed in this branch, but it does not overwrite the
            // Householder vectors of Q2.)
            dlacpy('U', NR, NR, V, LDV, WORK(2 * N + 1).asMatrix(N), N);
            // and the rest of the information on Q3 is in
            // WORK[2*N+N*NR+1:2*N+N*NR+N]
          }
        }

        if (L2PERT) {
          XSC.value = sqrt(SMALL);
          for (q = 2; q <= NR; q++) {
            TEMP1.value = XSC.value * V[q][q];
            for (p = 1; p <= q - 1; p++) {
              // V[p][q] = - sign( TEMP1, V[q][p] )
              V[p][q] = -sign(TEMP1.value, V[p][q]);
            }
          }
        } else {
          dlaset('U', NR - 1, NR - 1, ZERO, ZERO, V(1, 2), LDV);
        }

        // Second preconditioning finished; continue with Jacobi SVD
        // The input matrix is lower triangular.

        // Recover the right singular vectors as solution of a well
        // conditioned triangular matrix equation.

        if (CONDR1 < COND_OK) {
          dgesvj('L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, LDU,
              WORK(2 * N + N * NR + NR + 1), LWORK - 2 * N - N * NR - NR, INFO);
          SCALEM = WORK[2 * N + N * NR + NR + 1];
          NUMRANK = WORK[2 * N + N * NR + NR + 2].round();
          for (p = 1; p <= NR; p++) {
            dcopy(NR, V(1, p).asArray(), 1, U(1, p).asArray(), 1);
            dscal(NR, SVA[p], V(1, p).asArray(), 1);
          }

          // pick the right matrix equation and solve it

          if (NR == N) {
            // best case, R1 is inverted. The solution of this matrix
            // equation is Q2*V2 = the product of the Jacobi rotations
            // used in DGESVJ, premultiplied with the orthogonal matrix
            // from the second QR factorization.
            dtrsm('L', 'U', 'N', 'N', NR, NR, ONE, A, LDA, V, LDV);
          } else {
            // R1 is well conditioned, but non-square. Transpose(R2)
            // is inverted to get the product of the Jacobi rotations
            // used in DGESVJ. The Q-factor from the second QR
            // factorization is then built in explicitly.
            dtrsm('L', 'U', 'T', 'N', NR, NR, ONE, WORK(2 * N + 1).asMatrix(N),
                N, V, LDV);
            if (NR < N) {
              dlaset('A', N - NR, NR, ZERO, ZERO, V(NR + 1, 1), LDV);
              dlaset('A', NR, N - NR, ZERO, ZERO, V(1, NR + 1), LDV);
              dlaset('A', N - NR, N - NR, ZERO, ONE, V(NR + 1, NR + 1), LDV);
            }
            dormqr(
                'L',
                'N',
                N,
                N,
                NR,
                WORK(2 * N + 1).asMatrix(N),
                N,
                WORK(N + 1),
                V,
                LDV,
                WORK(2 * N + N * NR + NR + 1),
                LWORK - 2 * N - N * NR - NR,
                IERR);
          }
        } else if (CONDR2 < COND_OK) {
          // the input matrix A is very likely a relative of
          // the Kahan matrix :)
          // The matrix R2 is inverted. The solution of the matrix equation
          // is Q3^T*V3 = the product of the Jacobi rotations (applied to
          // the lower triangular L3 from the LQ factorization of
          // R2=L3*Q3), pre-multiplied with the transposed Q3.
          dgesvj('L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, LDU,
              WORK(2 * N + N * NR + NR + 1), LWORK - 2 * N - N * NR - NR, INFO);
          SCALEM = WORK[2 * N + N * NR + NR + 1];
          NUMRANK = WORK[2 * N + N * NR + NR + 2].round();
          for (p = 1; p <= NR; p++) {
            dcopy(NR, V(1, p).asArray(), 1, U(1, p).asArray(), 1);
            dscal(NR, SVA[p], U(1, p).asArray(), 1);
          }
          dtrsm('L', 'U', 'N', 'N', NR, NR, ONE, WORK(2 * N + 1).asMatrix(N), N,
              U, LDU);
          // apply the permutation from the second QR factorization
          for (q = 1; q <= NR; q++) {
            for (p = 1; p <= NR; p++) {
              WORK[2 * N + N * NR + NR + IWORK[N + p]] = U[p][q];
            }
            for (p = 1; p <= NR; p++) {
              U[p][q] = WORK[2 * N + N * NR + NR + p];
            }
          }
          if (NR < N) {
            dlaset('A', N - NR, NR, ZERO, ZERO, V(NR + 1, 1), LDV);
            dlaset('A', NR, N - NR, ZERO, ZERO, V(1, NR + 1), LDV);
            dlaset('A', N - NR, N - NR, ZERO, ONE, V(NR + 1, NR + 1), LDV);
          }
          dormqr(
              'L',
              'N',
              N,
              N,
              NR,
              WORK(2 * N + 1).asMatrix(N),
              N,
              WORK(N + 1),
              V,
              LDV,
              WORK(2 * N + N * NR + NR + 1),
              LWORK - 2 * N - N * NR - NR,
              IERR);
        } else {
          // Last line of defense.
          // This is a rather pathological case: no scaled condition
          // improvement after two pivoted QR factorizations. Other
          // possibility is that the rank revealing QR factorization
          // or the condition estimator has failed, or the COND_OK
          // is set very close to ONE (which is unnecessary). Normally,
          // this branch should never be executed, but in rare cases of
          // failure of the RRQR or condition estimator, the last line of
          // defense ensures that DGEJSV completes the task.
          // Compute the full SVD of L3 using DGESVJ with explicit
          // accumulation of Jacobi rotations.
          dgesvj('L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U, LDU,
              WORK(2 * N + N * NR + NR + 1), LWORK - 2 * N - N * NR - NR, INFO);
          SCALEM = WORK[2 * N + N * NR + NR + 1];
          NUMRANK = WORK[2 * N + N * NR + NR + 2].round();
          if (NR < N) {
            dlaset('A', N - NR, NR, ZERO, ZERO, V(NR + 1, 1), LDV);
            dlaset('A', NR, N - NR, ZERO, ZERO, V(1, NR + 1), LDV);
            dlaset('A', N - NR, N - NR, ZERO, ONE, V(NR + 1, NR + 1), LDV);
          }
          dormqr(
              'L',
              'N',
              N,
              N,
              NR,
              WORK(2 * N + 1).asMatrix(N),
              N,
              WORK(N + 1),
              V,
              LDV,
              WORK(2 * N + N * NR + NR + 1),
              LWORK - 2 * N - N * NR - NR,
              IERR);

          dormlq(
              'L',
              'T',
              NR,
              NR,
              NR,
              WORK(2 * N + 1).asMatrix(N),
              N,
              WORK(2 * N + N * NR + 1),
              U,
              LDU,
              WORK(2 * N + N * NR + NR + 1),
              LWORK - 2 * N - N * NR - NR,
              IERR);
          for (q = 1; q <= NR; q++) {
            for (p = 1; p <= NR; p++) {
              WORK[2 * N + N * NR + NR + IWORK[N + p]] = U[p][q];
            }
            for (p = 1; p <= NR; p++) {
              U[p][q] = WORK[2 * N + N * NR + NR + p];
            }
          }
        }

        // Permute the rows of V using the (column) permutation from the
        // first QRF. Also, scale the columns to make them unit in
        // Euclidean norm. This applies to all cases.

        TEMP1.value = sqrt(N) * EPSLN;
        for (q = 1; q <= N; q++) {
          for (p = 1; p <= N; p++) {
            WORK[2 * N + N * NR + NR + IWORK[p]] = V[p][q];
          }
          for (p = 1; p <= N; p++) {
            V[p][q] = WORK[2 * N + N * NR + NR + p];
          }
          XSC.value = ONE / dnrm2(N, V(1, q).asArray(), 1);
          if ((XSC.value < (ONE - TEMP1.value)) ||
              (XSC.value > (ONE + TEMP1.value))) {
            dscal(N, XSC.value, V(1, q).asArray(), 1);
          }
        }
        // At this moment, V contains the right singular vectors of A.
        // Next, assemble the left singular vector matrix U (M x N).
        if (NR < M) {
          dlaset('A', M - NR, NR, ZERO, ZERO, U(NR + 1, 1), LDU);
          if (NR < N1) {
            dlaset('A', NR, N1 - NR, ZERO, ZERO, U(1, NR + 1), LDU);
            dlaset('A', M - NR, N1 - NR, ZERO, ONE, U(NR + 1, NR + 1), LDU);
          }
        }

        // The Q matrix from the first QRF is built into the left singular
        // matrix U. This applies to all cases.

        dormqr('Left', 'No_Tr', M, N1, N, A, LDA, WORK, U, LDU, WORK(N + 1),
            LWORK - N, IERR);

        // The columns of U are normalized. The cost is O(M*N) flops.
        TEMP1.value = sqrt(M) * EPSLN;
        for (p = 1; p <= NR; p++) {
          XSC.value = ONE / dnrm2(M, U(1, p).asArray(), 1);
          if ((XSC.value < (ONE - TEMP1.value)) ||
              (XSC.value > (ONE + TEMP1.value))) {
            dscal(M, XSC.value, U(1, p).asArray(), 1);
          }
        }

        // If the initial QRF is computed with row pivoting, the left
        // singular vectors must be adjusted.

        if (ROWPIV) dlaswp(N1, U, LDU, 1, M - 1, IWORK(2 * N + 1), -1);
      } else {
        // the initial matrix A has almost orthogonal columns and
        // the second QRF is not needed

        dlacpy('Upper', N, N, A, LDA, WORK(N + 1).asMatrix(N), N);
        if (L2PERT) {
          XSC.value = sqrt(SMALL);
          for (p = 2; p <= N; p++) {
            TEMP1.value = XSC.value * WORK[N + (p - 1) * N + p];
            for (q = 1; q <= p - 1; q++) {
              WORK[N + (q - 1) * N + p] =
                  -sign(TEMP1.value, WORK[N + (p - 1) * N + q]);
            }
          }
        } else {
          dlaset('Lower', N - 1, N - 1, ZERO, ZERO, WORK(N + 2).asMatrix(N), N);
        }

        dgesvj('Upper', 'U', 'N', N, N, WORK(N + 1).asMatrix(N), N, SVA, N, U,
            LDU, WORK(N + N * N + 1), LWORK - N - N * N, INFO);

        SCALEM = WORK[N + N * N + 1];
        NUMRANK = WORK[N + N * N + 2].round();
        for (p = 1; p <= N; p++) {
          dcopy(N, WORK(N + (p - 1) * N + 1), 1, U(1, p).asArray(), 1);
          dscal(N, SVA[p], WORK(N + (p - 1) * N + 1), 1);
        }

        dtrsm('Left', 'Upper', 'NoTrans', 'No UD', N, N, ONE, A, LDA,
            WORK(N + 1).asMatrix(N), N);
        for (p = 1; p <= N; p++) {
          dcopy(N, WORK(N + p), N, V(IWORK[p], 1).asArray(), LDV);
        }
        TEMP1.value = sqrt(N) * EPSLN;
        for (p = 1; p <= N; p++) {
          XSC.value = ONE / dnrm2(N, V(1, p).asArray(), 1);
          if ((XSC.value < (ONE - TEMP1.value)) ||
              (XSC.value > (ONE + TEMP1.value))) {
            dscal(N, XSC.value, V(1, p).asArray(), 1);
          }
        }

        // Assemble the left singular vector matrix U (M x N).

        if (N < M) {
          dlaset('A', M - N, N, ZERO, ZERO, U(N + 1, 1), LDU);
          if (N < N1) {
            dlaset('A', N, N1 - N, ZERO, ZERO, U(1, N + 1), LDU);
            dlaset('A', M - N, N1 - N, ZERO, ONE, U(N + 1, N + 1), LDU);
          }
        }
        dormqr('Left', 'No Tr', M, N1, N, A, LDA, WORK, U, LDU, WORK(N + 1),
            LWORK - N, IERR);
        TEMP1.value = sqrt(M) * EPSLN;
        for (p = 1; p <= N1; p++) {
          XSC.value = ONE / dnrm2(M, U(1, p).asArray(), 1);
          if ((XSC.value < (ONE - TEMP1.value)) ||
              (XSC.value > (ONE + TEMP1.value))) {
            dscal(M, XSC.value, U(1, p).asArray(), 1);
          }
        }

        if (ROWPIV) dlaswp(N1, U, LDU, 1, M - 1, IWORK(2 * N + 1), -1);
      }

      // end of the  >> almost orthogonal case <<  in the full SVD
    } else {
      // This branch deploys a preconditioned Jacobi SVD with explicitly
      // accumulated rotations. It is included as optional, mainly for
      // experimental purposes. It does perform well, and can also be used.
      // In this implementation, this branch will be automatically activated
      // if the  condition number sigma_max(A) / sigma_min(A) is predicted
      // to be greater than the overflow threshold. This is because the
      // a posteriori computation of the singular vectors assumes robust
      // implementation of BLAS and some LAPACK procedures, capable of working
      // in presence of extreme values. Since that is not always the case, ...

      for (p = 1; p <= NR; p++) {
        dcopy(N - p + 1, A(p, p).asArray(), LDA, V(p, p).asArray(), 1);
      }

      if (L2PERT) {
        XSC.value = sqrt(SMALL / EPSLN);
        for (q = 1; q <= NR; q++) {
          TEMP1.value = XSC.value * V[q][q].abs();
          for (p = 1; p <= N; p++) {
            if ((p > q) && (V[p][q].abs() <= TEMP1.value) || (p < q)) {
              V[p][q] = sign(TEMP1.value, V[p][q]);
            }
            if (p < q) V[p][q] = -V[p][q];
          }
        }
      } else {
        dlaset('U', NR - 1, NR - 1, ZERO, ZERO, V(1, 2), LDV);
      }
      dgeqrf(N, NR, V, LDV, WORK(N + 1), WORK(2 * N + 1), LWORK - 2 * N, IERR);
      dlacpy('L', N, NR, V, LDV, WORK(2 * N + 1).asMatrix(N), N);

      for (p = 1; p <= NR; p++) {
        dcopy(NR - p + 1, V(p, p).asArray(), LDV, U(p, p).asArray(), 1);
      }

      if (L2PERT) {
        XSC.value = sqrt(SMALL / EPSLN);
        for (q = 2; q <= NR; q++) {
          for (p = 1; p <= q - 1; p++) {
            TEMP1.value = XSC.value * min(U[p][p].abs(), U[q][q].abs());
            U[p][q] = -sign(TEMP1.value, U[q][p]);
          }
        }
      } else {
        dlaset('U', NR - 1, NR - 1, ZERO, ZERO, U(1, 2), LDU);
      }
      dgesvj('G', 'U', 'V', NR, NR, U, LDU, SVA, N, V, LDV,
          WORK(2 * N + N * NR + 1), LWORK - 2 * N - N * NR, INFO);
      SCALEM = WORK[2 * N + N * NR + 1];
      NUMRANK = WORK[2 * N + N * NR + 2].round();

      if (NR < N) {
        dlaset('A', N - NR, NR, ZERO, ZERO, V(NR + 1, 1), LDV);
        dlaset('A', NR, N - NR, ZERO, ZERO, V(1, NR + 1), LDV);
        dlaset('A', N - NR, N - NR, ZERO, ONE, V(NR + 1, NR + 1), LDV);
      }
      dormqr(
          'L',
          'N',
          N,
          N,
          NR,
          WORK(2 * N + 1).asMatrix(N),
          N,
          WORK(N + 1),
          V,
          LDV,
          WORK(2 * N + N * NR + NR + 1),
          LWORK - 2 * N - N * NR - NR,
          IERR);

      // Permute the rows of V using the (column) permutation from the
      // first QRF. Also, scale the columns to make them unit in
      // Euclidean norm. This applies to all cases.

      TEMP1.value = sqrt(N) * EPSLN;
      for (q = 1; q <= N; q++) {
        for (p = 1; p <= N; p++) {
          WORK[2 * N + N * NR + NR + IWORK[p]] = V[p][q];
        }
        for (p = 1; p <= N; p++) {
          V[p][q] = WORK[2 * N + N * NR + NR + p];
        }
        XSC.value = ONE / dnrm2(N, V(1, q).asArray(), 1);
        if ((XSC.value < (ONE - TEMP1.value)) ||
            (XSC.value > (ONE + TEMP1.value))) {
          dscal(N, XSC.value, V(1, q).asArray(), 1);
        }
      }

      // At this moment, V contains the right singular vectors of A.
      // Next, assemble the left singular vector matrix U (M x N).

      if (NR < M) {
        dlaset('A', M - NR, NR, ZERO, ZERO, U(NR + 1, 1), LDU);
        if (NR < N1) {
          dlaset('A', NR, N1 - NR, ZERO, ZERO, U(1, NR + 1), LDU);
          dlaset('A', M - NR, N1 - NR, ZERO, ONE, U(NR + 1, NR + 1), LDU);
        }
      }

      dormqr('Left', 'No Tr', M, N1, N, A, LDA, WORK, U, LDU, WORK(N + 1),
          LWORK - N, IERR);

      if (ROWPIV) dlaswp(N1, U, LDU, 1, M - 1, IWORK(2 * N + 1), -1);
    }
    if (TRANSP) {
      // swap U and V because the procedure worked on A^t
      for (p = 1; p <= N; p++) {
        dswap(N, U(1, p).asArray(), 1, V(1, p).asArray(), 1);
      }
    }
  }
  // end of the full SVD

  // Undo scaling, if necessary (and possible)

  if (USCAL2 <= (BIG / SVA[1]) * USCAL1) {
    dlascl('G', 0, 0, USCAL1, USCAL2, NR, 1, SVA.asMatrix(N), N, IERR);
    USCAL1 = ONE;
    USCAL2 = ONE;
  }

  if (NR < N) {
    for (p = NR + 1; p <= N; p++) {
      SVA[p] = ZERO;
    }
  }

  WORK[1] = USCAL2 * SCALEM;
  WORK[2] = USCAL1;
  if (ERREST) WORK[3] = SCONDA;
  if (LSVEC && RSVEC) {
    WORK[4] = CONDR1;
    WORK[5] = CONDR2;
  }
  if (L2TRAN) {
    WORK[6] = ENTRA;
    WORK[7] = ENTRAT;
  }

  IWORK[1] = NR;
  IWORK[2] = NUMRANK;
  IWORK[3] = WARNING;
}