Actual source code: bvlapack.c
slepc-3.17.1 2022-04-11
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: BV private kernels that use LAPACK
12: */
14: #include <slepc/private/bvimpl.h>
15: #include <slepcblaslapack.h>
17: /*
18: Reduction operation to compute sqrt(x**2+y**2) when normalizing vectors
19: */
20: SLEPC_EXTERN void MPIAPI SlepcPythag(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
21: {
22: PetscBLASInt i,n=*len;
23: PetscReal *x = (PetscReal*)in,*y = (PetscReal*)inout;
25: if (PetscUnlikely(*datatype!=MPIU_REAL)) {
26: (*PetscErrorPrintf)("Only implemented for MPIU_REAL data type");
27: MPI_Abort(MPI_COMM_WORLD,1);
28: }
29: for (i=0;i<n;i++) y[i] = SlepcAbs(x[i],y[i]);
30: return;
31: }
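/*
   Editorial sketch (not part of the upstream file): SlepcPythag has the signature of an MPI
   user-defined reduction function; the MPIU_LAPY2 operation used below is created from it
   elsewhere in the library.  The guarded snippet shows the generic MPI_Op pattern with
   placeholder names (BV_LAPACK_EXAMPLES, Example_GlobalTwoNorm, lapy2_op): it combines the
   per-process norms lnrm into gnrm = sqrt(sum_p lnrm_p^2) without ever squaring explicitly.
*/
#if defined(BV_LAPACK_EXAMPLES)
static void Example_GlobalTwoNorm(MPI_Comm comm,PetscReal lnrm,PetscReal *gnrm)
{
  MPI_Op lapy2_op;

  MPI_Op_create(SlepcPythag,1,&lapy2_op);               /* commutative user-defined reduction */
  MPI_Allreduce(&lnrm,gnrm,1,MPIU_REAL,lapy2_op,comm);  /* gnrm = hypot of all local contributions */
  MPI_Op_free(&lapy2_op);
}
#endif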
33: /*
34: Compute ||A|| for an mxn matrix
35: */
36: PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,NormType type,PetscReal *nrm,PetscBool mpi)
37: {
38: PetscBLASInt m,n,i,j;
39: PetscMPIInt len;
40: PetscReal lnrm,*rwork=NULL,*rwork2=NULL;
42: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
43: PetscBLASIntCast(m_,&m);
44: PetscBLASIntCast(n_,&n);
45: if (type==NORM_FROBENIUS || type==NORM_2) {
46: lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&m,rwork);
47: if (mpi) MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv));
48: else *nrm = lnrm;
49: PetscLogFlops(2.0*m*n);
50: } else if (type==NORM_1) {
51: if (mpi) {
52: BVAllocateWork_Private(bv,2*n_);
53: rwork = (PetscReal*)bv->work;
54: rwork2 = rwork+n_;
55: PetscArrayzero(rwork,n_);
56: PetscArrayzero(rwork2,n_);
57: for (j=0;j<n_;j++) {
58: for (i=0;i<m_;i++) {
59: rwork[j] += PetscAbsScalar(A[i+j*m_]);
60: }
61: }
62: PetscMPIIntCast(n_,&len);
63: MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
64: *nrm = 0.0;
65: for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
66: } else {
67: *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&m,rwork);
68: }
69: PetscLogFlops(1.0*m*n);
70: } else if (type==NORM_INFINITY) {
71: BVAllocateWork_Private(bv,m_);
72: rwork = (PetscReal*)bv->work;
73: lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&m,rwork);
74: if (mpi) MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv));
75: else *nrm = lnrm;
76: PetscLogFlops(1.0*m*n);
77: }
78: PetscFPTrapPop();
79: PetscFunctionReturn(0);
80: }
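/*
   Editorial note (worked equations, not part of the upstream file).  The rows of the matrix
   are split across processes, A = [A_1; A_2; ...; A_p], which explains the three reductions:
     Frobenius:  ||A||_F = sqrt(||A_1||_F^2 + ... + ||A_p||_F^2), hence the MPIU_LAPY2
                 (hypot) combination of the local lange("F") values;
     1-norm:     ||A||_1 = max_j sum_i |a_ij|; the local column sums must first be added
                 across processes (MPIU_SUM) and only then maximized over j;
     inf-norm:   ||A||_inf = max_i sum_j |a_ij|; each row lives on a single process, so the
                 local lange("I") values can be combined directly with MPIU_MAX.
*/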
82: /*
83: Normalize the columns of an mxn matrix A
84: */
85: PetscErrorCode BVNormalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscScalar *eigi,PetscBool mpi)
86: {
87: PetscBLASInt m,j,k,info,zero=0;
88: PetscMPIInt len;
89: PetscReal *norms,*rwork=NULL,*rwork2=NULL,done=1.0;
91: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
92: PetscBLASIntCast(m_,&m);
93: BVAllocateWork_Private(bv,2*n_);
94: rwork = (PetscReal*)bv->work;
95: rwork2 = rwork+n_;
96: /* compute local norms */
97: for (j=0;j<n_;j++) {
98: k = 1;
99: #if !defined(PETSC_USE_COMPLEX)
100: if (eigi && eigi[j] != 0.0) k = 2;
101: #endif
102: rwork[j] = LAPACKlange_("F",&m,&k,(PetscScalar*)(A+j*m_),&m,rwork2);
103: if (k==2) { rwork[j+1] = rwork[j]; j++; }
104: }
105: /* reduction to get global norms */
106: if (mpi) {
107: PetscMPIIntCast(n_,&len);
108: PetscArrayzero(rwork2,n_);
109: MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv));
110: norms = rwork2;
111: } else norms = rwork;
112: /* scale columns */
113: for (j=0;j<n_;j++) {
114: k = 1;
115: #if !defined(PETSC_USE_COMPLEX)
116: if (eigi && eigi[j] != 0.0) k = 2;
117: #endif
118: PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,norms+j,&done,&m,&k,(PetscScalar*)(A+j*m_),&m,&info));
119: SlepcCheckLapackInfo("lascl",info);
120: if (k==2) j++;
121: }
122: PetscLogFlops(3.0*m*n_);
123: PetscFPTrapPop();
124: PetscFunctionReturn(0);
125: }
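/*
   Editorial note (not part of the upstream file): in real arithmetic a complex conjugate
   eigenvector pair is stored as two consecutive columns [xr xi], flagged by eigi[j] != 0.
   The pair is normalized jointly because
     ||xr + i*xi||_2 = sqrt(||xr||_2^2 + ||xi||_2^2) = ||[xr xi]||_F,
   which is the m-by-2 lange("F") value computed above; both columns then get the same
   1/norm scaling from lascl.
*/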
127: /*
128: Compute the upper Cholesky factor in R and its inverse in S.
129: If S == R then the inverse overwrites the Cholesky factor.
130: */
131: PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
132: {
133: PetscInt i,k,l,n,m,ld,lds;
134: PetscScalar *pR,*pS;
135: PetscBLASInt info,n_ = 0,m_ = 0,ld_,lds_;
137: l = bv->l;
138: k = bv->k;
139: MatGetSize(R,&m,NULL);
140: n = k-l;
141: PetscBLASIntCast(m,&m_);
142: PetscBLASIntCast(n,&n_);
143: ld = m;
144: ld_ = m_;
145: MatDenseGetArray(R,&pR);
147: if (S==R) {
148: BVAllocateWork_Private(bv,m*k);
149: pS = bv->work;
150: lds = ld;
151: lds_ = ld_;
152: } else {
153: MatDenseGetArray(S,&pS);
154: MatGetSize(S,&lds,NULL);
155: PetscBLASIntCast(lds,&lds_);
156: }
158: /* save a copy of matrix in S */
159: for (i=l;i<k;i++) PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
161: /* compute upper Cholesky factor in R */
162: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
163: PetscStackCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
164: PetscLogFlops((1.0*n*n*n)/3.0);
166: if (info) { /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
167: for (i=l;i<k;i++) {
168: PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n);
169: pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;
170: }
171: PetscStackCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
172: SlepcCheckLapackInfo("potrf",info);
173: PetscLogFlops((1.0*n*n*n)/3.0);
174: }
176: /* compute S = inv(R) */
177: if (S==R) {
178: PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
179: } else {
180: PetscArrayzero(pS+l*lds,(k-l)*k);
181: for (i=l;i<k;i++) PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
182: PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
183: }
184: SlepcCheckLapackInfo("trtri",info);
185: PetscFPTrapPop();
186: PetscLogFlops(0.33*n*n*n);
188: /* Zero out entries below the diagonal */
189: for (i=l;i<k-1;i++) {
190: PetscArrayzero(pR+i*ld+i+1,(k-i-1));
191: if (S!=R) PetscArrayzero(pS+i*lds+i+1,(k-i-1));
192: }
193: MatDenseRestoreArray(R,&pR);
194: if (S!=R) MatDenseRestoreArray(S,&pS);
195: PetscFunctionReturn(0);
196: }
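/*
   Editorial sketch (not part of the upstream file): the Cholesky-QR idea that this kernel
   serves, restricted to a single process.  If G = X'*X = R'*R with R upper triangular, then
   Q = X*inv(R) has orthonormal columns, which is why the caller wants inv(R) as a factor to
   post-multiply the basis with.  The guard macro and the name Example_CholQR are
   placeholders; a triangular multiply (trmm) would avoid the temporary copy used here.
*/
#if defined(BV_LAPACK_EXAMPLES)
static PetscErrorCode Example_CholQR(PetscBLASInt m,PetscBLASInt n,PetscScalar *X)
{
  PetscScalar  *G,*Q,one=1.0,zero=0.0;
  PetscBLASInt j,info;

  PetscFunctionBegin;
  PetscMalloc2(n*n,&G,m*n,&Q);
  /* G = X'*X (Gram matrix) */
  PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&n,&n,&m,&one,X,&m,X,&m,&zero,G,&n));
  /* G <- R, the upper Cholesky factor of the Gram matrix */
  PetscStackCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n,G,&n,&info));
  SlepcCheckLapackInfo("potrf",info);
  /* G <- inv(R), still upper triangular */
  PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n,G,&n,&info));
  SlepcCheckLapackInfo("trtri",info);
  /* zero the strictly lower triangle, which potrf/trtri never touched */
  for (j=0;j<n-1;j++) PetscArrayzero(G+j*n+j+1,n-j-1);
  /* Q = X*inv(R) has orthonormal columns; copy it back over X */
  PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&m,&n,&n,&one,X,&m,G,&n,&zero,Q,&m));
  PetscArraycpy(X,Q,m*n);
  PetscFree2(G,Q);
  PetscFunctionReturn(0);
}
#endif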
198: /*
199: Compute the inverse of an upper triangular matrix R, store it in S.
200: If S == R then the inverse overwrites R.
201: */
202: PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
203: {
204: PetscInt i,k,l,n,m,ld,lds;
205: PetscScalar *pR,*pS;
206: PetscBLASInt info,n_,m_ = 0,ld_,lds_;
208: l = bv->l;
209: k = bv->k;
210: MatGetSize(R,&m,NULL);
211: n = k-l;
212: PetscBLASIntCast(m,&m_);
213: PetscBLASIntCast(n,&n_);
214: ld = m;
215: ld_ = m_;
216: MatDenseGetArray(R,&pR);
218: if (S==R) {
219: BVAllocateWork_Private(bv,m*k);
220: pS = bv->work;
221: lds = ld;
222: lds_ = ld_;
223: } else {
224: MatDenseGetArray(S,&pS);
225: MatGetSize(S,&lds,NULL);
226: PetscBLASIntCast(lds,&lds_);
227: }
229: /* compute S = inv(R) */
230: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
231: if (S==R) {
232: PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
233: } else {
234: PetscArrayzero(pS+l*lds,(k-l)*k);
235: for (i=l;i<k;i++) PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
236: PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
237: }
238: SlepcCheckLapackInfo("trtri",info);
239: PetscFPTrapPop();
240: PetscLogFlops(0.33*n*n*n);
242: MatDenseRestoreArray(R,&pR);
243: if (S!=R) MatDenseRestoreArray(S,&pS);
244: PetscFunctionReturn(0);
245: }
247: /*
248: Compute the matrix to be used for post-multiplying the basis in the SVQB
249: block orthogonalization method.
250: On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
251: the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
252: If S == R then the result overwrites R.
253: */
254: PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
255: {
256: PetscInt i,j,k,l,n,m,ld,lds;
257: PetscScalar *pR,*pS,*D,*work,a;
258: PetscReal *eig,dummy;
259: PetscBLASInt info,lwork,n_,m_ = 0,ld_,lds_;
260: #if defined(PETSC_USE_COMPLEX)
261: PetscReal *rwork,rdummy;
262: #endif
264: l = bv->l;
265: k = bv->k;
266: MatGetSize(R,&m,NULL);
267: n = k-l;
268: PetscBLASIntCast(m,&m_);
269: PetscBLASIntCast(n,&n_);
270: ld = m;
271: ld_ = m_;
272: MatDenseGetArray(R,&pR);
274: if (S==R) {
275: pS = pR;
276: lds = ld;
277: lds_ = ld_;
278: } else {
279: MatDenseGetArray(S,&pS);
280: MatGetSize(S,&lds,NULL);
281: PetscBLASIntCast(lds,&lds_);
282: }
283: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
285: /* workspace query and memory allocation */
286: lwork = -1;
287: #if defined(PETSC_USE_COMPLEX)
288: PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
289: PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork);
290: PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork);
291: #else
292: PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
293: PetscBLASIntCast((PetscInt)a,&lwork);
294: PetscMalloc3(n,&eig,n,&D,lwork,&work);
295: #endif
297: /* copy and scale matrix */
298: for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
299: for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];
300: for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];
302: /* compute eigendecomposition */
303: #if defined(PETSC_USE_COMPLEX)
304: PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
305: #else
306: PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
307: #endif
308: SlepcCheckLapackInfo("syev",info);
310: if (S!=R) { /* R = U' */
311: for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
312: }
314: /* compute S = D*U*Lambda^{-1/2} */
315: for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
316: for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);
318: if (S!=R) { /* compute R = inv(S) = Lambda^{1/2}*U'/D */
319: for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
320: for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
321: }
323: #if defined(PETSC_USE_COMPLEX)
324: PetscFree4(eig,D,work,rwork);
325: #else
326: PetscFree3(eig,D,work);
327: #endif
328: PetscLogFlops(9.0*n*n*n);
329: PetscFPTrapPop();
331: MatDenseRestoreArray(R,&pR);
332: if (S!=R) MatDenseRestoreArray(S,&pS);
333: PetscFunctionReturn(0);
334: }
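/*
   Editorial note (worked equation, not part of the upstream file): why the output S
   orthonormalizes the basis.  With R = V'*V, D = diag(R)^{-1/2} and D*R*D = U*Lambda*U',
   the computed S = D*U*Lambda^{-1/2} satisfies
     S'*R*S = Lambda^{-1/2}*U'*(D*R*D)*U*Lambda^{-1/2} = Lambda^{-1/2}*Lambda*Lambda^{-1/2} = I,
   so (V*S)'*(V*S) = I.  The initial scaling by D equilibrates the diagonal of R, which is
   the usual motivation of the SVQB method when the columns of V have very different norms.
*/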
336: /*
337: QR factorization of an mxn matrix via parallel TSQR
338: */
339: PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
340: {
341: PetscInt level,plevel,nlevels,powtwo,lda,worklen;
342: PetscBLASInt m,n,i,j,k,l,s = 0,nb,sz,lwork,info;
343: PetscScalar *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
344: PetscMPIInt rank,size,count,stride;
345: MPI_Datatype tmat;
347: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
348: PetscBLASIntCast(m_,&m);
349: PetscBLASIntCast(n_,&n);
350: k = PetscMin(m,n);
351: nb = 16;
352: lda = 2*n;
353: MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
354: MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank);
355: nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));
356: powtwo = PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size)));
357: worklen = n+n*nb;
358: if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda;
359: BVAllocateWork_Private(bv,worklen);
360: tau = bv->work;
361: work = bv->work+n;
362: PetscBLASIntCast(n*nb,&lwork);
363: if (nlevels) {
364: A = bv->work+n+n*nb;
365: QQ = bv->work+n+n*nb+n*lda;
366: C = bv->work+n+n*nb+n*lda+n*lda*nlevels;
367: }
369: /* Compute QR */
370: PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&m,tau,work,&lwork,&info));
371: SlepcCheckLapackInfo("geqrf",info);
373: /* Extract R */
374: if (R || nlevels) {
375: for (j=0;j<n;j++) {
376: for (i=0;i<=PetscMin(j,m-1);i++) {
377: if (nlevels) A[i+j*lda] = Q[i+j*m];
378: else R[i+j*ldr] = Q[i+j*m];
379: }
380: for (i=PetscMin(j,m-1)+1;i<n;i++) {
381: if (nlevels) A[i+j*lda] = 0.0;
382: else R[i+j*ldr] = 0.0;
383: }
384: }
385: }
387: /* Compute orthogonal matrix in Q */
388: PetscStackCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&m,tau,work,&lwork,&info));
389: SlepcCheckLapackInfo("orgqr",info);
391: if (nlevels) {
393: PetscMPIIntCast(n,&count);
394: PetscMPIIntCast(lda,&stride);
395: PetscBLASIntCast(lda,&l);
396: MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat);
397: MPI_Type_commit(&tmat);
399: for (level=nlevels;level>=1;level--) {
401: plevel = PetscPowInt(2,level);
402: PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s);
404: /* Stack triangular matrices */
405: if (rank<s && s<size) { /* send top part, receive bottom part */
406: MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
407: } else if (s<size) { /* copy top to bottom, receive top part */
408: MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
409: MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
410: }
411: if (level<nlevels && size!=powtwo) { /* for cases when size is not a power of 2 */
412: if (rank<size-powtwo) { /* send bottom part */
413: MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv));
414: } else if (rank>=powtwo) { /* receive bottom part */
415: MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
416: }
417: }
418: /* Compute QR and build orthogonal matrix */
419: if (level<nlevels || (level==nlevels && s<size)) {
420: PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
421: SlepcCheckLapackInfo("geqrf",info);
422: PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda);
423: PetscStackCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
424: SlepcCheckLapackInfo("orgqr",info);
425: for (j=0;j<n;j++) {
426: for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;
427: }
428: } else if (level==nlevels) { /* only one triangular matrix, set Q=I */
429: PetscArrayzero(QQ+(level-1)*n*lda,n*lda);
430: for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
431: }
432: }
434: /* Extract R */
435: if (R) {
436: for (j=0;j<n;j++) {
437: for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
438: for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
439: }
440: }
442: /* Accumulate orthogonal matrices */
443: for (level=1;level<=nlevels;level++) {
444: plevel = PetscPowInt(2,level);
445: PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s);
446: Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
447: if (level<nlevels) {
448: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
449: PetscArraycpy(QQ+level*n*lda,C,n*lda);
450: } else {
451: for (i=0;i<m/l;i++) {
452: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&m,Qhalf,&l,&zero,C,&l));
453: for (j=0;j<n;j++) PetscArraycpy(Q+i*l+j*m,C+j*l,l);
454: }
455: sz = m%l;
456: if (sz) {
457: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&m,Qhalf,&l,&zero,C,&l));
458: for (j=0;j<n;j++) PetscArraycpy(Q+(m/l)*l+j*m,C+j*l,sz);
459: }
460: }
461: }
463: MPI_Type_free(&tmat);
464: }
466: PetscLogFlops(3.0*m*n*n);
467: PetscFPTrapPop();
468: PetscFunctionReturn(0);
469: }
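/*
   Editorial sketch (not part of the upstream file): the butterfly pairing used in the loop
   above.  At each level every rank exchanges its triangular factor with the partner s given
   by the same formula as in the code (integer division plays the role of PetscFloorReal for
   nonnegative ranks).  With size=4 the pairs are (0,1),(2,3) at level 1 and (0,2),(1,3) at
   level 2.  The guard macro and the name Example_TSQRPartner are placeholders.
*/
#if defined(BV_LAPACK_EXAMPLES)
static PetscInt Example_TSQRPartner(PetscInt rank,PetscInt level)
{
  PetscInt plevel = PetscPowInt(2,level);
  return plevel*(rank/plevel)+(rank+PetscPowInt(2,level-1))%plevel;
}
#endif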
471: /*
472: Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
473: all matrices are upper triangular stored in packed format
474: */
475: SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
476: {
477: PetscBLASInt n,i,j,k,one=1;
478: PetscMPIInt tsize;
479: PetscScalar v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
480: PetscReal c;
482: MPI_Type_size(*datatype,&tsize); /* we assume len=1 */
483: tsize /= sizeof(PetscScalar);
484: n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
485: for (j=0;j<n;j++) {
486: for (i=0;i<=j;i++) {
487: LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
488: R1[(2*n-j-1)*j/2+j] = v;
489: k = n-j-1;
490: if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
491: }
492: }
493: return;
494: }
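/*
   Editorial sketch (not part of the upstream file): the packed layout assumed by
   SlepcGivensPacked.  An n-by-n upper triangular matrix is stored row by row, entry (i,j)
   with j>=i at position (2*n-i-1)*i/2+j, for n*(n+1)/2 values in total, which is how n is
   recovered from the datatype size above.  The guard macro and the name Example_PackUpper
   are placeholders.
*/
#if defined(BV_LAPACK_EXAMPLES)
static void Example_PackUpper(PetscBLASInt n,const PetscScalar *A,PetscBLASInt lda,PetscScalar *packed)
{
  PetscBLASInt i,j;
  for (i=0;i<n;i++) for (j=i;j<n;j++) packed[(2*n-i-1)*i/2+j] = A[i+j*lda];
}
#endif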
496: /*
497: Computes the R factor of the QR factorization of an mxn matrix via parallel TSQR
498: */
499: PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
500: {
501: PetscInt worklen;
502: PetscBLASInt m,n,i,j,s,nb,lwork,info;
503: PetscScalar *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
504: PetscMPIInt size,count;
505: MPI_Datatype tmat;
507: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
508: PetscBLASIntCast(m_,&m);
509: PetscBLASIntCast(n_,&n);
510: nb = 16;
511: s = n+n*(n-1)/2; /* length of packed triangular matrix */
512: MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
513: worklen = n+n*nb+2*s+m*n;
514: BVAllocateWork_Private(bv,worklen);
515: tau = bv->work;
516: work = bv->work+n;
517: R1 = bv->work+n+n*nb;
518: R2 = bv->work+n+n*nb+s;
519: A = bv->work+n+n*nb+2*s;
520: PetscBLASIntCast(n*nb,&lwork);
521: PetscArraycpy(A,Q,m*n);
523: /* Compute QR */
524: PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&m,tau,work,&lwork,&info));
525: SlepcCheckLapackInfo("geqrf",info);
527: if (size==1) {
528: /* Extract R */
529: for (j=0;j<n;j++) {
530: for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*m];
531: for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
532: }
533: } else {
534: /* Use MPI reduction operation to obtain global R */
535: PetscMPIIntCast(s,&count);
536: MPI_Type_contiguous(count,MPIU_SCALAR,&tmat);
537: MPI_Type_commit(&tmat);
538: for (i=0;i<n;i++) {
539: for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*m]:0.0;
540: }
541: MPIU_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv));
542: for (i=0;i<n;i++) {
543: for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
544: for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
545: }
546: MPI_Type_free(&tmat);
547: }
549: PetscLogFlops(3.0*m*n*n);
550: PetscFPTrapPop();
551: PetscFunctionReturn(0);
552: }
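/*
   Editorial note (worked equation, not part of the upstream file): why reducing only the
   local triangular factors suffices.  If the rows of A are split as A = [A_0; ...; A_{p-1}]
   and each block is factored locally as A_q = Q_q*R_q, then
     A = diag(Q_0,...,Q_{p-1}) * [R_0; ...; R_{p-1}],
   so the R factor of A equals the R factor of the stacked R_q.  The pairwise operation
   [~,Rout] = qr([Rin1;Rin2]) implemented by SlepcGivensPacked is therefore applied as the
   MPIU_TSQR reduction (created from that callback elsewhere in the library), and the global
   Q is never formed on this code path.
*/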