Actual source code: bvblas.c
slepc-3.7.4 2017-05-17
1: /*
2: BV private kernels that use the BLAS.
4: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
5: SLEPc - Scalable Library for Eigenvalue Problem Computations
6: Copyright (c) 2002-2016, Universitat Politecnica de Valencia, Spain
8: This file is part of SLEPc.
10: SLEPc is free software: you can redistribute it and/or modify it under the
11: terms of version 3 of the GNU Lesser General Public License as published by
12: the Free Software Foundation.
14: SLEPc is distributed in the hope that it will be useful, but WITHOUT ANY
15: WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16: FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
17: more details.
19: You should have received a copy of the GNU Lesser General Public License
20: along with SLEPc. If not, see <http://www.gnu.org/licenses/>.
21: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
22: */
24: #include <slepc/private/bvimpl.h>
25: #include <slepcblaslapack.h>
27: #define BLOCKSIZE 64
31: /*
32: C := alpha*A*B + beta*C
34: A is mxk (ld=m), B is kxn (ld=ldb), C is mxn (ld=m)
35: */
36: PetscErrorCode BVMult_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldb_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *B,PetscScalar beta,PetscScalar *C)
37: {
39: PetscBLASInt m,n,k,ldb;
40: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
41: PetscBLASInt l,bs=BLOCKSIZE;
42: #endif
45: PetscBLASIntCast(m_,&m);
46: PetscBLASIntCast(n_,&n);
47: PetscBLASIntCast(k_,&k);
48: PetscBLASIntCast(ldb_,&ldb);
49: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
50: l = m % bs;
51: if (l) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
52: for (;l<m;l+=bs) {
53: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bs,&n,&k,&alpha,(PetscScalar*)A+l,&m,(PetscScalar*)B,&ldb,&beta,C+l,&m));
54: }
55: #else
56: if (m) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&m,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
57: #endif
58: PetscLogFlops(2.0*m*n*k);
59: return(0);
60: }
64: /*
65: y := alpha*A*x + beta*y
67: A is nxk (ld=n)
68: */
69: PetscErrorCode BVMultVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *x,PetscScalar beta,PetscScalar *y)
70: {
72: PetscBLASInt n,k,one=1;
75: PetscBLASIntCast(n_,&n);
76: PetscBLASIntCast(k_,&k);
77: if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&n,&k,&alpha,A,&n,x,&one,&beta,y,&one));
78: PetscLogFlops(2.0*n*k);
79: return(0);
80: }
84: /*
85: A(:,s:e-1) := A*B(:,s:e-1)
87: A is mxk (ld=m), B is kxn (ld=ldb) n=e-s
88: */
89: PetscErrorCode BVMultInPlace_BLAS_Private(BV bv,PetscInt m_,PetscInt k_,PetscInt ldb_,PetscInt s,PetscInt e,PetscScalar *A,const PetscScalar *B,PetscBool btrans)
90: {
92: PetscScalar *pb,zero=0.0,one=1.0;
93: PetscBLASInt m,n,k,l,ldb,bs=BLOCKSIZE;
94: PetscInt j,n_=e-s;
95: const char *bt;
98: PetscBLASIntCast(m_,&m);
99: PetscBLASIntCast(n_,&n);
100: PetscBLASIntCast(k_,&k);
101: PetscBLASIntCast(ldb_,&ldb);
102: BVAllocateWork_Private(bv,BLOCKSIZE*n_);
103: if (btrans) {
104: pb = (PetscScalar*)B+s;
105: bt = "C";
106: } else {
107: pb = (PetscScalar*)B+s*ldb;
108: bt = "N";
109: }
110: l = m % bs;
111: if (l) {
112: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&k,&one,A,&m,pb,&ldb,&zero,bv->work,&l));
113: for (j=0;j<n;j++) {
114: PetscMemcpy(A+(s+j)*m,bv->work+j*l,l*sizeof(PetscScalar));
115: }
116: }
117: for (;l<m;l+=bs) {
118: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&k,&one,A+l,&m,pb,&ldb,&zero,bv->work,&bs));
119: for (j=0;j<n;j++) {
120: PetscMemcpy(A+(s+j)*m+l,bv->work+j*bs,bs*sizeof(PetscScalar));
121: }
122: }
123: PetscLogFlops(2.0*m*n*k);
124: return(0);
125: }
129: /*
130: V := V*B
132: V is mxn (ld=m), B is nxn (ld=k)
133: */
134: PetscErrorCode BVMultInPlace_Vecs_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,Vec *V,const PetscScalar *B,PetscBool btrans)
135: {
136: PetscErrorCode ierr;
137: PetscScalar zero=0.0,one=1.0,*out,*pout;
138: const PetscScalar *pin;
139: PetscBLASInt m,n,k,l,bs=BLOCKSIZE;
140: PetscInt j;
141: const char *bt;
144: PetscBLASIntCast(m_,&m);
145: PetscBLASIntCast(n_,&n);
146: PetscBLASIntCast(k_,&k);
147: BVAllocateWork_Private(bv,2*BLOCKSIZE*n_);
148: out = bv->work+BLOCKSIZE*n_;
149: if (btrans) bt = "C";
150: else bt = "N";
151: l = m % bs;
152: if (l) {
153: for (j=0;j<n;j++) {
154: VecGetArrayRead(V[j],&pin);
155: PetscMemcpy(bv->work+j*l,pin,l*sizeof(PetscScalar));
156: VecRestoreArrayRead(V[j],&pin);
157: }
158: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&n,&one,bv->work,&l,(PetscScalar*)B,&k,&zero,out,&l));
159: for (j=0;j<n;j++) {
160: VecGetArray(V[j],&pout);
161: PetscMemcpy(pout,out+j*l,l*sizeof(PetscScalar));
162: VecRestoreArray(V[j],&pout);
163: }
164: }
165: for (;l<m;l+=bs) {
166: for (j=0;j<n;j++) {
167: VecGetArrayRead(V[j],&pin);
168: PetscMemcpy(bv->work+j*bs,pin+l,bs*sizeof(PetscScalar));
169: VecRestoreArrayRead(V[j],&pin);
170: }
171: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&n,&one,bv->work,&bs,(PetscScalar*)B,&k,&zero,out,&bs));
172: for (j=0;j<n;j++) {
173: VecGetArray(V[j],&pout);
174: PetscMemcpy(pout+l,out+j*bs,bs*sizeof(PetscScalar));
175: VecRestoreArray(V[j],&pout);
176: }
177: }
178: PetscLogFlops(2.0*n*n*k);
179: return(0);
180: }
184: /*
185: B := alpha*A + beta*B
187: A,B are nxk (ld=n)
188: */
189: PetscErrorCode BVAXPY_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,PetscScalar beta,PetscScalar *B)
190: {
192: PetscBLASInt m,one=1;
195: PetscBLASIntCast(n_*k_,&m);
196: if (beta!=(PetscScalar)1.0) {
197: PetscStackCallBLAS("BLASscal",BLASscal_(&m,&beta,B,&one));
198: PetscLogFlops(m);
199: }
200: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&m,&alpha,A,&one,B,&one));
201: PetscLogFlops(2.0*m);
202: return(0);
203: }
207: /*
208: C := A'*B
210: A' is mxk (ld=k), B is kxn (ld=k), C is mxn (ld=ldc)
211: */
212: PetscErrorCode BVDot_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldc_,const PetscScalar *A,const PetscScalar *B,PetscScalar *C,PetscBool mpi)
213: {
215: PetscScalar zero=0.0,one=1.0,*CC;
216: PetscBLASInt m,n,k,ldc,j,len;
219: PetscBLASIntCast(m_,&m);
220: PetscBLASIntCast(n_,&n);
221: PetscBLASIntCast(k_,&k);
222: PetscBLASIntCast(ldc_,&ldc);
223: if (mpi) {
224: if (ldc==m) {
225: BVAllocateWork_Private(bv,m*n);
226: if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&ldc));
227: else { PetscMemzero(bv->work,m*n*sizeof(PetscScalar)); }
228: PetscMPIIntCast(m*n,&len);
229: MPI_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
230: } else {
231: BVAllocateWork_Private(bv,2*m*n);
232: CC = bv->work+m*n;
233: if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&m));
234: else { PetscMemzero(bv->work,m*n*sizeof(PetscScalar)); }
235: PetscMPIIntCast(m*n,&len);
236: MPI_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
237: for (j=0;j<n;j++) {
238: PetscMemcpy(C+j*ldc,CC+j*m,m*sizeof(PetscScalar));
239: }
240: }
241: } else {
242: if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,C,&ldc));
243: }
244: PetscLogFlops(2.0*m*n*k);
245: return(0);
246: }
250: /*
251: y := A'*x
253: A is nxk (ld=n)
254: */
255: PetscErrorCode BVDotVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *A,const PetscScalar *x,PetscScalar *y,PetscBool mpi)
256: {
258: PetscScalar zero=0.0,done=1.0;
259: PetscBLASInt n,k,one=1,len;
262: PetscBLASIntCast(n_,&n);
263: PetscBLASIntCast(k_,&k);
264: if (mpi) {
265: BVAllocateWork_Private(bv,k);
266: if (n) {
267: PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,bv->work,&one));
268: } else {
269: PetscMemzero(bv->work,k*sizeof(PetscScalar));
270: }
271: PetscMPIIntCast(k,&len);
272: MPI_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
273: } else {
274: if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,y,&one));
275: }
276: PetscLogFlops(2.0*n*k);
277: return(0);
278: }
282: /*
283: Scale n scalars
284: */
285: PetscErrorCode BVScale_BLAS_Private(BV bv,PetscInt n_,PetscScalar *A,PetscScalar alpha)
286: {
288: PetscBLASInt n,one=1;
291: if (alpha == (PetscScalar)0.0) {
292: PetscMemzero(A,n_*sizeof(PetscScalar));
293: } else if (alpha!=(PetscScalar)1.0) {
294: PetscBLASIntCast(n_,&n);
295: PetscStackCallBLAS("BLASscal",BLASscal_(&n,&alpha,A,&one));
296: PetscLogFlops(n);
297: }
298: return(0);
299: }
303: /*
304: Compute ||A|| for an mxn matrix
305: */
306: PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,NormType type,PetscReal *nrm,PetscBool mpi)
307: {
309: PetscBLASInt m,n,i,j;
310: PetscMPIInt len;
311: PetscReal lnrm,*rwork=NULL,*rwork2=NULL;
314: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
315: PetscBLASIntCast(m_,&m);
316: PetscBLASIntCast(n_,&n);
317: if (type==NORM_FROBENIUS || type==NORM_2) {
318: lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&m,rwork);
319: if (mpi) {
320: lnrm = lnrm*lnrm;
321: MPI_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
322: *nrm = PetscSqrtReal(*nrm);
323: } else *nrm = lnrm;
324: PetscLogFlops(2.0*m*n);
325: } else if (type==NORM_1) {
326: if (mpi) {
327: BVAllocateWork_Private(bv,2*n_);
328: rwork = (PetscReal*)bv->work;
329: rwork2 = rwork+n_;
330: PetscMemzero(rwork,n_*sizeof(PetscReal));
331: PetscMemzero(rwork2,n_*sizeof(PetscReal));
332: for (j=0;j<n_;j++) {
333: for (i=0;i<m_;i++) {
334: rwork[j] += PetscAbsScalar(A[i+j*m_]);
335: }
336: }
337: PetscMPIIntCast(n_,&len);
338: MPI_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
339: *nrm = 0.0;
340: for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
341: } else {
342: *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&m,rwork);
343: }
344: PetscLogFlops(1.0*m*n);
345: } else if (type==NORM_INFINITY) {
346: BVAllocateWork_Private(bv,m_);
347: rwork = (PetscReal*)bv->work;
348: lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&m,rwork);
349: if (mpi) {
350: MPI_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv));
351: } else *nrm = lnrm;
352: PetscLogFlops(1.0*m*n);
353: }
354: PetscFPTrapPop();
355: return(0);
356: }
360: /*
361: QR factorization of an mxn matrix
362: */
363: PetscErrorCode BVOrthogonalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscBool mpi)
364: {
365: #if defined(PETSC_MISSING_LAPACK_GEQRF) || defined(PETSC_MISSING_LAPACK_ORGQR)
367: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GEQRF/ORGQR - Lapack routines are unavailable");
368: #else
370: PetscBLASInt m,n,i,j,k,l,nb,lwork,info;
371: PetscScalar *tau,*work,*Rl=NULL,*A=NULL,*C=NULL,one=1.0,zero=0.0;
372: PetscMPIInt rank,size,len;
375: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
376: PetscBLASIntCast(m_,&m);
377: PetscBLASIntCast(n_,&n);
378: k = PetscMin(m,n);
379: nb = 16;
380: if (mpi) {
381: MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank);
382: MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
383: BVAllocateWork_Private(bv,k+n*nb+n*n+n*n*size+m*n);
384: } else {
385: BVAllocateWork_Private(bv,k+n*nb);
386: }
387: tau = bv->work;
388: work = bv->work+k;
389: PetscBLASIntCast(n*nb,&lwork);
390: if (mpi) {
391: Rl = bv->work+k+n*nb;
392: A = bv->work+k+n*nb+n*n;
393: C = bv->work+k+n*nb+n*n+n*n*size;
394: }
396: /* Compute QR */
397: PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&m,tau,work,&lwork,&info));
398: if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGEQRF %d",info);
400: /* Extract R */
401: if (R || mpi) {
402: PetscMemzero(mpi? Rl: R,n*n*sizeof(PetscScalar));
403: for (j=0;j<n;j++) {
404: for (i=0;i<=j;i++) {
405: if (mpi) Rl[i+j*n] = Q[i+j*m];
406: else R[i+j*n] = Q[i+j*m];
407: }
408: }
409: }
411: /* Compute orthogonal matrix in Q */
412: PetscStackCallBLAS("LAPACKungqr",LAPACKungqr_(&m,&n,&k,Q,&m,tau,work,&lwork,&info));
413: if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xORGQR %d",info);
415: if (mpi) {
417: /* Stack triangular matrices */
418: PetscBLASIntCast(n*size,&l);
419: PetscMPIIntCast(n,&len);
420: for (j=0;j<n;j++) {
421: MPI_Allgather(Rl+j*n,len,MPIU_SCALAR,A+j*l,len,MPIU_SCALAR,PetscObjectComm((PetscObject)bv));
422: }
424: /* Compute QR */
425: PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
426: if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGEQRF %d",info);
428: /* Extract R */
429: if (R) {
430: PetscMemzero(R,n*n*sizeof(PetscScalar));
431: for (j=0;j<n;j++)
432: for (i=0;i<=j;i++)
433: R[i+j*n] = A[i+j*l];
434: }
436: /* Accumulate orthogonal matrix */
437: PetscStackCallBLAS("LAPACKungqr",LAPACKungqr_(&l,&n,&n,A,&l,tau,work,&lwork,&info));
438: if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xORGQR %d",info);
439: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&m,&n,&n,&one,Q,&m,A+rank*n,&l,&zero,C,&m));
440: PetscMemcpy(Q,C,m*n*sizeof(PetscScalar));
441: }
443: PetscLogFlops(3.0*m*n*n);
444: PetscFPTrapPop();
445: return(0);
446: #endif
447: }