Actual source code: mpimatmatmult.c
petsc-3.7.7 2017-09-25
2: /*
3: Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4: C = A * B
5: */
6: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7: #include <../src/mat/utils/freespace.h>
8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
9: #include <petscbt.h>
10: #include <../src/mat/impls/dense/mpi/mpidense.h>
11: #include <petsc/private/vecimpl.h>
15: PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16: {
18: const char *algTypes[2] = {"scalable","nonscalable"};
19: PetscInt alg=1; /* set default algorithm */
20: MPI_Comm comm;
23: if (scall == MAT_INITIAL_MATRIX) {
24: PetscObjectGetComm((PetscObject)A,&comm);
25: if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);
27: PetscObjectOptionsBegin((PetscObject)A);
28: PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[1],&alg,NULL);
29: PetscOptionsEnd();
31: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
32: switch (alg) {
33: case 1:
34: MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);
35: break;
36: default:
37: MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);
38: break;
39: }
40: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
41: }
42: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
43: (*(*C)->ops->matmultnumeric)(A,B,*C);
44: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
45: return(0);
46: }
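/*
   Usage sketch (illustrative only, not part of this file); it assumes A and B are
   assembled MATMPIAIJ matrices whose column/row layouts are compatible:

      Mat            C;
      PetscErrorCode ierr;
      ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
      ...  change numerical values of A and/or B, keeping their nonzero patterns  ...
      ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
      ierr = MatDestroy(&C);CHKERRQ(ierr);

   The symbolic algorithm is selected at runtime:
      -matmatmult_via nonscalable   (default; dense accumulator row of length B->cmap->N)
      -matmatmult_via scalable      (condensed linked list; lower memory)
*/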
50: PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
51: {
53: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
54: Mat_PtAPMPI *ptap = a->ptap;
57: PetscFree2(ptap->startsj_s,ptap->startsj_r);
58: PetscFree(ptap->bufa);
59: MatDestroy(&ptap->P_loc);
60: MatDestroy(&ptap->P_oth);
61: MatDestroy(&ptap->Pt);
62: PetscFree(ptap->api);
63: PetscFree(ptap->apj);
64: PetscFree(ptap->apa);
65: ptap->destroy(A);
66: PetscFree(ptap);
67: return(0);
68: }
72: PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
73: {
75: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
76: Mat_PtAPMPI *ptap = a->ptap;
79: (*ptap->duplicate)(A,op,M);
81: (*M)->ops->destroy = ptap->destroy; /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
82: (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
83: return(0);
84: }
88: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
89: {
91: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
92: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
93: Mat_SeqAIJ *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
94: PetscScalar *cda=cd->a,*coa=co->a;
95: Mat_SeqAIJ *p_loc,*p_oth;
96: PetscScalar *apa,*ca;
97: PetscInt cm =C->rmap->n;
98: Mat_PtAPMPI *ptap=c->ptap;
99: PetscInt *api,*apj,*apJ,i,k;
100: PetscInt cstart=C->cmap->rstart;
101: PetscInt cdnz,conz,k0,k1;
102: MPI_Comm comm;
103: PetscMPIInt size;
106: PetscObjectGetComm((PetscObject)A,&comm);
107: MPI_Comm_size(comm,&size);
109: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
110: /*-----------------------------------------------------*/
111: /* update numerical values of P_oth and P_loc */
112: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
113: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
115: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
116: /*----------------------------------------------------------*/
117: /* get data from symbolic products */
118: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
119: p_oth = NULL;
120: if (size >1) {
121: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
122: }
124: /* get apa for storing dense row A[i,:]*P */
125: apa = ptap->apa;
127: api = ptap->api;
128: apj = ptap->apj;
129: for (i=0; i<cm; i++) {
130: /* compute apa = A[i,:]*P */
131: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
133: /* set values in C */
134: apJ = apj + api[i];
135: cdnz = cd->i[i+1] - cd->i[i];
136: conz = co->i[i+1] - co->i[i];
138: /* 1st off-diagonal part of C */
139: ca = coa + co->i[i];
140: k = 0;
141: for (k0=0; k0<conz; k0++) {
142: if (apJ[k] >= cstart) break;
143: ca[k0] = apa[apJ[k]];
144: apa[apJ[k]] = 0.0;
145: k++;
146: }
148: /* diagonal part of C */
149: ca = cda + cd->i[i];
150: for (k1=0; k1<cdnz; k1++) {
151: ca[k1] = apa[apJ[k]];
152: apa[apJ[k]] = 0.0;
153: k++;
154: }
156: /* 2nd off-diagonal part of C */
157: ca = coa + co->i[i];
158: for (; k0<conz; k0++) {
159: ca[k0] = apa[apJ[k]];
160: apa[apJ[k]] = 0.0;
161: k++;
162: }
163: }
164: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
165: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
166: return(0);
167: }
171: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
172: {
173: PetscErrorCode ierr;
174: MPI_Comm comm;
175: PetscMPIInt size;
176: Mat Cmpi;
177: Mat_PtAPMPI *ptap;
178: PetscFreeSpaceList free_space=NULL,current_space=NULL;
179: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c;
180: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
181: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
182: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
183: PetscInt *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
184: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n,Crmax;
185: PetscBT lnkbt;
186: PetscScalar *apa;
187: PetscReal afill;
188: PetscTable ta;
191: PetscObjectGetComm((PetscObject)A,&comm);
192: MPI_Comm_size(comm,&size);
194: /* create struct Mat_PtAPMPI and attach it to C later */
195: PetscNew(&ptap);
197: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
198: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
200: /* get P_loc by taking all local rows of P */
201: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
203: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
204: pi_loc = p_loc->i; pj_loc = p_loc->j;
205: if (size > 1) {
206: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
207: pi_oth = p_oth->i; pj_oth = p_oth->j;
208: } else {
209: p_oth = NULL;
210: pi_oth = NULL; pj_oth = NULL;
211: }
213: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
214: /*-------------------------------------------------------------------*/
215: PetscMalloc1(am+2,&api);
216: ptap->api = api;
217: api[0] = 0;
219: /* create and initialize a linked list -- TODO: replace it with PetscBTCreate()! */
220: PetscTableCreate(pn,pN,&ta);
221: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
222: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
223: PetscTableGetCount(ta,&Crmax);
224: PetscTableDestroy(&ta);
226: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
228: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
229: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
230: current_space = free_space;
232: MatPreallocateInitialize(comm,am,pn,dnz,onz);
233: for (i=0; i<am; i++) {
234: /* diagonal portion of A */
235: nzi = adi[i+1] - adi[i];
236: for (j=0; j<nzi; j++) {
237: row = *adj++;
238: pnz = pi_loc[row+1] - pi_loc[row];
239: Jptr = pj_loc + pi_loc[row];
240: /* add non-zero cols of P into the sorted linked list lnk */
241: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
242: }
243: /* off-diagonal portion of A */
244: nzi = aoi[i+1] - aoi[i];
245: for (j=0; j<nzi; j++) {
246: row = *aoj++;
247: pnz = pi_oth[row+1] - pi_oth[row];
248: Jptr = pj_oth + pi_oth[row];
249: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
250: }
252: apnz = lnk[0];
253: api[i+1] = api[i] + apnz;
255: /* if free space is not available, double the total space in the list */
256: if (current_space->local_remaining<apnz) {
257: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
258: nspacedouble++;
259: }
261: /* Copy data into free space, then initialize lnk */
262: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
263: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
265: current_space->array += apnz;
266: current_space->local_used += apnz;
267: current_space->local_remaining -= apnz;
268: }
270: /* Allocate space for apj, initialize apj, and */
271: /* destroy list of free space and other temporary array(s) */
272: PetscMalloc1(api[am]+1,&ptap->apj);
273: apj = ptap->apj;
274: PetscFreeSpaceContiguous(&free_space,ptap->apj);
275: PetscLLDestroy(lnk,lnkbt);
277: /* malloc apa to store dense row A[i,:]*P */
278: PetscCalloc1(pN,&apa);
280: ptap->apa = apa;
282: /* create and assemble symbolic parallel matrix Cmpi */
283: /*----------------------------------------------------*/
284: MatCreate(comm,&Cmpi);
285: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
286: MatSetBlockSizesFromMats(Cmpi,A,P);
288: MatSetType(Cmpi,MATMPIAIJ);
289: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
290: MatPreallocateFinalize(dnz,onz);
291: for (i=0; i<am; i++) {
292: row = i + rstart;
293: apnz = api[i+1] - api[i];
294: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
295: apj += apnz;
296: }
297: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
298: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
300: ptap->destroy = Cmpi->ops->destroy;
301: ptap->duplicate = Cmpi->ops->duplicate;
302: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
303: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
304: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
306: /* attach the supporting struct to Cmpi for reuse */
307: c = (Mat_MPIAIJ*)Cmpi->data;
308: c->ptap = ptap;
310: *C = Cmpi;
312: /* set MatInfo */
313: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
314: if (afill < 1.0) afill = 1.0;
315: Cmpi->info.mallocs = nspacedouble;
316: Cmpi->info.fill_ratio_given = fill;
317: Cmpi->info.fill_ratio_needed = afill;
319: #if defined(PETSC_USE_INFO)
320: if (api[am]) {
321: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
322: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
323: } else {
324: PetscInfo(Cmpi,"Empty matrix product\n");
325: }
326: #endif
327: return(0);
328: }
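/*
   Sketch of how the MatInfo fields set above can be used to tune the fill argument
   (illustrative only; assumes C was just produced by MatMatMult()):

      MatInfo info;
      ierr = MatGetInfo(C,MAT_LOCAL,&info);CHKERRQ(ierr);
      ierr = PetscPrintf(PETSC_COMM_SELF,"mallocs %g  fill given %g  fill needed %g\n",
                         info.mallocs,info.fill_ratio_given,info.fill_ratio_needed);CHKERRQ(ierr);

   Passing fill >= fill_ratio_needed on the next MAT_INITIAL_MATRIX call avoids the
   reallocations counted in nspacedouble; running with -info prints the same data.
*/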
332: PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
333: {
337: if (scall == MAT_INITIAL_MATRIX) {
338: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
339: MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);
340: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
341: }
342: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
343: MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);
344: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
345: return(0);
346: }
348: typedef struct {
349: Mat workB;
350: PetscScalar *rvalues,*svalues;
351: MPI_Request *rwaits,*swaits;
352: } MPIAIJ_MPIDense;
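/*
   Field summary (added for readability):
      workB           - sequential dense matrix holding the off-process rows of B needed locally
      rvalues,svalues - receive/send buffers for those rows (B->cmap->N values per row)
      rwaits,swaits   - MPI requests for the matching Irecv/Isend calls
   See MatMPIDenseScatter() below for how they are used.
*/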
356: PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
357: {
358: MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
359: PetscErrorCode ierr;
362: MatDestroy(&contents->workB);
363: PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);
364: PetscFree(contents);
365: return(0);
366: }
370: /*
371: This is a "dummy function" that handles the case where matrix C was created as a dense matrix
372: directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option
374: It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except that it does not create C
375: */
376: PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
377: {
378: PetscErrorCode ierr;
379: PetscBool flg;
380: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
381: PetscInt nz = aij->B->cmap->n;
382: PetscContainer container;
383: MPIAIJ_MPIDense *contents;
384: VecScatter ctx = aij->Mvctx;
385: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
386: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
389: PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);
390: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");
392: /* Handle the case where the user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX */
393: PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);
394: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");
396: C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
398: PetscNew(&contents);
399: /* Create work matrix used to store off processor rows of B needed for local product */
400: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
401: /* Create work arrays needed */
402: PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
403: B->cmap->N*to->starts[to->n],&contents->svalues,
404: from->n,&contents->rwaits,
405: to->n,&contents->swaits);
407: PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
408: PetscContainerSetPointer(container,contents);
409: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
410: PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);
411: PetscContainerDestroy(&container);
413: (*C->ops->matmultnumeric)(A,B,C);
414: return(0);
415: }
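/*
   Sketch of the reuse path this routine supports (illustrative only; all names are
   placeholders): the user creates the product matrix once and hands it back with
   MAT_REUSE_MATRIX instead of letting MatMatMult() create it, where m = A->rmap->n,
   n = B->cmap->n and M,N are the corresponding global sizes:

      Mat C;
      ierr = MatCreateDense(comm,m,n,M,N,NULL,&C);CHKERRQ(ierr);
      ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);

   The first numeric call is then handled by MatMatMultNumeric_MPIDense() above, which
   attaches the "workB" container before dispatching to MatMatMultNumeric_MPIAIJ_MPIDense().
*/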
419: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
420: {
421: PetscErrorCode ierr;
422: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
423: PetscInt nz = aij->B->cmap->n;
424: PetscContainer container;
425: MPIAIJ_MPIDense *contents;
426: VecScatter ctx = aij->Mvctx;
427: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
428: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
429: PetscInt m = A->rmap->n,n=B->cmap->n;
432: MatCreate(PetscObjectComm((PetscObject)B),C);
433: MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);
434: MatSetBlockSizesFromMats(*C,A,B);
435: MatSetType(*C,MATMPIDENSE);
436: MatMPIDenseSetPreallocation(*C,NULL);
437: MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
438: MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);
440: (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
442: PetscNew(&contents);
443: /* Create work matrix used to store off processor rows of B needed for local product */
444: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
445: /* Create work arrays needed */
446: PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
447: B->cmap->N*to->starts[to->n],&contents->svalues,
448: from->n,&contents->rwaits,
449: to->n,&contents->swaits);
451: PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
452: PetscContainerSetPointer(container,contents);
453: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
454: PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);
455: PetscContainerDestroy(&container);
456: return(0);
457: }
461: /*
462: Performs an efficient scatter on the rows of B needed by this process; this is
463: a modification of the VecScatterBegin_() routines.
464: */
465: PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
466: {
467: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
468: PetscErrorCode ierr;
469: PetscScalar *b,*w,*svalues,*rvalues;
470: VecScatter ctx = aij->Mvctx;
471: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
472: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
473: PetscInt i,j,k;
474: PetscInt *sindices,*sstarts,*rindices,*rstarts;
475: PetscMPIInt *sprocs,*rprocs,nrecvs;
476: MPI_Request *swaits,*rwaits;
477: MPI_Comm comm;
478: PetscMPIInt tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
479: MPI_Status status;
480: MPIAIJ_MPIDense *contents;
481: PetscContainer container;
482: Mat workB;
485: PetscObjectGetComm((PetscObject)A,&comm);
486: PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);
487: if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
488: PetscContainerGetPointer(container,(void**)&contents);
490: workB = *outworkB = contents->workB;
491: if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
492: sindices = to->indices;
493: sstarts = to->starts;
494: sprocs = to->procs;
495: swaits = contents->swaits;
496: svalues = contents->svalues;
498: rindices = from->indices;
499: rstarts = from->starts;
500: rprocs = from->procs;
501: rwaits = contents->rwaits;
502: rvalues = contents->rvalues;
504: MatDenseGetArray(B,&b);
505: MatDenseGetArray(workB,&w);
507: for (i=0; i<from->n; i++) {
508: MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
509: }
511: for (i=0; i<to->n; i++) {
512: /* pack a message at a time */
513: for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
514: for (k=0; k<ncols; k++) {
515: svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
516: }
517: }
518: MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
519: }
521: nrecvs = from->n;
522: while (nrecvs) {
523: MPI_Waitany(from->n,rwaits,&imdex,&status);
524: nrecvs--;
525: /* unpack a message at a time */
526: for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
527: for (k=0; k<ncols; k++) {
528: w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
529: }
530: }
531: }
532: if (to->n) {MPI_Waitall(to->n,swaits,to->sstatus);}
534: MatDenseRestoreArray(B,&b);
535: MatDenseRestoreArray(workB,&w);
536: MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);
537: MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);
538: return(0);
539: }
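/*
   The packing/unpacking above relies on the column-major storage of the local dense
   arrays: entry (row r, column k) of B's local block is b[r + nrowsB*k] and the matching
   entry of workB is w[r + nrows*k].  Minimal sketch of copying one needed row r of B into
   row s of workB (names are placeholders):

      PetscInt k;
      for (k=0; k<ncols; k++) w[s + nrows*k] = b[r + nrowsB*k];
*/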
540: extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
544: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
545: {
547: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
548: Mat_MPIDense *bdense = (Mat_MPIDense*)B->data;
549: Mat_MPIDense *cdense = (Mat_MPIDense*)C->data;
550: Mat workB;
553: /* diagonal block of A times all local rows of B*/
554: MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);
556: /* get off processor parts of B needed to complete the product */
557: MatMPIDenseScatter(A,B,C,&workB);
559: /* off-diagonal block of A times nonlocal rows of B */
560: MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);
561: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
562: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
563: return(0);
564: }
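/*
   Summary of the routine above (no additional code): per process it computes

      C_loc = A_diag * B_loc + A_off * workB

   where A_diag/A_off are the diagonal and off-diagonal blocks of A owned by this process,
   B_loc is the locally owned block of rows of B, and workB holds the off-process rows of B
   gathered by MatMPIDenseScatter().
*/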
568: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
569: {
571: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
572: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
573: Mat_SeqAIJ *cd = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
574: PetscInt *adi = ad->i,*adj,*aoi=ao->i,*aoj;
575: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
576: Mat_SeqAIJ *p_loc,*p_oth;
577: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
578: PetscScalar *pa_loc,*pa_oth,*pa,valtmp,*ca;
579: PetscInt cm = C->rmap->n,anz,pnz;
580: Mat_PtAPMPI *ptap = c->ptap;
581: PetscScalar *apa_sparse = ptap->apa;
582: PetscInt *api,*apj,*apJ,i,j,k,row;
583: PetscInt cstart = C->cmap->rstart;
584: PetscInt cdnz,conz,k0,k1,nextp;
585: MPI_Comm comm;
586: PetscMPIInt size;
589: PetscObjectGetComm((PetscObject)A,&comm);
590: MPI_Comm_size(comm,&size);
592: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
593: /*-----------------------------------------------------*/
594: /* update numerical values of P_oth and P_loc */
595: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
596: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
598: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
599: /*----------------------------------------------------------*/
600: /* get data from symbolic products */
601: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
602: pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
603: if (size >1) {
604: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
605: pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
606: } else {
607: p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
608: }
610: api = ptap->api;
611: apj = ptap->apj;
612: for (i=0; i<cm; i++) {
613: apJ = apj + api[i];
615: /* diagonal portion of A */
616: anz = adi[i+1] - adi[i];
617: adj = ad->j + adi[i];
618: ada = ad->a + adi[i];
619: for (j=0; j<anz; j++) {
620: row = adj[j];
621: pnz = pi_loc[row+1] - pi_loc[row];
622: pj = pj_loc + pi_loc[row];
623: pa = pa_loc + pi_loc[row];
624: /* perform sparse axpy */
625: valtmp = ada[j];
626: nextp = 0;
627: for (k=0; nextp<pnz; k++) {
628: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
629: apa_sparse[k] += valtmp*pa[nextp++];
630: }
631: }
632: PetscLogFlops(2.0*pnz);
633: }
635: /* off-diagonal portion of A */
636: anz = aoi[i+1] - aoi[i];
637: aoj = ao->j + aoi[i];
638: aoa = ao->a + aoi[i];
639: for (j=0; j<anz; j++) {
640: row = aoj[j];
641: pnz = pi_oth[row+1] - pi_oth[row];
642: pj = pj_oth + pi_oth[row];
643: pa = pa_oth + pi_oth[row];
644: /* perform sparse axpy */
645: valtmp = aoa[j];
646: nextp = 0;
647: for (k=0; nextp<pnz; k++) {
648: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
649: apa_sparse[k] += valtmp*pa[nextp++];
650: }
651: }
652: PetscLogFlops(2.0*pnz);
653: }
655: /* set values in C */
656: cdnz = cd->i[i+1] - cd->i[i];
657: conz = co->i[i+1] - co->i[i];
659: /* 1st off-diagonal part of C */
660: ca = coa + co->i[i];
661: k = 0;
662: for (k0=0; k0<conz; k0++) {
663: if (apJ[k] >= cstart) break;
664: ca[k0] = apa_sparse[k];
665: apa_sparse[k] = 0.0;
666: k++;
667: }
669: /* diagonal part of C */
670: ca = cda + cd->i[i];
671: for (k1=0; k1<cdnz; k1++) {
672: ca[k1] = apa_sparse[k];
673: apa_sparse[k] = 0.0;
674: k++;
675: }
677: /* 2nd off-diagonal part of C */
678: ca = coa + co->i[i];
679: for (; k0<conz; k0++) {
680: ca[k0] = apa_sparse[k];
681: apa_sparse[k] = 0.0;
682: k++;
683: }
684: }
685: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
686: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
687: return(0);
688: }
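/*
   The "sparse axpy" above is a two-pointer merge: apJ, the sorted column pattern of row i
   of AP computed in the symbolic phase, contains the sorted column pattern pj of each P row
   being added, so one forward scan suffices.  Standalone sketch of the same kernel
   (illustrative only):

      static void SparseAXPY(PetscInt pnz,const PetscInt *pj,const PetscScalar *pa,
                             const PetscInt *apJ,PetscScalar *apa,PetscScalar alpha)
      {
        PetscInt k,nextp = 0;
        for (k=0; nextp<pnz; k++) {
          if (apJ[k] == pj[nextp]) apa[k] += alpha*pa[nextp++];
        }
      }
*/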
690: /* Same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses the scalable LLCondensed routines to avoid the O(BN) memory requirement */
693: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
694: {
695: PetscErrorCode ierr;
696: MPI_Comm comm;
697: PetscMPIInt size;
698: Mat Cmpi;
699: Mat_PtAPMPI *ptap;
700: PetscFreeSpaceList free_space = NULL,current_space=NULL;
701: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c;
702: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
703: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
704: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
705: PetscInt i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max;
706: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
707: PetscReal afill;
708: PetscScalar *apa;
709: PetscTable ta;
712: PetscObjectGetComm((PetscObject)A,&comm);
713: MPI_Comm_size(comm,&size);
715: /* create struct Mat_PtAPMPI and attach it to C later */
716: PetscNew(&ptap);
718: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
719: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
721: /* get P_loc by taking all local rows of P */
722: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
724: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
725: pi_loc = p_loc->i; pj_loc = p_loc->j;
726: if (size > 1) {
727: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
728: pi_oth = p_oth->i; pj_oth = p_oth->j;
729: } else {
730: p_oth = NULL;
731: pi_oth = NULL; pj_oth = NULL;
732: }
734: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
735: /*-------------------------------------------------------------------*/
736: PetscMalloc1(am+2,&api);
737: ptap->api = api;
738: api[0] = 0;
740: /* create and initialize a linked list */
741: PetscTableCreate(pn,pN,&ta);
743: /* Calculate apnz_max */
744: apnz_max = 0;
745: for (i=0; i<am; i++) {
746: PetscTableRemoveAll(ta);
747: /* diagonal portion of A */
748: nzi = adi[i+1] - adi[i];
749: Jptr = adj+adi[i]; /* cols of A_diag */
750: MatMergeRows_SeqAIJ(p_loc,nzi,Jptr,ta);
751: PetscTableGetCount(ta,&apnz);
752: if (apnz_max < apnz) apnz_max = apnz;
754: /* off-diagonal portion of A */
755: nzi = aoi[i+1] - aoi[i];
756: Jptr = aoj+aoi[i]; /* cols of A_off */
757: MatMergeRows_SeqAIJ(p_oth,nzi,Jptr,ta);
758: PetscTableGetCount(ta,&apnz);
759: if (apnz_max < apnz) apnz_max = apnz;
760: }
761: PetscTableDestroy(&ta);
763: PetscLLCondensedCreate_Scalable(apnz_max,&lnk);
765: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
766: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
767: current_space = free_space;
768: MatPreallocateInitialize(comm,am,pn,dnz,onz);
769: for (i=0; i<am; i++) {
770: /* diagonal portion of A */
771: nzi = adi[i+1] - adi[i];
772: for (j=0; j<nzi; j++) {
773: row = *adj++;
774: pnz = pi_loc[row+1] - pi_loc[row];
775: Jptr = pj_loc + pi_loc[row];
776: /* add non-zero cols of P into the sorted linked list lnk */
777: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
778: }
779: /* off-diagonal portion of A */
780: nzi = aoi[i+1] - aoi[i];
781: for (j=0; j<nzi; j++) {
782: row = *aoj++;
783: pnz = pi_oth[row+1] - pi_oth[row];
784: Jptr = pj_oth + pi_oth[row];
785: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
786: }
788: apnz = *lnk;
789: api[i+1] = api[i] + apnz;
791: /* if free space is not available, double the total space in the list */
792: if (current_space->local_remaining<apnz) {
793: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
794: nspacedouble++;
795: }
797: /* Copy data into free space, then initialize lnk */
798: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
799: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
801: current_space->array += apnz;
802: current_space->local_used += apnz;
803: current_space->local_remaining -= apnz;
804: }
806: /* Allocate space for apj, initialize apj, and */
807: /* destroy list of free space and other temporary array(s) */
808: PetscMalloc1(api[am]+1,&ptap->apj);
809: apj = ptap->apj;
810: PetscFreeSpaceContiguous(&free_space,ptap->apj);
811: PetscLLCondensedDestroy_Scalable(lnk);
813: /* create and assemble symbolic parallel matrix Cmpi */
814: /*----------------------------------------------------*/
815: MatCreate(comm,&Cmpi);
816: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
817: MatSetBlockSizesFromMats(Cmpi,A,P);
818: MatSetType(Cmpi,MATMPIAIJ);
819: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
820: MatPreallocateFinalize(dnz,onz);
822: /* malloc apa for assembling Cmpi */
823: PetscCalloc1(apnz_max,&apa);
825: ptap->apa = apa;
826: for (i=0; i<am; i++) {
827: row = i + rstart;
828: apnz = api[i+1] - api[i];
829: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
830: apj += apnz;
831: }
832: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
833: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
835: ptap->destroy = Cmpi->ops->destroy;
836: ptap->duplicate = Cmpi->ops->duplicate;
837: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
838: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
839: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
841: /* attach the supporting struct to Cmpi for reuse */
842: c = (Mat_MPIAIJ*)Cmpi->data;
843: c->ptap = ptap;
845: *C = Cmpi;
847: /* set MatInfo */
848: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
849: if (afill < 1.0) afill = 1.0;
850: Cmpi->info.mallocs = nspacedouble;
851: Cmpi->info.fill_ratio_given = fill;
852: Cmpi->info.fill_ratio_needed = afill;
854: #if defined(PETSC_USE_INFO)
855: if (api[am]) {
856: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
857: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
858: } else {
859: PetscInfo(Cmpi,"Empty matrix product\n");
860: }
861: #endif
862: return(0);
863: }
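/*
   Memory note: the nonscalable variant allocates the accumulator row apa with pN = P->cmap->N
   entries (the global number of columns of P), while the routine above allocates only
   apnz_max entries, the largest number of nonzeros in any local row of AP.  For a problem
   with, say, millions of global columns but only a few hundred nonzeros per row the
   difference is several orders of magnitude per process (rough illustration, not a
   measurement).
*/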
865: /*-------------------------------------------------------------------------*/
868: PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
869: {
871: const char *algTypes[3] = {"scalable","nonscalable","matmatmult"};
872: PetscInt alg=0; /* set default algorithm */
875: if (scall == MAT_INITIAL_MATRIX) {
876: PetscObjectOptionsBegin((PetscObject)A);
877: PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);
878: PetscOptionsEnd();
880: PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);
881: switch (alg) {
882: case 1:
883: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);
884: break;
885: case 2:
886: {
887: Mat Pt;
888: Mat_PtAPMPI *ptap;
889: Mat_MPIAIJ *c;
890: MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);
891: MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);
892: c = (Mat_MPIAIJ*)(*C)->data;
893: ptap = c->ptap;
894: ptap->Pt = Pt;
895: (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
896: return(0);
897: }
898: break;
899: default:
900: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);
901: break;
902: }
903: PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);
904: }
905: PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);
906: (*(*C)->ops->mattransposemultnumeric)(P,A,*C);
907: PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);
908: return(0);
909: }
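/*
   Usage sketch (illustrative only; P and A are assembled MATMPIAIJ matrices with the
   same row layout), computing C = P^T * A:

      Mat            C;
      PetscErrorCode ierr;
      ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);

   The symbolic algorithm is selected at runtime:
      -mattransposematmult_via scalable      (default)
      -mattransposematmult_via nonscalable   (dense axpy in the numeric phase)
      -mattransposematmult_via matmatmult    (form Pt explicitly, then call MatMatMult)
*/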
911: /* This routine only works when scall=MAT_REUSE_MATRIX! */
914: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
915: {
917: Mat_MPIAIJ *c=(Mat_MPIAIJ*)C->data;
918: Mat_PtAPMPI *ptap= c->ptap;
919: Mat Pt=ptap->Pt;
922: MatTranspose(P,MAT_REUSE_MATRIX,&Pt);
923: MatMatMultNumeric(Pt,A,C);
924: return(0);
925: }
927: /* Non-scalable version, use dense axpy */
930: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
931: {
932: PetscErrorCode ierr;
933: Mat_Merge_SeqsToMPI *merge;
934: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
935: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
936: Mat_PtAPMPI *ptap;
937: PetscInt *adj,*aJ;
938: PetscInt i,j,k,anz,pnz,row,*cj;
939: MatScalar *ada,*aval,*ca,valtmp;
940: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
941: MPI_Comm comm;
942: PetscMPIInt size,rank,taga,*len_s;
943: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
944: PetscInt **buf_ri,**buf_rj;
945: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
946: MPI_Request *s_waits,*r_waits;
947: MPI_Status *status;
948: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
949: PetscInt *ai,*aj,*coi,*coj;
950: PetscInt *poJ,*pdJ;
951: Mat A_loc;
952: Mat_SeqAIJ *a_loc;
955: PetscObjectGetComm((PetscObject)C,&comm);
956: MPI_Comm_size(comm,&size);
957: MPI_Comm_rank(comm,&rank);
959: ptap = c->ptap;
960: merge = ptap->merge;
962: /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
963: /*--------------------------------------------------------------*/
964: /* get data from symbolic products */
965: coi = merge->coi; coj = merge->coj;
966: PetscCalloc1(coi[pon]+1,&coa);
968: bi = merge->bi; bj = merge->bj;
969: owners = merge->rowmap->range;
970: PetscCalloc1(bi[cm]+1,&ba);
972: /* get A_loc by taking all local rows of A */
973: A_loc = ptap->A_loc;
974: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
975: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
976: ai = a_loc->i;
977: aj = a_loc->j;
979: PetscCalloc1(A->cmap->N,&aval); /* non-scalable!!! */
981: for (i=0; i<am; i++) {
982: /* 2-a) put A[i,:] to dense array aval */
983: anz = ai[i+1] - ai[i];
984: adj = aj + ai[i];
985: ada = a_loc->a + ai[i];
986: for (j=0; j<anz; j++) {
987: aval[adj[j]] = ada[j];
988: }
990: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
991: /*--------------------------------------------------------------*/
992: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
993: pnz = po->i[i+1] - po->i[i];
994: poJ = po->j + po->i[i];
995: pA = po->a + po->i[i];
996: for (j=0; j<pnz; j++) {
997: row = poJ[j];
998: cnz = coi[row+1] - coi[row];
999: cj = coj + coi[row];
1000: ca = coa + coi[row];
1001: /* perform dense axpy */
1002: valtmp = pA[j];
1003: for (k=0; k<cnz; k++) {
1004: ca[k] += valtmp*aval[cj[k]];
1005: }
1006: PetscLogFlops(2.0*cnz);
1007: }
1009: /* put the value into Cd (diagonal part) */
1010: pnz = pd->i[i+1] - pd->i[i];
1011: pdJ = pd->j + pd->i[i];
1012: pA = pd->a + pd->i[i];
1013: for (j=0; j<pnz; j++) {
1014: row = pdJ[j];
1015: cnz = bi[row+1] - bi[row];
1016: cj = bj + bi[row];
1017: ca = ba + bi[row];
1018: /* perform dense axpy */
1019: valtmp = pA[j];
1020: for (k=0; k<cnz; k++) {
1021: ca[k] += valtmp*aval[cj[k]];
1022: }
1023: PetscLogFlops(2.0*cnz);
1024: }
1026: /* zero the current row of Pt*A */
1027: aJ = aj + ai[i];
1028: for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1029: }
1031: /* 3) send and recv matrix values coa */
1032: /*------------------------------------*/
1033: buf_ri = merge->buf_ri;
1034: buf_rj = merge->buf_rj;
1035: len_s = merge->len_s;
1036: PetscCommGetNewTag(comm,&taga);
1037: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1039: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1040: for (proc=0,k=0; proc<size; proc++) {
1041: if (!len_s[proc]) continue;
1042: i = merge->owners_co[proc];
1043: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1044: k++;
1045: }
1046: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1047: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1049: PetscFree2(s_waits,status);
1050: PetscFree(r_waits);
1051: PetscFree(coa);
1053: /* 4) insert local Cseq and received values into Cmpi */
1054: /*----------------------------------------------------*/
1055: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1056: for (k=0; k<merge->nrecv; k++) {
1057: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1058: nrows = *(buf_ri_k[k]);
1059: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */
1060: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1061: }
1063: for (i=0; i<cm; i++) {
1064: row = owners[rank] + i; /* global row index of C_seq */
1065: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1066: ba_i = ba + bi[i];
1067: bnz = bi[i+1] - bi[i];
1068: /* add received vals into ba */
1069: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1070: /* i-th row */
1071: if (i == *nextrow[k]) {
1072: cnz = *(nextci[k]+1) - *nextci[k];
1073: cj = buf_rj[k] + *(nextci[k]);
1074: ca = abuf_r[k] + *(nextci[k]);
1075: nextcj = 0;
1076: for (j=0; nextcj<cnz; j++) {
1077: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1078: ba_i[j] += ca[nextcj++];
1079: }
1080: }
1081: nextrow[k]++; nextci[k]++;
1082: PetscLogFlops(2.0*cnz);
1083: }
1084: }
1085: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1086: }
1087: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1088: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1090: PetscFree(ba);
1091: PetscFree(abuf_r[0]);
1092: PetscFree(abuf_r);
1093: PetscFree3(buf_ri_k,nextrow,nextci);
1094: PetscFree(aval);
1095: return(0);
1096: }
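/*
   The loop above accumulates C = P^T * A as a sum of outer products over the locally owned
   rows: for each local row i it adds P[i,:]^T * A[i,:], with the contribution split between
   the locally owned rows of C (added into ba) and rows owned by other processes (added into
   coa and shipped in step 3).  A[i,:] is first expanded into the dense work array aval of
   length A->cmap->N, which is why this variant is labelled non-scalable.
*/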
1098: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1099: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1102: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1103: {
1104: PetscErrorCode ierr;
1105: Mat Cmpi,A_loc,POt,PDt;
1106: Mat_PtAPMPI *ptap;
1107: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1108: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1109: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1110: PetscInt nnz;
1111: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1112: PetscInt am=A->rmap->n,pn=P->cmap->n;
1113: PetscBT lnkbt;
1114: MPI_Comm comm;
1115: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1116: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1117: PetscInt len,proc,*dnz,*onz,*owners;
1118: PetscInt nzi,*bi,*bj;
1119: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1120: MPI_Request *swaits,*rwaits;
1121: MPI_Status *sstatus,rstatus;
1122: Mat_Merge_SeqsToMPI *merge;
1123: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1124: PetscReal afill =1.0,afill_tmp;
1125: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
1126: PetscScalar *vals;
1127: Mat_SeqAIJ *a_loc, *pdt,*pot;
1130: PetscObjectGetComm((PetscObject)A,&comm);
1131: /* check if matrix local sizes are compatible */
1132: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1134: MPI_Comm_size(comm,&size);
1135: MPI_Comm_rank(comm,&rank);
1137: /* create struct Mat_PtAPMPI and attach it to C later */
1138: PetscNew(&ptap);
1140: /* get A_loc by taking all local rows of A */
1141: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1143: ptap->A_loc = A_loc;
1145: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1146: ai = a_loc->i;
1147: aj = a_loc->j;
1149: /* determine symbolic Co=(p->B)^T*A - send to others */
1150: /*----------------------------------------------------*/
1151: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1152: pdt = (Mat_SeqAIJ*)PDt->data;
1153: pdti = pdt->i; pdtj = pdt->j;
1155: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1156: pot = (Mat_SeqAIJ*)POt->data;
1157: poti = pot->i; potj = pot->j;
1159: /* then, compute symbolic Co = (p->B)^T*A */
1160: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1161: PetscMalloc1(pon+1,&coi);
1162: coi[0] = 0;
1164: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1165: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1166: PetscFreeSpaceGet(nnz,&free_space);
1167: current_space = free_space;
1169: /* create and initialize a linked list */
1170: PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);
1172: for (i=0; i<pon; i++) {
1173: pnz = poti[i+1] - poti[i];
1174: ptJ = potj + poti[i];
1175: for (j=0; j<pnz; j++) {
1176: row = ptJ[j]; /* row of A_loc == col of Pot */
1177: anz = ai[row+1] - ai[row];
1178: Jptr = aj + ai[row];
1179: /* add non-zero cols of AP into the sorted linked list lnk */
1180: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1181: }
1182: nnz = lnk[0];
1184: /* If free space is not available, double the total space in the list */
1185: if (current_space->local_remaining<nnz) {
1186: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1187: nspacedouble++;
1188: }
1190: /* Copy data into free space, and zero out dense rows */
1191: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1193: current_space->array += nnz;
1194: current_space->local_used += nnz;
1195: current_space->local_remaining -= nnz;
1197: coi[i+1] = coi[i] + nnz;
1198: }
1200: PetscMalloc1(coi[pon]+1,&coj);
1201: PetscFreeSpaceContiguous(&free_space,coj);
1203: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1204: if (afill_tmp > afill) afill = afill_tmp;
1206: /* send j-array (coj) of Co to other processors */
1207: /*----------------------------------------------*/
1208: /* determine row ownership */
1209: PetscNew(&merge);
1210: PetscLayoutCreate(comm,&merge->rowmap);
1212: merge->rowmap->n = pn;
1213: merge->rowmap->bs = 1;
1215: PetscLayoutSetUp(merge->rowmap);
1216: owners = merge->rowmap->range;
1218: /* determine the number of messages to send, their lengths */
1219: PetscCalloc1(size,&len_si);
1220: PetscMalloc1(size,&merge->len_s);
1222: len_s = merge->len_s;
1223: merge->nsend = 0;
1225: PetscMalloc1(size+2,&owners_co);
1226: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1228: proc = 0;
1229: for (i=0; i<pon; i++) {
1230: while (prmap[i] >= owners[proc+1]) proc++;
1231: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1232: len_s[proc] += coi[i+1] - coi[i];
1233: }
1235: len = 0; /* max length of buf_si[] */
1236: owners_co[0] = 0;
1237: for (proc=0; proc<size; proc++) {
1238: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1239: if (len_si[proc]) {
1240: merge->nsend++;
1241: len_si[proc] = 2*(len_si[proc] + 1);
1242: len += len_si[proc];
1243: }
1244: }
1246: /* determine the number and length of messages to receive for coi and coj */
1247: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1248: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1250: /* post the Irecv and Isend of coj */
1251: PetscCommGetNewTag(comm,&tagj);
1252: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1253: PetscMalloc1(merge->nsend+1,&swaits);
1254: for (proc=0, k=0; proc<size; proc++) {
1255: if (!len_s[proc]) continue;
1256: i = owners_co[proc];
1257: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1258: k++;
1259: }
1261: /* receives and sends of coj are complete */
1262: PetscMalloc1(size,&sstatus);
1263: for (i=0; i<merge->nrecv; i++) {
1264: PetscMPIInt icompleted;
1265: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1266: }
1267: PetscFree(rwaits);
1268: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1270: /* send and recv coi */
1271: /*-------------------*/
1272: PetscCommGetNewTag(comm,&tagi);
1273: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1274: PetscMalloc1(len+1,&buf_s);
1275: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1276: for (proc=0,k=0; proc<size; proc++) {
1277: if (!len_s[proc]) continue;
1278: /* form outgoing message for i-structure:
1279: buf_si[0]: nrows to be sent
1280: [1:nrows]: row index (global)
1281: [nrows+1:2*nrows+1]: i-structure index
1282: */
1283: /*-------------------------------------------*/
1284: nrows = len_si[proc]/2 - 1;
1285: buf_si_i = buf_si + nrows+1;
1286: buf_si[0] = nrows;
1287: buf_si_i[0] = 0;
1288: nrows = 0;
1289: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1290: nzi = coi[i+1] - coi[i];
1291: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1292: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1293: nrows++;
1294: }
1295: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1296: k++;
1297: buf_si += len_si[proc];
1298: }
1299: i = merge->nrecv;
1300: while (i--) {
1301: PetscMPIInt icompleted;
1302: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1303: }
1304: PetscFree(rwaits);
1305: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1306: PetscFree(len_si);
1307: PetscFree(len_ri);
1308: PetscFree(swaits);
1309: PetscFree(sstatus);
1310: PetscFree(buf_s);
1312: /* compute the local portion of C (mpi mat) */
1313: /*------------------------------------------*/
1314: /* allocate bi array and free space for accumulating nonzero column info */
1315: PetscMalloc1(pn+1,&bi);
1316: bi[0] = 0;
1318: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1319: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1320: PetscFreeSpaceGet(nnz,&free_space);
1321: current_space = free_space;
1323: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1324: for (k=0; k<merge->nrecv; k++) {
1325: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1326: nrows = *buf_ri_k[k];
1327: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1328: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1329: }
1331: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1332: rmax = 0;
1333: for (i=0; i<pn; i++) {
1334: /* add pdt[i,:]*AP into lnk */
1335: pnz = pdti[i+1] - pdti[i];
1336: ptJ = pdtj + pdti[i];
1337: for (j=0; j<pnz; j++) {
1338: row = ptJ[j]; /* row of AP == col of Pt */
1339: anz = ai[row+1] - ai[row];
1340: Jptr = aj + ai[row];
1341: /* add non-zero cols of AP into the sorted linked list lnk */
1342: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1343: }
1345: /* add received col data into lnk */
1346: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1347: if (i == *nextrow[k]) { /* i-th row */
1348: nzi = *(nextci[k]+1) - *nextci[k];
1349: Jptr = buf_rj[k] + *nextci[k];
1350: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1351: nextrow[k]++; nextci[k]++;
1352: }
1353: }
1354: nnz = lnk[0];
1356: /* if free space is not available, make more free space */
1357: if (current_space->local_remaining<nnz) {
1358: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1359: nspacedouble++;
1360: }
1361: /* copy data into free space, then initialize lnk */
1362: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1363: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1365: current_space->array += nnz;
1366: current_space->local_used += nnz;
1367: current_space->local_remaining -= nnz;
1369: bi[i+1] = bi[i] + nnz;
1370: if (nnz > rmax) rmax = nnz;
1371: }
1372: PetscFree3(buf_ri_k,nextrow,nextci);
1374: PetscMalloc1(bi[pn]+1,&bj);
1375: PetscFreeSpaceContiguous(&free_space,bj);
1377: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1378: if (afill_tmp > afill) afill = afill_tmp;
1379: PetscLLCondensedDestroy(lnk,lnkbt);
1380: MatDestroy(&POt);
1381: MatDestroy(&PDt);
1383: /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part */
1384: /*----------------------------------------------------------------------------------*/
1385: PetscCalloc1(rmax+1,&vals);
1387: MatCreate(comm,&Cmpi);
1388: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1389: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));
1390: MatSetType(Cmpi,MATMPIAIJ);
1391: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1392: MatPreallocateFinalize(dnz,onz);
1393: MatSetBlockSize(Cmpi,1);
1394: for (i=0; i<pn; i++) {
1395: row = i + rstart;
1396: nnz = bi[i+1] - bi[i];
1397: Jptr = bj + bi[i];
1398: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1399: }
1400: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1401: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1402: PetscFree(vals);
1404: merge->bi = bi;
1405: merge->bj = bj;
1406: merge->coi = coi;
1407: merge->coj = coj;
1408: merge->buf_ri = buf_ri;
1409: merge->buf_rj = buf_rj;
1410: merge->owners_co = owners_co;
1412: /* attach the supporting struct to Cmpi for reuse */
1413: c = (Mat_MPIAIJ*)Cmpi->data;
1414: c->ptap = ptap;
1415: ptap->api = NULL;
1416: ptap->apj = NULL;
1417: ptap->merge = merge;
1418: ptap->destroy = Cmpi->ops->destroy;
1419: ptap->duplicate = Cmpi->ops->duplicate;
1421: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1422: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1423: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
1425: *C = Cmpi;
1426: #if defined(PETSC_USE_INFO)
1427: if (bi[pn] != 0) {
1428: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1429: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1430: } else {
1431: PetscInfo(Cmpi,"Empty matrix product\n");
1432: }
1433: #endif
1434: return(0);
1435: }
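/*
   Worked example of the i-structure message buf_si assembled above (illustrative only):
   if two off-diagonal rows with receiver-local indices 2 and 5, carrying 3 and 4 nonzeros
   respectively, are sent to one process, then nrows = 2 and

      buf_si = [ 2,   2, 5,   0, 3, 7 ]
               nrows  rows    running nonzero counts (i-structure)

   so len_si = 2*(nrows+1) = 6, matching the message length computed earlier.
*/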
1439: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1440: {
1441: PetscErrorCode ierr;
1442: Mat_Merge_SeqsToMPI *merge;
1443: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1444: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1445: Mat_PtAPMPI *ptap;
1446: PetscInt *adj;
1447: PetscInt i,j,k,anz,pnz,row,*cj,nexta;
1448: MatScalar *ada,*ca,valtmp;
1449: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1450: MPI_Comm comm;
1451: PetscMPIInt size,rank,taga,*len_s;
1452: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1453: PetscInt **buf_ri,**buf_rj;
1454: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1455: MPI_Request *s_waits,*r_waits;
1456: MPI_Status *status;
1457: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
1458: PetscInt *ai,*aj,*coi,*coj;
1459: PetscInt *poJ,*pdJ;
1460: Mat A_loc;
1461: Mat_SeqAIJ *a_loc;
1464: PetscObjectGetComm((PetscObject)C,&comm);
1465: MPI_Comm_size(comm,&size);
1466: MPI_Comm_rank(comm,&rank);
1468: ptap = c->ptap;
1469: merge = ptap->merge;
1471: /* 2) compute numeric C_seq = P_loc^T*A_loc */
1472: /*------------------------------------------*/
1473: /* get data from symbolic products */
1474: coi = merge->coi; coj = merge->coj;
1475: PetscCalloc1(coi[pon]+1,&coa);
1476: bi = merge->bi; bj = merge->bj;
1477: owners = merge->rowmap->range;
1478: PetscCalloc1(bi[cm]+1,&ba);
1480: /* get A_loc by taking all local rows of A */
1481: A_loc = ptap->A_loc;
1482: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
1483: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1484: ai = a_loc->i;
1485: aj = a_loc->j;
1487: for (i=0; i<am; i++) {
1488: anz = ai[i+1] - ai[i];
1489: adj = aj + ai[i];
1490: ada = a_loc->a + ai[i];
1492: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1493: /*-------------------------------------------------------------*/
1494: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1495: pnz = po->i[i+1] - po->i[i];
1496: poJ = po->j + po->i[i];
1497: pA = po->a + po->i[i];
1498: for (j=0; j<pnz; j++) {
1499: row = poJ[j];
1500: cj = coj + coi[row];
1501: ca = coa + coi[row];
1502: /* perform sparse axpy */
1503: nexta = 0;
1504: valtmp = pA[j];
1505: for (k=0; nexta<anz; k++) {
1506: if (cj[k] == adj[nexta]) {
1507: ca[k] += valtmp*ada[nexta];
1508: nexta++;
1509: }
1510: }
1511: PetscLogFlops(2.0*anz);
1512: }
1514: /* put the value into Cd (diagonal part) */
1515: pnz = pd->i[i+1] - pd->i[i];
1516: pdJ = pd->j + pd->i[i];
1517: pA = pd->a + pd->i[i];
1518: for (j=0; j<pnz; j++) {
1519: row = pdJ[j];
1520: cj = bj + bi[row];
1521: ca = ba + bi[row];
1522: /* perform sparse axpy */
1523: nexta = 0;
1524: valtmp = pA[j];
1525: for (k=0; nexta<anz; k++) {
1526: if (cj[k] == adj[nexta]) {
1527: ca[k] += valtmp*ada[nexta];
1528: nexta++;
1529: }
1530: }
1531: PetscLogFlops(2.0*anz);
1532: }
1533: }
1535: /* 3) send and recv matrix values coa */
1536: /*------------------------------------*/
1537: buf_ri = merge->buf_ri;
1538: buf_rj = merge->buf_rj;
1539: len_s = merge->len_s;
1540: PetscCommGetNewTag(comm,&taga);
1541: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1543: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1544: for (proc=0,k=0; proc<size; proc++) {
1545: if (!len_s[proc]) continue;
1546: i = merge->owners_co[proc];
1547: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1548: k++;
1549: }
1550: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1551: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1553: PetscFree2(s_waits,status);
1554: PetscFree(r_waits);
1555: PetscFree(coa);
1557: /* 4) insert local Cseq and received values into Cmpi */
1558: /*----------------------------------------------------*/
1559: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1560: for (k=0; k<merge->nrecv; k++) {
1561: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1562: nrows = *(buf_ri_k[k]);
1563: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */
1564: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1565: }
1567: for (i=0; i<cm; i++) {
1568: row = owners[rank] + i; /* global row index of C_seq */
1569: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1570: ba_i = ba + bi[i];
1571: bnz = bi[i+1] - bi[i];
1572: /* add received vals into ba */
1573: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1574: /* i-th row */
1575: if (i == *nextrow[k]) {
1576: cnz = *(nextci[k]+1) - *nextci[k];
1577: cj = buf_rj[k] + *(nextci[k]);
1578: ca = abuf_r[k] + *(nextci[k]);
1579: nextcj = 0;
1580: for (j=0; nextcj<cnz; j++) {
1581: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1582: ba_i[j] += ca[nextcj++];
1583: }
1584: }
1585: nextrow[k]++; nextci[k]++;
1586: PetscLogFlops(2.0*cnz);
1587: }
1588: }
1589: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1590: }
1591: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1592: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1594: PetscFree(ba);
1595: PetscFree(abuf_r[0]);
1596: PetscFree(abuf_r);
1597: PetscFree3(buf_ri_k,nextrow,nextci);
1598: return(0);
1599: }
1601: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1602: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1603: it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable() in using PetscLLCondensedCreate_Scalable() */
1606: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1607: {
1608: PetscErrorCode ierr;
1609: Mat Cmpi,A_loc,POt,PDt;
1610: Mat_PtAPMPI *ptap;
1611: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1612: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*a=(Mat_MPIAIJ*)A->data,*c;
1613: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1614: PetscInt nnz;
1615: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1616: PetscInt am =A->rmap->n,pn=P->cmap->n;
1617: MPI_Comm comm;
1618: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1619: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1620: PetscInt len,proc,*dnz,*onz,*owners;
1621: PetscInt nzi,*bi,*bj;
1622: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1623: MPI_Request *swaits,*rwaits;
1624: MPI_Status *sstatus,rstatus;
1625: Mat_Merge_SeqsToMPI *merge;
1626: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1627: PetscReal afill =1.0,afill_tmp;
1628: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax;
1629: PetscScalar *vals;
1630: Mat_SeqAIJ *a_loc,*pdt,*pot;
1631: PetscTable ta;
1634: PetscObjectGetComm((PetscObject)A,&comm);
1635: /* check if matrix local sizes are compatible */
1636: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1638: MPI_Comm_size(comm,&size);
1639: MPI_Comm_rank(comm,&rank);
1641: /* create struct Mat_PtAPMPI and attach it to C later */
1642: PetscNew(&ptap);
1644: /* get A_loc by taking all local rows of A */
1645: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1647: ptap->A_loc = A_loc;
1648: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1649: ai = a_loc->i;
1650: aj = a_loc->j;
1652: /* determine symbolic Co=(p->B)^T*A - send to others */
1653: /*----------------------------------------------------*/
1654: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1655: pdt = (Mat_SeqAIJ*)PDt->data;
1656: pdti = pdt->i; pdtj = pdt->j;
1658: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1659: pot = (Mat_SeqAIJ*)POt->data;
1660: poti = pot->i; potj = pot->j;
1662: /* then, compute symbolic Co = (p->B)^T*A */
1663: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1664: >= (num of nonzero rows of C_seq) - pn */
1665: PetscMalloc1(pon+1,&coi);
1666: coi[0] = 0;
1668: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1669: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1670: PetscFreeSpaceGet(nnz,&free_space);
1671: current_space = free_space;
1673: /* create and initialize a linked list */
1674: PetscTableCreate(A->cmap->n + a->B->cmap->N,aN,&ta);
1675: MatRowMergeMax_SeqAIJ(a_loc,am,ta);
1676: PetscTableGetCount(ta,&Armax);
1678: PetscLLCondensedCreate_Scalable(Armax,&lnk);
1680: for (i=0; i<pon; i++) {
1681: pnz = poti[i+1] - poti[i];
1682: ptJ = potj + poti[i];
1683: for (j=0; j<pnz; j++) {
1684: row = ptJ[j]; /* row of A_loc == col of Pot */
1685: anz = ai[row+1] - ai[row];
1686: Jptr = aj + ai[row];
1687: /* add non-zero cols of AP into the sorted linked list lnk */
1688: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1689: }
1690: nnz = lnk[0];
1692: /* If free space is not available, double the total space in the list */
1693: if (current_space->local_remaining<nnz) {
1694: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1695: nspacedouble++;
1696: }
1698: /* Copy data into free space, and zero out dense rows */
1699: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1701: current_space->array += nnz;
1702: current_space->local_used += nnz;
1703: current_space->local_remaining -= nnz;
1705: coi[i+1] = coi[i] + nnz;
1706: }
1708: PetscMalloc1(coi[pon]+1,&coj);
1709: PetscFreeSpaceContiguous(&free_space,coj);
1710: PetscLLCondensedDestroy_Scalable(lnk); /* must destroy to get a new one for C */
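/* coi/coj now hold, in CSR form, the nonzero pattern of Co = (p->B)^T * A_loc: row i of Co
   is the merge of the A_loc rows selected by column i of p->B and corresponds to global row
   prmap[i] of C, which is owned by another process and is shipped out below. For example,
   if Pot row i has columns {2,5} and A_loc rows 2 and 5 have column patterns {0,3} and
   {3,7}, the merged pattern of that Co row is {0,3,7}, i.e. nnz = 3. */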
1712: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1713: if (afill_tmp > afill) afill = afill_tmp;
1715: /* send j-array (coj) of Co to other processors */
1716: /*----------------------------------------------*/
1717: /* determine row ownership */
1718: PetscNew(&merge);
1719: PetscLayoutCreate(comm,&merge->rowmap);
1721: merge->rowmap->n = pn;
1722: merge->rowmap->bs = 1;
1724: PetscLayoutSetUp(merge->rowmap);
1725: owners = merge->rowmap->range;
1727: /* determine the number of messages to send, their lengths */
1728: PetscCalloc1(size,&len_si);
1729: PetscMalloc1(size,&merge->len_s);
1731: len_s = merge->len_s;
1732: merge->nsend = 0;
1734: PetscMalloc1(size+2,&owners_co);
1735: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1737: proc = 0;
1738: for (i=0; i<pon; i++) {
1739: while (prmap[i] >= owners[proc+1]) proc++;
1740: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1741: len_s[proc] += coi[i+1] - coi[i];
1742: }
1744: len = 0; /* max length of buf_si[] */
1745: owners_co[0] = 0;
1746: for (proc=0; proc<size; proc++) {
1747: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1748: if (len_si[proc]) {
1749: merge->nsend++;
1750: len_si[proc] = 2*(len_si[proc] + 1);
1751: len += len_si[proc];
1752: }
1753: }
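/* After this loop len_si[proc] is the integer length of the i-structure message destined
   for proc: a row count, nrows local row indices, and nrows+1 cumulative offsets, i.e.
   2*(nrows+1) entries; len_s[proc] is the number of column indices (the j-message), and
   owners_co[] records where each process's block of Co rows starts. */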
1755: /* determine the number and length of messages to receive for coi and coj */
1756: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1757: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1759: /* post the Irecv and Isend of coj */
1760: PetscCommGetNewTag(comm,&tagj);
1761: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1762: PetscMalloc1(merge->nsend+1,&swaits);
1763: for (proc=0, k=0; proc<size; proc++) {
1764: if (!len_s[proc]) continue;
1765: i = owners_co[proc];
1766: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1767: k++;
1768: }
1770: /* receives and sends of coj are complete */
1771: PetscMalloc1(size,&sstatus);
1772: for (i=0; i<merge->nrecv; i++) {
1773: PetscMPIInt icompleted;
1774: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1775: }
1776: PetscFree(rwaits);
1777: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1779: /* add received column indices into table to update Armax */
1780: /* Armax can be as large as aN if a row of P is dense; see src/ksp/ksp/examples/tutorials/ex56.c! */
1781: for (k=0; k<merge->nrecv; k++) {/* k-th received message */
1782: Jptr = buf_rj[k];
1783: for (j=0; j<merge->len_r[k]; j++) {
1784: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
1785: }
1786: }
1787: PetscTableGetCount(ta,&Armax);
1788: /* printf("Armax %d, an %d + Bn %d = %d, aN %d\n",Armax,A->cmap->n,a->B->cmap->N,A->cmap->n+a->B->cmap->N,aN); */
1790: /* send and recv coi */
1791: /*-------------------*/
1792: PetscCommGetNewTag(comm,&tagi);
1793: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1794: PetscMalloc1(len+1,&buf_s);
1795: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1796: for (proc=0,k=0; proc<size; proc++) {
1797: if (!len_s[proc]) continue;
1798: /* form outgoing message for i-structure:
1799: buf_si[0]: nrows to be sent
1800: [1:nrows]: row index (local to the receiving process)
1801: [nrows+1:2*nrows+1]: i-structure index
1802: */
1803: /*-------------------------------------------*/
1804: nrows = len_si[proc]/2 - 1;
1805: buf_si_i = buf_si + nrows+1;
1806: buf_si[0] = nrows;
1807: buf_si_i[0] = 0;
1808: nrows = 0;
1809: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1810: nzi = coi[i+1] - coi[i];
1811: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1812: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1813: nrows++;
1814: }
1815: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1816: k++;
1817: buf_si += len_si[proc];
1818: }
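/* Example of the packing above: a message carrying nrows = 2 rows with local row indices 4
   and 7 and 3 and 5 nonzeros respectively is sent as buf_si = {2, 4, 7, 0, 3, 8}: the row
   count, the two local row indices, then the cumulative i-structure offsets. */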
1819: i = merge->nrecv;
1820: while (i--) {
1821: PetscMPIInt icompleted;
1822: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1823: }
1824: PetscFree(rwaits);
1825: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1826: PetscFree(len_si);
1827: PetscFree(len_ri);
1828: PetscFree(swaits);
1829: PetscFree(sstatus);
1830: PetscFree(buf_s);
1832: /* compute the local portion of C (mpi mat) */
1833: /*------------------------------------------*/
1834: /* allocate bi array and free space for accumulating nonzero column info */
1835: PetscMalloc1(pn+1,&bi);
1836: bi[0] = 0;
1838: /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1839: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1840: PetscFreeSpaceGet(nnz,&free_space);
1841: current_space = free_space;
1843: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1844: for (k=0; k<merge->nrecv; k++) {
1845: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1846: nrows = *buf_ri_k[k];
1847: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
1848: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next entry in the k-th received i-structure */
1849: }
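/* buf_ri_k[k] walks the k-th received i-structure: nextrow[k] yields the next incoming
   local row number and nextci[k] its offset into the column buffer buf_rj[k], so the
   received rows can be merged in order inside the row loop below. */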
1851: PetscLLCondensedCreate_Scalable(Armax,&lnk);
1852: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1853: rmax = 0;
1854: for (i=0; i<pn; i++) {
1855: /* add pdt[i,:]*AP into lnk */
1856: pnz = pdti[i+1] - pdti[i];
1857: ptJ = pdtj + pdti[i];
1858: for (j=0; j<pnz; j++) {
1859: row = ptJ[j]; /* row of AP == col of Pt */
1860: anz = ai[row+1] - ai[row];
1861: Jptr = aj + ai[row];
1862: /* add non-zero cols of AP into the sorted linked list lnk */
1863: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1864: }
1866: /* add received col data into lnk */
1867: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1868: if (i == *nextrow[k]) { /* i-th row */
1869: nzi = *(nextci[k]+1) - *nextci[k];
1870: Jptr = buf_rj[k] + *nextci[k];
1871: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
1872: nextrow[k]++; nextci[k]++;
1873: }
1874: }
1875: nnz = lnk[0];
1877: /* if free space is not available, make more free space */
1878: if (current_space->local_remaining<nnz) {
1879: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1880: nspacedouble++;
1881: }
1882: /* copy data into free space, then initialize lnk */
1883: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1884: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1886: current_space->array += nnz;
1887: current_space->local_used += nnz;
1888: current_space->local_remaining -= nnz;
1890: bi[i+1] = bi[i] + nnz;
1891: if (nnz > rmax) rmax = nnz;
1892: }
1893: PetscFree3(buf_ri_k,nextrow,nextci);
1895: PetscMalloc1(bi[pn]+1,&bj);
1896: PetscFreeSpaceContiguous(&free_space,bj);
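/* bi/bj now hold the CSR pattern of the locally owned rows of C = P^T*A, merging the
   diagonal-block contribution Pd^T*A_loc with the off-process rows received in buf_rj;
   dnz/onz carry the per-row diagonal/off-diagonal counts used for preallocation below,
   and rmax is the longest row, which sizes the scratch array vals. */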
1897: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1898: if (afill_tmp > afill) afill = afill_tmp;
1899: PetscLLCondensedDestroy_Scalable(lnk);
1900: PetscTableDestroy(&ta);
1902: MatDestroy(&POt);
1903: MatDestroy(&PDt);
1905: /* create the symbolic parallel matrix Cmpi (why can this not be assembled in the numeric part?) */
1906: /*----------------------------------------------------------------------------------*/
1907: PetscCalloc1(rmax+1,&vals);
1909: MatCreate(comm,&Cmpi);
1910: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1911: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));
1912: MatSetType(Cmpi,MATMPIAIJ);
1913: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1914: MatPreallocateFinalize(dnz,onz);
1915: MatSetBlockSize(Cmpi,1);
1916: for (i=0; i<pn; i++) {
1917: row = i + rstart;
1918: nnz = bi[i+1] - bi[i];
1919: Jptr = bj + bi[i];
1920: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1921: }
1922: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1923: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1924: PetscFree(vals);
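/* The rows were inserted with explicit zeros (vals) solely to fix the nonzero pattern of
   the symbolic product; MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ later fills these
   locations with the actual values. */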
1926: merge->bi = bi;
1927: merge->bj = bj;
1928: merge->coi = coi;
1929: merge->coj = coj;
1930: merge->buf_ri = buf_ri;
1931: merge->buf_rj = buf_rj;
1932: merge->owners_co = owners_co;
1934: /* attach the supporting struct to Cmpi for reuse */
1935: c = (Mat_MPIAIJ*)Cmpi->data;
1937: c->ptap = ptap;
1938: ptap->api = NULL;
1939: ptap->apj = NULL;
1940: ptap->merge = merge;
1941: ptap->apa = NULL;
1942: ptap->destroy = Cmpi->ops->destroy;
1943: ptap->duplicate = Cmpi->ops->duplicate;
1945: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1946: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1947: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
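/* The merge and ptap structures keep coi/coj, bi/bj, the receive buffers, and the
   communication layout alive so the numeric phase can reuse them; destroy and duplicate are
   overridden so this attached state is released together with Cmpi (MatDestroy_MPIAIJ_PtAP)
   or handled consistently on duplication (MatDuplicate_MPIAIJ_MatPtAP). */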
1949: *C = Cmpi;
1950: #if defined(PETSC_USE_INFO)
1951: if (bi[pn] != 0) {
1952: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1953: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1954: } else {
1955: PetscInfo(Cmpi,"Empty matrix product\n");
1956: }
1957: #endif
1958: return(0);
1959: }