C-DVM - contents | Part 1(1-4) | Part2 (5-11) | Part 3 (Appendixes) |
created: april, 2001 | - last edited 08.10.02 - |
Appendix 1. Examples of C-DVM programs.
Six small scientific programs are presented to illustrate C-DVM language features. They are intended for solving a system of linear equations:
A x = b
where A - matrix of coefficients,
b - vector of free members,
x - vector of unknowns.
The following basic methods are used to illustrate CDVM language features.
Direct methods. The well-known Gaussian Elimination method is the most commonly used algorithm of this class. The main idea of this algorithm is to reduce the matrix A to upper triangular form and then to use backward substitution to diagonalize the matrix.
Explicit iteration methods. Jacobi Relaxation is the most known algorithm of this class. The algorithm performs the following computation iteratively
xi,jnew = ( xi-1,jold + xi,j-1old + xi+1,jold + xi,j+1old ) / 4
Implicit iteration methods. Successive Over Relaxation (SOR) belongs to this class. The algorithm performs the following calculation iteratively
xi,jnew = ( w / 4 ) * ( xi-1,jnew + xi,j-1new + xi+1,jold + xi,j+1old ) + (1-w) * xi,jold
By using “red-black” coloring of variables each step of SOR consists of two half Jacobi steps. One processes “red” variables and the other processes “black” variables. Coloring of variables allows to overlap calculation and communication.
Example 1. The Gauss elimination algorithm.
This is a program to solve a system of linear equations Ax = B by Gauss elimination method. The coefficient matrix A is represented by an array section A[0:N-1][0:N-1], and the vector B is represented by the section A[0:N-1][N] of the same array.
/* GAUSS program */ #include <stdlib.h> #include <stdio.h> #include <math.h> #define DVM(dvmdir) #define DO(v,l,h,s) for(v=l; v<=h; v+=s) #define N 10 int main (int argn, char **args) { long i, j, k; /* declaration of dynamic distributed arrays */ DVM(DISTRIBUTE [BLOCK] []) float (*A)[N+1]; DVM(DISTRIBUTE [BLOCK]) float (*X); /* creation of arrays */ A = malloc( N*(N+1)*sizeof(float)); X = malloc( N*sizeof(float)); /* initialize array A*/ DVM(PARALLEL [i][j] ON A[i][j]) DO(i,0,N-1,1) DO(j,0,N,1) if (i==j || j==N) A[i][j] = 1.f; else A[i][j]=0.f; /* elimination */ for (i=0; i<N-1; i++) { DVM(PARALLEL [k][j] ON A[k][j]; REMOTE_ACCESS A[i][]) DO (k,i+1,N-1,1) DO (j,i+1,N,1) A[k][j] = A[k][j]-A[k][i]*A[i][j]/A[i][i]; } /* reverse substitution */ X[N-1] = A[N-1][N]/A[N-1][N-1]; for (j=N-2; j>=0; j-=1) { DVM(PARALLEL [k] ON A[k][]; REMOTE_ACCESS X[j+1]) DO (k,0,j,1) A[k][N] = A[k][N]-A[k][j+1]*X[j+1]; X[j]=A[j][N]/A[j][j]; DVM(REMOTE_ACCESS X[j]) printf(“j=%4i X[j]=%3.3E\n”,j,X[j]); } return 0; }
/* JACOBI program */ #include <math.h> #include <stdlib.h> #include <stdio.h> #define Max(a,b) ((a)>(b)?(a): (b)) #define DVM(dvmdir) #define DO(v,l,h,s) for(v=l; v<=h; v+=s) #define L 8 #define ITMAX 20 int i,j,it,k; double eps; double MAXEPS = 0.5; FILE *f; /* 2D arrays block distributed along 2 dimensions */ DVM(DISTRIBUTE [BLOCK][BLOCK]) double A[L][L]; DVM(ALIGN[i][j] WITH A[i][j]) double B[L][L]; int main(int argn, char **args) { /* 2D loop with base array A */ DVM(PARALLEL [i][j] ON A[i][j]) DO(i,0,L-1,1) DO(j,0,L-1,1) {A[i][j]=0.; B[i][j]=1.+i+j; } /****** iteration loop *************************/ DO(it,1,ITMAX,1) { eps= 0.; /* Parallel loop with base array A */ /* calculating maximum in variable eps */ DVM(PARALLEL [i][j] ON A[i][j]; REDUCTION MAX(eps)) DO(i,1,L-2,1) DO(j,1,L-2,1) {eps = Max(fabs(B[i][j]-A[i][j]),eps); A[i][j] = B[i][j]; } /* Parallel loop with base array B and */ /* with prior updating shadow elements of array A */ DVM(PARALLEL[i][j] ON B[i][j]; SHADOW_RENEW A) DO(i,1,L-2,1) DO(j,1,L-2,1) B[i][j] = (A[i-1][j]+A[i+1][j]+A[i][j-1]+A[i][j+1])/4.; printf(“it=%4i eps=%3.3E\n”, it,eps); if (eps < MAXEPS) break; }/*DO it*/ f=fopen("jacobi.dat","wb"); fwrite(B,sizeof(double),L*L,f); return 0; }
Example 3. Jacobi Algorithm (asynchronous version)
/* Asynchronous JACOBI */ #include <math.h> #include <stdlib.h> #include <stdio.h> #define Max(a,b) ((a)>(b)? (a): (b)) #define DVM(dvmdir) #define DO(v,l,u,s) for(v=l; v<=u; v+=s) #define L 8 #define ITMAX 20 int i,j,it,k; double eps; double MAXEPS = 0.5; FILE *f; /* declare groups for shadow and reduction operations */ DVM(SHADOW_GROUP) void *grshad; DVM(REDUCTION_GROUP) void *emax; /* 2D arrays block distributed along 2 dimensions */ DVM(DISTRIBUTE [BLOCK][BLOCK]) double A[L][L]; DVM(ALIGN [i][j] WITH A[i][j]) double B[L][L]; int main(int argn, char **args) { /* 2D parallel loop with base array A */ DVM(PARALLEL [i][j] ON A[i][j]) DO(i,0,L-1,1) DO(j,0,L-1,1) {A[i][j]=0.; B[i][j]=1.+i+j;} /* Create group for shadow operation */ DVM(CREATE_SHADOW_GROUP grshad: A); /************ iteration loop *************************/ DO(it,1,ITMAX,1) { eps= 0.; /* Parallel loop with base array A: */ /* at first elements of array A exported by the */ /* neighbor processor are calculated and sent and */ /* then internal elements of array A are calculated */ DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START grshad; REDUCTION emax : MAX(eps)) DO(i,1,L-2,1) DO(j,1,L-2,1) {eps = max(fabs(B[i][j]-A[i][j]),eps); A[i][j] = B[i][j]; } /* Start asynchronous calculation of maximum */ DVM(REDUCTION_START emax); /* Parallel loop with base array B: */ /* internal elements of array B are calculated at first */ /* then completion of array A shadow edges update is */ /* awaited and the loop iterations, requiring shadow */ /* elements of array A are calculated */ DVM(PARALLEL[i][j] ON B[i][j]; SHADOW_WAIT grshad) DO(i,1,L-2,1) DO(j,1,L-2,1) B[i][j] = (A[i-1][j]+A[i+1][j]+A[i][j-1]+A[i][j+1])/4; /* Wait for completion of reduction */ DVM(REDUCTION_WAIT emax); printf( "it=%4i eps=%3.3E\n”, it,eps); if (eps < MAXEPS) break; }/*DO it*/ f=fopen("jacobi.dat","wb"); fwrite(B,sizeof(double),L*L,f); return 0; }
Example 4. Red-Black Successive Over-Relaxation
The idea of the method is to paint the points of the grid (the variables) in two colors - red and black - as on a chess-board. Non-rectangular index space is defined by the special form of a loop header: DO (var, a+b%2, ub, 2)
/* RED-BLACK program */ #include <stdlib.h> #include <math.h> #define DVM(dvmdir) #define DO(v,l,h,s) for(v=l; v<=h; v+=s) #define Max(a,b) ((a)>(b)?(a):(b)) #define ITMAX 10 int main(int an, char ** as) {long i, j, it, irb; float eps=0.; float MAXEPS = 0.5E-5; float w = 0.5; /* Groups of asynchronous operations */ DVM(SHADOW_GROUP) void *sha; DVM(REDUCTION_GROUP) void *rg; int N; /* redefine N as constant */ #define N 8 DVM(DISTRIBUTE [BLOCK] [BLOCK]) float (*A)[N]; #undef N /* restore N */ /* "Calculate" of array size (=8!). Create array */ N = 8; A = malloc( N*N*sizeof(float)); /* Specification of members of shadow group */ DVM(CREATE_SHADOW_GROUP sha : A); /* Initialization: parallel loop with surpassing */ /* calculation and sending of exported data */ DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha) DO (i,0,N-1,1) DO (j,0,N-1,1) if(i==0 || i==N-1 || j==0 || j==N-1) A[i][j]=0.; else A[i][j] = 1.+i+j; DVM(SHADOW_WAIT sha); /* iteration loop */ DO (it,0,ITMAX,1) { eps = 0.; /* Parallel loop with reduction on RED points */ DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha; REDUCTION rg: MAX(eps) ) DO (i,1,N-2,1) DO (j,1+i%2,N-2,2) {float s; s = A[i][j]; A[i][j] = (w/4)*(A[i-1][j]+A[i+1][j]+ A[i][j-1]+A[i][j+1])+(1-w)*A[i][j]; eps = Max (fabs(s-A[i][j]), eps); } /* Start red reduction -- as early as possible */ /* Then wait shadows */ DVM(REDUCTION_START rg); DVM(SHADOW_WAIT sha); /* Parallel loop on BLACK points (without reduction) */ DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha) DO (i,1,N-2,1) DO (j,1+(i+1)%2,N-2,2) {A[i][j] = (w/4)*(A[i-1][j]+A[i+1][j]+ A[i][j-1]+A[i][j+1])+(1-w)*A[i][j]; } /* Wait shadows, then */ /* wait red point's reduction -- as late as possible */ DVM(SHADOW_WAIT sha); DVM(REDUCTION_WAIT rg); printf("it=%d eps=%e\n",it,eps); if (eps < MAXEPS) break; } /* DO it */ free(A); return 0; }
Example 5. Multigrid method program.
/* MULTIGRID program */ #include <stdio.h> #include <stdlib.h> #define DVM(dvm) #define DO(v,l,h,s) for(v=(l); v<=(h); v+=(s)) void oper( DVM(*DISTRIBUTE [*][*][*]) float *AA, int AAN, int AAM, int AAK, DVM(*DISTRIBUTE [*][*][*]) float *BB, int BBN, int BBM, int BBK) /*Parameters: two distributed 3D arrays and */ /* their actual dimensions */ { #define AA(i,j,k) AA[((i)*AAM+(j))*AAK+(k)] #define BB(i,j,k) AA[((i)*BBM+(j))*BBK+(k)] int i, j,k; /* Alignment of array BB with array AA using stretching*/ DVM(REALIGN BB[i][j][k] WITH AA[2*i][2*j][2*k] NEW); /* forming array BB from elements of array AA with even indexes */ DVM(PARALLEL [i][j][k] ON BB[i][j][k]) FOR(i,BBN) FOR(j,BBM) FOR(k,BBK) BB(i,j,k)=AA(i*2,j*2,k*2); #undef AA #undef BB } int main(int argn, char **args) { int N1=8,M1=12,K1=16; int N2,M2,K2; int Narr=5,Nit=5; int grid=0; int ACM,ACK; int i,j,k; int step_grid=1; /* Up to 20 distributed arrays */ DVM(DISTRIBUTE[BLOCK][BLOCK][]) float *A[20]; /* Pointer to current distributed array */ DVM(*DISTRIBUTE[*][*][*]) float *AC; /* creation of array A[0] */ A[0]=malloc(N1*M1*K1*sizeof(float)); AC=A[0]; ACM=M1; ACK=K1; #define AC(i,j,k) AC[((i)*ACM+(j))*ACK+(k)] /* initialization of source array */ DVM(PARALLEL [i][j][k] ON AC[i][j][k]) DO(i,0,N1-1,1) DO(j,0,M1-1,1) DO(k,0,K1-1,1) AC(i,j,k)=1.+i+j+k ; #undef AC do{ printf("N1=%d M1=%d K1=%d \n”,N1,M1,K1); N2=N1/2; M2=M1/2; K2=K1/2; grid++; /* creation of array A[grid] */ A[grid]=malloc(N2*M2*K2*sizeof(float)); oper(A[grid-1],N1,M1,K1,A[grid],N2,M2,K2); N1=N2; M1=M2; K1=K2; } while (N2>2 && grid<Narr) for(i=0;i<=grid;i++) free(A[i]); return 0; }
Example 6. Task Parallelism for Multiblock Code
/* PROGRAM TASKS */ /* rectangular grid is distributed on two blocks */ /* */ /* <------- A2,B2 -----> */ /* 0 1 2 ... N2 */ /* <--- A1,B1 --------> */ /* 0 1 ... N1-1 N1 N1+1 ... N1+N2-1 */ /*------------------------------------------------*/ /* 0 | . . . . . ... . */ /* ... | */ /* K-1 | . . . . . ... . */ /**************************************************/ #include <stdlib.h> #include <math.h> #define DVM(dvmdir) #define DO(v,l,h,s) for(v=l; v<=h; v+=s) #define FOR(v,n) for(v=0; v<n; v++) #define Max(a,b) ((a)>(b)?(a):(b)) #define NUMBER_OF_PROCESSORS() 1 #define K 8 #define N1 4 #define N2 K-N1 #define ITMAX 10 DVM(PROCESSORS) void * P[NUMBER_OF_PROCESSORS()]; DVM(TASK) void * MB[2]; double eps0; DVM(DISTRIBUTE) float (*A1)[K], (*A2)[K]; DVM(ALIGN) float (*B1)[K], (*B2)[K]; int main(int argn, char** args) { int i,j, it; int NP; printf("---------- starting --------------\n"); DVM(DEBUG 1 -d0) { NP = NUMBER_OF_PROCESSORS() / 2; } DVM(MAP MB[0] ONTO P[0:(NP?NP-1:0)]); DVM(MAP MB[1] ONTO P[NP:(NP?2*NP-1:NP)]); A1=malloc((N1+1)*K*sizeof(float)); DVM(REDISTRIBUTE A1[][BLOCK] ONTO MB[0]); B1=malloc((N1+1)*K*sizeof(float)); DVM(REALIGN B1[i][j] WITH A1[i][j]); A2=malloc((N2+1)*K*sizeof(float)); DVM(REDISTRIBUTE A2[][BLOCK] ONTO MB[1]); B2=malloc((N2+1)*K*sizeof(float)); DVM(REALIGN B2[i][j] WITH A2[i][j]); /* Initialization */ DVM(PARALLEL [i][j] ON A1[i][j]) FOR(i,N1+1) FOR(j,K) {if (i==0 || j==0 || j==K-1) { A1[i][j] = 0.f; B1[i][j] = 0.f; } else { B1[i][j] = 1.f + i + j ; A1[i][j] = B1[i][j]; }} DVM(PARALLEL [i][j] ON A2[i][j]) FOR(i,N2+1) FOR(j,K) {if(i == N2 || j==0 || j==K-1) { A2[i][j] = 0.f; B2[i][j] = 0.f; } else { B2[i][j] = 1.f + ( i + N1 - 1 ) + j ; A2[i][j] = B2[i][j]; }} DO(it,1,ITMAX,1) { eps0 = 0.; /* exchange bounds */ DVM(PARALLEL [j] ON A1[(N1)][j]; REMOTE_ACCESS B2[1][]) FOR(j,K) A1[N1][j] = B2[1][j]; DVM(PARALLEL [j] ON A2[0][j]; REMOTE_ACCESS B1[N1-1][]) FOR(j,K) A2[0][j] = B1[N1-1][j]; /* Block task region */ DVM(TASK_REGION MB; REDUCTION MAX(eps0)) { DVM(ON MB[0]) { double eps=0.; /* private reduction variable */ DVM(PARALLEL [i][j] ON B1[i][j]; SHADOW_RENEW A1[][]) DO(i,1, N1-1, 1) DO(j,1, K-2, 1) B1[i][j]=(A1[i-1][j]+A1[i][j-1]+A1[i+1][j]+A1[i][j+1])/4.f; DVM(PARALLEL [i][j] ON A1[i][j]; REDUCTION MAX(eps)) DO(i,1,N1-1,1) DO(j,1,K-2,1) {eps = Max(fabs(B1[i][j]-A1[i][j]),eps); A1[i][j] = B1[i][j]; } eps0=Max(eps0,eps); } DVM(ON MB[1]) { double eps=0.; /* private reduction variable */ DVM(PARALLEL [i][j] ON B2[i][j]; SHADOW_RENEW A2[][]) DO(i,1,N2-1,1) DO(j,1,K-2,1) B2[i][j]=(A2[i-1][j]+A2[i][j-1]+A2[i+1][j]+A2[i][j+1])/4.f; DVM(PARALLEL [i][j] ON A2[i][j]; REDUCTION MAX(eps)) DO(i,1,N2-1,1) DO(j,1,K-2,1) {eps = Max(fabs(B2[i][j]-A2[i][j]),eps); A2[i][j] = B2[i][j]; } eps0=Max(eps0,eps); } } /* TASK_REGION */ printf("it=%d eps0=%f\n",it,eps0); }/* DO IT */ return 0; }
directive | ::= DVM ( DVM-directive [ ; DVM-directive ] ) |
DVM-directive | ::= specification-directive |
| executable-directive | |
specification-directive | ::= processors-directive |
| align-directive | |
| distribute-directive | |
| template-directive | |
| shadow-directive | |
| shadow-group-directive | |
| reduction-group-directive | |
| remote-group-directive | |
| task-directive | |
executable-directive | ::= realign-directive |
| redistribute-directive | |
| create-template-directive | |
| parallel-directive | |
| remote-access-directive | |
| create-shadow-group-directive | |
| shadow-start-directive | |
| shadow-wait-directive | |
| reduction-start-directive | |
| reduction-wait-directive | |
| prefetch-directive | |
| reset-directive | |
| map-directive | |
| task-region-directive | |
| on-directive | |
processors-directive | ::= PROCESSORS |
distribute-directive | ::= DISTRIBUTE [ dist-directive-stuff ] |
redistribute-directive | ::= REDISTRIBUTE distributee dist-directive-stuff [ NEW ] |
dist-directive-stuff | ::= dist-format... [ dist-onto-clause ] |
distributee | ::= array-name |
dist-format | ::= [BLOCK] |
| [GENBLOCK ( block-array-name )] | |
| [WGTBLOCK ( block-array-name,nblock )] | |
| [] | |
dist-onto-clause | ::= ONTO dist-target |
dist-target | ::= processors-name [ section-subscript ]… |
align-directive | ::= ALIGN [ align-directive-stuff ] |
realign-directive | ::= REALIGN alignee align-directive-stuff [ NEW ] |
align-directive-stuff | ::= align-source... align-with-clause |
alignee | ::= array-name |
align-source | ::= [ ] |
| [ align-dummy ] | |
align-dummy | ::= scalar-int-variable |
align-with-clause | ::= WITH align-spec |
align-spec | ::= align-target [ align-subscript ]… |
align-target | ::= array-name |
| template-name | |
align-subscript | ::= [ int-expr ] |
| [ align-subscript-use ] | |
| [ ] | |
align-subscript-use | ::= [ primary-expr
* ] align-dummy [ add-op primary-expr ] |
primary-expr | ::= int-constant |
| int-variable | |
| ( int-expr ) | |
add-op | ::= + |
| - | |
create-template-directive | ::= CREATE_TEMPLATE template_name size... |
parallel-directive | ::= PARALLEL loop-variable... ON iteration-align-spec [ ; reduction-clause] [ ; shadow-renew-clause] [ ; remote-access-clause ] [ ; across-clause ] |
iteration-align-spec | ::= align-target iteration-align-subscript... |
iteration-align-subscript | ::= [ int-expr ] |
| [ do-variable-use ] | |
| [] | |
loop-variable | ::= [ do-variable ] |
do-variable-use | ::= [ primary-expr
* ] do-variable [ add-op primary-expr ] |
reduction-clause | ::= REDUCTION [ group_name : ] reduction-op... |
reduction-op | ::= reduction-op-name ( reduction-variable ) |
| reduction-loc-name ( reduction-variable, loc-variable ) | |
reduction-variable | ::= array-name |
| scalar-variable-name | |
reduction-op-name | ::= SUM |
| PRODUCT | |
| MAX | |
| MIN | |
| AND | |
| OR | |
reduction-loc-name | ::= MAXLOC |
| MINLOC | |
across-clause | ::= ACROSS dependent-array... |
dependent-array | ::= dist-array-name dependence... |
dependence | ::= [ flow-dep-length : anti-dep-length ] |
flow-dep-length | ::= int-constant |
anti-dep-length | ::= int-constant |
shadow-directive | ::= SHADOW shadow-array... |
shadow-array | ::= array-name shadow-edge... |
shadow-edge | ::= [ width ] |
| [ low-width : high-width ] | |
width | ::= int-expr |
low-width | ::= int-expr |
high-width | ::= int-expr |
shadow-renew-clause | ::= SHADOW_RENEW renewee... |
renewee | ::= dist-array-name [ shadow-edge ]… [CORNER] |
remote-access-directive | ::= REMOTE_ACCESS [ remote-group-name : ] regular-reference... |
regular-reference | ::= dist-array-name [ regular-subscript ]… |
regular-subscript | ::= [ int-expr ] |
| [ do-variable-use ] | |
| [ ] | |
remote-access-clause | ::= remote-access-directive |
prefetch-directive | ::= PREFETCH group-name |
reset-directive | ::= RESET group-name |
shadow-group-directive | ::=
CREATE_SHADOW_GROUP shadow-group-name : renewee... |
shadow-start-directive | ::= SHADOW_START shadow-group-name |
shadow-wait-directive | ::= SHADOW_WAIT shadow-group-name |
shadow-renew-clause | ::= . . . |
| shadow-start-directive | |
| shadow-wait-directive | |
reduction-group-directive | ::= REDUCTION_GROUP |
reduction-start-directive | ::= REDUCTION_START reduction-group-name |
reduction-wait-directive | ::= REDUCTION_WAIT reduction-group-name |
task-directive | ::= TASK |
map-directive | ::= MAP task-name [ task-index ] |
ONTO processors-name [ section-subscript ]… | |
dist-target | ::= . . . |
| task-name [ task-index ] | |
block-task-region | ::= DVM( task-region-directive ) { |
on-block... | |
} | |
task-region-directive | ::= TASK_REGION task-name [ ; reduction-clause] |
on-block | ::= DVM( on-directive ) operator |
on-directive | ::= ON task-name [ task-index ] |
loop-task-region | ::= DVM( task-region-directive ) { |
parallel-task-loop | |
} | |
parallel-task-loop | ::= DVM( parallel-task-loop-directive ) |
do-loop | |
parallel-task-loop-directive | ::= PARALLEL [ do-variable
] ON task-name [ do-variable ] |
copy-flag-directive | ::= COPY_FLAG |
copy-start-directive | ::= COPY_START flag_addr |
copy-wait-directive | ::= COPY_WAIT flag_addr |
copy-directive | ::= COPY |
C-DVM - contents | Part 1(1-4) | Part2 (5-11) | Part 3 (Appendixes) |