C-DVM language description. Part 3 (Apendixes)

Part 3 (Appendixes)

created: april, 2001

- last edited 08.10.02 -

Appendix 1. Examples of C-DVM programs.

Six small scientific programs are presented to illustrate C-DVM language features. They are intended for solving a system of linear equations:

A x = b

where A - matrix of coefficients,
b - vector of free members,
x - vector of unknowns.

The following basic methods are used to illustrate CDVM language features.

Direct methods. The well-known Gaussian Elimination method is the most commonly used algorithm of this class. The main idea of this algorithm is to reduce the matrix A to upper triangular form and then to use backward substitution to diagonalize the matrix.

Explicit iteration methods. Jacobi Relaxation is the most known algorithm of this class. The algorithm performs the following computation iteratively

x_i,j^new = ( x_i-1,j^old + x_i,j-1^old + x_i+1,j^old + x_i,j+1^old ) / 4

Implicit iteration methods. Successive Over Relaxation (SOR) belongs to this class. The algorithm performs the following calculation iteratively

x_i,j^new = ( w / 4 ) * ( x_i-1,j^new + x_i,j-1^new + x_i+1,j^old + x_i,j+1^old ) + (1-w) * x_i,j^old

By using “red-black” coloring of variables each step of SOR consists of two half Jacobi steps. One processes “red” variables and the other processes “black” variables. Coloring of variables allows to overlap calculation and communication.

Example 1. The Gauss elimination algorithm.

This is a program to solve a system of linear equations Ax = B by Gauss elimination method. The coefficient matrix A is represented by an array section A[0:N-1][0:N-1], and the vector B is represented by the section A[0:N-1][N] of the same array.

/* GAUSS  program */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define DVM(dvmdir)
#define DO(v,l,h,s)   for(v=l; v<=h; v+=s)
#define N 10

int main (int argn, char **args)
{
long  i, j, k;

/* declaration of dynamic distributed arrays */
DVM(DISTRIBUTE [BLOCK] [])  float (*A)[N+1];
DVM(DISTRIBUTE [BLOCK])  float (*X);
/* creation of arrays */
A = malloc( N*(N+1)*sizeof(float));
X = malloc( N*sizeof(float));
/* initialize array A*/
DVM(PARALLEL [i][j] ON A[i][j])
DO(i,0,N-1,1)
DO(j,0,N,1)
	if (i==j || j==N)  A[i][j] = 1.f;
	else A[i][j]=0.f;

/* elimination */
for (i=0; i<N-1; i++)
{
DVM(PARALLEL [k][j] ON A[k][j]; REMOTE_ACCESS A[i][])
DO (k,i+1,N-1,1)
DO (j,i+1,N,1)
	A[k][j] = A[k][j]-A[k][i]*A[i][j]/A[i][i];
}
/* reverse substitution */
X[N-1] = A[N-1][N]/A[N-1][N-1];
for (j=N-2; j>=0; j-=1)
{
DVM(PARALLEL [k] ON A[k][]; REMOTE_ACCESS X[j+1])
DO (k,0,j,1)
    	A[k][N] = A[k][N]-A[k][j+1]*X[j+1];
X[j]=A[j][N]/A[j][j];
DVM(REMOTE_ACCESS X[j]) 
	printf(“j=%4i  X[j]=%3.3E\n”,j,X[j]);
}
return 0;
}

Example 2. Jacobi algorithm

/* JACOBI program */

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define Max(a,b) ((a)>(b)?(a): (b))

#define DVM(dvmdir)
#define DO(v,l,h,s)   for(v=l; v<=h; v+=s)

#define  L  8
#define  ITMAX 20

int i,j,it,k;
double eps;
double MAXEPS = 0.5;
FILE *f;

/* 2D arrays block distributed along 2 dimensions */
DVM(DISTRIBUTE [BLOCK][BLOCK]) double A[L][L];
DVM(ALIGN[i][j] WITH A[i][j]) double B[L][L];

int main(int argn, char **args)
{
/* 2D loop with base array A */
DVM(PARALLEL [i][j] ON A[i][j])
DO(i,0,L-1,1)
DO(j,0,L-1,1)
      {A[i][j]=0.;
      B[i][j]=1.+i+j;
      }

/******  iteration loop  *************************/
DO(it,1,ITMAX,1)
{
  eps= 0.;
/* Parallel loop with base array A             */
/* calculating maximum in variable eps          */
DVM(PARALLEL [i][j] ON A[i][j]; REDUCTION MAX(eps))
DO(i,1,L-2,1)
DO(j,1,L-2,1)
	{eps =  Max(fabs(B[i][j]-A[i][j]),eps);
	A[i][j] = B[i][j];
	}
/* Parallel loop with base array  B  and         */
/* with prior updating shadow elements of array A  */
DVM(PARALLEL[i][j] ON B[i][j]; SHADOW_RENEW A)
DO(i,1,L-2,1)
DO(j,1,L-2,1)
    B[i][j] = (A[i-1][j]+A[i+1][j]+A[i][j-1]+A[i][j+1])/4.;

  printf(“it=%4i   eps=%3.3E\n”, it,eps);
  if (eps < MAXEPS) break;
  }/*DO it*/
f=fopen("jacobi.dat","wb");
fwrite(B,sizeof(double),L*L,f);
return 0;
}

Example 3. Jacobi Algorithm (asynchronous version)

/* Asynchronous JACOBI */ 
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define Max(a,b) ((a)>(b)? (a): (b))

#define DVM(dvmdir)
#define DO(v,l,u,s)   for(v=l; v<=u; v+=s)

#define  L 8
#define  ITMAX 20

int i,j,it,k;
double eps;
double MAXEPS   = 0.5;
FILE *f;
/* declare groups for shadow and reduction operations */
DVM(SHADOW_GROUP) void *grshad;
DVM(REDUCTION_GROUP) void *emax;
/* 2D arrays block distributed along 2 dimensions */
DVM(DISTRIBUTE [BLOCK][BLOCK]) double A[L][L];
DVM(ALIGN [i][j] WITH A[i][j]) double B[L][L];

int main(int argn, char **args)
{
/* 2D parallel loop with base array A */
DVM(PARALLEL [i][j] ON A[i][j])
DO(i,0,L-1,1)
DO(j,0,L-1,1)
        {A[i][j]=0.;
         B[i][j]=1.+i+j;}
/* Create group for shadow operation */
DVM(CREATE_SHADOW_GROUP grshad: A);

/************  iteration loop  *************************/
DO(it,1,ITMAX,1)
{
  eps= 0.;
/* Parallel loop with base array A: */
/* at first elements of array A exported by the */
/* neighbor processor are calculated and sent and */
/* then internal elements of array A are calculated */
DVM(PARALLEL [i][j] ON A[i][j]; 
	SHADOW_START grshad; REDUCTION emax : MAX(eps))
DO(i,1,L-2,1)
DO(j,1,L-2,1)
         {eps = max(fabs(B[i][j]-A[i][j]),eps);
          A[i][j] = B[i][j];
         }
/* Start asynchronous calculation of maximum */
DVM(REDUCTION_START emax);
/* Parallel loop with base array B: */
/* internal elements of array B are calculated at first */
/* then completion of array A shadow edges update is */ 
/* awaited and the loop iterations, requiring shadow */ 
/* elements of array A are calculated */
DVM(PARALLEL[i][j] ON B[i][j]; SHADOW_WAIT grshad)
DO(i,1,L-2,1)
DO(j,1,L-2,1)
    B[i][j] = (A[i-1][j]+A[i+1][j]+A[i][j-1]+A[i][j+1])/4;
/* Wait for completion of reduction */
DVM(REDUCTION_WAIT emax);

printf( "it=%4i  eps=%3.3E\n”, it,eps);
if (eps < MAXEPS) break;
}/*DO it*/

f=fopen("jacobi.dat","wb");
fwrite(B,sizeof(double),L*L,f);
return 0;
}

Example 4. Red-Black Successive Over-Relaxation

The idea of the method is to paint the points of the grid (the variables) in two colors - red and black - as on a chess-board. Non-rectangular index space is defined by the special form of a loop header: DO (var, a+b%2, ub, 2)

/* RED-BLACK program */

#include <stdlib.h>
#include <math.h>

#define DVM(dvmdir)
#define DO(v,l,h,s)   for(v=l; v<=h; v+=s)

#define Max(a,b) ((a)>(b)?(a):(b))
#define ITMAX 10

int main(int an, char ** as)
{long i, j, it, irb;
float eps=0.;
float MAXEPS = 0.5E-5;
float w = 0.5;

/* Groups of asynchronous operations */
DVM(SHADOW_GROUP) void *sha;
DVM(REDUCTION_GROUP) void *rg;

int  N;

/* redefine N as constant */
#define     N 8
DVM(DISTRIBUTE [BLOCK] [BLOCK])  float (*A)[N];
#undef      N /* restore N */

/* "Calculate"  of array size (=8!). Create array */
N = 8;
A = malloc( N*N*sizeof(float));

/* Specification of members of shadow group  */
DVM(CREATE_SHADOW_GROUP sha : A);

/* Initialization: parallel loop with surpassing */
/* calculation and sending of exported data */
DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha)
DO (i,0,N-1,1)
DO (j,0,N-1,1)
    if(i==0 || i==N-1 || j==0 || j==N-1) A[i][j]=0.;
    else A[i][j] = 1.+i+j;

DVM(SHADOW_WAIT sha);

/* iteration loop */
DO (it,0,ITMAX,1)
{
eps = 0.;

/* Parallel loop with reduction on RED points */
DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha;
                                REDUCTION rg: MAX(eps) )
DO (i,1,N-2,1)
DO (j,1+i%2,N-2,2)
    {float s;
    s = A[i][j];
    A[i][j] = (w/4)*(A[i-1][j]+A[i+1][j]+
             A[i][j-1]+A[i][j+1])+(1-w)*A[i][j];
    eps = Max (fabs(s-A[i][j]), eps);
    }

/* Start red reduction -- as early as possible */
/* Then wait shadows */
DVM(REDUCTION_START rg);
DVM(SHADOW_WAIT sha);

/* Parallel loop on BLACK points (without reduction) */
DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha)
DO (i,1,N-2,1)
DO (j,1+(i+1)%2,N-2,2)
    {A[i][j] = (w/4)*(A[i-1][j]+A[i+1][j]+
             A[i][j-1]+A[i][j+1])+(1-w)*A[i][j];
    }
/* Wait shadows, then */
/* wait red point's reduction -- as late as possible */
DVM(SHADOW_WAIT sha);
DVM(REDUCTION_WAIT rg);

printf("it=%d eps=%e\n",it,eps);
if (eps < MAXEPS) break;

} /* DO it */

free(A);
return 0;
}

Example 5. Multigrid method program.

/* MULTIGRID  program */

#include <stdio.h>
#include <stdlib.h>

#define DVM(dvm)
#define DO(v,l,h,s) for(v=(l); v<=(h); v+=(s))

void oper(
	DVM(*DISTRIBUTE [*][*][*])  float *AA,
	int AAN, int AAM, int AAK,
	DVM(*DISTRIBUTE [*][*][*])  float *BB,
	int BBN, int BBM, int BBK)
/*Parameters: two distributed 3D arrays and */
/* their  actual dimensions */
{
#define AA(i,j,k) AA[((i)*AAM+(j))*AAK+(k)]
#define BB(i,j,k) AA[((i)*BBM+(j))*BBK+(k)]
int i, j,k;
/* Alignment of array BB with array AA using stretching*/
DVM(REALIGN BB[i][j][k] WITH AA[2*i][2*j][2*k] NEW);
/* forming array BB from elements of array AA with even indexes */
DVM(PARALLEL [i][j][k] ON BB[i][j][k])
FOR(i,BBN)
FOR(j,BBM)
FOR(k,BBK)
	BB(i,j,k)=AA(i*2,j*2,k*2);
#undef AA
#undef BB
}

int main(int argn, char **args)
{
int N1=8,M1=12,K1=16;
int N2,M2,K2;
int Narr=5,Nit=5;
int grid=0;
int ACM,ACK;
int i,j,k;
int step_grid=1;

/* Up to 20 distributed arrays */
DVM(DISTRIBUTE[BLOCK][BLOCK][]) float *A[20];
/* Pointer to current distributed array */
DVM(*DISTRIBUTE[*][*][*]) float *AC;
	/* creation of array  A[0] */
A[0]=malloc(N1*M1*K1*sizeof(float));
AC=A[0];
ACM=M1;
ACK=K1;
#define AC(i,j,k) AC[((i)*ACM+(j))*ACK+(k)]
	/* initialization of source array */
DVM(PARALLEL [i][j][k] ON AC[i][j][k])
DO(i,0,N1-1,1)
DO(j,0,M1-1,1)
DO(k,0,K1-1,1)
	AC(i,j,k)=1.+i+j+k ;
#undef AC

do{
printf("N1=%d M1=%d K1=%d \n”,N1,M1,K1);
N2=N1/2;
M2=M1/2;
K2=K1/2;
grid++;
/* creation of array A[grid] */
A[grid]=malloc(N2*M2*K2*sizeof(float));
oper(A[grid-1],N1,M1,K1,A[grid],N2,M2,K2);
N1=N2;
M1=M2;
K1=K2;
} while (N2>2 && grid<Narr)

for(i=0;i<=grid;i++)
	free(A[i]);
return 0;
}

Example 6. Task Parallelism for Multiblock Code

/*  PROGRAM TASKS                                 */
/* rectangular grid is distributed on two blocks  */
/*                                                */
/*                     <------- A2,B2 ----->      */
/*                     0    1   2    ... N2       */
/*       <--- A1,B1 -------->                     */
/*       0   1   ...  N1-1  N1  N1+1 ... N1+N2-1  */
/*------------------------------------------------*/
/* 0   | .   .         .    .   .    ...  .       */
/* ... |                                          */
/* K-1 | .   .         .    .   .    ...  .       */
/**************************************************/
#include <stdlib.h>
#include <math.h>

#define DVM(dvmdir)
#define DO(v,l,h,s)   for(v=l; v<=h; v+=s)
#define FOR(v,n)  for(v=0; v<n; v++)
#define Max(a,b) ((a)>(b)?(a):(b))

#define NUMBER_OF_PROCESSORS() 1

#define K   8
#define N1  4
#define N2  K-N1
#define ITMAX 10

DVM(PROCESSORS)  void * P[NUMBER_OF_PROCESSORS()];
DVM(TASK) void * MB[2];
double eps0;
DVM(DISTRIBUTE) float (*A1)[K], (*A2)[K];
DVM(ALIGN)      float (*B1)[K], (*B2)[K];

int main(int argn, char** args)
{
int i,j, it;
int NP;

printf("---------- starting --------------\n");

DVM(DEBUG 1 -d0)
{
NP = NUMBER_OF_PROCESSORS() / 2;
}

DVM(MAP MB[0] ONTO P[0:(NP?NP-1:0)]);
DVM(MAP MB[1] ONTO P[NP:(NP?2*NP-1:NP)]);

A1=malloc((N1+1)*K*sizeof(float));
DVM(REDISTRIBUTE A1[][BLOCK] ONTO MB[0]);
B1=malloc((N1+1)*K*sizeof(float));
DVM(REALIGN B1[i][j] WITH A1[i][j]);

A2=malloc((N2+1)*K*sizeof(float));
DVM(REDISTRIBUTE A2[][BLOCK] ONTO MB[1]);
B2=malloc((N2+1)*K*sizeof(float));
DVM(REALIGN B2[i][j] WITH A2[i][j]);

    /* Initialization */
DVM(PARALLEL [i][j] ON A1[i][j])
FOR(i,N1+1)
FOR(j,K)
{if (i==0 || j==0 || j==K-1) {
    A1[i][j] = 0.f;
    B1[i][j] = 0.f;
} else  {
    B1[i][j] = 1.f + i + j ;
    A1[i][j] = B1[i][j];
}}
DVM(PARALLEL [i][j] ON A2[i][j])
FOR(i,N2+1)
FOR(j,K)
{if(i == N2  || j==0 || j==K-1) {
    A2[i][j] = 0.f;
    B2[i][j] = 0.f;
} else {
    B2[i][j] = 1.f + ( i + N1 - 1 ) + j ;
    A2[i][j] = B2[i][j];
}}

DO(it,1,ITMAX,1)
{
    eps0 = 0.;

/*     exchange bounds */

DVM(PARALLEL [j] ON A1[(N1)][j]; REMOTE_ACCESS B2[1][])
    FOR(j,K) A1[N1][j] = B2[1][j];

DVM(PARALLEL [j] ON A2[0][j]; REMOTE_ACCESS B1[N1-1][])
    FOR(j,K) A2[0][j] = B1[N1-1][j];


/* Block task region */
DVM(TASK_REGION MB; REDUCTION MAX(eps0))
{
DVM(ON MB[0])
{
double eps=0.; /* private reduction variable */
DVM(PARALLEL [i][j] ON B1[i][j]; SHADOW_RENEW A1[][])
DO(i,1, N1-1, 1)
DO(j,1, K-2, 1)
   B1[i][j]=(A1[i-1][j]+A1[i][j-1]+A1[i+1][j]+A1[i][j+1])/4.f;

DVM(PARALLEL [i][j] ON A1[i][j]; REDUCTION MAX(eps))
DO(i,1,N1-1,1)
DO(j,1,K-2,1)
        {eps =  Max(fabs(B1[i][j]-A1[i][j]),eps);
        A1[i][j] = B1[i][j];
        }
eps0=Max(eps0,eps);
}

DVM(ON MB[1])
{
double eps=0.; /* private reduction variable */
DVM(PARALLEL [i][j] ON B2[i][j]; SHADOW_RENEW A2[][])
DO(i,1,N2-1,1)
DO(j,1,K-2,1)
  B2[i][j]=(A2[i-1][j]+A2[i][j-1]+A2[i+1][j]+A2[i][j+1])/4.f;

DVM(PARALLEL [i][j] ON A2[i][j]; REDUCTION MAX(eps))
DO(i,1,N2-1,1)
DO(j,1,K-2,1)
        {eps =  Max(fabs(B2[i][j]-A2[i][j]),eps);
        A2[i][j] = B2[i][j];
        }
eps0=Max(eps0,eps);
}
} /* TASK_REGION */

printf("it=%d eps0=%f\n",it,eps0);
}/* DO IT */
return 0;
}

Appendix 2. Syntax summary

directive	::= DVM ( DVM-directive [ ; DVM-directive ] )
DVM-directive	::= specification-directive
	\| executable-directive
specification-directive	::= processors-directive
	\| align-directive
	\| distribute-directive
	\| template-directive
	\| shadow-directive
	\| shadow-group-directive
	\| reduction-group-directive
	\| remote-group-directive
	\| task-directive
executable-directive	::= realign-directive
	\| redistribute-directive
	\| create-template-directive
	\| parallel-directive
	\| remote-access-directive
	\| create-shadow-group-directive
	\| shadow-start-directive
	\| shadow-wait-directive
	\| reduction-start-directive
	\| reduction-wait-directive
	\| prefetch-directive
	\| reset-directive
	\| map-directive
	\| task-region-directive
	\| on-directive
processors-directive	::= PROCESSORS
distribute-directive	::= DISTRIBUTE [ dist-directive-stuff ]
redistribute-directive	::= REDISTRIBUTE distributee dist-directive-stuff [ NEW ]
dist-directive-stuff	::= dist-format... [ dist-onto-clause ]
distributee	::= array-name
dist-format	::= [BLOCK]
	\| [GENBLOCK ( block-array-name )]
	\| [WGTBLOCK ( block-array-name,nblock )]
	\| []
dist-onto-clause	::= ONTO dist-target
dist-target	::= processors-name [ section-subscript ]…
align-directive	::= ALIGN [ align-directive-stuff ]
realign-directive	::= REALIGN alignee align-directive-stuff [ NEW ]
align-directive-stuff	::= align-source... align-with-clause
alignee	::= array-name
align-source	::= [ ]
	\| [ align-dummy ]
align-dummy	::= scalar-int-variable
align-with-clause	::= WITH align-spec
align-spec	::= align-target [ align-subscript ]…
align-target	::= array-name
	\| template-name
align-subscript	::= [ int-expr ]
	\| [ align-subscript-use ]
	\| [ ]
align-subscript-use	::= [ primary-expr * ] align-dummy [ add-op primary-expr ]
primary-expr	::= int-constant
	\| int-variable
	\| ( int-expr )
add-op	::= +
	\| -
create-template-directive	::= CREATE_TEMPLATE template_name size...
parallel-directive	::= PARALLEL loop-variable... ON iteration-align-spec [ ; reduction-clause] [ ; shadow-renew-clause] [ ; remote-access-clause ] [ ; across-clause ]
iteration-align-spec	::= align-target iteration-align-subscript...
iteration-align-subscript	::= [ int-expr ]
	\| [ do-variable-use ]
	\| []
loop-variable	::= [ do-variable ]
do-variable-use	::= [ primary-expr * ] do-variable [ add-op primary-expr ]
reduction-clause	::= REDUCTION [ group_name : ] reduction-op...
reduction-op	::= reduction-op-name ( reduction-variable )
	\| reduction-loc-name ( reduction-variable, loc-variable )
reduction-variable	::= array-name
	\| scalar-variable-name
reduction-op-name	::= SUM
	\| PRODUCT
	\| MAX
	\| MIN
	\| AND
	\| OR
reduction-loc-name	::= MAXLOC
	\| MINLOC
across-clause	::= ACROSS dependent-array...
dependent-array	::= dist-array-name dependence...
dependence	::= [ flow-dep-length : anti-dep-length ]
flow-dep-length	::= int-constant
anti-dep-length	::= int-constant
shadow-directive	::= SHADOW shadow-array...
shadow-array	::= array-name shadow-edge...
shadow-edge	::= [ width ]
	\| [ low-width : high-width ]
width	::= int-expr
low-width	::= int-expr
high-width	::= int-expr
shadow-renew-clause	::= SHADOW_RENEW renewee...
renewee	::= dist-array-name [ shadow-edge ]… [CORNER]
remote-access-directive	::= REMOTE_ACCESS [ remote-group-name : ] regular-reference...
regular-reference	::= dist-array-name [ regular-subscript ]…
regular-subscript	::= [ int-expr ]
	\| [ do-variable-use ]
	\| [ ]
remote-access-clause	::= remote-access-directive
prefetch-directive	::= PREFETCH group-name
reset-directive	::= RESET group-name
shadow-group-directive	::= CREATE_SHADOW_GROUP shadow-group-name : renewee...
shadow-start-directive	::= SHADOW_START shadow-group-name
shadow-wait-directive	::= SHADOW_WAIT shadow-group-name
shadow-renew-clause	::= . . .
	\| shadow-start-directive
	\| shadow-wait-directive
reduction-group-directive	::= REDUCTION_GROUP
reduction-start-directive	::= REDUCTION_START reduction-group-name
reduction-wait-directive	::= REDUCTION_WAIT reduction-group-name
task-directive	::= TASK
map-directive	::= MAP task-name [ task-index ]
	ONTO processors-name [ section-subscript ]…
dist-target	::= . . .
	\| task-name [ task-index ]
block-task-region	::= DVM( task-region-directive ) {
	on-block...
	}
task-region-directive	::= TASK_REGION task-name [ ; reduction-clause]
on-block	::= DVM( on-directive ) operator
on-directive	::= ON task-name [ task-index ]
loop-task-region	::= DVM( task-region-directive ) {
	parallel-task-loop
	}
parallel-task-loop	::= DVM( parallel-task-loop-directive )
	do-loop
parallel-task-loop-directive	::= PARALLEL [ do-variable ] ON task-name [ do-variable ]

copy-flag-directive	::= COPY_FLAG
copy-start-directive	::= COPY_START flag_addr
copy-wait-directive	::= COPY_WAIT flag_addr
copy-directive	::= COPY

C-DVM - contents

Part 1(1-4)

Part2 (5-11)

Part 3 (Appendixes)