/* HIERARCHICAL N-BODY implementation of Barnes-Hut'86 in Fork95 */ /* 03/99 Christoph W. Kessler */ #include #include #include #include #include #include #define GRAPHICS 0 /* 1=graphical output, 0=only text output */ #define DEBUG 0 /* 1=debugging output, 0=silent mode*/ #define DIM 2 /* 2=quadtree, 3=octree */ #define MAXDEG 4 /* 4=quadtree, 8=octree */ #define MAXNBODY 500 sh int NBODY = 50; /* number of particles, default value */ #define EPSILON 0.0001 /* distance very close -> consider as one particle */ #define GAMMA 20.0 /* gravitational constant * (here: dummy value for nice graphical output) */ #define PIXELS 512 /* extents of generated X-Pixmap picture * also used as x/y size of the universe */ #define THETA 1.0 /* cellwidth/distance threshold for considering the * mass center instead of all nodes in a subtree */ #define MAXSIZE PIXELS /* size of universe, must be a power of 2 */ #define MAXDIST (0.8*MAXSIZE) /* very far away -> no significant contribution */ #define PRINTOUT (NBODY<=40) /* omit lengthy output of computed force vectors */ typedef float* position; typedef struct part { /* data structure for a particle */ position pos; /* left upper corner of this cell/particle */ float mass; /* mass of particle */ position force; /* accumulated force computed for a time step */ /*position speed;*/ /*position acceleration;*/ } *particle; typedef struct treenode { struct treenode **kids; /* up to 8 children */ /* if NULL, this node is a leaf (1 partcl or empty)*/ float cellwidth; /* cell width */ particle part; /* if not NULL, points to a particle */ position pos; /* left upper corner of this cell/particle */ position com; /* position center of mass */ float mass; /* pseudo-mass of com */ int size; /* Number of inner nodes in the subtree */ simple_lock lock; /* protection for concurrent insertion */ } node, *pnode; sh pnode tree; /* the tree, with pointers to particles */ sh particle particles[MAXNBODY]; /* the particles */ position newPos2( float x, float y ) { int i; position q = (position) shmalloc( 2 * sizeof(float)); q[0] = x; q[1] = y; return q; } particle newParticle( position pos, float m ) { particle p = (particle) shmalloc( sizeof( struct part )); p->mass = m; p->pos = pos; p->force = NULL; return p; } pnode newNode( position pos, float cwidth, particle p ) { int i; pnode n = (pnode) shmalloc( sizeof(node)); #if DEBUG pprintf("newNode() pos(%d,%d), width=%d, particle=%p\n", (int)pos[0], (int)pos[1], (int)cwidth, p ); #endif n->kids = NULL; // set only in treeInsert() n->pos = pos; n->cellwidth = cwidth; n->mass = 0.0; n->part = p; assert(p); if (p) n->size = 1; else n->size = 0; n->lock = 0; return n; } int getKidIndex( position pos1, float width, position pos2 ) { // determine which quarter of the cell (pos1, width) // the position pos2 is located in. float x1 = pos1[0]; float x2 = pos2[0]; float y1 = pos1[1]; float y2 = pos2[1]; #if DEBUG pprintf("x1=%d x2=%d y1=%d y2=%d width=%d\n", (int)x1,(int)x2,(int)y1,(int)y2, (int)width ); assert( x2 >= x1 ); assert( x2 <= x1+width ); assert( y2 >= y1 ); assert( y2 <= y1+width ); #endif if (x2 < x1 + 0.5*width) if (y2 < y1 + 0.5 * width) return 0; else return 2; else if (y2 < y1 + 0.5 * width) return 1; else return 3; } int treeInsert( pnode T, particle n ) // returns number of newly generated cells: { #if DEBUG pprintf("treeInsert ( width %d, particle %p=(%d,%d) )\n", (int)T->cellwidth, n->pos, (int)n->pos[0], (int)n->pos[1]); #endif if (T->kids) { int c; INNER_NODE: // T is an inner node (has kids). // Once T becomes an inner node, it remains so forever. // Now determine which child c of node T the new particle n lies in: // The entries ->pos and ->cellwidth are constant (set in newNode only). c = getKidIndex( T->pos, T->cellwidth, n->pos ); if (T->kids[c]) { int ncells; KIDS_C_EXISTS: // T->kids[c], once set to nonzero, will remain constant forever. // Hence, T needs not be locked in this case. ncells = treeInsert( T->kids[c], n ); syncadd( &(T->size), ncells ); // is atomic, no locking required. return ncells; // contributions by concurrent treeInsert()s are // taken into consideration separately. } else { // T->kids[c] may be concurrently set to nonzero by different // processors. Hence, locking is required. // In order to improve performance, the newNode() call is hoisted // out of the critical section. pnode nn = newNode( newPos2( T->pos[0] + 0.5 * (c&1)*T->cellwidth, T->pos[1] + 0.5 * (float)((c&2)>>1) * T->cellwidth ), 0.5*T->cellwidth, n ); simple_lockup( &(T->lock) ); #if DEBUG pprintf(" P%d has(1) lock for %p\n", $, T ); #endif if (T->kids[c]) { // retest: I was not the first processor // to enter the critical section #if DEBUG pprintf(" P%d frees(1) lock for %p\n", $, T ); #endif simple_unlock( &(T->lock) ); shfree( nn ); goto KIDS_C_EXISTS; } else { // set T->kids[c]: T->kids[c] = nn; #if DEBUG pprintf(" P%d frees(2) lock for %p\n", $, T ); #endif simple_unlock( &(T->lock) ); syncadd( &(T->size), 1 ); // is atomic anyway. return 1; } } } else { // leaf node => T->part is nonzero. Split T: // computations depending only on invariant values T->pos, // T->cellwidth, n, are hoisted out of the critical section. particle p; pnode n1; int c1, c2; int i; c1 = getKidIndex( T->pos, T->cellwidth, n->pos ); n1 = newNode( newPos2( T->pos[0] + 0.5 * (float)(c1&1)*T->cellwidth, T->pos[1] + 0.5 * (float)((c1&2)>>1)*T->cellwidth ), 0.5*T->cellwidth, n); // setting T->kids is a critical section: simple_lockup( &(T->lock) ); #if DEBUG pprintf(" P%d has(3) lock for %p\n", $, T ); #endif if (!T->part) { // another processor was faster than me and set T->part to 0. // hence, now after that processor finishes, T is no longer a leaf node. // jump to the case "inner node": #if DEBUG assert( T->kids ); #endif simple_unlock( &(T->lock) ); goto INNER_NODE; } p = T->part; c2 = getKidIndex( T->pos, T->cellwidth, p->pos ); T->part = NULL; T->kids = (pnode*) shmalloc( MAXDEG * sizeof(pnode)); for (i=0; ikids[i] = NULL; T->kids[c1] = n1; if (c1 != c2) { // n and p go to different subcells: T->kids[c2] = newNode( newPos2( T->pos[0] + 0.5 * (float)(c2&1)*T->cellwidth, T->pos[1] + 0.5 * (float)((c2&2)>>1)*T->cellwidth ), 0.5*T->cellwidth, p); #if DEBUG pprintf(" P%d frees(3) lock for %p\n", $, T ); #endif simple_unlock( &(T->lock) ); // size increment by c1 updated together with contribution by kids[c2] syncadd( &(T->size), 2 ); return 2; } else { int ncells; #if DEBUG pprintf(" P%d frees(4) lock for %p\n", $, T ); #endif simple_unlock( &(T->lock) ); // T->kids[c2], once set, exists forever ncells = treeInsert( T->kids[c2], p ); syncadd( &(T->size), ncells + 1 ); return ncells + 1; /* one cell for c1 and the others by c2 */ } } } float frandom(void) { float f; int c2 = rand(), c = rand(); c = c&(MAXSIZE-1); c2 = c2&(511); f = itof( c ) + (itof(c2) * 0.0019); return f; } particle randomParticle( void ) { particle p; position pos = newPos2( frandom(), frandom() ); p = newParticle( pos, 1.0+(frandom()*0.0001*(float)MAXSIZE)); return p; } async void setFilledSquare( int x, int y, int width, int color ) { int i, j; for (i=1-width; i<=width; i++) for (j=1-width; j<=width; j++) set_pixel( x+i, /*PIXELS*/ y+j, color ); } async void setSquareFrame( int x, int y, int width, int color ) { int i, j; for (i=1-width; i<=width; i++) { set_pixel( x+i, y+1-width, color ); set_pixel( x+i, y+width, color ); } for (j=1-width; j<=width; j++) { set_pixel( x+1-width, y+j, color ); set_pixel( x+width, y+j, color ); } } void setCoM( pnode tree ) // recursive routine, computer center of mass { int i; #if DEBUG pprintf(" setCoM( %p )\n", tree ); #endif tree->mass = 0.0; if (tree->part) { tree->mass = tree->part->mass; tree->com = tree->part->pos; return; } if (tree->kids) { tree->com = newPos2( 0.0, 0.0 ); for (i=0; ikids[i]) { setCoM( tree->kids[i] ); tree->mass += tree->kids[i]->mass; } for (i=0; ikids[i]) { // update com in ratio subtree[i]->mass / tree->mass: tree->com[0] += tree->kids[i]->com[0] * tree->kids[i]->mass / tree->mass; tree->com[1] += tree->kids[i]->com[1] * tree->kids[i]->mass / tree->mass; } } } sync int *weightedGroupVector( sh int p, sh int n, sh float *work ) { // a simple but working sequential solution. // may be superior to the parallel variant if n is very close to p. // // work is an array of length n holding positive float values. // n < p. // Compute an int array ret[0:p-1] holding group indices ret[i] // in the range 0,...,n-1 // with n_j := sum_{i, ret[i]==c_j} , j=0,...,n-1, such that // (1) n_j > 0 iff work[j] > 0 // (2) n_j / \sum_{k} n_k approximately equal to work[j]. sh int *ret, count; int i, j; seq { ret = (int *)shmalloc( p * sizeof( int )); for (i=0, j=0; i=n) j=0; if (work[j] > 0.0) { ret[i++] = j; // proc i will work for subgroup j //pprintf("Proz %d von %d fuer Untergruppe %d von %d, Arbeit %d\n", // i, p, j, n, (int) work[j]); } j++; } } return ret; /* This parallel variant is still buggy: if ($ 0.0) { i = mpadd(&count, 1); ret[i] = $; //pprintf("Proz %d von %d fuer Untergruppe %d von %d, Arbeit %d\n", // i, p, $, n, (int) work[$]); } } } return ret; */ } sync int *par_weightedGroupVector( sh int p, sh int n, sh float *work ) { // work is an array of length n holding positive float values. // n < p. // Compute an int array ret[0:p-1] holding group indices ret[i] // in the range 0,...,n // with n_j := sum_{i, ret[i]==c_j} , j=0,...,n-1, such that // (1) n_j > 0 iff work[j] > 0 // (2) n_j / \sum_{k} n_k approximately equal to work[j]. // Note that this problem is related to the distribution // of p seats in a parliament among n parties with votes for // party i being stored in work[i], where each party with at // least one vote is guaranteed to have a seat in parliament. // We use a method adapted from that by Hare-Niemeyer. sh int *ret; int i, j; sh int nprocs = groupsize(); sh float sum = 0.0; #define MAXSEATS MAXDEG sh int seats[MAXSEATS]; sh int pre[MAXSEATS]; sh int sumseats = 0; seq assert( nprocs == p ); // for this particular application seq ret = (int *)shmalloc( p * sizeof( int )); // first: compute the sum of all work[] entries seq for (i=0; i= 0.0 ); sum += work[i]; } #if DEBUG seq pprintf("(int)work sum: %d\n", (int) sum ); #endif // now, each party gets at most one seat more than it // should have; each party with nonzero vote gets at least one: farm for (i=$; i 0.0) { seats[i] = (int) ( (float)p * work[i] / sum ) + 1; syncadd( &sumseats, seats[i] ); } else seats[i] = 0; // now sumseats is in the range p...p+N, where N = #nonzero parties // correct the result by taking away up to N seats: #if DEBUG seq assert( sumseats >= p ); seq pprintf("sumseats: %d\n", sumseats); seq assert( sumseats <= p+n ); #endif seq { while (sumseats > p) { #if 0 pprintf("while iteration\n"); #endif for (i=0; i %d\n", pre[i], pre[i]+seats[i], i ); #endif for (j=pre[i]; jkids; sh simple_lock lock = 0; #if DEBUG farm pprintf(" parSetCoM( %p )\n", tree ); #endif if (P==1) { farm setCoM( tree ); return; } if (!haskids) { // tree is a leaf: seq setCoM( tree ); return; } tree->mass = 0.0; #if DEBUG seq assert( tree->part==0 ); #endif seq tree->com = newPos2( 0.0, 0.0 ); // compute the weights of the subtrees for load balancing: nkids = 0; farm { for (i=$; ikids[i]) { syncadd( &nkids, 1 ); assert(tree->kids[i]->size > 0); weight[i] = (float)(tree->kids[i]->size); } else weight[i] = (float) 0; } if (P <= nkids) { // not enough processors to balance work distribution: // assign one processor per (nonempty) child. sh counter = 0; #if DEBUG farm pprintf("Case P <= nkids\n"); #endif farm { // self-scheduling loop: for (i=mpadd(&counter,1); ikids[i]) continue; setCoM( tree->kids[i] ); simple_lockup( &lock ); tree->mass += tree->kids[i]->mass; simple_unlock( &lock ); } } counter = 0; farm { // self-scheduling loop: for (i=mpadd(&counter,1); ikids[i]) continue; com0 = tree->kids[i]->com[0] * tree->kids[i]->mass / tree->mass; com1 = tree->kids[i]->com[1] * tree->kids[i]->mass / tree->mass; simple_lockup( &lock ); tree->com[0] += com0; tree->com[1] += com1; simple_unlock( &lock ); } } return; } #if DEBUG seq for (i=0; ikids[i])? tree->kids[i]->size : 0, i, (int)weight[i]); #endif // distribute P processors across MAXDEG subgroups in ratio of weights: #if DEBUG farm pprintf("Case weighted distribution\n"); #endif procs = weightedGroupVector( P, MAXDEG, weight ); // returns the subgroup index of proc. $ in procs[$] fork (MAXDEG; @ = procs[$]; ) { sh int subP = 0; #if DEBUG seq pprintf("@=%d, treekids[@]=%p\n", @, tree->kids[@]); seq assert( tree->kids[@] ); #endif $ = mpadd( &subP, 1 ); // renumber procs in subgroup parSetCoM( tree->kids[@] ); seq { simple_lockup( &(tree->lock) ); tree->mass += tree->kids[@]->mass; simple_unlock( &(tree->lock) ); } } fork (MAXDEG; @ = procs[$]; ) { #if DEBUG seq assert (tree->kids[@]); #endif // update com in ratio subtree[@]->mass / tree->mass: seq { float com0, com1; com0 = tree->kids[@]->com[0] * tree->kids[@]->mass / tree->mass; com1 = tree->kids[@]->com[1] * tree->kids[@]->mass / tree->mass; simple_lockup( &(lock) ); tree->com[0] += com0; tree->com[1] += com1; simple_unlock( &(lock) ); } } } void printTree( pnode tree, int depth ) // recursive routine, graphical output { int i, j; if (depth==0) printf("\n======================TREE:=====================\n"); for (i=0; ipos[0], (int)tree->pos[1], (int)tree->cellwidth, (int)tree->mass, (int)tree->com[0], (int)tree->com[1], (int)tree->size); if (tree->part) { for (i=0; ipart->pos[0], (int)tree->part->pos[1], (int)tree->part->mass); } if (tree->kids) { for (i=0; ikids[i]) { // printf(" visit child %d:\n", i); printTree( tree->kids[i], depth + 1 ); } else { for (j=0; j<=depth; j++) printf("| "); printf("Empty child %d\n", i); } } if (depth==0) printf("================END OF TREE=====================\n"); } void plotTree( pnode tree ) // recursive routine, graphical output { int i; if (tree->part) { setFilledSquare( (int) tree->part->pos[0], (int) tree->part->pos[1], (int) sqrt(tree->part->mass), 0xff0000 ); if (tree->part->force) // draw force vector: seq_line( (int)tree->part->pos[0], (int)tree->part->pos[1], (int)(tree->part->pos[0]+tree->part->force[0]), (int)(tree->part->pos[1]+tree->part->force[1]), 0xffffff, 1 ); } if (tree->kids) { seq_line( (int) tree->pos[0]+0.5*tree->cellwidth, (int) tree->pos[1], (int) tree->pos[0]+0.5*tree->cellwidth, (int) tree->pos[1]+tree->cellwidth, 0xffff, 1 ); seq_line( (int) tree->pos[0], (int) tree->pos[1]+0.5*tree->cellwidth, (int) tree->pos[0]+tree->cellwidth, (int) tree->pos[1]+0.5*tree->cellwidth, 0xffff, 1 ); for (i=0; ikids[i]) { plotTree( tree->kids[i] ); } setSquareFrame( (int) tree->com[0], (int) tree->com[1], (int) sqrt(tree->mass), 0xff00ff ); } } sync pnode buildTree( particle* Parray ) { int i; sh int iter = 1; sh pnode T; sh P = groupsize(); seq { T = newNode( newPos2( 0.0, 0.0 ), (float)MAXSIZE, Parray[0] ); } #if (DEBUG) seq printTree( T, 0 ); #endif // asynchronous concurrent insertion of particles 1,...,NBODY-1: //seq // sequential construction // for (i=1; ipos, pj->pos ); #if DEBUG pprintf("distance(%p,%p): %d\n", P, pj, (int) r ); #endif if (r < EPSILON) { // positions identical or very close #if DEBUG pprintf("r < EPSILON\n"); #endif r = EPSILON; // necessary to avoid div by zero } if (r > MAXDIST) continue; // ignore - contribution is too small f[0] += pj->mass * ( pj->pos[0] - P->pos[0] ) / (r*r*r); f[1] += pj->mass * ( pj->pos[1] - P->pos[1] ) / (r*r*r); } f[0] *= GAMMA * P->mass; // loop-invariant factors hoisted out f[1] *= GAMMA * P->mass; #if DEBUG pprintf("Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] ); #endif return f; } sync position parAllForce( sh particle P ) { // O(n) force calculation for each particle, for comparison to treeForce() sh int count = 0; sh simple_lock grouplock0 = 0, grouplock1 = 0; sh position f; sh int p = groupsize(); position myf; int j; float r; if (p == 1) return allForce( P ); seq { f = newPos2( 0.0, 0.0 ); } farm { myf = newPos2( 0.0, 0.0 ); for (j=mpadd(&count,1); jpos, pj->pos ); #if DEBUG pprintf("distance(%p,%p): %d\n", P, pj, (int) r ); #endif if (r < EPSILON) { // positions identical or very close #if DEBUG pprintf("r < EPSILON\n"); #endif r = EPSILON; // necessary to avoid div by zero } if (r > MAXDIST) continue; // ignore - contribution is too small myf[0] += pj->mass * ( pj->pos[0] - P->pos[0] ) / (r*r*r); myf[1] += pj->mass * ( pj->pos[1] - P->pos[1] ) / (r*r*r); } simple_lockup( &grouplock0 ); f[0] += myf[0]; simple_unlock( &grouplock0 ); simple_lockup( &grouplock1 ); f[1] += myf[1]; simple_unlock( &grouplock1 ); } f[0] *= GAMMA * P->mass; // loop-invariant factors hoisted out f[1] *= GAMMA * P->mass; #if DEBUG seq pprintf("Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] ); #endif return f; } position treeForce( pnode T, particle P ) { int j; float r; position f = newPos2( 0.0, 0.0 ); if (! T->kids ) { // T contains only one particle assert( T->part ); assert( T->com ); r = dist2( T->com, P->pos ); if (r < EPSILON) { // positions identical or very close #if DEBUG pprintf("r < EPSILON\n"); #endif r = EPSILON; // necessary to avoid div by zero } if (r > MAXDIST) return f; // ignore - contribution is too small f[0] = T->mass * ( T->com[0] - P->pos[0] ) / (r*r*r) * GAMMA * P->mass; f[1] = T->mass * ( T->com[1] - P->pos[1] ) / (r*r*r) * GAMMA * P->mass; #if DEBUG pprintf("Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] ); #endif return f; } // otherwise: assert(! T->part ); assert( T->com ); r = dist2( T->com, P->pos ); if (r < EPSILON) { // positions identical or very close #if DEBUG pprintf("r < EPSILON:\n"); #endif r = EPSILON; // necessary to avoid div by zero } if (r > MAXDIST) return f; // ignore - contribution is too small if ( T->cellwidth < r * THETA ) { #if DEBUG pprintf("treeForce: relative distance from P is far enough, use com\n"); #endif f[0] = T->mass * ( T->com[0] - P->pos[0] ) / (r*r*r) * GAMMA * P->mass; f[1] = T->mass * ( T->com[1] - P->pos[1] ) / (r*r*r) * GAMMA * P->mass; #if DEBUG pprintf("Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] ); #endif return f; } else { // remains the case that the children must be summed up: int i; for (i=0; ikids[i]) { position pj; #if DEBUG pprintf("Recursive call %d\n", i); #endif pj = treeForce( T->kids[i], P ); f[0] += pj[0]; f[1] += pj[1]; } } #if DEBUG pprintf("Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] ); #endif return f; } } sync position parTreeForce( sh pnode tree, particle P ) // parallel DC { int i; sh float weight[MAXDEG]; sh float r; sh int *procs; sh int nprocs = groupsize(); sh int nkids; sh int haskids = (int) tree->kids; sh position f; sh int faraway; // flag sh simple_lock lock = 0; #if DEBUG farm pprintf("parTreeForce( %p, (%d,%d) )\n", tree, (int)P->pos[0], (int)P->pos[1] ); #endif if (nprocs==1) { farm f = treeForce( tree, P ); return f; } if (!haskids) { // tree is a leaf: case handled in treeForce() seq f = treeForce( tree, P ); return f; } // tree is an inner node. seq f = newPos2( 0.0, 0.0 ); farm r = dist2( tree->com, P->pos ); if (r < EPSILON) { // positions identical or very close #if DEBUG seq pprintf("r < EPSILON\n"); #endif r = EPSILON; // necessary to avoid div by zero } if (r > MAXDIST) return f; // ignore - contribution is too small faraway = (tree->cellwidth < r * THETA); if ( faraway ) { #if DEBUG seq pprintf("parTreeForce: relative distance from P is far enough, use com\n"); #endif f[0] = tree->mass * ( tree->com[0] - P->pos[0] ) / (r*r*r) * GAMMA * P->mass; f[1] = tree->mass * ( tree->com[1] - P->pos[1] ) / (r*r*r) * GAMMA * P->mass; #if DEBUG seq pprintf("ComForce[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] ); #endif return f; } // remains the case that the children must be summed up: // compute the weights of the subtrees for load balancing: nkids = 0; farm { for (i=$; ikids[i]) { syncadd( &nkids, 1 ); assert(tree->kids[i]->size > 0); weight[i] = (float)(tree->kids[i]->size); } else weight[i] = (float) 0; } if (nprocs <= nkids) { // not enough processors to balance work distribution: // assign one processor per (nonempty) child. sh counter = 0; #if DEBUG farm pprintf("parTreeForce: Case nprocs <= nkids\n"); #endif farm { // self-scheduling loop: for (i=mpadd(&counter,1); ikids[i]) continue; pj = treeForce( tree->kids[i], P ); simple_lockup( &lock ); f[0] += pj[0]; f[1] += pj[1]; simple_unlock( &lock ); } } #if DEBUG seq pprintf("RecForce[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] ); #endif return f; } // remains the massively parallel case: nprocs > nkids #if DEBUG seq for (i=0; ikids[i])? tree->kids[i]->size : 0, i, (int)weight[i]); #endif // distribute nprocs processors across MAXDEG subgroups in ratio of weights: #if DEBUG farm pprintf("parTreeForce(): Case weighted distribution\n"); #endif procs = weightedGroupVector( nprocs, MAXDEG, weight ); // returns the subgroup index of proc. $ in procs[$] fork (MAXDEG; @ = procs[$]; ) { sh int subnprocs = 0; sh position pj; #if DEBUG seq pprintf("@=%d, treekids[@]=%p\n", @, tree->kids[@]); seq assert( tree->kids[@] ); #endif $ = mpadd( &subnprocs, 1 ); // renumber procs in subgroup pj = parTreeForce( tree->kids[@], P ); seq { simple_lockup( &(tree->lock) ); f[0] += pj[0]; f[1] += pj[1]; simple_unlock( &(tree->lock) ); } } #if DEBUG seq pprintf("Rec2Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] ); #endif return f; } void main( void ) { int i, ctime, stime, ftime, btime; start { sh int counter = 0; sh int nprocs = groupsize(); seq printf("\nBARNES-HUT HIERARCHICAL N-BODY SIMULATION IN FORK95\n\n"); #if GRAPHICS seq printf("Clearing picture...\n"); init_pict( PIXELS+1, PIXELS+1 ); clear_pixels( 1 ); #endif seq { printf("\nEnter number of particles (<%d): ", MAXNBODY); scanf( "%d", &NBODY ); } seq printf("Generating %d random particles...\n", NBODY ); seq { for (i=0; ipos[0], (int) particles[i]->pos[1], (int) particles[i]->mass); #endif } } farm ctime = getct(); tree = buildTree( particles ); farm ctime = getct() - ctime; seq printf("Computing mass centers...\n"); seq setCoM( tree ); // sequential computation // parSetCoM( tree ); farm stime = getct() - ctime; #if DEBUG seq printTree( tree, 0 ); #endif seq printf("Computing forces O(n^2)...\n"); farm ftime = getct(); // O(n^2) algorithm: if (nprocs <= NBODY) farm for (i=mpadd(&counter,1); iforce = allForce( particles[i] ); else { // exploit massive parallelism: fork( NBODY; @ = $ % NBODY; $=$ / NBODY ) particles[@]->force = parAllForce( particles[@] ); } farm ftime = getct() - ftime; seq if (PRINTOUT) for(i=0; iforce[0], (int) particles[i]->force[1] ); seq printf("Barnes-Hut O(n log n) force calculation algorithm ...\n"); farm btime = getct(); if (nprocs <= NBODY) { // use seq. Barnes-Hut on each particle counter = 0; farm for (i=mpadd(&counter,1); iforce = allForce( particles[i] ); particles[i]->force = treeForce( tree, particles[i] ); } else { // exploit massive parallelism: fork( NBODY; @ = $ % NBODY; $=$ / NBODY ) particles[@]->force = parTreeForce( tree, particles[@] ); } farm btime = getct() - btime; seq if (PRINTOUT) for(i=0; iforce[0], (int) particles[i]->force[1] ); seq printf("\n#Bodies: %d #Processors: %d THETA=%f\n", NBODY, nprocs, THETA ); seq printf("Tree construction time: %d ms\n", ctime>>8 ); seq printf("Computation of mass centers: %d ms\n", stime>>8 ); seq printf("O(n*n/p) Computation of forces: %d ms\n", ftime>>8 ); seq printf("O(n/p log n) Computation of forces: %d ms\n", btime>>8 ); seq printf("Total number of cells: %d\n\n", tree->size ); #if GRAPHICS seq { printf("\nwriting tree to x-pixmap file `NBODY' ...\n"); plotTree( tree ); } write_pixmap( "NBODY" ); #endif seq { printf("press to exit program:"); scanf("%c", &ctime); } } }