/* HIERARCHICAL N-BODY implementation of Barnes-Hut'86 in Fork95 */
/* 03/99 Christoph W. Kessler */

#include <fork.h>
#include <io.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include <graphic.h>

#define GRAPHICS 0    /* 1=graphical output, 0=only text output */
#define DEBUG 0       /* 1=debugging output, 0=silent mode*/

#define DIM 2         /* 2=quadtree, 3=octree */
#define MAXDEG 4      /* 4=quadtree, 8=octree */

#define MAXNBODY 500
sh int NBODY = 50;    /* number of particles, default value */

#define EPSILON 0.0001 /* distance very close -> consider as one particle */

#define GAMMA 20.0     /* gravitational constant
                        * (here: dummy value for nice graphical output) */

#define PIXELS 512      /* extents of generated X-Pixmap picture
                         * also used as x/y size of the universe */

#define THETA 1.0       /* cellwidth/distance threshold for considering the 
                         * mass center instead of all nodes in a subtree */

#define MAXSIZE PIXELS  /* size of universe, must be a power of 2 */

#define MAXDIST (0.8*MAXSIZE) /* very far away -> no significant contribution */

#define PRINTOUT (NBODY<=40)  /* omit lengthy output of computed force vectors */

typedef float* position;

typedef struct part {       /* data structure for a particle */
   position pos;            /* left upper corner of this cell/particle */
   float mass;              /* mass of particle  */
   position force;          /* accumulated force computed for a time step */
   /*position speed;*/
   /*position acceleration;*/
} *particle;

typedef struct treenode {
   struct treenode **kids;  /* up to 8 children */
                            /* if NULL, this node is a leaf (1 partcl or empty)*/
   float cellwidth;         /* cell width */
   particle part;           /* if not NULL, points to a particle */
   position pos;            /* left upper corner of this cell/particle */
   position com;            /* position center of mass */
   float mass;              /* pseudo-mass of com */
   int size;                /* Number of inner nodes in the subtree */
   simple_lock lock;        /* protection for concurrent insertion */
} node, *pnode;


sh pnode tree;                /* the tree, with pointers to particles */

sh particle particles[MAXNBODY]; /* the particles */


position newPos2( float x, float y )
{
  int i;
  position q = (position) shmalloc( 2 * sizeof(float));
  q[0] = x; 
  q[1] = y;  
  return q;
}


particle newParticle( position pos, float m ) 
{
  particle p = (particle) shmalloc( sizeof( struct part ));
  p->mass = m;
  p->pos = pos;
  p->force = NULL;
  return p;
}


pnode newNode( position pos, float cwidth, particle p )
{
  int i;
  pnode n = (pnode) shmalloc( sizeof(node));
#if DEBUG
  pprintf("newNode() pos(%d,%d), width=%d, particle=%p\n",
          (int)pos[0], (int)pos[1], (int)cwidth, p );
#endif
  n->kids = NULL;   // set only in treeInsert()
  n->pos = pos;
  n->cellwidth = cwidth;
  n->mass = 0.0; 
  n->part = p;
  assert(p);
  if (p) n->size = 1;
  else   n->size = 0;
  n->lock = 0;
  return n;
}


int getKidIndex( position pos1, float width, position pos2 )
{
 // determine which quarter of the cell (pos1, width) 
 // the position pos2 is located in.
 float x1 = pos1[0];  
 float x2 = pos2[0];  
 float y1 = pos1[1];  
 float y2 = pos2[1];  
#if DEBUG
 pprintf("x1=%d x2=%d y1=%d y2=%d width=%d\n",
         (int)x1,(int)x2,(int)y1,(int)y2, (int)width );
 assert( x2 >= x1 );
 assert( x2 <= x1+width );
 assert( y2 >= y1 );
 assert( y2 <= y1+width );
#endif
 if (x2 < x1 + 0.5*width)
   if (y2 < y1 + 0.5 * width)   return 0;
   else                         return 2;
 else
   if (y2 < y1 + 0.5 * width)   return 1;
   else                         return 3;
}


int treeInsert( pnode T, particle n )
// returns number of newly generated cells:
{
#if DEBUG
  pprintf("treeInsert ( width %d, particle %p=(%d,%d) )\n",
          (int)T->cellwidth, n->pos, (int)n->pos[0], (int)n->pos[1]);
#endif
  if (T->kids) {   
     int c;
     INNER_NODE:
     // T is an inner node (has kids). 
     // Once T becomes an inner node, it remains so forever.
     // Now determine which child c of node T the new particle n lies in:
     // The entries ->pos and ->cellwidth are constant (set in newNode only).
     c = getKidIndex( T->pos, T->cellwidth, n->pos );
     if (T->kids[c]) {
        int ncells;
        KIDS_C_EXISTS:
        // T->kids[c], once set to nonzero, will remain constant forever.
        // Hence, T needs not be locked in this case.
        ncells = treeInsert( T->kids[c], n );
        syncadd( &(T->size), ncells );  // is atomic, no locking required.
        return ncells;   // contributions by concurrent treeInsert()s are
                         // taken into consideration separately.
     }
     else {
        // T->kids[c] may be concurrently set to nonzero by different
        // processors. Hence, locking is required.
        // In order to improve performance, the newNode() call is hoisted
        // out of the critical section.
        pnode nn = newNode( newPos2( T->pos[0] + 0.5 * (c&1)*T->cellwidth,
                                     T->pos[1] + 0.5 * (float)((c&2)>>1)
                                                     * T->cellwidth ),
                            0.5*T->cellwidth,  n );
        simple_lockup( &(T->lock) );
#if DEBUG
        pprintf(" P%d has(1) lock for %p\n", $, T );
#endif
        if (T->kids[c]) {  // retest: I was not the first processor
                           //         to enter the critical section
#if DEBUG
            pprintf(" P%d frees(1) lock for %p\n", $, T );
#endif
            simple_unlock( &(T->lock) );
            shfree( nn );
            goto KIDS_C_EXISTS;
        }
        else {  // set T->kids[c]:
            T->kids[c] = nn;
#if DEBUG
            pprintf(" P%d frees(2) lock for %p\n", $, T );
#endif
            simple_unlock( &(T->lock) ); 
            syncadd( &(T->size), 1 );  // is atomic anyway.
            return 1;
        }
     }
  }
  else {
    // leaf node => T->part is nonzero. Split T:
    // computations depending only on invariant values T->pos, 
    // T->cellwidth, n, are hoisted out of the critical section.
    particle p;
    pnode n1;
    int c1, c2;
    int i;
    c1 = getKidIndex( T->pos, T->cellwidth, n->pos );
    n1 = newNode( newPos2( T->pos[0] + 0.5 * (float)(c1&1)*T->cellwidth,
                           T->pos[1] + 0.5 * (float)((c1&2)>>1)*T->cellwidth ),
                  0.5*T->cellwidth,
                  n);
    // setting T->kids is a critical section:
    simple_lockup( &(T->lock) );
#if DEBUG
    pprintf(" P%d has(3) lock for %p\n", $, T );
#endif
    if (!T->part) {
       // another processor was faster than me and set T->part to 0.
       // hence, now after that processor finishes, T is no longer a leaf node.
       // jump to the case "inner node":
#if DEBUG
       assert( T->kids );
#endif
       simple_unlock( &(T->lock) );
       goto INNER_NODE;
    }
    p = T->part;
    c2 = getKidIndex( T->pos, T->cellwidth, p->pos );
      
    T->part = NULL;
    T->kids = (pnode*) shmalloc( MAXDEG * sizeof(pnode));
    for (i=0; i<MAXDEG; i++)
       T->kids[i] = NULL;
    T->kids[c1] = n1;
    if (c1 != c2) {  // n and p go to different subcells:
      T->kids[c2] = newNode(
                    newPos2( T->pos[0] + 0.5 * (float)(c2&1)*T->cellwidth,
                             T->pos[1] + 0.5 * (float)((c2&2)>>1)*T->cellwidth ),
                    0.5*T->cellwidth,
                    p);
#if DEBUG
      pprintf(" P%d frees(3) lock for %p\n", $, T );
#endif
      simple_unlock( &(T->lock) );
      // size increment by c1 updated together with contribution by kids[c2]
      syncadd( &(T->size), 2 ); 
      return 2; 
    }
    else {
       int ncells;
#if DEBUG
       pprintf(" P%d frees(4) lock for %p\n", $, T );
#endif
       simple_unlock( &(T->lock) );   // T->kids[c2], once set, exists forever 
       ncells = treeInsert( T->kids[c2], p );
       syncadd( &(T->size), ncells + 1 );
       return ncells + 1;  /* one cell for c1 and the others by c2 */
    }
  } 
}


float frandom(void) 
{ 
  float f;
  int c2 = rand(), c = rand();
  c = c&(MAXSIZE-1);
  c2 = c2&(511);
  f = itof( c ) + (itof(c2) * 0.0019);
  return f;
}


particle randomParticle( void )
{
  particle p;
  position pos = newPos2( frandom(), frandom() );
  p = newParticle( pos, 1.0+(frandom()*0.0001*(float)MAXSIZE));
  return p;
}


async void setFilledSquare( int x, int y, int width, int color )
{
 int i, j;
 for (i=1-width; i<=width; i++)
   for (j=1-width; j<=width; j++)
     set_pixel( x+i, /*PIXELS*/ y+j, color );
}


async void setSquareFrame( int x, int y, int width, int color )
{
 int i, j;
 for (i=1-width; i<=width; i++) {
    set_pixel( x+i, y+1-width, color );
    set_pixel( x+i, y+width, color );
 }
 for (j=1-width; j<=width; j++) {
    set_pixel( x+1-width, y+j, color );
    set_pixel( x+width, y+j, color );
 }
}


void setCoM( pnode tree )   // recursive routine, computer center of mass
{
  int i;

#if DEBUG
  pprintf(" setCoM( %p )\n", tree );
#endif
  tree->mass = 0.0;
  if (tree->part) {
     tree->mass = tree->part->mass;
     tree->com = tree->part->pos;
     return;
  }
  if (tree->kids) {
     tree->com = newPos2( 0.0, 0.0 );
     for (i=0; i<MAXDEG; i++)
       if (tree->kids[i]) {
          setCoM( tree->kids[i] );
          tree->mass += tree->kids[i]->mass;
       }
     for (i=0; i<MAXDEG; i++)
       if (tree->kids[i]) {
          // update com in ratio subtree[i]->mass / tree->mass:
          tree->com[0] += tree->kids[i]->com[0] * tree->kids[i]->mass / tree->mass;
          tree->com[1] += tree->kids[i]->com[1] * tree->kids[i]->mass / tree->mass;
       }
   }
}


sync int *weightedGroupVector( sh int p, sh int n, sh float *work )
{
  // a simple but working sequential solution.
  // may be superior to the parallel variant if n is very close to p.
  //
  // work is an array of length n holding positive float values.
  // n < p.
  // Compute an int array ret[0:p-1] holding group indices ret[i]
  // in the range 0,...,n-1
  // with n_j := sum_{i, ret[i]==c_j} , j=0,...,n-1, such that
  // (1) n_j > 0 iff work[j] > 0
  // (2) n_j / \sum_{k} n_k approximately equal to work[j].
 
  sh int *ret, count;
  int i, j;


  seq {
    ret = (int *)shmalloc( p * sizeof( int ));
    for (i=0, j=0; i<p; ) {  // loop over seats
      if (j>=n)  j=0;
      if (work[j] > 0.0) {
         ret[i++] = j;    // proc i will work for subgroup j
         //pprintf("Proz %d von %d fuer Untergruppe %d von %d, Arbeit %d\n",
         //        i, p, j, n, (int) work[j]);
      }
      j++;
    }
  }
  return ret;

/* This parallel variant is still buggy:
  if ($<n) {    // note that n<p
    count = 0;
    while (count < p) {
      if (work[$] > 0.0) {
         i = mpadd(&count, 1);
         ret[i] = $;
         //pprintf("Proz %d von %d fuer Untergruppe %d von %d, Arbeit %d\n",
         //         i, p, $, n, (int) work[$]);
      }
    }
  }
  return ret;
 */
}

 
sync int *par_weightedGroupVector( sh int p, sh int n, sh float *work )
{
  // work is an array of length n holding positive float values.
  // n < p.
  // Compute an int array ret[0:p-1] holding group indices ret[i]
  // in the range 0,...,n
  // with n_j := sum_{i, ret[i]==c_j} , j=0,...,n-1, such that
  // (1) n_j > 0 iff work[j] > 0
  // (2) n_j / \sum_{k} n_k approximately equal to work[j].

  // Note that this problem is related to the distribution
  // of p seats in a parliament among n parties with votes for
  // party i being stored in work[i], where each party with at
  // least one vote is guaranteed to have a seat in parliament.
 
  // We use a method adapted from that by Hare-Niemeyer.

  sh int *ret;
  int i, j;
  sh int nprocs = groupsize();
  sh float sum = 0.0;
#define MAXSEATS MAXDEG
  sh int seats[MAXSEATS]; 
  sh int pre[MAXSEATS]; 
  sh int sumseats = 0;

  seq assert( nprocs == p );  // for this particular application

  seq ret = (int *)shmalloc( p * sizeof( int ));

  // first: compute the sum of all work[] entries
  seq
     for (i=0; i<n; i++) {
        assert( work[i] >= 0.0 );
        sum += work[i];
     }
#if DEBUG
  seq pprintf("(int)work sum: %d\n", (int) sum );
#endif
  // now, each party gets at most one seat more than it
  // should have; each party with nonzero vote gets at least one:
  farm
   for (i=$; i<n; i+=nprocs)
     if (work[i] > 0.0) {
        seats[i] = (int) ( (float)p * work[i] / sum ) + 1;
        syncadd( &sumseats, seats[i] );
     }
     else
        seats[i] = 0;
  
  // now sumseats is in the range p...p+N, where N = #nonzero parties
  // correct the result by taking away up to N seats:
#if DEBUG
  seq assert( sumseats >= p );
  seq pprintf("sumseats: %d\n", sumseats); 
  seq assert( sumseats <= p+n );
#endif
  seq {
   while (sumseats > p) {
#if 0
    pprintf("while iteration\n");
#endif
    for (i=0; i<n; i++) {
      if( seats[i]<=1 ) continue; // can't take away a seat from a near-zero party
      sumseats --;
      seats[i] --;
      if (sumseats == p) break;
    }
    // assert( sumseats == p );
    if (sumseats != p) {   // debug:
      pprintf("sumseats=%d but p=%d:\n", sumseats, p );
      for (i=0; i<n; i++) 
        pprintf("seats[%d] = %d of %d\n", i, seats[i], sumseats);
    }
   }
  }
  // now seats are allocated. Compute prefix vector in parallel:
  sumseats = 0;
  for (i=$; i<n; i+=nprocs)
     pre[i] = mpadd( &sumseats, seats[i] );
#if DEBUG
  seq assert( sumseats == p );
#endif
  // From the prefix vector, compute the result vector 
  // (seat assignment) in parallel:
  farm
    for (i=$; i<n; i+=nprocs) {  // each party "numbers" its seats:
#if DEBUG
      pprintf("set: (%d:%d) -> %d\n", pre[i], pre[i]+seats[i], i );
#endif
      for (j=pre[i]; j<pre[i]+seats[i]; j++) {
         ret[j] = i;
      }
    }
#if DEBUG
  seq for (i=0; i<p; i++)  pprintf("ret[%d] = %d\n", i, ret[i]);
#endif
  return ret;
}


sync void parSetCoM( sh pnode tree )  // parallel DC variant of setCoM()
{
  int i;
  sh float weight[MAXDEG];
  sh int *procs;
  sh int P = groupsize();
  sh int nkids;
  sh int haskids = (int) tree->kids;
  sh simple_lock lock = 0;
#if DEBUG
  farm pprintf(" parSetCoM( %p )\n", tree );
#endif
  if (P==1) { 
      farm setCoM( tree ); 
      return; 
  }
  if (!haskids) {  // tree is a leaf:
      seq setCoM( tree );
      return;
  }
  tree->mass = 0.0;
#if DEBUG
  seq assert( tree->part==0 );
#endif
  seq tree->com = newPos2( 0.0, 0.0 );
  // compute the weights of the subtrees for load balancing:
  nkids = 0;
  farm {
     for (i=$; i<MAXDEG; i+=P) 
        if (tree->kids[i]) {
             syncadd( &nkids, 1 );
             assert(tree->kids[i]->size > 0);
             weight[i] = (float)(tree->kids[i]->size);
        }
        else weight[i] = (float) 0;
  }
  if (P <= nkids) {  // not enough processors to balance work distribution:
     // assign one processor per (nonempty) child.
     sh counter = 0;
#if DEBUG
     farm pprintf("Case P <= nkids\n");
#endif
     farm {  // self-scheduling loop:
        for (i=mpadd(&counter,1); i<MAXDEG; i=mpadd(&counter,1)) {
           if (!tree->kids[i]) continue;
           setCoM( tree->kids[i] );
           simple_lockup( &lock );
             tree->mass += tree->kids[i]->mass;
           simple_unlock( &lock );
        }
     }
     counter = 0;
     farm {  // self-scheduling loop:
        for (i=mpadd(&counter,1); i<MAXDEG; i=mpadd(&counter,1)) {
           float com0, com1;
           if (!tree->kids[i]) continue;
           com0 = tree->kids[i]->com[0] * tree->kids[i]->mass / tree->mass;
           com1 = tree->kids[i]->com[1] * tree->kids[i]->mass / tree->mass;
           simple_lockup( &lock );
             tree->com[0] += com0;
             tree->com[1] += com1;
           simple_unlock( &lock );
        }
     }
     return;
  }
#if DEBUG
  seq for (i=0; i<MAXDEG; i++) pprintf("%p: size=%d weight[%d] = %d\n", 
          tree, (tree->kids[i])? tree->kids[i]->size : 0, i, (int)weight[i]);
#endif
  // distribute P processors across MAXDEG subgroups in ratio of weights:
#if DEBUG
  farm pprintf("Case weighted distribution\n");
#endif
  procs = weightedGroupVector( P, MAXDEG, weight );
  // returns the subgroup index of proc. $ in procs[$]
  fork (MAXDEG; @ = procs[$]; ) {
     sh int subP = 0;
#if DEBUG
     seq pprintf("@=%d, treekids[@]=%p\n", @, tree->kids[@]);
     seq assert( tree->kids[@] );
#endif
     $ = mpadd( &subP, 1 );   // renumber procs in subgroup
     parSetCoM( tree->kids[@] );
     seq {
       simple_lockup( &(tree->lock) );
         tree->mass += tree->kids[@]->mass;
       simple_unlock( &(tree->lock) );
     }
  }
  fork (MAXDEG; @ = procs[$]; ) {
#if DEBUG
     seq assert (tree->kids[@]);
#endif
     // update com in ratio subtree[@]->mass / tree->mass:
     seq {
       float com0, com1;
       com0 = tree->kids[@]->com[0] * tree->kids[@]->mass / tree->mass;
       com1 = tree->kids[@]->com[1] * tree->kids[@]->mass / tree->mass;
       simple_lockup( &(lock) );
         tree->com[0] += com0;
         tree->com[1] += com1;
       simple_unlock( &(lock) );
     }
  }
}

void printTree( pnode tree, int depth ) // recursive routine, graphical output
{
  int i, j;

  if (depth==0) printf("\n======================TREE:=====================\n");
  for (i=0; i<depth; i++) printf("|  ");
  printf("Node at (%d,%d), cellwidth %d mass %d com(%d,%d) size %d\n", 
         (int)tree->pos[0], (int)tree->pos[1], 
         (int)tree->cellwidth, (int)tree->mass, 
         (int)tree->com[0], (int)tree->com[1],
         (int)tree->size);
  if (tree->part) {
      for (i=0; i<depth; i++) printf("|  ");
      printf("     with particle (%d,%d), mass %d\n",
            (int)tree->part->pos[0], (int)tree->part->pos[1],
            (int)tree->part->mass);
  }
  if (tree->kids) {
     for (i=0; i<MAXDEG; i++)
       if (tree->kids[i]) {
          // printf(" visit child %d:\n", i);
          printTree( tree->kids[i], depth + 1 );
       }
       else {
          for (j=0; j<=depth; j++) printf("|  ");
          printf("Empty child %d\n", i);
       }
  }
  if (depth==0) printf("================END OF TREE=====================\n");
}


void plotTree( pnode tree )   // recursive routine, graphical output
{
  int i;

  if (tree->part) {
     setFilledSquare( (int) tree->part->pos[0], (int) tree->part->pos[1],
                      (int) sqrt(tree->part->mass), 0xff0000 );
     if (tree->part->force)  // draw force vector:
        seq_line( (int)tree->part->pos[0], (int)tree->part->pos[1],
                  (int)(tree->part->pos[0]+tree->part->force[0]),
                  (int)(tree->part->pos[1]+tree->part->force[1]), 0xffffff, 1 );
  }
  if (tree->kids) {
     seq_line( (int) tree->pos[0]+0.5*tree->cellwidth, 
           (int) tree->pos[1],
           (int) tree->pos[0]+0.5*tree->cellwidth,
           (int) tree->pos[1]+tree->cellwidth,
           0xffff, 1 );
     seq_line( (int) tree->pos[0],
           (int) tree->pos[1]+0.5*tree->cellwidth,
           (int) tree->pos[0]+tree->cellwidth, 
           (int) tree->pos[1]+0.5*tree->cellwidth,
           0xffff, 1 );
     for (i=0; i<MAXDEG; i++)
       if (tree->kids[i]) {
          plotTree( tree->kids[i] );
       }
     setSquareFrame( (int) tree->com[0], (int) tree->com[1],
                     (int) sqrt(tree->mass), 0xff00ff );
   }
}


sync pnode buildTree( particle* Parray )
{
  int i;
  sh int iter = 1;
  sh pnode T;
  sh P = groupsize();

  seq {
     T = newNode( newPos2( 0.0, 0.0 ), (float)MAXSIZE, Parray[0] );
  }
#if (DEBUG)
  seq printTree( T, 0 );
#endif

  // asynchronous concurrent insertion of particles 1,...,NBODY-1:
  //seq   // sequential construction
  //  for (i=1; i<NBODY; i++) {
  farm   // parallel construction
    for (i=mpadd(&iter,1); i<NBODY; i=mpadd(&iter,1)) {
#if DEBUG
      pprintf("--- now insert particle %p\n", Parray[i] );
#endif
      treeInsert( T, Parray[i] );
    }
  return T;
}


float dist2( position pi, position pj )    // Euclidean distance
{
  float dx = pi[0]-pj[0];
  float dy = pi[1]-pj[1];
  //pprintf("dist(dx=%d  dy=%d) = %d\n", (int) dx, (int) dy, (int) d );
  return sqrt( dx*dx + dy*dy );
}


position allForce( particle P )
{
  // O(n) force calculation for each particle, for comparison to treeForce()
  int j;
  float r;
  position f = newPos2( 0.0, 0.0 );
  for (j=0; j<NBODY; j++) {
     particle pj = particles[j];
     if (pj==P) continue;
     r = dist2( P->pos, pj->pos );
#if DEBUG
     pprintf("distance(%p,%p): %d\n", P, pj, (int) r );
#endif
     if (r < EPSILON) {  // positions identical or very close
#if DEBUG
        pprintf("r < EPSILON\n");
#endif
        r = EPSILON; // necessary to avoid div by zero
     }
     if (r > MAXDIST)
        continue;   // ignore - contribution is too small
     f[0] += pj->mass * ( pj->pos[0] - P->pos[0] ) / (r*r*r);
     f[1] += pj->mass * ( pj->pos[1] - P->pos[1] ) / (r*r*r);
  }
  f[0] *= GAMMA * P->mass;   // loop-invariant factors hoisted out
  f[1] *= GAMMA * P->mass;
#if DEBUG
  pprintf("Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] );
#endif
  return f;
}


sync position parAllForce( sh particle P )
{
  // O(n) force calculation for each particle, for comparison to treeForce()
  sh int count = 0;
  sh simple_lock grouplock0 = 0, grouplock1 = 0;
  sh position f;
  sh int p = groupsize();
  position myf;
  int j;
  float r;
          
  if (p == 1) return allForce( P );

  seq {
    f = newPos2( 0.0, 0.0 );
  }
  farm  {
    myf = newPos2( 0.0, 0.0 );
    for (j=mpadd(&count,1); j<NBODY; j=mpadd(&count,1)) {
      particle pj = particles[j];
      if (pj==P) continue;
      r = dist2( P->pos, pj->pos );
#if DEBUG
      pprintf("distance(%p,%p): %d\n", P, pj, (int) r );
#endif
      if (r < EPSILON) {  // positions identical or very close
#if DEBUG
        pprintf("r < EPSILON\n");
#endif
        r = EPSILON; // necessary to avoid div by zero
      }
      if (r > MAXDIST)
        continue;   // ignore - contribution is too small
      myf[0] += pj->mass * ( pj->pos[0] - P->pos[0] ) / (r*r*r);
      myf[1] += pj->mass * ( pj->pos[1] - P->pos[1] ) / (r*r*r);
    }
    simple_lockup( &grouplock0 );
      f[0] += myf[0];
    simple_unlock( &grouplock0 );
    simple_lockup( &grouplock1 );
      f[1] += myf[1];
    simple_unlock( &grouplock1 );
  }
  f[0] *= GAMMA * P->mass;   // loop-invariant factors hoisted out
  f[1] *= GAMMA * P->mass;
#if DEBUG
  seq
    pprintf("Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] );
#endif
  return f;
}


position treeForce( pnode T, particle P )
{
  int j;
  float r;
  position f = newPos2( 0.0, 0.0 );
  if (! T->kids ) {  // T contains only one particle 
      assert( T->part );
      assert( T->com );
      r = dist2( T->com, P->pos );     
      if (r < EPSILON) {  // positions identical or very close
#if DEBUG
         pprintf("r < EPSILON\n");
#endif
         r = EPSILON;   // necessary to avoid div by zero
      }
      if (r > MAXDIST)
         return f;   // ignore - contribution is too small
      f[0] = T->mass * ( T->com[0] - P->pos[0] ) / (r*r*r) * GAMMA * P->mass;
      f[1] = T->mass * ( T->com[1] - P->pos[1] ) / (r*r*r) * GAMMA * P->mass;
#if DEBUG
      pprintf("Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] );
#endif
      return f;
  }

  // otherwise: 
  assert(! T->part );
  assert( T->com );
  r = dist2( T->com, P->pos );     
  if (r < EPSILON) {  // positions identical or very close
#if DEBUG
         pprintf("r < EPSILON:\n");
#endif
     r = EPSILON;   // necessary to avoid div by zero
  }
  if (r > MAXDIST)
      return f;   // ignore - contribution is too small
  if ( T->cellwidth < r * THETA ) {
#if DEBUG
      pprintf("treeForce: relative distance from P is far enough, use com\n");
#endif
      f[0] = T->mass * ( T->com[0] - P->pos[0] ) / (r*r*r) * GAMMA * P->mass;
      f[1] = T->mass * ( T->com[1] - P->pos[1] ) / (r*r*r) * GAMMA * P->mass;
#if DEBUG
      pprintf("Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] );
#endif
      return f;
  }
  else {   // remains the case that the children must be summed up:
     int i;
     for (i=0; i<MAXDEG; i++) {
        if (T->kids[i]) {
           position pj;
#if DEBUG
           pprintf("Recursive call %d\n", i);
#endif
           pj = treeForce( T->kids[i], P );
           f[0] += pj[0];
           f[1] += pj[1];
        } 
     }
#if DEBUG
     pprintf("Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] );
#endif
     return f;
  }
}


sync position parTreeForce( sh pnode tree, particle P )    // parallel DC 
{
  int i;
  sh float weight[MAXDEG];
  sh float r;
  sh int *procs;
  sh int nprocs = groupsize();
  sh int nkids;
  sh int haskids = (int) tree->kids;
  sh position f;
  sh int faraway;   // flag
  sh simple_lock lock = 0;
#if DEBUG
  farm pprintf("parTreeForce( %p, (%d,%d) )\n", 
               tree, (int)P->pos[0], (int)P->pos[1] );
#endif
  if (nprocs==1) { 
      farm f = treeForce( tree, P ); 
      return f; 
  }
  if (!haskids) {  // tree is a leaf: case handled in treeForce()
      seq f = treeForce( tree, P );
      return f;
  }
  // tree is an inner node.
  seq f = newPos2( 0.0, 0.0 );
  farm r = dist2( tree->com, P->pos );     
  if (r < EPSILON) {  // positions identical or very close
#if DEBUG
     seq pprintf("r < EPSILON\n");
#endif
     r = EPSILON;   // necessary to avoid div by zero
  }
  if (r > MAXDIST)
      return f;   // ignore - contribution is too small
  faraway = (tree->cellwidth < r * THETA);
  if ( faraway ) {
#if DEBUG
      seq pprintf("parTreeForce: relative distance from P is far enough, use com\n");
#endif
      f[0] = tree->mass * ( tree->com[0] - P->pos[0] ) / (r*r*r) * GAMMA * P->mass;
      f[1] = tree->mass * ( tree->com[1] - P->pos[1] ) / (r*r*r) * GAMMA * P->mass;
#if DEBUG
      seq pprintf("ComForce[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] );
#endif
      return f;
  }

  // remains the case that the children must be summed up:

  // compute the weights of the subtrees for load balancing:
  nkids = 0;
  farm {
     for (i=$; i<MAXDEG; i+=nprocs) 
        if (tree->kids[i]) {
             syncadd( &nkids, 1 );
             assert(tree->kids[i]->size > 0);
             weight[i] = (float)(tree->kids[i]->size);
        }
        else weight[i] = (float) 0;
  }
  if (nprocs <= nkids) {  // not enough processors to balance work distribution:
     // assign one processor per (nonempty) child.
     sh counter = 0;
#if DEBUG
     farm pprintf("parTreeForce: Case nprocs <= nkids\n");
#endif
     farm {  // self-scheduling loop:
        for (i=mpadd(&counter,1); i<MAXDEG; i=mpadd(&counter,1)) {
           position pj;
           if (!tree->kids[i]) continue;
           pj = treeForce( tree->kids[i], P );
           simple_lockup( &lock );
            f[0] += pj[0];
            f[1] += pj[1];
           simple_unlock( &lock );
        }
     }
#if DEBUG
     seq pprintf("RecForce[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] );
#endif 
     return f;
  }

  // remains the massively parallel case: nprocs > nkids
#if DEBUG
  seq for (i=0; i<MAXDEG; i++) pprintf("PTF: %p: size=%d weight[%d] = %d\n", 
          tree, (tree->kids[i])? tree->kids[i]->size : 0, i, (int)weight[i]);
#endif
  // distribute nprocs processors across MAXDEG subgroups in ratio of weights:
#if DEBUG
  farm pprintf("parTreeForce(): Case weighted distribution\n");
#endif
  procs = weightedGroupVector( nprocs, MAXDEG, weight );
  // returns the subgroup index of proc. $ in procs[$]
  fork (MAXDEG; @ = procs[$]; ) {
     sh int subnprocs = 0;
     sh position pj;
#if DEBUG
     seq pprintf("@=%d, treekids[@]=%p\n", @, tree->kids[@]);
     seq assert( tree->kids[@] );
#endif
     $ = mpadd( &subnprocs, 1 );   // renumber procs in subgroup
     pj = parTreeForce( tree->kids[@], P );
     seq {
       simple_lockup( &(tree->lock) );
        f[0] += pj[0];
        f[1] += pj[1];
       simple_unlock( &(tree->lock) );
     }
  }
#if DEBUG
  seq pprintf("Rec2Force[%p] = (%d,%d)\n", P, (int)f[0], (int)f[1] );
#endif 
  return f;
}


void main( void )
{
  int i, ctime, stime, ftime, btime;
  start {
    sh int counter = 0;
    sh int nprocs = groupsize();

    seq printf("\nBARNES-HUT HIERARCHICAL N-BODY SIMULATION IN FORK95\n\n");
#if GRAPHICS
    seq printf("Clearing picture...\n");
    init_pict( PIXELS+1, PIXELS+1 );
    clear_pixels( 1 );
#endif
    seq {
      printf("\nEnter number of particles (<%d): ", MAXNBODY);
      scanf( "%d", &NBODY );
    }
    seq printf("Generating %d random particles...\n", NBODY );
    seq {
      for (i=0; i<NBODY; i++) {
        particles[i] = randomParticle();
#if DEBUG
        printf("new particle (%d,%d), mass %d\n",
                (int) particles[i]->pos[0], 
                (int) particles[i]->pos[1], (int) particles[i]->mass);
#endif
      }
    }           
    farm ctime = getct();
    tree = buildTree( particles );
    farm ctime = getct() - ctime;
    seq printf("Computing mass centers...\n");
    seq setCoM( tree );   // sequential computation
    // parSetCoM( tree );
    farm stime = getct() - ctime;
#if DEBUG
    seq printTree( tree, 0 );
#endif
    seq printf("Computing forces O(n^2)...\n");
    farm ftime = getct();
    // O(n^2) algorithm:
    if (nprocs <= NBODY) 
       farm
         for (i=mpadd(&counter,1); i<NBODY; i=mpadd(&counter,1)) 
           particles[i]->force = allForce( particles[i] );
    else { // exploit massive parallelism:
       fork( NBODY; @ = $ % NBODY; $=$ / NBODY )
          particles[@]->force = parAllForce( particles[@] );
    }
    farm ftime = getct() - ftime;
     
    seq
      if (PRINTOUT)
       for(i=0; i<NBODY; i++) 
          printf("O(n^2): force[%d] = (%d,%d)\n", i, 
                (int) particles[i]->force[0], (int) particles[i]->force[1] );

    seq printf("Barnes-Hut O(n log n) force calculation algorithm ...\n");
    farm btime = getct();
    if (nprocs <= NBODY) {  // use seq. Barnes-Hut on each particle
      counter = 0;
      farm 
        for (i=mpadd(&counter,1); i<NBODY; i=mpadd(&counter,1))
          //particles[i]->force = allForce( particles[i] );
          particles[i]->force = treeForce( tree, particles[i] );
    }
    else { // exploit massive parallelism:
      fork( NBODY; @ = $ % NBODY; $=$ / NBODY )
         particles[@]->force = parTreeForce( tree, particles[@] );
    }
    farm btime = getct() - btime;
     
    seq
      if (PRINTOUT)
       for(i=0; i<NBODY; i++) 
          printf("O(n log n): force[%d] = (%d,%d)\n", i, 
                (int)particles[i]->force[0], (int) particles[i]->force[1] );

    seq printf("\n#Bodies: %d   #Processors: %d   THETA=%f\n", 
                 NBODY, nprocs, THETA );
    seq printf("Tree construction time:             %d ms\n", ctime>>8 );
    seq printf("Computation of mass centers:        %d ms\n", stime>>8 );
    seq printf("O(n*n/p) Computation of forces:     %d ms\n", ftime>>8 );
    seq printf("O(n/p log n) Computation of forces: %d ms\n", btime>>8 );
    seq printf("Total number of cells:              %d\n\n", tree->size );
#if GRAPHICS
    seq {
       printf("\nwriting tree to x-pixmap file `NBODY' ...\n");
       plotTree( tree );
    }
    write_pixmap( "NBODY" );
#endif
    seq {
      printf("press <RETURN> to exit program:");
      scanf("%c", &ctime);
    }
  }
}