/*  fastprefix.c   by C.W.Kessler 01/95
 *  general integer multiprefix-ADD implementation in Fork95
 *
 *  straightforward nonrecursive algorithm
 *  takes O(n/p) time on p-processor SB-PRAM 
 *                       with built-in MPADD-operator running in O(1).
 *  Only one additional shared memory cell required (proposed by J. Roehrig).
 *  This is optimal.
 *
 *  n must no longer be a power of the number of processors p
 *
 *  Condition: group-relative Processor ID's '$' must be consecutively
 *             numbered from 0 to groupsize - 1.
 */

#include <fork.h>
#include <syscall.h>
#include <io.h>

extern sync void output_array( sh int*, sh int );
extern sync void parallel_prefix_add( sh int*, sh int, sh int *, sh int );

sh int *a, *b;
sh int n = 100;


void main( void ) {
 pr int i;
 start {
   a = (int *) shalloc(n);
   b = (int *) shalloc(n);
   /* preset the input array: */
   seq  prS("\nSource Array:\n");
   farm for (i=$; i<n; i+= __STARTED_PROCS__)  a[i] = 1; 
   /* initialize the output array: */
   output_array( a, n );                  /*print the original array*/
   parallel_prefix_add( a, n, b, 0 ); 
   seq  prS("\nParallel Prefix Array:\n");
   output_array( b, n );                  /*print the resulting array*/
   seq  printAccStat();
 }
}


sync void parallel_prefix_add(
  sh int *in,     /* operand array, length n */
  sh int n,       /* problem size */
  sh int *out,    /* result array, length n */
  sh int initsum) /* global offset on parallel prefix computation */
{
  sh int p = groupsize();
  sh int sum = initsum;                       /*temp. accumulator cell */
  pr int i;

  for (i=$; i<n; i+=p)           /*step over n/p slices of entire array*/
     out[i] = mpadd( &sum, in[i] );         /* handle one slice of in[]*/
}
 

sync void output_array (
  sh int *arr,    /* the array to print out */
  sh int n )      /* length of arr */
{
  pr int i;
  seq {
    for (i=0; i<n; i++)
       { prI( arr[i], 0 ); write(1,"  ",2); }
    write(1,"\n\n",2);
  }
}
