SkePU 1.2
tuner.h
#ifndef SKEPU_TUNER
#define SKEPU_TUNER

#include <cstdlib>
#include <iostream>
#include <cassert>
#include <vector>

#include "skepu/map.h"
#include "skepu/reduce.h"
#include "skepu/mapreduce.h"
#include "skepu/mapoverlap.h"
#include "skepu/maparray.h"
#include "skepu/scan.h"

#include "skepu/src/trainer.h"
#include "skepu/src/timer.h"

enum SkeletonType
{
    MAP,
    REDUCE,
    MAPREDUCE,
    SCAN,
    MAPARRAY,
    MAPOVERLAP
};
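// A Tuner is instantiated for a user function and one of the skeleton types
// above; invoking it trains the implementation-selection machinery and fills
// in one or more execution plans. A minimal usage sketch (hypothetical user
// function unary_op; see the MAPREDUCE specialization below for the actual
// constructor shape):
//
//   size_t low[1] = {128};
//   size_t upp[1] = {1048576};
//   skepu::Tuner<unary_op, MAP> tuner("my_map", 1, low, upp);
//   skepu::ExecPlan plans[MAX_EXEC_PLANS];
//   tuner(plans);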

namespace skepu
{

/*!
 * Tuning wrapper: runs the Map skeleton once on the CPU for the given
 * training data and records, per implementation variant, the predicted
 * execution time (modelled communication cost + measured run time).
 */
template <typename StructType, typename StructType2>
void cpu_tune_wrapper_map(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens >= 1 && actDimens <= 4);

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = td->problemSize[0];
        vecArr[i].resize(sizes[i]);
    }

    double commCost[MAX_EXEC_PLANS];
#ifdef SKEPU_CUDA
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    commCost[0] = 0.0;
    double commCostPerOp = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (td->problemSize[0]));
    // the user can specify flag hints for the operands' memory location
    bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
    if(singlePlan)
    {
        assert(nImpls == 1);

        int *memUpFlags = td->extra->memUp;
        int *memDownFlags = td->extra->memDown;
        for(unsigned int i = 0; i < actDimens; ++i)
        {
            if(i < (actDimens - 1) && memUpFlags[i] == 1)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
            else if(i == (actDimens - 1) && memDownFlags[0] == 1)
                commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0]));
        }
    }
    else
    {
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = commCostPerOp;
        }
        if(nImpls > 3)
        {
            commCost[3] = 0.0;
            commCost[4] = commCostPerOp;
            commCost[5] = commCostPerOp * 2;
        }
        if(nImpls > 6)
        {
            commCost[6] = 0.0;
            commCost[7] = commCostPerOp;
            commCost[8] = commCostPerOp * 2;
            commCost[9] = commCostPerOp * 3;
        }
    }
#else
    commCost[0] = 0.0;
    assert(nImpls == 1);
#endif

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::Map<StructType> mapTest(userFunc);

    timer.start();

    if(actDimens == 1)
        mapTest.CPU(vecArr[0]);
    else if(actDimens == 2)
        mapTest.CPU(vecArr[0], vecArr[1]);
    else if(actDimens == 3)
        mapTest.CPU(vecArr[0], vecArr[1], vecArr[2]);
    else if(actDimens == 4)
        mapTest.CPU(vecArr[0], vecArr[1], vecArr[2], vecArr[3]);
    else
        assert(false);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*CPU* map size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}
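// All wrappers assemble their predictions the same way: for variant i,
//   exec_time[i] = commCost[i] + t_measured
// where the cost of moving one N-element operand across the bus is modelled
// linearly from the calibrated bandwidth data, i.e.
//   commCostPerOp = latency + timePerByte * sizeof(T) * N
// (latency_dth/timing_dth for device-to-host transfers, latency_htd/
// timing_htd for host-to-device).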

template <typename StructType, typename StructType2>
void cpu_tune_wrapper_reduce(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens == 1);

    // to ensure that the compiler does not optimize these calls away as retVal is not used anywhere...
    volatile typename StructType::TYPE retVal;

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = ((actDimens != dimens) ? td->problemSize[0] : td->problemSize[i]);
        vecArr[i].resize(sizes[i]);
    }

    double commCost[MAX_EXEC_PLANS];
#ifdef SKEPU_CUDA
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    commCost[0] = 0.0;
    // the user can specify flag hints for operands memory location
    bool singlePlan = (td->extra->memUp != NULL);
    if(singlePlan)
    {
        assert(nImpls == 1);

        int *memUpFlags = td->extra->memUp;
        for(unsigned int i = 0; i < actDimens; ++i)
        {
            if(memUpFlags[i] == 1)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
        }
    }
    else
    {
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (td->problemSize[0]));
        }
    }
#else
    commCost[0] = 0.0;
    assert(nImpls == 1);
#endif

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::Reduce<StructType> redTest(userFunc);

    timer.start();

    retVal = redTest.CPU(vecArr[0]);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*CPU* reduce size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}
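// Interpretation of the variant indices, as implied by the cost assignments
// above: for a CPU run, variant 0 models operands resident in host memory
// (no transfer), variant 1 operands that are also valid on the host, and
// variant 2 operands valid only in device memory, which must first be
// brought down (one device-to-host transfer).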

/*!
 * Tuning wrapper: runs the MapOverlap skeleton once on the CPU for the given
 * training data and records the predicted execution times.
 */
template <typename StructType, typename StructType2>
void cpu_tune_wrapper_mapoverlap(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens >= 1 && actDimens <= 2);

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = ((actDimens != dimens) ? td->problemSize[0] : td->problemSize[i]);
        vecArr[i].resize(sizes[i]);
    }

    double commCost[MAX_EXEC_PLANS];
#ifdef SKEPU_CUDA
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    commCost[0] = 0.0;
    // the user can specify flag hints for the operands' memory location
    bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
    if(singlePlan)
    {
        assert(nImpls == 1);

        int *memUpFlags = td->extra->memUp;
        int *memDownFlags = td->extra->memDown;
        for(unsigned int i = 0; i < actDimens; ++i)
        {
            if(i < (actDimens - 1) && memUpFlags[i] == 1)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
            else if(i == (actDimens - 1) && memDownFlags[0] == 1)
                commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0]));
        }
    }
    else
    {
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (td->problemSize[0]));
        }
    }
#else
    commCost[0] = 0.0;
    assert(nImpls == 1);
#endif

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::MapOverlap<StructType> mapOverTest(userFunc);

    timer.start();

    if(actDimens == 1)
        mapOverTest.CPU(vecArr[0]);
    else if(actDimens == 2)
        mapOverTest.CPU(vecArr[0], vecArr[1]);
    else
        assert(false);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*CPU* mapoverlap size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}

/*!
 * Tuning wrapper: runs the MapArray skeleton once on the CPU for the given
 * training data and records the predicted execution times.
 */
template <typename StructType, typename StructType2>
void cpu_tune_wrapper_maparray(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens >= 1 && dimens <= 2 && actDimens == 3);

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];

    sizes[0] = td->problemSize[0];
    sizes[1] = (dimens == 1) ? td->problemSize[0] : td->problemSize[1];
    sizes[2] = (dimens == 1) ? td->problemSize[0] : td->problemSize[1];

    for(unsigned int i = 0; i < actDimens; ++i)
    {
        vecArr[i].resize(sizes[i]);
    }

    double commCost[MAX_EXEC_PLANS];
#ifdef SKEPU_CUDA
    assert(sizes[0] == sizes[1] && sizes[1] == sizes[2]);

    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    commCost[0] = 0.0;
    // the user can specify flag hints for the operands' memory location
    bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
    if(singlePlan)
    {
        assert(nImpls == 1);

        int *memUpFlags = td->extra->memUp;
        int *memDownFlags = td->extra->memDown;
        for(unsigned int i = 0; i < actDimens; ++i)
        {
            if(i < (actDimens - 1) && memUpFlags[i] == 1)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
            else if(i == (actDimens - 1) && memDownFlags[0] == 1)
                commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0]));
        }
    }
    else
    {
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
        }
        if(nImpls > 3)
        {
            assert(sizes[0] == sizes[1]);
            commCost[3] = 0.0;
            commCost[4] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[1]));
            commCost[5] = commCost[2] + bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[1]));
        }
    }
#else
    commCost[0] = 0.0;
    assert(nImpls == 1);
#endif

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::MapArray<StructType> mapArrTest(userFunc);

    timer.start();

    mapArrTest.CPU(vecArr[0], vecArr[1], vecArr[2]);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*CPU* maparray size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}


/*!
 * Tuning wrapper: runs the MapReduce skeleton once on the CPU for the given
 * training data and records the predicted execution times.
 */
template <typename StructType, typename StructType2>
void cpu_tune_wrapper_mapreduce(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens >= 1 && actDimens <= 3);

    // to ensure that the compiler does not optimize these calls away, as retVal is not used anywhere...
    volatile typename StructType::TYPE retVal;

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = td->problemSize[0];
        vecArr[i].resize(sizes[i]);
    }

    double commCost[MAX_EXEC_PLANS];
#ifdef SKEPU_CUDA
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    commCost[0] = 0.0;
    double commCostPerOp = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (td->problemSize[0]));
    // the user can specify flag hints for the operands' memory location
    bool singlePlan = (td->extra->memUp != NULL);
    if(singlePlan)
    {
        assert(nImpls == 1);

        int *memUpFlags = td->extra->memUp;
        for(unsigned int i = 0; i < actDimens; ++i)
        {
            if(memUpFlags[i] == 1)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
        }
    }
    else
    {
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = commCostPerOp;
        }
        if(nImpls > 3)
        {
            commCost[3] = 0.0;
            commCost[4] = commCostPerOp;
            commCost[5] = commCostPerOp * 2;
        }
        if(nImpls > 6)
        {
            commCost[6] = 0.0;
            commCost[7] = commCostPerOp;
            commCost[8] = commCostPerOp * 2;
            commCost[9] = commCostPerOp * 3;
        }
    }
#else
    commCost[0] = 0.0;
    assert(nImpls == 1);
#endif

    StructType *userFunc = new StructType;
    StructType2 *userFunc2 = new StructType2;
    if(td->callBackFunctionMapReduce != NULL)
        td->callBackFunctionMapReduce(userFunc, userFunc2, sizes, actDimens);

    skepu::MapReduce<StructType, StructType2> mapRedTest(userFunc, userFunc2);

    timer.start();

    if(actDimens == 1)
        retVal = mapRedTest.CPU(vecArr[0]);
    else if(actDimens == 2)
        retVal = mapRedTest.CPU(vecArr[0], vecArr[1]);
    else if(actDimens == 3)
        retVal = mapRedTest.CPU(vecArr[0], vecArr[1], vecArr[2]);
    else
        assert(false);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*CPU* mapreduce size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}
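// MapReduce is the only wrapper that exercises both functor parameters:
// StructType is the map operator and StructType2 the reduce operator. A
// sketch with hypothetical user functions written against SkePU's macro
// interface (assuming the usual UNARY_FUNC/BINARY_FUNC macros):
//
//   UNARY_FUNC(square_f, float, a, return a * a;)
//   BINARY_FUNC(plus_f, float, a, b, return a + b;)
//   // sum of squares: map with square_f, then reduce with plus_f
//   skepu::Tuner<square_f, MAPREDUCE, plus_f> tuner("sumsq", 1, low, upp);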


#ifdef SKEPU_OPENMP

template <typename StructType, typename StructType2>
void omp_tune_wrapper_map(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens >= 1 && actDimens <= 4);

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = td->problemSize[0];
        vecArr[i].resize(sizes[i]);
    }

    double commCost[MAX_EXEC_PLANS];
#ifdef SKEPU_CUDA
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    commCost[0] = 0.0;
    double commCostPerOp = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (td->problemSize[0]));

    // the user can specify flag hints for operands memory location
    bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
    if(singlePlan)
    {
        assert(nImpls == 1);

        int *memUpFlags = td->extra->memUp;
        int *memDownFlags = td->extra->memDown;
        for(unsigned int i = 0; i < actDimens; ++i)
        {
            if(i < (actDimens - 1) && memUpFlags[i] == 1)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
            else if(i == (actDimens - 1) && memDownFlags[0] == 1)
                commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0]));
        }
    }
    else
    {
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = commCostPerOp;
        }
        if(nImpls > 3)
        {
            commCost[3] = 0.0;
            commCost[4] = commCostPerOp;
            commCost[5] = commCostPerOp * 2;
        }
        if(nImpls > 6)
        {
            commCost[6] = 0.0;
            commCost[7] = commCostPerOp;
            commCost[8] = commCostPerOp * 2;
            commCost[9] = commCostPerOp * 3;
        }
    }
#else
    commCost[0] = 0.0;
    assert(nImpls == 1);
#endif

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::Map<StructType> mapTest(userFunc);

    timer.start();

    if(actDimens == 1)
        mapTest.OMP(vecArr[0]);
    else if(actDimens == 2)
        mapTest.OMP(vecArr[0], vecArr[1]);
    else if(actDimens == 3)
        mapTest.OMP(vecArr[0], vecArr[1], vecArr[2]);
    else if(actDimens == 4)
        mapTest.OMP(vecArr[0], vecArr[1], vecArr[2], vecArr[3]);
    else
        assert(false);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*OpenMP* map size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}
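// The OpenMP wrappers (omp_tune_wrapper_*) mirror their CPU counterparts
// exactly; the only difference is that each skeleton is dispatched through
// its .OMP() variant, so the measured time includes OpenMP's parallel-region
// overhead. The communication-cost model is identical.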

template <typename StructType, typename StructType2>
void omp_tune_wrapper_reduce(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens == 1);

    // to ensure that the compiler does not optimize these calls away as retVal is not used anywhere...
    volatile typename StructType::TYPE retVal;

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = ((actDimens != dimens) ? td->problemSize[0] : td->problemSize[i]);
        vecArr[i].resize(sizes[i]);
    }

    double commCost[MAX_EXEC_PLANS];
#ifdef SKEPU_CUDA
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    commCost[0] = 0.0;

    // the user can specify flag hints for operands memory location
    bool singlePlan = (td->extra->memUp != NULL);
    if(singlePlan)
    {
        assert(nImpls == 1);

        int *memUpFlags = td->extra->memUp;
        for(unsigned int i = 0; i < actDimens; ++i)
        {
            if(memUpFlags[i] == 1)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
        }
    }
    else
    {
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (td->problemSize[0]));
        }
    }
#else
    commCost[0] = 0.0;
    assert(nImpls == 1);
#endif

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::Reduce<StructType> redTest(userFunc);

    timer.start();

    retVal = redTest.OMP(vecArr[0]);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*OpenMP* reduce size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}

/*!
 * Tuning wrapper: runs the MapOverlap skeleton once with OpenMP for the
 * given training data and records the predicted execution times.
 */
template <typename StructType, typename StructType2>
void omp_tune_wrapper_mapoverlap(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens >= 1 && actDimens <= 2);

    // to ensure that the compiler does not optimize these calls away, as retVal is not used anywhere...
    volatile typename StructType::TYPE retVal;

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = ((actDimens != dimens) ? td->problemSize[0] : td->problemSize[i]);
        vecArr[i].resize(sizes[i]);
    }

    double commCost[MAX_EXEC_PLANS];
#ifdef SKEPU_CUDA
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    commCost[0] = 0.0;

    // the user can specify flag hints for the operands' memory location
    bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
    if(singlePlan)
    {
        assert(nImpls == 1);

        int *memUpFlags = td->extra->memUp;
        int *memDownFlags = td->extra->memDown;
        for(unsigned int i = 0; i < actDimens; ++i)
        {
            if(i < (actDimens - 1) && memUpFlags[i] == 1)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
            else if(i == (actDimens - 1) && memDownFlags[0] == 1)
                commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0]));
        }
    }
    else
    {
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (td->problemSize[0]));
        }
    }
#else
    commCost[0] = 0.0;
    assert(nImpls == 1);
#endif

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::MapOverlap<StructType> mapOverTest(userFunc);

    timer.start();

    if(actDimens == 1)
        mapOverTest.OMP(vecArr[0]);
    else if(actDimens == 2)
        mapOverTest.OMP(vecArr[0], vecArr[1]);
    else
        assert(false);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*OpenMP* mapoverlap size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}

/*!
 * Tuning wrapper: runs the MapArray skeleton once with OpenMP for the given
 * training data and records the predicted execution times.
 */
template <typename StructType, typename StructType2>
void omp_tune_wrapper_maparray(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens >= 1 && dimens <= 2 && actDimens == 3);

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];

    sizes[0] = td->problemSize[0];
    sizes[1] = (dimens == 1) ? td->problemSize[0] : td->problemSize[1];
    sizes[2] = (dimens == 1) ? td->problemSize[0] : td->problemSize[1];

    for(unsigned int i = 0; i < actDimens; ++i)
    {
        vecArr[i].resize(sizes[i]);
    }

    double commCost[MAX_EXEC_PLANS];
#ifdef SKEPU_CUDA
    assert(sizes[0] == sizes[1] && sizes[1] == sizes[2]);
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    commCost[0] = 0.0;

    // the user can specify flag hints for the operands' memory location
    bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
    if(singlePlan)
    {
        assert(nImpls == 1);

        int *memUpFlags = td->extra->memUp;
        int *memDownFlags = td->extra->memDown;
        for(unsigned int i = 0; i < actDimens; ++i)
        {
            if(i < (actDimens - 1) && memUpFlags[i] == 1)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
            else if(i == (actDimens - 1) && memDownFlags[0] == 1)
                commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0]));
        }
    }
    else
    {
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
        }
        if(nImpls > 3)
        {
            assert(sizes[0] == sizes[1]);
            commCost[3] = 0.0;
            commCost[4] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[1]));
            commCost[5] = commCost[2] + bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[1]));
        }
    }
#else
    commCost[0] = 0.0;
    assert(nImpls == 1);
#endif

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::MapArray<StructType> mapArrTest(userFunc);

    timer.start();

    mapArrTest.OMP(vecArr[0], vecArr[1], vecArr[2]);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*OpenMP* maparray size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}


/*!
 * Tuning wrapper: runs the MapReduce skeleton once with OpenMP for the given
 * training data and records the predicted execution times.
 */
template <typename StructType, typename StructType2>
void omp_tune_wrapper_mapreduce(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens >= 1 && actDimens <= 3);

    // to ensure that the compiler does not optimize these calls away, as retVal is not used anywhere...
    volatile typename StructType::TYPE retVal;

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = td->problemSize[0];
        vecArr[i].resize(sizes[i]);
    }

    double commCost[MAX_EXEC_PLANS];
#ifdef SKEPU_CUDA
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    commCost[0] = 0.0;
    double commCostPerOp = bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (td->problemSize[0]));

    // the user can specify flag hints for the operands' memory location
    bool singlePlan = (td->extra->memUp != NULL);
    if(singlePlan)
    {
        assert(nImpls == 1);

        int *memUpFlags = td->extra->memUp;
        for(unsigned int i = 0; i < actDimens; ++i)
        {
            if(memUpFlags[i] == 1)
                commCost[0] += commCostPerOp;
        }
    }
    else
    {
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = commCostPerOp;
        }
        if(nImpls > 3)
        {
            commCost[3] = 0.0;
            commCost[4] = commCostPerOp;
            commCost[5] = commCostPerOp * 2;
        }
        if(nImpls > 6)
        {
            commCost[6] = 0.0;
            commCost[7] = commCostPerOp;
            commCost[8] = commCostPerOp * 2;
            commCost[9] = commCostPerOp * 3;
        }
    }
#else
    commCost[0] = 0.0;
    assert(nImpls == 1);
#endif

    StructType *userFunc = new StructType;
    StructType2 *userFunc2 = new StructType2;
    if(td->callBackFunctionMapReduce != NULL)
        td->callBackFunctionMapReduce(userFunc, userFunc2, sizes, actDimens);

    skepu::MapReduce<StructType, StructType2> mapRedTest(userFunc, userFunc2);

    timer.start();

    if(actDimens == 1)
        retVal = mapRedTest.OMP(vecArr[0]);
    else if(actDimens == 2)
        retVal = mapRedTest.OMP(vecArr[0], vecArr[1]);
    else if(actDimens == 3)
        retVal = mapRedTest.OMP(vecArr[0], vecArr[1], vecArr[2]);
    else
        assert(false);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*OpenMP* mapreduce size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}
#endif


#ifdef SKEPU_CUDA

/*!
 * Tuning wrapper: runs the Map skeleton once with CUDA for the given
 * training data and records the predicted execution times.
 */
template <typename StructType, typename StructType2>
void cuda_tune_wrapper_map(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    // the user can specify flag hints for the operands' memory location
    int *memUpFlags = td->extra->memUp;
    int *memDownFlags = td->extra->memDown;
    bool singlePlan = (memUpFlags != NULL && memDownFlags != NULL);

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens >= 1 && actDimens <= 4);

    cudaSetDevice(Environment<int>::getInstance()->bestCUDADevID);

    double commCost[MAX_EXEC_PLANS];
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    double commCostPerOp = bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (td->problemSize[0]));

    if(singlePlan)
        commCost[0] = 0.0;

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = td->problemSize[0];
        vecArr[i].resize(sizes[i]);

        if(singlePlan) // means data cost for HTD/DTH should be included, assuming data is not already in the required GPU memory
        {
            if(i == (actDimens - 1) && memDownFlags[0] == 0)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));

            if(i < (actDimens - 1) && memUpFlags[i] == 0)
                commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0]));
        }

        if(i == actDimens - 1) // last should be the output operand; no copy up is required in that case...
            vecArr[i].updateDevice_CU(&vecArr[i][0], sizes[i], Environment<int>::getInstance()->bestCUDADevID, false, true);
        else
            vecArr[i].updateDevice_CU(&vecArr[i][0], sizes[i], Environment<int>::getInstance()->bestCUDADevID, true, false);
    }

    cudaDeviceSynchronize();

    if(singlePlan)
        assert(nImpls == 1);
    else
    {
        commCost[0] = commCostPerOp * ((actDimens > 1) ? (actDimens - 1) : 1); // no operands are valid in GPU memory, so all input operands must be transferred...
        if(nImpls > 1)
        {
            commCost[1] = commCostPerOp * ((actDimens > 2) ? (actDimens - 2) : 0);
            commCost[2] = commCostPerOp * ((actDimens > 2) ? (actDimens - 2) : 0);
        }
        if(nImpls > 3)
        {
            commCost[3] = commCostPerOp * ((actDimens > 3) ? (actDimens - 3) : 0);
            commCost[4] = commCostPerOp * ((actDimens > 3) ? (actDimens - 3) : 0);
            commCost[5] = commCostPerOp * ((actDimens > 3) ? (actDimens - 3) : 0);
        }
        if(nImpls > 6)
        {
            commCost[6] = 0;
            commCost[7] = 0;
            commCost[8] = 0;
            commCost[9] = 0;
        }
    }

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::Map<StructType> mapTest(userFunc);

    timer.start();

    if(actDimens == 1)
        mapTest.CU(vecArr[0]);
    else if(actDimens == 2)
        mapTest.CU(vecArr[0], vecArr[1]);
    else if(actDimens == 3)
        mapTest.CU(vecArr[0], vecArr[1], vecArr[2]);
    else if(actDimens == 4)
        mapTest.CU(vecArr[0], vecArr[1], vecArr[2], vecArr[3]);
    else
        assert(false);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*CUDA* map size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}
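// In the CUDA wrappers the cost model is inverted relative to the CPU/OpenMP
// ones: commCost[i] charges host-to-device transfers for operands that are
// *not* yet valid in GPU memory, and higher variant indices assume that more
// operands are already device-resident (hence the decreasing operand counts
// in the commCostPerOp multipliers above).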


/*!
 * Tuning wrapper: runs the Reduce skeleton once with CUDA for the given
 * training data and records the predicted execution times.
 */
template <typename StructType, typename StructType2>
void cuda_tune_wrapper_reduce(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    // the user can specify flag hints for the operands' memory location
    int *memUpFlags = td->extra->memUp;
    bool singlePlan = (memUpFlags != NULL);

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens == 1);

    // to ensure that the compiler does not optimize these calls away, as retVal is not used anywhere...
    volatile typename StructType::TYPE retVal;

    double commCost[MAX_EXEC_PLANS];
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;

    if(singlePlan)
        commCost[0] = 0.0;

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = td->problemSize[0];
        vecArr[i].resize(sizes[i]);

        if(singlePlan && memUpFlags[i] == 0) // means data cost for HTD should be included, assuming data is not already in the required GPU memory
        {
            commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0]));
        }

        vecArr[i].updateDevice_CU(&vecArr[i][0], sizes[i], Environment<int>::getInstance()->bestCUDADevID, true, false);
    }

    cudaDeviceSynchronize();

    if(singlePlan)
        assert(nImpls == 1);
    else
    {
        commCost[0] = bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (td->problemSize[0]));
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = 0.0;
        }
    }

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::Reduce<StructType> redTest(userFunc);

    timer.start();

    retVal = redTest.CU(vecArr[0]); // the DTH cost is always implicit in the reduce and mapreduce patterns...

    timer.stop();

    DEBUG_TUNING_LEVEL3("*CUDA* reduce size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}


/*!
 * Tuning wrapper: runs the MapOverlap skeleton once with CUDA for the given
 * training data and records the predicted execution times.
 */
template <typename StructType, typename StructType2>
void cuda_tune_wrapper_mapoverlap(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    // the user can specify flag hints for the operands' memory location
    int *memUpFlags = td->extra->memUp;
    int *memDownFlags = td->extra->memDown;
    bool singlePlan = (memUpFlags != NULL && memDownFlags != NULL);

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens >= 1 && actDimens <= 2);

    // to ensure that the compiler does not optimize these calls away, as retVal is not used anywhere...
    volatile typename StructType::TYPE retVal;

    double commCost[MAX_EXEC_PLANS];
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;

    if(singlePlan)
        commCost[0] = 0.0;

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = td->problemSize[0];
        vecArr[i].resize(sizes[i]);

        if(singlePlan) // means data cost for HTD/DTH should be included, assuming data is not already in the required GPU memory
        {
            if(i == 1 && memDownFlags[0] == 0)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
            else if(i < 1 && memUpFlags[i] == 0)
                commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0]));
        }

        if(i == 1)
            vecArr[i].updateDevice_CU(&vecArr[i][0], sizes[i], Environment<int>::getInstance()->bestCUDADevID, false, true);
        else
            vecArr[i].updateDevice_CU(&vecArr[i][0], sizes[i], Environment<int>::getInstance()->bestCUDADevID, true, false);
    }

    cudaDeviceSynchronize();

    if(singlePlan)
        assert(nImpls == 1);
    else
    {
        commCost[0] = bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (td->problemSize[0]));
        if(nImpls > 1)
        {
            commCost[1] = 0.0;
            commCost[2] = 0.0;
        }
    }

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::MapOverlap<StructType> mapOverTest(userFunc);

    timer.start();

    if(actDimens == 1)
        mapOverTest.CU(vecArr[0]);
    else if(actDimens == 2)
        mapOverTest.CU(vecArr[0], vecArr[1]);
    else
        assert(false);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*CUDA* mapoverlap size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}

/*!
 * Tuning wrapper: runs the MapArray skeleton once with CUDA for the given
 * training data and records the predicted execution times.
 */
template <typename StructType, typename StructType2>
void cuda_tune_wrapper_maparray(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    // the user can specify flag hints for the operands' memory location
    int *memUpFlags = td->extra->memUp;
    int *memDownFlags = td->extra->memDown;
    bool singlePlan = (memUpFlags != NULL && memDownFlags != NULL);

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens >= 1 && dimens <= 2 && actDimens == 3);

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];

    sizes[0] = td->problemSize[0];
    sizes[1] = (dimens == 1) ? td->problemSize[0] : td->problemSize[1];
    sizes[2] = (dimens == 1) ? td->problemSize[0] : td->problemSize[1];

    double commCost[MAX_EXEC_PLANS];
    assert(sizes[0] == sizes[1] && sizes[1] == sizes[2]);
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;

    if(singlePlan)
        commCost[0] = 0.0;

    for(unsigned int i = 0; i < actDimens; ++i)
    {
        vecArr[i].resize(sizes[i]);

        if(singlePlan) // means data cost for HTD/DTH should be included, assuming data is not already in the required GPU memory
        {
            if(i == (actDimens - 1) && memDownFlags[0] == 0)
                commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth * sizeof(typename StructType::TYPE) * (sizes[0]));
            else if(i < (actDimens - 1) && memUpFlags[i] == 0)
                commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0]));
        }

        if(i == actDimens - 1) // last should be the output operand; no copy up is required in that case...
            vecArr[i].updateDevice_CU(&vecArr[i][0], sizes[i], Environment<int>::getInstance()->bestCUDADevID, false, true);
        else
            vecArr[i].updateDevice_CU(&vecArr[i][0], sizes[i], Environment<int>::getInstance()->bestCUDADevID, true, false);
    }

    cudaDeviceSynchronize();

    if(singlePlan)
        assert(nImpls == 1);
    else
    {
        commCost[0] = 2 * (bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0])));
        if(nImpls > 1)
        {
            commCost[1] = bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (sizes[0]));
            commCost[2] = commCost[1];
        }
        if(nImpls > 3)
        {
            commCost[3] = 0.0;
            commCost[4] = 0.0;
            commCost[5] = 0.0;
        }
    }

    StructType *userFunc = new StructType;
    if(td->callBackFunction != NULL)
        td->callBackFunction(userFunc, sizes, actDimens);

    skepu::MapArray<StructType> mapArrTest(userFunc);

    timer.start();

    mapArrTest.CU(vecArr[0], vecArr[1], vecArr[2]);

    timer.stop();

    DEBUG_TUNING_LEVEL3("*CUDA* maparray size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}


/*!
 * Tuning wrapper: runs the MapReduce skeleton once with CUDA for the given
 * training data and records the predicted execution times.
 */
template <typename StructType, typename StructType2>
void cuda_tune_wrapper_mapreduce(void *arg)
{
    if(!arg)
        return;

    Timer timer;

    TrainingData *td = reinterpret_cast<TrainingData*>(arg);
    assert(td != NULL);

    unsigned int nImpls = td->nImpls;
    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
    unsigned int dimens = td->dimens;
    unsigned int actDimens = td->extra->actDimensions;

    // the user can specify flag hints for the operands' memory location
    int *memUpFlags = td->extra->memUp;
    bool singlePlan = (memUpFlags != NULL);

    DEBUG_TUNING_LEVEL3("Computed dimensions: " << dimens << ", Actual dimensions: " << actDimens << "\n");

    assert(dimens == 1 && actDimens >= 1 && actDimens <= 3);

    // to ensure that the compiler does not optimize these calls away, as retVal is not used anywhere...
    volatile typename StructType::TYPE retVal;

    double commCost[MAX_EXEC_PLANS];
    DevTimingStruct &bwDataStruct = Environment<int>::getInstance()->bwDataStruct;
    double costPerOp = bwDataStruct.latency_htd + (bwDataStruct.timing_htd * sizeof(typename StructType::TYPE) * (td->problemSize[0]));

    if(singlePlan)
        commCost[0] = 0.0;

    size_t sizes[MAX_PARAMS];
    skepu::Vector<typename StructType::TYPE> vecArr[MAX_PARAMS];
    for(unsigned int i = 0; i < actDimens; ++i)
    {
        sizes[i] = td->problemSize[0];
        vecArr[i].resize(sizes[i]);

        if(singlePlan && memUpFlags[i] == 0) // means data cost for HTD should be included, assuming data is not already in the required GPU memory
        {
            commCost[0] += costPerOp;
        }

        vecArr[i].updateDevice_CU(&vecArr[i][0], sizes[i], Environment<int>::getInstance()->bestCUDADevID, true, false);
    }

    cudaDeviceSynchronize();

    if(singlePlan)
        assert(nImpls == 1);
    else
    {
        commCost[0] = costPerOp * actDimens; // no operands are valid in GPU memory, so all actDimens operands must be transferred...
        if(nImpls > 1)
        {
            commCost[1] = costPerOp * ((actDimens > 1) ? (actDimens - 1) : 0);
            commCost[2] = costPerOp * ((actDimens > 1) ? (actDimens - 1) : 0);
        }
        if(nImpls > 3)
        {
            commCost[3] = costPerOp * ((actDimens > 2) ? (actDimens - 2) : 0);
            commCost[4] = costPerOp * ((actDimens > 2) ? (actDimens - 2) : 0);
            commCost[5] = costPerOp * ((actDimens > 2) ? (actDimens - 2) : 0);
        }
        if(nImpls > 6)
        {
            commCost[6] = 0;
            commCost[7] = 0;
            commCost[8] = 0;
            commCost[9] = 0;
        }
    }

    StructType *userFunc = new StructType;
    StructType2 *userFunc2 = new StructType2;
    if(td->callBackFunctionMapReduce != NULL)
        td->callBackFunctionMapReduce(userFunc, userFunc2, sizes, actDimens);

    skepu::MapReduce<StructType, StructType2> mapRedTest(userFunc, userFunc2);

    timer.start();

    if(actDimens == 1)
        retVal = mapRedTest.CU(vecArr[0]);
    else if(actDimens == 2)
        retVal = mapRedTest.CU(vecArr[0], vecArr[1]);
    else if(actDimens == 3)
        retVal = mapRedTest.CU(vecArr[0], vecArr[1], vecArr[2]);
    else
        assert(false);

    timer.stop(); // the DTH cost is always implicit in the reduce and mapreduce patterns...

    DEBUG_TUNING_LEVEL3("*CUDA* mapreduce size: " << sizes[0] << "\n");

    std::string printStr = "";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        td->exec_time[i] = commCost[i] + timer.getTotalTime();
        printStr += " " + convertToStr<double>(td->exec_time[i]);
    }
    DEBUG_TUNING_LEVEL3(printStr + "\n");
}
#endif


template <typename StructType, SkeletonType type, typename StructType2 = StructType>
struct Tuner
{
    Tuner()
    {
        assert(false);
    }
};
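// The primary template is deliberately unusable: its default constructor
// asserts, so a Tuner must be instantiated through a skeleton-specific
// specialization such as Tuner<StructType, MAPREDUCE, StructType2> below.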

void createDefaultConfiguration(BackEndParams &bp)
{
    Environment<int> *environment = Environment<int>::getInstance();

#if defined(SKEPU_OPENCL) && !defined(SKEPU_CUDA) && SKEPU_NUMGPU == 1
    bp.backend = CL_BACKEND;
#elif defined(SKEPU_OPENCL) && !defined(SKEPU_CUDA) && SKEPU_NUMGPU != 1
    bp.backend = CLM_BACKEND;
#elif !defined(SKEPU_OPENCL) && defined(SKEPU_CUDA) && SKEPU_NUMGPU == 1
    bp.backend = CU_BACKEND;
#elif !defined(SKEPU_OPENCL) && defined(SKEPU_CUDA) && SKEPU_NUMGPU != 1
    bp.backend = CUM_BACKEND;
#elif defined(SKEPU_OPENCL) && defined(SKEPU_CUDA) && SKEPU_NUMGPU == 1
    bp.backend = CL_BACKEND;
#elif defined(SKEPU_OPENCL) && defined(SKEPU_CUDA) && SKEPU_NUMGPU != 1
    bp.backend = CLM_BACKEND;
#elif !defined(SKEPU_OPENCL) && !defined(SKEPU_CUDA)

#if defined(SKEPU_OPENMP)
    bp.backend = OMP_BACKEND;
#else
    bp.backend = CPU_BACKEND;
#endif

#endif

#ifdef SKEPU_OPENCL
    bp.maxThreads = environment->m_devices_CL.at(0)->getMaxThreads();
    bp.maxBlocks = environment->m_devices_CL.at(0)->getMaxBlocks();
#endif

#ifdef SKEPU_CUDA
    bp.maxThreads = environment->m_devices_CU.at(0)->getMaxThreads();
    bp.maxBlocks = environment->m_devices_CU.at(0)->getMaxBlocks();
#endif

#ifdef SKEPU_OPENMP
#ifdef SKEPU_OPENMP_THREADS
    bp.numOmpThreads = SKEPU_OPENMP_THREADS;
#else
    bp.numOmpThreads = omp_get_max_threads();
#endif
#endif
}
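// Backend priority encoded above: when both OpenCL and CUDA are compiled in,
// OpenCL wins; multi-GPU builds (SKEPU_NUMGPU != 1) select the multi-device
// backend variants (CLM/CUM); and with no GPU backend at all, OpenMP is
// preferred over plain CPU when available.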


#ifndef _WIN32

#include "skepu/src/makedir.h"

/*!
 * Loads an execution plan for a skeleton with the given id from a stored
 * ".meta" file, if one exists. Returns true on success.
 */
bool loadExecPlan(std::string id, ExecPlan &plan)
{
    std::string path = getPMDirectory();
    path += id + ".meta";
    if(fileExists(path))
    {
        std::ifstream infile(path.c_str());

        assert(infile.good());

        std::string strLine;
        size_t low, upp, numCUThreads, numCUBlocks;
        unsigned int numOmpThreads;

        std::string impTypeStr;
        BackEndParams bp;
        while(getline(infile, strLine))
        {
            strLine = trimSpaces(strLine);
            if(strLine.empty() || strLine[0] == '%' || strLine[0] == '/' || strLine[0] == '#')
                continue;

            std::istringstream iss(strLine);
            iss >> low >> upp >> impTypeStr;
            iss >> numOmpThreads >> numCUThreads >> numCUBlocks;

            bp.numOmpThreads = numOmpThreads;
            bp.maxThreads = numCUThreads;
            bp.maxBlocks = numCUBlocks;

            impTypeStr = capitalizeString(impTypeStr);

            if(impTypeStr == "CPU")
            {
                bp.backend = CPU_BACKEND;
            }
            else if(impTypeStr == "OMP")
            {
                bp.backend = OMP_BACKEND;
            }
            else if(impTypeStr == "CUDA")
            {
                bp.backend = CU_BACKEND;
            }
            else
                assert(false);

            plan.add(low, upp, bp);
        }
        plan.calibrated = true;
        return true;
    }
    return false;
}
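// A ".meta" file is a whitespace-separated table with one calibrated size
// range per line (lines starting with %, / or # are comments). Illustrative
// contents, matching the column layout that storeExecPlan emits below:
//
//   % Execution plan for vector_sum
//   0 4096 CPU 16 256 65535
//   4097 1048576 CUDA 16 256 65535
//
// columns: low upp backend numOmpThreads maxThreads maxBlocks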


bool storeExecPlan(std::string id, const ExecPlan &plan)
{
    std::string path = getPMDirectory();
    std::string file(path + id + ".meta");

    if(fileExists(file) == false)
        createPath(path);

    std::ofstream outfile(file.c_str());

    assert(outfile.good());

    outfile << "% Execution plan for " << id << "\n";
    std::map< std::pair<size_t, size_t>, BackEndParams > m_data = plan.sizePlan;
    for(std::map< std::pair<size_t, size_t>, BackEndParams >::iterator it = m_data.begin(); it != m_data.end(); ++it)
    {
        std::string beTypeStr = "";
        BackEndParams bp = it->second;
        switch(bp.backend)
        {
        case CPU_BACKEND:
            beTypeStr = "CPU " + convertIntToString(bp.numOmpThreads) + " " + convertIntToString(bp.maxThreads) + " " + convertIntToString(bp.maxBlocks);
            break;
        case OMP_BACKEND:
            beTypeStr = "OMP " + convertIntToString(bp.numOmpThreads) + " " + convertIntToString(bp.maxThreads) + " " + convertIntToString(bp.maxBlocks);
            break;
        case CU_BACKEND:
            beTypeStr = "CUDA " + convertIntToString(bp.numOmpThreads) + " " + convertIntToString(bp.maxThreads) + " " + convertIntToString(bp.maxBlocks);
            break;
        default:
            assert(false);
        }

        outfile << it->first.first << " " << it->first.second << " " << beTypeStr << "\n";
    }

    outfile.close();

    return true;
}

/*!
 * Loads an array of execution plans (one per implementation variant) for a
 * skeleton with the given id from a stored "_multi.meta" file.
 */
bool loadExecPlanArray(std::string id, ExecPlan *planArray)
{
    assert(planArray != NULL);

    std::string path = getPMDirectory();
    path += id + "_multi.meta";
    if(fileExists(path))
    {
        std::ifstream infile(path.c_str());

        assert(infile.good());

        std::string strLine;
        size_t low, upp, numCUThreads, numCUBlocks;
        unsigned int numOmpThreads;
        std::string impTypeStr;
        BackEndParams bp;
        int idx = -1;
        while(getline(infile, strLine))
        {
            strLine = trimSpaces(strLine);
            if(strLine.empty() || strLine[0] == '%' || strLine[0] == '/')
                continue;

            // a "# <idx>" line starts the plan for the next implementation variant
            if(strLine[0] == '#')
            {
                strLine = trimSpaces(strLine.substr(1));
                int tmpIdx;
                std::istringstream iss(strLine);
                iss >> tmpIdx;
                idx++;
                assert(idx == tmpIdx);
                continue;
            }
            assert(idx >= 0 && idx < MAX_EXEC_PLANS);

            std::istringstream iss(strLine);
            iss >> low >> upp >> impTypeStr;
            iss >> numOmpThreads >> numCUThreads >> numCUBlocks;

            bp.numOmpThreads = numOmpThreads;
            bp.maxThreads = numCUThreads;
            bp.maxBlocks = numCUBlocks;

            impTypeStr = capitalizeString(impTypeStr);

            if(impTypeStr == "CPU")
            {
                bp.backend = CPU_BACKEND;
            }
            else if(impTypeStr == "OMP")
            {
                bp.backend = OMP_BACKEND;
            }
            else if(impTypeStr == "CUDA")
            {
                bp.backend = CU_BACKEND;
            }
            else
                assert(false);

            planArray[idx].add(low, upp, bp);
            planArray[idx].calibrated = true;
        }
        return true;
    }
    return false;
}
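// The "_multi.meta" format is the single-plan format wrapped with one
// "# <index>" header per implementation variant, e.g. (illustrative values):
//
//   % Execution plan for vector_sum
//   # 0
//   0 1048576 OMP 16 256 65535
//   # 1
//   0 1048576 CUDA 16 256 65535
//
// storeExecPlanArray below writes exactly this layout.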

bool storeExecPlanArray(std::string id, const ExecPlan *planArray, unsigned int nImpls)
{
    assert(planArray != NULL);

    std::string path = getPMDirectory();
    std::string file(path + id + "_multi.meta");

    if(fileExists(file) == false)
        createPath(path);

    std::ofstream outfile(file.c_str());

    assert(outfile.good());

    assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);

    outfile << "% Execution plan for " << id << "\n";
    for(unsigned int i = 0; i < nImpls; ++i)
    {
        if(!planArray[i].calibrated)
        {
            SKEPU_WARNING("[SKEPU Warning]: Plan '" << id << "' is not calibrated for index: " << i << "\n");
            break;
        }

        outfile << "# " << i << "\n";

        const std::map< std::pair<size_t, size_t>, BackEndParams > &m_data = planArray[i].sizePlan;
        for(std::map< std::pair<size_t, size_t>, BackEndParams >::const_iterator it = m_data.begin(); it != m_data.end(); ++it)
        {
            std::string beTypeStr = "";
            BackEndParams bp = it->second;
            switch(bp.backend)
            {
            case CPU_BACKEND:
                beTypeStr = "CPU " + convertIntToString(bp.numOmpThreads) + " " + convertIntToString(bp.maxThreads) + " " + convertIntToString(bp.maxBlocks);
                break;
            case OMP_BACKEND:
                beTypeStr = "OMP " + convertIntToString(bp.numOmpThreads) + " " + convertIntToString(bp.maxThreads) + " " + convertIntToString(bp.maxBlocks);
                break;
            case CU_BACKEND:
                beTypeStr = "CUDA " + convertIntToString(bp.numOmpThreads) + " " + convertIntToString(bp.maxThreads) + " " + convertIntToString(bp.maxBlocks);
                break;
            default:
                assert(false);
            }

            outfile << it->first.first << " " << it->first.second << " " << beTypeStr << "\n";
        }
    }

    outfile.close();

    return true;
}

#endif
2034 
2041 template <typename StructType, typename StructType2>
2042 struct Tuner<StructType, MAPREDUCE, StructType2>
2043 {
2044  Tuner(std::string _id, int _dimens, size_t *_lowBounds, size_t *_uppBounds): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2045  {
2046  assert(dimens >= 1 && dimens <= 3 && lowBounds && uppBounds);
2047  extra.memUp = NULL;
2048  extra.memDown = NULL;
2049  }
2050 
2051  Tuner(std::string _id, int _dimens, size_t *_lowBounds, size_t *_uppBounds, int *_memUp): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2052  {
2053  assert(dimens >= 1 && dimens <= 3 && lowBounds && uppBounds);
2054  extra.memUp = _memUp;
2055  extra.memDown = NULL;
2056  }
2057 
2058  StatsTuner stats;
2059 
2060  void operator()(ExecPlan *execPlanArray)
2061  {
2062  assert(execPlanArray!=NULL);
2063 
2068  int *oldMemUp = extra.memUp;
2069  extra.memUp = NULL;
2070 
2071  unsigned int actDimens = dimens;
2072  std::string interface = "mapreduce";
2073  dimens = 1;
2074 
2076  unsigned int nImpls = 1;
2077  #ifdef SKEPU_CUDA
2078  nImpls = ( (actDimens == 1) ? 3 : ((actDimens == 2) ? 6 : 10) );
2079  #endif
2080  assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
2081 
2083 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2084  if(loadExecPlanArray(id, execPlanArray))
2085  {
2086  bool redoMesures = false;
2087  for(unsigned int i=0; i<nImpls; ++i)
2088  {
2089  if(execPlanArray[i].calibrated == false)
2090  {
2091  redoMesures = true;
2092  break;
2093  }
2094  for(unsigned int j=0; j<dimens; ++j)
2095  {
2096  if(execPlanArray[i].isTrainedFor(lowBounds[j]) == false || execPlanArray[i].isTrainedFor(uppBounds[j]) == false)
2097  {
2098  redoMesures = true;
2099  break;
2100  }
2101  }
2102  }
2104  if(redoMesures == false)
2105  return;
2106  }
2107 #endif
2108 
2109  BackEndParams bp;
2110  createDefaultConfiguration(bp);
2111 
2112  std::vector<size_t> upperBounds(dimens);
2113  std::vector<size_t> lowerBounds(dimens);
2114 
2115  for(unsigned int i=0; i<dimens; ++i)
2116  {
2117  upperBounds[i] = uppBounds[i];
2118  lowerBounds[i] = lowBounds[i];
2119  }
2120 
2121  std::vector<ImpDetail*> impls;
2122 
2123  cpu_tune_wrapper_mapreduce<StructType, StructType2>(0);
2124  impls.push_back(new ImpDetail("cpu_impl", IMPL_CPU, &cpu_tune_wrapper_mapreduce<StructType, StructType2>));
2125 
2126 #ifdef SKEPU_OPENMP
2127  omp_tune_wrapper_mapreduce<StructType, StructType2>(0);
2128  impls.push_back(new ImpDetail("omp_impl", IMPL_OMP, &omp_tune_wrapper_mapreduce<StructType, StructType2>));
2129 #endif
2130 
2131 #ifdef SKEPU_CUDA
2132  cuda_tune_wrapper_mapreduce<StructType, StructType2>(0);
2133  impls.push_back(new ImpDetail("cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_mapreduce<StructType, StructType2>));
2134 #endif
2135 
2136  std::ofstream outfile(std::string("tree_data_multi_" + id + ".dat").c_str());
2137  assert(outfile.good());
2138 
2139 
2140  extra.actDimensions = actDimens;
2141  Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2142  trainer.train();
2143 
2144  ExecPlanNew<1> planArr[MAX_EXEC_PLANS];
2145  trainer.constructExecPlanNew(planArr, stats);
2146 
2147  for(unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2148  {
2149  if(planArr[i].calibrated == false)
2150  break;
2151 
2152  outfile << planArr[i];
2153  }
2154 
2155  for(unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2156  {
2157  if(planArr[i].calibrated == false)
2158  break;
2159 
2160  execPlanArray[i].clear();
2161  outfile << "compressed plan:\n";
2162  trainer.compressExecPlanNew(planArr[i]);
2163  for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = planArr[i].m_data.begin(); it != planArr[i].m_data.end(); ++it)
2164  {
2165  switch(it->second)
2166  {
2167  case IMPL_CPU:
2168  bp.backend = CPU_BACKEND;
2169  break;
2170  case IMPL_OMP:
2171  bp.backend = OMP_BACKEND;
2172  break;
2173  case IMPL_CUDA:
2174  bp.backend = CU_BACKEND;
2175  break;
2176  default:
2177  assert(false);
2178  }
2179  execPlanArray[i].add(it->first.first, it->first.second, bp);
2180  execPlanArray[i].calibrated = true;
2181  }
2182  outfile << planArr[i];
2183  }
2184 
2186 #ifndef _WIN32
2187  storeExecPlanArray(id, execPlanArray, nImpls);
2188 #endif
2189 
2190  outfile << *(trainer.m_tree);
2191  DEBUG_TUNING_LEVEL2( "\nTree: " << *(trainer.m_tree) << "\n");
2192 
2193  // free memory...
2194  for(unsigned int i=0; i<impls.size(); ++i)
2195  {
2196  delete impls[i];
2197  }
2198 
2199  impls.clear();
2200 
2201  // restore them now...
2202  extra.memUp = oldMemUp;
2203  }
2204 
2205 
2206  ExecPlan operator()()
2207  {
2208  assert(extra.memUp != NULL);
2209 
2210  unsigned int actDimens = dimens;
2211  std::string interface = "mapreduce";
2212  dimens = 1;
2213 
2214  unsigned int nImpls = 1;
2215 
2216  ExecPlan execPlan;
2217 
2219 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2220  if(loadExecPlan(id, execPlan))
2221  {
2222  return execPlan;
2223  }
2224 #endif
2225 
2226  BackEndParams bp;
2227  createDefaultConfiguration(bp);
2228 
2229  std::vector<size_t> upperBounds(dimens);
2230  std::vector<size_t> lowerBounds(dimens);
2231 
2232  for(unsigned int i=0; i<dimens; ++i)
2233  {
2234  upperBounds[i] = uppBounds[i];
2235  lowerBounds[i] = lowBounds[i];
2236  }
2237 
2238  std::vector<ImpDetail*> impls;
2239 
2240  cpu_tune_wrapper_mapreduce<StructType, StructType2>(0);
2241  impls.push_back(new ImpDetail("cpu_impl", IMPL_CPU, &cpu_tune_wrapper_mapreduce<StructType, StructType2>));
2242 
2243 #ifdef SKEPU_OPENMP
2244  omp_tune_wrapper_mapreduce<StructType, StructType2>(0);
2245  impls.push_back(new ImpDetail("omp_impl", IMPL_OMP, &omp_tune_wrapper_mapreduce<StructType, StructType2>));
2246 #endif
2247 
2248 #ifdef SKEPU_CUDA
2249  cuda_tune_wrapper_mapreduce<StructType, StructType2>(0);
2250  impls.push_back(new ImpDetail("cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_mapreduce<StructType, StructType2>));
2251 #endif
2252 
2253  std::ofstream outfile(std::string("tree_data_" + id + ".dat").c_str());
2254  assert(outfile.good());
2255 
2256  extra.actDimensions = actDimens;
2257  Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2258  trainer.train();
2259 
2260  ExecPlanNew<1> plan;
2261  trainer.constructExecPlanNew(&plan, stats);
2262  assert(plan.calibrated);
2263  outfile << plan ;
2264 
2265  outfile << "compressed plan:\n";
2266  trainer.compressExecPlanNew(plan);
2267  for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = plan.m_data.begin(); it != plan.m_data.end(); ++it)
2268  {
2269  switch(it->second)
2270  {
2271  case IMPL_CPU:
2272  bp.backend = CPU_BACKEND;
2273  break;
2274  case IMPL_OMP:
2275  bp.backend = OMP_BACKEND;
2276  break;
2277  case IMPL_CUDA:
2278  bp.backend = CU_BACKEND;
2279  break;
2280  default:
2281  assert(false);
2282  }
2283  execPlan.add(it->first.first, it->first.second, bp);
2284  }
2285  outfile << plan;
2286 
2288 #ifndef _WIN32
2289  storeExecPlan(id, execPlan);
2290 #endif
2291 
2292  outfile << *(trainer.m_tree);
2293  DEBUG_TUNING_LEVEL2( "\nTree: " << *(trainer.m_tree) << "\n");
2294 
2295  // free memory...
2296  for(unsigned int i=0; i<impls.size(); ++i)
2297  {
2298  delete impls[i];
2299  }
2300 
2301  impls.clear();
2302 
2303  return execPlan;
2304  }
2305 
2306 public:
2307  void (*callBackFunction)(void*, size_t*, unsigned int);
2308  void (*callBackFunctionMapReduce)(void*, void*, size_t*, unsigned int);
2309 
2310 private:
2311  ExtraData extra;
2312  unsigned int dimens;
2313  size_t *lowBounds;
2314  size_t *uppBounds;
2315  std::string id;
2316 };
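A minimal usage sketch for this specialization; the macro-generated user-function types mult_f and plus_f and the id "dotproduct" are illustrative assumptions:

size_t low[2] = {1024, 1024};
size_t upp[2] = {1048576, 1048576};
skepu::Tuner<mult_f, MAPREDUCE, plus_f> tuner("dotproduct", 2, low, upp);

skepu::ExecPlan plans[MAX_EXEC_PLANS];
tuner(plans);  // trains, or reuses cached measurements from dotproduct_multi.meta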
2317 
2318 
2325 template <typename StructType>
2326 struct Tuner<StructType, MAP, StructType>
2327 {
2328  Tuner(std::string _id, int _dimens, size_t *_lowBounds, size_t *_uppBounds): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2329  {
2330  assert(dimens >= 1 && dimens <= 4 && lowBounds && uppBounds);
2331  extra.memUp = NULL;
2332  extra.memDown = NULL;
2333  }
2334 
2335  Tuner(std::string _id, int _dimens, size_t *_lowBounds, size_t *_uppBounds, int *_memUp, int *_memDown): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2336  {
2337  assert(dimens >= 1 && dimens <= 4 && lowBounds && uppBounds);
2338  extra.memUp = _memUp;
2339  extra.memDown = _memDown;
2340  }
2341 
2342  StatsTuner stats;
2343 
2344  void operator()(ExecPlan *execPlanArray)
2345  {
2346  assert(execPlanArray!=NULL);
2347 
2352  int *oldMemUp = extra.memUp;
2353  extra.memUp = NULL;
2354  int *oldMemDown = extra.memDown;
2355  extra.memDown = NULL;
2356 
2357 
2358  unsigned int actDimens = dimens;
2359  std::string interface = "map";
2360  dimens = 1;
2361 
2363  unsigned int nImpls = 1;
2364  #ifdef SKEPU_CUDA
2365  nImpls = ( (actDimens == 1 || actDimens == 2) ? 3 : ((actDimens == 3) ? 6 : 10) );
2366  #endif
2367  assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
2368 
2370 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2371  if(loadExecPlanArray(id, execPlanArray))
2372  {
2373  bool redoMesures = false;
2374  for(unsigned int i=0; i<nImpls; ++i)
2375  {
2376  if(execPlanArray[i].calibrated == false)
2377  {
2378  redoMesures = true;
2379  break;
2380  }
2381  for(unsigned int j=0; j<dimens; ++j)
2382  {
2383  if(execPlanArray[i].isTrainedFor(lowBounds[j]) == false || execPlanArray[i].isTrainedFor(uppBounds[j]) == false)
2384  {
2385  redoMesures = true;
2386  break;
2387  }
2388  }
2389  }
2391  if(redoMesures == false)
2392  return;
2393  }
2394 #endif
2395 
2396  BackEndParams bp;
2397  createDefaultConfiguration(bp);
2398 
2399  std::vector<size_t> upperBounds(dimens);
2400  std::vector<size_t> lowerBounds(dimens);
2401 
2402  for(unsigned int i=0; i<dimens; ++i)
2403  {
2404  upperBounds[i] = uppBounds[i];
2405  lowerBounds[i] = lowBounds[i];
2406  }
2407 
2408  std::vector<ImpDetail*> impls;
2409 
2410  cpu_tune_wrapper_map<StructType, StructType>(0);
2411  impls.push_back(new ImpDetail("cpu_impl", IMPL_CPU, &cpu_tune_wrapper_map<StructType, StructType>));
2412 
2413 #ifdef SKEPU_OPENMP
2414  omp_tune_wrapper_map<StructType, StructType>(0);
2415  impls.push_back(new ImpDetail("omp_impl", IMPL_OMP, &omp_tune_wrapper_map<StructType, StructType>));
2416 #endif
2417 
2418 #ifdef SKEPU_CUDA
2419  cuda_tune_wrapper_map<StructType, StructType>(0);
2420  impls.push_back(new ImpDetail("cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_map<StructType, StructType>));
2421 #endif
2422 
2423  std::ofstream outfile(std::string("tree_data_multi_" + id + ".dat").c_str());
2424  assert(outfile.good());
2425 
2426  extra.actDimensions = actDimens;
2427  Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2428  trainer.train();
2429 
2430  ExecPlanNew<1> planArr[MAX_EXEC_PLANS];
2431  trainer.constructExecPlanNew(planArr, stats);
2432 
2433  for(unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2434  {
2435  if(planArr[i].calibrated == false)
2436  break;
2437 
2438  outfile << planArr[i];
2439  }
2440 
2441  for(unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2442  {
2443  if(planArr[i].calibrated == false)
2444  break;
2445 
2446  execPlanArray[i].clear();
2447  outfile << "compressed plan:\n";
2448  trainer.compressExecPlanNew(planArr[i]);
2449  for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = planArr[i].m_data.begin(); it != planArr[i].m_data.end(); ++it)
2450  {
2451  switch(it->second)
2452  {
2453  case IMPL_CPU:
2454  bp.backend = CPU_BACKEND;
2455  break;
2456  case IMPL_OMP:
2457  bp.backend = OMP_BACKEND;
2458  break;
2459  case IMPL_CUDA:
2460  bp.backend = CU_BACKEND;
2461  break;
2462  default:
2463  assert(false);
2464  }
2465  execPlanArray[i].add(it->first.first, it->first.second, bp);
2466  execPlanArray[i].calibrated = true;
2467  }
2468  outfile << planArr[i] ;
2469  }
2470 
2472 #ifndef _WIN32
2473  storeExecPlanArray(id, execPlanArray, nImpls);
2474 #endif
2475 
2476  outfile << *(trainer.m_tree);
2477  DEBUG_TUNING_LEVEL2( "\nTree: " << *(trainer.m_tree) << "\n");
2478 
2479  // free memory...
2480  for(unsigned int i=0; i<impls.size(); ++i)
2481  {
2482  delete impls[i];
2483  }
2484 
2485  impls.clear();
2486 
2487  // restore them now...
2488  extra.memUp = oldMemUp;
2489  extra.memDown = oldMemDown;
2490  }
2491 
2492 
2493  ExecPlan operator()()
2494  {
2495  assert(extra.memUp != NULL && extra.memDown != NULL);
2496 
2497  unsigned int actDimens = dimens;
2498  std::string interface = "map";
2499  dimens = 1;
2500 
2501  unsigned int nImpls = 1;
2502 
2503  ExecPlan execPlan;
2504 
2506 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2507  if(loadExecPlan(id, execPlan))
2508  {
2509  return execPlan;
2510  }
2511 #endif
2512 
2513  BackEndParams bp;
2514  createDefaultConfiguration(bp);
2515 
2516  std::vector<size_t> upperBounds(dimens);
2517  std::vector<size_t> lowerBounds(dimens);
2518 
2519  for(unsigned int i=0; i<dimens; ++i)
2520  {
2521  upperBounds[i] = uppBounds[i];
2522  lowerBounds[i] = lowBounds[i];
2523  }
2524 
2525  std::vector<ImpDetail*> impls;
2526 
2527  cpu_tune_wrapper_map<StructType, StructType>(0);
2528  impls.push_back(new ImpDetail("cpu_impl", IMPL_CPU, &cpu_tune_wrapper_map<StructType, StructType>));
2529 
2530 #ifdef SKEPU_OPENMP
2531  omp_tune_wrapper_map<StructType, StructType>(0);
2532  impls.push_back(new ImpDetail("omp_impl", IMPL_OMP, &omp_tune_wrapper_map<StructType, StructType>));
2533 #endif
2534 
2535 #ifdef SKEPU_CUDA
2536  cuda_tune_wrapper_map<StructType, StructType>(0);
2537  impls.push_back(new ImpDetail("cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_map<StructType, StructType>));
2538 #endif
2539 
2540  std::ofstream outfile(std::string("tree_data_" + id + ".dat").c_str());
2541  assert(outfile.good());
2542 
2543  extra.actDimensions = actDimens;
2544  Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2545  trainer.train();
2546 
2547  ExecPlanNew<1> plan;
2548  trainer.constructExecPlanNew(&plan, stats);
2549  assert(plan.calibrated);
2550  outfile << plan ;
2551 
2552  outfile << "compressed plan:\n";
2553  trainer.compressExecPlanNew(plan);
2554  for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = plan.m_data.begin(); it != plan.m_data.end(); ++it)
2555  {
2556  switch(it->second)
2557  {
2558  case IMPL_CPU:
2559  bp.backend = CPU_BACKEND;
2560  break;
2561  case IMPL_OMP:
2562  bp.backend = OMP_BACKEND;
2563  break;
2564  case IMPL_CUDA:
2565  bp.backend = CU_BACKEND;
2566  break;
2567  default:
2568  assert(false);
2569  }
2570  execPlan.add(it->first.first, it->first.second, bp);
2571  }
2572  outfile << plan;
2573 
2575 #ifndef _WIN32
2576  storeExecPlan(id, execPlan);
2577 #endif
2578 
2579  outfile << *(trainer.m_tree);
2580  DEBUG_TUNING_LEVEL2( "\nTree: " << *(trainer.m_tree) << "\n");
2581 
2582  // free memory...
2583  for(unsigned int i=0; i<impls.size(); ++i)
2584  {
2585  delete impls[i];
2586  }
2587 
2588  impls.clear();
2589 
2590  return execPlan;
2591  }
2592 
2593 public:
2594  void (*callBackFunction)(void*, size_t*, unsigned int);
2595  void (*callBackFunctionMapReduce)(void*, void*, size_t*, unsigned int);
2596 
2597 private:
2598  ExtraData extra;
2599  unsigned int dimens;
2600  size_t *lowBounds;
2601  size_t *uppBounds;
2602  std::string id;
2603 };
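A sketch of the single-plan variant for Map; the user function square_f is an illustrative assumption, and the memUp/memDown arrays (one flag per operand) hint which operands are taken to be resident on the device before the call and which results must be copied back afterwards:

size_t low[2] = {1024, 1024};
size_t upp[2] = {1048576, 1048576};
int memUp[2]   = {1, 0};  // input assumed already on the device
int memDown[2] = {0, 1};  // output needed back on the host

skepu::Tuner<square_f, MAP, square_f> tuner("square", 2, low, upp, memUp, memDown);
skepu::ExecPlan plan = tuner();  // trains once, returns a single calibrated plan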
2604 
2605 
2606 
2607 
2608 
2615 template <typename StructType>
2616 struct Tuner<StructType, REDUCE, StructType>
2617 {
2618  Tuner(std::string _id, int _dimens, size_t *_lowBounds, size_t *_uppBounds): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2619  {
2620  assert(dimens == 1 && lowBounds && uppBounds);
2621  extra.memUp = NULL;
2622  extra.memDown = NULL;
2623  }
2624 
2625  Tuner(std::string _id, int _dimens, size_t *_lowBounds, size_t *_uppBounds, int *_memUp): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2626  {
2627  assert(dimens == 1 && lowBounds && uppBounds);
2628  extra.memUp = _memUp;
2629  extra.memDown = NULL;
2630  }
2631 
2632  StatsTuner stats;
2633 
2634  void operator()(ExecPlan *execPlanArray)
2635  {
2636  assert(execPlanArray!=NULL);
2637 
2642  int *oldMemUp = extra.memUp;
2643  extra.memUp = NULL;
2644 
2645  unsigned int actDimens = dimens;
2646  std::string interface = "reduce";
2647 
2649  unsigned int nImpls = 1;
2650  #ifdef SKEPU_CUDA
2651  nImpls = 3;
2652  #endif
2653  assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
2654 
2656 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2657  if(loadExecPlanArray(id, execPlanArray))
2658  {
2659  bool redoMesures = false;
2660  for(unsigned int i=0; i<nImpls; ++i)
2661  {
2662  if(execPlanArray[i].calibrated == false)
2663  {
2664  redoMesures = true;
2665  break;
2666  }
2667  for(unsigned int j=0; j<dimens; ++j)
2668  {
2669  if(execPlanArray[i].isTrainedFor(lowBounds[j]) == false || execPlanArray[i].isTrainedFor(uppBounds[j]) == false)
2670  {
2671  redoMesures = true;
2672  break;
2673  }
2674  }
2675  }
2677  if(redoMesures == false)
2678  return;
2679  }
2680 #endif
2681 
2682  BackEndParams bp;
2683  createDefaultConfiguration(bp);
2684 
2685  std::vector<size_t> upperBounds(dimens);
2686  std::vector<size_t> lowerBounds(dimens);
2687 
2688  for(unsigned int i=0; i<dimens; ++i)
2689  {
2690  upperBounds[i] = uppBounds[i];
2691  lowerBounds[i] = lowBounds[i];
2692  }
2693 
2694  std::vector<ImpDetail*> impls;
2695 
2696  cpu_tune_wrapper_reduce<StructType, StructType>(0);
2697  impls.push_back(new ImpDetail("cpu_impl", IMPL_CPU, &cpu_tune_wrapper_reduce<StructType, StructType>));
2698 
2699 #ifdef SKEPU_OPENMP
2700  omp_tune_wrapper_reduce<StructType, StructType>(0);
2701  impls.push_back(new ImpDetail("omp_impl", IMPL_OMP, &omp_tune_wrapper_reduce<StructType, StructType>));
2702 #endif
2703 
2704 #ifdef SKEPU_CUDA
2705  cuda_tune_wrapper_reduce<StructType, StructType>(0);
2706  impls.push_back(new ImpDetail("cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_reduce<StructType, StructType>));
2707 #endif
2708 
2709  std::ofstream outfile(std::string("tree_data_multi_" + id + ".dat").c_str());
2710  assert(outfile.good());
2711 
2712  extra.actDimensions = actDimens;
2713  Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2714  trainer.train();
2715 
2716  ExecPlanNew<1> planArr[MAX_EXEC_PLANS];
2717  trainer.constructExecPlanNew(planArr, stats);
2718 
2719  for(unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2720  {
2721  if(planArr[i].calibrated == false)
2722  break;
2723 
2724  outfile << planArr[i];
2725  }
2726 
2727  for(unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2728  {
2729  if(planArr[i].calibrated == false)
2730  break;
2731 
2732  execPlanArray[i].clear();
2733  outfile << "compressed plan:\n";
2734  trainer.compressExecPlanNew(planArr[i]);
2735  for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = planArr[i].m_data.begin(); it != planArr[i].m_data.end(); ++it)
2736  {
2737  switch(it->second)
2738  {
2739  case IMPL_CPU:
2740  bp.backend = CPU_BACKEND;
2741  break;
2742  case IMPL_OMP:
2743  bp.backend = OMP_BACKEND;
2744  break;
2745  case IMPL_CUDA:
2746  bp.backend = CU_BACKEND;
2747  break;
2748  default:
2749  assert(false);
2750  }
2751  execPlanArray[i].add(it->first.first, it->first.second, bp);
2752  execPlanArray[i].calibrated = true;
2753  }
2754  outfile << planArr[i];
2755  }
2756 
2758 #ifndef _WIN32
2759  storeExecPlanArray(id, execPlanArray, nImpls);
2760 #endif
2761 
2762  outfile << *(trainer.m_tree);
2763  DEBUG_TUNING_LEVEL2( "\nTree: " << *(trainer.m_tree) << "\n");
2764 
2765  // free memory...
2766  for(unsigned int i=0; i<impls.size(); ++i)
2767  {
2768  delete impls[i];
2769  }
2770 
2771  impls.clear();
2772 
2773  // restore them now...
2774  extra.memUp = oldMemUp;
2775  }
2776 
2777 
2778  ExecPlan operator()()
2779  {
2780  assert(extra.memUp != NULL);
2781 
2782  unsigned int actDimens = dimens;
2783  std::string interface = "reduce";
2784  dimens = 1;
2785 
2786  unsigned int nImpls = 1;
2787 
2788  ExecPlan execPlan;
2789 
2791 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2792  if(loadExecPlan(id, execPlan))
2793  {
2794  return execPlan;
2795  }
2796 #endif
2797 
2798  BackEndParams bp;
2799  createDefaultConfiguration(bp);
2800 
2801  std::vector<size_t> upperBounds(dimens);
2802  std::vector<size_t> lowerBounds(dimens);
2803 
2804  for(unsigned int i=0; i<dimens; ++i)
2805  {
2806  upperBounds[i] = uppBounds[i];
2807  lowerBounds[i] = lowBounds[i];
2808  }
2809 
2810  std::vector<ImpDetail*> impls;
2811 
2812  cpu_tune_wrapper_reduce<StructType, StructType>(0);
2813  impls.push_back(new ImpDetail("cpu_impl", IMPL_CPU, &cpu_tune_wrapper_reduce<StructType, StructType>));
2814 
2815 #ifdef SKEPU_OPENMP
2816  omp_tune_wrapper_reduce<StructType, StructType>(0);
2817  impls.push_back(new ImpDetail("omp_impl", IMPL_OMP, &omp_tune_wrapper_reduce<StructType, StructType>));
2818 #endif
2819 
2820 #ifdef SKEPU_CUDA
2821  cuda_tune_wrapper_reduce<StructType, StructType>(0);
2822  impls.push_back(new ImpDetail("cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_reduce<StructType, StructType>));
2823 #endif
2824 
2825  std::ofstream outfile(std::string("tree_data_" + id + ".dat").c_str());
2826  assert(outfile.good());
2827 
2828  extra.actDimensions = actDimens;
2829  Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2830  trainer.train();
2831 
2832  ExecPlanNew<1> plan;
2833  trainer.constructExecPlanNew(&plan, stats);
2834  assert(plan.calibrated);
2835  outfile << plan ;
2836 
2837  outfile << "compressed plan:\n";
2838  trainer.compressExecPlanNew(plan);
2839  for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = plan.m_data.begin(); it != plan.m_data.end(); ++it)
2840  {
2841  switch(it->second)
2842  {
2843  case IMPL_CPU:
2844  bp.backend = CPU_BACKEND;
2845  break;
2846  case IMPL_OMP:
2847  bp.backend = OMP_BACKEND;
2848  break;
2849  case IMPL_CUDA:
2850  bp.backend = CU_BACKEND;
2851  break;
2852  default:
2853  assert(false);
2854  }
2855  execPlan.add(it->first.first, it->first.second, bp);
2856  }
2857  outfile << plan;
2858 
2860 #ifndef _WIN32
2861  storeExecPlan(id, execPlan);
2862 #endif
2863 
2864  outfile << *(trainer.m_tree);
2865  DEBUG_TUNING_LEVEL2( "\nTree: " << *(trainer.m_tree) << "\n");
2866 
2867  // free memory...
2868  for(unsigned int i=0; i<impls.size(); ++i)
2869  {
2870  delete impls[i];
2871  }
2872 
2873  impls.clear();
2874 
2875  return execPlan;
2876  }
2877 
2878 public:
2879  void (*callBackFunction)(void*, size_t*, unsigned int);
2880  void (*callBackFunctionMapReduce)(void*, void*, size_t*, unsigned int);
2881 
2882 private:
2883  ExtraData extra;
2884  unsigned int dimens;
2885  size_t *lowBounds;
2886  size_t *uppBounds;
2887  std::string id;
2888 };
2889 
2890 
2897 template <typename StructType>
2898 struct Tuner<StructType, MAPARRAY, StructType>
2899 {
2900  Tuner(std::string _id, int _dimens, size_t *_lowBounds, size_t *_uppBounds): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2901  {
2902  assert(dimens == 3 && lowBounds && uppBounds);
2903  extra.memUp = NULL;
2904  extra.memDown = NULL;
2905  }
2906 
2907  Tuner(std::string _id, int _dimens, size_t *_lowBounds, size_t *_uppBounds, int *_memUp, int *_memDown): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2908  {
2909  assert(dimens == 3 && lowBounds && uppBounds);
2910  extra.memUp = _memUp;
2911  extra.memDown = _memDown;
2912  }
2913 
2914  StatsTuner stats;
2915 
2916  void operator()(ExecPlan *execPlanArray)
2917  {
2918  assert(execPlanArray!=NULL);
2919 
2924  int *oldMemUp = extra.memUp;
2925  extra.memUp = NULL;
2926  int *oldMemDown = extra.memDown;
2927  extra.memDown = NULL;
2928 
2929  unsigned int actDimens = dimens;
2930  std::string interface = "maparray";
2931 
2932  bool allSame = ((lowBounds[0] == lowBounds[1]) && (lowBounds[1] == lowBounds[2])) && ((uppBounds[0] == uppBounds[1]) && (uppBounds[1] == uppBounds[2]));
2933 
2934  dimens = (allSame)? 1:2;
2935 
2937  unsigned int nImpls = 1;
2938  if(!allSame)
2939  SKEPU_ERROR("The current tuning framework does not support MapArray skeleton tuning with different vector sizes; support for this is planned as future work.");
2940  #ifdef SKEPU_CUDA
2941  nImpls = 6;
2942  #endif
2943  assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
2944 
2946 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2947  if(loadExecPlanArray(id, execPlanArray))
2948  {
2949  bool redoMesures = false;
2950  for(unsigned int i=0; i<nImpls; ++i)
2951  {
2952  if(execPlanArray[i].calibrated == false)
2953  {
2954  redoMesures = true;
2955  break;
2956  }
2957  for(unsigned int j=0; j<dimens; ++j)
2958  {
2959  if(execPlanArray[i].isTrainedFor(lowBounds[j]) == false || execPlanArray[i].isTrainedFor(uppBounds[j]) == false)
2960  {
2961  redoMesures = true;
2962  break;
2963  }
2964  }
2965  }
2967  if(redoMesures == false)
2968  return;
2969  }
2970 #endif
2971 
2972  BackEndParams bp;
2973  createDefaultConfiguration(bp);
2974 
2978  assert(dimens == 1);
2979 
2980  std::vector<size_t> upperBounds(dimens);
2981  std::vector<size_t> lowerBounds(dimens);
2982 
2983  for(unsigned int i=0; i<dimens; ++i)
2984  {
2985  upperBounds[i] = uppBounds[i];
2986  lowerBounds[i] = lowBounds[i];
2987  }
2988 
2989  std::vector<ImpDetail*> impls;
2990 
2991  cpu_tune_wrapper_maparray<StructType, StructType>(0);
2992  impls.push_back(new ImpDetail("cpu_impl", IMPL_CPU, &cpu_tune_wrapper_maparray<StructType, StructType>));
2993 
2994 #ifdef SKEPU_OPENMP
2995  omp_tune_wrapper_maparray<StructType, StructType>(0);
2996  impls.push_back(new ImpDetail("omp_impl", IMPL_OMP, &omp_tune_wrapper_maparray<StructType, StructType>));
2997 #endif
2998 
2999 #ifdef SKEPU_CUDA
3000  cuda_tune_wrapper_maparray<StructType, StructType>(0);
3001  impls.push_back(new ImpDetail("cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_maparray<StructType, StructType>));
3002 #endif
3003 
3004  std::ofstream outfile(std::string("tree_data_multi_" + id + ".dat").c_str());
3005  assert(outfile.good());
3006 
3007  extra.actDimensions = actDimens;
3008  Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
3009  trainer.train();
3010 
3011  ExecPlanNew<1> planArr[MAX_EXEC_PLANS];
3012  trainer.constructExecPlanNew(planArr, stats);
3013 
3014  for(unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
3015  {
3016  if(planArr[i].calibrated == false)
3017  break;
3018 
3019  outfile << planArr[i];
3020  }
3021 
3022  for(unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
3023  {
3024  if(planArr[i].calibrated == false)
3025  break;
3026 
3027  execPlanArray[i].clear();
3028  outfile << "compressed plan:\n";
3029  trainer.compressExecPlanNew(planArr[i]);
3030  for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = planArr[i].m_data.begin(); it != planArr[i].m_data.end(); ++it)
3031  {
3032  switch(it->second)
3033  {
3034  case IMPL_CPU:
3035  bp.backend = CPU_BACKEND;
3036  break;
3037  case IMPL_OMP:
3038  bp.backend = OMP_BACKEND;
3039  break;
3040  case IMPL_CUDA:
3041  bp.backend = CU_BACKEND;
3042  break;
3043  default:
3044  assert(false);
3045  }
3046  execPlanArray[i].add(it->first.first, it->first.second, bp);
3047  execPlanArray[i].calibrated = true;
3048  }
3049  outfile << planArr[i];
3050  }
3051 
3053 #ifndef _WIN32
3054  storeExecPlanArray(id, execPlanArray, nImpls);
3055 #endif
3056 
3057  outfile << *(trainer.m_tree);
3058  DEBUG_TUNING_LEVEL2( "\nTree: " << *(trainer.m_tree) << "\n");
3059 
3060  // free memory...
3061  for(unsigned int i=0; i<impls.size(); ++i)
3062  {
3063  delete impls[i];
3064  }
3065 
3066  impls.clear();
3067 
3068  // restore them now...
3069  extra.memUp = oldMemUp;
3070  extra.memDown = oldMemDown;
3071  }
3072 
3073 
3074  ExecPlan operator()()
3075  {
3076  assert(extra.memUp != NULL && extra.memDown != NULL);
3077 
3078  unsigned int actDimens = dimens;
3079  std::string interface = "maparray";
3080 
3081  bool allSame = ((lowBounds[0] == lowBounds[1]) && (lowBounds[1] == lowBounds[2])) && ((uppBounds[0] == uppBounds[1]) && (uppBounds[1] == uppBounds[2]));
3082 
3083  dimens = (allSame)? 1:2;
3084 
3085  unsigned int nImpls = 1;
3086  if(!allSame)
3087  SKEPU_ERROR("The current tuning framework does not support MapArray skeleton tuning with different vector sizes; support for this is planned as future work.");
3088 
3089  ExecPlan execPlan;
3090 
3092 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
3093  if(loadExecPlan(id, execPlan))
3094  {
3095  return execPlan;
3096  }
3097 #endif
3098 
3099  BackEndParams bp;
3100  createDefaultConfiguration(bp);
3101 
3102  std::vector<size_t> upperBounds(dimens);
3103  std::vector<size_t> lowerBounds(dimens);
3104 
3105  for(unsigned int i=0; i<dimens; ++i)
3106  {
3107  upperBounds[i] = uppBounds[i];
3108  lowerBounds[i] = lowBounds[i];
3109  }
3110 
3111  std::vector<ImpDetail*> impls;
3112 
3113  cpu_tune_wrapper_maparray<StructType, StructType>(0);
3114  impls.push_back(new ImpDetail("cpu_impl", IMPL_CPU, &cpu_tune_wrapper_maparray<StructType, StructType>));
3115 
3116 #ifdef SKEPU_OPENMP
3117  omp_tune_wrapper_maparray<StructType, StructType>(0);
3118  impls.push_back(new ImpDetail("omp_impl", IMPL_OMP, &omp_tune_wrapper_maparray<StructType, StructType>));
3119 #endif
3120 
3121 #ifdef SKEPU_CUDA
3122  cuda_tune_wrapper_maparray<StructType, StructType>(0);
3123  impls.push_back(new ImpDetail("cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_maparray<StructType, StructType>));
3124 #endif
3125 
3126  std::ofstream outfile(std::string("tree_data_" + id + ".dat").c_str());
3127  assert(outfile.good());
3128 
3129  extra.actDimensions = actDimens;
3130  Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
3131  trainer.train();
3132 
3133  ExecPlanNew<1> plan;
3134  trainer.constructExecPlanNew(&plan, stats);
3135  assert(plan.calibrated);
3136  outfile << plan ;
3137 
3138  outfile << "compressed plan:\n";
3139  trainer.compressExecPlanNew(plan);
3140  for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = plan.m_data.begin(); it != plan.m_data.end(); ++it)
3141  {
3142  switch(it->second)
3143  {
3144  case IMPL_CPU:
3145  bp.backend = CPU_BACKEND;
3146  break;
3147  case IMPL_OMP:
3148  bp.backend = OMP_BACKEND;
3149  break;
3150  case IMPL_CUDA:
3151  bp.backend = CU_BACKEND;
3152  break;
3153  default:
3154  assert(false);
3155  }
3156  execPlan.add(it->first.first, it->first.second, bp);
3157  }
3158  outfile << plan;
3159 
3161 #ifndef _WIN32
3162  storeExecPlan(id, execPlan);
3163 #endif
3164 
3165  outfile << *(trainer.m_tree);
3166  DEBUG_TUNING_LEVEL2( "\nTree: " << *(trainer.m_tree) << "\n");
3167 
3168  // free memory...
3169  for(unsigned int i=0; i<impls.size(); ++i)
3170  {
3171  delete impls[i];
3172  }
3173 
3174  impls.clear();
3175 
3176  return execPlan;
3177  }
3178 
3179 public:
3180  void (*callBackFunction)(void*, size_t*, unsigned int);
3181  void (*callBackFunctionMapReduce)(void*, void*, size_t*, unsigned int);
3182 
3183 private:
3184  ExtraData extra;
3185  unsigned int dimens;
3186  size_t *lowBounds;
3187  size_t *uppBounds;
3188  std::string id;
3189 };
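A sketch for MapArray tuning, which currently requires all three operands to cover the same size range; the user function arr_f is illustrative:

size_t low[3] = {1024, 1024, 1024};           // bounds must be pairwise equal,
size_t upp[3] = {1048576, 1048576, 1048576};  // otherwise SKEPU_ERROR is raised
skepu::Tuner<arr_f, MAPARRAY, arr_f> tuner("maparray_test", 3, low, upp);

skepu::ExecPlan plans[MAX_EXEC_PLANS];
tuner(plans);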
3190 
3191 
3192 
3199 template <typename StructType>
3200 struct Tuner<StructType, MAPOVERLAP, StructType>
3201 {
3202  Tuner(std::string _id, int _dimens, size_t *_lowBounds, size_t *_uppBounds): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
3203  {
3204  assert(dimens >= 1 && dimens <= 2 && lowBounds && uppBounds);
3205  extra.memUp = NULL;
3206  extra.memDown = NULL;
3207  }
3208 
3209  Tuner(std::string _id, int _dimens, size_t *_lowBounds, size_t *_uppBounds, int *_memUp, int *_memDown): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
3210  {
3211  assert(dimens >= 1 && dimens <= 2 && lowBounds && uppBounds);
3212  extra.memUp = _memUp;
3213  extra.memDown = _memDown;
3214  }
3215 
3216  StatsTuner stats;
3217 
3218  void operator()(ExecPlan *execPlanArray)
3219  {
3220  assert(execPlanArray!=NULL);
3221 
3226  int *oldMemUp = extra.memUp;
3227  extra.memUp = NULL;
3228  int *oldMemDown = extra.memDown;
3229  extra.memDown = NULL;
3230 
3231  unsigned int actDimens = dimens;
3232  std::string interface = "mapoverlap";
3233  dimens = 1;
3235  unsigned int nImpls = 1;
3236  #ifdef SKEPU_CUDA
3237  nImpls = 3;
3238  #endif
3239  assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
3240 
3242 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
3243  if(loadExecPlanArray(id, execPlanArray))
3244  {
3245  bool redoMesures = false;
3246  for(unsigned int i=0; i<nImpls; ++i)
3247  {
3248  if(execPlanArray[i].calibrated == false)
3249  {
3250  redoMesures = true;
3251  break;
3252  }
3253  for(unsigned int j=0; j<dimens; ++j)
3254  {
3255  if(execPlanArray[i].isTrainedFor(lowBounds[j]) == false || execPlanArray[i].isTrainedFor(uppBounds[j]) == false)
3256  {
3257  redoMesures = true;
3258  break;
3259  }
3260  }
3261  }
3263  if(redoMesures == false)
3264  return;
3265  }
3266 #endif
3267 
3268  BackEndParams bp;
3269  createDefaultConfiguration(bp);
3270 
3271  std::vector<size_t> upperBounds(dimens);
3272  std::vector<size_t> lowerBounds(dimens);
3273 
3274  for(unsigned int i=0; i<dimens; ++i)
3275  {
3276  upperBounds[i] = uppBounds[i];
3277  lowerBounds[i] = lowBounds[i];
3278  }
3279 
3280  std::vector<ImpDetail*> impls;
3281 
3282  cpu_tune_wrapper_mapoverlap<StructType, StructType>(0);
3283  impls.push_back(new ImpDetail("cpu_impl", IMPL_CPU, &cpu_tune_wrapper_mapoverlap<StructType, StructType>));
3284 
3285 #ifdef SKEPU_OPENMP
3286  omp_tune_wrapper_mapoverlap<StructType, StructType>(0);
3287  impls.push_back(new ImpDetail("omp_impl", IMPL_OMP, &omp_tune_wrapper_mapoverlap<StructType, StructType>));
3288 #endif
3289 
3290 #ifdef SKEPU_CUDA
3291  cuda_tune_wrapper_mapoverlap<StructType, StructType>(0);
3292  impls.push_back(new ImpDetail("cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_mapoverlap<StructType, StructType>));
3293 #endif
3294 
3295  std::ofstream outfile(std::string("tree_data_multi_" + id + ".dat").c_str());
3296  assert(outfile.good());
3297 
3298  extra.actDimensions = actDimens;
3299  Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
3300  trainer.train();
3301 
3302 
3303  ExecPlanNew<1> planArr[MAX_EXEC_PLANS];
3304  trainer.constructExecPlanNew(planArr, stats);
3305 
3306  for(unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
3307  {
3308  if(planArr[i].calibrated == false)
3309  break;
3310 
3311  outfile << planArr[i];
3312  }
3313 
3314  for(unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
3315  {
3316  if(planArr[i].calibrated == false)
3317  break;
3318 
3319  execPlanArray[i].clear();
3320  outfile << "compressed plan:\n";
3321  trainer.compressExecPlanNew(planArr[i]);
3322  for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = planArr[i].m_data.begin(); it != planArr[i].m_data.end(); ++it)
3323  {
3324  switch(it->second)
3325  {
3326  case IMPL_CPU:
3327  bp.backend = CPU_BACKEND;
3328  break;
3329  case IMPL_OMP:
3330  bp.backend = OMP_BACKEND;
3331  break;
3332  case IMPL_CUDA:
3333  bp.backend = CU_BACKEND;
3334  break;
3335  default:
3336  assert(false);
3337  }
3338  execPlanArray[i].add(it->first.first, it->first.second, bp);
3339  execPlanArray[i].calibrated = true;
3340  }
3341  outfile << planArr[i];
3342  }
3343 
3345 #ifndef _WIN32
3346  storeExecPlanArray(id, execPlanArray, nImpls);
3347 #endif
3348 
3349  outfile << *(trainer.m_tree);
3350  DEBUG_TUNING_LEVEL2( "\nTree: " << *(trainer.m_tree) << "\n");
3351 
3352  // free memory...
3353  for(unsigned int i=0; i<impls.size(); ++i)
3354  {
3355  delete impls[i];
3356  }
3357 
3358  impls.clear();
3359 
3360  // restore them now...
3361  extra.memUp = oldMemUp;
3362  extra.memDown = oldMemDown;
3363  }
3364 
3365 
3366  ExecPlan operator()()
3367  {
3368  assert(extra.memUp != NULL && extra.memDown != NULL);
3369 
3370  unsigned int actDimens = dimens;
3371  std::string interface = "mapoverlap";
3372 
3373  dimens = 1;
3374 
3375  unsigned int nImpls = 1;
3376 
3377  ExecPlan execPlan;
3378 
3380 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
3381  if(loadExecPlan(id, execPlan))
3382  {
3383  return execPlan;
3384  }
3385 #endif
3386 
3387  BackEndParams bp;
3388  createDefaultConfiguration(bp);
3389 
3390  std::vector<size_t> upperBounds(dimens);
3391  std::vector<size_t> lowerBounds(dimens);
3392 
3393  for(unsigned int i=0; i<dimens; ++i)
3394  {
3395  upperBounds[i] = uppBounds[i];
3396  lowerBounds[i] = lowBounds[i];
3397  }
3398 
3399  std::vector<ImpDetail*> impls;
3400 
3401  cpu_tune_wrapper_mapoverlap<StructType, StructType>(0);
3402  impls.push_back(new ImpDetail("cpu_impl", IMPL_CPU, &cpu_tune_wrapper_mapoverlap<StructType, StructType>));
3403 
3404 #ifdef SKEPU_OPENMP
3405  omp_tune_wrapper_mapoverlap<StructType, StructType>(0);
3406  impls.push_back(new ImpDetail("omp_impl", IMPL_OMP, &omp_tune_wrapper_mapoverlap<StructType, StructType>));
3407 #endif
3408 
3409 #ifdef SKEPU_CUDA
3410  cuda_tune_wrapper_mapoverlap<StructType, StructType>(0);
3411  impls.push_back(new ImpDetail("cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_mapoverlap<StructType, StructType>));
3412 #endif
3413 
3414  std::ofstream outfile(std::string("tree_data_" + id + ".dat").c_str());
3415  assert(outfile.good());
3416 
3417  extra.actDimensions = actDimens;
3418  Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
3419  trainer.train();
3420 
3421  ExecPlanNew<1> plan;
3422  trainer.constructExecPlanNew(&plan, stats);
3423  assert(plan.calibrated);
3424  outfile << plan ;
3425 
3426  outfile << "compressed plan:\n";
3427  trainer.compressExecPlanNew(plan);
3428  for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = plan.m_data.begin(); it != plan.m_data.end(); ++it)
3429  {
3430  switch(it->second)
3431  {
3432  case IMPL_CPU:
3433  bp.backend = CPU_BACKEND;
3434  break;
3435  case IMPL_OMP:
3436  bp.backend = OMP_BACKEND;
3437  break;
3438  case IMPL_CUDA:
3439  bp.backend = CU_BACKEND;
3440  break;
3441  default:
3442  assert(false);
3443  }
3444  execPlan.add(it->first.first, it->first.second, bp);
3445  }
3446  outfile << plan;
3447 
3449 #ifndef _WIN32
3450  storeExecPlan(id, execPlan);
3451 #endif
3452 
3453  outfile << *(trainer.m_tree);
3454  DEBUG_TUNING_LEVEL2( "\nTree: " << *(trainer.m_tree) << "\n");
3455 
3456  // free memory...
3457  for(unsigned int i=0; i<impls.size(); ++i)
3458  {
3459  delete impls[i];
3460  }
3461 
3462  impls.clear();
3463 
3464  return execPlan;
3465  }
3466 
3467 public:
3468  void (*callBackFunction)(void*, size_t*, unsigned int);
3469  void (*callBackFunctionMapReduce)(void*, void*, size_t*, unsigned int);
3470 
3471 private:
3472  ExtraData extra;
3473  unsigned int dimens;
3474  size_t *lowBounds;
3475  size_t *uppBounds;
3476  std::string id;
3477 };
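The specializations above all follow the same calibrate-once pattern, sketched here for MapOverlap with an illustrative user function stencil_f: the first run measures every enabled backend, while later runs reload <id>_multi.meta unless REDO_MEASUREMENTS is defined at compile time:

size_t low[1] = {1024};
size_t upp[1] = {4194304};
skepu::Tuner<stencil_f, MAPOVERLAP, stencil_f> tuner("stencil", 1, low, upp);

skepu::ExecPlan plans[MAX_EXEC_PLANS];
tuner(plans);  // fills up to three plans when compiled with SKEPU_CUDA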
3478 
3479 
3480 
3481 } // end namespace skepu....
3482 
3483 
3484 
3485 #endif
3486 