22 #include "skepu/src/trainer.h"
23 #include "skepu/src/timer.h"
// ---- CPU Map tuning wrapper (fragment) -------------------------------------
// Times a Map skeleton on the sequential CPU backend and stores, per
// implementation plan, measured time plus an estimated host<->device
// communication cost. NOTE(review): this chunk is incomplete — the function
// signature, vecArr/timer declarations and several braces are in lines not
// visible here (original numbering jumps).
46 template <
typename StructType,
typename StructType2>
// Unpack the training data handed in by the trainer as an opaque pointer.
54 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
57 unsigned int nImpls = td->nImpls;
58 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
59 unsigned int dimens = td->dimens;
60 unsigned int actDimens = td->extra->actDimensions;
62 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
64 assert(dimens == 1 && actDimens >= 1 && actDimens <= 4);
66 size_t sizes[MAX_PARAMS];
// All operands share the first problem size; resize each test vector to it.
68 for(
unsigned int i=0; i<actDimens; ++i)
70 sizes[i] = td->problemSize[0];
71 vecArr[i].
resize(sizes[i]);
// Estimated communication cost per implementation-plan index.
// NOTE(review): commCost is accumulated with "+=" below, but its
// zero-initialization is not visible in this fragment — confirm it is
// cleared in the omitted lines.
74 double commCost[MAX_EXEC_PLANS];
// Cost model: one device-to-host transfer of a whole operand.
78 double commCostPerOp = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
// singlePlan: the caller supplied explicit per-operand memory up/down flags.
80 bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
85 int *memUpFlags = td->extra->memUp;
86 int *memDownFlags = td->extra->memDown;
87 for(
unsigned int i=0; i<actDimens; ++i)
// Inputs (all but last operand) add a device-to-host cost when flagged; the
// last operand (output) adds a host-to-device cost when flagged.
89 if(i < (actDimens - 1) && memUpFlags[i] == 1)
90 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
91 else if(i == (actDimens - 1) && memDownFlags[0] == 1)
92 commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0]));
// Multi-plan path: fixed communication-cost table per implementation index.
100 commCost[2] = commCostPerOp;
105 commCost[4] = commCostPerOp;
106 commCost[5] = commCostPerOp * 2;
111 commCost[7] = commCostPerOp;
112 commCost[8] = commCostPerOp * 2;
113 commCost[9] = commCostPerOp * 3;
// Instantiate the user functor; the callback may initialize operand data.
// NOTE(review): userFunc is allocated with "new" and no matching "delete"
// is visible in this fragment — check for a leak in the full source.
123 StructType *userFunc =
new StructType;
124 if(td->callBackFunction != NULL)
125 td->callBackFunction(userFunc, sizes, actDimens);
// Run the Map skeleton on the CPU with 1..4 operands (timing code omitted).
132 mapTest.
CPU(vecArr[0]);
133 else if(actDimens == 2)
134 mapTest.
CPU(vecArr[0],vecArr[1]);
135 else if(actDimens == 3)
136 mapTest.
CPU(vecArr[0],vecArr[1], vecArr[2]);
137 else if(actDimens == 4)
138 mapTest.
CPU(vecArr[0],vecArr[1], vecArr[2], vecArr[3]);
144 DEBUG_TUNING_LEVEL3(
"*CPU* map size: " << sizes[0] <<
"\n");
// Record measured time plus per-plan communication cost for every impl.
146 std::string printStr =
"";
147 for(
unsigned int i=0; i<nImpls; ++i)
149 td->exec_time[i] = commCost[i] + timer.getTotalTime();
150 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
152 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- CPU Reduce tuning wrapper (fragment) -----------------------------------
// Times a Reduce skeleton on the sequential CPU backend. Incomplete view:
// the function signature and timer/vecArr declarations are in omitted lines.
162 template <
typename StructType,
typename StructType2>
170 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
173 unsigned int nImpls = td->nImpls;
174 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
175 unsigned int dimens = td->dimens;
176 unsigned int actDimens = td->extra->actDimensions;
178 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
180 assert(dimens == 1 && actDimens == 1);
// volatile: keeps the reduction result live so the timed call is not
// optimized away.
183 volatile typename StructType::TYPE retVal;
185 size_t sizes[MAX_PARAMS];
187 for(
unsigned int i=0; i<actDimens; ++i)
189 sizes[i] = ((actDimens != dimens)? td->problemSize[0] : td->problemSize[i]);
190 vecArr[i].
resize(sizes[i]);
// NOTE(review): commCost accumulated below without visible zero-init —
// confirm it is cleared in the omitted lines.
193 double commCost[MAX_EXEC_PLANS];
198 bool singlePlan = (td->extra->memUp != NULL);
203 int *memUpFlags = td->extra->memUp;
204 for(
unsigned int i=0; i<actDimens; ++i)
206 if(memUpFlags[i] == 1)
207 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
// Multi-plan path: fixed cost for plan index 2.
215 commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
// NOTE(review): "new" without visible matching "delete" in this fragment.
225 StructType *userFunc =
new StructType;
226 if(td->callBackFunction != NULL)
227 td->callBackFunction(userFunc, sizes, actDimens);
// Timed CPU reduction over the single operand.
233 retVal = redTest.
CPU(vecArr[0]);
237 DEBUG_TUNING_LEVEL3(
"*CPU* reduce size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
239 std::string printStr =
"";
240 for(
unsigned int i=0; i<nImpls; ++i)
242 td->exec_time[i] = commCost[i] + timer.getTotalTime();
243 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
245 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- CPU MapOverlap tuning wrapper (fragment) --------------------------------
// Times a MapOverlap skeleton on the sequential CPU backend. Incomplete view:
// signature, timer and vecArr declarations are in omitted lines.
255 template <
typename StructType,
typename StructType2>
263 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
266 unsigned int nImpls = td->nImpls;
267 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
268 unsigned int dimens = td->dimens;
269 unsigned int actDimens = td->extra->actDimensions;
271 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
273 assert(dimens == 1 && actDimens >=1 && actDimens <= 2);
275 size_t sizes[MAX_PARAMS];
277 for(
unsigned int i=0; i<actDimens; ++i)
279 sizes[i] = ((actDimens != dimens)? td->problemSize[0] : td->problemSize[i]);
280 vecArr[i].
resize(sizes[i]);
// NOTE(review): commCost accumulated below without visible zero-init.
283 double commCost[MAX_EXEC_PLANS];
288 bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
293 int *memUpFlags = td->extra->memUp;
294 int *memDownFlags = td->extra->memDown;
295 for(
unsigned int i=0; i<actDimens; ++i)
// Inputs charged device-to-host, output charged host-to-device when flagged.
297 if(i < (actDimens - 1) && memUpFlags[i] == 1)
298 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
299 else if(i == (actDimens - 1) && memDownFlags[0] == 1)
300 commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0]));
// Multi-plan path: fixed cost for plan index 2.
308 commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
// NOTE(review): "new" without visible matching "delete" in this fragment.
318 StructType *userFunc =
new StructType;
319 if(td->callBackFunction != NULL)
320 td->callBackFunction(userFunc, sizes, actDimens);
// Timed CPU MapOverlap with one or two operands.
327 mapOverTest.
CPU(vecArr[0]);
328 else if(actDimens == 2)
329 mapOverTest.
CPU(vecArr[0], vecArr[1]);
335 DEBUG_TUNING_LEVEL3(
"*CPU* mapoverlap size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
337 std::string printStr =
"";
338 for(
unsigned int i=0; i<nImpls; ++i)
340 td->exec_time[i] = commCost[i] + timer.getTotalTime();
341 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
343 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- CPU MapArray tuning wrapper (fragment) -----------------------------------
// Times a MapArray skeleton (two inputs, one output) on the CPU backend.
// Incomplete view: signature, timer and vecArr declarations are omitted.
353 template <
typename StructType,
typename StructType2>
361 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
364 unsigned int nImpls = td->nImpls;
365 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
366 unsigned int dimens = td->dimens;
367 unsigned int actDimens = td->extra->actDimensions;
369 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
371 assert(dimens >= 1 && dimens <= 2 && actDimens == 3);
373 size_t sizes[MAX_PARAMS];
// With a single computed dimension all three operands use problemSize[0];
// with two, operands 1 and 2 use problemSize[1].
376 sizes[0] = td->problemSize[0];
377 sizes[1] = (dimens == 1)? td->problemSize[0] : td->problemSize[1];
378 sizes[2] = (dimens == 1)? td->problemSize[0] : td->problemSize[1];
380 for(
unsigned int i=0; i<actDimens; ++i)
382 vecArr[i].
resize(sizes[i]);
// NOTE(review): commCost accumulated below without visible zero-init.
385 double commCost[MAX_EXEC_PLANS];
// Cost model below assumes equal operand sizes.
387 assert(sizes[0] == sizes[1] && sizes[1] == sizes[2]);
392 bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
397 int *memUpFlags = td->extra->memUp;
398 int *memDownFlags = td->extra->memDown;
399 for(
unsigned int i=0; i<actDimens; ++i)
401 if(i < (actDimens - 1) && memUpFlags[i] == 1)
402 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
403 else if(i == (actDimens - 1) && memDownFlags[0] == 1)
404 commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0]));
// Multi-plan path: per-plan costs for partial/full operand transfers.
412 commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
416 assert(sizes[0] == sizes[1]);
418 commCost[4] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[1]));
419 commCost[5] = commCost[2] + bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[1]));
// NOTE(review): "new" without visible matching "delete" in this fragment.
429 StructType *userFunc =
new StructType;
430 if(td->callBackFunction != NULL)
431 td->callBackFunction(userFunc, sizes, actDimens);
// Timed CPU MapArray over exactly three operands.
437 mapArrTest.
CPU(vecArr[0], vecArr[1], vecArr[2]);
441 DEBUG_TUNING_LEVEL3(
"*CPU* maparray size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
443 std::string printStr =
"";
444 for(
unsigned int i=0; i<nImpls; ++i)
446 td->exec_time[i] = commCost[i] + timer.getTotalTime();
447 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
449 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- CPU MapReduce tuning wrapper (fragment) ----------------------------------
// Times a fused MapReduce skeleton on the CPU backend; uses two user functors
// (map functor StructType, reduce functor StructType2). Incomplete view:
// signature, timer and vecArr declarations are in omitted lines.
460 template <
typename StructType,
typename StructType2>
468 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
471 unsigned int nImpls = td->nImpls;
472 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
473 unsigned int dimens = td->dimens;
474 unsigned int actDimens = td->extra->actDimensions;
476 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
478 assert(dimens == 1 && actDimens >= 1 && actDimens <= 3);
// volatile: keeps the result live so the timed call is not optimized away.
481 volatile typename StructType::TYPE retVal;
483 size_t sizes[MAX_PARAMS];
485 for(
unsigned int i=0; i<actDimens; ++i)
487 sizes[i] = td->problemSize[0];
488 vecArr[i].
resize(sizes[i]);
// NOTE(review): commCost accumulated below without visible zero-init.
491 double commCost[MAX_EXEC_PLANS];
495 double commCostPerOp = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
497 bool singlePlan = (td->extra->memUp != NULL);
502 int *memUpFlags = td->extra->memUp;
503 for(
unsigned int i=0; i<actDimens; ++i)
505 if(memUpFlags[i] == 1)
506 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
// Multi-plan path: fixed communication-cost table per implementation index.
514 commCost[2] = commCostPerOp;
519 commCost[4] = commCostPerOp;
520 commCost[5] = commCostPerOp * 2;
525 commCost[7] = commCostPerOp;
526 commCost[8] = commCostPerOp * 2;
527 commCost[9] = commCostPerOp * 3;
// Two functors for the fused skeleton; MapReduce uses a dedicated callback.
// NOTE(review): neither "new" has a visible matching "delete" here.
537 StructType *userFunc =
new StructType;
538 StructType2 *userFunc2 =
new StructType2;
539 if(td->callBackFunctionMapReduce != NULL)
540 td->callBackFunctionMapReduce(userFunc, userFunc2, sizes, actDimens);
// Timed CPU MapReduce with 1..3 operands.
547 retVal = mapRedTest.
CPU(vecArr[0]);
548 else if(actDimens == 2)
549 retVal = mapRedTest.
CPU(vecArr[0],vecArr[1]);
550 else if(actDimens == 3)
551 retVal = mapRedTest.
CPU(vecArr[0],vecArr[1], vecArr[2]);
557 DEBUG_TUNING_LEVEL3(
"*CPU* mapreduce size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
559 std::string printStr =
"";
560 for(
unsigned int i=0; i<nImpls; ++i)
562 td->exec_time[i] = commCost[i] + timer.getTotalTime();
563 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
565 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- OpenMP Map tuning wrapper (fragment) -------------------------------------
// Same structure as the CPU Map wrapper but dispatches to the OpenMP backend
// (mapTest.OMP). Incomplete view: signature, timer and vecArr declarations
// are in omitted lines.
581 template <
typename StructType,
typename StructType2>
589 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
592 unsigned int nImpls = td->nImpls;
593 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
594 unsigned int dimens = td->dimens;
595 unsigned int actDimens = td->extra->actDimensions;
597 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
599 assert(dimens == 1 && actDimens >= 1 && actDimens <= 4);
601 size_t sizes[MAX_PARAMS];
603 for(
unsigned int i=0; i<actDimens; ++i)
605 sizes[i] = td->problemSize[0];
606 vecArr[i].
resize(sizes[i]);
// NOTE(review): commCost accumulated below without visible zero-init.
609 double commCost[MAX_EXEC_PLANS];
613 double commCostPerOp = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
616 bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
621 int *memUpFlags = td->extra->memUp;
622 int *memDownFlags = td->extra->memDown;
623 for(
unsigned int i=0; i<actDimens; ++i)
625 if(i < (actDimens - 1) && memUpFlags[i] == 1)
626 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
627 else if(i == (actDimens - 1) && memDownFlags[0] == 1)
628 commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0]));
// Multi-plan path: fixed communication-cost table per implementation index.
636 commCost[2] = commCostPerOp;
641 commCost[4] = commCostPerOp;
642 commCost[5] = commCostPerOp * 2;
647 commCost[7] = commCostPerOp;
648 commCost[8] = commCostPerOp * 2;
649 commCost[9] = commCostPerOp * 3;
// NOTE(review): "new" without visible matching "delete" in this fragment.
659 StructType *userFunc =
new StructType;
660 if(td->callBackFunction != NULL)
661 td->callBackFunction(userFunc, sizes, actDimens);
// Timed OpenMP Map with 1..4 operands.
668 mapTest.
OMP(vecArr[0]);
669 else if(actDimens == 2)
670 mapTest.
OMP(vecArr[0],vecArr[1]);
671 else if(actDimens == 3)
672 mapTest.
OMP(vecArr[0],vecArr[1], vecArr[2]);
673 else if(actDimens == 4)
674 mapTest.
OMP(vecArr[0],vecArr[1], vecArr[2], vecArr[3]);
680 DEBUG_TUNING_LEVEL3(
"*OpenMP* map size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
682 std::string printStr =
"";
683 for(
unsigned int i=0; i<nImpls; ++i)
685 td->exec_time[i] = commCost[i] + timer.getTotalTime();
686 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
688 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- OpenMP Reduce tuning wrapper (fragment) -----------------------------------
// Same structure as the CPU Reduce wrapper but dispatches to redTest.OMP.
// Incomplete view: signature, timer and vecArr declarations are omitted.
698 template <
typename StructType,
typename StructType2>
706 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
709 unsigned int nImpls = td->nImpls;
710 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
711 unsigned int dimens = td->dimens;
712 unsigned int actDimens = td->extra->actDimensions;
714 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
716 assert(dimens == 1 && actDimens == 1);
// volatile: keeps the result live so the timed call is not optimized away.
719 volatile typename StructType::TYPE retVal;
721 size_t sizes[MAX_PARAMS];
723 for(
unsigned int i=0; i<actDimens; ++i)
725 sizes[i] = ((actDimens != dimens)? td->problemSize[0] : td->problemSize[i]);
726 vecArr[i].
resize(sizes[i]);
// NOTE(review): commCost accumulated below without visible zero-init.
729 double commCost[MAX_EXEC_PLANS];
735 bool singlePlan = (td->extra->memUp != NULL);
740 int *memUpFlags = td->extra->memUp;
741 for(
unsigned int i=0; i<actDimens; ++i)
743 if(memUpFlags[i] == 1)
744 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
// Multi-plan path: fixed cost for plan index 2.
752 commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
// NOTE(review): "new" without visible matching "delete" in this fragment.
762 StructType *userFunc =
new StructType;
763 if(td->callBackFunction != NULL)
764 td->callBackFunction(userFunc, sizes, actDimens);
// Timed OpenMP reduction over the single operand.
770 retVal = redTest.
OMP(vecArr[0]);
774 DEBUG_TUNING_LEVEL3(
"*OpenMP* reduce size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
776 std::string printStr =
"";
777 for(
unsigned int i=0; i<nImpls; ++i)
779 td->exec_time[i] = commCost[i] + timer.getTotalTime();
780 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
782 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- OpenMP MapOverlap tuning wrapper (fragment) --------------------------------
// Same structure as the CPU MapOverlap wrapper but dispatches to
// mapOverTest.OMP. Incomplete view: signature/timer/vecArr lines omitted.
792 template <
typename StructType,
typename StructType2>
800 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
803 unsigned int nImpls = td->nImpls;
804 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
805 unsigned int dimens = td->dimens;
806 unsigned int actDimens = td->extra->actDimensions;
808 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
810 assert(dimens == 1 && actDimens >=1 && actDimens <= 2);
// volatile result keeps the timed call from being optimized away.
// NOTE(review): retVal appears unused in the visible lines of this wrapper.
813 volatile typename StructType::TYPE retVal;
815 size_t sizes[MAX_PARAMS];
817 for(
unsigned int i=0; i<actDimens; ++i)
819 sizes[i] = ((actDimens != dimens)? td->problemSize[0] : td->problemSize[i]);
820 vecArr[i].
resize(sizes[i]);
// NOTE(review): commCost accumulated below without visible zero-init.
823 double commCost[MAX_EXEC_PLANS];
829 bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
834 int *memUpFlags = td->extra->memUp;
835 int *memDownFlags = td->extra->memDown;
836 for(
unsigned int i=0; i<actDimens; ++i)
838 if(i < (actDimens - 1) && memUpFlags[i] == 1)
839 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
840 else if(i == (actDimens - 1) && memDownFlags[0] == 1)
841 commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0]));
// Multi-plan path: fixed cost for plan index 2.
849 commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
// NOTE(review): "new" without visible matching "delete" in this fragment.
859 StructType *userFunc =
new StructType;
860 if(td->callBackFunction != NULL)
861 td->callBackFunction(userFunc, sizes, actDimens);
// Timed OpenMP MapOverlap with one or two operands.
868 mapOverTest.
OMP(vecArr[0]);
869 else if(actDimens == 2)
870 mapOverTest.
OMP(vecArr[0], vecArr[1]);
876 DEBUG_TUNING_LEVEL3(
"*OpenMP* mapoverlap size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
878 std::string printStr =
"";
879 for(
unsigned int i=0; i<nImpls; ++i)
881 td->exec_time[i] = commCost[i] + timer.getTotalTime();
882 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
884 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- OpenMP MapArray tuning wrapper (fragment) ----------------------------------
// Same structure as the CPU MapArray wrapper but dispatches to mapArrTest.OMP.
// Incomplete view: signature, timer and vecArr declarations are omitted.
894 template <
typename StructType,
typename StructType2>
902 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
905 unsigned int nImpls = td->nImpls;
906 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
907 unsigned int dimens = td->dimens;
908 unsigned int actDimens = td->extra->actDimensions;
910 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
912 assert(dimens >= 1 && dimens <= 2 && actDimens == 3);
914 size_t sizes[MAX_PARAMS];
// Operand sizing mirrors the CPU MapArray wrapper.
917 sizes[0] = td->problemSize[0];
918 sizes[1] = (dimens == 1)? td->problemSize[0] : td->problemSize[1];
919 sizes[2] = (dimens == 1)? td->problemSize[0] : td->problemSize[1];
921 for(
unsigned int i=0; i<actDimens; ++i)
923 vecArr[i].
resize(sizes[i]);
// NOTE(review): commCost accumulated below without visible zero-init.
926 double commCost[MAX_EXEC_PLANS];
928 assert(sizes[0] == sizes[1] && sizes[1] == sizes[2]);
933 bool singlePlan = (td->extra->memUp != NULL && td->extra->memDown != NULL);
938 int *memUpFlags = td->extra->memUp;
939 int *memDownFlags = td->extra->memDown;
940 for(
unsigned int i=0; i<actDimens; ++i)
942 if(i < (actDimens - 1) && memUpFlags[i] == 1)
943 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
944 else if(i == (actDimens - 1) && memDownFlags[0] == 1)
945 commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0]));
// Multi-plan path: per-plan costs for partial/full operand transfers.
953 commCost[2] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
957 assert(sizes[0] == sizes[1]);
959 commCost[4] = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[1]));
960 commCost[5] = commCost[2] + bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[1]));
// NOTE(review): "new" without visible matching "delete" in this fragment.
970 StructType *userFunc =
new StructType;
971 if(td->callBackFunction != NULL)
972 td->callBackFunction(userFunc, sizes, actDimens);
// Timed OpenMP MapArray over exactly three operands.
978 mapArrTest.
OMP(vecArr[0], vecArr[1], vecArr[2]);
982 DEBUG_TUNING_LEVEL3(
"*OpenMP* maparray size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
984 std::string printStr =
"";
985 for(
unsigned int i=0; i<nImpls; ++i)
987 td->exec_time[i] = commCost[i] + timer.getTotalTime();
988 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
990 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- OpenMP MapReduce tuning wrapper (fragment) ----------------------------------
// Same structure as the CPU MapReduce wrapper but dispatches to mapRedTest.OMP.
// Incomplete view: signature, timer and vecArr declarations are omitted.
1001 template <
typename StructType,
typename StructType2>
1009 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
1012 unsigned int nImpls = td->nImpls;
1013 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
1014 unsigned int dimens = td->dimens;
1015 unsigned int actDimens = td->extra->actDimensions;
1017 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
1019 assert(dimens == 1 && actDimens >= 1 && actDimens <= 3);
// volatile: keeps the result live so the timed call is not optimized away.
1022 volatile typename StructType::TYPE retVal;
1024 size_t sizes[MAX_PARAMS];
1026 for(
unsigned int i=0; i<actDimens; ++i)
1028 sizes[i] = td->problemSize[0];
1029 vecArr[i].
resize(sizes[i]);
// NOTE(review): commCost accumulated below without visible zero-init.
1032 double commCost[MAX_EXEC_PLANS];
1036 double commCostPerOp = bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
1039 bool singlePlan = (td->extra->memUp != NULL);
// Single-plan mode trains exactly one implementation.
1042 assert(nImpls == 1);
1044 int *memUpFlags = td->extra->memUp;
1045 for(
unsigned int i=0; i<actDimens; ++i)
1047 if(memUpFlags[i] == 1)
1048 commCost[0] += commCostPerOp;
// Multi-plan path: fixed communication-cost table per implementation index.
1056 commCost[2] = commCostPerOp;
1061 commCost[4] = commCostPerOp;
1062 commCost[5] = commCostPerOp * 2;
1067 commCost[7] = commCostPerOp;
1068 commCost[8] = commCostPerOp * 2;
1069 commCost[9] = commCostPerOp * 3;
1075 assert(nImpls == 1);
// Two functors (map + reduce); dedicated MapReduce callback initializes data.
// NOTE(review): neither "new" has a visible matching "delete" here.
1079 StructType *userFunc =
new StructType;
1080 StructType2 *userFunc2 =
new StructType2;
1081 if(td->callBackFunctionMapReduce != NULL)
1082 td->callBackFunctionMapReduce(userFunc, userFunc2, sizes, actDimens);
// Timed OpenMP MapReduce with 1..3 operands.
1089 retVal = mapRedTest.
OMP(vecArr[0]);
1090 else if(actDimens == 2)
1091 retVal = mapRedTest.
OMP(vecArr[0],vecArr[1]);
1092 else if(actDimens == 3)
1093 retVal = mapRedTest.
OMP(vecArr[0],vecArr[1], vecArr[2]);
1099 DEBUG_TUNING_LEVEL3(
"*OpenMP* mapreduce size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
1101 std::string printStr =
"";
1102 for(
unsigned int i=0; i<nImpls; ++i)
1104 td->exec_time[i] = commCost[i] + timer.getTotalTime();
1105 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
1107 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- CUDA Map tuning wrapper (fragment) -------------------------------------
// Times a Map skeleton on the CUDA backend. Here the cost model charges for
// transfers that are NOT already resident (flag == 0), the inverse of the
// CPU/OpenMP wrappers — presumably because data must live on the device for
// this backend; confirm against the full source. Incomplete view: signature,
// timer, vecArr and several branch headers are in omitted lines.
1123 template <
typename StructType,
typename StructType2>
1131 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
1134 unsigned int nImpls = td->nImpls;
1135 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
1136 unsigned int dimens = td->dimens;
1137 unsigned int actDimens = td->extra->actDimensions;
1140 int *memUpFlags = td->extra->memUp;
1141 int *memDownFlags = td->extra->memDown;
1142 bool singlePlan = (memUpFlags != NULL && memDownFlags != NULL);
1144 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
1146 assert(dimens == 1 && actDimens >= 1 && actDimens <= 4);
// NOTE(review): commCost accumulated below without visible zero-init.
1151 double commCost[MAX_EXEC_PLANS];
// Cost model: one host-to-device transfer of a whole operand.
1153 double commCostPerOp = bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
1159 size_t sizes[MAX_PARAMS];
1161 for(
unsigned int i=0; i<actDimens; ++i)
1163 sizes[i] = td->problemSize[0];
1164 vecArr[i].
resize(sizes[i]);
// Charge transfer cost only for operands not flagged as already placed.
1168 if(i == (actDimens-1) && memDownFlags[0] == 0)
1169 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
1171 if(i < (actDimens-1) && memUpFlags[i] == 0)
1172 commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0]));
1176 if(i == actDimens-1)
// Ensure pending device work finishes before the timed region.
1182 cudaDeviceSynchronize();
1185 assert(nImpls == 1);
// Multi-plan path: costs scale with the number of extra operands.
// NOTE(review): this assignment overwrites any commCost[0] accumulated
// above — confirm the branch structure in the omitted lines.
1188 commCost[0] = commCostPerOp * ((actDimens>1) ? (actDimens-1) : 1);
1191 commCost[1] = commCostPerOp * ((actDimens>2) ? (actDimens-2) : 0);
1192 commCost[2] = commCostPerOp * ((actDimens>2) ? (actDimens-2) : 0);
1196 commCost[3] = commCostPerOp * ((actDimens>3) ? (actDimens-3) : 0);
1197 commCost[4] = commCostPerOp * ((actDimens>3) ? (actDimens-3) : 0);
1198 commCost[5] = commCostPerOp * ((actDimens>3) ? (actDimens-3) : 0);
// NOTE(review): "new" without visible matching "delete" in this fragment.
1210 StructType *userFunc =
new StructType;
1211 if(td->callBackFunction != NULL)
1212 td->callBackFunction(userFunc, sizes, actDimens);
// Timed CUDA Map with 1..4 operands.
1219 mapTest.
CU(vecArr[0]);
1220 else if(actDimens == 2)
1221 mapTest.
CU(vecArr[0],vecArr[1]);
1222 else if(actDimens == 3)
1223 mapTest.
CU(vecArr[0],vecArr[1], vecArr[2]);
1224 else if(actDimens == 4)
1225 mapTest.
CU(vecArr[0],vecArr[1], vecArr[2], vecArr[3]);
1231 DEBUG_TUNING_LEVEL3(
"*CUDA* map size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
1233 std::string printStr =
"";
1234 for(
unsigned int i=0; i<nImpls; ++i)
1236 td->exec_time[i] = commCost[i] + timer.getTotalTime();
1237 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
1239 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- CUDA Reduce tuning wrapper (fragment) -----------------------------------
// Times a Reduce skeleton on the CUDA backend; transfer cost charged only
// when the operand is not flagged as already on the device (flag == 0).
// Incomplete view: signature, timer and vecArr declarations are omitted.
1250 template <
typename StructType,
typename StructType2>
1258 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
1261 unsigned int nImpls = td->nImpls;
1262 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
1263 unsigned int dimens = td->dimens;
1264 unsigned int actDimens = td->extra->actDimensions;
1267 int *memUpFlags = td->extra->memUp;
1268 bool singlePlan = (memUpFlags != NULL);
1270 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
1272 assert(dimens == 1 && actDimens == 1);
// volatile: keeps the result live so the timed call is not optimized away.
1275 volatile typename StructType::TYPE retVal;
// NOTE(review): commCost accumulated below without visible zero-init.
1278 double commCost[MAX_EXEC_PLANS];
1284 size_t sizes[MAX_PARAMS];
1286 for(
unsigned int i=0; i<actDimens; ++i)
1288 sizes[i] = td->problemSize[0];
1289 vecArr[i].
resize(sizes[i]);
1291 if(singlePlan && memUpFlags[i] == 0)
1293 commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0]));
// Ensure pending device work finishes before the timed region.
1300 cudaDeviceSynchronize();
1304 assert(nImpls == 1);
// Multi-plan path: one full host-to-device transfer.
// NOTE(review): overwrites any commCost[0] accumulated above — confirm
// the branch structure in the omitted lines.
1307 commCost[0] = bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
// NOTE(review): "new" without visible matching "delete" in this fragment.
1316 StructType *userFunc =
new StructType;
1317 if(td->callBackFunction != NULL)
1318 td->callBackFunction(userFunc, sizes, actDimens);
1322 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
// Timed CUDA reduction over the single operand.
1326 retVal = redTest.
CU(vecArr[0]);
1330 DEBUG_TUNING_LEVEL3(
"*CUDA* reduce size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
1332 std::string printStr =
"";
1333 for(
unsigned int i=0; i<nImpls; ++i)
1335 td->exec_time[i] = commCost[i] + timer.getTotalTime();
1336 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
1338 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- CUDA MapOverlap tuning wrapper (fragment) ---------------------------------
// Times a MapOverlap skeleton on the CUDA backend; transfer cost charged
// only for operands not flagged as already resident (flag == 0).
// Incomplete view: signature, timer and vecArr declarations are omitted.
1349 template <
typename StructType,
typename StructType2>
1357 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
1360 unsigned int nImpls = td->nImpls;
1361 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
1362 unsigned int dimens = td->dimens;
1363 unsigned int actDimens = td->extra->actDimensions;
1366 int *memUpFlags = td->extra->memUp;
1367 int *memDownFlags = td->extra->memDown;
1368 bool singlePlan = (memUpFlags != NULL && memDownFlags != NULL);
1370 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
1372 assert(dimens == 1 && actDimens >=1 && actDimens <= 2);
// volatile result keeps the timed call from being optimized away.
// NOTE(review): retVal appears unused in the visible lines of this wrapper.
1375 volatile typename StructType::TYPE retVal;
// NOTE(review): commCost accumulated below without visible zero-init.
1378 double commCost[MAX_EXEC_PLANS];
1384 size_t sizes[MAX_PARAMS];
1386 for(
unsigned int i=0; i<actDimens; ++i)
1388 sizes[i] = td->problemSize[0];
1389 vecArr[i].
resize(sizes[i]);
// Second operand (output) charged device-to-host; first (input) host-to-device.
1393 if(i == 1 && memDownFlags[0] == 0)
1394 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
1395 else if (i<1 && memUpFlags[i] == 0)
1396 commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0]));
// Ensure pending device work finishes before the timed region.
1406 cudaDeviceSynchronize();
1409 assert(nImpls == 1);
// Multi-plan path: one full host-to-device transfer.
// NOTE(review): overwrites any commCost[0] accumulated above — confirm
// the branch structure in the omitted lines.
1412 commCost[0] = bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
// NOTE(review): "new" without visible matching "delete" in this fragment.
1421 StructType *userFunc =
new StructType;
1422 if(td->callBackFunction != NULL)
1423 td->callBackFunction(userFunc, sizes, actDimens);
// Timed CUDA MapOverlap with one or two operands.
1430 mapOverTest.
CU(vecArr[0]);
1431 else if(actDimens == 2)
1432 mapOverTest.
CU(vecArr[0], vecArr[1]);
1438 DEBUG_TUNING_LEVEL3(
"*CUDA* mapoverlap size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
1440 std::string printStr =
"";
1441 for(
unsigned int i=0; i<nImpls; ++i)
1443 td->exec_time[i] = commCost[i] + timer.getTotalTime();
1444 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
1446 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- CUDA MapArray tuning wrapper (fragment) -----------------------------------
// Times a MapArray skeleton on the CUDA backend; transfer cost charged only
// for operands not flagged as already resident (flag == 0).
// Incomplete view: signature, timer and vecArr declarations are omitted.
1456 template <
typename StructType,
typename StructType2>
1464 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
1467 unsigned int nImpls = td->nImpls;
1468 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
1469 unsigned int dimens = td->dimens;
1470 unsigned int actDimens = td->extra->actDimensions;
1473 int *memUpFlags = td->extra->memUp;
1474 int *memDownFlags = td->extra->memDown;
1475 bool singlePlan = (memUpFlags != NULL && memDownFlags != NULL);
1478 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
1480 assert(dimens >= 1 && dimens <= 2 && actDimens == 3);
1482 size_t sizes[MAX_PARAMS];
// Operand sizing mirrors the CPU MapArray wrapper.
1485 sizes[0] = td->problemSize[0];
1486 sizes[1] = (dimens == 1)? td->problemSize[0] : td->problemSize[1];
1487 sizes[2] = (dimens == 1)? td->problemSize[0] : td->problemSize[1];
// NOTE(review): commCost accumulated below without visible zero-init.
1490 double commCost[MAX_EXEC_PLANS];
1491 assert(sizes[0] == sizes[1] && sizes[1] == sizes[2]);
1497 for(
unsigned int i=0; i<actDimens; ++i)
1499 vecArr[i].
resize(sizes[i]);
// Output charged device-to-host; inputs charged host-to-device when not resident.
1503 if(i == (actDimens - 1) && memDownFlags[0] == 0)
1504 commCost[0] += bwDataStruct.latency_dth + (bwDataStruct.timing_dth *
sizeof(
typename StructType::TYPE) * (sizes[0]));
1505 else if (i < (actDimens - 1) && memUpFlags[i] == 0)
1506 commCost[0] += bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0]));
1510 if(i == actDimens-1)
// Ensure pending device work finishes before the timed region.
1516 cudaDeviceSynchronize();
1519 assert(nImpls == 1);
// Multi-plan path: plan 0 pays for both inputs; plans 1 and 2 for one.
// NOTE(review): overwrites any commCost[0] accumulated above — confirm
// the branch structure in the omitted lines.
1522 commCost[0] = 2 * (bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0])));
1525 commCost[1] = bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (sizes[0]));
1526 commCost[2] = commCost[1];
// NOTE(review): "new" without visible matching "delete" in this fragment.
1537 StructType *userFunc =
new StructType;
1538 if(td->callBackFunction != NULL)
1539 td->callBackFunction(userFunc, sizes, actDimens);
// Timed CUDA MapArray over exactly three operands.
1545 mapArrTest.
CU(vecArr[0], vecArr[1], vecArr[2]);
1549 DEBUG_TUNING_LEVEL3(
"*CUDA* maparray size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
1551 std::string printStr =
"";
1552 for(
unsigned int i=0; i<nImpls; ++i)
1554 td->exec_time[i] = commCost[i] + timer.getTotalTime();
1555 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
1557 DEBUG_TUNING_LEVEL3(printStr +
"\n");
// ---- CUDA MapReduce tuning wrapper (fragment) ----------------------------------
// Times a fused MapReduce skeleton on the CUDA backend with two user
// functors; transfer cost charged only for operands not flagged as resident
// (flag == 0). Incomplete view: signature/timer/vecArr lines omitted.
1568 template <
typename StructType,
typename StructType2>
1576 TrainingData *td=
reinterpret_cast<TrainingData*
>(arg);
1579 unsigned int nImpls = td->nImpls;
1580 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
1581 unsigned int dimens = td->dimens;
1582 unsigned int actDimens = td->extra->actDimensions;
1585 int *memUpFlags = td->extra->memUp;
1586 bool singlePlan = (memUpFlags != NULL);
1588 DEBUG_TUNING_LEVEL3(
"Computed dimensions: " << dimens <<
", Actual dimensions: " << actDimens <<
"\n");
1590 assert(dimens == 1 && actDimens >= 1 && actDimens <= 3);
// volatile: keeps the result live so the timed call is not optimized away.
1593 volatile typename StructType::TYPE retVal;
// NOTE(review): commCost accumulated below without visible zero-init.
1596 double commCost[MAX_EXEC_PLANS];
// Cost model: one host-to-device transfer of a whole operand.
1598 double costPerOp = bwDataStruct.latency_htd + (bwDataStruct.timing_htd *
sizeof(
typename StructType::TYPE) * (td->problemSize[0]));
1603 size_t sizes[MAX_PARAMS];
1605 for(
unsigned int i=0; i<actDimens; ++i)
1607 sizes[i] = td->problemSize[0];
1608 vecArr[i].
resize(sizes[i]);
1610 if(singlePlan && memUpFlags[i] == 0)
1612 commCost[0] += costPerOp;
// Ensure pending device work finishes before the timed region.
1619 cudaDeviceSynchronize();
1622 assert(nImpls == 1);
// Multi-plan path: costs scale with the number of extra operands.
// NOTE(review): overwrites any commCost[0] accumulated above — confirm
// the branch structure in the omitted lines.
1625 commCost[0] = costPerOp * actDimens;
1628 commCost[1] = costPerOp * ((actDimens>1) ? (actDimens-1) : 0);
1629 commCost[2] = costPerOp * ((actDimens>1) ? (actDimens-1) : 0);
1633 commCost[3] = costPerOp * ((actDimens>2) ? (actDimens-2) : 0);
1634 commCost[4] = costPerOp * ((actDimens>2) ? (actDimens-2) : 0);
1635 commCost[5] = costPerOp * ((actDimens>2) ? (actDimens-2) : 0);
// Two functors (map + reduce); dedicated MapReduce callback initializes data.
// NOTE(review): neither "new" has a visible matching "delete" here.
1647 StructType *userFunc =
new StructType;
1648 StructType2 *userFunc2 =
new StructType2;
1649 if(td->callBackFunctionMapReduce != NULL)
1650 td->callBackFunctionMapReduce(userFunc, userFunc2, sizes, actDimens);
// Timed CUDA MapReduce with 1..3 operands.
1657 retVal = mapRedTest.
CU(vecArr[0]);
1658 else if(actDimens == 2)
1659 retVal = mapRedTest.
CU(vecArr[0],vecArr[1]);
1660 else if(actDimens == 3)
1661 retVal = mapRedTest.
CU(vecArr[0],vecArr[1], vecArr[2]);
1667 DEBUG_TUNING_LEVEL3(
"*CUDA* mapreduce size: " << sizes[0] <<
"\n");
// Store measured time plus per-plan communication cost.
1669 std::string printStr =
"";
1670 for(
unsigned int i=0; i<nImpls; ++i)
1672 td->exec_time[i] = commCost[i] + timer.getTotalTime();
1673 printStr +=
" " + convertToStr<double>(td->exec_time[i]);
1675 DEBUG_TUNING_LEVEL3(printStr +
"\n");
1694 template <
typename StructType, SkeletonType type,
typename StructType2 = StructType>
1715 #if defined(SKEPU_OPENCL) && !defined(SKEPU_CUDA) && SKEPU_NUMGPU == 1
1716 bp.backend = CL_BACKEND;
1717 #elif defined(SKEPU_OPENCL) && !defined(SKEPU_CUDA) && SKEPU_NUMGPU != 1
1718 bp.backend = CLM_BACKEND;
1719 #elif !defined(SKEPU_OPENCL) && defined(SKEPU_CUDA) && SKEPU_NUMGPU == 1
1720 bp.backend = CU_BACKEND;
1721 #elif !defined(SKEPU_OPENCL) && defined(SKEPU_CUDA) && SKEPU_NUMGPU != 1
1722 bp.backend = CUM_BACKEND;
1723 #elif defined(SKEPU_OPENCL) && defined(SKEPU_CUDA) && SKEPU_NUMGPU == 1
1724 bp.backend = CL_BACKEND;
1725 #elif defined(SKEPU_OPENCL) && defined(SKEPU_CUDA) && SKEPU_NUMGPU != 1
1726 bp.backend = CLM_BACKEND;
1727 #elif !defined(SKEPU_OPENCL) && !defined(SKEPU_CUDA)
1729 #if defined(SKEPU_OPENMP)
1730 bp.backend = OMP_BACKEND;
1732 bp.backend = CPU_BACKEND;
1738 bp.maxThreads = environment->m_devices_CL.at(0)->getMaxThreads();
1739 bp.maxBlocks = environment->m_devices_CL.at(0)->getMaxBlocks();
1743 bp.maxThreads = environment->m_devices_CU.at(0)->getMaxThreads();
1744 bp.maxBlocks = environment->m_devices_CU.at(0)->getMaxBlocks();
1748 #ifdef SKEPU_OPENMP_THREADS
1749 bp.numOmpThreads = SKEPU_OPENMP_THREADS;
1751 bp.numOmpThreads = omp_get_max_threads();
1763 #include "skepu/src/makedir.h"
1776 std::string path = getPMDirectory();
1777 path +=
id +
".meta";
1778 if(fileExists(path))
1780 std::ifstream infile(path.c_str());
1782 assert(infile.good());
1784 std::string strLine;
1785 size_t low, upp, numCUThreads, numCUBlocks;
1786 unsigned int numOmpThreads;
1788 std::string impTypeStr;
1790 while(infile.good())
1792 getline(infile, strLine);
1794 if(strLine[0] ==
'%' || strLine[0] ==
'/' || strLine[0] ==
'#')
1797 std::istringstream iss(strLine);
1798 iss >> low >> upp >> impTypeStr;
1799 iss >> numOmpThreads >> numCUThreads >> numCUBlocks;
1801 bp.numOmpThreads = numOmpThreads;
1802 bp.maxThreads = numCUThreads;
1803 bp.maxBlocks = numCUBlocks;
1805 impTypeStr = capitalizeString(impTypeStr);
1807 if(impTypeStr ==
"CPU")
1809 bp.backend = CPU_BACKEND;
1812 else if(impTypeStr ==
"OMP")
1814 bp.backend = OMP_BACKEND;
1817 else if(impTypeStr ==
"CUDA")
1819 bp.backend = CU_BACKEND;
1825 plan.add(low, upp, bp);
1844 std::string path = getPMDirectory();
1845 std::string file(path +
id +
".meta");
1847 if(fileExists(file) ==
false)
1851 std::ofstream outfile(file.c_str());
1853 assert(outfile.good());
1856 outfile <<
"% Execution plan for " <<
id <<
"\n";
1857 std::map< std::pair<size_t, size_t>,
BackEndParams > m_data = plan.sizePlan;
1858 for(std::map< std::pair<size_t, size_t>,
BackEndParams >::iterator it = m_data.begin(); it != m_data.end(); ++it)
1860 std::string beTypeStr =
"";
1865 beTypeStr =
"CPU " + convertIntToString(bp.numOmpThreads) +
" " + convertIntToString(bp.maxThreads) +
" " + convertIntToString(bp.maxBlocks);
1868 beTypeStr =
"OMP " + convertIntToString(bp.numOmpThreads) +
" " + convertIntToString(bp.maxThreads) +
" " + convertIntToString(bp.maxBlocks);
1871 beTypeStr =
"CUDA " + convertIntToString(bp.numOmpThreads) +
" " + convertIntToString(bp.maxThreads) +
" " + convertIntToString(bp.maxBlocks);
1877 outfile << it->first.first <<
" " << it->first.second <<
" " << beTypeStr <<
"\n";
1896 assert(planArray != NULL);
1898 std::string path = getPMDirectory();
1899 path +=
id +
"_multi.meta";
1900 if(fileExists(path))
1902 std::ifstream infile(path.c_str());
1904 assert(infile.good());
1906 std::string strLine;
1907 size_t low, upp, numCUThreads, numCUBlocks;
1908 unsigned int numOmpThreads;
1909 std::string impTypeStr;
1913 while(infile.good())
1915 getline(infile, strLine);
1917 if(strLine[0] ==
'%' || strLine[0] ==
'/')
1920 if(strLine[0] ==
'#')
1924 std::istringstream iss(strLine);
1927 assert(idx == tmpIdx);
1930 assert(idx >= 0 && idx < MAX_EXEC_PLANS);
1932 std::istringstream iss(strLine);
1933 iss >> low >> upp >> impTypeStr;
1934 iss >> numOmpThreads >> numCUThreads >> numCUBlocks;
1936 bp.numOmpThreads = numOmpThreads;
1937 bp.maxThreads = numCUThreads;
1938 bp.maxBlocks = numCUBlocks;
1940 impTypeStr = capitalizeString(impTypeStr);
1942 if(impTypeStr ==
"CPU")
1944 bp.backend = CPU_BACKEND;
1947 else if(impTypeStr ==
"OMP")
1949 bp.backend = OMP_BACKEND;
1952 else if(impTypeStr ==
"CUDA")
1954 bp.backend = CU_BACKEND;
1960 planArray[idx].add(low, upp, bp);
1978 assert(planArray != NULL);
1980 std::string path = getPMDirectory();
1981 std::string file(path +
id +
"_multi.meta");
1983 if(fileExists(file) ==
false)
1987 std::ofstream outfile(file.c_str());
1989 assert(outfile.good());
1991 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
1993 outfile <<
"% Execution plan for " <<
id <<
"\n";
1994 for(
unsigned int i=0; i<nImpls; ++i)
1996 if(!planArray[i].calibrated)
1998 SKEPU_WARNING(
"[SKEPU Warning]: Plan '" <<
id <<
"' is not calibrated for index: " << i <<
"\n");
2002 outfile <<
"# " << i <<
"\n";
2004 const std::map< std::pair<size_t, size_t>,
BackEndParams > &m_data = planArray[i].sizePlan;
2005 for(std::map< std::pair<size_t, size_t>,
BackEndParams >::const_iterator it = m_data.begin(); it != m_data.end(); ++it)
2007 std::string beTypeStr =
"";
2012 beTypeStr =
"CPU " + convertIntToString(bp.numOmpThreads) +
" " + convertIntToString(bp.maxThreads) +
" " + convertIntToString(bp.maxBlocks);
2015 beTypeStr =
"OMP " + convertIntToString(bp.numOmpThreads) +
" " + convertIntToString(bp.maxThreads) +
" " + convertIntToString(bp.maxBlocks);
2018 beTypeStr =
"CUDA " + convertIntToString(bp.numOmpThreads) +
" " + convertIntToString(bp.maxThreads) +
" " + convertIntToString(bp.maxBlocks);
2024 outfile << it->first.first <<
" " << it->first.second <<
" " << beTypeStr <<
"\n";
2041 template <
typename StructType,
typename StructType2>
2042 struct Tuner<StructType, MAPREDUCE, StructType2>
2044 Tuner(std::string _id,
int _dimens,
size_t *_lowBounds,
size_t *_uppBounds): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2046 assert(dimens >= 1 && dimens <= 3 && lowBounds && uppBounds);
2048 extra.memDown = NULL;
2051 Tuner(std::string _id,
int _dimens,
size_t *_lowBounds,
size_t *_uppBounds,
int *_memUp): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2053 assert(dimens >= 1 && dimens <= 3 && lowBounds && uppBounds);
2054 extra.memUp = _memUp;
2055 extra.memDown = NULL;
2062 assert(execPlanArray!=NULL);
2068 int *oldMemUp = extra.memUp;
2071 unsigned int actDimens = dimens;
2072 std::string
interface = "mapreduce";
2076 unsigned int nImpls = 1;
2078 nImpls = nImpls = ( (actDimens == 1) ? 3 : ((actDimens == 2) ? 6 : 10) );
2080 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
2083 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2086 bool redoMesures =
false;
2087 for(
unsigned int i=0; i<nImpls; ++i)
2089 if(execPlanArray[i].calibrated ==
false)
2094 for(
unsigned int i=0; i<dimens; ++i)
2096 if(execPlanArray[i].isTrainedFor(lowBounds[i]) ==
false || execPlanArray[i].isTrainedFor(uppBounds[i]) ==
false)
2104 if(redoMesures ==
false)
2112 std::vector<size_t> upperBounds(dimens);
2113 std::vector<size_t> lowerBounds(dimens);
2115 for(
unsigned int i=0; i<dimens; ++i)
2117 upperBounds[i] = uppBounds[i];
2118 lowerBounds[i] = lowBounds[i];
2121 std::vector<ImpDetail*> impls;
2123 cpu_tune_wrapper_mapreduce<StructType, StructType2>(0);
2124 impls.push_back(
new ImpDetail(
"cpu_impl", IMPL_CPU, &cpu_tune_wrapper_mapreduce<StructType, StructType2>));
2127 omp_tune_wrapper_mapreduce<StructType, StructType2>(0);
2128 impls.push_back(
new ImpDetail(
"omp_impl", IMPL_OMP, &omp_tune_wrapper_mapreduce<StructType, StructType2>));
2132 cuda_tune_wrapper_mapreduce<StructType, StructType2>(0);
2133 impls.push_back(
new ImpDetail(
"cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_mapreduce<StructType, StructType2>));
2136 std::ofstream outfile(std::string(
"tree_data_multi_" +
id +
".dat").c_str());
2137 assert(outfile.good());
2140 extra.actDimensions = actDimens;
2141 Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2144 ExecPlanNew<1> planArr[MAX_EXEC_PLANS];
2147 for(
unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2149 if(planArr[i].calibrated ==
false)
2152 outfile << planArr[i];
2155 for(
unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2157 if(planArr[i].calibrated ==
false)
2160 execPlanArray[i].clear();
2161 outfile <<
"compressed plan:\n";
2163 for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = planArr[i].m_data.begin(); it != planArr[i].m_data.end(); ++it)
2168 bp.backend = CPU_BACKEND;
2171 bp.backend = OMP_BACKEND;
2174 bp.backend = CU_BACKEND;
2179 execPlanArray[i].add(it->first.first, it->first.second, bp);
2182 outfile << planArr[i];
2190 outfile << *(trainer.m_tree);
2191 DEBUG_TUNING_LEVEL2(
"\nTree: " << *(trainer.m_tree) <<
"\n");
2194 for(
unsigned int i=0; i<impls.size(); ++i)
2202 extra.memUp = oldMemUp;
2208 assert(extra.memUp != NULL);
2210 unsigned int actDimens = dimens;
2211 std::string
interface = "mapreduce";
2214 unsigned int nImpls = 1;
2219 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2229 std::vector<size_t> upperBounds(dimens);
2230 std::vector<size_t> lowerBounds(dimens);
2232 for(
unsigned int i=0; i<dimens; ++i)
2234 upperBounds[i] = uppBounds[i];
2235 lowerBounds[i] = lowBounds[i];
2238 std::vector<ImpDetail*> impls;
2240 cpu_tune_wrapper_mapreduce<StructType, StructType2>(0);
2241 impls.push_back(
new ImpDetail(
"cpu_impl", IMPL_CPU, &cpu_tune_wrapper_mapreduce<StructType, StructType2>));
2244 omp_tune_wrapper_mapreduce<StructType, StructType2>(0);
2245 impls.push_back(
new ImpDetail(
"omp_impl", IMPL_OMP, &omp_tune_wrapper_mapreduce<StructType, StructType2>));
2249 cuda_tune_wrapper_mapreduce<StructType, StructType2>(0);
2250 impls.push_back(
new ImpDetail(
"cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_mapreduce<StructType, StructType2>));
2253 std::ofstream outfile(std::string(
"tree_data_" +
id +
".dat").c_str());
2254 assert(outfile.good());
2256 extra.actDimensions = actDimens;
2257 Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2260 ExecPlanNew<1> plan;
2262 assert(plan.calibrated);
2265 outfile <<
"compressed plan:\n";
2267 for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = plan.m_data.begin(); it != plan.m_data.end(); ++it)
2272 bp.backend = CPU_BACKEND;
2275 bp.backend = OMP_BACKEND;
2278 bp.backend = CU_BACKEND;
2283 execPlan.add(it->first.first, it->first.second, bp);
2292 outfile << *(trainer.m_tree);
2293 DEBUG_TUNING_LEVEL2(
"\nTree: " << *(trainer.m_tree) <<
"\n");
2296 for(
int i=0; i<impls.size(); ++i)
2307 void (*callBackFunction)(
void*,
size_t*,
unsigned int);
2308 void (*callBackFunctionMapReduce)(
void*,
void*,
size_t*,
unsigned int);
2312 unsigned int dimens;
2325 template <
typename StructType>
2326 struct Tuner<StructType, MAP, StructType>
2328 Tuner(std::string _id,
int _dimens,
size_t *_lowBounds,
size_t *_uppBounds): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2330 assert(dimens >= 1 && dimens <= 4 && lowBounds && uppBounds);
2332 extra.memDown = NULL;
2335 Tuner(std::string _id,
int _dimens,
size_t *_lowBounds,
size_t *_uppBounds,
int *_memUp,
int *_memDown): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2337 assert(dimens >= 1 && dimens <= 4 && lowBounds && uppBounds);
2338 extra.memUp = _memUp;
2339 extra.memDown = _memDown;
2346 assert(execPlanArray!=NULL);
2352 int *oldMemUp = extra.memUp;
2354 int *oldMemDown = extra.memDown;
2355 extra.memDown = NULL;
2358 unsigned int actDimens = dimens;
2359 std::string
interface = "map";
2363 unsigned int nImpls = 1;
2365 nImpls = ( (actDimens == 1 || actDimens == 2) ? 3 : ((actDimens == 3) ? 6 : 10) );
2367 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
2370 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2373 bool redoMesures =
false;
2374 for(
unsigned int i=0; i<nImpls; ++i)
2376 if(execPlanArray[i].calibrated ==
false)
2381 for(
unsigned int i=0; i<dimens; ++i)
2383 if(execPlanArray[i].isTrainedFor(lowBounds[i]) ==
false || execPlanArray[i].isTrainedFor(uppBounds[i]) ==
false)
2391 if(redoMesures ==
false)
2399 std::vector<size_t> upperBounds(dimens);
2400 std::vector<size_t> lowerBounds(dimens);
2402 for(
unsigned int i=0; i<dimens; ++i)
2404 upperBounds[i] = uppBounds[i];
2405 lowerBounds[i] = lowBounds[i];
2408 std::vector<ImpDetail*> impls;
2410 cpu_tune_wrapper_map<StructType, StructType>(0);
2411 impls.push_back(
new ImpDetail(
"cpu_impl", IMPL_CPU, &cpu_tune_wrapper_map<StructType, StructType>));
2414 omp_tune_wrapper_map<StructType, StructType>(0);
2415 impls.push_back(
new ImpDetail(
"omp_impl", IMPL_OMP, &omp_tune_wrapper_map<StructType, StructType>));
2419 cuda_tune_wrapper_map<StructType, StructType>(0);
2420 impls.push_back(
new ImpDetail(
"cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_map<StructType, StructType>));
2423 std::ofstream outfile(std::string(
"tree_data_multi_" +
id +
".dat").c_str());
2424 assert(outfile.good());
2426 extra.actDimensions = actDimens;
2427 Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2430 ExecPlanNew<1> planArr[MAX_EXEC_PLANS];
2433 for(
unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2435 if(planArr[i].calibrated ==
false)
2438 outfile << planArr[i];
2441 for(
unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2443 if(planArr[i].calibrated ==
false)
2446 execPlanArray[i].clear();
2447 outfile <<
"compressed plan:\n";
2449 for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = planArr[i].m_data.begin(); it != planArr[i].m_data.end(); ++it)
2454 bp.backend = CPU_BACKEND;
2457 bp.backend = OMP_BACKEND;
2460 bp.backend = CU_BACKEND;
2465 execPlanArray[i].add(it->first.first, it->first.second, bp);
2468 outfile << planArr[i] ;
2476 outfile << *(trainer.m_tree);
2477 DEBUG_TUNING_LEVEL2(
"\nTree: " << *(trainer.m_tree) <<
"\n");
2480 for(
unsigned int i=0; i<impls.size(); ++i)
2488 extra.memUp = oldMemUp;
2489 extra.memDown = oldMemDown;
2495 assert(extra.memUp != NULL && extra.memDown != NULL);
2497 unsigned int actDimens = dimens;
2498 std::string
interface = "map";
2501 unsigned int nImpls = 1;
2506 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2516 std::vector<size_t> upperBounds(dimens);
2517 std::vector<size_t> lowerBounds(dimens);
2519 for(
unsigned int i=0; i<dimens; ++i)
2521 upperBounds[i] = uppBounds[i];
2522 lowerBounds[i] = lowBounds[i];
2525 std::vector<ImpDetail*> impls;
2527 cpu_tune_wrapper_map<StructType, StructType>(0);
2528 impls.push_back(
new ImpDetail(
"cpu_impl", IMPL_CPU, &cpu_tune_wrapper_map<StructType, StructType>));
2531 omp_tune_wrapper_map<StructType, StructType>(0);
2532 impls.push_back(
new ImpDetail(
"omp_impl", IMPL_OMP, &omp_tune_wrapper_map<StructType, StructType>));
2536 cuda_tune_wrapper_map<StructType, StructType>(0);
2537 impls.push_back(
new ImpDetail(
"cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_map<StructType, StructType>));
2540 std::ofstream outfile(std::string(
"tree_data_" +
id +
".dat").c_str());
2541 assert(outfile.good());
2543 extra.actDimensions = actDimens;
2544 Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2547 ExecPlanNew<1> plan;
2549 assert(plan.calibrated);
2552 outfile <<
"compressed plan:\n";
2554 for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = plan.m_data.begin(); it != plan.m_data.end(); ++it)
2559 bp.backend = CPU_BACKEND;
2562 bp.backend = OMP_BACKEND;
2565 bp.backend = CU_BACKEND;
2570 execPlan.add(it->first.first, it->first.second, bp);
2579 outfile << *(trainer.m_tree);
2580 DEBUG_TUNING_LEVEL2(
"\nTree: " << *(trainer.m_tree) <<
"\n");
2583 for(
int i=0; i<impls.size(); ++i)
2594 void (*callBackFunction)(
void*,
size_t*,
unsigned int);
2595 void (*callBackFunctionMapReduce)(
void*,
void*,
size_t*,
unsigned int);
2599 unsigned int dimens;
2615 template <
typename StructType>
2616 struct Tuner<StructType, REDUCE, StructType>
2618 Tuner(std::string _id,
int _dimens,
size_t *_lowBounds,
size_t *_uppBounds): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2620 assert(dimens == 1 && lowBounds && uppBounds);
2622 extra.memDown = NULL;
2625 Tuner(std::string _id,
int _dimens,
size_t *_lowBounds,
size_t *_uppBounds,
int *_memUp): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2627 assert(dimens == 1 && lowBounds && uppBounds);
2628 extra.memUp = _memUp;
2629 extra.memDown = NULL;
2636 assert(execPlanArray!=NULL);
2642 int *oldMemUp = extra.memUp;
2645 unsigned int actDimens = dimens;
2646 std::string
interface = "reduce";
2649 unsigned int nImpls = 1;
2653 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
2656 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2659 bool redoMesures =
false;
2660 for(
unsigned int i=0; i<nImpls; ++i)
2662 if(execPlanArray[i].calibrated ==
false)
2667 for(
unsigned int i=0; i<dimens; ++i)
2669 if(execPlanArray[i].isTrainedFor(lowBounds[i]) ==
false || execPlanArray[i].isTrainedFor(uppBounds[i]) ==
false)
2677 if(redoMesures ==
false)
2685 std::vector<size_t> upperBounds(dimens);
2686 std::vector<size_t> lowerBounds(dimens);
2688 for(
unsigned int i=0; i<dimens; ++i)
2690 upperBounds[i] = uppBounds[i];
2691 lowerBounds[i] = lowBounds[i];
2694 std::vector<ImpDetail*> impls;
2696 cpu_tune_wrapper_reduce<StructType, StructType>(0);
2697 impls.push_back(
new ImpDetail(
"cpu_impl", IMPL_CPU, &cpu_tune_wrapper_reduce<StructType, StructType>));
2700 omp_tune_wrapper_reduce<StructType, StructType>(0);
2701 impls.push_back(
new ImpDetail(
"omp_impl", IMPL_OMP, &omp_tune_wrapper_reduce<StructType, StructType>));
2705 cuda_tune_wrapper_reduce<StructType, StructType>(0);
2706 impls.push_back(
new ImpDetail(
"cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_reduce<StructType, StructType>));
2709 std::ofstream outfile(std::string(
"tree_data_multi_" +
id +
".dat").c_str());
2710 assert(outfile.good());
2712 extra.actDimensions = actDimens;
2713 Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2716 ExecPlanNew<1> planArr[MAX_EXEC_PLANS];
2719 for(
unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2721 if(planArr[i].calibrated ==
false)
2724 outfile << planArr[i];
2727 for(
unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
2729 if(planArr[i].calibrated ==
false)
2732 execPlanArray[i].clear();
2733 outfile <<
"compressed plan:\n";
2735 for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = planArr[i].m_data.begin(); it != planArr[i].m_data.end(); ++it)
2740 bp.backend = CPU_BACKEND;
2743 bp.backend = OMP_BACKEND;
2746 bp.backend = CU_BACKEND;
2751 execPlanArray[i].add(it->first.first, it->first.second, bp);
2754 outfile << planArr[i];
2762 outfile << *(trainer.m_tree);
2763 DEBUG_TUNING_LEVEL2(
"\nTree: " << *(trainer.m_tree) <<
"\n");
2766 for(
unsigned int i=0; i<impls.size(); ++i)
2774 extra.memUp = oldMemUp;
2780 assert(extra.memUp != NULL);
2782 unsigned int actDimens = dimens;
2783 std::string
interface = "map";
2786 unsigned int nImpls = 1;
2791 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2801 std::vector<size_t> upperBounds(dimens);
2802 std::vector<size_t> lowerBounds(dimens);
2804 for(
unsigned int i=0; i<dimens; ++i)
2806 upperBounds[i] = uppBounds[i];
2807 lowerBounds[i] = lowBounds[i];
2810 std::vector<ImpDetail*> impls;
2812 cpu_tune_wrapper_reduce<StructType, StructType>(0);
2813 impls.push_back(
new ImpDetail(
"cpu_impl", IMPL_CPU, &cpu_tune_wrapper_reduce<StructType, StructType>));
2816 omp_tune_wrapper_reduce<StructType, StructType>(0);
2817 impls.push_back(
new ImpDetail(
"omp_impl", IMPL_OMP, &omp_tune_wrapper_reduce<StructType, StructType>));
2821 cuda_tune_wrapper_reduce<StructType, StructType>(0);
2822 impls.push_back(
new ImpDetail(
"cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_reduce<StructType, StructType>));
2825 std::ofstream outfile(std::string(
"tree_data_" +
id +
".dat").c_str());
2826 assert(outfile.good());
2828 extra.actDimensions = actDimens;
2829 Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
2832 ExecPlanNew<1> plan;
2834 assert(plan.calibrated);
2837 outfile <<
"compressed plan:\n";
2839 for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = plan.m_data.begin(); it != plan.m_data.end(); ++it)
2844 bp.backend = CPU_BACKEND;
2847 bp.backend = OMP_BACKEND;
2850 bp.backend = CU_BACKEND;
2855 execPlan.add(it->first.first, it->first.second, bp);
2864 outfile << *(trainer.m_tree);
2865 DEBUG_TUNING_LEVEL2(
"\nTree: " << *(trainer.m_tree) <<
"\n");
2868 for(
int i=0; i<impls.size(); ++i)
2879 void (*callBackFunction)(
void*,
size_t*,
unsigned int);
2880 void (*callBackFunctionMapReduce)(
void*,
void*,
size_t*,
unsigned int);
2884 unsigned int dimens;
2897 template <
typename StructType>
2898 struct Tuner<StructType, MAPARRAY, StructType>
2900 Tuner(std::string _id,
int _dimens,
size_t *_lowBounds,
size_t *_uppBounds): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2902 assert(dimens == 3 && lowBounds && uppBounds);
2904 extra.memDown = NULL;
2907 Tuner(std::string _id,
int _dimens,
size_t *_lowBounds,
size_t *_uppBounds,
int *_memUp,
int *_memDown): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
2909 assert(dimens == 3 && lowBounds && uppBounds);
2910 extra.memUp = _memUp;
2911 extra.memDown = _memDown;
2918 assert(execPlanArray!=NULL);
2924 int *oldMemUp = extra.memUp;
2926 int *oldMemDown = extra.memDown;
2927 extra.memDown = NULL;
2929 unsigned int actDimens = dimens;
2930 std::string
interface = "maparray";
2932 bool allSame = ((lowBounds[0] == lowBounds[1]) && (lowBounds[1] == lowBounds[2])) && ((uppBounds[0] == uppBounds[1]) && (uppBounds[1] == uppBounds[2]));
2934 dimens = (allSame)? 1:2;
2937 unsigned int nImpls = 1;
2939 SKEPU_ERROR(
"The current tuning framework does not support MapArray skeleton tuning with different vector sizes. TODO in future.");
2943 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
2946 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
2949 bool redoMesures =
false;
2950 for(
unsigned int i=0; i<nImpls; ++i)
2952 if(execPlanArray[i].calibrated ==
false)
2957 for(
unsigned int i=0; i<dimens; ++i)
2959 if(execPlanArray[i].isTrainedFor(lowBounds[i]) ==
false || execPlanArray[i].isTrainedFor(uppBounds[i]) ==
false)
2967 if(redoMesures ==
false)
2978 assert(dimens == 1);
2980 std::vector<size_t> upperBounds(dimens);
2981 std::vector<size_t> lowerBounds(dimens);
2983 for(
unsigned int i=0; i<dimens; ++i)
2985 upperBounds[i] = uppBounds[i];
2986 lowerBounds[i] = lowBounds[i];
2989 std::vector<ImpDetail*> impls;
2991 cpu_tune_wrapper_maparray<StructType, StructType>(0);
2992 impls.push_back(
new ImpDetail(
"cpu_impl", IMPL_CPU, &cpu_tune_wrapper_maparray<StructType, StructType>));
2995 omp_tune_wrapper_maparray<StructType, StructType>(0);
2996 impls.push_back(
new ImpDetail(
"omp_impl", IMPL_OMP, &omp_tune_wrapper_maparray<StructType, StructType>));
3000 cuda_tune_wrapper_maparray<StructType, StructType>(0);
3001 impls.push_back(
new ImpDetail(
"cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_maparray<StructType, StructType>));
3004 std::ofstream outfile(std::string(
"tree_data_multi_" +
id +
".dat").c_str());
3005 assert(outfile.good());
3007 extra.actDimensions = actDimens;
3008 Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
3011 ExecPlanNew<1> planArr[MAX_EXEC_PLANS];
3014 for(
unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
3016 if(planArr[i].calibrated ==
false)
3019 outfile << planArr[i];
3022 for(
unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
3024 if(planArr[i].calibrated ==
false)
3027 execPlanArray[i].clear();
3028 outfile <<
"compressed plan:\n";
3030 for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = planArr[i].m_data.begin(); it != planArr[i].m_data.end(); ++it)
3035 bp.backend = CPU_BACKEND;
3038 bp.backend = OMP_BACKEND;
3041 bp.backend = CU_BACKEND;
3046 execPlanArray[i].add(it->first.first, it->first.second, bp);
3049 outfile << planArr[i];
3057 outfile << *(trainer.m_tree);
3058 DEBUG_TUNING_LEVEL2(
"\nTree: " << *(trainer.m_tree) <<
"\n");
3061 for(
unsigned int i=0; i<impls.size(); ++i)
3069 extra.memUp = oldMemUp;
3070 extra.memDown = oldMemDown;
3076 assert(extra.memUp != NULL && extra.memDown != NULL);
3078 unsigned int actDimens = dimens;
3079 std::string
interface = "maparray";
3081 bool allSame = ((lowBounds[0] == lowBounds[1]) && (lowBounds[1] == lowBounds[2])) && ((uppBounds[0] == uppBounds[1]) && (uppBounds[1] == uppBounds[2]));
3083 dimens = (allSame)? 1:2;
3085 unsigned int nImpls = 1;
3087 SKEPU_ERROR(
"The current tuning framework does not support MapArray skeleton tuning with different vector sizes. TODO in future.");
3092 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
3102 std::vector<size_t> upperBounds(dimens);
3103 std::vector<size_t> lowerBounds(dimens);
3105 for(
unsigned int i=0; i<dimens; ++i)
3107 upperBounds[i] = uppBounds[i];
3108 lowerBounds[i] = lowBounds[i];
3111 std::vector<ImpDetail*> impls;
3113 cpu_tune_wrapper_maparray<StructType, StructType>(0);
3114 impls.push_back(
new ImpDetail(
"cpu_impl", IMPL_CPU, &cpu_tune_wrapper_maparray<StructType, StructType>));
3117 omp_tune_wrapper_maparray<StructType, StructType>(0);
3118 impls.push_back(
new ImpDetail(
"omp_impl", IMPL_OMP, &omp_tune_wrapper_maparray<StructType, StructType>));
3122 cuda_tune_wrapper_maparray<StructType, StructType>(0);
3123 impls.push_back(
new ImpDetail(
"cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_maparray<StructType, StructType>));
3126 std::ofstream outfile(std::string(
"tree_data_" +
id +
".dat").c_str());
3127 assert(outfile.good());
3129 extra.actDimensions = actDimens;
3130 Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
3133 ExecPlanNew<1> plan;
3135 assert(plan.calibrated);
3138 outfile <<
"compressed plan:\n";
3140 for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = plan.m_data.begin(); it != plan.m_data.end(); ++it)
3145 bp.backend = CPU_BACKEND;
3148 bp.backend = OMP_BACKEND;
3151 bp.backend = CU_BACKEND;
3156 execPlan.add(it->first.first, it->first.second, bp);
3165 outfile << *(trainer.m_tree);
3166 DEBUG_TUNING_LEVEL2(
"\nTree: " << *(trainer.m_tree) <<
"\n");
3169 for(
int i=0; i<impls.size(); ++i)
3180 void (*callBackFunction)(
void*,
size_t*,
unsigned int);
3181 void (*callBackFunctionMapReduce)(
void*,
void*,
size_t*,
unsigned int);
3185 unsigned int dimens;
3199 template <
typename StructType>
3200 struct Tuner<StructType, MAPOVERLAP, StructType>
3202 Tuner(std::string _id,
int _dimens,
size_t *_lowBounds,
size_t *_uppBounds): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
3204 assert(dimens >= 1 && dimens <= 2 && lowBounds && uppBounds);
3206 extra.memDown = NULL;
3209 Tuner(std::string _id,
int _dimens,
size_t *_lowBounds,
size_t *_uppBounds,
int *_memUp,
int *_memDown): id(_id), dimens(_dimens), lowBounds(_lowBounds), uppBounds(_uppBounds), callBackFunction(NULL), callBackFunctionMapReduce(NULL)
3211 assert(dimens >= 1 && dimens <= 2 && lowBounds && uppBounds);
3212 extra.memUp = _memUp;
3213 extra.memDown = _memDown;
3220 assert(execPlanArray!=NULL);
3226 int *oldMemUp = extra.memUp;
3228 int *oldMemDown = extra.memDown;
3229 extra.memDown = NULL;
3231 unsigned int actDimens = dimens;
3232 std::string
interface = "mapoverlap";
3235 unsigned int nImpls = 1;
3239 assert(nImpls > 0 && nImpls <= MAX_EXEC_PLANS);
3242 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
3245 bool redoMesures =
false;
3246 for(
unsigned int i=0; i<nImpls; ++i)
3248 if(execPlanArray[i].calibrated ==
false)
3253 for(
int j=0; j<dimens; ++j)
3255 if(execPlanArray[i].isTrainedFor(lowBounds[j]) ==
false || execPlanArray[i].isTrainedFor(uppBounds[j]) ==
false)
3263 if(redoMesures ==
false)
3271 std::vector<size_t> upperBounds(dimens);
3272 std::vector<size_t> lowerBounds(dimens);
3274 for(
unsigned int i=0; i<dimens; ++i)
3276 upperBounds[i] = uppBounds[i];
3277 lowerBounds[i] = lowBounds[i];
3280 std::vector<ImpDetail*> impls;
3282 cpu_tune_wrapper_mapoverlap<StructType, StructType>(0);
3283 impls.push_back(
new ImpDetail(
"cpu_impl", IMPL_CPU, &cpu_tune_wrapper_mapoverlap<StructType, StructType>));
3286 omp_tune_wrapper_mapoverlap<StructType, StructType>(0);
3287 impls.push_back(
new ImpDetail(
"omp_impl", IMPL_OMP, &omp_tune_wrapper_mapoverlap<StructType, StructType>));
3291 cuda_tune_wrapper_mapoverlap<StructType, StructType>(0);
3292 impls.push_back(
new ImpDetail(
"cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_mapoverlap<StructType, StructType>));
3295 std::ofstream outfile(std::string(
"tree_data_multi_" +
id +
".dat").c_str());
3296 assert(outfile.good());
3298 extra.actDimensions = actDimens;
3299 Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
3303 ExecPlanNew<1> planArr[MAX_EXEC_PLANS];
3306 for(
unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
3308 if(planArr[i].calibrated ==
false)
3311 outfile << planArr[i];
3314 for(
unsigned int i=0; i<MAX_EXEC_PLANS; ++i)
3316 if(planArr[i].calibrated ==
false)
3319 execPlanArray[i].clear();
3320 outfile <<
"compressed plan:\n";
3322 for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = planArr[i].m_data.begin(); it != planArr[i].m_data.end(); ++it)
3327 bp.backend = CPU_BACKEND;
3330 bp.backend = OMP_BACKEND;
3333 bp.backend = CU_BACKEND;
3338 execPlanArray[i].add(it->first.first, it->first.second, bp);
3341 outfile << planArr[i];
3349 outfile << *(trainer.m_tree);
3350 DEBUG_TUNING_LEVEL2(
"\nTree: " << *(trainer.m_tree) <<
"\n");
3353 for(
unsigned int i=0; i<impls.size(); ++i)
3361 extra.memUp = oldMemUp;
3362 extra.memDown = oldMemDown;
3368 assert(extra.memUp != NULL && extra.memDown != NULL);
3370 unsigned int actDimens = dimens;
3371 std::string
interface = "mapoverlap";
3375 unsigned int nImpls = 1;
3380 #if !defined(_WIN32) && !defined(REDO_MEASUREMENTS)
3390 std::vector<size_t> upperBounds(dimens);
3391 std::vector<size_t> lowerBounds(dimens);
3393 for(
unsigned int i=0; i<dimens; ++i)
3395 upperBounds[i] = uppBounds[i];
3396 lowerBounds[i] = lowBounds[i];
3399 std::vector<ImpDetail*> impls;
3401 cpu_tune_wrapper_mapoverlap<StructType, StructType>(0);
3402 impls.push_back(
new ImpDetail(
"cpu_impl", IMPL_CPU, &cpu_tune_wrapper_mapoverlap<StructType, StructType>));
3405 omp_tune_wrapper_mapoverlap<StructType, StructType>(0);
3406 impls.push_back(
new ImpDetail(
"omp_impl", IMPL_OMP, &omp_tune_wrapper_mapoverlap<StructType, StructType>));
3410 cuda_tune_wrapper_mapoverlap<StructType, StructType>(0);
3411 impls.push_back(
new ImpDetail(
"cuda_impl", IMPL_CUDA, &cuda_tune_wrapper_mapoverlap<StructType, StructType>));
3414 std::ofstream outfile(std::string(
"tree_data_" +
id +
".dat").c_str());
3415 assert(outfile.good());
3417 extra.actDimensions = actDimens;
3418 Trainer trainer(impls, lowerBounds, upperBounds, MAX_DEPTH, nImpls, extra, callBackFunction, callBackFunctionMapReduce, OVERSAMPLE);
3421 ExecPlanNew<1> plan;
3423 assert(plan.calibrated);
3426 outfile <<
"compressed plan:\n";
3428 for(std::map<std::pair<size_t,size_t>, ImplType>::iterator it = plan.m_data.begin(); it != plan.m_data.end(); ++it)
3433 bp.backend = CPU_BACKEND;
3436 bp.backend = OMP_BACKEND;
3439 bp.backend = CU_BACKEND;
3444 execPlan.add(it->first.first, it->first.second, bp);
3453 outfile << *(trainer.m_tree);
3454 DEBUG_TUNING_LEVEL2(
"\nTree: " << *(trainer.m_tree) <<
"\n");
3457 for(
int i=0; i<impls.size(); ++i)
3468 void (*callBackFunction)(
void*,
size_t*,
unsigned int);
3469 void (*callBackFunctionMapReduce)(
void*,
void*,
size_t*,
unsigned int);
3473 unsigned int dimens;
void operator()(ExecPlan *execPlanArray)
Definition: tuner.h:2634
T OMP(Matrix< T > &input)
Definition: reduce_omp_2d.inl:27
void cuda_tune_wrapper_mapreduce(void *arg)
Do training execution for a single performance context for MapReduce skeleton and CUDA implementation...
Definition: tuner.h:1569
bool loadExecPlan(std::string id, ExecPlan &plan)
Loads an execution plan for a file into the structure passes as argument.
Definition: tuner.h:1774
ExecPlan operator()()
Definition: tuner.h:2206
bool storeExecPlan(std::string id, const ExecPlan &plan)
Stores an execution plan for the structure passed as argument to a file.
Definition: tuner.h:1842
void CU(Vector< T > &input, EdgePolicy poly=CONSTANT, T pad=T(), int useNumGPU=1)
Definition: mapoverlap_cu.inl:33
void cpu_tune_wrapper_mapreduce(void *arg)
Do training execution for a single performance context for MapReduce skeleton and sequential CPU impl...
Definition: tuner.h:461
void omp_tune_wrapper_reduce(void *arg)
Do training execution for a single performance context for Reduce skeleton and parallel OpenMP implem...
Definition: tuner.h:699
bool calibrated
Definition: exec_plan.h:57
void CPU(Vector< T > &input)
Definition: map_cpu.inl:21
void cuda_tune_wrapper_mapoverlap(void *arg)
Do training execution for a single performance context for MapOverlap skeleton and CUDA implementatio...
Definition: tuner.h:1350
A class representing the MapArray skeleton.
Definition: maparray.h:94
void cpu_tune_wrapper_reduce(void *arg)
Do training execution for a single performance context for Reduce skeleton and sequential CPU impleme...
Definition: tuner.h:163
void CU(Vector< in > &input, MultiVector P, Vector< out > &output, int useNumGPU=1)
Definition: maparray_cu.inl:155
void compressExecPlanNew(ExecPlanNew< 1 > &plan)
Definition: trainer.h:490
void OMP(Vector< in > &input, MultiVector P, Vector< out > &output)
Definition: maparray_omp.inl:54
void createDefaultConfiguration(BackEndParams &bp)
A helper function that creates the default configuration.
Definition: tuner.h:1711
void CPU(Vector< in > &input, MultiVector P, Vector< out > &output)
Definition: maparray_cpu.inl:21
bool storeExecPlanArray(std::string id, const ExecPlan *planArray, unsigned int nImpls)
Stores execution plans for the structure passed as argument to a file.
Definition: tuner.h:1976
bool loadExecPlanArray(std::string id, ExecPlan *planArray)
Loads execution plans for a file into the structure passes as argument.
Definition: tuner.h:1894
void cuda_tune_wrapper_map(void *arg)
the following functions train for CUDA implementations for different skeletons... ...
Definition: tuner.h:1124
Contains a class declaration for the MapReduce skeleton.
void cuda_tune_wrapper_reduce(void *arg)
Do training execution for a single performance context for Reduce skeleton and CUDA implementation...
Definition: tuner.h:1251
void cuda_tune_wrapper_maparray(void *arg)
Do training execution for a single performance context for MapArray skeleton and CUDA implementation...
Definition: tuner.h:1457
void CPU(Vector< T > &input, EdgePolicy poly=CONSTANT, T pad=T())
Definition: mapoverlap_cpu.inl:23
void OMP(Vector< T > &input, EdgePolicy poly=CONSTANT, T pad=T())
Definition: mapoverlap_omp.inl:26
static const std::string trimSpaces(const std::string &pString, const std::string &pWhitespace=" \t")
Definition: helper_methods.h:32
void omp_tune_wrapper_mapoverlap(void *arg)
Do training execution for a single performance context for MapOverlap skeleton and parallel OpenMP im...
Definition: tuner.h:793
void constructExecPlanNew(ExecPlanNew< dimens > *plan, StatsTuner &stats)
Definition: trainer.inl:103
ExecPlan operator()()
Definition: tuner.h:2493
void resize(size_type num, T val=T())
Definition: vector.inl:326
T CPU(Vector< T > &input)
Definition: mapreduce_cpu.inl:23
end Node class...
Definition: trainer.h:478
device_pointer_type_cu updateDevice_CU(T *start, size_type numElements, unsigned int deviceID, bool copy, bool writeAccess, bool markOnlyLocalCopiesInvalid=false, unsigned int streamID=0)
Update device with vector content.
Definition: vector_cu.inl:162
Contains a class declaration for the Reduce skeleton.
ExecPlan operator()()
Definition: tuner.h:2778
T CU(Vector< T > &input, int useNumGPU=1)
Definition: mapreduce_cu.inl:349
void cpu_tune_wrapper_maparray(void *arg)
Do training execution for a single performance context for MapArray skeleton and sequential CPU imple...
Definition: tuner.h:354
void cpu_tune_wrapper_map(void *arg)
Do training execution for a single performance context for Map skeleton and sequential CPU implementa...
Definition: tuner.h:47
void omp_tune_wrapper_maparray(void *arg)
Do training execution for a single performance context for MapArray skeleton and parallel OpenMP impl...
Definition: tuner.h:895
A vector container class, implemented as a wrapper for std::vector.
Definition: vector.h:61
Tuner class: generic definition.... Multiple class specializations are defined for this class...
Definition: tuner.h:1695
void operator()(ExecPlan *execPlanArray)
Definition: tuner.h:2344
A class representing the MapReduce skeleton.
Definition: mapreduce.h:54
A class that describes an execution plan.
Definition: exec_plan.h:47
void operator()(ExecPlan *execPlanArray)
Definition: tuner.h:3218
void omp_tune_wrapper_map(void *arg)
the following section contains function that can train OpenMP implementations. Only enabled when Open...
Definition: tuner.h:582
Contains a class declaration for the Scan skeleton.
A class representing a execution environment.
Definition: environment.h:80
Contains a class declaration for the MapArray skeleton.
void train()
Definition: trainer.inl:78
static Environment * getInstance()
Definition: environment.inl:90
A class representing the Reduce skeleton both for 1D and 2D reduce operation for 1D Vector...
Definition: reduce.h:77
ExecPlan operator()()
Definition: tuner.h:3074
A class representing the MapOverlap skeleton.
Definition: mapoverlap.h:75
Contains a class declaration for the Map skeleton.
Can be used to specify properties for a backend.
Definition: exec_plan.h:19
void OMP(Vector< T > &input)
Definition: map_omp.inl:24
ExecPlan operator()()
Definition: tuner.h:3366
A class representing the Map skeleton.
Definition: map.h:52
void CU(Vector< T > &input, int useNumGPU=1)
Definition: map_cu.inl:418
T OMP(Vector< T > &input)
Definition: mapreduce_omp.inl:27
void cpu_tune_wrapper_mapoverlap(void *arg)
Do training execution for a single performance context for MapOverlap skeleton and sequential CPU imp...
Definition: tuner.h:256
T CPU(Matrix< T > &input)
Definition: reduce_cpu_2d.inl:26
void operator()(ExecPlan *execPlanArray)
Definition: tuner.h:2060
T CU(Matrix< T > &input, int useNumGPU=1)
Definition: reduce_cu_2d.inl:311
void omp_tune_wrapper_mapreduce(void *arg)
Do training execution for a single performance context for MapReduce skeleton and parallel OpenMP imp...
Definition: tuner.h:1002
Contains a class declaration for the MapOverlap skeleton.
void operator()(ExecPlan *execPlanArray)
Definition: tuner.h:2916