voidclampedExpVector(float*values,int*exponents,float*output,intN){//
// CS149 STUDENTS TODO: Implement your vectorized version of
// clampedExpSerial() here.
//
// Your solution should work for any value of
// N and VECTOR_WIDTH, not just when VECTOR_WIDTH divides N
//
__cs149_vec_floatx,result,oneFloat=_cs149_vset_float(1.f),ceiling=_cs149_vset_float(9.999999f);__cs149_vec_inty,count,zero=_cs149_vset_int(0),oneInt=_cs149_vset_int(1);__cs149_maskmaskAll,maskEqZero,maskNotEqZero,maskGtCeiling,maskCountGtZero;for(inti=0;i<N;i+=VECTOR_WIDTH){// 全 1(且未越界)
maskAll=_cs149_init_ones(std::min(VECTOR_WIDTH,N-i));_cs149_vload_float(x,values+i,maskAll);// x = value[i];
_cs149_vload_int(y,exponents+i,maskAll);// y = exponents[i];
// 等于 0(且未越界)
maskEqZero=_cs149_init_ones(0);// init corner ???
_cs149_veq_int(maskEqZero,y,zero,maskAll);// if (y == 0) {
_cs149_vstore_float(output+i,oneFloat,maskEqZero);// output[i] = 1.f;
// 不等于 0(且未越界)
maskNotEqZero=_cs149_mask_not(maskEqZero);// if (y != 0) {
maskNotEqZero=_cs149_mask_and(maskNotEqZero,maskAll);// corner ???
_cs149_vmove_float(result,x,maskNotEqZero);// result = x;
count=_cs149_vset_int(0);// init ???
_cs149_vsub_int(count,y,oneInt,maskNotEqZero);// count = y - 1;
maskCountGtZero=_cs149_init_ones(0);// corner ???
_cs149_vgt_int(maskCountGtZero,count,zero,maskNotEqZero);while(_cs149_cntbits(maskCountGtZero)>0){// while (count > 0) {
_cs149_vmult_float(result,result,x,maskCountGtZero);// result *= x;
_cs149_vsub_int(count,count,oneInt,maskCountGtZero);// count--;
_cs149_vgt_int(maskCountGtZero,count,zero,maskNotEqZero);}// 大于上界值(且未越界)
// maskGtCeiling = _cs149_init_ones(0); // corner ??? can remove.
_cs149_vgt_float(maskGtCeiling,result,ceiling,maskNotEqZero);// if (result > 9.999999f) {
_cs149_vmove_float(result,ceiling,maskGtCeiling);// result = 9.999999f;
_cs149_vstore_float(output+i,result,maskNotEqZero);// output[i] = result;
}}
// returns the sum of all elements in values
// You can assume N is a multiple of VECTOR_WIDTH
// You can assume VECTOR_WIDTH is a power of 2
floatarraySumVector(float*values,intN){//
// CS149 STUDENTS TODO: Implement your vectorized version of arraySumSerial
// here
//
// 实验保证向量位宽是 N 的因子
// O(N / VECTOR_WIDTH)
__cs149_vec_floatsum=_cs149_vset_float(0.f),tmp;__cs149_maskmaskAll=_cs149_init_ones(VECTOR_WIDTH);for(inti=0;i<N;i+=VECTOR_WIDTH){_cs149_vload_float(tmp,values+i,maskAll);_cs149_vadd_float(sum,sum,tmp,maskAll);}// 1. O(VECTOR_WIDTH)
// float result = 0.f;
// for (int i = 0; i < VECTOR_WIDTH; i++) {
// result += sum.value[i];
// }
// return result;
// 2. O(log2(VECTOR_WIDTH))
// 并行归约 / 树形归约
for(ints=VECTOR_WIDTH/2;s>0;s>>=1){_cs149_hadd_float(tmp,sum);_cs149_interleave_float(sum,tmp);}returnsum.value[0];}
prog3_mandelbrot_ispc
任务是优化性能问题。
Part 1.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# ./mandelbrot_ispc -t [mandelbrot serial]: [214.754] ms
Wrote image file mandelbrot-serial.ppm
[mandelbrot ispc]: [36.824] ms
Wrote image file mandelbrot-ispc.ppm
[mandelbrot multicore ispc]: [18.593] ms
Wrote image file mandelbrot-task-ispc.ppm
(5.83x speedup from ISPC)(11.55x speedup from task ISPC)# ./mandelbrot_ispc -t -v 2 [mandelbrot serial]: [129.845] ms
Wrote image file mandelbrot-serial.ppm
[mandelbrot ispc]: [26.140] ms
Wrote image file mandelbrot-ispc.ppm
[mandelbrot multicore ispc]: [15.506] ms
Wrote image file mandelbrot-task-ispc.ppm
(4.97x speedup from ISPC)(8.37x speedup from task ISPC)
# ./sqrt[sqrt serial]: [915.247] ms
[sqrt ispc]: [184.232] ms
[sqrt task ispc]: [15.848] ms
(4.97x speedup from ISPC)(57.75x speedup from task ISPC)(11.6x)
1
2
3
4
5
6
7
8
9
10
11
for(unsignedinti=0;i<N;i++){// TODO: CS149 students. Attempt to change the values in the
// array here to meet the instructions in the handout: we want
// to you generate best and worse-case speedups
// starter code populates array with random input values
values[i]=.001f+2.998f*static_cast<float>(rand())/RAND_MAX;// values[i] = 1.f;
// values[i] = 2.999f;
// values[i] = ((i % 8) ? 1.f : 2.999f);
}
// 导入相关库
#include<immintrin.h>voidsqrt_my_avx2(intN,floatinitialGuess,float*values,float*output){staticconstfloatkThreshold=0.00001f;const__m256initialGuess_v=_mm256_set1_ps(initialGuess);const__m256kThreshold_v=_mm256_set1_ps(kThreshold);const__m256one_v=_mm256_set1_ps(1.f);const__m256three_v=_mm256_set1_ps(3.f);const__m256half_v=_mm256_set1_ps(0.5f);// 做与操作,实现取绝对值
const__m256abs_mask_v=_mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));for(inti=0;i<=N-8;i+=8){__m256x=_mm256_loadu_ps(values+i);__m256guess=initialGuess_v;__m256continue_mask;while(true){__m256guess_sq=_mm256_mul_ps(guess,guess);__m256term=_mm256_mul_ps(guess_sq,x);__m256error=_mm256_and_ps((_mm256_sub_ps(term,one_v)),abs_mask_v);continue_mask=_mm256_cmp_ps(error,kThreshold_v,_CMP_GT_OQ);if(_mm256_movemask_ps(continue_mask)==0){break;}__m256guess_cubed=_mm256_mul_ps(guess_sq,guess);__m256term2=_mm256_mul_ps(x,guess_cubed);__m256term3=_mm256_mul_ps(three_v,guess);__m256new_guess_unscaled=_mm256_sub_ps(term3,term2);__m256new_guess=_mm256_mul_ps(new_guess_unscaled,half_v);guess=_mm256_blendv_ps(guess,new_guess,continue_mask);}__m256result=_mm256_mul_ps(x,guess);_mm256_storeu_ps(output+i,result);}}// main()
// My version of AVX2
doubleminMyAVX2=1e30;for(inti=0;i<3;++i){doublestartTime=CycleTimer::currentSeconds();sqrt_my_avx2(N,initialGuess,values,output);doubleendTime=CycleTimer::currentSeconds();minMyAVX2=std::min(minMyAVX2,endTime-startTime);}printf("[sqrt my avx2]:\t\t[%.3f] ms\n",minMyAVX2*1000);verifyResult(N,output,gold);// Clear out the buffer
for(unsignedinti=0;i<N;++i)output[i]=0;printf("\t\t\t\t(%.2fx speedup from My AVX2)\n",minSerial/minMyAVX2);
# 随机数据[sqrt serial]: [917.327] ms
[sqrt ispc]: [186.708] ms
[sqrt my avx2]: [155.992] ms
[sqrt task ispc]: [16.230] ms
(4.91x speedup from ISPC)(5.88x speedup from My AVX2)(56.52x speedup from task ISPC)# values[i] = 1.f;[sqrt serial]: [14.391] ms
[sqrt ispc]: [9.552] ms
[sqrt my avx2]: [8.411] ms
[sqrt task ispc]: [6.627] ms
(1.51x speedup from ISPC)(1.71x speedup from My AVX2)(2.17x speedup from task ISPC)# values[i] = 2.999f;[sqrt serial]: [1922.672] ms
[sqrt ispc]: [290.222] ms
[sqrt my avx2]: [214.912] ms
[sqrt task ispc]: [24.780] ms
(6.62x speedup from ISPC)(8.95x speedup from My AVX2)(77.59x speedup from task ISPC)# values[i] = ((i % 8) ? 1.f : 2.999f);[sqrt serial]: [267.990] ms
[sqrt ispc]: [291.000] ms
[sqrt my avx2]: [212.485] ms
[sqrt task ispc]: [27.419] ms
(0.92x speedup from ISPC)(1.26x speedup from My AVX2)(9.77x speedup from task ISPC)
Prog5_saxpy
1
2
3
4
5
6
7
8
9
10
11
12
# ./saxpy[saxpy ispc]: [11.335] ms [26.292] GB/s [3.529] GFLOPS
[saxpy task ispc]: [8.950] ms [33.299] GB/s [4.469] GFLOPS
(1.27x speedup from use of tasks)# 取消注释掉后的完整输出[saxpy serial]: [12.749] ms [23.376] GB/s [3.137] GFLOPS
[saxpy ispc]: [11.794] ms [25.269] GB/s [3.392] GFLOPS
[saxpy task ispc]: [9.080] ms [32.821] GB/s [4.405] GFLOPS
(1.30x speedup from use of tasks)(1.08x speedup from ISPC)(1.40x speedup from task ISPC)
// main.cpp
#include<immintrin.h>// stream_ps,直接写入主内存
staticvoidsaxpy_avx2(intN,floatscale,float*X,float*Y,float*result){const__m256scale_v=_mm256_set1_ps(scale);inti=0;for(;i<=N-8;i+=8){__m256x=_mm256_load_ps(X+i),y=_mm256_load_ps(Y+i);__m256res=_mm256_add_ps(_mm256_mul_ps(scale_v,x),y);_mm256_stream_ps(result+i,res);}for(;i<N;i++){result[i]=scale*X[i]+Y[i];}}/**********************************************************************
[saxpy serial]: [12.063] ms [24.705] GB/s [3.316] GFLOPS
[saxpy avx2]: [6.954] ms [42.859] GB/s [5.752] GFLOPS
[saxpy ispc]: [11.196] ms [26.618] GB/s [3.573] GFLOPS
[saxpy task ispc]: [9.088] ms [32.792] GB/s [4.401] GFLOPS
(1.73x speedup from My AVX2)
(1.23x speedup from use of tasks)
(1.08x speedup from ISPC)
(1.33x speedup from task ISPC)
**********************************************************************/// main()
constintALIGNMENT=32;float*arrayX=(float*)_mm_malloc(N*sizeof(float),ALIGNMENT);float*arrayY=(float*)_mm_malloc(N*sizeof(float),ALIGNMENT);float*resultSerial=(float*)_mm_malloc(N*sizeof(float),ALIGNMENT);float*resultISPC=(float*)_mm_malloc(N*sizeof(float),ALIGNMENT);float*resultTasks=(float*)_mm_malloc(N*sizeof(float),ALIGNMENT);float*resultAVX2=(float*)_mm_malloc(N*sizeof(float),ALIGNMENT);// My AVX2 version
doubleminAVX2=1e30;for(inti=0;i<3;++i){doublestartTime=CycleTimer::currentSeconds();saxpy_avx2(N,scale,arrayX,arrayY,resultAVX2);doubleendTime=CycleTimer::currentSeconds();minAVX2=std::min(minAVX2,endTime-startTime);}verifyResult(N,resultAVX2,resultSerial);printf("[saxpy avx2]:\t\t[%.3f] ms\t[%.3f] GB/s\t[%.3f] GFLOPS\n",minAVX2*1000,toBW(TOTAL_BYTES,minAVX2),toGFLOPS(TOTAL_FLOPS,minAVX2));printf("\t\t\t\t(%.2fx speedup from My AVX2)\n",minSerial/minAVX2);_mm_free(arrayX);_mm_free(arrayY);_mm_free(resultAVX2);_mm_free(resultSerial);_mm_free(resultISPC);_mm_free(resultTasks);
// main.cpp
// readData("./data.dat", &data, &clusterCentroids, &clusterAssignments, &M, &N, &K, &epsilon);
// NOTE: if you want to generate your own data (for fun), you can use the below code
M=1e6;N=100;K=3;epsilon=0.1;data=newdouble[M*N];clusterCentroids=newdouble[K*N];clusterAssignments=newint[M];// Initialize data
initData(data,M,N);initCentroids(clusterCentroids,K,N);// Initialize cluster assignments
for(intm=0;m<M;m++){doubleminDist=1e30;intbestAssignment=-1;for(intk=0;k<K;k++){doubled=dist(&data[m*N],&clusterCentroids[k*N],N);if(d<minDist){minDist=d;bestAssignment=k;}}clusterAssignments[m]=bestAssignment;}// Uncomment to generate data file
writeData("./data.dat",data,clusterCentroids,clusterAssignments,&M,&N,&K,&epsilon);
// kmeansThread.cpp
doubletot1=0.f,tot2=0.f,tot3=0.f;// while (!stoppingConditionMet(prevCost, currCost, epsilon, K)) {
doublet1=CycleTimer::currentSeconds();computeAssignments(&args);doublet2=CycleTimer::currentSeconds();computeCentroids(&args);doublet3=CycleTimer::currentSeconds();computeCost(&args);doublet4=CycleTimer::currentSeconds();// }
tot1+=t2-t1;tot2+=t3-t2;tot3+=t4-t3;iter=std::max(iter,1);tot1/=iter,tot2/=iter,tot3/=iter;doublesum=tot1+tot2+tot3;printf("cost per iteration:\n");printf("compute Assignments:\t[%.2lf] ms\n",tot1*1000);printf("\t\tout of all: [%.2lf] %%\n",tot1/sum*100);printf("compute Centroids:\t[%.2lf] ms\n",tot2*1000);printf("\t\tout of all: [%.2lf] %%\n",tot2/sum*100);printf("compute Cost:\t\t[%.2lf] ms\n",tot3*1000);printf("\t\tout of all: [%.2lf] %%\n",tot3/sum*100);
结果:
1
2
3
4
5
6
7
8
9
10
# 分析性能瓶颈Running K-means with: M=1000000, N=100, K=3, epsilon=0.100000
cost per iteration:
compute Assignments: [261.63] ms
out of all: [65.42] %
compute Centroids: [64.91] ms
out of all: [16.23] %
compute Cost: [73.35] ms
out of all: [18.34] %
[Total Time]: 9597.698 ms
# numThreads = 8Running K-means with: M=1000000, N=100, K=3, epsilon=0.100000
cost per iteration:
compute Assignments: [60.94] ms
out of all: [32.30] %
compute Centroids: [52.40] ms
out of all: [27.78] %
compute Cost: [75.29] ms
out of all: [39.91] %
[Total Time]: 4338.599 ms
voidcomputeAssignments(WorkerArgs*constargs){double*minDist=newdouble[args->M];// Initialize arrays
// for (int m = 0; m < args->M; m++) {
for(intm=args->start;m<args->end;m++){minDist[m]=1e30;args->clusterAssignments[m]=-1;}// Assign datapoints to closest centroids
// for (int k = args->start; k < args->end; k++) {
// for (int m = 0; m < args->M; m++) {
for(intk=0;k<args->K;k++){for(intm=args->start;m<args->end;m++){doubled=dist(&args->data[m*args->N],&args->clusterCentroids[k*args->N],args->N);if(d<minDist[m]){minDist[m]=d;args->clusterAssignments[m]=k;}}}// free(minDist);
delete[]minDist;}voidkMeansThread(double*data,double*clusterCentroids,int*clusterAssignments,intM,intN,intK,doubleepsilon){// Used to track convergence
double*prevCost=newdouble[K];double*currCost=newdouble[K];// The WorkerArgs array is used to pass inputs to and return output from
// functions.
staticconstexprintnumThreads=8;intperThread=M/numThreads;WorkerArgsargs[numThreads];for(inti=0;i<numThreads;i++){args[i].data=data;args[i].clusterCentroids=clusterCentroids;args[i].clusterAssignments=clusterAssignments;args[i].currCost=currCost;args[i].M=M;args[i].N=N;args[i].K=K;args[i].start=i*perThread;args[i].end=std::min((i+1)*perThread,M);}// Initialize arrays to track cost
for(intk=0;k<K;k++){prevCost[k]=1e30;currCost[k]=0.0;}doubletot1=0.f,tot2=0.f,tot3=0.f;std::threadworkers[numThreads];/* Main K-Means Algorithm Loop */intiter=0;while(!stoppingConditionMet(prevCost,currCost,epsilon,K)){// Update cost arrays (for checking convergence criteria)
for(intk=0;k<K;k++){prevCost[k]=currCost[k];}// Setup args struct
// args.start = 0;
// args.end = K;
// computeAssignments(&args);
doublet1=CycleTimer::currentSeconds();for(inti=1;i<numThreads;i++){workers[i]=thread(computeAssignments,&args[i]);}computeAssignments(&args[0]);for(inti=1;i<numThreads;i++){workers[i].join();}args[0].start=0;args[0].end=K;doublet2=CycleTimer::currentSeconds();computeCentroids(&args[0]);doublet3=CycleTimer::currentSeconds();computeCost(&args[0]);doublet4=CycleTimer::currentSeconds();tot1+=t2-t1;tot2+=t3-t2;tot3+=t4-t3;iter++;}iter=std::max(iter,1);tot1/=iter,tot2/=iter,tot3/=iter;doublesum=tot1+tot2+tot3;printf("cost per iteration:\n");printf("compute Assignments:\t[%.2lf] ms\n",tot1*1000);printf("\t\tout of all: [%.2lf] %%\n",tot1/sum*100);printf("compute Centroids:\t[%.2lf] ms\n",tot2*1000);printf("\t\tout of all: [%.2lf] %%\n",tot2/sum*100);printf("compute Cost:\t\t[%.2lf] ms\n",tot3*1000);printf("\t\tout of all: [%.2lf] %%\n",tot3/sum*100);// free(currCost);
// free(prevCost);
delete[]currCost;delete[]prevCost;}