
此文简单介绍如何使用 Intel C++ 编译器实现向量化加速。


  • base : 待优化的源代码。
  • vectorization : 第一个向量化版本。
  • aligned : 内存对齐对向量化的影响。



// filename : main.cpp
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <cstdint>
#include <malloc.h>
#include <windows.h>
using namespace std;int64_t cpu_freq;
int64_t cpu_counter(){int64_t clock;QueryPerformanceCounter((LARGE_INTEGER*)&clock);return clock;
}// output time
// Timing helpers. With `#if 1` they are active; change it to `#if 0` to
// compile them out (both macros then expand to nothing).
//   TB__ : store the current counter value as the start of the interval.
//   TE__ : store the end of the interval and print
//          "<source line of TE__> : <elapsed> seconds" to cout.
#if 1
int64_t gloabel_timer_begin;
int64_t gloabel_timer_end;
#define TB__ gloabel_timer_begin = cpu_counter()
// Wrapped in do { } while (0) so that `TE__;` is a single statement.
#define TE__ \
    do { \
        gloabel_timer_end = cpu_counter(); \
        cout << __LINE__ << " : " \
             << double(gloabel_timer_end - gloabel_timer_begin) / double(cpu_freq) \
             << " seconds" << endl; \
    } while (0)
#else
#define TB__
#define TE__
#endif
// repeat times
#define REPEATTIMES 100000// initialize data
void init(float *data, int rows, int cols, int true_cols){for (int i = 0; i < rows; i++){for (int j = 0; j < cols; j++){data[i*true_cols+j] = float(rand())/float(RAND_MAX);}}
}void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols);void print_sum(float *data, int rows, int cols, int true_cols){float total = 0;for (int i = 0; i < rows; i++){for (int j = 0; j < cols; j++){total += data[i*true_cols+j];}}cout << total << endl;
}int main(){QueryPerformanceFrequency((LARGE_INTEGER *)&cpu_freq);int rows = 100; int cols = 101;int true_cols = cols;float *A = (float*)malloc(rows*true_cols*sizeof(float));float *B = (float*)malloc(rows*sizeof(float));float *C = (float*)malloc(rows*sizeof(float));init(A, rows, cols, true_cols);init(B, rows, 1, 1);// computingTB__;for (int k = 0; k < REPEATTIMES; k++){multiply(C, A, B, rows, cols, true_cols);}TE__;// print result.  print_sum(C, rows, 1, 1);free(A); A = NULL;free(B); B = NULL;free(C); C = NULL;return 0;
// filename : multiply.cpp
// Matrix-vector product C = A * B.
//   C         : output, `rows` floats
//   A         : rows x cols matrix stored row by row
//   B         : input vector, `cols` floats
//   true_cols : distance between consecutive rows of A, in floats
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols)
{
    for (int i = 0; i < rows; i++)
    {
        C[i] = 0;
        // Dot product of row i of A with B; this inner loop is the
        // candidate for the compiler's auto-vectorizer.
        for (int j = 0; j < cols; j++)
        {
            C[i] += A[i*true_cols+j]*B[j];
        }
    }
}


user@machine> icl /O1 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp


user@machine> main.exe
73 : 0.877882 seconds




user@machine> icl /O2 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp


user@machine> main.exe
73 : 0.205989 seconds

执行速度提升了 4倍左右。



// filename : main.cpp
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <cstdint>
#include <malloc.h>
#include <windows.h>
using namespace std;int64_t cpu_freq;
int64_t cpu_counter(){int64_t clock;QueryPerformanceCounter((LARGE_INTEGER*)&clock);return clock;
}// output time
// Timing helpers. With `#if 1` they are active; change it to `#if 0` to
// compile them out (both macros then expand to nothing).
//   TB__ : store the current counter value as the start of the interval.
//   TE__ : store the end of the interval and print
//          "<source line of TE__> : <elapsed> seconds" to cout.
#if 1
int64_t gloabel_timer_begin;
int64_t gloabel_timer_end;
#define TB__ gloabel_timer_begin = cpu_counter()
// Wrapped in do { } while (0) so that `TE__;` is a single statement.
#define TE__ \
    do { \
        gloabel_timer_end = cpu_counter(); \
        cout << __LINE__ << " : " \
             << double(gloabel_timer_end - gloabel_timer_begin) / double(cpu_freq) \
             << " seconds" << endl; \
    } while (0)
#else
#define TB__
#define TE__
#endif
// repeat times
#define REPEATTIMES 100000// initialize data
void init(float *data, int rows, int cols, int true_cols){for (int i = 0; i < rows; i++){for (int j = 0; j < cols; j++){data[i*true_cols+j] = float(rand())/float(RAND_MAX);}}
}void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols);void print_sum(float *data, int rows, int cols, int true_cols){float total = 0;for (int i = 0; i < rows; i++){for (int j = 0; j < cols; j++){total += data[i*true_cols+j];}}cout << total << endl;
}int main(){QueryPerformanceFrequency((LARGE_INTEGER *)&cpu_freq);int rows = 100; int cols = 101;#ifdef ALIGNED#define ALLIGNED_LEN 32int true_cols = ((((cols*sizeof(float))+ALLIGNED_LEN-1)/ALLIGNED_LEN)*ALLIGNED_LEN)/sizeof(float);//cout << true_cols << endl;float *A = (float*)_aligned_malloc(rows*true_cols*sizeof(float), ALLIGNED_LEN);float *B = (float*)_aligned_malloc(rows*sizeof(float), ALLIGNED_LEN);float *C = (float*)_aligned_malloc(rows*sizeof(float), ALLIGNED_LEN);
#elseint true_cols = cols;float *A = (float*)malloc(rows*true_cols*sizeof(float));float *B = (float*)malloc(rows*sizeof(float));float *C = (float*)malloc(rows*sizeof(float));
#endifinit(A, rows, cols, true_cols);init(B, rows, 1, 1);// computingTB__;for (int k = 0; k < REPEATTIMES; k++){multiply(C, A, B, rows, cols, true_cols);}TE__;// print result.  print_sum(C, rows, 1, 1);#ifdef ALIGNED_aligned_free(A); A = NULL;_aligned_free(B); B = NULL;_aligned_free(C); C = NULL;
#elsefree(A); A = NULL;free(B); B = NULL;free(C); C = NULL;
#endifreturn 0;
// filename : multiply.cpp
// Matrix-vector product C = A * B.
//   C         : output, `rows` floats
//   A         : rows x cols matrix stored row by row
//   B         : input vector, `cols` floats
//   true_cols : distance between consecutive rows of A, in floats
// When built with ALIGNED defined, the inner loop carries
// `#pragma vector aligned`; the caller is then responsible for passing
// aligned buffers and a row stride that keeps every row of A aligned.
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols)
{
    for (int i = 0; i < rows; i++)
    {
        C[i] = 0;
        // Dot product of row i of A with B; this inner loop is the
        // candidate for the compiler's auto-vectorizer.
#ifdef ALIGNED
#pragma vector aligned
#endif
        for (int j = 0; j < cols; j++)
        {
            C[i] += A[i*true_cols+j]*B[j];
        }
    }
}


user@machine> icl /DALIGNED /O2 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp


user@machine> main.exe
82 : 0.17747 seconds




