ICL Auto Vectorization

简介

本文简单介绍如何使用 Intel C++ 编译器（ICL）的自动向量化功能来加速程序。

全文如下安排：

base ：待优化的源代码。
vectorization ：第一个向量化版本。
aligned ：内存对齐对向量化的影响。

base

base版本代码：

// filename : main.cpp
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <cstdint>
#include <malloc.h>
#include <windows.h>
using namespace std;int64_t cpu_freq;
int64_t cpu_counter(){int64_t clock;QueryPerformanceCounter((LARGE_INTEGER*)&clock);return clock;
}// output time
#if 1int64_t gloabel_timer_begin;int64_t gloabel_timer_end;#define TB__ gloabel_timer_begin=cpu_counter()#define TE__ gloabel_timer_end  =cpu_counter(); \cout << __LINE__ << " : " << double(gloabel_timer_end-gloabel_timer_begin)/double(cpu_freq) << " seconds" << endl
#else#define TB__ #define TE__
#endif// repeat times
#define REPEATTIMES 100000// initialize data
void init(float *data, int rows, int cols, int true_cols){for (int i = 0; i < rows; i++){for (int j = 0; j < cols; j++){data[i*true_cols+j] = float(rand())/float(RAND_MAX);}}
}void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols);void print_sum(float *data, int rows, int cols, int true_cols){float total = 0;for (int i = 0; i < rows; i++){for (int j = 0; j < cols; j++){total += data[i*true_cols+j];}}cout << total << endl;
}int main(){QueryPerformanceFrequency((LARGE_INTEGER *)&cpu_freq);int rows = 100; int cols = 101;int true_cols = cols;float *A = (float*)malloc(rows*true_cols*sizeof(float));float *B = (float*)malloc(rows*sizeof(float));float *C = (float*)malloc(rows*sizeof(float));init(A, rows, cols, true_cols);init(B, rows, 1, 1);// computingTB__;for (int k = 0; k < REPEATTIMES; k++){multiply(C, A, B, rows, cols, true_cols);}TE__;// print result.  print_sum(C, rows, 1, 1);free(A); A = NULL;free(B); B = NULL;free(C); C = NULL;return 0;
}

// filename : multiply.cpp
// Matrix-vector product: for every row r, C[r] = sum over c of A[r][c] * B[c].
//   C         : output, one element per row
//   A         : row-major matrix with a row stride of true_cols elements
//   B         : input vector, indexed 0..cols-1
//   rows/cols : logical matrix dimensions
//   true_cols : distance in elements between the starts of consecutive rows
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols)
{
    for (int row = 0; row < rows; ++row)
    {
        // Accumulate directly into the output slot, exactly like C[row] += ...
        float &dot = C[row];
        dot = 0;

        const int row_start = row * true_cols;
        for (int col = 0; col < cols; ++col)
        {
            dot += A[row_start + col] * B[col];
        }
    }
}

编译：

user@machine> icl /O1 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp

执行：

user@machine> main.exe
73 : 0.877882 seconds
2483.53

vectorization

源代码保持不变

编译：

user@machine> icl /O2 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp

执行：

user@machine> main.exe
73 : 0.205989 seconds
2483.53

执行速度提升了 4倍左右。

aligned

源代码修改。（注意：下面的代码有问题，结果可能有错误，原因可能是内存的问题。）

// filename : main.cpp
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <cstdint>
#include <malloc.h>
#include <windows.h>
using namespace std;int64_t cpu_freq;
int64_t cpu_counter(){int64_t clock;QueryPerformanceCounter((LARGE_INTEGER*)&clock);return clock;
}// output time
#if 1int64_t gloabel_timer_begin;int64_t gloabel_timer_end;#define TB__ gloabel_timer_begin=cpu_counter()#define TE__ gloabel_timer_end  =cpu_counter(); \cout << __LINE__ << " : " << double(gloabel_timer_end-gloabel_timer_begin)/double(cpu_freq) << " seconds" << endl
#else#define TB__ #define TE__
#endif// repeat times
#define REPEATTIMES 100000// initialize data
void init(float *data, int rows, int cols, int true_cols){for (int i = 0; i < rows; i++){for (int j = 0; j < cols; j++){data[i*true_cols+j] = float(rand())/float(RAND_MAX);}}
}void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols);void print_sum(float *data, int rows, int cols, int true_cols){float total = 0;for (int i = 0; i < rows; i++){for (int j = 0; j < cols; j++){total += data[i*true_cols+j];}}cout << total << endl;
}int main(){QueryPerformanceFrequency((LARGE_INTEGER *)&cpu_freq);int rows = 100; int cols = 101;#ifdef ALIGNED#define ALLIGNED_LEN 32int true_cols = ((((cols*sizeof(float))+ALLIGNED_LEN-1)/ALLIGNED_LEN)*ALLIGNED_LEN)/sizeof(float);//cout << true_cols << endl;float *A = (float*)_aligned_malloc(rows*true_cols*sizeof(float), ALLIGNED_LEN);float *B = (float*)_aligned_malloc(rows*sizeof(float), ALLIGNED_LEN);float *C = (float*)_aligned_malloc(rows*sizeof(float), ALLIGNED_LEN);
#elseint true_cols = cols;float *A = (float*)malloc(rows*true_cols*sizeof(float));float *B = (float*)malloc(rows*sizeof(float));float *C = (float*)malloc(rows*sizeof(float));
#endifinit(A, rows, cols, true_cols);init(B, rows, 1, 1);// computingTB__;for (int k = 0; k < REPEATTIMES; k++){multiply(C, A, B, rows, cols, true_cols);}TE__;// print result.  print_sum(C, rows, 1, 1);#ifdef ALIGNED_aligned_free(A); A = NULL;_aligned_free(B); B = NULL;_aligned_free(C); C = NULL;
#elsefree(A); A = NULL;free(B); B = NULL;free(C); C = NULL;
#endifreturn 0;
}

// filename : multiply.cpp
// Matrix-vector product: for every row i, C[i] = sum over j of A[i][j] * B[j].
//   C         : output, one element per row
//   A         : row-major matrix with a row stride of true_cols elements
//   B         : input vector, indexed 0..cols-1
//   rows/cols : logical matrix dimensions
//   true_cols : distance in elements between the starts of consecutive rows
//
// When built with ALIGNED defined, the inner loop carries the Intel
// `#pragma vector aligned` hint. The caller is then responsible for passing
// buffers and a row stride for which the start of every row of A and the
// start of B are suitably aligned.
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols)
{
    for (int i = 0; i < rows; i++)
    {
        C[i] = 0;
        // Preprocessor directives and the pragma must each sit on their own line.
#ifdef ALIGNED
#pragma vector aligned
#endif
        for (int j = 0; j < cols; j++)
        {
            C[i] += A[i*true_cols+j]*B[j];
        }
    }
}

编译：

user@machine> icl /DALIGNED /O2 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp

执行：

82 : 0.17747 seconds
2483.53

相对第一个优化的版本又提升了一点速度。

结论

vectorization版本：不需要改变源代码，通过修改编译器选项直接实现向量化。
aligned版本：需要修改代码，使内存对齐，可以进一步提升性能。

ICL Auto Vectorization相关推荐

python的文本编辑geny_android模拟器（genymotion）+appium+python 框架执行基本原理（目前公司自己写的）...
android模拟器(genymotion)+appium+python 框架执行的基本过程: 1.Push.initDate(openid)方法 //业务数据初始化 1.1 v5db.p ...
Java中的自动向量化（SIMD）
摘要这篇文章简单解释了向量化的目的,它是如何在Java中生效的,如何去检测它是否被应用于Java程序中.这些知识对数学计算大有帮助. 这些方式比较底层,且只适用于特殊的场景.如果你想优化你的Java ...
Announcing the program for the 2019 LLVM Developers’ Meeting - Bay Area
Announcing the program for the 2019 LLVM Developers' Meeting - Bay Area 2019 Bay Area LLVM Developer ...
使用Auto TensorCore CodeGen优化Matmul
使用Auto TensorCore CodeGen优化Matmul 本文将演示如何使用TVM Auto TensorCore CodeGen在Volta / Turing GPU上编写高性能matmu ...
c++中的auto关键字
auto的属性特征 #include <iostream> using namespace std;int main() {//1.auto 变量必须在定义时初始化,类似于constaut ...
Auto ML自动特征工程
Auto ML自动特征工程特征工程是在做机器学习训练的过程中必不可少的环节,特征工程就是找出对模型结果有益的特征交叉关系,通常特征工程需要耗费算法工程师大量的精力去尝试.针对这样的场景,PAI推出智 ...
Auto ML自动调参
Auto ML自动调参本文介绍Auto ML自动调参的算法介绍及操作流程. 操作步骤登录PAI控制台. 单击左侧导航栏的实验并选择某个实验. 本文以雾霾天气预测实验为例. 在实验画布区,单击左上角 ...
auto关键字详解 C++
C++98 auto 早在C++98标准中就存在了auto关键字,那时的auto用于声明变量为自动变量,自动变量意为拥有自动的生命期,这是多余的,因为就算不使用auto声明,变量依旧拥有自动的生命期: ...
WPF中Auto与*的差别
Auto 表示自己主动适应显示内容的宽度, 如自己主动适应文本的宽度,文本有多长,控件就显示多长. * 则表示按比例来分配宽度. <ColumnDefinition Width="3* ...

ICL Auto Vectorization

简介

base

vectorization

aligned

结论

ICL Auto Vectorization相关推荐

最新文章

热门文章