显卡性能测试工具BenchMarkTool

设计自己的CUDAZ

CUDAZ是一款用于查询显卡信息，测试显卡性能的工具，具体参见：
http://cuda-z.sourceforge.net/#block-windows
获取的信息包括：TimeStamp（时间戳）、Compute（计算能力，单位 GFLOPS），以及 fHost2Device、fDevice2Host（可分页主机内存拷贝带宽）、fHost2DevicePin、fDevice2HostPin（锁页主机内存拷贝带宽）和 fDevice2Device（设备间拷贝带宽），带宽单位均为 GiB/s。
基本框架搭建：
LabBase.h

#pragma once
#ifndef LAB_BASE
#define LAB_BASE
#include <stdio.h>
#include <fstream>
#include <windows.h>
#include <vector>
#include <list>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <math.h>
#include <string>
#include <time.h>
#include <tchar.h>
#include <cstring>
#include <psapi.h>
#pragma comment(lib,"psapi.lib")
#include <direct.h>
#include <io.h>typedef _ULonglong uint64_t;class LabBase
{public://Singleton lazy allocstatic LabBase getInstance();static uint64_t getTime_UTC();static std::string getTime_Str();
public:template<typename T>
static void log2File(const T &Info,const std::string &logfile){std::ostringstream streamInfo;streamInfo.clear();streamInfo.str("");streamInfo<<Info;std::string sInfo = streamInfo.str();LabBase obj = LabBase::getInstance();obj.log2File(sInfo,logfile);
}
template<typename T>
static void log2File(const T &Info,const char *const file, int const line,const std::string &logfile) {std::string sfile(file);std::ostringstream streamInfo;streamInfo.clear();streamInfo.str("");streamInfo<<Info;std::string sInfo = streamInfo.str();LabBase obj = LabBase::getInstance();obj.log2File(sInfo,sfile,line,logfile);
}/// \brief Save file.
template<typename T>
static void SaveFile(const T* data, size_t len, const std::string &file) {std::ofstream ofs(file, std::ios::binary|std::ios::ate);if(ofs.fail()) {std::cout<<"failed to open file: " + file<<std::endl;return;}ofs.write((char*)data, len*sizeof(T));ofs.close();
}private:void log2File(const std::string &sInfo,const std::string &file="1.log");void log2File(const char* charArray,const std::string &file="1.log");void log2File(const std::string &sInfo,const std::string &file, int const line,const std::string &logfile="1.log");
};class LogAlgoPerf
{public:explicit LogAlgoPerf(void);virtual ~LogAlgoPerf(void);
public:// set Log file name & will be saved to "D:\\LogAlgoPerf\\"void setLogFile(const std::string &strFileName);// set start logging timer void setLogStart();// get logging timevoid getLogEnd();// get Process Memory info WorkSet Paged Pinnedtemplate<typename T>void logInfo(T info){//decltypem_streamInfo<<info<<"    ";}void logMemInfo(); // Log other info double getTotalTime();
private:void log2File();void mkdir();void reset();
private:LARGE_INTEGER m_nBegTime;LARGE_INTEGER m_nEndTime;LARGE_INTEGER m_nFreq;std::string     m_strFile;std::ostringstream            m_streamInfo;std::vector<LARGE_INTEGER>   m_vBegTime;std::vector<LARGE_INTEGER> m_vEndTime;std::vector<std::string>   m_vInfo;
};//#define LOGPERF(_ins, _m, ...)
#define LOGPERF(_ins, _m, ...) {_ins.##_m(##__VA_ARGS__);}#endif LAB_BASE

CudaBase.h

#pragma once
#ifndef __CUDA_BASE__
#define __CUDA_BASE__

#include <cstdio>
#include <sstream>
#include <stdexcept>
#include <string>

#include "cuda_runtime.h"
#include "cublas.h"
#include "device_launch_parameters.h"
#include "LabBase.h"

/// \brief Check the result of a CUDA runtime call and raise on failure.
///
/// On failure a diagnostic is built (error code, caller's file/line, the
/// stringified call and, when cudaMemGetInfo succeeds, the current device id
/// and free/total memory), written to "checkCudaErrors.log" through
/// LabBase::log2File, printed to stdout, and thrown.
///
/// \param cudaFunc      Result value of the CUDA call (compared to cudaSuccess).
/// \param cudaFuncName  Stringified call expression, for the diagnostic.
/// \param file          Source file of the call site.
/// \param line          Source line of the call site.
/// \return true when \p cudaFunc equals cudaSuccess.
/// \throws std::logic_error carrying the diagnostic text otherwise.
template< typename T >
inline bool checkOpt(T cudaFunc, char const *const cudaFuncName, const char *const file, int const line)
{
    const T result = cudaFunc;
    if (result == cudaSuccess)
    {
        return true;
    }

    std::ostringstream oss;
    oss << "Throw cudaError: " << result
        << ", File: " << file
        << ", cudaFunction: " << cudaFuncName
        << ", Line: " << line;

    size_t nBytesFree = 0, nBytesTotal = 0;
    if (cudaMemGetInfo(&nBytesFree, &nBytesTotal) == cudaSuccess)
    {
        int DeviceId = -1;
        cudaGetDevice(&DeviceId);
        oss << " GPU " << DeviceId
            << " FreeMemory: " << nBytesFree << " bytes, TotalMemory: " << nBytesTotal << " bytes";
    }
    cudaGetLastError(); // reset cuda error so it is not reported again by a later call

    const std::string sInfo = oss.str();

    // Log the caller's location; __FILE__/__LINE__ here would always name this header.
    LabBase::log2File(sInfo, file, line, "checkCudaErrors.log");

    // "%s" needs a C string; passing std::string through varargs is undefined behaviour.
    printf("%s", sInfo.c_str());

    throw std::logic_error(sInfo);
}

#define checkCudaErrors(cudaFunc)           checkOpt ( (cudaFunc),  #cudaFunc, __FILE__, __LINE__ )

#endif // __CUDA_BASE__

BenchMark.h

#include "../../common/CudaBase.h"// Test memory Opts
/// Selects which transfer a bandwidth measurement performs.
/// Enumerators are numbered consecutively from 0 to 4.
enum GPUCOPYMODEL
{
    COPY_MODE_Hpage2D = 0,  /*!< Host pageable memory to device data copy mode. */
    COPY_MODE_D2Hpage,      /*!< Device to host pageable memory data copy mode. */
    COPY_MODE_Hpin2D,       /*!< Host pinned memory to device data copy mode. */
    COPY_MODE_D2Hpin,       /*!< Device to host pinned memory data copy mode. */
    COPY_MODE_D2D           /*!< Device to device data copy mode. */
};

/// Buffer set used by the bandwidth test; default-constructs to size 0 and
/// all-null pointers.
struct memBuff
{
    memBuff()
        : szMem(0)
        , memHostPage(nullptr)
        , memHostPin(nullptr)
        , memDevice1(nullptr)
        , memDevice2(nullptr)
    {
    }

    size_t  szMem;          /*!< Size value stored alongside the buffers. */
    void   *memHostPage;    /*!< Pageable host memory. */
    void   *memHostPin;     /*!< Pinned host memory. */
    void   *memDevice1;     /*!< Device memory buffer 1. */
    void   *memDevice2;     /*!< Device memory buffer 2. */
};

/// One result per copy mode; every field default-constructs to 0.0f.
struct memInfo
{
    memInfo()
        : fH2DPage(0.0f)
        , fD2HPage(0.0f)
        , fH2DPin(0.0f)
        , fD2HPin(0.0f)
        , fD2D(0.0f)
    {
    }

    float fH2DPage;
    float fD2HPage;
    float fH2DPin;
    float fD2HPin;
    float fD2D;
};
/// \brief Measures host<->device and device<->device copy bandwidth.
///
/// Holds a memBuff whose buffers are released by memAllocFree(), which the
/// destructor calls. Because the class owns those raw buffers, copying is
/// disabled: a copied instance would release the same pointers a second time.
class BandWidth
{
public:
    BandWidth(void);
    ~BandWidth(void);

    BandWidth(const BandWidth &) = delete;
    BandWidth &operator=(const BandWidth &) = delete;

    /// \brief Store the requested size from \p sData and allocate the buffers.
    void memBandInit(memBuff sData);

    /// \brief Run getMemSpeed() once per GPUCOPYMODEL value and collect the results.
    memInfo getBandWidth();

private:
    /// \brief Time \p iterNum copies of the selected kind.
    /// \param model    Which transfer to measure.
    /// \param iterNum  Number of timed copies (default 10).
    /// \return Bandwidth value computed from the summed elapsed time (GiB/s).
    float getMemSpeed(GPUCOPYMODEL model, int iterNum = 10);

    /// \brief Release every non-null buffer held in m_sData.
    void memAllocFree();

private:
    memBuff m_sData;    ///< Buffers and size used by the measurements.
};

BenchMark.cuh

#include"BenchMarks.h"
// Entry points implemented in the accompanying .cu file.

// Returns the average of the per-iteration elapsed times, in milliseconds,
// measured with CUDA events around launches of an empty kernel.
extern "C" float TestFunc_Launchlatency();
// Returns a GFLOPs figure derived from timing a multiply-add kernel on device 0.
extern "C" float TestFunc_CalcPerformance();
// Returns the five copy-bandwidth results produced by BandWidth::getBandWidth().
// NOTE(review): memInfo has a user-declared constructor, and some compilers
// warn when such a type is returned from an extern "C" function (e.g. MSVC
// C4190) -- confirm that C linkage is actually required here.
extern "C" memInfo TestFunc_BandWidth();

BenchMark.cu

#include"BenchMarks.cuh"
// Empty kernel: it does no work, so timing it measures only launch overhead.
__global__ void empty() {}

// 16 multiply-add statements of the form x = x * x + x, alternating between
// the two operands (8 on `a`, 8 on `b`). Expands to plain statements, so it
// must be used where a statement sequence is valid.
#define CALC_FMAD_16(a, b) \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b; \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b; \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b; \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b;

// 16 x CALC_FMAD_16 = 256 multiply-add statements.
#define CALC_FMAD_256(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b)

__global__ void CalcKernelFloat(void *buf,                  /*!<[in] Data buffer. */int iters) {int index = blockIdx.x * blockDim.x + threadIdx.x;float *arr = (float*)buf;float val1 = index;float val2 = arr[index];int i;for(i = 0; i < iters; i++) {CALC_FMAD_256(val1, val2);CALC_FMAD_256(val1, val2);CALC_FMAD_256(val1, val2);CALC_FMAD_256(val1, val2);CALC_FMAD_256(val1, val2);CALC_FMAD_256(val1, val2);CALC_FMAD_256(val1, val2);CALC_FMAD_256(val1, val2);}arr[index] = val1 + val2;
}float TestFunc_Launchlatency(){const int iters = 100;cudaFree(0);// Warmup phaseempty<<<1,1>>>();float timeMs   = 0.0;cudaEvent_t start;cudaEvent_t stop;checkCudaErrors(cudaEventCreate(&start));checkCudaErrors(cudaEventCreate(&stop));checkCudaErrors(cudaEventRecord(start, 0));checkCudaErrors(cudaDeviceSynchronize());float totalTime = 0;// Benchmark phasefor (int i = 0; i < iters; ++i) {float loopMs = 0;empty<<<1,1024>>>();cudaEventRecord(stop, 0);cudaEventSynchronize(stop);cudaEventElapsedTime(&loopMs, start, stop);//std::cout<<i<<" "<<loopMs<<std::endl;timeMs += loopMs;}float averTime = timeMs/iters;return averTime;//printf(" Average Launch Time %f ms \r\n",averTime);
}memInfo TestFunc_BandWidth(){memBuff sData;BandWidth obj;sData.szMem = 1024*1024*100;obj.memBandInit(sData);memInfo info = obj.getBandWidth();return info;
}float TestFunc_CalcPerformance(){cudaDeviceProp prop;cudaGetDeviceProperties(&prop, 0);int nRepeatNum          = 10;int iterNumInKernel       = 32;int threadsNum            = prop.maxThreadsPerBlock;int blocksNum            = 1;if(threadsNum == 0) {int warpSize = prop.warpSize;if(warpSize == 0)warpSize = 32;threadsNum = warpSize * 2;if(threadsNum > 512)threadsNum = 512;}float* pfDiviceBuff        = NULL;checkCudaErrors(cudaMalloc((void**)&pfDiviceBuff,threadsNum*sizeof(float)));checkCudaErrors(cudaMemset(pfDiviceBuff,0,threadsNum*sizeof(float)));std::shared_ptr<float> pArray(new float[threadsNum]);float*ptrArray = pArray.get();for(int i = 0;i<threadsNum;++i){ptrArray[i] = i*0.03141592653f;}checkCudaErrors(cudaMemcpy(pfDiviceBuff,ptrArray,threadsNum*sizeof(float),cudaMemcpyHostToDevice));float timeMs   = 0.0;cudaEvent_t start;cudaEvent_t stop;cudaEventCreate(&start);cudaEventCreate(&stop);cudaEventRecord(start, 0);for(int i = 0;i<nRepeatNum;++i){float loopMs = 0.0;cudaEventRecord(start, 0);CalcKernelFloat<<<blocksNum, threadsNum>>>(pfDiviceBuff,iterNumInKernel);checkCudaErrors(cudaGetLastError());cudaEventRecord(stop, 0);cudaEventSynchronize(stop);cudaEventElapsedTime(&loopMs, start, stop);timeMs += loopMs;}int nOpsNumofFMA   = 2;       // Number of operations per one loopint nFMAInstNum     = 256;     // Size of instruction block        int nIterofFMAOps   = 8;       // Number of instruction blocks in loopfloat GFLOPs = ((float)prop.multiProcessorCount * (float)threadsNum                                 // max cores * warp*(float)iterNumInKernel*(float)nIterofFMAOps *(float)nFMAInstNum *(float)nOpsNumofFMA    // ops of cores*(float)nRepeatNum *1000.0/(float)timeMs)                                                // ops per Second/1000/1000/1000;                                                                       // GcudaEventDestroy(start);cudaEventDestroy(stop);cudaFree(pfDiviceBuff);//printf("GFLOPs %10f \r\n",GFLOPs);return GFLOPs;
}

BenchMark.cpp

#include "BenchMarks.h"BandWidth::BandWidth(void){}BandWidth::~BandWidth(void){memAllocFree();
}void BandWidth::memBandInit(memBuff sData){m_sData = sData;if (m_sData.szMem==0){m_sData.szMem = 1024*1024*1;}m_sData.memHostPage = (void*)malloc(m_sData.szMem);auto state = cudaMallocHost((void**)&m_sData.memHostPin,m_sData.szMem);cudaMalloc((void**)&m_sData.memDevice1,m_sData.szMem);cudaMalloc((void**)&m_sData.memDevice2,m_sData.szMem);}float BandWidth::getMemSpeed(GPUCOPYMODEL model,int iterNum){float timeMs   = 0.0;cudaEvent_t start;cudaEvent_t stop;checkCudaErrors(cudaEventCreate(&start));checkCudaErrors(cudaEventCreate(&stop));for(int i = 0;i<iterNum;++i){float loopMs = 0.0;cudaEventRecord(start, 0);switch(model){case COPY_MODE_Hpage2D:checkCudaErrors(cudaMemcpy(m_sData.memDevice1, m_sData.memHostPage, m_sData.szMem, cudaMemcpyHostToDevice));break;case COPY_MODE_D2Hpage:checkCudaErrors(cudaMemcpy(m_sData.memHostPage, m_sData.memDevice1, m_sData.szMem, cudaMemcpyDeviceToHost));break;case COPY_MODE_Hpin2D:checkCudaErrors(cudaMemcpy(m_sData.memDevice2, m_sData.memHostPin, m_sData.szMem, cudaMemcpyHostToDevice));break;case COPY_MODE_D2Hpin:checkCudaErrors(cudaMemcpy(m_sData.memHostPin, m_sData.memDevice2, m_sData.szMem, cudaMemcpyDeviceToHost));break;case COPY_MODE_D2D:checkCudaErrors(cudaMemcpy(m_sData.memDevice1, m_sData.memDevice2, m_sData.szMem, cudaMemcpyDeviceToHost));break;default:break;}cudaEventRecord(stop, 0);checkCudaErrors(cudaEventSynchronize(stop));cudaEventElapsedTime(&loopMs, start, stop);timeMs += loopMs;}checkCudaErrors(cudaEventDestroy(start));checkCudaErrors(cudaEventDestroy(stop));float bandwidthGiBs = (1000 *m_sData.szMem*iterNum) / (timeMs *(float)(1 << 30));return bandwidthGiBs;
}void BandWidth::memAllocFree(){if (m_sData.memDevice1!=NULL){cudaFree(m_sData.memDevice1);}if (m_sData.memDevice2!=NULL){cudaFree(m_sData.memDevice2);}if (m_sData.memHostPage!=NULL){free(m_sData.memHostPage);}if (m_sData.memHostPin!=NULL){cudaFreeHost(m_sData.memHostPin);}
}memInfo BandWidth::getBandWidth(){memInfo info;info.fH2DPage   = getMemSpeed(COPY_MODE_Hpage2D);info.fD2HPage = getMemSpeed(COPY_MODE_D2Hpage);info.fH2DPin  = getMemSpeed(COPY_MODE_Hpin2D);info.fD2HPin   = getMemSpeed(COPY_MODE_D2Hpin);info.fD2D      = getMemSpeed(COPY_MODE_D2D);return info;
}

Main.cpp

void TestFunc_PerfMon(){TestFunc_Launchlatency();printf("   TimeStamp   Compute(Gflops) fH2D     fD2H       fH2DPin    fD2HPin   fD2D(GiB/s) \r\n");for (int i=0;i<10000;++i){std::string strtime = LabBase::getTime_Str();memInfo info = TestFunc_BandWidth();float Gflops = TestFunc_CalcPerformance();printf("%s %10f %10f %10f %10f %10f %10f \r\n",strtime.c_str(),Gflops,info.fH2DPage,info.fD2HPage,info.fH2DPin,info.fD2HPin,info.fD2D);Sleep(1000);}
}

显卡性能测试工具BenchMarkTool相关推荐

linux显卡性能测试工具,Linux系统中A/N显卡通用计算性能测试
[天极网DIY硬件频道]今天我们将给大家带来一份很有趣的测试,在Linux操作系统下对比了NVIDIA.AMD几款显卡的OpenCL通用计算性能,尤其是最近比较抢眼的GeForce GTX 460. ...
显卡html5性能测试工具,电脑显卡性能测试软件
作者选择100电脑网推荐配置了解最佳配置看首页电脑显卡性能测试一般用经典权威的3DMark,3DMark目前较流行使用的版本有3DMark Vantage.3DMark.新3DMark.3DMar ...
CPU性能测试工具-Unixbench
简介: UnixBench是一个类unix系(Unix,BSD,Linux)统下的性能测试工具,一个开源工具,被广泛用与测试linux系统主机的性能.Unixbench的主要测试项目有:系统调用.读写 ...
技嘉显卡性能测试软件,你好六啊！GTX 1660 Ti深度测试：升吧
近两个季度对NVIDIA来说颇为动荡,矿潮退去,业绩不断下行,股价也经历了过山车式的震动.随着GTX 10系列库存压力逐步减轻,NVIDIA也开始重新构建产品线. 今天就带来非常六的显卡GTX 166 ...
vr性能测试软件,SteamVR性能测试工具
不知道怎么下载?点我游戏介绍 SteamVR性能测试工具是为Steam打造的一款辅助工具,通过本软件可让你提前知道VR游戏的配置要求,需要的小伙伴自行下载. 使用说明 1.下载解压本工具; 2.运行 ...
linux硬件性能,Linux运维知识：Linux下的硬件性能测试工具汇总
本文主要向大家介绍了Linux运维知识的Linux下的硬件性能测试工具汇总,通过具体的内容向大家展现,希望对的大家学习Linux运维知识有所帮助. 在购买计算机之后,我们都希望能充分了解它们的硬件性能 ...
如何评估移动GPU性能：以及4个需要了解的Android GPU 本地性能测试工具
众所周知,Android终端基本都配有GPU:无论手机还是VR,AR设备,GPU在其中扮演了越来越重要的地位. 当我们拿到一款GPU时,我们最关心的就是性能了. 不服跑个分.跑分是目前最常见的测试性能 ...
linux命令 iperf-网络性能测试工具
iperf命令是一个网络性能测试工具.iperf可以测试TCP和UDP带宽质量.iperf可以测量最大TCP带宽,具有多种参数和UDP特性.iperf可以报告带宽,延迟抖动和数据包丢失.利用iperf ...
深入浅出开源性能测试工具 Locust (使用篇 1)
在<[LocustPlus序]漫谈服务端性能测试>中,我对服务端性能测试的基础概念和性能测试工具的基本原理进行了介绍,并且重点推荐了Locust这一款开源性能测试工具.然而,当前在网络上针 ...

显卡性能测试工具BenchMarkTool

设计自己的CUDAZ

显卡性能测试工具BenchMarkTool相关推荐

最新文章

热门文章