はじめに
Wikipediaã§OpenCLã«ã€ããŠèªãã®ã¯ããŸãæå³ããªããšæããŸãããç°¡åã«èšãã°ãOpenCLã¯ç°ãªãã¢ãŒããã¯ãã£ãç¹ã«é«åºŠãªäžŠåããã»ããµã§ç°ãªãããã€ã¹ã§åãã³ãŒããå®è¡ã§ããèšèªïŒãã¬ãŒã ã¯ãŒã¯ãšãã©ãããã©ãŒã ïŒã§ãããããªã«ãŒããææ°ã®äžå€®åŠçè£ çœ®ãªã©ã ãã®æšæºã¯C99ã«åºã¥ããŠãããã¯ããã¹ã°ã«ãŒãã«ãã£ãŠãµããŒããããŠããŸããããã«åºã¥ããŠãæè²ããã°ã©ã ã¯å®äºãããšèŠãªããŸãã
OpenCLãã©ã®ããã«æ©èœãããã«ã€ããŠè©±ããŠããéã«ãå°ããªã³ãŒããèŠããŠããã§äœãèµ·ãã£ãŠããã®ãã説æããããšããå§ããŸãã
ãŸããããããªã³ãŒãã«ã€ããŠèª¬æããŸããOpenCLã®éæ³ãèŠãããªã人ã¯ãæåã®éšåãã¹ãããã§ããŸãïŒMathCalculationsé¢æ°ã«ã€ããŠèª¬æããæåŸã®æ®µèœãèªãã§ãã ãããããã¯éèŠã§ãã 5çªç®ã®ã»ã¯ã·ã§ã³ã«çŽæ¥é²ã¿ãŸããããšã«ããMathCalculationsãèŠãŠãã ããã
int main(int argc, char* argv[])
int main(int argc, char* argv[]) { GenerateTestData(); PerformCalculationsOnHost(); //Get all available platforms vector<cl::Platform> platforms; cl::Platform::get(&platforms); for (int iPlatform=0; iPlatform<platforms.size(); iPlatform++) { //Get all available devices on selected platform std::vector<cl::Device> devices; platforms[iPlatform].getDevices(CL_DEVICE_TYPE_ALL, &devices); //Perform test on each device for (int iDevice=0; iDevice<devices.size(); iDevice++) { try { PerformTestOnDevice(devices[iDevice]); } catch(cl::Error error) { std::cout << error.what() << "(" << error.err() << ")" << std::endl; } CheckResults(); } } //Clean buffers delete[](pInputVector1); delete[](pInputVector2); delete[](pOutputVector); delete[](pOutputVectorHost); return 0; }
ããããOpenCLããã¹ãããããã®ããŸãã¯ããããåŸã§èª¬æããç¹å®ã®æœè±¡çãªæ°åŠåŒãèšç®ããããã®ç§ã®å°ããªããã°ã©ã ã®äž»ãªå€èŠ³ã§ãã ããã§ãããã§äœãèµ·ãã£ãŠããã®ããç解ããããã«ãè¡ããšã«èŠãŠã¿ãŸãããã
パート1-ソースデータと従来のコンピューティング方法の初期化
GenerateTestDataïŒïŒ; ç¹å¥ãªããšã¯äœãè¡ããŸããããåã«å ¥åããã³åºåé åã«ã¡ã¢ãªãå²ãåœãŠãå ¥åé åãã©ã³ãã ããŒã¿ã§æºãããŸãã
void GenerateTestData()
// Allocates the two input vectors and the two output vectors (DATA_SIZE
// floats each) and fills both input vectors with pseudo-random values in the
// range [0, 1000]. The random generator is seeded from the current time.
void GenerateTestData()
{
    // Host-side storage; main() releases these with delete[].
    pInputVector1     = new float[DATA_SIZE];
    pInputVector2     = new float[DATA_SIZE];
    pOutputVector     = new float[DATA_SIZE];
    pOutputVectorHost = new float[DATA_SIZE];

    srand(time(NULL));

    // One sample for vector 1, then one for vector 2, per element
    // (the draw order is kept so a given seed yields the same data).
    int index = 0;
    while (index < DATA_SIZE)
    {
        pInputVector1[index] = rand() * 1000.0 / RAND_MAX;
        pInputVector2[index] = rand() * 1000.0 / RAND_MAX;
        ++index;
    }
}
次に、もう少し興味深い機能があります。
void PerformCalculationsOnHost()
void PerformCalculationsOnHost() { cout << "Device: Host" << endl << endl; //Some performance measurement timeValues.clear(); __int64 start_count; __int64 end_count; __int64 freq; QueryPerformanceFrequency((LARGE_INTEGER*)&freq); for(int iTest=0; iTest<(TESTS_NUMBER/10); iTest++) { QueryPerformanceCounter((LARGE_INTEGER*)&start_count); for(int iJob=0; iJob<DATA_SIZE; iJob++) { //Check boundary conditions if (iJob >= DATA_SIZE) break; //Perform calculations pOutputVectorHost[iJob] = MathCalculations(pInputVector1[iJob], pInputVector2[iJob]); } QueryPerformanceCounter((LARGE_INTEGER*)&end_count); double time = 1000 * (double)(end_count - start_count) / (double)freq; timeValues.push_back(time); } hostPerformanceTimeMS = std::accumulate(timeValues.begin(), timeValues.end(), 0)/timeValues.size(); PrintTimeStatistic(); }
ãã®æåã®ãµã€ã¯ã«
for(int iTest=0; iTest<(TESTS_NUMBER/10); iTest++)
ããæ£ç¢ºãªã©ã³ã¿ã€ã ãåŸãããã«ããã¹ããæ°åå®è¡ããå¿ èŠããããŸããã åãã¹ãã®èšç®æéã¯timeValuesé åã«æ ŒçŽãããããããå¹³åå€ãèšç®ãããŠhostPerformanceTimeMSã«æ ŒçŽãããŸãã
第二サイクル
for(int iJob=0; iJob<DATA_SIZE; iJob++)
å ¥åé åã®èŠçŽ ã«å¯ŸããŠããã€ãã®æ°åŠèšç®ãé 次å®è¡ããåºåé åã«æ ŒçŽããŸãã
ã芧ã®ãšããããã®ã³ãŒãã«ã¯ç°åžžãªãã®ã¯ãããŸãããéåžžã®ã·ã¹ãã ã³ã³ãã€ã©ã«ãã£ãŠã³ã³ãã€ã«ãããæ¯æ¥äœæããã»ãšãã©ã®ã³ãŒãã®ããã«ãäžå€®åŠçè£ çœ®ã§é 次å®è¡ãããŸãã ãããŠãOpenCLã«ãã£ãŠåŸãããçµæãåŸã§åœŒãšæ¯èŒããã©ã®ãããªããã©ãŒãã³ã¹ã®åäžãåŸãããããç解ããããã«åœŒãå¿ èŠã§ãã
ããã«MathCalculationsã調ã¹ããã¹ãŠãå®å šã«éå±ããŠããããšã確èªããå¿ èŠããããŸãã
float MathCalculations(float a, float b)
// Arithmetic workload used by the benchmark: returns the sum of sixteen
// terms of the form  a*a*k1 + b*k2 + k3  with hard-coded single-precision
// coefficients.
//   a, b   - the two input values
//   return - the accumulated sum
// The terms are added one statement at a time, in this order; regrouping
// them could change the floating-point rounding of the result.
// NOTE(review): OpenCLFile1.cl includes "MathCode.cpp" and calls
// MathCalculations; presumably this is that definition, in which case it
// must stay valid as both C++ and OpenCL C - confirm before restructuring.
float MathCalculations(float a, float b) {
    float res = 0;
    res += a*a*0.315f + b*0.512f + 0.789f;
    res += a*a*0.15f + b*0.12f + 0.789f;
    res += a*a*0.35f + b*0.51f + 0.89f;
    res += a*a*0.31f + b*0.52f + 0.7f;
    res += a*a*0.4315f + b*0.512f + 0.4789f;
    res += a*a*0.515f + b*0.132f + 0.7859f;
    res += a*a*0.635f + b*0.521f + 0.89f;
    res += a*a*0.731f + b*0.152f + 0.7f;
    res += a*a*0.1315f + b*0.512f + 0.789f;
    res += a*a*0.115f + b*0.12f + 0.789f;
    res += a*a*0.135f + b*0.51f + 0.89f;
    res += a*a*0.131f + b*0.52f + 0.7f;
    res += a*a*0.14315f + b*0.512f + 0.4789f;
    res += a*a*0.1515f + b*0.132f + 0.7859f;
    res += a*a*0.1635f + b*0.521f + 0.89f;
    res += a*a*0.1731f + b*0.152f + 0.7f;
    return res;
}
å®éãããã¯ããŸãæå³ããããŸããïŒãããŠãéåžžã«åçŽåã§ããããšã¯æããã§ãïŒããçŽç²ãªæ°åŠæŒç®ã®ç°¡åãªãã¢ã³ã¹ãã¬ãŒã·ã§ã³ãšããŠæ©èœããŸãã ãã®äžã§éèŠãªããšã¯ããããå¥ã®.cppãã¡ã€ã«ã«ãããå€ãã®ç®è¡æŒç®ãè¡ãããšã§ãããããã«ã€ããŠã¯åŸã§è©³ãã説æããŸãã
パート2-OpenCLの初期化
ã ãããæ£è ã¯ãã®éšåãèªãã§ãäœãé¢çœãããšãå§ãŸã£ãããšãåãã§ãããããã£ãã¡ãªäººã¯ãã®æèŠãçµéšããããšãã§ããªãã£ãã®ã§ãæåŸã®æ®µèœãã¹ããããã:)
æåã«ãOpenCLã©ã³ã¿ã€ã APIã¯C ++ã®APIã§ã¯ãªããCã®APIã«ãããªããšèšããŸãã äžè¬ã«ããšã©ãŒããã§ãã¯ããããã«ãåé¢æ°ã«ãã£ãŠè¿ãããã³ãŒãããã§ãã¯ããå¿ èŠãããããšãé€ããŠãããã«ã¯äœãåé¡ã¯ãããŸãããããã¯ããŸã䟿å©ã§ã¯ãããŸããã ãŸããå²ãåœãŠããããªãœãŒã¹ã®ãªãªãŒã¹ãæåã§ç£èŠããå¿ èŠããããŸãã
ããããå ¬åŒã®C ++ã©ãããŒïŒKhronosã®Webãµã€ãã«ãããŸãïŒããããŸããããã¯ãOpenCLãªããžã§ã¯ãã«å¯Ÿå¿ããåç §ã«ãŠã³ãïŒåç §ïŒããµããŒããããšã©ãŒã®å Žåã«äŸå€ãã¹ããŒããã¯ã©ã¹ã®ã»ããã§ãïŒ#define __CL_ENABLE_EXCEPTIONSã䜿çšããŠäŸå€ãæå¹ã«ããå¿ èŠããããŸãïŒ ïŒ ãã®éåžžã«ã©ãããŒã¯ããã¹ãã§äœ¿çšããŸãã
したがって、最初に入手できるのは、利用可能なプラットフォームのリストです。
vector<cl::Platform> platforms; cl::Platform::get(&platforms);
OpenCLã®ãã©ãããã©ãŒã ã¯ãã³ããŒã«å¯Ÿå¿ããŠããŸãã NVidiaã«ã¯1ã€ã®ãã©ãããã©ãŒã ã«ããã€ã¹ããIntelã«ã¯å¥ã®ãã©ãããã©ãŒã ãªã©ããããŸã ç§ã®å Žåãå©çšã§ããã®ã¯2ã€ã®NVidiaãšIntelãã©ãããã©ãŒã ã ãã§ãã
ãã1ã€ã®å°ããªããªãã¯ã¯ãC ++ã©ãããŒãç¬èªã®ãã¯ã¿ãŒïŒããã«ã€ããŠåœŒã«äŒããå ŽåïŒãŸãã¯STDã®ãã¯ã¿ãŒã䜿çšã§ãããããäŸã§cl :: vectorã®ãããªãã®ãåºãŠããŠããæããªãã§äž¡æ¹ã®åœ¢åŒãç¥ã£ãŠããã
プラットフォームのリストを取得した後、プラットフォームごとに使用可能なデバイスのリストを取得します。
std::vector<cl::Device> devices; platforms[iPlatform].getDevices(CL_DEVICE_TYPE_ALL, &devices);
å®éãããã€ã¹ã¯èšç®ãå®è¡ãããã®ã§ãã GPUãCPUããŸãã¯ãã¹ãã«æ¥ç¶ãããŠããç¹å¥ãªã¢ã¯ã»ã©ã¬ãŒã¿ãŒãã€ãŸã OpenCLãå®è¡ãããŠããã·ã¹ãã ã CL_DEVICE_TYPE_ALLã®ä»£ããã«ãCL_DEVICE_TYPE_GPUãæž¡ãããšãã§ããŸãããããããšãäžå€®åŠçè£ çœ®çšã«ãããªã«ãŒããŸãã¯CL_DEVICE_TYPE_CPUã®ã¿ãçºè¡ãããŸãã
ç§ãèŠã€ããåããã€ã¹ã«ã€ããŠã以äžã§èª¬æãããã¹ããå®è¡ããåé¡ãçºçããå Žåã«OpenCLãã¹ããŒããäŸå€ããã£ããããããšããŸãããã¹ãŠãããŸããã£ãå ŽåãCheckResultsã¯çµæããã¹ãã®æåã®éšåã§ã«ãŠã³ããããã®ãšæ¯èŒããçµ±èšãèšç®ããŸãééãã
パート3-カーネルの作成と起動
ããã§æãèå³æ·±ãéšå-èšç®ã«é²ã¿ãŸãã
void PerformTestOnDevice(cl::Device device)
// Runs the benchmark kernel on one OpenCL device.
// Creates a context and command queue for the device, uploads the input
// vectors, builds "TestKernel" from OpenCLFile1.cl, executes it TESTS_NUMBER
// times while recording each run's duration (ms) in timeValues, prints the
// statistics and finally reads the result back into pOutputVector.
//   device - the device to test
// Throws cl::Error when an OpenCL call fails or the kernel source file
// cannot be opened.
void PerformTestOnDevice(cl::Device device)
{
    cout << endl << "-------------------------------------------------" << endl;
    cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << endl << endl;

    //For the selected device create a context
    vector<cl::Device> contextDevices;
    contextDevices.push_back(device);
    cl::Context context(contextDevices);

    //For the selected device create a command queue
    cl::CommandQueue queue(context, device);

    //Clean output buffers
    fill_n(pOutputVector, DATA_SIZE, 0);

    //Create memory buffers
    cl::Buffer clmInputVector1 = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, DATA_SIZE * sizeof(float), pInputVector1);
    cl::Buffer clmInputVector2 = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, DATA_SIZE * sizeof(float), pInputVector2);
    cl::Buffer clmOutputVector = cl::Buffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, DATA_SIZE * sizeof(float), pOutputVector);

    //Load OpenCL source code
    std::ifstream sourceFile("OpenCLFile1.cl");
    if (!sourceFile)
    {
        // Report a missing file explicitly instead of building an empty program.
        // cl::Error is used so the caller's existing handler deals with it.
        throw cl::Error(CL_INVALID_VALUE, "Cannot open OpenCLFile1.cl");
    }
    std::string sourceCode(std::istreambuf_iterator<char>(sourceFile), (std::istreambuf_iterator<char>()));

    //Build OpenCL program and make the kernel
    cl::Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
    cl::Program program = cl::Program(context, source);
    try
    {
        program.build(contextDevices);
    }
    catch (const cl::Error&)
    {
        // The error code alone does not say what is wrong with the kernel
        // source, so show the compiler output before passing the error on.
        cout << "Build log:" << endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << endl;
        throw;
    }
    cl::Kernel kernel(program, "TestKernel");

    //Set arguments to kernel
    // The kernel declares its last parameter as int; pass a cl_int so the
    // argument size matches regardless of how DATA_SIZE is declared.
    const cl_int elementsNumber = DATA_SIZE;
    int iArg = 0;
    kernel.setArg(iArg++, clmInputVector1);
    kernel.setArg(iArg++, clmInputVector2);
    kernel.setArg(iArg++, clmOutputVector);
    kernel.setArg(iArg++, elementsNumber);

    // The global work size must be a multiple of the work-group size, so round
    // DATA_SIZE up. The surplus work-items are discarded by the kernel's own
    // check against elementsNumber.
    const size_t localSize = 128;
    const size_t globalSize = ((static_cast<size_t>(DATA_SIZE) + localSize - 1) / localSize) * localSize;

    //Some performance measurement
    timeValues.clear();
    __int64 start_count;
    __int64 end_count;
    __int64 freq;
    QueryPerformanceFrequency((LARGE_INTEGER*)&freq);

    //Run the kernel on specific ND range
    for (int iTest = 0; iTest < TESTS_NUMBER; iTest++)
    {
        QueryPerformanceCounter((LARGE_INTEGER*)&start_count);

        queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize), cl::NDRange(localSize));
        // enqueue only queues the command; wait until the kernel has finished
        queue.finish();

        QueryPerformanceCounter((LARGE_INTEGER*)&end_count);

        // Counter ticks -> milliseconds
        const double elapsedMs = 1000 * (double)(end_count - start_count) / (double)freq;
        timeValues.push_back(elapsedMs);
    }

    PrintTimeStatistic();

    // Blocking read (CL_TRUE) of the device output into the host buffer
    queue.enqueueReadBuffer(clmOutputVector, CL_TRUE, 0, DATA_SIZE * sizeof(float), pOutputVector);
}
まず、この方法で取得したデバイス名を表示します。
device.getInfo<CL_DEVICE_NAME>()
åæ§ã«ãã³ã¢ã®æ°ãåšæ³¢æ°ãããŒãžã§ã³ãªã©ã«é¢ããæ å ±ãååŸã§ããŸãã
次に、コンテキストを作成します。
vector<cl::Device> contextDevices; contextDevices.push_back(device); cl::Context context(contextDevices);
ã³ã³ããã¹ãã§ã¯ãã¹ãŠãããã»ã©åçŽã§ã¯ãããŸãã...ã³ã³ããã¹ããäœæãããšããããã«å«ããããã€ã¹ã®ãªã¹ããæž¡ããŸãããå¶éããããŸãïŒåããã©ãããã©ãŒã äžã®ããã€ã¹ã®ã¿ãåãã³ã³ããã¹ãã«å ¥ããããšãã§ããŸãã GPUããã³CPUïŒIntel / NVidiaã®å ŽåïŒãšã®ã³ã³ããã¹ãã®äœæã¯å€±æããŸãã åãã³ã³ããã¹ãã«è€æ°ã®ããã€ã¹ãããå Žåããã¹ãŠã®ãããã¡ã¯ç°ãªãããã€ã¹ã§èªåçã«åæãããŸãã äžæ¹ã§ã¯ãããã«ãããã«ãGPUãµããŒããç°¡çŽ åãããä»æ¹ã§ã¯ããã©ã€ããŒãã©ã®ããã«ãäœãããã€åæãããã誰ã«ãããããŸãããããŒã¿è»¢éã®å¹çã¯ããã¹ãŠãèããããé«ãããã©ãŒãã³ã¹ãåŸãããã«éèŠã§ãã ãã®ãããéåžžã¯åããã€ã¹ã«åå¥ã®ã³ã³ããã¹ããäœæããããŒã¿ãæåã§é åžããŸãã ãããã£ãŠãäœããã©ãã§ããã€çºçãããã¯åžžã«ããã£ãŠããŸãã
次のステップは、デバイスのコマンドキューを作成することです。
cl::CommandQueue queue(context, device);
ãã®ãã¥ãŒã¯ç¹å®ã®ããã€ã¹ã«é¢é£ä»ããããŠãããçè«çã«ã¯æ éããŠããå¯èœæ§ããããŸãããå®éã«ã¯ãã®åäœã«æ°ä»ããŸããã§ããã 1ã€ã®ããã€ã¹ã«è€æ°ã®ãã¥ãŒããããåãã³ã³ããã¹ãå ã§ç°ãªããã¥ãŒããã³ãã³ããåæã§ããŸãã
次に、入力ベクトルと出力ベクトル用のバッファーを作成します。
//Create memory buffers cl::Buffer clmInputVector1 = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, DATA_SIZE * sizeof(float), pInputVector1); cl::Buffer clmInputVector2 = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, DATA_SIZE * sizeof(float), pInputVector2); cl::Buffer clmOutputVector = cl::Buffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, DATA_SIZE * sizeof(float), pOutputVector);
ãããã¡ãŒãäœæãããšããã³ã³ããã¹ãïŒç¹å®ã®ããã€ã¹ã§ã¯ãªãïŒããã®ããªã¥ãŒã ãããã³å¿ èŠã«å¿ããŠCL_MEM_COPY_HOST_PTRãã©ã°ã䜿çšããŠãäœææã«ãã®äžã«ã³ããŒãããããŒã¿ãžã®ãã€ã³ã¿ãŒã瀺ãããŸãã åè¿°ããããã«ãC ++ã©ãããŒã¯åç §ã«ãŠã³ãã䜿çšãããããçŽç²ãªC APIãšã¯ç°ãªããæåã§ãããã¡ãŒãåé€ããå¿ èŠã¯ãããŸããã
次ã«ãã³ãŒãããã¡ã€ã«ãOpenCLFile1.clãã«ä¿åãããŠããã«ãŒãã«ãäœæããå¿ èŠããããŸãã ãããè¡ãã«ã¯ããã¡ã€ã«ããããã¹ããèªã¿åããOpenCLããã°ã©ã ãäœæããŠã³ã³ãã€ã«ãããTestKernelããšããååã®ã«ãŒãã«ãååŸããŸããããã«ã€ããŠã¯æ¬¡ã®ããŒãã§èª¬æããŸãã
cl::Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1)); cl::Program program = cl::Program(context, source); program.build(contextDevices); cl::Kernel kernel(program, "TestKernel");
ã³ã³ãã€ã«æã«ã¯ãå®è¡ããäºå®ã®ããã€ã¹ãæå®ããå¿ èŠããããŸãããã®å Žåã¯ããã¹ãçšã«éžæããã1ã€ã®ããã€ã¹ã§ãããäžåºŠã«ãã¹ãŠæå®ã§ããŸãã ã³ã³ãã€ã«ãã©ã°ãæž¡ãããšãã§ããŸããããã®äŸã§ã¯æž¡ããŸããã
次ã«ãã«ãŒãã«ã«æž¡ãåŒæ°ãèšå®ããå¿ èŠããããŸãã CUDAãšã¯ç°ãªããåŒæ°ããšã«ç¹å¥ãªé¢æ°ïŒC ++ã©ãããŒã®å Žåã¯ã¡ãœããïŒãåŒã³åºããå¿ èŠã«å¿ããŠåŒæ°ã®ãµã€ãºãæå®ããå¿ èŠããããŸãã
int iArg = 0; kernel.setArg(iArg++, clmInputVector1); kernel.setArg(iArg++, clmInputVector2); kernel.setArg(iArg++, clmOutputVector); kernel.setArg(iArg++, DATA_SIZE);
ããã§ãæãéèŠãªããšãã€ãŸãã«ãŒãã«ã®èµ·åã«é²ã¿ãŸãã
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(DATA_SIZE), cl::NDRange(128));
å®éã«ã¯ãqueue.enqueueNDRangeKernelã¯ã«ãŒãã«éå§ã³ãã³ããã³ãã³ããã¥ãŒã«è¿œå ããåŠçããèŠçŽ ã®æ°ãšã°ã«ãŒãã®ãµã€ãºãèšå®ããŸãã ïŒå¥ã®èšäºã§ïŒã°ã«ãŒãã«ã€ããŠåå¥ã«èª¬æããŸãããããã§ã¯ããã¹ãŠã®èŠçŽ ãåžžã«ã°ã«ãŒãã«åå²ãããããã©ãŒãã³ã¹ãã°ã«ãŒãã®ãµã€ãºã«å€§ããäŸåãããšããäºå®ã®ã¿ã説æããŸãã ãã®å ŽåãèŠçŽ ã®æ°ã¯DATA_SIZEã§ãã°ã«ãŒããµã€ãºã¯128ã§ããã«ãŒãã«ã®å®è¡äžã«ãDATA_SIZEåïŒäžæãªã·ãŒã±ã³ã¹ã§ãå Žåã«ãã£ãŠã¯åæã«ïŒèµ·åãããèµ·åããããã³ã«ãã©ã®èŠçŽ ãåŠçããããã«é¢ããæ å ±ãéä¿¡ãããŸãã
enqueueNDRangeKernelã¯ãããã¯ããªããããã«ãŒãã«ã®èµ·ååŸãã«ãŒãã«ãçµäºãããŸã§åŸ æ©ããå¿ èŠããããŸãã
queue.finish();
å®éãfinishã¯2ã€ã®ã¿ã¹ã¯ãå®è¡ããŸãã
1ïŒãã¹ãŠã®ã³ãã³ããããã€ã¹ã«éä¿¡ããŸãïŒenqueueNDRangeKernelãå®è¡ãããšããã©ã€ããŒãã³ãã³ããåä¿¡ããŠââãã¥ãŒã«å ¥ããããšãä¿èšŒããŸãããããã€ã¹äžã§ã®å®è¡ãä¿èšŒãããã®ã§ã¯ãªããã«ãŒãã«ãå®éã«èµ·åãããŸã§ã«ããªãé·ãæéããããããšããããŸãïŒã
2ïŒãã¥ãŒå ã®ãã¹ãŠã®ããŒã ã®å®äºãåŸ ã¡ãŸãã
æåã®éšåã®ã¿ãå®è¡ãããå Žåããããã¯ãããŠããªãããã·ã¥ïŒclFlushïŒã³ãã³ãããããŸããããã©ã€ããŒã¯åŒ·å¶çã«ãã¥ãŒããã³ãã³ãã®å®è¡ãéå§ããŸãã
èšç®ãå®è¡ããåŸãè²»ãããæéãèšç®ãã次ã®ã³ãã³ãã䜿çšããŠèšç®çµæããã¹ãã«ã¢ããããŒãããŸãã
queue.enqueueReadBuffer(clmOutputVector, CL_TRUE, 0, DATA_SIZE * sizeof(float), pOutputVector);
2çªç®ã®åŒæ°ã«å¿ããŠãenqueueReadBufferã¯ãããã¯ããå Žåãšããªãå ŽåããããŸãã ç§ãã¡ã®å Žåããããã¯ããŠãããããfinishãåå¥ã«åŒã³åºãå¿ èŠã¯ãããŸããã æ§æã¯åçŽã§ããæåã®åŒæ°ã¯èªãå Žæã4çªç®ã®åŒæ°ã¯èªãéãæåŸã®åŒæ°ã¯èªãå Žæã§ãã ãŸããå ¥åãããã¡ã®å é ããã®ãªãã»ãããèšå®ãããã©ã¡ãŒã¿ããããŸããããã¯ããã¹ãäžã®OpenCLãããã¡ã«ã¢ãã¬ã¹æŒç®ã䜿çšã§ããªããããæåã«ããŒã¿ãèªã¿åãå¿ èŠããªãå Žåã«äœ¿çšããå¿ èŠããããŸãã
パート4-OpenCLカーネルコード
ãããŠããã§ãOpenCLã§ã³ãŒãã®èšè¿°ãéå§ããå¿ èŠããããŸãïŒã³ãŒããåŒã³åºãã®ã¯é£ããã®ã§...çããããŸã:)ïŒã OpenCLFile1.clã¯æ¬¡ã®ããã«ãªããŸãã
#include "MathCode.cpp"

// Kernel entry point: each work-item computes one output element from the
// matching elements of the two input vectors.
//   pInputVector1, pInputVector2 - input buffers in global memory
//   pOutputVectorHost            - output buffer in global memory
//   elementsNumber               - number of valid elements in the buffers
__kernel void TestKernel(
    __global const float* pInputVector1,
    __global const float* pInputVector2,
    __global float* pOutputVectorHost,
    int elementsNumber)
{
    // Index of this work-item along dimension 0 selects the element to process.
    int element = get_global_id(0);

    // Work-items whose index lies past the end of the data do nothing.
    if (element < elementsNumber)
    {
        pOutputVectorHost[element] = MathCalculations(pInputVector1[element], pInputVector2[element]);
    }
}
順番に:
ãŸããã³ãŒãã«MathCode.cppãã¡ã€ã«ãå«ããŸããMathCode.cppãã¡ã€ã«ã«ã¯ãæ°åŠé¢æ°ãå«ãŸããŠããŸããããã¯ã以åã«æ³šæãæãããšãæ±ãããã®ã§ããã¹ãã§ã®åŸæ¥ã®èšç®ã«äœ¿çšããããã®ãšåãã§ãã ã芧ã®ãšãããã³ãŒããã³ããŒããããšãããããæ°åŠã³ãŒãã§åããã¡ã€ã«ã䜿çšããŸãã
次ã«ã__ kernelããŒã¯ãŒãã§ããŒã¯ããã«ãŒãã«ãäœæããŸãã äžéšã®ã«ãŒãã«åŒæ°ã«ã¯ã__ globalããŒã¯ãŒããä»ããŠããŸããããã¯ããã¹ãã³ãŒãã§äœæããããã€ã¹ã®ã°ããŒãã«ã¡ã¢ãªå ã®ãããã¡ãŒã§ããããšã瀺ããŠããŸãã
カーネルコードで、処理する必要がある要素の数を取得します。
int iJob = get_global_id(0);
get_global_idãã©ã¡ãŒã¿ãŒã¯æ¬¡å ã瀺ããŸããåŠçãããèŠçŽ ã¯1ã2ããŸãã¯3次å ã®é åã«ãªãå¯èœæ§ãããããã§ãã
次に、境界条件を確認します。
if (iJob >= elementsNumber) return;
ããã¯ãåŠçããèŠçŽ ã®æ°ãåžžã«ã°ã«ãŒãã®ãµã€ãºã®åæ°ã§ããå¿ èŠããããããåŠçããæ°ãè¶ ããå Žåãããããã§ãã
ãããŠãã§ãã¯ããåŸãäž»ãªéšåãå®è¡ããŸãïŒèšç®ããã¹ããšãŸã£ããåãæ¹æ³ã§ïŒ
pOutputVectorHost[iJob] = MathCalculations(pInputVector1[iJob], pInputVector2[iJob]);
パート5-パフォーマンスのテストと測定
ããã§ã¯ãã¢ããªã±ãŒã·ã§ã³ãèµ·åããããã©ãŒãã³ã¹ãè©äŸ¡ããŠãããã€ãã®çµè«ãå°ãåºããŸãããã
2å°ã®ãã·ã³ã§ãã¹ããå®è¡ãããšãããèå³æ·±ãçµæãåŸãããŸããã
ããŒãããã¯ïŒCPUïŒ Intel®Coreâ¢i7-820QM ãGPUïŒ NVidia Quadro FX 2800M ïŒïŒ
Host: 959.256 ms CPU: 82.4163 ms (13.106X faster than host) GPU: 9.90836 ms (109.014X faster than host)
ãã¹ã¯ãããïŒCPUïŒ Intel®Coreâ¢i7-2600 ãGPUïŒ NVidia GeForce GTX 580 ïŒïŒ
Host: 699.031 ms CPU: 27.7833 ms (25.159X faster than host) GPU: 2.06257 ms (338.897X faster than host)
完全な結果
Device: Host Calculation time statistic: (20 runs) Med: 959.256 ms (1.12602X faster than host) Avg: 1080.15 ms Min: 933.554 ms Max: 1319.19 ms ------------------------------------------------- Device: Quadro FX 2800M Calculation time statistic: (200 runs) Med: 9.90836 ms (109.014X faster than host) Avg: 10.7231 ms Min: 9.82841 ms Max: 135.924 ms Errors: avgRelAbsDiff = 5.25777e-008 maxRelAbsDiff = 5.83678e-007 ------------------------------------------------- Device: Intel(R) Core(TM) i7 CPU Q 820 @ 1.73GHz Calculation time statistic: (200 runs) Med: 82.4163 ms (13.106X faster than host) Avg: 85.2226 ms Min: 79.4138 ms Max: 113.03 ms Errors: avgRelAbsDiff = 3.64332e-008 maxRelAbsDiff = 4.84797e-007
Device: Host Calculation time statistic: (20 runs) Med: 699.031 ms (0.999956X faster than host) Avg: 699.1 ms Min: 691.544 ms Max: 715.233 ms ------------------------------------------------- Device: GeForce GTX 580 Calculation time statistic: (200 runs) Med: 2.06257 ms (338.897X faster than host) Avg: 2.4 ms Min: 2.03873 ms Max: 82.0514 ms Errors: avgRelAbsDiff = 3.50006e-008 maxRelAbsDiff = 4.92271e-007 ------------------------------------------------- Device: Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz Calculation time statistic: (200 runs) Med: 27.7833 ms (25.159X faster than host) Avg: 27.49 ms Min: 27.0154 ms Max: 35.8386 ms Errors: avgRelAbsDiff = 3.64377e-008 maxRelAbsDiff = 4.89584e-007
ããã§ã¯ãçµæã®åæãå§ããŸããããçµæã¯éåžžã«å°è±¡çã§ãã ã©ãããããã®GPUã¯ãã¹ããããçŽ110åé«éã§ããããã¹ã¯ãããã§ã¯çŽ340åé«éã§ãããå°è±¡çãªçµæã§ãã ããªããç§ã«ã¹ãªãããæãå§ããŠããã®ãããªæ¯èŒã¯æ£ãããªããšèšã£ãŠå§ããåã«ãç§ã¯æ¬åœã«ããã€ãã®ããªãã¯ããããŸããããã以äžã§ã¯ãªããšèšããŸãã
ãŸããããã§ã¯ãããã€ã¹ãšã®éã§ããŒã¿ãã³ããŒããæéãèæ ®ããŠããŸããã äžæ¹ã§ãããã¯ééã£ãŠããŸãããã¹ãŠãã³ããŒããããšãèæ ®ãããšãããã»ã©ãããããã«èŠããªãããã§ãã äžæ¹ãã³ããŒã¯èšç®ãšåæã«å®è¡ã§ããŸããããŒã¿ãæ¢ã«ããã€ã¹ã«ããå Žåã¯ãã³ããŒããŸã£ããå®è¡ããå¿ èŠã¯ãããŸããã äžè¬ã«ããã¹ãŠã¯æ確ãªãã®ã§ã¯ãªããç¹å®ã®ã¿ã¹ã¯ã«äŸåããŸãã
次ã«ãæ°åŠã³ãŒããã©ã®ããã«èŠãããèŠããŠããŸããïŒ ãããèŠãªãã£ã人ã«ãšã£ãŠã¯ãåãããŒã¿ã«å¯Ÿããå€ãã®æ°åŠçãªæäœã§ãããåçŽãªã³ããŒã¢ã³ãããŒã¹ããšä¿æ°ã®æ°å€ã®çœ®ãæãã«ãã£ãŠå€æããŸããããæåã¯ããç°¡åã§ã1è¡ã ãã§ãããã¹ããéå§ããŸããããçµæã¯ããã»ã©åã°ãããããŸããã§ãããGPUã¯4ã5åé«éã§ããã ã©ãããŠããæããŸããïŒ ïŒä¿®èŸçãªè³ªåãããªãã¯èããããšã¯ã§ããŸãã:)ïŒã ãããŠããã¹ãŠãåçŽã§ãããã¡ã¢ãªã®ããã©ãŒãã³ã¹ã«åºããããŸããã åŸã§ãããæã«å ¥ããŠãã¡ã¢ãªãšããã»ããµã®ããã©ãŒãã³ã¹ã®é¢ä¿ã«é¢ããèšäºãæžãããšãé¡ã£ãŠããŸãããããã¯å¥ã®è©±ã§ãããã®èšäºã§ã¯ããã®ã«ãŒãã«ã§æŒç®ããã»ããµã®ããã©ãŒãã³ã¹ã®ã¯ãªãŒã³ãªãã¹ããåŸããšããäºå®ã«ã®ã¿èå³ããããŸãã
ãããã®2ã€ã®ç¹ãèãããšãGPUã¯CPUäžã®é䞊åã³ãŒããããçŽç²ãªç®è¡ã§å®éã«æ°çŸåé«éã§ãããšèšããŸããããã¯äžè¬ã«ãçè«çãªããã©ãŒãã³ã¹ã®éãã«å¯Ÿå¿ããŸãã ïŒããäžã€ã®åžæã¯ãå®æ°ãšå¥ã®èšäºã®çè«ãžã®å¯Ÿå¿ã枬å®ããããã«æã䌞ã°ãããšã§ãïŒã
ããããGPUã¯ããã«ãããèæ ®ãããã¹ãã®çµæãCPUãOpenCLã³ãŒããããªãéãå®è¡ããããšãããããŸãããæ£ç¢ºã«ã¯ãããã©ã«ãèšå®ã§MSVC10ã«ãã£ãŠã³ã³ãã€ã«ãããéåžžã®ã³ãŒãããã13åããã³25åé«éã§ãã ãããã©ããªã£ãŠããããã®æ°åãã©ãããæ¥ãã®ããç解ããŸãããã
äž¡æ¹ã®ããã»ããµã«ã¯4ã€ã®å®ã³ã¢ãš8ã€ã®ä»®æ³ã³ã¢ãå«ãŸããŠãããOpenCLã¯ãã¹ãŠã®ã³ã¢ã䜿çšããããã«äœãããŠããŸããã4Xãããã¯ããã«æ¹åãããŠããŸãã ãããŠããã§ãOpenCLã®å®è£ ã§èªåãã¯ãã«åã®ãµããŒããè¿œå ããIntelã«æè¬ããå¿ èŠããããŸãã ã³ãŒããå€æŽããããšãªããOpenCLã¯äœ¿çšå¯èœãªãã®ã«å¿ããŠSSEãŸãã¯AVXã䜿çšããŸãã 128ãããã®SSEããããAVXã256ãããã§åäœããããšãèãããšãããã©ãŒãã³ã¹ã¯ãããã16åãš32åã«äžããã¯ãã§ãã ããã¯çå®ã«è¿ãã§ããããŸã å®å šã«äžèŽããŠããããã§ã¯ãããŸããã ãããŠãTurboBoostã®ãããªæ¥œããããšãèŠããŠããå¿ èŠããããŸãã ãããã®ããã»ããµã¯1.73 GHz / 3.06 GHzïŒã©ãããããïŒãš3.4 GHz / 3.8 GHzïŒãã¹ã¯ãããïŒã®åšæ³¢æ°ã§åäœããŸãããå®éã«ã¯ã©ãããããããã»ããµã®åšæ³¢æ°ã¯1.73ãã2.8ã«é£ç¶çã«ãžã£ã³ãããéåžžã«åŒ·åã«å ç±ãããããïŒããã§ã¯ã湟æ²ããå·åŽã·ã¹ãã ã®ããã«ãã«ã«å€§ããªã«ã¡ãªãæããå¿ èŠããããŸãïŒã3.06GHzã®åšæ³¢æ°ãã¹ãã§ã¯ãå®éã«éèŠãªæéã¯ãããŸããã ããã«ãå®éã®çµæã¯åžžã«çè«çã«å¯èœãªãã®ãããå°ãªãããšãå¿ããŠã¯ãªããŸããïŒååãšããŠãã¹ã¯ãããã¯ããéãåäœããã¯ãã§ãïŒããèŠãŠãããããã«ãåãããŒããŠã§ã¢ã§ã»ãŒç¡æã§25åã®ããã©ãŒãã³ã¹åäžãåŸãããŸãã
おわりに
ãã®èšäºã®ç®æšã¯ãOpenCLã䜿çšããäœæ¥ã®è©³çŽ°ããã¹ãŠèª¬æããããšã§ã¯ãªãããã¹ãŠãããã»ã©è€éã§ã¯ãªãããšïŒããã§ã¯æ¢ã«ããã»ã©åçŽã§ã¯ãªãããšãæ¢ã«æžããŠããŸãïŒã瀺ãããšã§ãããçæ³çãªæ¡ä»¶ã§ã¯éåžžã«å°è±¡çãªããã©ãŒãã³ã¹ãåŸãããšãã§ããŸããåãããŒããŠã§ã¢äžã§ãããã«ããã¹ãŠã®ããã€ã¹ã«åãã³ãŒãã䜿çšã§ããŸãããã ãããããã¯ã»ãŒçæ³çãªæ¡ä»¶ã§ãããåžžã«ãããšã¯éããªãããšãå¿ããªãã§ãã ããã
PSïŒã³ãŒããããã£ãŠãå¥ã®ããŒããŠã§ã¢ã§ãã¹ãã確èªããã人ã®ããã«ããããžã§ã¯ãïŒããã³çµã¿ç«ãŠãããå®è¡å¯èœãã¡ã€ã«ïŒãgithubã«ãããŸããéå§ããã«ã¯ãããŒããŠã§ã¢ã®ã¡ãŒã«ãŒã®OpenCL SDKãå¿ èŠã«ãªãå ŽåããããŸãã
PS2ïŒèª°ããIvy Bridgeãæã£ãŠããå Žåãçµ±åããããããªã³ã¢ã®ãã¹ããèŠãã®ã¯èå³æ·±ãã§ããããå®éãOpenCL SDKã®ææ°ããŒãžã§ã³ã§ã¯ãIntelã¯IGPãžã®ã¢ã¯ã»ã¹ããªãŒãã³ããŸããããææ°äžä»£ã®ããã»ããµã®ã¿ã察象ã«ããŠããŸãããæå ã«ã¯ãããŸãããAMDã®çµæã¯èå³æ·±ããã®ã§ãã