{"id":130,"date":"2017-10-25T13:10:08","date_gmt":"2017-10-25T04:10:08","guid":{"rendered":"http:\/\/www.sciement.com\/tech-blog\/?p=130"},"modified":"2017-12-21T17:40:46","modified_gmt":"2017-12-21T08:40:46","slug":"various_binarizations2","status":"publish","type":"post","link":"https:\/\/www.sciement.com\/tech-blog\/c\/various_binarizations2\/","title":{"rendered":"[C++][CUDA][Thrust]\u7d9a\u30fb\u5358\u7d14\u306a\u4e8c\u5024\u5316\u51e6\u7406\u30924\u901a\u308a\u306e\u4e26\u5217\u5316\u624b\u6cd5\u3067\u6bd4\u8f03\u3057\u3066\u307f\u308b"},"content":{"rendered":"<p>\u524d\u56de\u3001<a href=\"http:\/\/www.sciement.com\/tech-blog\/c\/various_binarizations\/\" rel=\"noopener\" target=\"_blank\"><strong>\u5358\u7d14\u306a\u4e8c\u5024\u5316\u51e6\u7406\u30924\u901a\u308a\u306e\u4e26\u5217\u5316\u624b\u6cd5\u3067\u6bd4\u8f03<\/strong><\/a>\u3057\u307e\u3057\u305f\u3002<br \/>\n\u4eca\u56de\u306fCUDA\u95a2\u9023\u3067\u3082\u3046\u5c11\u3057\u691c\u8a3c\u3057\u3066\u307f\u307e\u3059\u3002<br \/>\nCUDA\u306e\u30d7\u30ed\u30b0\u30e9\u30df\u30f3\u30b0\u3067\u6700\u521d\u306b\u6c17\u306b\u3059\u308b\u3068\u3053\u308d\u3068\u8a00\u3048\u3070\u3001\u3084\u306f\u308acudaMalloc, cudaFree\u306e\u30e1\u30e2\u30ea\u7ba1\u7406\u304b\u3068\u601d\u3044\u307e\u3059\u3002\u51fa\u6765\u308b\u3053\u3068\u306a\u3089\u3053\u306e\u8fba\u308a\u306f\u4f55\u3082\u8003\u3048\u306a\u304f\u3066\u3082\u52dd\u624b\u306b\u3084\u3063\u3066\u304f\u308c\u308b\u3088\u3046\u306b\u3057\u3066\u6b32\u3057\u3044\u3082\u306e\u3067\u3059\u3002<br \/>\n\u3068\u3044\u3046\u308f\u3051\u3067\u4eca\u56de\u306fNVIDIA\u304c\u958b\u767a\u3057\u3066\u3044\u308b\u4e26\u5217\u30a2\u30eb\u30b4\u30ea\u30ba\u30e0\u30e9\u30a4\u30d6\u30e9\u30ea\u3067\u3042\u308b<a href=\"https:\/\/thrust.github.io\/\" rel=\"noopener\" target=\"_blank\"><strong>Thrust<\/strong><\/a>\u3092\u4f7f\u3063\u3066\u524d\u56de\u306e.cu\u30b3\u30fc\u30c9\u3092\u66f8\u304d\u76f4\u3057\u3066\u307f\u307e\u3057\u305f\u3002<\/p>\n<p>\u53c2\u8003\u8a18\u4e8b\uff1a<br \/>\n<a href=\"http:\/\/www.sie.es\/wp-content\/uploads\/2015\/12\/Intro-to-Thrust-Parallel-Algorithms-Library.pdf\" rel=\"noopener\" target=\"_blank\"><strong>An Introduction to the Thrust Parallel Algorithms Library<\/strong><\/a><\/p>\n<p>\u3044\u304f\u3064\u304b\u3068\u3066\u3082\u6c17\u306b\u306a\u308b\u90e8\u5206\u304c\u3042\u308a\u307e\u3057\u305f\u306e\u3067\u3001\u4ee5\u4e0b\u3001\u305d\u306e\u30e1\u30e2\u306b\u306a\u308a\u307e\u3059\u3002<\/p>\n<p>\u66f8\u304d\u76f4\u3057\u305f\u30b3\u30fc\u30c9\u306f\u4ee5\u4e0b\u306e\u901a\u308a\u3067\u3059\u3002<\/p>\n<pre class=\"lang:c++ mark:5-6,42-44,53,64,88-89 decode:true \" title=\"CUDA_test.cu\" >\/\/ Copyright SCIEMENT, Inc.\r\n\/\/ by Hirofumi Seo, M.D., CEO &amp; President\r\n\r\n#include \"CUDA_test.h\"\r\n#include \"thrust\/device_vector.h\"\r\n#include \"thrust\/copy.h\"\r\n#include &lt;chrono&gt;\r\n#include &lt;iostream&gt;\r\n#include &lt;stdio.h&gt;\r\n\r\n__global__ void Kernel_make_bit_vertices(const int threshold, const int* voxels, int* bit_vertices, const int voxels_size)\r\n{\r\n  int index = blockIdx.x * blockDim.x + threadIdx.x;\r\n  const int stride = blockDim.x * gridDim.x;\r\n  for (int i = index; i &lt; voxels_size; i += stride) {\r\n    bit_vertices[i] = (voxels[i] &lt; threshold) ? 1 : 0;\r\n  }\r\n}\r\n\r\ncudaError_t Cuda_make_bit_vertices(const int threshold, const std::vector&lt;int&gt;&amp; voxels, std::vector&lt;int&gt;* bit_vertices)\r\n{\r\n  cudaError_t cuda_status;\r\n\r\n  auto start = std::chrono::system_clock::now();\r\n\r\n  \/\/ Choose which GPU to run on, change this on a multi-GPU system.\r\n  cuda_status = cudaSetDevice(0);\r\n  if (cuda_status != cudaSuccess) {\r\n    fprintf(stderr, \"cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\");\r\n    return cuda_status;\r\n  }\r\n\r\n  auto end = std::chrono::system_clock::now();\r\n  auto execution_time = end - start;\r\n  std::cout &lt;&lt; \"Initialize: \" &lt;&lt; std::chrono::duration_cast&lt;std::chrono::milliseconds&gt;(execution_time).count() &lt;&lt; \" msec.\" &lt;&lt; std::endl;\r\n\r\n  const int voxels_size = voxels.size();\r\n  bit_vertices-&gt;resize(voxels_size);\r\n\r\n  start = std::chrono::system_clock::now();\r\n\r\n  \/\/thrust::device_vector&lt;int&gt; dev_voxels_vector(voxels);\r\n  thrust::device_vector&lt;int&gt; dev_voxels_vector(voxels_size);\r\n  thrust::device_vector&lt;int&gt; dev_bit_vertices_vector(voxels_size);\r\n\r\n  end = std::chrono::system_clock::now();\r\n  execution_time = end - start;\r\n  std::cout &lt;&lt; \"GPU Malloc: \" &lt;&lt; std::chrono::duration_cast&lt;std::chrono::milliseconds&gt;(execution_time).count() &lt;&lt; \" msec.\" &lt;&lt; std::endl;\r\n  \/\/std::cout &lt;&lt; \"GPU Malloc and CPU -&gt; GPU: \" &lt;&lt; std::chrono::duration_cast&lt;std::chrono::milliseconds&gt;(execution_time).count() &lt;&lt; \" msec.\" &lt;&lt; std::endl;\r\n\r\n  start = std::chrono::system_clock::now();\r\n\r\n  cudaMemcpy(thrust::raw_pointer_cast(dev_voxels_vector.data()), voxels.data(), voxels_size * sizeof(int), cudaMemcpyHostToDevice);\r\n\r\n  end = std::chrono::system_clock::now();\r\n  execution_time = end - start;\r\n  std::cout &lt;&lt; \"CPU -&gt; GPU: \" &lt;&lt; std::chrono::duration_cast&lt;std::chrono::milliseconds&gt;(execution_time).count() &lt;&lt; \" msec.\" &lt;&lt; std::endl;\r\n\r\n  start = std::chrono::system_clock::now();\r\n\r\n  \/\/ Launch a kernel on the GPU with one thread for each element.\r\n  const int block_size = 256; \/\/ MAX: 1024\r\n  const int num_blocks = (voxels_size + block_size - 1) \/ block_size;\r\n  Kernel_make_bit_vertices &lt;&lt;&lt;num_blocks, block_size &gt;&gt;&gt;(threshold, thrust::raw_pointer_cast(dev_voxels_vector.data()), thrust::raw_pointer_cast(dev_bit_vertices_vector.data()), voxels_size);\r\n\r\n  \/\/ Check for any errors launching the kernel\r\n  cuda_status = cudaGetLastError();\r\n  if (cuda_status != cudaSuccess) {\r\n    fprintf(stderr, \"Kernel_make_bit_vertices launch failed: %s\\n\", cudaGetErrorString(cuda_status));\r\n    return cuda_status;\r\n  }\r\n\r\n  \/\/ cudaDeviceSynchronize waits for the kernel to finish, and returns\r\n  \/\/ any errors encountered during the launch.\r\n  cuda_status = cudaDeviceSynchronize();\r\n  if (cuda_status != cudaSuccess) {\r\n    fprintf(stderr, \"cudaDeviceSynchronize returned error code %d after launching Kernel_make_bit_vertices!\\n\", cuda_status);\r\n    return cuda_status;\r\n  }\r\n\r\n  end = std::chrono::system_clock::now();\r\n  execution_time = end - start;\r\n  std::cout &lt;&lt; \"GPU: \" &lt;&lt; std::chrono::duration_cast&lt;std::chrono::microseconds&gt;(execution_time).count() \/ 1000.0 &lt;&lt; \" msec.\" &lt;&lt; std::endl;\r\n\r\n  start = std::chrono::system_clock::now();\r\n\r\n  \/\/ Copy output vector from GPU buffer to host memory.\r\n  \/\/thrust::copy(dev_bit_vertices_vector.begin(), dev_bit_vertices_vector.end(), bit_vertices-&gt;begin());\r\n  cudaMemcpy(bit_vertices-&gt;data(), thrust::raw_pointer_cast(dev_bit_vertices_vector.data()), voxels_size * sizeof(int), cudaMemcpyDeviceToHost);\r\n  \r\n  end = std::chrono::system_clock::now();\r\n  execution_time = end - start;\r\n  std::cout &lt;&lt; \"GPU -&gt; CPU: \" &lt;&lt; std::chrono::duration_cast&lt;std::chrono::milliseconds&gt;(execution_time).count() &lt;&lt; \" msec.\" &lt;&lt; std::endl;\r\n\r\n  return cuda_status;\r\n}<\/pre>\n<p>cudaMalloc, cudaFree\u304c\u4e0d\u8981\u306b\u306a\u308a\u307e\u3059\u3002<br \/>\n\u6700\u521d\u306f\u5143\u3005\u306e\u30b3\u30fc\u30c9\u306e\u6700\u5f8c\u306e\u90e8\u5206\u306b\u3042\u3063\u305fcudaFree\u306e\u3068\u3053\u308d\u3092\u30b3\u30e1\u30f3\u30c8\u30a2\u30a6\u30c8\u3057\u3066\u3001<\/p>\n<pre class=\"nums:false nums-toggle:false lang:c++ decode:true   \" >Error:\r\n  \/\/cudaFree(dev_voxels);\r\n  \/\/cudaFree(dev_bit_vertices);\r\n \r\n  return cudaStatus;<\/pre>\n<p>\u3068\u3057\u3066<\/p>\n<pre class=\"nums:false nums-toggle:false lang:c++ decode:true   \" >  if (cuda_status != cudaSuccess) {\r\n    goto Error;\r\n  }\r\n<\/pre>\n<p>\u306e\u3088\u3046\u306b\u3057\u3066\u3044\u305f\u306e\u3067\u3059\u304c\u3001\u4f55\u6545\u304b\u30b3\u30f3\u30d1\u30a4\u30eb\u6642\u306b<\/p>\n<blockquote><p>initialization of &#8216;dev_bit_vertices_vector&#8217; is skipped by &#8216;goto Error&#8217;<\/p><\/blockquote>\n<p>\u3068\u3044\u3046\u30a8\u30e9\u30fc\u304c\u51fa\u3066\u3057\u307e\u3044\u307e\u3057\u305f\uff08\u3053\u306e\u539f\u56e0\u306f\u5168\u304f\u308f\u304b\u308a\u307e\u305b\u3093\u3067\u3057\u305f\u2026\uff09\u3002<br \/>\ncudaFree\u304c\u5fc5\u8981\u306a\u304f\u306a\u308a\u307e\u3057\u305f\u306e\u3067\u3001Error:\u3092\u6d88\u3057\u3066return\u3067\u8fd4\u3059\u3088\u3046\u306b\u3057\u305f\u3089\u30a8\u30e9\u30fc\u306f\u6d88\u3048\u305f\u306e\u3067\u3059\u304c\u3001\u5b9f\u884c\u6642\u3067\u306f\u306a\u304f\u30b3\u30f3\u30d1\u30a4\u30eb\u6642\u306b\u3053\u306e\u30a8\u30e9\u30fc\u304c\u51fa\u3066\u3057\u307e\u3063\u305f\u7406\u7531\u306f\u308f\u304b\u308a\u307e\u305b\u3093\u3067\u3057\u305f\u2026\u3002<\/p>\n<p>\u305b\u3063\u304b\u304fThrust\u3092\u4f7f\u3063\u3066\u3044\u308b\u306e\u3067\u3001<br \/>\n\u30fb<a href=\"http:\/\/thrust.github.io\/doc\/classthrust_1_1device__vector.html#ae2c21c4f22cb44d2e41e2a4c9d3409c9\" rel=\"noopener\" target=\"_blank\"><strong>thrust::device_vector = std::vector<\/strong><\/a>\u3067\u306edevice_vector\u306e\u521d\u671f\u5316<br \/>\n\u30fb<a href=\"https:\/\/thrust.github.io\/doc\/group__copying.html#ga24ccfaaa706a9163ec5117758fdb71b9\" rel=\"noopener\" target=\"_blank\"><strong>thrust::copy<\/strong><\/a>\u3067device_vector\u304b\u3089std::vector\u3078\u306e\u30c7\u30fc\u30bf\u306e\u66f8\u304d\u8fbc\u307f<br \/>\n\u3092\u884c\u3063\u3066\u307f\u305f\u3068\u3053\u308d\u3001<strong>\u4fe1\u3058\u3089\u308c\u306a\u3044\u304f\u3089\u3044\u901f\u5ea6\u304c\u9045\u304f\u306a\u308a\u307e\u3057\u305f\u2026<\/strong>\u3002<\/p>\n<p><strong>Thurst\u3092\u4f7f\u3063\u3066\u3044\u308b\u306e\u306b\u6bce\u56deraw pointer\u3092\u53d6\u5f97\u3057\u3066cudaMemcpy\u3092\u4f7f\u3046\u307b\u3046\u304c\u5727\u5012\u7684\u306bCPU, GPU\u9593\u306e\u30c7\u30fc\u30bf\u51e6\u7406\u306f\u901f\u3044<\/strong>\u3088\u3046\u3067\u3059\u3002<\/p>\n<p>\u4ee5\u4e0b\u3001\u5b9f\u884c\u6642\u9593\u306e\u6bd4\u8f03\u3067\u3059\u3002\u51fa\u529b\u7d50\u679c\u306b\u5c11\u3057\u5f8c\u4ed8\u3051\u3067\u8aac\u660e\u3092\u52a0\u3048\u3066\u3044\u307e\u3059\u3002<br \/>\n\u3082\u3061\u308d\u3093\u3001\u6570\u5b57\u306f\u3044\u3058\u3063\u3066\u3044\u307e\u305b\u3093\u3002<\/p>\n<p><strong><\/p>\n<table style=\"border-style: none;\" border=\"1\">\n<tbody>\n<td style=\"border-style: none;\">\nCUDA\uff08\u524d\u56de\u306e\u3082\u306e\uff09:<br \/>\nInitialize: 170 msec.<br \/>\nGPU Malloc: 99 msec.<br \/>\nCPU -> GPU: 9 msec.<br \/>\nGPU: 0.622 msec.<br \/>\nGPU -> CPU: 9 msec.\n<\/td>\n<td style=\"border-style: none;\">\nThrust\uff08<font color=\"red\">\u975e<\/font>cudaMemcpy\u7248\uff09\uff1a<br \/>\nInitialize: 140 msec.<br \/>\nThrust Malloc: <font color=\"red\">167<\/font> msec.<br \/>\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;+ CPU -> GPU<br \/>\nGPU: 0.605 msec.<br \/>\nGPU -> CPU: <font color=\"red\">52<\/font> msec.\n<\/td>\n<td style=\"border-style: none;\">\nThrust\uff08cudaMemcpy\u7248\uff09\uff1a<br \/>\nInitialize: 139 msec.<br \/>\nThrust Malloc: 108 msec.<br \/>\nCPU -> GPU: 9 msec.<br \/>\nGPU: 0.583 msec.<br \/>\nGPU -> CPU: 8 msec.\n<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p><\/strong><\/p>\n<p><strong>\u30c7\u30fc\u30bf\u8ee2\u9001\u6642\u9593\u304c6\u500d\u8fd1\u304f\u3082\u5909\u308f\u3063\u3066\u3057\u307e\u3063\u3066\u3044\u307e\u3059<\/strong>\u306d\u2026\u3002\u306a\u3093\u3068\u3082\u6b8b\u5ff5\u306a\u7d50\u679c\u3067\u3059\u3002<br \/>\n\u3068\u306f\u8a00\u3048\u3001<strong>cudaMemcpy\u3092\u4f7f\u3044\u3055\u3048\u3059\u308c\u3070Thrust\u7248\u3067\u3082\u5143\u3005\u306eCUDA\u7248\u3068\u540c\u3058\u30d1\u30d5\u30a9\u30fc\u30de\u30f3\u30b9\u304c\u5f97\u3089\u308c\u3001\u4e14\u3064\u30e1\u30e2\u30ea\u89e3\u653e\u3057\u5fd8\u308c\u3092\u6c17\u306b\u3057\u306a\u304f\u3066\u826f\u304f\u306a\u308b<\/strong>\u306e\u306f\u3068\u3066\u3082\u5927\u304d\u3044\u3067\u3059\u306d\u3002<br \/>\nraw pointer\u3092\u53d6\u5f97\u3059\u308c\u3070\u65e2\u5b58\u306e\u30ab\u30fc\u30cd\u30eb\u95a2\u6570\u3082\u4f7f\u3048\u308b\u308f\u3051\u3067\u3059\u3057\u3001\u3055\u3089\u306b<strong>Thrust\u306e\u5404\u7a2e\u30a2\u30eb\u30b4\u30ea\u30ba\u30e0\u3092\u624b\u8efd\u306b\u4f7f\u3048\u308b<\/strong>\u306e\u3082\u5b09\u3057\u3044\u9650\u308a\u3067\u3059\u3002<\/p>\n<hr \/>\n<p>\u203b\u672c\u8a18\u4e8b\u5185\u5bb9\u306f\u3001\u56fd\u7acb\u7814\u7a76\u958b\u767a\u6cd5\u4eba \u65e5\u672c\u533b\u7642\u7814\u7a76\u958b\u767a\u6a5f\u69cb\uff08AMED\uff09\u306e<a href=\"https:\/\/www.amed.go.jp\/koubo\/02\/01\/0201C_00124.html\" rel=\"noopener\" target=\"_blank\">\u5e73\u621029\u5e74\u5ea6 \u300c\u672a\u6765\u533b\u7642\u3092\u5b9f\u73fe\u3059\u308b\u533b\u7642\u6a5f\u5668\u30fb\u30b7\u30b9\u30c6\u30e0\u7814\u7a76\u958b\u767a\u4e8b\u696d\u300e\u8853\u4e2d\u306e\u8fc5\u901f\u306a\u5224\u65ad\u30fb\u6c7a\u5b9a\u3092\u652f\u63f4\u3059\u308b\u305f\u3081\u306e\u8a3a\u65ad\u652f\u63f4\u6a5f\u5668\u30fb\u30b7\u30b9\u30c6\u30e0\u958b\u767a\u300f\u300d\u63a1\u629e\u8ab2\u984c<\/a>\u3067\u3042\u308b\u300c\u8853\u524d\u3068\u8853\u4e2d\u3092\u3064\u306a\u3050\u30b9\u30de\u30fc\u30c8\u624b\u8853\u30ac\u30a4\u30c9\u30bd\u30d5\u30c8\u30a6\u30a7\u30a2\u306e\u958b\u767a\u300d\uff08\u4ee3\u8868\u6a5f\u95a2\u540d\uff1a\u6771\u4eac\u5927\u5b66\u3001\u7814\u7a76\u958b\u767a\u4ee3\u8868\u8005\u540d\uff1a\u9f4a\u85e4\u5ef6\u4eba\uff09\u306b\u3001\u6771\u4eac\u5927\u5b66\u5927\u5b66\u9662\u60c5\u5831\u7406\u5de5\u5b66\u7cfb\u7814\u7a76\u79d1\u306e\u5b66\u8853\u652f\u63f4\u5c02\u9580\u8077\u54e1\u3068\u3057\u3066\u53c2\u753b\u3057\u3066\u3044\u308b\u702c\u5c3e\u62e1\u53f2\u304c\u3001\u7814\u7a76\u958b\u767a\u3068\u3057\u3066\u884c\u3063\u3066\u3044\u308b\u3082\u306e\u3084\u305d\u306e\u6210\u679c\u3092\u4e00\u90e8\u542b\u3093\u3067\u3044\u307e\u3059\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u524d\u56de\u3001\u5358\u7d14\u306a\u4e8c\u5024\u5316\u51e6\u7406\u30924\u901a\u308a\u306e\u4e26\u5217\u5316\u624b\u6cd5\u3067\u6bd4\u8f03\u3057\u307e\u3057\u305f\u3002 \u4eca\u56de\u306fCUDA\u95a2\u9023\u3067\u3082\u3046\u5c11\u3057\u691c\u8a3c\u3057\u3066\u307f\u307e\u3059\u3002 CUDA\u306e\u30d7\u30ed\u30b0\u30e9\u30df\u30f3\u30b0\u3067\u6700\u521d\u306b\u6c17\u306b\u3059\u308b\u3068\u3053\u308d\u3068\u8a00\u3048\u3070\u3001\u3084\u306f\u308acudaMalloc, cudaFree\u306e\u30e1\u30e2\u30ea\u7ba1\u7406 &#8230;<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[4,5,9],"tags":[],"class_list":["post-130","post","type-post","status-publish","format-standard","hentry","category-c","category-cuda","category-thrust"],"_links":{"self":[{"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/posts\/130","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/comments?post=130"}],"version-history":[{"count":16,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/posts\/130\/revisions"}],"predecessor-version":[{"id":191,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/posts\/130\/revisions\/191"}],"wp:attachment":[{"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/media?parent=130"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/categories?post=130"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/tags?post=130"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}