{"id":317,"date":"2019-11-12T02:00:03","date_gmt":"2019-11-11T17:00:03","guid":{"rendered":"http:\/\/www.sciement.com\/tech-blog\/?p=317"},"modified":"2019-11-12T02:03:18","modified_gmt":"2019-11-11T17:03:18","slug":"cuda-custom-atomicmin","status":"publish","type":"post","link":"https:\/\/www.sciement.com\/tech-blog\/cuda\/cuda-custom-atomicmin\/","title":{"rendered":"[CUDA]\u30ab\u30b9\u30bf\u30e0atomicMin\u3067\u6700\u5c0f\u5024\u3068\u305d\u306e\u3068\u304d\u306eindex\u3068\u3092\u540c\u6642\u306b\u53d6\u5f97\u3059\u308b"},"content":{"rendered":"\n<p>\u5024\u3084\u69cb\u9020\u4f53\u306e\u914d\u5217\u3092\u5165\u529b\u3068\u3057\u3066\u3001\u305d\u308c\u305e\u308c\u306e\u5165\u529b\u5024\u306b\u5bfe\u3057\u3066\u4f55\u3089\u304b\u306e\u8a08\u7b97\u5f0f\u306a\u3069\u3067\u4f55\u304b\u3057\u3089\u306e\u5024\u3092\u6c42\u3081\u3001\u305d\u306e\u5024\u306e\u6700\u5c0f\u5024\uff08\u307e\u305f\u306f\u6700\u5927\u5024\uff09\u3068\u3001\u305d\u306e\u3068\u304d\u306e\u914d\u5217\u306eindex\u3068\u3092\u30bb\u30c3\u30c8\u3067\u6b32\u3057\u3044\u3068\u304d\u304c\u3042\u308a\u307e\u3059\u3002<br>\u4f8b\u3048\u3070\u3001<br><code>std::vector values  = { 3, 5, 2, 7, 1, 9 };<\/code><br> \u304c\u5165\u529b\u5024\u3067\u3001\u3053\u306e\u5165\u529b\u5024\u306e\u6700\u5c0f\u5024\u3068\u305d\u306e\u3068\u304d\u306eindex\u304c\u6b32\u3057\u3044\u3068\u304d\u3001 <br><code>min_value = 1;<br>index = 4;<\/code><br>\u3067\u3059\u3002<\/p>\n\n\n\n<p>\u3053\u308c\u3092CUDA\u4e0a\u3067\u3084\u308a\u305f\u3044\u3068\u304d\u304c\u3042\u308a\u307e\u3059\u3002\u3082\u3061\u308d\u3093\u3001\u5165\u529b\u914d\u5217\u306e\u6700\u5c0f\u5024\u3092\u6c42\u3081\u308b\u3060\u3051\u3067\u3042\u308c\u3070CPU\u3067\u3084\u3063\u305f\u307b\u3046\u304c\u666e\u901a\u306f\u901f\u3044\u305f\u3081\u3001\u5b9f\u969b\u306b\u306f\u3001\u5165\u529b\u5024\u3084\u5165\u529b\u69cb\u9020\u4f53\u5024\u304b\u3089CUDA\u30ab\u30fc\u30cd\u30eb\u4e0a\u3067\u4f55\u304b\u3057\u3089\u306e\u8a08\u7b97\u3092\u884c\u3044\u3001\u6709\u52b9\u306a\u6700\u5c0f\u5024\u306e\u5019\u88dc\u304c\u898b\u3064\u304b\u3063\u305f\u30b9\u30ec\u30c3\u30c9\u306b\u5bfe\u3057\u3066\u6700\u5c0f\u5024\u304b\u3069\u3046\u304b\u3092\u5224\u5b9a\u3059\u308b\u3001\u3068\u3044\u3046\u4f7f\u3044\u65b9\u306b\u306a\u308b\u304b\u3068\u601d\u3044\u307e\u3059\u304c\u3001\u5358\u7d14\u5316\u306e\u305f\u3081\u3053\u3053\u3067\u306f\u5358\u306b\u5165\u529b\u5024\u304c\u6700\u5c0f\u5024\u5019\u88dc\u3060\u3068\u4eee\u5b9a\u3057\u3066\u8a71\u3092\u9032\u3081\u307e\u3059\u3002<\/p>\n\n\n\n<p>CUDA\u3067\u306f\u8907\u6570\u30b9\u30ec\u30c3\u30c9\u304c\u540c\u4e00\u30e1\u30e2\u30ea\u306b\u540c\u6642\u306b\u66f8\u304d\u8fbc\u3080\u3053\u3068\u3092\u9632\u3050\u305f\u3081\u306eatomic\u95a2\u6570\u304c\u7528\u610f\u3055\u308c\u3066\u304a\u308a\u3001\u6700\u5c0f\u5024\u3092\u6c42\u3081\u308b\u305f\u3081\u306eatomicMin\u3082\u7528\u610f\u3055\u308c\u3066\u3044\u308b\u306e\u3067\u3059\u304c\u3001\u8907\u6570\u306e\u5024\u3092\u540c\u6642\u306bthread safe\u306b\u66f8\u304d\u63db\u3048\u308b\u3053\u3068\u306f\u51fa\u6765\u305a\u300116bit, 32bit, \u3042\u308b\u3044\u306f64bit\u306e\u5358\u4e00\u306e\u5909\u6570\u3092\u66f8\u304d\u63db\u3048\u308b\u3053\u3068\u3057\u304b\u3067\u304d\u307e\u305b\u3093\u3002<br>\u8907\u6570\u884c\u306b\u6e21\u3063\u3066mutex\u7684\u306a\u3053\u3068\u3092\u884c\u3063\u305f\u308a\u3001\u8907\u6570\u306e\u5909\u6570\u3092\u6301\u3064\u69cb\u9020\u4f53\u306b\u5bfe\u3057\u3066thread safe\u306b\u5024\u3092\u66f8\u304d\u63db\u3048\u308b\u3053\u3068\u306f\u57fa\u672c\u7684\u306b\u306f\u51fa\u6765\u307e\u305b\u3093\uff08\u2026\u305f\u3076\u3093\uff09\u3002<\/p>\n\n\n\n<p>\u3067\u3059\u304c\u3001<strong>\u6700\u5c0f\u5024\uff08\u307e\u305f\u306f\u6700\u5927\u5024\uff09\u3068\u305d\u306e\u3068\u304d\u306eindex\u304c\u3069\u3061\u3089\u308232bit\u4ee5\u4e0b\u306e\u5834\u5408\u306b\u306f\u3001\u5171\u7528\u4f53\uff08union\uff09\u3092\u4f7f\u3063\u3066\u7121\u7406\u3084\u308a64bit\u5909\u6570\u3068\u3057\u3066\u89e3\u91c8\u3059\u308b\u3053\u3068\u3067\u30012\u3064\u306e\u5024\u3092thread safe\u306b\u66f8\u304d\u63db\u3048\u3089\u308c\u308b<\/strong>\u3068\u3044\u3046stack overflow\u306e\u8a18\u4e8b\u3092\u898b\u3064\u3051\u307e\u3057\u305f\u3002<\/p>\n\n\n\n<p><a rel=\"noreferrer noopener\" aria-label=\"cuda - How can I implement a custom atomic function involving several variables? - Stack Overflow (opens in a new tab)\" href=\"https:\/\/stackoverflow.com\/questions\/17411493\/how-can-i-implement-a-custom-atomic-function-involving-several-variables\/17414007#17414007\" target=\"_blank\">cuda &#8211; How can I implement a custom atomic function involving several variables? &#8211; Stack Overflow<\/a><\/p>\n\n\n\n<p>\u3044\u307e\u307e\u3067\u5171\u7528\u4f53\u306a\u3093\u3066\u4f7f\u3063\u305f\u3053\u3068\u304c\u306a\u304f\u3001\u3068\u3044\u3046\u304b\u4f7f\u3044\u3069\u3053\u308d\u3092\u77e5\u3089\u306a\u304b\u3063\u305f\u306e\u3067\u3059\u304c\u3001\u307e\u3055\u304b\u3053\u3093\u306a\u3068\u3053\u308d\u3067\u4f7f\u3048\u308b\u306a\u3093\u3066\u9a5a\u304d\u3067\u3059\u3002\u5929\u624d\u7684\u306a\u767a\u60f3\u3067\u306f\u306a\u3044\u304b\u3068\u601d\u3044\u307e\u3057\u305f\u3002<br>\u30ea\u30f3\u30af\u5148\u306e\u30b3\u30fc\u30c9\u3092C++\u3068CUDA thrust\u3092\u4f7f\u3044\u3001\u305d\u306e\u4ed6\u82e5\u5e72\u306erefactoring\u3092\u884c\u3063\u305f\u30b3\u30fc\u30c9\u304c\u3053\u3061\u3089\u3067\u3059\u3002<\/p>\n\n\n\n<pre class=\"lang:default mark:16-47,51-57 decode:true  \" title=\"Kernel_GetMinValueAndIndexOriginal.cu\">\/* Copyright Hirofumi Seo, M.D. *\/\n\n#include <random>\n#include <limits>\n#include <iostream>\n#include <chrono>\n#include \"cuda_runtime.h\"\n#include \"device_launch_parameters.h\"\n#include \"thrust\/host_vector.h\"\n#include \"thrust\/device_vector.h\"\n\nconstexpr int kNumTotal = 50000;\nconstexpr int kBlockSize = 256;\nconstexpr int kNumBlocks = (kNumTotal + kBlockSize - 1) \/ kBlockSize;\n\nunion ValueAndIndex{\npublic:\n  unsigned long long int ulong;    \/\/ for atomic update\n \n  __host__ __device__ ValueAndIndex() {}\n  __host__ __device__ ValueAndIndex(const float value, const int index) {\n    Set(value, index);\n  }\n\n  __host__ __device__ void Set(const float value, const int index) {\n    SetValue(value);\n    SetIndex(index);\n  }\n\n  __host__ __device__ float GetValue() const {\n    return floats[0];\n  }\n  __host__ __device__ void SetValue(const float value) {\n    floats[0] = value;\n  }\n\n  __host__ __device__ int GetIndex() const {\n    return ints[1];\n  }\n  __host__ __device__ void SetIndex(const int index) {\n    ints[1] = index;\n  }\n\nprivate:\n  float floats[2];                 \/\/ floats[0] = lowest value\n  int ints[2];                     \/\/ ints[1] = the index of the lowest value\n};\n\n\/* ------------original version------------ *\/\n\/\/ https:\/\/stackoverflow.com\/questions\/17411493\/how-can-i-implement-a-custom-atomic-function-involving-several-variables\/17414007#17414007\n__device__ ValueAndIndex AtomicMinValueAndIndexOriginal(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {\n  ValueAndIndex old = *min_value_and_index;\n  while (value_and_index.GetValue() < old.GetValue()) {\n    old.ulong = atomicCAS(&#038;(min_value_and_index->ulong), old.ulong, value_and_index.ulong);\n  }\n  return old;\n}\n\n__global__ void Kernel_GetMinValueAndIndexOriginal(const float* values, ValueAndIndex* min_value_and_index) {\n  const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;\n  const int stride = blockDim.x * gridDim.x;\n  for (int index = initial_index; index < kNumTotal; index += stride) {\n    AtomicMinValueAndIndexOriginal(ValueAndIndex(values[index], index), min_value_and_index);\n  }\n}\n\/* ---------------------------------------- *\/\n\nint main() {\n\n  thrust::host_vector<float> h_values(kNumTotal);\n  thrust::device_vector<float> d_values(kNumTotal);\n\n  \/\/ create random floats between 0 and 1\n  std::random_device rnd;\n  std::mt19937 mt(rnd());\n  std::uniform_int_distribution<int> rand0_to_10000(0, 10000);\n  for (int i = 0; i < kNumTotal; ++i) {\n    h_values[i] = (float)rand0_to_10000(mt);\n  }\n  d_values = h_values;\n\n  ValueAndIndex h_min_value_and_index;\n  ValueAndIndex* d_min_value_and_index;\n  cudaMalloc((void**)&#038;d_min_value_and_index, sizeof(ValueAndIndex));\n\n  auto time_start = std::chrono::system_clock::now();\n  auto time_end = std::chrono::system_clock::now();\n  auto time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() \/ 1000.0f;\n\n  \/* ------------original version------------ *\/\n  time_start = std::chrono::system_clock::now();\n  \n  cudaMemcpy(d_min_value_and_index, &ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);\n  Kernel_GetMinValueAndIndexOriginal<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);\n  cudaDeviceSynchronize();\n  cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);\n  \n  time_end = std::chrono::system_clock::now();\n  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() \/ 1000.0f;\n\n  std::cout << \"GPU Original: (min_value, index) = (\" << h_min_value_and_index.GetValue() << \", \" << h_min_value_and_index.GetIndex() << \"), \" << time_used << \"msec.\" << std::endl;\n  \/* ---------------------------------------- *\/\n\n\n  \/* --------------cpu version--------------- *\/\n  time_start = std::chrono::system_clock::now();\n\n  ValueAndIndex min_value_and_index_by_cpu(std::numeric_limits<float>::max(), -1);\n  for (int index = 0; index < kNumTotal; ++index) {\n    if (h_values[index] < min_value_and_index_by_cpu.GetValue()) {\n      min_value_and_index_by_cpu.Set(h_values[index], index);\n    }\n  }\n\n  time_end = std::chrono::system_clock::now();\n  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() \/ 1000.0f;\n\n  std::cout << \"CPU: (min_value, index) = (\" << min_value_and_index_by_cpu.GetValue() << \", \" << min_value_and_index_by_cpu.GetIndex() << \"), \" << time_used << \"msec.\" << std::endl;\n  \/* ---------------------------------------- *\/\n\n  cudaFree(d_min_value_and_index);\n  return 0;\n}<\/pre>\n\n\n\n<p><strong>\u5171\u7528\u4f53ValueAndIndex\u306e\u524d\u534a32bit\u306b\u6700\u5c0f\u5024\u3092\u3001\u5f8c\u534a32bit\u306bindex\u3092\u683c\u7d0d\u3059\u308b\u3088\u3046\u306b\u3057\u3001atomicMin\u3092\u7528\u3044\u308b\u3068\u304d\u306funsigned long long int\u5024\u3068\u3057\u3066\u89e3\u91c8\u3059\u308b<\/strong>\u3068\u3044\u3046\u30c6\u30af\u30cb\u30c3\u30af\u3067\u3059\u3002<br>\u304b\u306a\u308a\u5929\u624d\u7684\u306a\u767a\u60f3\u3067\u306f\u306a\u3044\u304b\u3068\u500b\u4eba\u7684\u306b\u306f\u601d\u3046\u306e\u3068\u3001stack overflow\u3067\u306f\u6a21\u7bc4\u89e3\u7b54\u7684\u306b\u6271\u308f\u308c\u3066\u3044\u308b\u306e\u3067\u3059\u304c\u3001atomic\u95a2\u6570\u306b\u3064\u3044\u3066\u52c9\u5f37\u3059\u308b\u3068\u3001\u5b9f\u306f\u4e0a\u8a18\u306e\u65b9\u6cd5\u306f\u3082\u3046\u5c11\u3057\u6539\u826f\u306e\u4f59\u5730\u304c\u3042\u308b\u3053\u3068\u304c\u308f\u304b\u308a\u307e\u3059\u3002<\/p>\n\n\n\n<p>\u3061\u306a\u307f\u306b\u4e0a\u8a18\u30b3\u30fc\u30c9\u3092\u8d70\u3089\u305b\u308b\u3068\u3001<br><code>GPU Original: (min_value, index) = (0, 11101), 0.191msec.<br>CPU: (min_value, index) = (0, 11101), 0.036msec.<\/code><br>\u3068\u3044\u3046\u3088\u3046\u306a\u7d50\u679c\u306b\u306a\u308a\u3001\u5f53\u305f\u308a\u524d\u3067\u3059\u304c\u3053\u306e\u7a0b\u5ea6\u3067\u3042\u308c\u3070CPU\u8a08\u7b97\u306e\u307b\u3046\u304c\u901f\u3044\u3067\u3059\u3002<\/p>\n\n\n\n<p>\u3055\u3066\u3001\u4e0a\u8a18\u3067\u6700\u3082\u5927\u4e8b\u306a\u30ab\u30fc\u30cd\u30eb\u95a2\u6570\u3092\u6ce8\u610f\u6df1\u304f\u898b\u3066\u307f\u307e\u3057\u3087\u3046\u3002<\/p>\n\n\n\n<pre class=\"start-line:51 lang:default decode:true  \" title=\"Kernel_GetMinValueAndIndexOriginal.cu\">__device__ ValueAndIndex AtomicMinValueAndIndexOriginal(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {\n  ValueAndIndex current = *min_value_and_index;\n  while (value_and_index.GetValue() < current.GetValue()) {\n    current.ulong = atomicCAS(&#038;(min_value_and_index->ulong), current.ulong, value_and_index.ulong);\n  }\n  return current;\n}\n<\/pre>\n\n\n\n<p>\u3053\u308c\u3001\u6700\u4f4e\u3067\u3082while\u6587\u304c2\u56de\u307e\u308f\u3063\u3066\u3057\u307e\u3044\u307e\u3059\u3002\u4ed6\u306e\u30b9\u30ec\u30c3\u30c9\u304b\u3089\u306emin_value_and_index\u3078\u306e\u64cd\u4f5c\u304c\u306a\u304f\u30011\u56de\u76ee\u306ewhile\u6587\u5185\u3067atomicCAS\u306b\u304a\u3044\u3066<br><code>min_value_and_index-&gt;ulong == current.ulong<\/code><br>\u3060\u3063\u305f\u3068\u3057\u3066\u3082<br>current.ulong\u306b\u306f\u53e4\u3044\u5024\u304c\u4ee3\u5165\u3055\u308c\u307e\u3059\uff08atnomicCAS\u306e\u4ed5\u69d8\u3067\u3059\uff09\u3002\u3064\u307e\u308a\u3001min_value_and_index\u306b\u306fvalue_and_index\u304c\u6b63\u3057\u304f\u4ee3\u5165\u3055\u308c\u3066\u3044\u308b\u306e\u306bcurrent\u306b\u306f\u4ee3\u5165\u524d\u306e\u53e4\u3044\u5024\u304c\u4ee3\u5165\u3055\u308c\u308b\u305f\u3081\u3001\u65b0\u3057\u3044\u5024\u304c\u4ee3\u5165\u3055\u308c\u3066\u3082<br><code>value_and_index.GetValue() &lt; current.GetValue()<\/code><br>\u3092\u6e80\u305f\u3057\u3066\u3057\u307e\u3044\u30012\u56de\u76ee\u306ewhile\u6587\u304c\u8d70\u308a\u307e\u3059\u3002<br>2\u56de\u76ee\u306ewhile\u6587\u5185\u3067\u306f<br> <code>min_value_and_index-&gt;ulong != current.ulong<\/code> <br>\u306a\u306e\u3067\u3001atomicCAS\u306b\u3088\u3063\u3066\u5024\u304c\u5909\u308f\u308b\u3053\u3068\u306f\u7121\u304f\u3001\u4e00\u65b9\u3067atomicCAS\u306b\u3088\u3063\u3066<br><code>current.ulong == value_and_index.ulong<\/code><br>\u3068\u306a\u308b\u305f\u3081\u3001\u3053\u308c\u3067\u3088\u3046\u3084\u304f<br><code>value_and_index.GetValue() == current.GetValue()<\/code> <br>\u3068\u306a\u308awhile\u6587\u3092\u629c\u3051\u307e\u3059\u3002<\/p>\n\n\n\n<p>atomic\u95a2\u6570\u306b\u3064\u3044\u3066\u306fNVIDIA\u304c2013\u5e74\u306eGPU Technology Conference\u3067atomic\u95a2\u6570\u306b\u7279\u5316\u3057\u305f\u8b1b\u6f14\uff08\uff1f\uff09\u3092\u3057\u3066\u3044\u308b\u3089\u3057\u304f\u3001\u8cb4\u91cd\u306a\u30b9\u30e9\u30a4\u30c9\u3092\u898b\u308b\u3053\u3068\u304c\u51fa\u6765\u307e\u3059\u3002<\/p>\n\n\n\n<p><a rel=\"noreferrer noopener\" aria-label=\"Understanding and Using Atomic Memory Operations\nLars Nyland &amp; Stephen Jones, NVIDIA GTC 2013  (opens in a new tab)\" href=\"http:\/\/on-demand.gputechconf.com\/gtc\/2013\/presentations\/S3101-Atomic-Memory-Operations.pdf\" target=\"_blank\">Understanding and Using Atomic Memory Operations<br>Lars Nyland &amp; Stephen Jones, NVIDIA GTC 2013 <\/a><\/p>\n\n\n\n<p>\u3053\u306e\u30b9\u30e9\u30a4\u30c9\u3067\u89e3\u8aac\u3055\u308c\u3066\u3044\u308bLock-Free Data Updates\u306e\u8003\u3048\u65b9\u3067\u3001\u5148\u307b\u3069\u306e\u30ab\u30fc\u30cd\u30eb\u3092\u6539\u826f\u3059\u308b\u3068\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u306a\u308a\u307e\u3059\u3002<\/p>\n\n\n\n\n\n<pre class=\"lang:default decode:true  \" title=\"Kernel_GetMinValueAndIndexModified.cu\">__device__ ValueAndIndex AtomicMinValueAndIndexModified(const ValueAndIndex&amp; value_and_index, ValueAndIndex* min_value_and_index) {\n  ValueAndIndex old = *min_value_and_index, assumed;\n  do {\n    if (value_and_index.GetValue() &gt;= old.GetValue()) {\n      break;\n    }\n    assumed = old;\n    old.ulong = atomicCAS(&amp;(min_value_and_index-&gt;ulong), assumed.ulong, value_and_index.ulong);\n  } while (assumed.ulong != old.ulong);\n  return old;\n}\n<\/pre>\n\n\n\n<p><a href=\"https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html#atomic-functions\" target=\"_blank\" rel=\"noreferrer noopener\" aria-label=\"Programming Guide :: CUDA Toolkit Documentation\n8.12. Atomic Functions (opens in a new tab)\">Programming Guide :: CUDA Toolkit Documentation<br>8.12. Atomic Functions<\/a><\/p>\n\n\n\n<p>\u306b\u8a18\u8f09\u3055\u308c\u3066\u3044\u308b\u3001double\u5024\u3067\u306eatomicAdd\u6f14\u7b97\u306e\u30b5\u30f3\u30d7\u30eb\u30b3\u30fc\u30c9\u3068\u540c\u3058\u5909\u6570\u540d\u3092\u7528\u3044\u307e\u3057\u305f\u3002\u4e0a\u8a18\u306e\u3088\u3046\u306b\u3059\u308b\u3053\u3068\u3067\u3001\u4e2d\u9593\u5909\u6570assumed\u304c1\u3064\u5897\u3048\u3066\u3057\u307e\u3044\u307e\u3059\u304c\u3001atomicCAS\u306f\u6700\u4f4e1\u56de\u3067\u6e08\u307f\u307e\u3059\u3002<br>\u305f\u3060\u3057\u3001\u30aa\u30ea\u30b8\u30ca\u30eb\u306e\u3082\u306e\u3068\u6bd4\u8f03\u3057\u3066\u901f\u5ea6\u304c\u6539\u5584\u3055\u308c\u308b\u304b\u3068\u3044\u3046\u3068\u305f\u3044\u3057\u3066\u5909\u308f\u308a\u307e\u305b\u3093\u3002\u3080\u3057\u308d\u6539\u826f\u3057\u305f\u3082\u306e\u306e\u307b\u3046\u304c\u6642\u3005\u9045\u304f\u306a\u308b\u3053\u3068\u3082\u3042\u308a\u307e\u3059\u3002\u3053\u306e\u3042\u305f\u308a\u306f\u5b9f\u884c\u6642\u306eGPU\u306e\u72b6\u614b\uff08\uff1f\uff09\u6b21\u7b2c\u3067\u307e\u3061\u307e\u3061\u3067\u3059\u3002<br>\u8a66\u3057\u306b\u5b9f\u884c\u3057\u3066\u307f\u308b\u3068\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u306a\u308a\u307e\u3057\u305f\u3002<\/p>\n\n\n\n<p><code>GPU Original: (min_value, index) = (0, 16912), 0.193msec.<br>GPU Modified: (min_value, index) = (0, 34187), 0.183msec.<br>CPU: (min_value, index) = (0, 15386), 0.038msec.<\/code><\/p>\n\n\n\n<p>\u3093\u3001\u4f55\u3084\u3089\u5225\u306e\u554f\u984c\u304c\u767a\u751f\u3057\u3066\u3044\u308b\u3088\u3046\u3067\u3059\u2026\u3002<strong>index\u306e\u5024\u304c\u7570\u306a\u3063\u3066\u3044\u307e\u3059\u2026\uff01<\/strong><br>\u5b9f\u306f\u6700\u521d\u306b\u7d39\u4ecb\u3057\u305fstack overflow\u306e\u30b5\u30f3\u30d7\u30eb\u30b3\u30fc\u30c9\u3067\u306f\u5165\u529b\u8981\u7d20\u6570\u304c5000\u3057\u304b\u306a\u304b\u3063\u305f\u305f\u3081\u306b\u6c17\u4ed8\u304b\u308c\u306b\u304f\u304b\u3063\u305f\u306e\u3067\u3059\u304c\u3001min_value\u3092\u6301\u3064\u8981\u7d20\u304c\u8907\u6570\u3042\u308b\u3068\u304d\u3001<strong>CPU\u3067\u306f\u5fc5\u305a\u6700\u5c0f\u306eindex\u5024\u304c\u9078\u3070\u308c\u307e\u3059\u304c\u3001GPU\u3067\u306f\u3069\u306ethread\u304c\u6700\u901f\u3067\u7d42\u308f\u308b\u304b\u306f\u305d\u306e\u6642\u3005\u3067\u7570\u306a\u308b\u305f\u3081\u3001\u3069\u306eindex\u304c\u9078\u3070\u308c\u308b\u304b\u306f\u5168\u304f\u308f\u304b\u308a\u307e\u305b\u3093<\/strong>\u3002<br>\u4e0a\u8a18\u306e\u5834\u5408\u300150000\u500b\u306e\u5165\u529b\u5024\u306b\u5bfe\u3057\u3066\u5165\u529b\u5024\u304c\u6700\u5c0f\u5024\u3067\u3042\u308b0\u3068\u306a\u3063\u3066\u3044\u305f\u3082\u306e\u304c\u8907\u6570\u3042\u308a\u3001index\u3092\u66f8\u304d\u51fa\u3057\u3066\u307f\u308b\u3068<br><code>The indices of the min_value are\u2026 15386, 16912, 17974, 28312, 34187,<\/code><br>\u3068\u306a\u3063\u3066\u3044\u307e\u3057\u305f\u3002\u3064\u307e\u308a\u3001\u3053\u306e5\u3064\u306eindex\u306e\u3069\u308c\u3067\u3042\u3063\u3066\u3082\u6700\u5c0f\u5024\u306f0\u306a\u306e\u3067\u3001\u3069\u306eindex\u3082\u6b63\u3057\u3044\u3067\u3059\u3002<br>\u6700\u5c0f\u5024\u304c\u8907\u6570\u3042\u308b\u5834\u5408\u306b\u3001\u3069\u306e\u5834\u6240\u306e\u8981\u7d20\u3092\u53d6\u3063\u3066\u304d\u3066\u3082\u69cb\u308f\u306a\u3044\u306e\u3067\u3042\u308c\u3070\u3053\u308c\u3067\u554f\u984c\u3042\u308a\u307e\u305b\u3093\u304c\u3001CPU\u3067\u306e\u51e6\u7406\u3068GPU\u3067\u306e\u51e6\u7406\u3068\u3067\u7d50\u679c\u304c\u7570\u306a\u308b\u3053\u3068\u304c\u3042\u308b\u306e\u306f\u6c17\u6301\u3061\u60aa\u3044\u3067\u3059\u3002<br>\u306e\u3067\u3001\u4e0a\u8a18\u6539\u826f\u3057\u305f\u30ab\u30fc\u30cd\u30eb\u306b\u3055\u3089\u306b\u5c11\u3057\u3060\u3051\u624b\u3092\u52a0\u3048\u3066\u3001CPU\u3068GPU\u3068\u3067\u5fc5\u305a\u7d50\u679c\u304c\u540c\u3058\u306b\u306a\u308b\u3088\u3046\u306b\u3057\u307e\u3057\u3087\u3046\u3002\u3053\u3046\u3059\u308c\u3070\u826f\u3044\u3067\u3059\u3002<\/p>\n\n\n\n<pre class=\"lang:default mark:4-9 decode:true  \" title=\"Kernel_GetMinValueAndIndexModifiedNew.cu\">__device__ ValueAndIndex AtomicMinValueAndIndexModifiedNew(const ValueAndIndex&amp; value_and_index, ValueAndIndex* min_value_and_index) {\n  ValueAndIndex old = *min_value_and_index, assumed;\n  do {\n    if (value_and_index.GetValue() &gt; old.GetValue()) {\n      break;\n    }\n    if ((value_and_index.GetValue() == old.GetValue()) &amp;&amp; (value_and_index.GetIndex() &gt; old.GetIndex())) {\n      break;\n    }\n    assumed = old;\n    old.ulong = atomicCAS(&amp;(min_value_and_index-&gt;ulong), assumed.ulong, value_and_index.ulong);\n  } while (assumed.ulong != old.ulong);\n  return old;\n}\n<\/pre>\n\n\n\n<p>\u3053\u308c\u3067\u3001\u5e38\u306b\u6700\u5c0f\u306eindex\u5024\u3092\u5f97\u3089\u308c\u308b\u3088\u3046\u306b\u306a\u308a\u307e\u3057\u305f\u30027\u884c\u76ee\u306e\u4e0d\u7b49\u53f7\u3092\u53cd\u5bfe\u5411\u304d\u306b\u3059\u308c\u3070\u6700\u5927\u306eindex\u5024\u3092\u5f97\u308b\u3053\u3068\u3082\u51fa\u6765\u307e\u3059\u3002<br>4\u884c\u76ee\u306e\u4e0d\u7b49\u53f7\u3092\u5165\u308c\u66ff\u3048\u308c\u3070atomicMax\u306b\u3082\u306a\u308a\u307e\u3059\u3002<br>\u5b9f\u884c\u7d50\u679c\u306f\u4f8b\u3048\u3070\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u306a\u308a\u307e\u3059\u3002<\/p>\n\n\n\n<p><code>GPU Original: (min_value, index) = (0, 43739), 0.195msec.<br>GPU Modified: (min_value, index) = (0, 33730), 0.268msec.<br>GPU Modified New: (min_value, index) = (0, 7930), 0.239msec.<br>CPU: (min_value, index) = (0, 7930), 0.036msec.<br>The indices of the min_value are\u2026 7930, 18153, 33730, 43739, 46528, 48255,<\/code><\/p>\n\n\n\n<p>CPU\u3067\u306e\u7d50\u679c\u3068\u5fc5\u305a\u4e00\u81f4\u3059\u308b\u3068\u5b89\u5fc3\u51fa\u6765\u307e\u3059\u306d\u3002<br>\u3082\u3061\u308d\u3093\u3001\u30aa\u30ea\u30b8\u30ca\u30eb\u306e\u30ab\u30fc\u30cd\u30eb\u3067\u3082while\u306e\u6761\u4ef6\u5f0f\u3092\u66f8\u304d\u8db3\u305b\u3070\u3001CPU\u3068\u540c\u3058\u7d50\u679c\u306b\u3059\u308b\u3053\u3068\u304c\u51fa\u6765\u307e\u3059\u3002<\/p>\n\n\n\n<p>\u4ee5\u4e0a\u3001atmoic\u95a2\u6570\u306b\u95a2\u3059\u308b\u975e\u5e38\u306b\u30de\u30cb\u30a2\u30c3\u30af\u306a\u30cd\u30bf\u3067\u3057\u305f\u3002<\/p>\n\n\n\n<p>\u6700\u5f8c\u306b\u3001\u30bd\u30fc\u30b9\u30b3\u30fc\u30c9\u5168\u4f53\u3092\u8f09\u305b\u3066\u304a\u304d\u307e\u3059\u306d\u3002<\/p>\n\n\n\n\n\n<pre class=\"lang:defaul decode:true  \" title=\"Kernel_GetMinValueAndIndexModified.cu\">\/* Copyright Hirofumi Seo, M.D. *\/\n\n#include <random>\n#include <limits>\n#include <iostream>\n#include <chrono>\n#include \"cuda_runtime.h\"\n#include \"device_launch_parameters.h\"\n#include \"thrust\/host_vector.h\"\n#include \"thrust\/device_vector.h\"\n\nconstexpr int kNumTotal = 50000;\nconstexpr int kBlockSize = 256;\nconstexpr int kNumBlocks = (kNumTotal + kBlockSize - 1) \/ kBlockSize;\n\nunion ValueAndIndex{\npublic:\n  unsigned long long int ulong;    \/\/ for atomic update\n \n  __host__ __device__ ValueAndIndex() {}\n  __host__ __device__ ValueAndIndex(const float value, const int index) {\n    Set(value, index);\n  }\n\n  __host__ __device__ void Set(const float value, const int index) {\n    SetValue(value);\n    SetIndex(index);\n  }\n\n  __host__ __device__ float GetValue() const {\n    return floats[0];\n  }\n  __host__ __device__ void SetValue(const float value) {\n    floats[0] = value;\n  }\n\n  __host__ __device__ int GetIndex() const {\n    return ints[1];\n  }\n  __host__ __device__ void SetIndex(const int index) {\n    ints[1] = index;\n  }\n\nprivate:\n  float floats[2];                 \/\/ floats[0] = lowest value\n  int ints[2];                     \/\/ ints[1] = the index of the lowest value\n};\n\n\/* ------------original version------------ *\/\n\/\/ https:\/\/stackoverflow.com\/questions\/17411493\/how-can-i-implement-a-custom-atomic-function-involving-several-variables\/17414007#17414007\n__device__ ValueAndIndex AtomicMinValueAndIndexOriginal(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {\n  ValueAndIndex old = *min_value_and_index;\n  while (value_and_index.GetValue() < old.GetValue()) {\n    old.ulong = atomicCAS(&#038;(min_value_and_index->ulong), old.ulong, value_and_index.ulong);\n  }\n  return old;\n}\n\n__global__ void Kernel_GetMinValueAndIndexOriginal(const float* values, ValueAndIndex* min_value_and_index) {\n  const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;\n  const int stride = blockDim.x * gridDim.x;\n  for (int index = initial_index; index < kNumTotal; index += stride) {\n    AtomicMinValueAndIndexOriginal(ValueAndIndex(values[index], index), min_value_and_index);\n  }\n}\n\/* ---------------------------------------- *\/\n\n\/* -----Lock-Free Data Updates version----- *\/\n__device__ ValueAndIndex AtomicMinValueAndIndexModified(const ValueAndIndex&#038; value_and_index, ValueAndIndex* min_value_and_index) {\n  ValueAndIndex old = *min_value_and_index, assumed;\n  do {\n    if (value_and_index.GetValue() >= old.GetValue()) {\n      break;\n    }\n    assumed = old;\n    old.ulong = atomicCAS(&(min_value_and_index->ulong), assumed.ulong, value_and_index.ulong);\n  } while (assumed.ulong != old.ulong);\n  return old;\n}\n\n__global__ void Kernel_GetMinValueAndIndexModified(const float* values, ValueAndIndex* min_value_and_index) {\n  const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;\n  const int stride = blockDim.x * gridDim.x;\n  for (int index = initial_index; index < kNumTotal; index += stride) {\n    AtomicMinValueAndIndexModified(ValueAndIndex(values[index], index), min_value_and_index);\n  }\n}\n\/* ---------------------------------------- *\/\n\n\/* ---Lock-Free Data Updates new version--- *\/\n__device__ ValueAndIndex AtomicMinValueAndIndexModifiedNew(const ValueAndIndex&#038; value_and_index, ValueAndIndex* min_value_and_index) {\n  ValueAndIndex old = *min_value_and_index, assumed;\n  do {\n    if (value_and_index.GetValue() > old.GetValue()) {\n      break;\n    }\n    if ((value_and_index.GetValue() == old.GetValue()) && (value_and_index.GetIndex() > old.GetIndex())) {\n      break;\n    }\n    assumed = old;\n    old.ulong = atomicCAS(&(min_value_and_index->ulong), assumed.ulong, value_and_index.ulong);\n  } while (assumed.ulong != old.ulong);\n  return old;\n}\n\n__global__ void Kernel_GetMinValueAndIndexModifiedNew(const float* values, ValueAndIndex* min_value_and_index) {\n  const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;\n  const int stride = blockDim.x * gridDim.x;\n  for (int index = initial_index; index < kNumTotal; index += stride) {\n    AtomicMinValueAndIndexModifiedNew(ValueAndIndex(values[index], index), min_value_and_index);\n  }\n}\n\/* ---------------------------------------- *\/\n\nint main() {\n\n  thrust::host_vector<float> h_values(kNumTotal);\n  thrust::device_vector<float> d_values(kNumTotal);\n\n  \/\/ create random floats between 0 and 1\n  std::random_device rnd;\n  std::mt19937 mt(rnd());\n  std::uniform_int_distribution<int> rand0_to_10000(0, 10000);\n  for (int i = 0; i < kNumTotal; ++i) {\n    h_values[i] = (float)rand0_to_10000(mt);\n  }\n  d_values = h_values;\n\n  ValueAndIndex h_min_value_and_index;\n  ValueAndIndex* d_min_value_and_index;\n  cudaMalloc((void**)&#038;d_min_value_and_index, sizeof(ValueAndIndex));\n\n  auto time_start = std::chrono::system_clock::now();\n  auto time_end = std::chrono::system_clock::now();\n  auto time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() \/ 1000.0f;\n\n  \/* ------------original version------------ *\/\n  time_start = std::chrono::system_clock::now();\n  \n  cudaMemcpy(d_min_value_and_index, &ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);\n  Kernel_GetMinValueAndIndexOriginal<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);\n  cudaDeviceSynchronize();\n  cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);\n  \n  time_end = std::chrono::system_clock::now();\n  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() \/ 1000.0f;\n\n  std::cout << \"GPU Original: (min_value, index) = (\" << h_min_value_and_index.GetValue() << \", \" << h_min_value_and_index.GetIndex() << \"), \" << time_used << \"msec.\" << std::endl;\n  \/* ---------------------------------------- *\/\n\n\n  \/* -----Lock-Free Data Updates version----- *\/\n  time_start = std::chrono::system_clock::now();\n  \n  cudaMemcpy(d_min_value_and_index, &#038;ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);\n  Kernel_GetMinValueAndIndexModified<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);\n  cudaDeviceSynchronize();\n  cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);\n  \n  time_end = std::chrono::system_clock::now();\n  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() \/ 1000.0f;\n\n  std::cout << \"GPU Modified: (min_value, index) = (\" << h_min_value_and_index.GetValue() << \", \" << h_min_value_and_index.GetIndex() << \"), \" << time_used << \"msec.\" << std::endl;\n  \/* ---------------------------------------- *\/\n\n\n\/* ---Lock-Free Data Updates new version--- *\/\n  time_start = std::chrono::system_clock::now();\n\n  cudaMemcpy(d_min_value_and_index, &#038;ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);\n  Kernel_GetMinValueAndIndexModifiedNew<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);\n  cudaDeviceSynchronize();\n  cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);\n\n  time_end = std::chrono::system_clock::now();\n  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() \/ 1000.0f;\n\n  std::cout << \"GPU Modified New: (min_value, index) = (\" << h_min_value_and_index.GetValue() << \", \" << h_min_value_and_index.GetIndex() << \"), \" << time_used << \"msec.\" << std::endl;\n  \/* ---------------------------------------- *\/\n\n\n  \/* --------------cpu version--------------- *\/\n  time_start = std::chrono::system_clock::now();\n\n  ValueAndIndex min_value_and_index_by_cpu(std::numeric_limits<float>::max(), -1);\n  for (int index = 0; index < kNumTotal; ++index) {\n    if (h_values[index] < min_value_and_index_by_cpu.GetValue()) {\n      min_value_and_index_by_cpu.Set(h_values[index], index);\n    }\n  }\n\n  time_end = std::chrono::system_clock::now();\n  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() \/ 1000.0f;\n\n  std::cout << \"CPU: (min_value, index) = (\" << min_value_and_index_by_cpu.GetValue() << \", \" << min_value_and_index_by_cpu.GetIndex() << \"), \" << time_used << \"msec.\" << std::endl;\n  \/* ---------------------------------------- *\/\n\n  std::cout << \"The indices of the min_value are... \";\n  for (int index = 0; index < kNumTotal; ++index) {\n    if (h_values[index] == min_value_and_index_by_cpu.GetValue()) {\n      std::cout << index << \", \";\n    }\n  }\n  std::cout << std::endl;\n\n  cudaFree(d_min_value_and_index);\n  return 0;\n}\n<\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u5024\u3084\u69cb\u9020\u4f53\u306e\u914d\u5217\u3092\u5165\u529b\u3068\u3057\u3066\u3001\u305d\u308c\u305e\u308c\u306e\u5165\u529b\u5024\u306b\u5bfe\u3057\u3066\u4f55\u3089\u304b\u306e\u8a08\u7b97\u5f0f\u306a\u3069\u3067\u4f55\u304b\u3057\u3089\u306e\u5024\u3092\u6c42\u3081\u3001\u305d\u306e\u5024\u306e\u6700\u5c0f\u5024\uff08\u307e\u305f\u306f\u6700\u5927\u5024\uff09\u3068\u3001\u305d\u306e\u3068\u304d\u306e\u914d\u5217\u306eindex\u3068\u3092\u30bb\u30c3\u30c8\u3067\u6b32\u3057\u3044\u3068\u304d\u304c\u3042\u308a\u307e\u3059\u3002\u4f8b\u3048\u3070\u3001std::vector  &#8230;<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[5],"tags":[],"class_list":["post-317","post","type-post","status-publish","format-standard","hentry","category-cuda"],"_links":{"self":[{"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/posts\/317","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/comments?post=317"}],"version-history":[{"count":41,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/posts\/317\/revisions"}],"predecessor-version":[{"id":359,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/posts\/317\/revisions\/359"}],"wp:attachment":[{"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/media?parent=317"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/categories?post=317"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.sciement.com\/tech-blog\/wp-json\/wp\/v2\/tags?post=317"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}