/******************************************************************************
 * Copyright 2010 Duane Merrill
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. 
 * 
 * 
 * AUTHORS' REQUEST: 
 * 
 *		If you use|reference|benchmark this code, please cite our Technical 
 *		Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
 * 
 *		@TechReport{ Merrill:Sorting:2010,
 *		author = "Duane Merrill and Andrew Grimshaw",
 *		title = "Revisiting Sorting for GPGPU Stream Architectures",
 *		year = "2010",
 *		institution = "University of Virginia, Department of Computer Science",
 *		address = "Charlottesville, VA, USA",
 *		number = "CS2010-03"
 *		}
 * 
 * For more information, see our Google Code project site: 
 * http://code.google.com/p/back40computing/
 * 
 * Thanks!
 ******************************************************************************/


/******************************************************************************
 * Radix Sorting API
 *
 * USAGE:
 * 
 * Using the B40C radix sorting implementation is easy.  Just #include this API 
 * file and its kernel include dependencies within your source.  Below are two
 * usage examples:
 *
 * (1) A keys-only example for sorting floats:
 * 
 *		// Create storage-management structure
 *		RadixSortStorage<float> device_storage(d_float_keys);
 *
 *		// Create and enact sorter
 *		RadixSortingEnactor<float> sorter(d_float_keys_len);
 *		sorter.EnactSort(device_storage);
 *
 *		// Re-acquire pointer to sorted keys, free unused/temp storage 
 *		d_float_keys = device_storage.d_keys;
 *		device_storage.CleanupTempStorage();
 *
 * (2) And a key-value example for sorting ints paired with doubles:
 *
 *		// Create storage-management structure
 *		RadixSortStorage<int, double> device_storage(d_int_keys, d_double_values);
 *
 *		// Create and enact sorter
 *		RadixSortingEnactor<int, double> sorter(d_int_keys_len);
 *		sorter.EnactSort(device_storage);
 *
 *		// Re-acquire pointers to sorted keys and values, free unused/temp storage 
 *		d_int_keys = device_storage.d_keys;
 *		d_double_values = device_storage.d_values;
 *		device_storage.CleanupTempStorage();
 *
 ******************************************************************************/

#pragma once

#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 
#include <math.h> 
#include <float.h>

#include "radixsort_reduction_kernel.h"
#include "radixsort_spine_kernel.h"
#include "radixsort_scanscatter_kernel.h"

#include <thrust/swap.h>

namespace thrust  {
namespace system  {
namespace cuda    {
namespace detail  {
namespace detail  {
namespace b40c_thrust   {

/******************************************************************************
 * Debugging options
 ******************************************************************************/

static bool RADIXSORT_DEBUG = false;



/******************************************************************************
 * Structures for managing device-side sorting state
 ******************************************************************************/

/**
 * Sorting storage-management structure for device vectors
 */
template <typename K, typename V = KeysOnlyType>
struct RadixSortStorage {

        // Device vector of keys to sort
        K* d_keys;
        
        // Device vector of values to sort
        V* d_values;

        // Ancillary device vector for key storage 
        K* d_alt_keys;

        // Ancillary device vector for value storage
        V* d_alt_values;

        // Temporary device storage needed for radix sorting histograms
        int *d_spine;
        
        // Flip-flopping temporary device storage denoting which input source each 
        // digit-place pass should read from (i.e., false if reading from d_keys, 
        // true if reading from d_alt_keys)
        bool *d_from_alt_storage;

        // Host-side boolean indicating whether an odd number of sorting passes left the 
        // results in alternate storage.  If so, the d_keys (and d_values) pointers 
        // will have been swapped with the d_alt_keys (and d_alt_values) pointers in order to 
        // point to the final results.
        bool using_alternate_storage;
        
        // Constructor
        RadixSortStorage(K* keys = NULL, V* values = NULL) 
        { 
                d_keys = keys; 
                d_values = values; 
                d_alt_keys = NULL; 
                d_alt_values = NULL; 
                d_spine = NULL;
                d_from_alt_storage = NULL;
                
                using_alternate_storage = false;
        }

        // Clean up non-results storage (may include freeing original storage if 
        // primary pointers were swizzled as per using_alternate_storage) 
        cudaError_t CleanupTempStorage() 
        {
                if (d_alt_keys) cudaFree(d_alt_keys);
                if (d_alt_values) cudaFree(d_alt_values);
                if (d_spine) cudaFree(d_spine);
                if (d_from_alt_storage) cudaFree(d_from_alt_storage);
                
                return cudaSuccess;
        }
};
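
/*
 * Re-use sketch (an illustrative addition, not from the original authors):
 * callers that sort many same-sized problems can allocate the temporary
 * fields once and re-use them, instead of letting EnactSort() allocate on
 * every call.  The sizes below follow the allocations performed in
 * EnactSort() further down; `sorter` is assumed to be an already-constructed
 * RadixSortingEnactor, and `num_elements`/`d_keys` are caller-provided.
 *
 *		RadixSortStorage<unsigned int> storage(d_keys);
 *		cudaMalloc((void**) &storage.d_alt_keys,         num_elements * sizeof(unsigned int));
 *		cudaMalloc((void**) &storage.d_spine,            sorter.SpineElements() * sizeof(int));
 *		cudaMalloc((void**) &storage.d_from_alt_storage, 2 * sizeof(bool));
 */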


/******************************************************************************
 * Base class for sorting enactors
 ******************************************************************************/


/**
 * Base class for SRTS radix sorting enactors.
 */
template <typename K, typename V>
class BaseRadixSortingEnactor 
{
public:
        
        // Unsigned integer type suitable for radix sorting of keys
        typedef typename KeyConversion<K>::UnsignedBits ConvertedKeyType;

protected:

        //
        // Information about our problem configuration
        //
        
        bool                    _keys_only;
        unsigned int            _num_elements;
        int                     _cycle_elements;
        int                     _spine_elements;
        int                     _grid_size;
        CtaDecomposition        _work_decomposition;
        int                     _passes;
        bool                    _swizzle_pointers_for_odd_passes;

        // Information about our target device
        cudaDeviceProp          _device_props;
        int                     _device_sm_version;
        
        // Information about our kernel assembly
        int                     _kernel_ptx_version;
        cudaFuncAttributes      _spine_scan_kernel_attrs;
        
protected:
        
        /**
         * Constructor.
         */
        BaseRadixSortingEnactor(int passes, int max_radix_bits, unsigned int num_elements, int max_grid_size, bool swizzle_pointers_for_odd_passes = true); 
        
        /**
         * Heuristic for determining the number of CTAs to launch.
         *   
         * @param[in] 	max_grid_size  
         * 		Maximum allowable number of CTAs to launch.  A value of 0 indicates 
         * 		that the default value should be used.
         * 
         * @return The actual number of CTAs that should be launched
         */
        int GridSize(int max_grid_size);

        /**
         * Performs a distribution sorting pass over a single digit place
         */
        template <int PASS, int RADIX_BITS, int BIT, typename PreprocessFunctor, typename PostprocessFunctor>
        cudaError_t DigitPlacePass(const RadixSortStorage<ConvertedKeyType, V> &converted_storage); 
        
        /**
         * Enacts a sorting operation by performing the appropriate 
         * digit-place passes.  To be overloaded by specialized subclasses.
         */
        virtual cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage) = 0;
        
public:
        
        /**
         * Returns the length (in ints) of the device vector needed for  
         * temporary storage of the reduction spine.  Useful if pre-allocating 
         * your own device storage (as opposed to letting EnactSort() allocate it
         * for you).
         */
        int SpineElements() { return _spine_elements; }

        /**
         * Returns whether or not the problem will fit on the device.
         */
        bool CanFit();

        /**
         * Enacts a radix sorting operation on the specified device data.
         * 
         * IMPORTANT NOTES: The device storage backing the specified input vectors of 
         * keys (and data) will be modified.  (I.e., treat this as an in-place sort.)  
         * 
         * Additionally, the pointers in the problem_storage structure may be updated 
         * (a) depending upon the number of digit-place sorting passes needed, and (b) 
         * whether or not the caller has already allocated temporary storage.  
         * 
         * The sorted results will always be referenced by problem_storage.d_keys (and 
         * problem_storage.d_values).  However, for an odd number of sorting passes (uncommon)
         * these results will actually be backed by the storage initially allocated for 
         * problem_storage.d_alt_keys (and problem_storage.d_alt_values).  If so, 
         * problem_storage.d_alt_keys and problem_storage.d_alt_values will be updated to 
         * reference the original problem_storage.d_keys and problem_storage.d_values in order 
         * to facilitate cleanup.  
         * 
         * This means it is important to avoid keeping stale copies of device pointers 
         * to keys/data; you will want to re-reference the pointers in problem_storage.
         * 
         * @param[in/out] 	problem_storage 
         * 		Device vectors of keys and values to sort, and ancillary storage 
         * 		needed by the sorting kernels.  See the IMPORTANT NOTES above. 
         * 
         * 		The problem_storage.[d_alt_keys|d_alt_values|d_spine] fields are 
         * 		temporary storage needed by the sorting kernels.  To facilitate 
         * 		speed, callers are welcome to re-use this storage for same-sized 
         * 		(or smaller) sorting problems.  If NULL, these storage vectors will be 
         * 		allocated by this routine (and must be subsequently cuda-freed by 
         * 		the caller).
         *
         * @return cudaSuccess on success, error enumeration otherwise
         */
        cudaError_t EnactSort(RadixSortStorage<K, V> &problem_storage); 

        /*
         * Destructor
         */
        virtual ~BaseRadixSortingEnactor() {}
};
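
/*
 * Illustrative end-to-end sketch (inferred from the interface above, not part
 * of the original documentation): check device capacity with CanFit() before
 * enacting, then re-acquire the possibly-swizzled result pointer.
 *
 *		RadixSortingEnactor<unsigned int> sorter(num_elements);
 *		RadixSortStorage<unsigned int>    storage(d_keys);
 *
 *		if (sorter.CanFit() && (sorter.EnactSort(storage) == cudaSuccess)) {
 *			d_keys = storage.d_keys;	// may now point at former alt storage
 *		}
 *		storage.CleanupTempStorage();
 */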



template <typename K, typename V>
BaseRadixSortingEnactor<K, V>::BaseRadixSortingEnactor(
        int passes, 
        int max_radix_bits, 
        unsigned int num_elements, 
        int max_grid_size,
        bool swizzle_pointers_for_odd_passes) 
{
        //
        // Get current device properties 
        //

        int current_device;
        cudaGetDevice(&current_device);
        cudaGetDeviceProperties(&_device_props, current_device);
        _device_sm_version = _device_props.major * 100 + _device_props.minor * 10;

        
        //
        // Get SM version of compiled kernel assembly
        //
        cudaFuncGetAttributes(&_spine_scan_kernel_attrs, SrtsScanSpine<void>);
        _kernel_ptx_version = _spine_scan_kernel_attrs.ptxVersion * 10;
        

        //
        // Determine number of CTAs to launch, shared memory, cycle elements, etc.
        //

        _passes                          = passes;
        _num_elements                    = num_elements;
        _keys_only                       = IsKeysOnly<V>();
        _cycle_elements                  = B40C_RADIXSORT_CYCLE_ELEMENTS(_kernel_ptx_version, ConvertedKeyType, V);
        _grid_size                       = GridSize(max_grid_size);
        _swizzle_pointers_for_odd_passes = swizzle_pointers_for_odd_passes;
        
        int total_cycles              = _num_elements / _cycle_elements;
        unsigned int cycles_per_block = total_cycles / _grid_size;
        unsigned int extra_cycles     = total_cycles - (cycles_per_block * _grid_size);

        CtaDecomposition work_decomposition = {
                extra_cycles,                                      // num_big_blocks
                (cycles_per_block + 1) * _cycle_elements,          // big_block_elements
                cycles_per_block * _cycle_elements,                // normal_block_elements
                _num_elements - (total_cycles * _cycle_elements),  // extra_elements_last_block
                _num_elements};                                    // num_elements
        
        _work_decomposition = work_decomposition;
        
        int spine_cycles = ((_grid_size * (1 << max_radix_bits)) + B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS - 1) / B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS;
        _spine_elements = spine_cycles * B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS;
}
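
// Worked example of the decomposition above (illustrative numbers, not from
// the original source): with num_elements = 1,000,000, _cycle_elements = 512
// and _grid_size = 128: total_cycles = 1953, cycles_per_block = 15 and
// extra_cycles = 33.  The first 33 CTAs ("big blocks") each process
// (15 + 1) * 512 = 8192 elements, the remaining 95 CTAs process
// 15 * 512 = 7680, and the final 1,000,000 - (1953 * 512) = 64 elements are
// handled as extra_elements_last_block.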



template <typename K, typename V>
int BaseRadixSortingEnactor<K, V>::GridSize(int max_grid_size)
{
        const int SINGLE_CTA_CUTOFF = 0;                // right now zero; we have no single-cta sorting

        // find maximum number of threadblocks if "use-default"
        if (max_grid_size == 0) {

                if (_num_elements <= static_cast<unsigned int>(SINGLE_CTA_CUTOFF)) {

                        // The problem size is too small to warrant a two-level reduction: 
                        // use only one stream-processor
                        max_grid_size = 1;

                } else {

                        if (_device_sm_version <= 120) {
                                
                                // G80/G90
                                max_grid_size = _device_props.multiProcessorCount * 4;
                                
                        } else if (_device_sm_version < 200) {
                                
                                // GT200 (has some kind of TLB or icache drama)
                                int orig_max_grid_size = _device_props.multiProcessorCount * B40C_RADIXSORT_SCAN_SCATTER_CTA_OCCUPANCY(_kernel_ptx_version);
                                if (_keys_only) { 
                                        orig_max_grid_size *= (_num_elements + (1024 * 1024 * 96) - 1) / (1024 * 1024 * 96);
                                } else {
                                        orig_max_grid_size *= (_num_elements + (1024 * 1024 * 64) - 1) / (1024 * 1024 * 64);
                                }
                                max_grid_size = orig_max_grid_size;

                                if (_num_elements / _cycle_elements > static_cast<unsigned int>(max_grid_size)) {

                                        double multiplier1 = 4.0;
                                        double multiplier2 = 16.0;

                                        double delta1 = 0.068;
                                        double delta2 = 0.127;

                                        int dividend = (_num_elements + _cycle_elements - 1) / _cycle_elements;

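                                        // The loop below (an inference from the code, not an
                                        // authors' comment) searches downward for a grid size
                                        // whose total cycle count stays away from integer
                                        // multiples of the CTA wave size; the multiplier/delta
                                        // constants appear to be empirically tuned to sidestep
                                        // the GT200 performance cliff noted above.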
                                        while (true) {

                                                double quotient = ((double) dividend) / (multiplier1 * max_grid_size);
                                                quotient -= (int) quotient;

                                                if ((quotient > delta1) && (quotient < 1 - delta1)) {

                                                        quotient = ((double) dividend) / (multiplier2 * max_grid_size / 3.0);
                                                        quotient -= (int) quotient;

                                                        if ((quotient > delta2) && (quotient < 1 - delta2)) {
                                                                break;
                                                        }
                                                }
                                                
                                                if (max_grid_size == orig_max_grid_size - 2) {
                                                        max_grid_size = orig_max_grid_size - 30;
                                                } else {
                                                        max_grid_size -= 1;
                                                }
                                        }
                                }
                        } else {
                                
                                // GF100
                                max_grid_size = 418;
                        }
                }
        }

        // Calculate the actual number of threadblocks to launch.  Initially
        // assume that each threadblock will do only one cycle_elements worth 
        // of work, but then clamp it by the "max" restriction derived above
        // in order to accommodate the "single-sp" and "saturated" cases.

        int grid_size = _num_elements / _cycle_elements;
        if (grid_size == 0) {
                grid_size = 1;
        }
        if (grid_size > max_grid_size) {
                grid_size = max_grid_size;
        } 

        return grid_size;
}



template <typename K, typename V>
bool BaseRadixSortingEnactor<K, V>::
CanFit() 
{
        long long bytes = (_num_elements * sizeof(K) * 2) + (_spine_elements * sizeof(int));
        if (!_keys_only) bytes += _num_elements * sizeof(V) * 2;

        if (_device_props.totalGlobalMem < 1024 * 1024 * 513) {
                return (bytes < ((double) _device_props.totalGlobalMem) * 0.81);        // allow up to 81% of capacity on ~512MB cards
        }
        
        return (bytes < ((double) _device_props.totalGlobalMem) * 0.89);        // allow up to 89% of capacity otherwise
}
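
// Worked example (illustrative, not from the original source): sorting 100M
// unsigned int keys paired with float values needs roughly
// (100e6 * 4 * 2) key bytes + (100e6 * 4 * 2) value bytes, i.e., ~1.6 GB plus
// a small spine, so CanFit() would reject such a problem on a 1 GB device.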



template <typename K, typename V>
template <int PASS, int RADIX_BITS, int BIT, typename PreprocessFunctor, typename PostprocessFunctor>
cudaError_t BaseRadixSortingEnactor<K, V>::
DigitPlacePass(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
{
        int threads = B40C_RADIXSORT_THREADS;
        int dynamic_smem;

        cudaFuncAttributes reduce_kernel_attrs, scan_scatter_attrs;
        cudaFuncGetAttributes(&reduce_kernel_attrs, RakingReduction<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor>);
        cudaFuncGetAttributes(&scan_scatter_attrs, ScanScatterDigits<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor, PostprocessFunctor>);
        
        //
        // Counting Reduction
        //

        // Run tesla flush kernel if we have two or more threadblocks for each of the SMs
        if ((_device_sm_version == 130) && (_work_decomposition.num_elements > static_cast<unsigned int>(_device_props.multiProcessorCount * _cycle_elements * 2))) { 
                FlushKernel<void><<<_grid_size, B40C_RADIXSORT_THREADS, scan_scatter_attrs.sharedSizeBytes>>>();
                synchronize_if_enabled("FlushKernel");
        }

        // GF100 and GT200 get the same smem allocation for every kernel launch (pad the reduction/top-level-scan kernels)
        dynamic_smem = (_kernel_ptx_version >= 130) ? scan_scatter_attrs.sharedSizeBytes - reduce_kernel_attrs.sharedSizeBytes : 0;

        RakingReduction<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor> <<<_grid_size, threads, dynamic_smem>>>(
                converted_storage.d_from_alt_storage,
                converted_storage.d_spine,
                converted_storage.d_keys,
                converted_storage.d_alt_keys,
                _work_decomposition);
        synchronize_if_enabled("RakingReduction");

        
        //
        // Spine
        //
        
        // GF100 and GT200 get the same smem allocation for every kernel launch (pad the reduction/top-level-scan kernels)
        dynamic_smem = (_kernel_ptx_version >= 130) ? scan_scatter_attrs.sharedSizeBytes - _spine_scan_kernel_attrs.sharedSizeBytes : 0;
        
        SrtsScanSpine<void><<<_grid_size, B40C_RADIXSORT_SPINE_THREADS, dynamic_smem>>>(
                converted_storage.d_spine,
                converted_storage.d_spine,
                _spine_elements);
        synchronize_if_enabled("SrtsScanSpine");

        
        //
        // Scanning Scatter
        //
        
        // Run tesla flush kernel if we have two or more threadblocks for each of the SMs
        if ((_device_sm_version == 130) && (_work_decomposition.num_elements > static_cast<unsigned int>(_device_props.multiProcessorCount * _cycle_elements * 2))) { 
                FlushKernel<void><<<_grid_size, B40C_RADIXSORT_THREADS, scan_scatter_attrs.sharedSizeBytes>>>();
                synchronize_if_enabled("FlushKernel");
        }

        ScanScatterDigits<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor, PostprocessFunctor> <<<_grid_size, threads, 0>>>(
                converted_storage.d_from_alt_storage,
                converted_storage.d_spine,
                converted_storage.d_keys,
                converted_storage.d_alt_keys,
                converted_storage.d_values,
                converted_storage.d_alt_values,
                _work_decomposition);
        synchronize_if_enabled("ScanScatterDigits");

        return cudaSuccess;
}



template <typename K, typename V>
cudaError_t BaseRadixSortingEnactor<K, V>::
EnactSort(RadixSortStorage<K, V> &problem_storage) 
{
        //
        // Allocate device memory for temporary storage (if necessary)
        //

        if (problem_storage.d_alt_keys == NULL) {
                cudaMalloc((void**) &problem_storage.d_alt_keys, _num_elements * sizeof(K));
        }
        if (!_keys_only && (problem_storage.d_alt_values == NULL)) {
                cudaMalloc((void**) &problem_storage.d_alt_values, _num_elements * sizeof(V));
        }
        if (problem_storage.d_spine == NULL) {
                cudaMalloc((void**) &problem_storage.d_spine, _spine_elements * sizeof(int));
        }
        if (problem_storage.d_from_alt_storage == NULL) {
                cudaMalloc((void**) &problem_storage.d_from_alt_storage, 2 * sizeof(bool));
        }

        // Determine suitable type of unsigned byte storage to use for keys 
        typedef typename KeyConversion<K>::UnsignedBits ConvertedKeyType;
        
        // Copy storage pointers to an appropriately typed structure 
        RadixSortStorage<ConvertedKeyType, V> converted_storage;
        memcpy(&converted_storage, &problem_storage, sizeof(RadixSortStorage<K, V>));

        // 
        // Enact the sorting operation
        //
        
        if (RADIXSORT_DEBUG) {
                
                printf("_device_sm_version: %d, _kernel_ptx_version: %d\n", _device_sm_version, _kernel_ptx_version);
                printf("Bottom-level reduction & scan kernels:\n\tgrid_size: %d, \n\tthreads: %d, \n\tcycle_elements: %d, \n\tnum_big_blocks: %d, \n\tbig_block_elements: %d, \n\tnormal_block_elements: %d\n\textra_elements_last_block: %d\n\n",
                        _grid_size, B40C_RADIXSORT_THREADS, _cycle_elements, _work_decomposition.num_big_blocks, _work_decomposition.big_block_elements, _work_decomposition.normal_block_elements, _work_decomposition.extra_elements_last_block);
                printf("Top-level spine scan:\n\tgrid_size: %d, \n\tthreads: %d, \n\tspine_block_elements: %d\n\n", 
                        _grid_size, B40C_RADIXSORT_SPINE_THREADS, _spine_elements);
        }

        cudaError_t retval = EnactDigitPlacePasses(converted_storage);

        
        //
        // Swizzle pointers if we left our sorted output in temp storage 
        //
        
        if (_swizzle_pointers_for_odd_passes) {
        
                cudaMemcpy(
                        &problem_storage.using_alternate_storage, 
                        &problem_storage.d_from_alt_storage[_passes & 0x1], 
                        sizeof(bool), 
                        cudaMemcpyDeviceToHost);
        
                if (problem_storage.using_alternate_storage) {
                        thrust::swap<K*>(problem_storage.d_keys, problem_storage.d_alt_keys);
                        if (!_keys_only) {
                                thrust::swap<V*>(problem_storage.d_values, problem_storage.d_alt_values);
                        }
                }
        }
        
        return retval;
}




/******************************************************************************
 * Sorting enactor classes
 ******************************************************************************/

/**
 * Generic sorting enactor class.  Simply create an instance of this class
 * with your key-type K (and optionally value-type V if sorting with satellite 
 * values).
 * 
 * Template specialization provides the appropriate enactor instance to handle 
 * the specified data types. 
 * 
 * @template-param K
 * 		Type of keys to be sorted
 *
 * @template-param V
 * 		Type of values to be sorted
 *
 * @template-param ConvertedKeyType
 * 		Leave as default to effect necessary enactor specialization
 */
template <typename K, typename V = KeysOnlyType, typename ConvertedKeyType = typename KeyConversion<K>::UnsignedBits>
class RadixSortingEnactor;
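
/*
 * For example (an illustrative note assuming the usual KeyConversion
 * mappings in the kernel headers): RadixSortingEnactor<float> converts its
 * keys to unsigned int and therefore selects the 32-bit specialization
 * below, while RadixSortingEnactor<double, int> converts to
 * unsigned long long and selects the 64-bit specialization.
 */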



/**
 * Sorting enactor that is specialized for 8-bit key types
 */
template <typename K, typename V>
class RadixSortingEnactor<K, V, unsigned char> : public BaseRadixSortingEnactor<K, V>
{
protected:

        typedef BaseRadixSortingEnactor<K, V> Base; 
        typedef typename Base::ConvertedKeyType ConvertedKeyType;

        cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
        {
                Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor<K>,      NopFunctor<ConvertedKeyType> >(converted_storage);
                Base::template DigitPlacePass<1, 4, 4, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> >    (converted_storage); 

                return cudaSuccess;
        }

public:
        
        /**
         * Constructor.
         * 
         * @param[in] 	num_elements 
         * 		Length (in elements) of the input to a sorting operation
         * 
         * @param[in] 	max_grid_size  
         * 		Maximum allowable number of CTAs to launch.  The default value of 0 indicates 
         * 		that the dispatch logic should select an appropriate value for the target device.
         */	
        RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(2, 4, num_elements, max_grid_size) {}

};



/**
 * Sorting enactor that is specialized for 16-bit key types
 */
template <typename K, typename V>
class RadixSortingEnactor<K, V, unsigned short> : public BaseRadixSortingEnactor<K, V>
{
protected:

        typedef BaseRadixSortingEnactor<K, V> Base; 
        typedef typename Base::ConvertedKeyType ConvertedKeyType;

        cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
        {
                Base::template DigitPlacePass<0, 4, 0,  PreprocessKeyFunctor<K>,      NopFunctor<ConvertedKeyType> >(converted_storage);
                Base::template DigitPlacePass<1, 4, 4,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<2, 4, 8,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<3, 4, 12, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> >    (converted_storage); 

                return cudaSuccess;
        }

public:
        
        /**
         * Constructor.
         * 
         * @param[in] 	num_elements 
         * 		Length (in elements) of the input to a sorting operation
         * 
         * @param[in] 	max_grid_size  
         * 		Maximum allowable number of CTAs to launch.  The default value of 0 indicates 
         * 		that the dispatch logic should select an appropriate value for the target device.
         */	
        RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(4, 4, num_elements, max_grid_size) {}

};


/**
 * Sorting enactor that is specialized for 32-bit key types
 */
template <typename K, typename V>
class RadixSortingEnactor<K, V, unsigned int> : public BaseRadixSortingEnactor<K, V>
{
protected:

        typedef BaseRadixSortingEnactor<K, V> Base; 
        typedef typename Base::ConvertedKeyType ConvertedKeyType;

        cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
        {
                Base::template DigitPlacePass<0, 4, 0,  PreprocessKeyFunctor<K>,      NopFunctor<ConvertedKeyType> >(converted_storage);
                Base::template DigitPlacePass<1, 4, 4,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<2, 4, 8,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<3, 4, 12, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<4, 4, 16, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<5, 4, 20, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<6, 4, 24, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<7, 4, 28, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> >    (converted_storage); 

                return cudaSuccess;
        }

public:
        
        /**
         * Constructor.
         * 
         * @param[in] 	num_elements 
         * 		Length (in elements) of the input to a sorting operation
         * 
         * @param[in] 	max_grid_size  
         * 		Maximum allowable number of CTAs to launch.  The default value of 0 indicates 
         * 		that the dispatch logic should select an appropriate value for the target device.
         */	
        RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(8, 4, num_elements, max_grid_size) {}

};



/**
 * Sorting enactor that is specialized for 64-bit key types
 */
template <typename K, typename V>
class RadixSortingEnactor<K, V, unsigned long long> : public BaseRadixSortingEnactor<K, V>
{
protected:

        typedef BaseRadixSortingEnactor<K, V> Base; 
        typedef typename Base::ConvertedKeyType ConvertedKeyType;

        cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
        {
                Base::template DigitPlacePass<0,  4, 0,  PreprocessKeyFunctor<K>,      NopFunctor<ConvertedKeyType> >(converted_storage);
                Base::template DigitPlacePass<1,  4, 4,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<2,  4, 8,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<3,  4, 12, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<4,  4, 16, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<5,  4, 20, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<6,  4, 24, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<7,  4, 28, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<8,  4, 32, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
                Base::template DigitPlacePass<9,  4, 36, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<10, 4, 40, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<11, 4, 44, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<12, 4, 48, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<13, 4, 52, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<14, 4, 56, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<15, 4, 60, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> >    (converted_storage); 

                return cudaSuccess;
        }

public:
        
        /**
         * Constructor.
         * 
         * @param[in] 	num_elements 
         * 		Length (in elements) of the input to a sorting operation
         * 
         * @param[in] 	max_grid_size  
         * 		Maximum allowable number of CTAs to launch.  The default value of 0 indicates 
         * 		that the dispatch logic should select an appropriate value for the target device.
         */	
        RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(16, 4, num_elements, max_grid_size) {}

};


} // end namespace b40c_thrust
} // end namespace detail
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust