1 /******************************************************************************
2 * Copyright 2010 Duane Merrill
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 * If you use|reference|benchmark this code, please cite our Technical
22 * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
24 * @TechReport{ Merrill:Sorting:2010,
25 * author = "Duane Merrill and Andrew Grimshaw",
26 * title = "Revisiting Sorting for GPGPU Stream Architectures",
28 * institution = "University of Virginia, Department of Computer Science",
29 * address = "Charlottesville, VA, USA",
30 * number = "CS2010-03"
33 * For more information, see our Google Code project site:
34 * http://code.google.com/p/back40computing/
37 ******************************************************************************/
41 /******************************************************************************
46 * Using the B40C radix sorting implementation is easy. Just #include this API
47 * file and its kernel include dependencies within your source. Below are two examples:
50 * (1) A keys-only example for sorting floats:
52 * // Create storage-management structure
53 * RadixSortStorage<float> device_storage(d_float_keys);
55 * // Create and enact sorter
56 * RadixSortingEnactor<float> sorter(d_float_keys_len);
57 * sorter.EnactSort(device_storage);
59 * // Re-acquire pointer to sorted keys, free unused/temp storage
60 * d_float_keys = device_storage.d_keys;
61 * device_storage.CleanupTempStorage();
63 * (2) And a key-value example for sorting ints paired with doubles:
65 * // Create storage-management structure
66 * RadixSortStorage<int, double> device_storage(d_int_keys, d_double_values);
68 * // Create and enact sorter
69 * RadixSortingEnactor<int, double> sorter(d_int_keys_len);
70 * sorter.EnactSort(device_storage);
72 * // Re-acquire pointer to sorted keys and values, free unused/temp storage
73 * d_int_keys = device_storage.d_keys;
74 * d_double_values = device_storage.d_values;
75 * device_storage.CleanupTempStorage();
78 ******************************************************************************/
88 #include "radixsort_reduction_kernel.h"
89 #include "radixsort_spine_kernel.h"
90 #include "radixsort_scanscatter_kernel.h"
92 #include <thrust/swap.h>
99 namespace b40c_thrust {
102 /******************************************************************************
104 ******************************************************************************/
106 static bool RADIXSORT_DEBUG = false;
110 /******************************************************************************
111 * Structures for managing device-side sorting state
112 ******************************************************************************/
115 * Sorting storage-management structure for device vectors
117 template <typename K, typename V = KeysOnlyType>
118 struct RadixSortStorage {
120 // Device vector of keys to sort
123 // Device vector of values to sort
126 // Ancillary device vector for key storage
129 // Ancillary device vector for value storage
132 // Temporary device storage needed for radix sorting histograms
135 // Flip-flopping temporary device storage denoting which digit place
136 // pass should read from which input source (i.e., false if reading from
137 // keys, true if reading from alternate_keys)
138 bool *d_from_alt_storage;
140 // Host-side boolean whether or not an odd number of sorting passes left the
141 // results in alternate storage. If so, the d_keys (and d_values) pointers
142 // will have been swapped with the d_alt_keys (and d_alt_values) pointers in order to
143 // point to the final results.
144 bool using_alternate_storage;
// Constructor: wraps the caller's device key (and optional value) vectors;
// ancillary/temp pointers start NULL so EnactSort() knows to allocate them.
147 RadixSortStorage(K* keys = NULL, V* values = NULL)
154 d_from_alt_storage = NULL;
156 using_alternate_storage = false;
159 // Clean up non-results storage (may include freeing original storage if
160 // primary pointers were swizzled as per using_alternate_storage)
// NOTE(review): frees only the ancillary/temp vectors below; d_keys and
// d_values (the sorted results) are deliberately left for the caller.
161 cudaError_t CleanupTempStorage()
163 if (d_alt_keys) cudaFree(d_alt_keys);
164 if (d_alt_values) cudaFree(d_alt_values);
165 if (d_spine) cudaFree(d_spine);
166 if (d_from_alt_storage) cudaFree(d_from_alt_storage);
174 /******************************************************************************
175 * Base class for sorting enactors
176 ******************************************************************************/
180 * Base class for SRTS radix sorting enactors.
182 template <typename K, typename V>
183 class BaseRadixSortingEnactor
187 // Unsigned integer type suitable for radix sorting of keys
188 typedef typename KeyConversion<K>::UnsignedBits ConvertedKeyType;
193 // Information about our problem configuration
197 unsigned int _num_elements;
201 CtaDecomposition _work_decomposition;
203 bool _swizzle_pointers_for_odd_passes;
205 // Information about our target device
206 cudaDeviceProp _device_props;
207 int _device_sm_version;
209 // Information about our kernel assembly
210 int _kernel_ptx_version;
211 cudaFuncAttributes _spine_scan_kernel_attrs;
// Protected constructor: configures pass count, radix width, problem size,
// and the grid-sizing policy on behalf of the width-specialized subclasses.
218 BaseRadixSortingEnactor(int passes, int radix_bits, unsigned int num_elements, int max_grid_size, bool swizzle_pointers_for_odd_passes = true);
221 * Heuristic for determining the number of CTAs to launch.
223 * @param[in] max_grid_size
224 * Maximum allowable number of CTAs to launch. A value of 0 indicates
225 * that the default value should be used.
227 * @return The actual number of CTAs that should be launched
229 int GridSize(int max_grid_size);
232 * Performs a distribution sorting pass over a single digit place
234 template <int PASS, int RADIX_BITS, int BIT, typename PreprocessFunctor, typename PostprocessFunctor>
235 cudaError_t DigitPlacePass(const RadixSortStorage<ConvertedKeyType, V> &converted_storage);
238 * Enacts a sorting operation by performing the appropriate
239 * digit-place passes. To be overloaded by specialized subclasses.
241 virtual cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage) = 0;
246 * Returns the length (in unsigned ints) of the device vector needed for
247 * temporary storage of the reduction spine. Useful if pre-allocating
248 * your own device storage (as opposed to letting EnactSort() allocate it
251 int SpineElements() { return _spine_elements; }
254 * Returns whether or not the problem will fit on the device.
259 * Enacts a radix sorting operation on the specified device data.
261 * IMPORTANT NOTES: The device storage backing the specified input vectors of
262 * keys (and data) will be modified. (I.e., treat this as an in-place sort.)
264 * Additionally, the pointers in the problem_storage structure may be updated
265 * (a) depending upon the number of digit-place sorting passes needed, and (b)
266 * whether or not the caller has already allocated temporary storage.
268 * The sorted results will always be referenced by problem_storage.d_keys (and
269 * problem_storage.d_values). However, for an odd number of sorting passes (uncommon)
270 * these results will actually be backed by the storage initially allocated for
271 * by problem_storage.d_alt_keys (and problem_storage.d_alt_values). If so,
272 * problem_storage.d_alt_keys and problem_storage.d_alt_values will be updated to
273 * reference the original problem_storage.d_keys and problem_storage.d_values in order
274 * to facilitate cleanup.
276 * This means it is important to avoid keeping stale copies of device pointers
277 * to keys/data; you will want to re-reference the pointers in problem_storage.
279 * @param[in/out] problem_storage
280 * Device vectors of keys and values to sort, and ancillary storage
281 * needed by the sorting kernels. See the IMPORTANT NOTES above.
283 * The problem_storage.[d_alt_keys|d_alt_values|d_spine] fields are
284 * temporary storage needed by the sorting kernels. To facilitate
285 * speed, callers are welcome to re-use this storage for same-sized
286 * (or smaller) sorting problems. If NULL, these storage vectors will be
287 * allocated by this routine (and must be subsequently cuda-freed by
290 * @return cudaSuccess on success, error enumeration otherwise
292 cudaError_t EnactSort(RadixSortStorage<K, V> &problem_storage);
297 virtual ~BaseRadixSortingEnactor() {}
302 template <typename K, typename V>
303 BaseRadixSortingEnactor<K, V>::BaseRadixSortingEnactor(
306 unsigned int num_elements,
308 bool swizzle_pointers_for_odd_passes)
311 // Get current device properties
// Bug fix: "&current_device" had been corrupted to "¤t_device" — the
// "&curren" prefix was mangled into the U+00A4 currency sign by an
// HTML-entity substitution. Restored the address-of argument.
315 cudaGetDevice(&current_device);
316 cudaGetDeviceProperties(&_device_props, current_device);
317 _device_sm_version = _device_props.major * 100 + _device_props.minor * 10;
321 // Get SM version of compiled kernel assembly
323 cudaFuncGetAttributes(&_spine_scan_kernel_attrs, SrtsScanSpine<void>);
324 _kernel_ptx_version = _spine_scan_kernel_attrs.ptxVersion * 10;
328 // Determine number of CTAs to launch, shared memory, cycle elements, etc.
332 _num_elements = num_elements;
333 _keys_only = IsKeysOnly<V>();
334 _cycle_elements = B40C_RADIXSORT_CYCLE_ELEMENTS(_kernel_ptx_version , ConvertedKeyType, V);
335 _grid_size = GridSize(max_grid_size);
336 _swizzle_pointers_for_odd_passes = swizzle_pointers_for_odd_passes;
// Decompose the input into per-CTA work: "big" blocks take one extra cycle
// of elements so the remainder cycles are spread across the first CTAs.
338 int total_cycles = _num_elements / _cycle_elements;
339 unsigned int cycles_per_block = total_cycles / _grid_size;
340 unsigned int extra_cycles = total_cycles - (cycles_per_block * _grid_size);
342 CtaDecomposition work_decomposition = {
343 extra_cycles, // num_big_blocks
344 (cycles_per_block + 1) * _cycle_elements, // big_block_elements
345 cycles_per_block * _cycle_elements, // normal_block_elements
346 _num_elements - (total_cycles * _cycle_elements), // extra_elements_last_block
347 _num_elements}; // num_elements
349 _work_decomposition = work_decomposition;
// Spine length: one counter per (CTA, digit) pair, rounded up to a whole
// number of spine-scan cycles.
351 int spine_cycles = ((_grid_size * (1 << max_radix_bits)) + B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS - 1) / B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS;
352 _spine_elements = spine_cycles * B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS;
357 template <typename K, typename V>
358 int BaseRadixSortingEnactor<K, V>::GridSize(int max_grid_size)
360 const int SINGLE_CTA_CUTOFF = 0; // right now zero; we have no single-cta sorting
362 // find maximum number of threadblocks if "use-default"
363 if (max_grid_size == 0) {
365 if (_num_elements <= static_cast<unsigned int>(SINGLE_CTA_CUTOFF)) {
367 // The problem size is too small to warrant a two-level reduction:
368 // use only one stream-processor
// SM 1.2 and below: modest oversubscription of 4 CTAs per SM.
373 if (_device_sm_version <= 120) {
376 max_grid_size = _device_props.multiProcessorCount * 4;
378 } else if (_device_sm_version < 200) {
380 // GT200 (has some kind of TLB or icache drama)
381 int orig_max_grid_size = _device_props.multiProcessorCount * B40C_RADIXSORT_SCAN_SCATTER_CTA_OCCUPANCY(_kernel_ptx_version);
383 orig_max_grid_size *= (_num_elements + (1024 * 1024 * 96) - 1) / (1024 * 1024 * 96);
385 orig_max_grid_size *= (_num_elements + (1024 * 1024 * 64) - 1) / (1024 * 1024 * 64);
387 max_grid_size = orig_max_grid_size;
389 if (_num_elements / _cycle_elements > static_cast<unsigned int>(max_grid_size)) {
// Empirically-derived fudge factors: nudge the grid size away from
// cycle-count remainders that perform pathologically on GT200.
391 double multiplier1 = 4.0;
392 double multiplier2 = 16.0;
394 double delta1 = 0.068;
395 double delta2 = 0.127;
397 int dividend = (_num_elements + _cycle_elements - 1) / _cycle_elements;
401 double quotient = ((double) dividend) / (multiplier1 * max_grid_size);
402 quotient -= (int) quotient; // keep only the fractional part
404 if ((quotient > delta1) && (quotient < 1 - delta1)) {
406 quotient = ((double) dividend) / (multiplier2 * max_grid_size / 3.0);
407 quotient -= (int) quotient;
409 if ((quotient > delta2) && (quotient < 1 - delta2)) {
414 if (max_grid_size == orig_max_grid_size - 2) {
415 max_grid_size = orig_max_grid_size - 30;
429 // Calculate the actual number of threadblocks to launch. Initially
430 // assume that each threadblock will do only one cycle_elements worth
431 // of work, but then clamp it by the "max" restriction derived above
432 // in order to accommodate the "single-sp" and "saturated" cases.
434 int grid_size = _num_elements / _cycle_elements;
435 if (grid_size == 0) {
438 if (grid_size > max_grid_size) {
439 grid_size = max_grid_size;
447 template <typename K, typename V>
448 bool BaseRadixSortingEnactor<K, V>::
// NOTE(review): the method-name line is elided from this listing; this body is
// the device-memory capacity check ("will the problem fit?") — confirm name.
// Budget: double-buffered keys (and values, if any) plus the int spine.
451 long long bytes = (_num_elements * sizeof(K) * 2) + (_spine_elements * sizeof(int));
452 if (!_keys_only) bytes += _num_elements * sizeof(V) * 2;
454 if (_device_props.totalGlobalMem < 1024 * 1024 * 513) {
455 return (bytes < ((double) _device_props.totalGlobalMem) * 0.81); // allow up to 81% capacity for 512MB
458 return (bytes < ((double) _device_props.totalGlobalMem) * 0.89); // allow up to 89% capacity
463 template <typename K, typename V>
464 template <int PASS, int RADIX_BITS, int BIT, typename PreprocessFunctor, typename PostprocessFunctor>
465 cudaError_t BaseRadixSortingEnactor<K, V>::
466 DigitPlacePass(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
468 int threads = B40C_RADIXSORT_THREADS;
// Query per-kernel attributes so shared-memory padding can be computed below.
471 cudaFuncAttributes reduce_kernel_attrs, scan_scatter_attrs;
472 cudaFuncGetAttributes(&reduce_kernel_attrs, RakingReduction<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor>);
473 cudaFuncGetAttributes(&scan_scatter_attrs, ScanScatterDigits<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor, PostprocessFunctor>);
476 // Counting Reduction
479 // Run tesla flush kernel if we have two or more threadblocks for each of the SMs
480 if ((_device_sm_version == 130) && (_work_decomposition.num_elements > static_cast<unsigned int>(_device_props.multiProcessorCount * _cycle_elements * 2))) {
481 FlushKernel<void><<<_grid_size, B40C_RADIXSORT_THREADS, scan_scatter_attrs.sharedSizeBytes>>>();
482 synchronize_if_enabled("FlushKernel");
485 // GF100 and GT200 get the same smem allocation for every kernel launch (pad the reduction/top-level-scan kernels)
486 dynamic_smem = (_kernel_ptx_version >= 130) ? scan_scatter_attrs.sharedSizeBytes - reduce_kernel_attrs.sharedSizeBytes : 0;
488 RakingReduction<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor> <<<_grid_size, threads, dynamic_smem>>>(
489 converted_storage.d_from_alt_storage,
490 converted_storage.d_spine,
491 converted_storage.d_keys,
492 converted_storage.d_alt_keys,
493 _work_decomposition);
494 synchronize_if_enabled("RakingReduction");
// Top-level spine scan: prefix-sums the digit histograms in place
// (d_spine is passed as both the input and the output argument).
501 // GF100 and GT200 get the same smem allocation for every kernel launch (pad the reduction/top-level-scan kernels)
502 dynamic_smem = (_kernel_ptx_version >= 130) ? scan_scatter_attrs.sharedSizeBytes - _spine_scan_kernel_attrs.sharedSizeBytes : 0;
504 SrtsScanSpine<void><<<_grid_size, B40C_RADIXSORT_SPINE_THREADS, dynamic_smem>>>(
505 converted_storage.d_spine,
506 converted_storage.d_spine,
508 synchronize_if_enabled("SrtsScanSpine");
// Bottom-level scan/scatter: distributes keys (and values) to their
// digit-partitioned destinations using the scanned spine offsets.
515 // Run tesla flush kernel if we have two or more threadblocks for each of the SMs
516 if ((_device_sm_version == 130) && (_work_decomposition.num_elements > static_cast<unsigned int>(_device_props.multiProcessorCount * _cycle_elements * 2))) {
517 FlushKernel<void><<<_grid_size, B40C_RADIXSORT_THREADS, scan_scatter_attrs.sharedSizeBytes>>>();
518 synchronize_if_enabled("FlushKernel");
521 ScanScatterDigits<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor, PostprocessFunctor> <<<_grid_size, threads, 0>>>(
522 converted_storage.d_from_alt_storage,
523 converted_storage.d_spine,
524 converted_storage.d_keys,
525 converted_storage.d_alt_keys,
526 converted_storage.d_values,
527 converted_storage.d_alt_values,
528 _work_decomposition);
529 synchronize_if_enabled("ScanScatterDigits");
536 template <typename K, typename V>
537 cudaError_t BaseRadixSortingEnactor<K, V>::
538 EnactSort(RadixSortStorage<K, V> &problem_storage)
541 // Allocate device memory for temporary storage (if necessary)
544 if (problem_storage.d_alt_keys == NULL) {
545 cudaMalloc((void**) &problem_storage.d_alt_keys, _num_elements * sizeof(K));
547 if (!_keys_only && (problem_storage.d_alt_values == NULL)) {
548 cudaMalloc((void**) &problem_storage.d_alt_values, _num_elements * sizeof(V));
550 if (problem_storage.d_spine == NULL) {
551 cudaMalloc((void**) &problem_storage.d_spine, _spine_elements * sizeof(int));
553 if (problem_storage.d_from_alt_storage == NULL) {
554 cudaMalloc((void**) &problem_storage.d_from_alt_storage, 2 * sizeof(bool));
557 // Determine suitable type of unsigned byte storage to use for keys
558 typedef typename KeyConversion<K>::UnsignedBits ConvertedKeyType;
560 // Copy storage pointers to an appropriately typed structure
561 RadixSortStorage<ConvertedKeyType, V> converted_storage;
562 memcpy(&converted_storage, &problem_storage, sizeof(RadixSortStorage<K, V>));
565 // Enact the sorting operation
568 if (RADIXSORT_DEBUG) {
570 printf("_device_sm_version: %d, _kernel_ptx_version: %d\n", _device_sm_version, _kernel_ptx_version);
571 printf("Bottom-level reduction & scan kernels:\n\tgrid_size: %d, \n\tthreads: %d, \n\tcycle_elements: %d, \n\tnum_big_blocks: %d, \n\tbig_block_elements: %d, \n\tnormal_block_elements: %d\n\textra_elements_last_block: %d\n\n",
572 _grid_size, B40C_RADIXSORT_THREADS, _cycle_elements, _work_decomposition.num_big_blocks, _work_decomposition.big_block_elements, _work_decomposition.normal_block_elements, _work_decomposition.extra_elements_last_block);
573 printf("Top-level spine scan:\n\tgrid_size: %d, \n\tthreads: %d, \n\tspine_block_elements: %d\n\n",
574 _grid_size, B40C_RADIXSORT_SPINE_THREADS, _spine_elements);
577 cudaError_t retval = EnactDigitPlacePasses(converted_storage);
581 // Swizzle pointers if we left our sorted output in temp storage
584 if (_swizzle_pointers_for_odd_passes) {
// NOTE(review): the opening of a cudaMemcpy(...) call is elided from this
// listing; the lines below are its dst/src arguments, copying the
// device-side flag for the final pass (_passes & 0x1) back to the host.
587 &problem_storage.using_alternate_storage,
588 &problem_storage.d_from_alt_storage[_passes & 0x1],
590 cudaMemcpyDeviceToHost);
// If the final pass wrote to alternate storage, swap the pointer pairs so
// d_keys/d_values always reference the sorted results.
592 if (problem_storage.using_alternate_storage) {
593 thrust::swap<K*>(problem_storage.d_keys, problem_storage.d_alt_keys);
595 thrust::swap<V*>(problem_storage.d_values, problem_storage.d_alt_values);
607 /******************************************************************************
608 * Sorting enactor classes
609 ******************************************************************************/
612 * Generic sorting enactor class. Simply create an instance of this class
613 * with your key-type K (and optionally value-type V if sorting with satellite
616 * Template specialization provides the appropriate enactor instance to handle
617 * the specified data types.
620 * Type of keys to be sorted
623 * Type of values to be sorted.
625 * @template-param ConvertedKeyType
626 * Leave as default to effect necessary enactor specialization.
// Forward declaration only; the partial specializations below are selected by
// the byte width of the converted (unsigned) key type.
628 template <typename K, typename V = KeysOnlyType, typename ConvertedKeyType = typename KeyConversion<K>::UnsignedBits>
629 class RadixSortingEnactor;
634 * Sorting enactor that is specialized for 8-bit key types
636 template <typename K, typename V>
637 class RadixSortingEnactor<K, V, unsigned char> : public BaseRadixSortingEnactor<K, V>
641 typedef BaseRadixSortingEnactor<K, V> Base;
642 typedef typename Base::ConvertedKeyType ConvertedKeyType;
// Two 4-bit digit-place passes cover all 8 key bits; the first preprocesses
// keys into sortable unsigned form, the last postprocesses them back.
644 cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
646 Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor<K>, NopFunctor<ConvertedKeyType> >(converted_storage);
647 Base::template DigitPlacePass<1, 4, 4, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> > (converted_storage);
657 * @param[in] num_elements
658 * Length (in elements) of the input to a sorting operation
660 * @param[in] max_grid_size
661 * Maximum allowable number of CTAs to launch. The default value of 0 indicates
662 * that the dispatch logic should select an appropriate value for the target device.
664 RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(2, 4, num_elements, max_grid_size) {}
671 * Sorting enactor that is specialized for 16-bit key types
673 template <typename K, typename V>
674 class RadixSortingEnactor<K, V, unsigned short> : public BaseRadixSortingEnactor<K, V>
678 typedef BaseRadixSortingEnactor<K, V> Base;
679 typedef typename Base::ConvertedKeyType ConvertedKeyType;
// Four 4-bit digit-place passes cover all 16 key bits; pre/postprocessing
// happens only on the first/last pass respectively.
681 cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
683 Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor<K>, NopFunctor<ConvertedKeyType> >(converted_storage);
684 Base::template DigitPlacePass<1, 4, 4, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
685 Base::template DigitPlacePass<2, 4, 8, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
686 Base::template DigitPlacePass<3, 4, 12, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> > (converted_storage);
696 * @param[in] num_elements
697 * Length (in elements) of the input to a sorting operation
699 * @param[in] max_grid_size
700 * Maximum allowable number of CTAs to launch. The default value of 0 indicates
701 * that the dispatch logic should select an appropriate value for the target device.
703 RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(4, 4, num_elements, max_grid_size) {}
709 * Sorting enactor that is specialized for 32-bit key types
711 template <typename K, typename V>
712 class RadixSortingEnactor<K, V, unsigned int> : public BaseRadixSortingEnactor<K, V>
716 typedef BaseRadixSortingEnactor<K, V> Base;
717 typedef typename Base::ConvertedKeyType ConvertedKeyType;
// Eight 4-bit digit-place passes cover all 32 key bits; pre/postprocessing
// happens only on the first/last pass respectively.
719 cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
721 Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor<K>, NopFunctor<ConvertedKeyType> >(converted_storage);
722 Base::template DigitPlacePass<1, 4, 4, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
723 Base::template DigitPlacePass<2, 4, 8, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
724 Base::template DigitPlacePass<3, 4, 12, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
725 Base::template DigitPlacePass<4, 4, 16, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
726 Base::template DigitPlacePass<5, 4, 20, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
727 Base::template DigitPlacePass<6, 4, 24, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
728 Base::template DigitPlacePass<7, 4, 28, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> > (converted_storage);
738 * @param[in] num_elements
739 * Length (in elements) of the input to a sorting operation
741 * @param[in] max_grid_size
742 * Maximum allowable number of CTAs to launch. The default value of 0 indicates
743 * that the dispatch logic should select an appropriate value for the target device.
745 RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(8, 4, num_elements, max_grid_size) {}
752 * Sorting enactor that is specialized for 64-bit key types
754 template <typename K, typename V>
755 class RadixSortingEnactor<K, V, unsigned long long> : public BaseRadixSortingEnactor<K, V>
759 typedef BaseRadixSortingEnactor<K, V> Base;
760 typedef typename Base::ConvertedKeyType ConvertedKeyType;
// Sixteen 4-bit digit-place passes cover all 64 key bits; pre/postprocessing
// happens only on the first/last pass respectively.
762 cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
764 Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor<K>, NopFunctor<ConvertedKeyType> >(converted_storage);
765 Base::template DigitPlacePass<1, 4, 4, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
766 Base::template DigitPlacePass<2, 4, 8, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
767 Base::template DigitPlacePass<3, 4, 12, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
768 Base::template DigitPlacePass<4, 4, 16, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
769 Base::template DigitPlacePass<5, 4, 20, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
770 Base::template DigitPlacePass<6, 4, 24, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
771 Base::template DigitPlacePass<7, 4, 28, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
772 Base::template DigitPlacePass<8, 4, 32, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
773 Base::template DigitPlacePass<9, 4, 36, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
774 Base::template DigitPlacePass<10, 4, 40, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
775 Base::template DigitPlacePass<11, 4, 44, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
776 Base::template DigitPlacePass<12, 4, 48, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
777 Base::template DigitPlacePass<13, 4, 52, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
778 Base::template DigitPlacePass<14, 4, 56, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
779 Base::template DigitPlacePass<15, 4, 60, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> > (converted_storage);
789 * @param[in] num_elements
790 * Length (in elements) of the input to a sorting operation
792 * @param[in] max_grid_size
793 * Maximum allowable number of CTAs to launch. The default value of 0 indicates
794 * that the dispatch logic should select an appropriate value for the target device.
796 RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(16, 4, num_elements, max_grid_size) {}
801 } // end namespace b40c_thrust
802 } // end namespace detail
803 } // end namespace detail
804 } // end namespace cuda
805 } // end namespace system
806 } // end namespace thrust