/******************************************************************************
 * Copyright 2010 Duane Merrill
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. 
 * 
 * 
 * AUTHORS' REQUEST: 
 * 
 *		If you use|reference|benchmark this code, please cite our Technical 
 *		Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
 * 
 *		@TechReport{ Merrill:Sorting:2010,
 *		author = "Duane Merrill and Andrew Grimshaw",
 *		title = "Revisiting Sorting for GPGPU Stream Architectures",
 *		year = "2010",
 *		institution = "University of Virginia, Department of Computer Science",
 *		address = "Charlottesville, VA, USA",
 *		number = "CS2010-03"
 *		}
 * 
 * For more information, see our Google Code project site: 
 * http://code.google.com/p/back40computing/
 * 
 * Thanks!
 ******************************************************************************/


/******************************************************************************
 * Radix Sorting API
 *
 * USAGE:
 * 
 * Using the B40C radix sorting implementation is easy.  Just #include this API 
 * file and its kernel include dependencies within your source.  Below are two
 * usage examples:
 *
 * (1) A keys-only example for sorting floats:
 * 
 *		// Create storage-management structure
 *		RadixSortStorage<float> device_storage(d_float_keys);
 *
 *		// Create and enact sorter
 *		RadixSortingEnactor<float> sorter(d_float_keys_len);
 *		sorter.EnactSort(device_storage);
 *
 *		// Re-acquire pointer to sorted keys, free unused/temp storage 
 *		d_float_keys = device_storage.d_keys;
 *		device_storage.CleanupTempStorage();
 *
 * (2) And a key-value example for sorting ints paired with doubles:
 *
 *		// Create storage-management structure
 *		RadixSortStorage<int, double> device_storage(d_int_keys, d_double_values);
 *
 *		// Create and enact sorter
 *		RadixSortingEnactor<int, double> sorter(d_int_keys_len);
 *		sorter.EnactSort(device_storage);
 *
 *		// Re-acquire pointers to sorted keys and values, free unused/temp storage 
 *		d_int_keys = device_storage.d_keys;
 *		d_double_values = device_storage.d_values;
 *		device_storage.CleanupTempStorage();
 *
 ******************************************************************************/

#pragma once

#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 
#include <math.h> 
#include <float.h>

#include "radixsort_reduction_kernel.h"
#include "radixsort_spine_kernel.h"
#include "radixsort_scanscatter_kernel.h"

#include <thrust/swap.h>

namespace thrust  {
namespace system  {
namespace cuda    {
namespace detail  {
namespace detail  {
namespace b40c_thrust   {

/******************************************************************************
 * Debugging options
 ******************************************************************************/

static bool RADIXSORT_DEBUG = false;



/******************************************************************************
 * Structures for managing device-side sorting state
 ******************************************************************************/

/**
 * Sorting storage-management structure for device vectors
 */
template <typename K, typename V = KeysOnlyType>
struct RadixSortStorage {

        // Device vector of keys to sort
        K* d_keys;
        
        // Device vector of values to sort
        V* d_values;

        // Ancillary device vector for key storage 
        K* d_alt_keys;

        // Ancillary device vector for value storage
        V* d_alt_values;

        // Temporary device storage needed for radix sorting histograms
        int *d_spine;
        
        // Flip-flopping temporary device storage denoting which input source each 
        // digit-place pass should read from (i.e., false if reading from d_keys, 
        // true if reading from d_alt_keys)
        bool *d_from_alt_storage;

        // Host-side boolean indicating whether an odd number of sorting passes left the 
        // results in alternate storage.  If so, the d_keys (and d_values) pointers 
        // will have been swapped with the d_alt_keys (and d_alt_values) pointers in order to 
        // point to the final results.
        bool using_alternate_storage;
        
        // Constructor
        RadixSortStorage(K* keys = NULL, V* values = NULL) 
        { 
                d_keys = keys; 
                d_values = values; 
                d_alt_keys = NULL; 
                d_alt_values = NULL; 
                d_spine = NULL;
                d_from_alt_storage = NULL;
                
                using_alternate_storage = false;
        }

        // Clean up non-results storage (may include freeing original storage if 
        // primary pointers were swizzled as per using_alternate_storage) 
        cudaError_t CleanupTempStorage() 
        {
                if (d_alt_keys) cudaFree(d_alt_keys);
                if (d_alt_values) cudaFree(d_alt_values);
                if (d_spine) cudaFree(d_spine);
                if (d_from_alt_storage) cudaFree(d_from_alt_storage);
                
                return cudaSuccess;
        }
};
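
/*
 * Re-use sketch (an illustrative addition, not from the original authors):
 * callers that sort many same-sized problems can allocate the temporary
 * fields once and re-use them, instead of letting EnactSort() allocate on
 * every call.  The sizes below follow the allocations performed in
 * EnactSort() further down; `sorter` is assumed to be an already-constructed
 * RadixSortingEnactor, and `num_elements`/`d_keys` are caller-provided.
 *
 *		RadixSortStorage<unsigned int> storage(d_keys);
 *		cudaMalloc((void**) &storage.d_alt_keys,         num_elements * sizeof(unsigned int));
 *		cudaMalloc((void**) &storage.d_spine,            sorter.SpineElements() * sizeof(int));
 *		cudaMalloc((void**) &storage.d_from_alt_storage, 2 * sizeof(bool));
 */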


/******************************************************************************
 * Base class for sorting enactors
 ******************************************************************************/


/**
 * Base class for SRTS radix sorting enactors.
 */
template <typename K, typename V>
class BaseRadixSortingEnactor 
{
public:
        
        // Unsigned integer type suitable for radix sorting of keys
        typedef typename KeyConversion<K>::UnsignedBits ConvertedKeyType;

protected:

        //
        // Information about our problem configuration
        //
        
        bool                    _keys_only;
        unsigned int            _num_elements;
        int                     _cycle_elements;
        int                     _spine_elements;
        int                     _grid_size;
        CtaDecomposition        _work_decomposition;
        int                     _passes;
        bool                    _swizzle_pointers_for_odd_passes;

        // Information about our target device
        cudaDeviceProp          _device_props;
        int                     _device_sm_version;
        
        // Information about our kernel assembly
        int                     _kernel_ptx_version;
        cudaFuncAttributes      _spine_scan_kernel_attrs;
        
protected:
        
        /**
         * Constructor.
         */
        BaseRadixSortingEnactor(int passes, int max_radix_bits, unsigned int num_elements, int max_grid_size, bool swizzle_pointers_for_odd_passes = true); 
        
        /**
         * Heuristic for determining the number of CTAs to launch.
         *   
         * @param[in] 	max_grid_size  
         * 		Maximum allowable number of CTAs to launch.  A value of 0 indicates 
         * 		that the default value should be used.
         * 
         * @return The actual number of CTAs that should be launched
         */
        int GridSize(int max_grid_size);

        /**
         * Performs a distribution sorting pass over a single digit place
         */
        template <int PASS, int RADIX_BITS, int BIT, typename PreprocessFunctor, typename PostprocessFunctor>
        cudaError_t DigitPlacePass(const RadixSortStorage<ConvertedKeyType, V> &converted_storage); 
        
        /**
         * Enacts a sorting operation by performing the appropriate 
         * digit-place passes.  To be overloaded by specialized subclasses.
         */
        virtual cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage) = 0;
        
public:
        
        /**
         * Returns the length (in ints) of the device vector needed for  
         * temporary storage of the reduction spine.  Useful if pre-allocating 
         * your own device storage (as opposed to letting EnactSort() allocate it
         * for you).
         */
        int SpineElements() { return _spine_elements; }

        /**
         * Returns whether or not the problem will fit on the device.
         */
        bool CanFit();

        /**
         * Enacts a radix sorting operation on the specified device data.
         * 
         * IMPORTANT NOTES: The device storage backing the specified input vectors of 
         * keys (and data) will be modified.  (I.e., treat this as an in-place sort.)  
         * 
         * Additionally, the pointers in the problem_storage structure may be updated 
         * (a) depending upon the number of digit-place sorting passes needed, and (b) 
         * whether or not the caller has already allocated temporary storage.  
         * 
         * The sorted results will always be referenced by problem_storage.d_keys (and 
         * problem_storage.d_values).  However, for an odd number of sorting passes (uncommon)
         * these results will actually be backed by the storage initially allocated for 
         * problem_storage.d_alt_keys (and problem_storage.d_alt_values).  If so, 
         * problem_storage.d_alt_keys and problem_storage.d_alt_values will be updated to 
         * reference the original problem_storage.d_keys and problem_storage.d_values in order 
         * to facilitate cleanup.  
         * 
         * This means it is important to avoid keeping stale copies of device pointers 
         * to keys/data; you will want to re-reference the pointers in problem_storage.
         * 
         * @param[in/out] 	problem_storage 
         * 		Device vectors of keys and values to sort, and ancillary storage 
         * 		needed by the sorting kernels.  See the IMPORTANT NOTES above. 
         * 
         * 		The problem_storage.[d_alt_keys|d_alt_values|d_spine] fields are 
         * 		temporary storage needed by the sorting kernels.  To facilitate 
         * 		speed, callers are welcome to re-use this storage for same-sized 
         * 		(or smaller) sorting problems.  If NULL, these storage vectors will be 
         * 		allocated by this routine (and must be subsequently cuda-freed by 
         * 		the caller).
         *
         * @return cudaSuccess on success, error enumeration otherwise
         */
        cudaError_t EnactSort(RadixSortStorage<K, V> &problem_storage); 

        /*
         * Destructor
         */
        virtual ~BaseRadixSortingEnactor() {}
};
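
/*
 * Illustrative end-to-end sketch (inferred from the interface above, not part
 * of the original documentation): check device capacity with CanFit() before
 * enacting, then re-acquire the possibly-swizzled result pointer.
 *
 *		RadixSortingEnactor<unsigned int> sorter(num_elements);
 *		RadixSortStorage<unsigned int>    storage(d_keys);
 *
 *		if (sorter.CanFit() && (sorter.EnactSort(storage) == cudaSuccess)) {
 *			d_keys = storage.d_keys;	// may now point at former alt storage
 *		}
 *		storage.CleanupTempStorage();
 */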



template <typename K, typename V>
BaseRadixSortingEnactor<K, V>::BaseRadixSortingEnactor(
        int passes, 
        int max_radix_bits, 
        unsigned int num_elements, 
        int max_grid_size,
        bool swizzle_pointers_for_odd_passes) 
{
        //
        // Get current device properties 
        //

        int current_device;
        cudaGetDevice(&current_device);
        cudaGetDeviceProperties(&_device_props, current_device);
        _device_sm_version = _device_props.major * 100 + _device_props.minor * 10;

        
        //
        // Get SM version of compiled kernel assembly
        //
        cudaFuncGetAttributes(&_spine_scan_kernel_attrs, SrtsScanSpine<void>);
        _kernel_ptx_version = _spine_scan_kernel_attrs.ptxVersion * 10;
        

        //
        // Determine number of CTAs to launch, shared memory, cycle elements, etc.
        //

        _passes                          = passes;
        _num_elements                    = num_elements;
        _keys_only                       = IsKeysOnly<V>();
        _cycle_elements                  = B40C_RADIXSORT_CYCLE_ELEMENTS(_kernel_ptx_version, ConvertedKeyType, V);
        _grid_size                       = GridSize(max_grid_size);
        _swizzle_pointers_for_odd_passes = swizzle_pointers_for_odd_passes;
        
        int total_cycles              = _num_elements / _cycle_elements;
        unsigned int cycles_per_block = total_cycles / _grid_size;
        unsigned int extra_cycles     = total_cycles - (cycles_per_block * _grid_size);

        CtaDecomposition work_decomposition = {
                extra_cycles,                                      // num_big_blocks
                (cycles_per_block + 1) * _cycle_elements,          // big_block_elements
                cycles_per_block * _cycle_elements,                // normal_block_elements
                _num_elements - (total_cycles * _cycle_elements),  // extra_elements_last_block
                _num_elements};                                    // num_elements
        
        _work_decomposition = work_decomposition;
        
        int spine_cycles = ((_grid_size * (1 << max_radix_bits)) + B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS - 1) / B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS;
        _spine_elements = spine_cycles * B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS;
}
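
// Worked example of the decomposition above (illustrative numbers, not from
// the original source): with num_elements = 1,000,000, _cycle_elements = 512
// and _grid_size = 128: total_cycles = 1953, cycles_per_block = 15 and
// extra_cycles = 33.  The first 33 CTAs ("big blocks") each process
// (15 + 1) * 512 = 8192 elements, the remaining 95 CTAs process
// 15 * 512 = 7680, and the final 1,000,000 - (1953 * 512) = 64 elements are
// handled as extra_elements_last_block.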



template <typename K, typename V>
int BaseRadixSortingEnactor<K, V>::GridSize(int max_grid_size)
{
        const int SINGLE_CTA_CUTOFF = 0;                // right now zero; we have no single-cta sorting

        // find maximum number of threadblocks if "use-default"
        if (max_grid_size == 0) {

                if (_num_elements <= static_cast<unsigned int>(SINGLE_CTA_CUTOFF)) {

                        // The problem size is too small to warrant a two-level reduction: 
                        // use only one stream-processor
                        max_grid_size = 1;

                } else {

                        if (_device_sm_version <= 120) {
                                
                                // G80/G90
                                max_grid_size = _device_props.multiProcessorCount * 4;
                                
                        } else if (_device_sm_version < 200) {
                                
                                // GT200 (has some kind of TLB or icache drama)
                                int orig_max_grid_size = _device_props.multiProcessorCount * B40C_RADIXSORT_SCAN_SCATTER_CTA_OCCUPANCY(_kernel_ptx_version);
                                if (_keys_only) { 
                                        orig_max_grid_size *= (_num_elements + (1024 * 1024 * 96) - 1) / (1024 * 1024 * 96);
                                } else {
                                        orig_max_grid_size *= (_num_elements + (1024 * 1024 * 64) - 1) / (1024 * 1024 * 64);
                                }
                                max_grid_size = orig_max_grid_size;

                                if (_num_elements / _cycle_elements > static_cast<unsigned int>(max_grid_size)) {

                                        double multiplier1 = 4.0;
                                        double multiplier2 = 16.0;

                                        double delta1 = 0.068;
                                        double delta2 = 0.127;

                                        int dividend = (_num_elements + _cycle_elements - 1) / _cycle_elements;

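                                        // The loop below (an inference from the code, not an
                                        // authors' comment) searches downward for a grid size
                                        // whose total cycle count stays away from integer
                                        // multiples of the CTA wave size; the multiplier/delta
                                        // constants appear to be empirically tuned to sidestep
                                        // the GT200 performance cliff noted above.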
                                        while (true) {

                                                double quotient = ((double) dividend) / (multiplier1 * max_grid_size);
                                                quotient -= (int) quotient;

                                                if ((quotient > delta1) && (quotient < 1 - delta1)) {

                                                        quotient = ((double) dividend) / (multiplier2 * max_grid_size / 3.0);
                                                        quotient -= (int) quotient;

                                                        if ((quotient > delta2) && (quotient < 1 - delta2)) {
                                                                break;
                                                        }
                                                }
                                                
                                                if (max_grid_size == orig_max_grid_size - 2) {
                                                        max_grid_size = orig_max_grid_size - 30;
                                                } else {
                                                        max_grid_size -= 1;
                                                }
                                        }
                                }
                        } else {
                                
                                // GF100
                                max_grid_size = 418;
                        }
                }
        }

        // Calculate the actual number of threadblocks to launch.  Initially
        // assume that each threadblock will do only one cycle_elements worth 
        // of work, but then clamp it by the "max" restriction derived above
        // in order to accommodate the "single-sp" and "saturated" cases.

        int grid_size = _num_elements / _cycle_elements;
        if (grid_size == 0) {
                grid_size = 1;
        }
        if (grid_size > max_grid_size) {
                grid_size = max_grid_size;
        } 

        return grid_size;
}



template <typename K, typename V>
bool BaseRadixSortingEnactor<K, V>::
CanFit() 
{
        long long bytes = (_num_elements * sizeof(K) * 2) + (_spine_elements * sizeof(int));
        if (!_keys_only) bytes += _num_elements * sizeof(V) * 2;

        if (_device_props.totalGlobalMem < 1024 * 1024 * 513) {
                return (bytes < ((double) _device_props.totalGlobalMem) * 0.81);        // allow up to 81% of capacity on ~512MB cards
        }
        
        return (bytes < ((double) _device_props.totalGlobalMem) * 0.89);        // allow up to 89% of capacity otherwise
}
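
// Worked example (illustrative, not from the original source): sorting 100M
// unsigned int keys paired with float values needs roughly
// (100e6 * 4 * 2) key bytes + (100e6 * 4 * 2) value bytes, i.e., ~1.6 GB plus
// a small spine, so CanFit() would reject such a problem on a 1 GB device.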



template <typename K, typename V>
template <int PASS, int RADIX_BITS, int BIT, typename PreprocessFunctor, typename PostprocessFunctor>
cudaError_t BaseRadixSortingEnactor<K, V>::
DigitPlacePass(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
{
        int threads = B40C_RADIXSORT_THREADS;
        int dynamic_smem;

        cudaFuncAttributes reduce_kernel_attrs, scan_scatter_attrs;
        cudaFuncGetAttributes(&reduce_kernel_attrs, RakingReduction<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor>);
        cudaFuncGetAttributes(&scan_scatter_attrs, ScanScatterDigits<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor, PostprocessFunctor>);
        
        //
        // Counting Reduction
        //

        // Run tesla flush kernel if we have two or more threadblocks for each of the SMs
        if ((_device_sm_version == 130) && (_work_decomposition.num_elements > static_cast<unsigned int>(_device_props.multiProcessorCount * _cycle_elements * 2))) { 
                FlushKernel<void><<<_grid_size, B40C_RADIXSORT_THREADS, scan_scatter_attrs.sharedSizeBytes>>>();
                synchronize_if_enabled("FlushKernel");
        }

        // GF100 and GT200 get the same smem allocation for every kernel launch (pad the reduction/top-level-scan kernels)
        dynamic_smem = (_kernel_ptx_version >= 130) ? scan_scatter_attrs.sharedSizeBytes - reduce_kernel_attrs.sharedSizeBytes : 0;

        RakingReduction<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor> <<<_grid_size, threads, dynamic_smem>>>(
                converted_storage.d_from_alt_storage,
                converted_storage.d_spine,
                converted_storage.d_keys,
                converted_storage.d_alt_keys,
                _work_decomposition);
        synchronize_if_enabled("RakingReduction");

        
        //
        // Spine
        //
        
        // GF100 and GT200 get the same smem allocation for every kernel launch (pad the reduction/top-level-scan kernels)
        dynamic_smem = (_kernel_ptx_version >= 130) ? scan_scatter_attrs.sharedSizeBytes - _spine_scan_kernel_attrs.sharedSizeBytes : 0;
        
        SrtsScanSpine<void><<<_grid_size, B40C_RADIXSORT_SPINE_THREADS, dynamic_smem>>>(
                converted_storage.d_spine,
                converted_storage.d_spine,
                _spine_elements);
        synchronize_if_enabled("SrtsScanSpine");

        
        //
        // Scanning Scatter
        //
        
        // Run tesla flush kernel if we have two or more threadblocks for each of the SMs
        if ((_device_sm_version == 130) && (_work_decomposition.num_elements > static_cast<unsigned int>(_device_props.multiProcessorCount * _cycle_elements * 2))) { 
                FlushKernel<void><<<_grid_size, B40C_RADIXSORT_THREADS, scan_scatter_attrs.sharedSizeBytes>>>();
                synchronize_if_enabled("FlushKernel");
        }

        ScanScatterDigits<ConvertedKeyType, V, PASS, RADIX_BITS, BIT, PreprocessFunctor, PostprocessFunctor> <<<_grid_size, threads, 0>>>(
                converted_storage.d_from_alt_storage,
                converted_storage.d_spine,
                converted_storage.d_keys,
                converted_storage.d_alt_keys,
                converted_storage.d_values,
                converted_storage.d_alt_values,
                _work_decomposition);
        synchronize_if_enabled("ScanScatterDigits");

        return cudaSuccess;
}



template <typename K, typename V>
cudaError_t BaseRadixSortingEnactor<K, V>::
EnactSort(RadixSortStorage<K, V> &problem_storage) 
{
        //
        // Allocate device memory for temporary storage (if necessary)
        //

        if (problem_storage.d_alt_keys == NULL) {
                cudaMalloc((void**) &problem_storage.d_alt_keys, _num_elements * sizeof(K));
        }
        if (!_keys_only && (problem_storage.d_alt_values == NULL)) {
                cudaMalloc((void**) &problem_storage.d_alt_values, _num_elements * sizeof(V));
        }
        if (problem_storage.d_spine == NULL) {
                cudaMalloc((void**) &problem_storage.d_spine, _spine_elements * sizeof(int));
        }
        if (problem_storage.d_from_alt_storage == NULL) {
                cudaMalloc((void**) &problem_storage.d_from_alt_storage, 2 * sizeof(bool));
        }

        // Determine suitable type of unsigned byte storage to use for keys 
        typedef typename KeyConversion<K>::UnsignedBits ConvertedKeyType;
        
        // Copy storage pointers to an appropriately typed structure 
        RadixSortStorage<ConvertedKeyType, V> converted_storage;
        memcpy(&converted_storage, &problem_storage, sizeof(RadixSortStorage<K, V>));

        // 
        // Enact the sorting operation
        //
        
        if (RADIXSORT_DEBUG) {
                
                printf("_device_sm_version: %d, _kernel_ptx_version: %d\n", _device_sm_version, _kernel_ptx_version);
                printf("Bottom-level reduction & scan kernels:\n\tgrid_size: %d, \n\tthreads: %d, \n\tcycle_elements: %d, \n\tnum_big_blocks: %d, \n\tbig_block_elements: %d, \n\tnormal_block_elements: %d\n\textra_elements_last_block: %d\n\n",
                        _grid_size, B40C_RADIXSORT_THREADS, _cycle_elements, _work_decomposition.num_big_blocks, _work_decomposition.big_block_elements, _work_decomposition.normal_block_elements, _work_decomposition.extra_elements_last_block);
                printf("Top-level spine scan:\n\tgrid_size: %d, \n\tthreads: %d, \n\tspine_block_elements: %d\n\n", 
                        _grid_size, B40C_RADIXSORT_SPINE_THREADS, _spine_elements);
        }

        cudaError_t retval = EnactDigitPlacePasses(converted_storage);

        
        //
        // Swizzle pointers if we left our sorted output in temp storage 
        //
        
        if (_swizzle_pointers_for_odd_passes) {
        
                cudaMemcpy(
                        &problem_storage.using_alternate_storage, 
                        &problem_storage.d_from_alt_storage[_passes & 0x1], 
                        sizeof(bool), 
                        cudaMemcpyDeviceToHost);
        
                if (problem_storage.using_alternate_storage) {
                        thrust::swap<K*>(problem_storage.d_keys, problem_storage.d_alt_keys);
                        if (!_keys_only) {
                                thrust::swap<V*>(problem_storage.d_values, problem_storage.d_alt_values);
                        }
                }
        }
        
        return retval;
}




/******************************************************************************
 * Sorting enactor classes
 ******************************************************************************/

/**
 * Generic sorting enactor class.  Simply create an instance of this class
 * with your key-type K (and optionally value-type V if sorting with satellite 
 * values).
 * 
 * Template specialization provides the appropriate enactor instance to handle 
 * the specified data types. 
 * 
 * @template-param K
 * 		Type of keys to be sorted
 *
 * @template-param V
 * 		Type of values to be sorted
 *
 * @template-param ConvertedKeyType
 * 		Leave as default to effect necessary enactor specialization
 */
template <typename K, typename V = KeysOnlyType, typename ConvertedKeyType = typename KeyConversion<K>::UnsignedBits>
class RadixSortingEnactor;
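
/*
 * For example (an illustrative note assuming the usual KeyConversion
 * mappings in the kernel headers): RadixSortingEnactor<float> converts its
 * keys to unsigned int and therefore selects the 32-bit specialization
 * below, while RadixSortingEnactor<double, int> converts to
 * unsigned long long and selects the 64-bit specialization.
 */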



/**
 * Sorting enactor that is specialized for 8-bit key types
 */
template <typename K, typename V>
class RadixSortingEnactor<K, V, unsigned char> : public BaseRadixSortingEnactor<K, V>
{
protected:

        typedef BaseRadixSortingEnactor<K, V> Base; 
        typedef typename Base::ConvertedKeyType ConvertedKeyType;

        cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
        {
                Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor<K>,      NopFunctor<ConvertedKeyType> >(converted_storage);
                Base::template DigitPlacePass<1, 4, 4, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> >    (converted_storage); 

                return cudaSuccess;
        }

public:
        
        /**
         * Constructor.
         * 
         * @param[in] 	num_elements 
         * 		Length (in elements) of the input to a sorting operation
         * 
         * @param[in] 	max_grid_size  
         * 		Maximum allowable number of CTAs to launch.  The default value of 0 indicates 
         * 		that the dispatch logic should select an appropriate value for the target device.
         */	
        RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(2, 4, num_elements, max_grid_size) {}

};



/**
 * Sorting enactor that is specialized for 16-bit key types
 */
template <typename K, typename V>
class RadixSortingEnactor<K, V, unsigned short> : public BaseRadixSortingEnactor<K, V>
{
protected:

        typedef BaseRadixSortingEnactor<K, V> Base; 
        typedef typename Base::ConvertedKeyType ConvertedKeyType;

        cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
        {
                Base::template DigitPlacePass<0, 4, 0,  PreprocessKeyFunctor<K>,      NopFunctor<ConvertedKeyType> >(converted_storage);
                Base::template DigitPlacePass<1, 4, 4,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<2, 4, 8,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<3, 4, 12, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> >    (converted_storage); 

                return cudaSuccess;
        }

public:
        
        /**
         * Constructor.
         * 
         * @param[in] 	num_elements 
         * 		Length (in elements) of the input to a sorting operation
         * 
         * @param[in] 	max_grid_size  
         * 		Maximum allowable number of CTAs to launch.  The default value of 0 indicates 
         * 		that the dispatch logic should select an appropriate value for the target device.
         */	
        RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(4, 4, num_elements, max_grid_size) {}

};


/**
 * Sorting enactor that is specialized for 32-bit key types
 */
template <typename K, typename V>
class RadixSortingEnactor<K, V, unsigned int> : public BaseRadixSortingEnactor<K, V>
{
protected:

        typedef BaseRadixSortingEnactor<K, V> Base; 
        typedef typename Base::ConvertedKeyType ConvertedKeyType;

        cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
        {
                Base::template DigitPlacePass<0, 4, 0,  PreprocessKeyFunctor<K>,      NopFunctor<ConvertedKeyType> >(converted_storage);
                Base::template DigitPlacePass<1, 4, 4,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<2, 4, 8,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<3, 4, 12, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<4, 4, 16, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<5, 4, 20, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<6, 4, 24, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<7, 4, 28, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> >    (converted_storage); 

                return cudaSuccess;
        }

public:
        
        /**
         * Constructor.
         * 
         * @param[in] 	num_elements 
         * 		Length (in elements) of the input to a sorting operation
         * 
         * @param[in] 	max_grid_size  
         * 		Maximum allowable number of CTAs to launch.  The default value of 0 indicates 
         * 		that the dispatch logic should select an appropriate value for the target device.
         */	
        RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(8, 4, num_elements, max_grid_size) {}

};



/**
 * Sorting enactor that is specialized for 64-bit key types
 */
template <typename K, typename V>
class RadixSortingEnactor<K, V, unsigned long long> : public BaseRadixSortingEnactor<K, V>
{
protected:

        typedef BaseRadixSortingEnactor<K, V> Base; 
        typedef typename Base::ConvertedKeyType ConvertedKeyType;

        cudaError_t EnactDigitPlacePasses(const RadixSortStorage<ConvertedKeyType, V> &converted_storage)
        {
                Base::template DigitPlacePass<0,  4, 0,  PreprocessKeyFunctor<K>,      NopFunctor<ConvertedKeyType> >(converted_storage);
                Base::template DigitPlacePass<1,  4, 4,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<2,  4, 8,  NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<3,  4, 12, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<4,  4, 16, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<5,  4, 20, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<6,  4, 24, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<7,  4, 28, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<8,  4, 32, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage);
                Base::template DigitPlacePass<9,  4, 36, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<10, 4, 40, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<11, 4, 44, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<12, 4, 48, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<13, 4, 52, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<14, 4, 56, NopFunctor<ConvertedKeyType>, NopFunctor<ConvertedKeyType> >(converted_storage); 
                Base::template DigitPlacePass<15, 4, 60, NopFunctor<ConvertedKeyType>, PostprocessKeyFunctor<K> >    (converted_storage); 

                return cudaSuccess;
        }

public:
        
        /**
         * Constructor.
         * 
         * @param[in] 	num_elements 
         * 		Length (in elements) of the input to a sorting operation
         * 
         * @param[in] 	max_grid_size  
         * 		Maximum allowable number of CTAs to launch.  The default value of 0 indicates 
         * 		that the dispatch logic should select an appropriate value for the target device.
         */	
        RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(16, 4, num_elements, max_grid_size) {}

};


} // end namespace b40c_thrust
} // end namespace detail
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust