2 /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
6 * The source code and/or documentation ("Licensed Deliverables") are
7 * subject to NVIDIA intellectual property rights under U.S. and
8 * international Copyright laws.
10 * The Licensed Deliverables contained herein are PROPRIETARY and
11 * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12 * conditions of a form of NVIDIA software license agreement by and
13 * between NVIDIA and Licensee ("License Agreement") or electronically
14 * accepted by Licensee. Notwithstanding any terms or conditions to
15 * the contrary in the License Agreement, reproduction or disclosure
16 * of the Licensed Deliverables to any third party without the express
17 * written consent of NVIDIA is prohibited.
19 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32 * OF THESE LICENSED DELIVERABLES.
34 * U.S. Government End Users. These Licensed Deliverables are a
35 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36 * 1995), consisting of "commercial computer software" and "commercial
37 * computer software documentation" as such terms are used in 48
38 * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39 * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41 * U.S. Government End Users acquire the Licensed Deliverables with
42 * only those rights set forth herein.
44 * Any use of the Licensed Deliverables in individual and commercial
45 * software must include, in the user documentation and internal
46 * comments to the code, the above Disclaimer and U.S. Government End
51 #if !defined(CURAND_KERNEL_H_)
52 #define CURAND_KERNEL_H_
55 * \defgroup DEVICE Device API
60 #if !defined(QUALIFIERS)
61 #define QUALIFIERS static __forceinline__ __device__
64 #include "curand_discrete.h"
65 #include "curand_precalc.h"
66 #include "curand_mrg32k3a.h"
67 #include "curand_mtgp32_kernel.h"
70 #include "curand_philox4x32_x.h"
71 #include "curand_globals.h"
77 /* This generator uses the formula:
78 x_n = x_(n-1) + 1 mod 2^32
79 x_0 = (unsigned int)seed * 3
80 Subsequences are spaced 31337 steps apart.
82 struct curandStateTest {
/* NOTE(review): struct members elided in this dump -- the curand_init /
   curand overloads below use a single counter `state->v`; confirm against
   the full header. */
86 /** \cond UNHIDE_TYPEDEFS */
87 typedef struct curandStateTest curandStateTest_t;
90 /* XORSHIFT FAMILY RNGs */
91 /* These generators are a family proposed by Marsaglia. They keep state
92 in 32 bit chunks, then use repeated shift and xor operations to scramble
93 the bits. The following generators are a combination of a simple Weyl
94 generator with an N variable XORSHIFT generator.
98 /* This generator uses the xorwow formula of
99 www.jstatsoft.org/v08/i14/paper page 5
100 Has period 2^192 - 2^32.
103 * CURAND XORWOW state
105 struct curandStateXORWOW;
108 * Implementation details not in reference documentation */
109 struct curandStateXORWOW {
/* Weyl sequence value `d` plus the five 32-bit xorshift state words. */
110 unsigned int d, v[5];
/* NOTE(review): `int boxmuller_flag;` appears elided here (it is assigned
   by _curand_init_scratch below) -- confirm against the full header. */
/* Cached second Box-Muller sample and its validity flag (double variant),
   plus the float/double spare values. */
112 int boxmuller_flag_double;
113 float boxmuller_extra;
114 double boxmuller_extra_double;
118 * CURAND XORWOW state
120 /** \cond UNHIDE_TYPEDEFS */
121 typedef struct curandStateXORWOW curandStateXORWOW_t;
127 /* Combined Multiple Recursive Generators */
128 /* These generators are a family proposed by L'Ecuyer. They keep state
129 in sets of doubles, then use repeated modular arithmetic multiply operations
130 to scramble the bits in each set, and combine the result.
134 /* This generator uses the MRG32k3A formula of
135 http://www.iro.umontreal.ca/~lecuyer/myftp/streams00/c++/streams4.pdf
139 /* moduli for the recursions */
140 /** \cond UNHIDE_DEFINES */
141 #define MRG32K3A_MOD1 4294967087.
142 #define MRG32K3A_MOD2 4294944443.
144 /* Constants used in generation */
146 #define MRG32K3A_A12 1403580.
147 #define MRG32K3A_A13N 810728.
148 #define MRG32K3A_A21 527612.
149 #define MRG32K3A_A23N 1370589.
150 #define MRG32K3A_NORM 2.328306549295728e-10
152 // #define MRG32K3A_BITS_NORM ((double)((POW32_DOUBLE-1.0)/MOD1))
153 // above constant, used verbatim, rounds differently on some host systems.
154 #define MRG32K3A_BITS_NORM 1.000000048662
157 /* Constants for address manipulation */
159 #define MRG32K3A_SKIPUNITS_DOUBLES (sizeof(struct sMRG32k3aSkipUnits)/sizeof(double))
160 #define MRG32K3A_SKIPSUBSEQ_DOUBLES (sizeof(struct sMRG32k3aSkipSubSeq)/sizeof(double))
161 #define MRG32K3A_SKIPSEQ_DOUBLES (sizeof(struct sMRG32k3aSkipSeq)/sizeof(double))
168 * CURAND MRG32K3A state
170 struct curandStateMRG32k3a;
172 /* Implementation details not in reference documentation */
173 struct curandStateMRG32k3a {
/* NOTE(review): the two recurrence state vectors (used below as
   `double s1[3]` / `double s2[3]`) and `int boxmuller_flag;` appear elided
   in this dump -- confirm against the full header. */
177 int boxmuller_flag_double;
178 float boxmuller_extra;
179 double boxmuller_extra_double;
183 * CURAND MRG32K3A state
185 /** \cond UNHIDE_TYPEDEFS */
186 typedef struct curandStateMRG32k3a curandStateMRG32k3a_t;
191 * CURAND Sobol32 state
193 struct curandStateSobol32;
195 /* Implementation details not in reference documentation */
196 struct curandStateSobol32 {
/* i: sequence index; x: current draw; c: base value restored by skipahead
   (see the Sobol32 skipahead template below). */
197 unsigned int i, x, c;
198 unsigned int direction_vectors[32];
202 * CURAND Sobol32 state
204 /** \cond UNHIDE_TYPEDEFS */
205 typedef struct curandStateSobol32 curandStateSobol32_t;
209 * CURAND Scrambled Sobol32 state
211 struct curandStateScrambledSobol32;
213 /* Implementation details not in reference documentation */
214 struct curandStateScrambledSobol32 {
/* Same layout as curandStateSobol32; scrambling is applied elsewhere. */
215 unsigned int i, x, c;
216 unsigned int direction_vectors[32];
220 * CURAND Scrambled Sobol32 state
222 /** \cond UNHIDE_TYPEDEFS */
223 typedef struct curandStateScrambledSobol32 curandStateScrambledSobol32_t;
227 * CURAND Sobol64 state
229 struct curandStateSobol64;
231 /* Implementation details not in reference documentation */
232 struct curandStateSobol64 {
/* 64-bit variant: 64 direction vectors of 64 bits each. */
233 unsigned long long i, x, c;
234 unsigned long long direction_vectors[64];
238 * CURAND Sobol64 state
240 /** \cond UNHIDE_TYPEDEFS */
241 typedef struct curandStateSobol64 curandStateSobol64_t;
245 * CURAND Scrambled Sobol64 state
247 struct curandStateScrambledSobol64;
249 /* Implementation details not in reference documentation */
250 struct curandStateScrambledSobol64 {
251 unsigned long long i, x, c;
252 unsigned long long direction_vectors[64];
256 * CURAND Scrambled Sobol64 state
258 /** \cond UNHIDE_TYPEDEFS */
259 typedef struct curandStateScrambledSobol64 curandStateScrambledSobol64_t;
265 /** \cond UNHIDE_TYPEDEFS */
/* Default generator type is XORWOW. */
266 typedef struct curandStateXORWOW curandState_t;
267 typedef struct curandStateXORWOW curandState;
270 /****************************************************************************/
271 /* Utility functions needed by RNGs */
272 /****************************************************************************/
273 /** \cond UNHIDE_UTILITIES */
275 multiply vector by matrix, store in result
276 matrix is n x n, measured in 32 bit units
277 matrix is stored in row major order
278 vector and result cannot be same pointer
280 QUALIFIERS void __curand_matvec(unsigned int *vector, unsigned int *matrix,
281 unsigned int *result, int n)
283 for(int i = 0; i < n; i++) {
286 for(int i = 0; i < n; i++) {
287 for(int j = 0; j < 32; j++) {
288 if(vector[i] & (1 << j)) {
289 for(int k = 0; k < n; k++) {
290 result[k] ^= matrix[n * (i * 32 + j) + k];
297 /* generate identity matrix */
/* Builds the (n*32) x (n*32) GF(2) identity, stored as n*32 rows of n
   32-bit words. NOTE(review): the declaration of `r` and the row/column
   selection test are elided in this dump; code kept as-is -- confirm
   against the full header. */
298 QUALIFIERS void __curand_matidentity(unsigned int *matrix, int n)
301 for(int i = 0; i < n * 32; i++) {
302 for(int j = 0; j < n; j++) {
/* Diagonal word gets the single bit r; all other words are zero. */
305 matrix[i * n + j] = (1 << r);
307 matrix[i * n + j] = 0;
313 /* multiply matrixA by matrixB, store back in matrixA
314 matrixA and matrixB must not be same matrix */
315 QUALIFIERS void __curand_matmat(unsigned int *matrixA, unsigned int *matrixB, int n)
317 unsigned int result[MAX_XOR_N];
318 for(int i = 0; i < n * 32; i++) {
319 __curand_matvec(matrixA + i * n, matrixB, result, n);
320 for(int j = 0; j < n; j++) {
321 matrixA[i * n + j] = result[j];
326 /* copy vectorA to vector */
327 QUALIFIERS void __curand_veccopy(unsigned int *vector, unsigned int *vectorA, int n)
329 for(int i = 0; i < n; i++) {
330 vector[i] = vectorA[i];
334 /* copy matrixA to matrix */
335 QUALIFIERS void __curand_matcopy(unsigned int *matrix, unsigned int *matrixA, int n)
337 for(int i = 0; i < n * n * 32; i++) {
338 matrix[i] = matrixA[i];
342 /* compute matrixA to power p, store result in matrix */
/* Square-and-multiply over GF(2) bit-matrices: matrix accumulates the
   product of matrixR^(2^k) factors for each set bit of p.
   NOTE(review): the while-loop / bit-test control lines are elided in
   this dump; code kept as-is. */
343 QUALIFIERS void __curand_matpow(unsigned int *matrix, unsigned int *matrixA,
344 unsigned long long p, int n)
346 unsigned int matrixR[MAX_XOR_N * MAX_XOR_N * 32];
347 unsigned int matrixS[MAX_XOR_N * MAX_XOR_N * 32];
/* Start from identity; matrixR holds successive squarings of matrixA. */
348 __curand_matidentity(matrix, n);
349 __curand_matcopy(matrixR, matrixA, n);
352 __curand_matmat(matrix, matrixR, n);
/* Square matrixR via the scratch copy matrixS. */
354 __curand_matcopy(matrixS, matrixR, n);
355 __curand_matmat(matrixR, matrixS, n);
360 /* Convert unsigned int to float, use no intrinsics */
/* NOTE(review): the bodies of these two portable (host-path) bit-cast
   helpers are elided in this dump; kept as-is. They are called by the
   __curand_*_as_* wrappers below when __CUDA_ARCH__ is not defined. */
361 QUALIFIERS float __curand_uint32AsFloat (unsigned int i)
371 /* Convert two unsigned ints to double, use no intrinsics */
372 QUALIFIERS double __curand_hilouint32AsDouble (unsigned int hi, unsigned int lo)
384 /* Convert unsigned int to float, as efficiently as possible */
385 QUALIFIERS float __curand_uint32_as_float(unsigned int x)
387 #if __CUDA_ARCH__ > 0
388 return __int_as_float(x);
389 #elif !defined(__CUDA_ARCH__)
390 return __curand_uint32AsFloat(x);
395 QUALIFIERS double __curand_hilouint32_as_double(unsigned int hi, unsigned int lo)
397 #if __CUDA_ARCH__ > 0
398 return __hiloint2double(hi, lo);
399 #elif !defined(__CUDA_ARCH__)
400 return hilouint32AsDouble(hi, lo);
405 /****************************************************************************/
406 /* Utility functions needed by MRG32k3a RNG */
407 /* Matrix operations modulo some integer less than 2**32, done in */
408 /* double precision floating point, with care not to overflow 53 bits */
409 /****************************************************************************/
411 /* return i mod m. */
412 /* assumes i and m are integers represented accurately in doubles */
/* Return i mod m, where i and m are integers represented exactly in
   doubles. NOTE(review): the computation of the remainder `rem` is elided
   in this dump; only the final negative-fixup line is visible. */
414 QUALIFIERS double curand_MRGmod(double i, double m)
/* Fold a negative remainder back into [0, m). */
420 if (rem < 0.0) rem += m;
424 /* Multiplication modulo m. Inputs i and j less than 2**32 */
425 /* Ensure intermediate results do not exceed 2**53 */
427 QUALIFIERS double curand_MRGmodMul(double i, double j, double m)
432 tempHi = floor(i/131072.0);
433 tempLo = i - (tempHi*131072.0);
434 tempLo = curand_MRGmod( curand_MRGmod( (tempHi * j), m) * 131072.0 + curand_MRGmod(tempLo * j, m),m);
436 if (tempLo < 0.0) tempLo += m;
440 /* multiply 3 by 3 matrices of doubles, modulo m */
442 QUALIFIERS void curand_MRGmatMul3x3(double i1[][3],double i2[][3],double o[][3],double m)
/* o = (i1 * i2) mod m, accumulated into a temporary so o may alias an
   input. NOTE(review): the declaration of `temp` and the i/j loop headers
   are elided in this dump; code kept as-is. */
448 temp[i][j] = ( curand_MRGmodMul(i1[i][0], i2[0][j], m) +
449 curand_MRGmodMul(i1[i][1], i2[1][j], m) +
450 curand_MRGmodMul(i1[i][2], i2[2][j], m));
451 temp[i][j] = curand_MRGmod( temp[i][j], m );
/* Copy the reduced product back to the output matrix. */
456 o[i][j] = temp[i][j];
461 /* multiply 3 by 3 matrix times 3 by 1 vector of doubles, modulo m */
463 QUALIFIERS void curand_MRGmatVecMul3x3( double i[][3], double v[], double m)
/* v = (i * v) mod m, computed into t[] first so v is read consistently.
   NOTE(review): the declarations of t/k and the copy-back body inside the
   second loop are elided in this dump; code kept as-is. */
467 for (k = 0; k < 3; k++) {
468 t[k] = ( curand_MRGmodMul(i[k][0], v[0], m) +
469 curand_MRGmodMul(i[k][1], v[1], m) +
470 curand_MRGmodMul(i[k][2], v[2], m) );
471 t[k] = curand_MRGmod( t[k], m );
473 for (k = 0; k < 3; k++) {
479 /* raise a 3 by 3 matrix of doubles to a 64 bit integer power pow, modulo m */
480 /* input is index zero of an array of 3 by 3 matrices m, */
481 /* each m = m[0]**(2**index) */
483 QUALIFIERS void curand_MRGmatPow3x3( double in[][3][3], double o[][3], double m, unsigned long long pow )
/* o = in[0]^pow mod m, using the precomputed table in[k] = in[0]^(2^k)
   (see comment above). NOTE(review): declarations, the bit-test on pow
   and the loop/while control lines are elided in this dump; kept as-is. */
486 for ( i = 0; i < 3; i++ ) {
487 for ( j = 0; j < 3; j++ ) {
/* Initialize o to the identity matrix. */
489 if ( i == j ) o[i][j] = 1;
493 curand_MRGmatVecMul3x3(o,o[0],m);
496 curand_MRGmatMul3x3(in[i], o, o, m);
503 /* raise a 3 by 3 matrix of doubles to the power */
504 /* 2 to the power (pow modulo 191), modulo m */
/* NOTE(review): the name typo "curnand" is part of the shipped API surface
   -- do not rename. Declarations and the repeated-squaring while-loop
   (applying pow modulo 191, per the comment above) are elided in this
   dump; code kept as-is. */
506 QUALIFIERS void curnand_MRGmatPow2Pow3x3( double in[][3], double o[][3], double m, unsigned long pow )
511 for ( i = 0; i < 3; i++ ) {
512 for ( j = 0; j < 3; j++ ) {
513 temp[i][j] = in[i][j];
/* Square in place: temp = (temp * temp) mod m. */
517 curand_MRGmatMul3x3(temp, temp, temp, m);
520 for ( i = 0; i < 3; i++ ) {
521 for ( j = 0; j < 3; j++ ) {
522 o[i][j] = temp[i][j];
529 /****************************************************************************/
530 /* Kernel implementations of RNGs */
531 /****************************************************************************/
535 QUALIFIERS void curand_init(unsigned long long seed,
536 unsigned long long subsequence,
537 unsigned long long offset,
538 curandStateTest_t *state)
540 state->v = (unsigned int)(seed * 3) + (unsigned int)(subsequence * 31337) + \
541 (unsigned int)offset;
545 QUALIFIERS unsigned int curand(curandStateTest_t *state)
547 unsigned int r = state->v++;
551 QUALIFIERS void skipahead(unsigned long long n, curandStateTest_t *state)
553 state->v += (unsigned int)n;
558 template <typename T, int n>
559 QUALIFIERS void __curand_generate_skipahead_matrix_xor(unsigned int matrix[])
562 // Generate matrix that advances one step
563 // matrix has n * n * 32 32-bit elements
564 // solve for matrix by stepping single bit states
/* NOTE(review): the local `state` declaration, the zeroing of state
   fields, and the curand(&state) stepping call are elided in this dump;
   code kept as-is -- confirm against the full header. */
565 for(int i = 0; i < 32 * n; i++) {
567 for(int j = 0; j < n; j++) {
/* Set only bit i of the state, step once, and record the result. */
570 state.v[i / 32] = (1 << (i & 31));
572 for(int j = 0; j < n; j++) {
573 matrix[i * n + j] = state.v[j];
578 template <typename T, int n>
579 QUALIFIERS void _skipahead_scratch(unsigned long long x, T *state, unsigned int *scratch)
/* Advance an n-word xorshift state by x steps using precalculated offset
   matrices; scratch must hold two n*n*32-word matrices plus two n-word
   vectors (see the pointer carve-up below).
   NOTE(review): braces, the matrix_num bookkeeping, and the
   __CUDA_ARCH__ device/host selection lines are elided in this dump;
   code kept as-is. */
581 // unsigned int matrix[n * n * 32];
582 unsigned int *matrix = scratch;
583 // unsigned int matrixA[n * n * 32];
584 unsigned int *matrixA = scratch + (n * n * 32);
585 // unsigned int vector[n];
586 unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
587 // unsigned int result[n];
588 unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
589 unsigned long long p = x;
590 for(int i = 0; i < n; i++) {
591 vector[i] = state->v[i];
/* Consume p in PRECALC_BLOCK_SIZE-bit chunks against the precomputed
   offset matrices. */
594 while(p && (matrix_num < PRECALC_NUM_MATRICES - 1)) {
595 for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
597 __curand_matvec(vector, precalc_xorwow_offset_matrix[matrix_num], result, n);
599 __curand_matvec(vector, precalc_xorwow_offset_matrix_host[matrix_num], result, n);
601 __curand_veccopy(vector, result, n);
603 p >>= PRECALC_BLOCK_SIZE;
/* Remaining distance: seed matrix/matrixA from the last precomputed
   matrix and continue with square-and-multiply. */
608 __curand_matcopy(matrix, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
609 __curand_matcopy(matrixA, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
611 __curand_matcopy(matrix, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
612 __curand_matcopy(matrixA, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
616 for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
617 __curand_matvec(vector, matrixA, result, n);
618 __curand_veccopy(vector, result, n);
620 p >>= SKIPAHEAD_BLOCKSIZE;
622 for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
623 __curand_matmat(matrix, matrixA, n);
624 __curand_matcopy(matrixA, matrix, n);
628 for(int i = 0; i < n; i++) {
629 state->v[i] = vector[i];
/* The Weyl component advances by 362437 per step, so x steps add
   362437 * x (mod 2^32). */
631 state->d += 362437 * (unsigned int)x;
634 template <typename T, int n>
635 QUALIFIERS void _skipahead_sequence_scratch(unsigned long long x, T *state, unsigned int *scratch)
/* Advance the state by x whole subsequences (2^67 steps each, per the
   skipahead_sequence doc below) using the precalculated sequence matrices;
   scratch layout matches _skipahead_scratch.
   NOTE(review): braces, the matrix_num bookkeeping, and the
   __CUDA_ARCH__ selection lines are elided in this dump; kept as-is. */
637 // unsigned int matrix[n * n * 32];
638 unsigned int *matrix = scratch;
639 // unsigned int matrixA[n * n * 32];
640 unsigned int *matrixA = scratch + (n * n * 32);
641 // unsigned int vector[n];
642 unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
643 // unsigned int result[n];
644 unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
645 unsigned long long p = x;
646 for(int i = 0; i < n; i++) {
647 vector[i] = state->v[i];
650 while(p && matrix_num < PRECALC_NUM_MATRICES - 1) {
651 for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
653 __curand_matvec(vector, precalc_xorwow_matrix[matrix_num], result, n);
655 __curand_matvec(vector, precalc_xorwow_matrix_host[matrix_num], result, n);
657 __curand_veccopy(vector, result, n);
659 p >>= PRECALC_BLOCK_SIZE;
/* Remaining distance handled by square-and-multiply from the last
   precomputed matrix. */
664 __curand_matcopy(matrix, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
665 __curand_matcopy(matrixA, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
667 __curand_matcopy(matrix, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
668 __curand_matcopy(matrixA, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
672 for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
673 __curand_matvec(vector, matrixA, result, n);
674 __curand_veccopy(vector, result, n);
676 p >>= SKIPAHEAD_BLOCKSIZE;
678 for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
679 __curand_matmat(matrix, matrixA, n);
680 __curand_matcopy(matrixA, matrix, n);
684 for(int i = 0; i < n; i++) {
685 state->v[i] = vector[i];
687 /* No update of state->d needed, guaranteed to be a multiple of 2^32 */
691 * \brief Update XORWOW state to skip \p n elements.
693 * Update the XORWOW state in \p state to skip ahead \p n elements.
695 * All values of \p n are valid. Large values require more computation and so
696 * will take more time to complete.
698 * \param n - Number of elements to skip
699 * \param state - Pointer to state to update
701 QUALIFIERS void skipahead(unsigned long long n, curandStateXORWOW_t *state)
703 unsigned int scratch[5 * 5 * 32 * 2 + 5 * 2];
704 _skipahead_scratch<curandStateXORWOW_t, 5>(n, state, (unsigned int *)scratch);
708 * \brief Update XORWOW state to skip ahead \p n subsequences.
710 * Update the XORWOW state in \p state to skip ahead \p n subsequences. Each
711 * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
712 * \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly * n elements.
714 * All values of \p n are valid. Large values require more computation and so
715 * will take more time to complete.
717 * \param n - Number of subsequences to skip
718 * \param state - Pointer to state to update
720 QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateXORWOW_t *state)
722 unsigned int scratch[5 * 5 * 32 * 2 + 5 * 2];
723 _skipahead_sequence_scratch<curandStateXORWOW_t, 5>(n, state, (unsigned int *)scratch);
728 QUALIFIERS void _curand_init_scratch(unsigned long long seed,
729 unsigned long long subsequence,
730 unsigned long long offset,
731 curandStateXORWOW_t *state,
732 unsigned int *scratch)
734 // Break up seed, apply salt
735 // Constants are arbitrary nonzero values
736 unsigned int s0 = ((unsigned int)seed) ^ 0xaad26b49UL;
737 unsigned int s1 = (unsigned int)(seed >> 32) ^ 0xf7dcefddUL;
738 // Simple multiplication to mix up bits
739 // Constants are arbitrary odd values
740 unsigned int t0 = 1099087573UL * s0;
741 unsigned int t1 = 2591861531UL * s1;
742 state->d = 6615241 + t1 + t0;
743 state->v[0] = 123456789UL + t0;
744 state->v[1] = 362436069UL ^ t0;
745 state->v[2] = 521288629UL + t1;
746 state->v[3] = 88675123UL ^ t1;
747 state->v[4] = 5783321UL + t0;
748 _skipahead_sequence_scratch<curandStateXORWOW_t, 5>(subsequence, state, scratch);
749 _skipahead_scratch<curandStateXORWOW_t, 5>(offset, state, scratch);
750 state->boxmuller_flag = 0;
751 state->boxmuller_flag_double = 0;
755 * \brief Initialize XORWOW state.
757 * Initialize XORWOW state in \p state with the given \p seed, \p subsequence,
760 * All input values of \p seed, \p subsequence, and \p offset are legal. Large
761 * values for \p subsequence and \p offset require more computation and so will
762 * take more time to complete.
764 * A value of 0 for \p seed sets the state to the values of the original
765 * published version of the \p xorwow algorithm.
767 * \param seed - Arbitrary bits to use as a seed
768 * \param subsequence - Subsequence to start at
769 * \param offset - Absolute offset into sequence
770 * \param state - Pointer to state to initialize
772 QUALIFIERS void curand_init(unsigned long long seed,
773 unsigned long long subsequence,
774 unsigned long long offset,
775 curandStateXORWOW_t *state)
777 unsigned int scratch[5 * 5 * 32 * 2 + 5 * 2];
778 _curand_init_scratch(seed, subsequence, offset, state, (unsigned int*)scratch);
782 * \brief Return 32-bits of pseudorandomness from an XORWOW generator.
784 * Return 32-bits of pseudorandomness from the XORWOW generator in \p state,
785 * increment position of generator by one.
787 * \param state - Pointer to state to update
789 * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
791 QUALIFIERS unsigned int curand(curandStateXORWOW_t *state)
794 t = (state->v[0] ^ (state->v[0] >> 2));
795 state->v[0] = state->v[1];
796 state->v[1] = state->v[2];
797 state->v[2] = state->v[3];
798 state->v[3] = state->v[4];
799 state->v[4] = (state->v[4] ^ (state->v[4] <<4)) ^ (t ^ (t << 1));
801 return state->v[4] + state->d;
806 * \brief Return 32-bits of pseudorandomness from an Philox4_32_10 generator.
808 * Return 32-bits of pseudorandomness from the Philox4_32_10 generator in \p state,
809 * increment position of generator by one.
811 * \param state - Pointer to state to update
813 * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
816 QUALIFIERS unsigned int curand(curandStatePhilox4_32_10_t *state)
818 // Maintain the invariant: output[STATE] is always "good" and
819 // is the next value to be returned by curand.
/* NOTE(review): the reset of state->STATE to 0 inside the refill branch
   and the final `return ret;` are elided in this dump; kept as-is --
   confirm against the full header. */
820 unsigned ret = (&state->output.x)[state->STATE++];
821 if(state->STATE == 4){
/* Buffer exhausted: advance the 128-bit counter and regenerate the next
   block of four outputs. */
822 Philox_State_Incr(state);
823 state->output = curand_Philox4x32_10(state->ctr,state->key);
830 * \brief Return tuple of 4 32-bit pseudorandoms from a Philox4_32_10 generator.
832 * Return 128 bits of pseudorandomness from the Philox4_32_10 generator in \p state,
833 * increment position of generator by four.
835 * \param state - Pointer to state to update
837 * \return 128-bits of pseudorandomness as a uint4, all bits valid to use.
840 QUALIFIERS uint4 curand4(curandStatePhilox4_32_10_t *state)
/* Return the next four outputs, splicing the tail of the buffered block
   (tmp) with the head of a freshly generated block when STATE != 0.
   NOTE(review): the declaration of r, the case labels/early returns, and
   the assignments taken from tmp are elided in this dump; kept as-is. */
844 uint4 tmp = state->output;
845 Philox_State_Incr(state);
846 state->output= curand_Philox4x32_10(state->ctr,state->key);
847 switch(state->STATE){
/* One value still buffered: take three from tmp, one from the new block. */
854 r.w = state->output.x;
859 r.z = state->output.x;
860 r.w = state->output.y;
864 r.y = state->output.x;
865 r.z = state->output.y;
866 r.w = state->output.z;
869 // NOT possible but needed to avoid compiler warnings
876 * \brief Update Philox4_32_10 state to skip \p n elements.
878 * Update the Philox4_32_10 state in \p state to skip ahead \p n elements.
880 * All values of \p n are valid.
882 * \param n - Number of elements to skip
883 * \param state - Pointer to state to update
885 QUALIFIERS void skipahead(unsigned long long n, curandStatePhilox4_32_10_t *state)
/* Skip n 32-bit outputs: STATE advances by n mod 4, the 128-bit counter
   by the number of whole 4-output blocks.
   NOTE(review): the division of n by 4 and the STATE wrap-around
   adjustment inside the if-branch are elided in this dump; kept as-is. */
887 state->STATE += (n & 3);
889 if( state->STATE > 3 ){
/* Re-establish the invariant by regenerating the current block. */
893 Philox_State_Incr(state, n);
894 state->output = curand_Philox4x32_10(state->ctr,state->key);
898 * \brief Update Philox4_32_10 state to skip ahead \p n subsequences.
900 * Update the Philox4_32_10 state in \p state to skip ahead \p n subsequences. Each
901 * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
902 * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly * n elements.
904 * All values of \p n are valid.
906 * \param n - Number of subsequences to skip
907 * \param state - Pointer to state to update
909 QUALIFIERS void skipahead_sequence(unsigned long long n, curandStatePhilox4_32_10_t *state)
911 Philox_State_Incr_hi(state, n);
912 state->output = curand_Philox4x32_10(state->ctr,state->key);
916 * \brief Initialize Philox4_32_10 state.
918 * Initialize Philox4_32_10 state in \p state with the given \p seed, \p subsequence,
921 * All input values for \p seed, \p subsequence and \p offset are legal. Each of the
922 * \xmlonly<ph outputclass="xmlonly">2<sup>64</sup></ph>\endxmlonly possible
923 * values of seed selects an independent sequence of length
924 * \xmlonly<ph outputclass="xmlonly">2<sup>130</sup></ph>\endxmlonly.
926 * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup> * subsequence + offset</ph>\endxmlonly.
927 * values of the sequence are skipped.
928 * I.e., subsequences are of length
929 * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly.
931 * \param seed - Arbitrary bits to use as a seed
932 * \param subsequence - Subsequence to start at
933 * \param offset - Absolute offset into subsequence
934 * \param state - Pointer to state to initialize
936 QUALIFIERS void curand_init(unsigned long long seed,
937 unsigned long long subsequence,
938 unsigned long long offset,
939 curandStatePhilox4_32_10_t *state)
/* NOTE(review): the reset of state->STATE (the intra-block index) appears
   elided in this dump; kept as-is -- confirm against the full header. */
941 state->ctr = make_uint4(0, 0, 0, 0);
/* The 64-bit seed becomes the Philox key. */
942 state->key.x = (unsigned int)seed;
943 state->key.y = (unsigned int)(seed>>32);
945 state->boxmuller_flag = 0;
946 state->boxmuller_flag_double = 0;
/* Position the counter: subsequence moves the high half, offset the low. */
947 skipahead_sequence(subsequence, state);
948 skipahead(offset, state);
954 /* Base generator for MRG32k3a */
955 /* note that the parameters have been selected such that intermediate */
956 /* results stay within 53 bits */
959 #if __CUDA_ARCH__ > 0
960 /* nj's implementation */
961 QUALIFIERS double curand_MRG32k3a (curandStateMRG32k3a_t *state)
/* Device path: one MRG32k3a step using fma-based exact modular reduction.
   NOTE(review): declarations of p1/p2/q and the `p -= q * m` reduction
   lines (after each trunc/fma) are elided in this dump; kept as-is. */
963 const double m1 = 4294967087.;
964 const double m2 = 4294944443.;
965 const double a12 = 1403580.;
966 const double a13n = 810728.;
967 const double a21 = 527612.;
968 const double a23n = 1370589.;
/* Double-double (hi/lo) representations of 1/m1 and 1/m2 so the quotient
   q = trunc(p/m) is computed exactly with fma. */
970 const double rh1 = 2.3283065498378290e-010; /* (1.0 / m1)__hi */
971 const double rl1 = -1.7354913086174288e-026; /* (1.0 / m1)__lo */
972 const double rh2 = 2.3283188252407387e-010; /* (1.0 / m2)__hi */
973 const double rl2 = 2.4081018096503646e-026; /* (1.0 / m2)__lo */
976 p1 = a12 * state->s1[1] - a13n * state->s1[0];
977 q = trunc (fma (p1, rh1, p1 * rl1));
979 if (p1 < 0.0) p1 += m1;
/* Shift both recurrence histories and append the new values. */
980 state->s1[0] = state->s1[1]; state->s1[1] = state->s1[2]; state->s1[2] = p1;
981 p2 = a21 * state->s2[2] - a23n * state->s2[0];
982 q = trunc (fma (p2, rh2, p2 * rl2));
984 if (p2 < 0.0) p2 += m2;
985 state->s2[0] = state->s2[1]; state->s2[1] = state->s2[2]; state->s2[2] = p2;
/* Combine the two components: result in (0, m1]. */
986 if (p1 <= p2) return (p1 - p2 + m1);
987 else return (p1 - p2);
989 /* end nj's implementation */
991 QUALIFIERS double curand_MRG32k3a(curandStateMRG32k3a_t *state)
/* Host path: the same recurrence reduced via curand_MRGmod.
   NOTE(review): declarations of p1/p2/r, the stores to s1[2]/s2[2], the
   combination r = p1 - p2, and the return are elided in this dump. */
994 p1 = (MRG32K3A_A12 * state->s1[1]) - (MRG32K3A_A13N * state->s1[0]);
995 p1 = curand_MRGmod(p1, MRG32K3A_MOD1);
996 if (p1 < 0.0) p1 += MRG32K3A_MOD1;
997 state->s1[0] = state->s1[1];
998 state->s1[1] = state->s1[2];
1000 p2 = (MRG32K3A_A21 * state->s2[2]) - (MRG32K3A_A23N * state->s2[0]);
1001 p2 = curand_MRGmod(p2, MRG32K3A_MOD2);
1002 if (p2 < 0) p2 += MRG32K3A_MOD2;
1003 state->s2[0] = state->s2[1];
1004 state->s2[1] = state->s2[2];
/* Fold the combined value back into (0, m1]. */
1007 if (r <= 0) r += MRG32K3A_MOD1;
1014 * \brief Return 32-bits of pseudorandomness from an MRG32k3a generator.
1016 * Return 32-bits of pseudorandomness from the MRG32k3a generator in \p state,
1017 * increment position of generator by one.
1019 * \param state - Pointer to state to update
1021 * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
1023 QUALIFIERS unsigned int curand(curandStateMRG32k3a_t *state)
1026 dRet = (double)curand_MRG32k3a(state)*(double)MRG32K3A_BITS_NORM;
1027 return (unsigned int)dRet;
1033 * \brief Update MRG32k3a state to skip \p n elements.
1035 * Update the MRG32k3a state in \p state to skip ahead \p n elements.
1037 * All values of \p n are valid. Large values require more computation and so
1038 * will take more time to complete.
1040 * \param n - Number of elements to skip
1041 * \param state - Pointer to state to update
1043 QUALIFIERS void skipahead(unsigned long long n, curandStateMRG32k3a_t *state)
1046 #ifdef __CUDA_ARCH__
1047 curand_MRGmatPow3x3( mrg32k3aM1, t, MRG32K3A_MOD1, n);
1048 curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1049 curand_MRGmatPow3x3(mrg32k3aM2, t, MRG32K3A_MOD2, n);
1050 curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1052 curand_MRGmatPow3x3( mrg32k3aM1Host, t, MRG32K3A_MOD1, n);
1053 curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1054 curand_MRGmatPow3x3(mrg32k3aM2Host, t, MRG32K3A_MOD2, n);
1055 curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1060 * \brief Update MRG32k3a state to skip ahead \p n subsequences.
1062 * Update the MRG32k3a state in \p state to skip ahead \p n subsequences. Each
1063 * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly
1065 * \xmlonly<ph outputclass="xmlonly">2<sup>76</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
1066 * \xmlonly<ph outputclass="xmlonly">2<sup>76</sup></ph>\endxmlonly * n elements.
1068 * Valid values of \p n are 0 to \xmlonly<ph outputclass="xmlonly">2<sup>51</sup></ph>\endxmlonly. Note \p n will be masked to 51 bits
1070 * \param n - Number of subsequences to skip
1071 * \param state - Pointer to state to update
1073 QUALIFIERS void skipahead_subsequence(unsigned long long n, curandStateMRG32k3a_t *state)
1076 #ifdef __CUDA_ARCH__
1077 curand_MRGmatPow3x3( mrg32k3aM1SubSeq, t, MRG32K3A_MOD1, n);
1078 curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1079 curand_MRGmatPow3x3( mrg32k3aM2SubSeq, t, MRG32K3A_MOD2, n);
1080 curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1082 curand_MRGmatPow3x3( mrg32k3aM1SubSeqHost, t, MRG32K3A_MOD1, n);
1083 curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1084 curand_MRGmatPow3x3( mrg32k3aM2SubSeqHost, t, MRG32K3A_MOD2, n);
1085 curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1090 * \brief Update MRG32k3a state to skip ahead \p n sequences.
1092 * Update the MRG32k3a state in \p state to skip ahead \p n sequences. Each
1093 * sequence is \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
1094 * \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly * n elements.
1096 * All values of \p n are valid. Large values require more computation and so
1097 * will take more time to complete.
1099 * \param n - Number of sequences to skip
1100 * \param state - Pointer to state to update
1102 QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateMRG32k3a_t *state)
1105 #ifdef __CUDA_ARCH__
1106 curand_MRGmatPow3x3( mrg32k3aM1Seq, t, MRG32K3A_MOD1, n);
1107 curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1108 curand_MRGmatPow3x3( mrg32k3aM2Seq, t, MRG32K3A_MOD2, n);
1109 curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1111 curand_MRGmatPow3x3( mrg32k3aM1SeqHost, t, MRG32K3A_MOD1, n);
1112 curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1113 curand_MRGmatPow3x3( mrg32k3aM2SeqHost, t, MRG32K3A_MOD2, n);
1114 curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1120 * \brief Initialize MRG32k3a state.
1122 * Initialize MRG32k3a state in \p state with the given \p seed, \p subsequence,
1125 * All input values of \p seed, \p subsequence, and \p offset are legal.
1126 * \p subsequence will be truncated to 51 bits to avoid running into the next sequence
1128 * A value of 0 for \p seed sets the state to the values of the original
1129 * published version of the \p MRG32k3a algorithm.
1131 * \param seed - Arbitrary bits to use as a seed
1132 * \param subsequence - Subsequence to start at
1133 * \param offset - Absolute offset into sequence
1134 * \param state - Pointer to state to initialize
1136 QUALIFIERS void curand_init(unsigned long long seed,
1137 unsigned long long subsequence,
1138 unsigned long long offset,
1139 curandStateMRG32k3a_t *state)
/* NOTE(review): per the doc above, seed == 0 must leave the original
   published starting state; the guard that skips the seed-mixing block in
   that case appears elided in this dump, along with declarations and
   braces. Code kept as-is -- confirm against the full header. */
1142 for ( i=0; i<3; i++ ) {
/* Original published MRG32k3a starting state. */
1143 state->s1[i] = 12345.;
1144 state->s2[i] = 12345.;
1147 unsigned int x1 = ((unsigned int)seed) ^ 0x55555555UL;
1148 unsigned int x2 = (unsigned int)((seed >> 32) ^ 0xAAAAAAAAUL);
/* Salt each state word with a seed-derived multiplier, modulo the
   recursion moduli. */
1149 state->s1[0] = curand_MRGmodMul(x1, state->s1[0], MRG32K3A_MOD1);
1150 state->s1[1] = curand_MRGmodMul(x2, state->s1[1], MRG32K3A_MOD1);
1151 state->s1[2] = curand_MRGmodMul(x1, state->s1[2], MRG32K3A_MOD1);
1152 state->s2[0] = curand_MRGmodMul(x2, state->s2[0], MRG32K3A_MOD2);
1153 state->s2[1] = curand_MRGmodMul(x1, state->s2[1], MRG32K3A_MOD2);
1154 state->s2[2] = curand_MRGmodMul(x2, state->s2[2], MRG32K3A_MOD2);
1156 skipahead_subsequence( subsequence, state );
1157 skipahead( offset, state );
1158 state->boxmuller_flag = 0;
1159 state->boxmuller_flag_double = 0;
1163 * \brief Update Sobol32 state to skip \p n elements.
1165 * Update the Sobol32 state in \p state to skip ahead \p n elements.
1167 * All values of \p n are valid.
1169 * \param n - Number of elements to skip
1170 * \param state - Pointer to state to update
template <typename T>
QUALIFIERS void skipahead(unsigned int n, T state)
{
    /* Rebuild x from scratch: start at the scramble constant and XOR in
       every direction vector selected by the Gray code of the new index. */
    state->x = state->c;
    state->i += n;
    unsigned int gray = state->i ^ (state->i >> 1);
    for (unsigned int bit = 0; bit < 32; bit++) {
        if (gray & (1 << bit)) {
            state->x ^= state->direction_vectors[bit];
        }
    }
}
1189 * \brief Update Sobol64 state to skip \p n elements.
1191 * Update the Sobol64 state in \p state to skip ahead \p n elements.
1193 * All values of \p n are valid.
1195 * \param n - Number of elements to skip
1196 * \param state - Pointer to state to update
template <typename T>
QUALIFIERS void skipahead(unsigned long long n, T state)
{
    /* 64-bit variant: rebuild x from the scramble constant using the Gray
       code of the advanced index to pick the contributing vectors. */
    state->x = state->c;
    state->i += n;
    unsigned long long gray = state->i ^ (state->i >> 1);
    for (unsigned bit = 0; bit < 64; bit++) {
        if (gray & (1ULL << bit)) {
            state->x ^= state->direction_vectors[bit];
        }
    }
}
1215 * \brief Initialize Sobol32 state.
1217 * Initialize Sobol32 state in \p state with the given \p direction \p vectors and
1220 * The direction vector is a device pointer to an array of 32 unsigned ints.
1221 * All input values of \p offset are legal.
1223 * \param direction_vectors - Pointer to array of 32 unsigned ints representing the
1224 * direction vectors for the desired dimension
1225 * \param offset - Absolute offset into sequence
1226 * \param state - Pointer to state to initialize
QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
                            unsigned int offset,
                            curandStateSobol32_t *state)
{
    /* Unscrambled Sobol32: scramble constant is zero. */
    state->i = 0;
    state->c = 0;
    for (int k = 0; k < 32; k++) {
        state->direction_vectors[k] = direction_vectors[k];
    }
    state->x = state->c;
    skipahead<curandStateSobol32_t *>(offset, state);
}
1241 * \brief Initialize Scrambled Sobol32 state.
1243 * Initialize Sobol32 state in \p state with the given \p direction \p vectors and
1246 * The direction vector is a device pointer to an array of 32 unsigned ints.
1247 * All input values of \p offset are legal.
1249 * \param direction_vectors - Pointer to array of 32 unsigned ints representing the
1250 direction vectors for the desired dimension
1251 * \param scramble_c - Scramble constant
1252 * \param offset - Absolute offset into sequence
1253 * \param state - Pointer to state to initialize
QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
                            unsigned int scramble_c,
                            unsigned int offset,
                            curandStateScrambledSobol32_t *state)
{
    state->i = 0;
    state->c = scramble_c;
    for (int k = 0; k < 32; k++) {
        state->direction_vectors[k] = direction_vectors[k];
    }
    /* Element 0 of the scrambled sequence is the scramble constant itself. */
    state->x = state->c;
    skipahead<curandStateScrambledSobol32_t *>(offset, state);
}
/* Return the bit index (0..63) of the lowest-order zero bit of x, i.e. the
   count of trailing one bits, clamped to 63 when all 64 bits are set. */
template<typename XT>
QUALIFIERS int __curand_find_trailing_zero(XT x)
{
#if __CUDA_ARCH__ > 0
    /* __ffsll(~z) is the 1-based position of the lowest zero bit of z;
       OR-ing 0x40 maps the all-ones case (ffs == 0) to a result of 63. */
    unsigned long long z = x;
    int y = __ffsll(~z) | 0x40;
    return (y - 1) & 0x3F;
#else
    /* Host fallback: count trailing ones, capped at 63 to mirror the device
       path. NOTE(review): the original host branch is elided in this chunk;
       this reconstruction follows the device branch's contract — confirm
       against the full header. */
    unsigned long long z = x;
    int pos = 0;
    while ((z & 1ULL) != 0ULL && pos < 63) {
        pos++;
        z >>= 1;
    }
    return pos;
#endif
}
1287 * \brief Initialize Sobol64 state.
1289 * Initialize Sobol64 state in \p state with the given \p direction \p vectors and
1292 * The direction vector is a device pointer to an array of 64 unsigned long longs.
1293 * All input values of \p offset are legal.
1295 * \param direction_vectors - Pointer to array of 64 unsigned long longs representing the
1296 direction vectors for the desired dimension
1297 * \param offset - Absolute offset into sequence
1298 * \param state - Pointer to state to initialize
QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
                            unsigned long long offset,
                            curandStateSobol64_t *state)
{
    /* Unscrambled Sobol64: scramble constant is zero. */
    state->i = 0;
    state->c = 0;
    for (int k = 0; k < 64; k++) {
        state->direction_vectors[k] = direction_vectors[k];
    }
    state->x = state->c;
    skipahead<curandStateSobol64_t *>(offset, state);
}
/* Advance a Sobol-type generator by exactly 2^n_log2 elements.
   Moving the index by a power of two flips exactly two bits of its Gray
   code, so only two direction vectors need to be XOR-ed into x.
   NOTE(review): accesses direction_vectors[n_log2 - 1], so this presumably
   assumes n_log2 >= 1 — confirm at call sites. */
template<typename PT>
QUALIFIERS void _skipahead_stride(int n_log2, PT state)
{
    unsigned int idx_high = state->i >> n_log2;
    state->x ^= state->direction_vectors[n_log2 - 1];
    state->x ^= state->direction_vectors[
        __curand_find_trailing_zero(idx_high) + n_log2];
    state->i += 1 << n_log2;
}
1325 * \brief Initialize Scrambled Sobol64 state.
1327 * Initialize Sobol64 state in \p state with the given \p direction \p vectors and
1330 * The direction vector is a device pointer to an array of 64 unsigned long longs.
1331 * All input values of \p offset are legal.
1333 * \param direction_vectors - Pointer to array of 64 unsigned long longs representing the
1334 direction vectors for the desired dimension
1335 * \param scramble_c - Scramble constant
1336 * \param offset - Absolute offset into sequence
1337 * \param state - Pointer to state to initialize
QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
                            unsigned long long scramble_c,
                            unsigned long long offset,
                            curandStateScrambledSobol64_t *state)
{
    state->i = 0;
    state->c = scramble_c;
    for (int k = 0; k < 64; k++) {
        state->direction_vectors[k] = direction_vectors[k];
    }
    /* Element 0 of the scrambled sequence is the scramble constant itself. */
    state->x = state->c;
    skipahead<curandStateScrambledSobol64_t *>(offset, state);
}
1354 * \brief Return 32-bits of quasirandomness from a Sobol32 generator.
1356 * Return 32-bits of quasirandomness from the Sobol32 generator in \p state,
1357 * increment position of generator by one.
1359 * \param state - Pointer to state to update
1361 * \return 32-bits of quasirandomness as an unsigned int, all bits valid to use.
QUALIFIERS unsigned int curand(curandStateSobol32_t * state)
{
    /* Gray-code stepping: advancing i by one flips exactly one bit — the
       trailing zero bit of i — so a single XOR updates the value. */
    unsigned int current = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i++;
    return current;
}
1376 * \brief Return 32-bits of quasirandomness from a scrambled Sobol32 generator.
1378 * Return 32-bits of quasirandomness from the scrambled Sobol32 generator in \p state,
1379 * increment position of generator by one.
1381 * \param state - Pointer to state to update
1383 * \return 32-bits of quasirandomness as an unsigned int, all bits valid to use.
QUALIFIERS unsigned int curand(curandStateScrambledSobol32_t * state)
{
    /* Gray-code stepping: advancing i by one flips exactly one bit — the
       trailing zero bit of i — so a single XOR updates the value. */
    unsigned int current = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i++;
    return current;
}
1398 * \brief Return 64-bits of quasirandomness from a Sobol64 generator.
1400 * Return 64-bits of quasirandomness from the Sobol64 generator in \p state,
1401 * increment position of generator by one.
1403 * \param state - Pointer to state to update
1405 * \return 64-bits of quasirandomness as an unsigned long long, all bits valid to use.
QUALIFIERS unsigned long long curand(curandStateSobol64_t * state)
{
    /* Gray-code stepping: advancing i by one flips exactly one bit — the
       trailing zero bit of i — so a single XOR updates the value. */
    unsigned long long current = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i++;
    return current;
}
1420 * \brief Return 64-bits of quasirandomness from a scrambled Sobol64 generator.
1422 * Return 64-bits of quasirandomness from the scrambled Sobol64 generator in \p state,
1423 * increment position of generator by one.
1425 * \param state - Pointer to state to update
1427 * \return 64-bits of quasirandomness as an unsigned long long, all bits valid to use.
QUALIFIERS unsigned long long curand(curandStateScrambledSobol64_t * state)
{
    /* Gray-code stepping: advancing i by one flips exactly one bit — the
       trailing zero bit of i — so a single XOR updates the value. */
    unsigned long long current = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i++;
    return current;
}
1441 #include "curand_uniform.h"
1442 #include "curand_normal.h"
1443 #include "curand_lognormal.h"
1444 #include "curand_poisson.h"
1445 #include "curand_discrete2.h"
/* Select a precalculated XORWOW skip-ahead matrix (device copies) by index n.
   NOTE(review): the if/else range checks that choose between the plain and
   offset matrix tables are elided from this chunk — confirm branch selection
   against the full header before relying on the fall-through return. */
1447 __device__ static inline unsigned int *__get_precalculated_matrix(int n)
1450 return precalc_xorwow_matrix[n];
1453 return precalc_xorwow_offset_matrix[n];
1455 return precalc_xorwow_matrix[n];
/* Host-side counterpart of __get_precalculated_matrix: selects a
   host-resident XORWOW skip-ahead matrix by index n.
   NOTE(review): the guarding if/else conditions between these returns are
   elided from this chunk — confirm against the full header. */
1458 __host__ static inline unsigned int *__get_precalculated_matrix_host(int n)
1461 return precalc_xorwow_matrix_host[n];
1464 return precalc_xorwow_offset_matrix_host[n];
1466 return precalc_xorwow_matrix_host[n];
/* Select an MRG32k3a skip-ahead matrix (device copies) by index n, covering
   the offset (M1/M2), subsequence (M1SubSeq/M2SubSeq) and sequence
   (M1Seq/M2Seq) matrix groups.
   NOTE(review): the range checks that pick the group are elided from this
   chunk — confirm branch selection against the full header. */
1469 __device__ static inline double *__get_mrg32k3a_matrix(int n)
1472 return mrg32k3aM1[n][0];
1475 return mrg32k3aM2[n][0];
1478 return mrg32k3aM1SubSeq[n][0];
1481 return mrg32k3aM2SubSeq[n][0];
1484 return mrg32k3aM1Seq[n][0];
1487 return mrg32k3aM2Seq[n][0];
1489 return mrg32k3aM1[n][0];
/* Host-side counterpart of __get_mrg32k3a_matrix: selects a host-resident
   MRG32k3a skip-ahead matrix by index n.
   NOTE(review): the range checks that pick the matrix group are elided from
   this chunk — confirm branch selection against the full header. */
1492 __host__ static inline double *__get_mrg32k3a_matrix_host(int n)
1495 return mrg32k3aM1Host[n][0];
1498 return mrg32k3aM2Host[n][0];
1501 return mrg32k3aM1SubSeqHost[n][0];
1504 return mrg32k3aM2SubSeqHost[n][0];
1507 return mrg32k3aM1SeqHost[n][0];
1510 return mrg32k3aM2SeqHost[n][0];
1512 return mrg32k3aM1Host[n][0];
/* Host accessor for the precomputed lgamma lookup table. */
__host__ static inline double *__get__cr_lgamma_table_host(void)
{
    return __cr_lgamma_table;
}
1521 #endif // !defined(CURAND_KERNEL_H_)