util/X86LINUX64/cuda-6.5/include/cuda_runtime.h

   1 /*
   2  * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
   3  *
   4  * NOTICE TO LICENSEE:
   5  *
   6  * This source code and/or documentation ("Licensed Deliverables") are
   7  * subject to NVIDIA intellectual property rights under U.S. and
   8  * international Copyright laws.
   9  *
  10  * These Licensed Deliverables contained herein is PROPRIETARY and
  11  * CONFIDENTIAL to NVIDIA and is being provided under the terms and
  12  * conditions of a form of NVIDIA software license agreement by and
  13  * between NVIDIA and Licensee ("License Agreement") or electronically
  14  * accepted by Licensee.  Notwithstanding any terms or conditions to
  15  * the contrary in the License Agreement, reproduction or disclosure
  16  * of the Licensed Deliverables to any third party without the express
  17  * written consent of NVIDIA is prohibited.
  18  *
  19  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  20  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
  21  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
  22  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
  23  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
  24  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
  25  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
  26  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  27  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
  28  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
  29  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  30  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  31  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  32  * OF THESE LICENSED DELIVERABLES.
  33  *
  34  * U.S. Government End Users.  These Licensed Deliverables are a
  35  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
  36  * 1995), consisting of "commercial computer software" and "commercial
  37  * computer software documentation" as such terms are used in 48
  38  * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
  39  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
  40  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
  41  * U.S. Government End Users acquire the Licensed Deliverables with
  42  * only those rights set forth herein.
  43  *
  44  * Any use of the Licensed Deliverables in individual and commercial
  45  * software must include, in the user documentation and internal
  46  * comments to the code, the above Disclaimer and U.S. Government End
  47  * Users Notice.
  48  */
  49
  50 #if !defined(__CUDA_RUNTIME_H__)
  51 #define __CUDA_RUNTIME_H__
  52
  53 /*******************************************************************************
  54 *                                                                              *
  55 *                                                                              *
  56 *                                                                              *
  57 *******************************************************************************/
  58
  59 #include "host_config.h"
  60
  61 /*******************************************************************************
  62 *                                                                              *
  63 *                                                                              *
  64 *                                                                              *
  65 *******************************************************************************/
  66
  67 #include "builtin_types.h"
  68 #include "channel_descriptor.h"
  69 #include "cuda_runtime_api.h"
  70 #include "driver_functions.h"
  71 #include "host_defines.h"
  72 #include "vector_functions.h"
  73
  74 #if defined(__CUDACC__)
  75
  76 #include "common_functions.h"
  77 #include "cuda_surface_types.h"
  78 #include "cuda_texture_types.h"
  79 #include "device_functions.h"
  80 #include "device_launch_parameters.h"
  81
  82 #endif /* __CUDACC__ */
  83
  84 #if defined(__cplusplus)
  85
  86 /*******************************************************************************
  87 *                                                                              *
  88 *                                                                              *
  89 *                                                                              *
  90 *******************************************************************************/
  91
  92 namespace
  93 {
  94
  95 /**
  96  * \addtogroup CUDART_HIGHLEVEL
  97  * @{
  98  */
  99
 100 /**
 101  * \brief \hl Configure a device launch
 102  *
 103  * Pushes \p size bytes of the argument pointed to by \p arg at \p offset
 104  * bytes from the start of the parameter passing area, which starts at
 105  * offset 0. The arguments are stored in the top of the execution stack.
 106  * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument()" must be preceded
 107  * by a call to ::cudaConfigureCall().
 108  *
 109  * \param arg    - Argument to push for a kernel launch
 110  * \param offset - Offset in argument stack to push new arg
 111  *
 112  * \return
 113  * ::cudaSuccess
 114  * \notefnerr
 115  *
 116  * \sa ::cudaConfigureCall,
 117  * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
 118  * \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
 119  * ::cudaSetDoubleForDevice,
 120  * ::cudaSetDoubleForHost,
 121  * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)"
 122  */
 123 template<class T>
 124 __inline__ __host__ cudaError_t cudaSetupArgument(
 125   T      arg,
 126   size_t offset
 127 )
 128 {
 129   return ::cudaSetupArgument((const void*)&arg, sizeof(T), offset);
 130 }
 131
 132 /**
 133  * \brief \hl Creates an event object with the specified flags
 134  *
 135  * Creates an event object with the specified flags. Valid flags include:
 136  * - ::cudaEventDefault: Default event creation flag.
 137  * - ::cudaEventBlockingSync: Specifies that event should use blocking
 138  *   synchronization. A host thread that uses ::cudaEventSynchronize() to wait
 139  *   on an event created with this flag will block until the event actually
 140  *   completes.
 141  * - ::cudaEventDisableTiming: Specifies that the created event does not need
 142  *   to record timing data.  Events created with this flag specified and
 143  *   the ::cudaEventBlockingSync flag not specified will provide the best
 144  *   performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
 145  *
 146  * \param event - Newly created event
 147  * \param flags - Flags for new event
 148  *
 149  * \return
 150  * ::cudaSuccess,
 151  * ::cudaErrorInitializationError,
 152  * ::cudaErrorInvalidValue,
 153  * ::cudaErrorLaunchFailure,
 154  * ::cudaErrorMemoryAllocation
 155  * \notefnerr
 156  *
 157  * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
 158  * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
 159  * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
 160  * ::cudaStreamWaitEvent
 161  */
 162 static __inline__ __host__ cudaError_t cudaEventCreate(
 163   cudaEvent_t  *event,
 164   unsigned int  flags
 165 )
 166 {
 167   return ::cudaEventCreateWithFlags(event, flags);
 168 }
 169
 170 /**
 171  * \brief \hl Allocates page-locked memory on the host
 172  *
 173  * Allocates \p size bytes of host memory that is page-locked and accessible
 174  * to the device. The driver tracks the virtual memory ranges allocated with
 175  * this function and automatically accelerates calls to functions such as
 176  * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it
 177  * can be read or written with much higher bandwidth than pageable memory
 178  * obtained with functions such as ::malloc(). Allocating excessive amounts of
 179  * pinned memory may degrade system performance, since it reduces the amount
 180  * of memory available to the system for paging. As a result, this function is
 181  * best used sparingly to allocate staging areas for data exchange between host
 182  * and device.
 183  *
 184  * The \p flags parameter enables different options to be specified that affect
 185  * the allocation, as follows.
 186  * - ::cudaHostAllocDefault: This flag's value is defined to be 0.
 187  * - ::cudaHostAllocPortable: The memory returned by this call will be
 188  * considered as pinned memory by all CUDA contexts, not just the one that
 189  * performed the allocation.
 190  * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space.
 191  * The device pointer to the memory may be obtained by calling
 192  * ::cudaHostGetDevicePointer().
 193  * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC).
 194  * WC memory can be transferred across the PCI Express bus more quickly on some
 195  * system configurations, but cannot be read efficiently by most CPUs.  WC
 196  * memory is a good option for buffers that will be written by the CPU and read
 197  * by the device via mapped pinned memory or host->device transfers.
 198  *
 199  * All of these flags are orthogonal to one another: a developer may allocate
 200  * memory that is portable, mapped and/or write-combined with no restrictions.
 201  *
 202  * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost
 203  * flag in order for the ::cudaHostAllocMapped flag to have any effect.
 204  *
 205  * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices
 206  * that do not support mapped pinned memory. The failure is deferred to
 207  * ::cudaHostGetDevicePointer() because the memory may be mapped into other
 208  * CUDA contexts via the ::cudaHostAllocPortable flag.
 209  *
 210  * Memory allocated by this function must be freed with ::cudaFreeHost().
 211  *
 212  * \param ptr   - Device pointer to allocated memory
 213  * \param size  - Requested allocation size in bytes
 214  * \param flags - Requested properties of allocated memory
 215  *
 216  * \return
 217  * ::cudaSuccess,
 218  * ::cudaErrorMemoryAllocation
 219  * \notefnerr
 220  *
 221  * \sa ::cudaSetDeviceFlags,
 222  * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
 223  * ::cudaFreeHost, ::cudaHostAlloc
 224  */
 225 __inline__ __host__ cudaError_t cudaMallocHost(
 226   void         **ptr,
 227   size_t         size,
 228   unsigned int   flags
 229 )
 230 {
 231   return ::cudaHostAlloc(ptr, size, flags);
 232 }
 233
 234 template<class T>
 235 __inline__ __host__ cudaError_t cudaHostAlloc(
 236   T            **ptr,
 237   size_t         size,
 238   unsigned int   flags
 239 )
 240 {
 241   return ::cudaHostAlloc((void**)(void*)ptr, size, flags);
 242 }
 243
 244 template<class T>
 245 __inline__ __host__ cudaError_t cudaHostGetDevicePointer(
 246   T            **pDevice,
 247   void          *pHost,
 248   unsigned int   flags
 249 )
 250 {
 251   return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags);
 252 }
 253
 254 /**
 255  * \brief Allocates memory that will be automatically managed by the Unified Memory system
 256  *
 257  * Allocates \p size bytes of managed memory on the device and returns in
 258  * \p *devPtr a pointer to the allocated memory. If the device doesn't support
 259  * allocating managed memory, ::cudaErrorNotSupported is returned. Support
 260  * for managed memory can be queried using the device attribute
 261  * ::cudaDevAttrManagedMemory. The allocated memory is suitably
 262  * aligned for any kind of variable. The memory is not cleared. If \p size
 263  * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
 264  * is valid on the CPU and on all GPUs in the system that support managed memory.
 265  * All accesses to this pointer must obey the Unified Memory programming model.
 266  *
 267  * \p flags specifies the default stream association for this allocation.
 268  * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
 269  * default value for \p flags is ::cudaMemAttachGlobal.
 270  * If ::cudaMemAttachGlobal is specified, then this memory is accessible from
 271  * any stream on any device. If ::cudaMemAttachHost is specified, then the
 272  * allocation is created with initial visibility restricted to host access only;
 273  * an explicit call to ::cudaStreamAttachMemAsync will be required to enable access
 274  * on the device.
 275  *
 276  * If the association is later changed via ::cudaStreamAttachMemAsync to
 277  * a single stream, the default association, as specifed during ::cudaMallocManaged,
 278  * is restored when that stream is destroyed. For __managed__ variables, the
 279  * default association is always ::cudaMemAttachGlobal. Note that destroying a
 280  * stream is an asynchronous operation, and as a result, the change to default
 281  * association won't happen until all work in the stream has completed.
 282  *
 283  * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree.
 284  *
 285  * On a multi-GPU system with peer-to-peer support, where multiple GPUs support
 286  * managed memory, the physical storage is created on the GPU which is active
 287  * at the time ::cudaMallocManaged is called. All other GPUs will reference the
 288  * data at reduced bandwidth via peer mappings over the PCIe bus. The Unified
 289  * Memory management system does not migrate memory between GPUs.
 290  *
 291  * On a multi-GPU system where multiple GPUs support managed memory, but not
 292  * all pairs of such GPUs have peer-to-peer support between them, the physical
 293  * storage is created in 'zero-copy' or system memory. All GPUs will reference
 294  * the data at reduced bandwidth over the PCIe bus. In these circumstances,
 295  * use of the environment variable, CUDA_VISIBLE_DEVICES, is recommended to
 296  * restrict CUDA to only use those GPUs that have peer-to-peer support.
 297  * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero
 298  * value to force the driver to always use device memory for physical storage.
 299  * When this environment variable is set to a non-zero value, all devices used in
 300  * that process that support managed memory have to be peer-to-peer compatible
 301  * with each other. The error ::cudaErrorInvalidDevice will be returned if a device
 302  * that supports managed memory is used and it is not peer-to-peer compatible with
 303  * any of the other managed memory supporting devices that were previously used in
 304  * that process, even if ::cudaDeviceReset has been called on those devices. These
 305  * environment variables are described in the CUDA programming guide under the
 306  * "CUDA environment variables" section.
 307  *
 308  * \param devPtr - Pointer to allocated device memory
 309  * \param size   - Requested allocation size in bytes
 310  * \param flags  - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal)
 311  *
 312  * \return
 313  * ::cudaSuccess,
 314  * ::cudaErrorMemoryAllocation
 315  * ::cudaErrorNotSupported
 316  * ::cudaErrorInvalidValue
 317  *
 318  * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
 319  * ::cudaMalloc3D, ::cudaMalloc3DArray,
 320  * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
 321  * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync
 322  */
 323 template<class T>
 324 __inline__ __host__ cudaError_t cudaMallocManaged(
 325   T            **devPtr,
 326   size_t         size,
 327   unsigned int   flags = cudaMemAttachGlobal
 328 )
 329 {
 330   return ::cudaMallocManaged((void**)(void*)devPtr, size, flags);
 331 }
 332
 333 /**
 334  * \brief Attach memory to a stream asynchronously
 335  *
 336  * Enqueues an operation in \p stream to specify stream association of
 337  * \p length bytes of memory starting from \p devPtr. This function is a
 338  * stream-ordered operation, meaning that it is dependent on, and will
 339  * only take effect when, previous work in stream has completed. Any
 340  * previous association is automatically replaced.
 341  *
 342  * \p devPtr must point to an address within managed memory space declared
 343  * using the __managed__ keyword or allocated with ::cudaMallocManaged.
 344  *
 345  * \p length must be zero, to indicate that the entire allocation's
 346  * stream association is being changed.  Currently, it's not possible
 347  * to change stream association for a portion of an allocation. The default
 348  * value for \p length is zero.
 349  *
 350  * The stream association is specified using \p flags which must be
 351  * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
 352  * The default value for \p flags is ::cudaMemAttachSingle
 353  * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
 354  * by any stream on any device.
 355  * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
 356  * that it won't access the memory on the device from any stream.
 357  * If the ::cudaMemAttachSingle flag is specified, the program makes a guarantee
 358  * that it will only access the memory on the device from \p stream. It is illegal
 359  * to attach singly to the NULL stream, because the NULL stream is a virtual global
 360  * stream and not a specific stream. An error will be returned in this case.
 361  *
 362  * When memory is associated with a single stream, the Unified Memory system will
 363  * allow CPU access to this memory region so long as all operations in \p stream
 364  * have completed, regardless of whether other streams are active. In effect,
 365  * this constrains exclusive ownership of the managed memory region by
 366  * an active GPU to per-stream activity instead of whole-GPU activity.
 367  *
 368  * Accessing memory on the device from streams that are not associated with
 369  * it will produce undefined results. No error checking is performed by the
 370  * Unified Memory system to ensure that kernels launched into other streams
 371  * do not access this region.
 372  *
 373  * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
 374  * via events, synchronization or other means to ensure legal access to memory
 375  * at all times. Data visibility and coherency will be changed appropriately
 376  * for all kernels which follow a stream-association change.
 377  *
 378  * If \p stream is destroyed while data is associated with it, the association is
 379  * removed and the association reverts to the default visibility of the allocation
 380  * as specified at ::cudaMallocManaged. For __managed__ variables, the default
 381  * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an
 382  * asynchronous operation, and as a result, the change to default association won't
 383  * happen until all work in the stream has completed.
 384  *
 385  * \param stream  - Stream in which to enqueue the attach operation
 386  * \param devPtr  - Pointer to memory (must be a pointer to managed memory)
 387  * \param length  - Length of memory (must be zero, defaults to zero)
 388  * \param flags   - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle)
 389  *
 390  * \return
 391  * ::cudaSuccess,
 392  * ::cudaErrorNotReady,
 393  * ::cudaErrorInvalidValue
 394  * ::cudaErrorInvalidResourceHandle
 395  * \notefnerr
 396  *
 397  * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged
 398  */
 399 template<class T>
 400 __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(
 401   cudaStream_t   stream,
 402   T              *devPtr,
 403   size_t         length = 0,
 404   unsigned int   flags  = cudaMemAttachSingle
 405 )
 406 {
 407   return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
 408 }
 409
 410 template<class T>
 411 __inline__ __host__ cudaError_t cudaMalloc(
 412   T      **devPtr,
 413   size_t   size
 414 )
 415 {
 416   return ::cudaMalloc((void**)(void*)devPtr, size);
 417 }
 418
 419 template<class T>
 420 __inline__ __host__ cudaError_t cudaMallocHost(
 421   T            **ptr,
 422   size_t         size,
 423   unsigned int   flags = 0
 424 )
 425 {
 426   return cudaMallocHost((void**)(void*)ptr, size, flags);
 427 }
 428
 429 template<class T>
 430 __inline__ __host__ cudaError_t cudaMallocPitch(
 431   T      **devPtr,
 432   size_t  *pitch,
 433   size_t   width,
 434   size_t   height
 435 )
 436 {
 437   return ::cudaMallocPitch((void**)(void*)devPtr, pitch, width, height);
 438 }
 439
 440 #if defined(__CUDACC__)
 441
 442 /**
 443  * \brief \hl Copies data to the given symbol on the device
 444  *
 445  * Copies \p count bytes from the memory area pointed to by \p src
 446  * to the memory area \p offset bytes from the start of symbol
 447  * \p symbol. The memory areas may not overlap. \p symbol is a variable that
 448  * resides in global or constant memory space. \p kind can be either
 449  * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
 450  *
 451  * \param symbol - Device symbol reference
 452  * \param src    - Source memory address
 453  * \param count  - Size in bytes to copy
 454  * \param offset - Offset from start of symbol in bytes
 455  * \param kind   - Type of transfer
 456  *
 457  * \return
 458  * ::cudaSuccess,
 459  * ::cudaErrorInvalidValue,
 460  * ::cudaErrorInvalidSymbol,
 461  * ::cudaErrorInvalidDevicePointer,
 462  * ::cudaErrorInvalidMemcpyDirection
 463  * \notefnerr
 464  * \note_sync
 465  * \note_string_api_deprecation
 466  *
 467  * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
 468  * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
 469  * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray,
 470  * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
 471  * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
 472  * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
 473  * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
 474  */
 475 template<class T>
 476 __inline__ __host__ cudaError_t cudaMemcpyToSymbol(
 477   const T                   &symbol,
 478   const void                *src,
 479         size_t               count,
 480         size_t               offset = 0,
 481         enum cudaMemcpyKind  kind   = cudaMemcpyHostToDevice
 482 )
 483 {
 484   return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind);
 485 }
 486
 487 /**
 488  * \brief \hl Copies data to the given symbol on the device
 489  *
 490  * Copies \p count bytes from the memory area pointed to by \p src
 491  * to the memory area \p offset bytes from the start of symbol
 492  * \p symbol. The memory areas may not overlap. \p symbol is a variable that
 493  * resides in global or constant memory space. \p kind can be either
 494  * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
 495  *
 496  * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so
 497  * the call may return before the copy is complete. The copy can optionally
 498  * be associated to a stream by passing a non-zero \p stream argument. If
 499  * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy
 500  * may overlap with operations in other streams.
 501  *
 502  * \param symbol - Device symbol reference
 503  * \param src    - Source memory address
 504  * \param count  - Size in bytes to copy
 505  * \param offset - Offset from start of symbol in bytes
 506  * \param kind   - Type of transfer
 507  * \param stream - Stream identifier
 508  *
 509  * \return
 510  * ::cudaSuccess,
 511  * ::cudaErrorInvalidValue,
 512  * ::cudaErrorInvalidSymbol,
 513  * ::cudaErrorInvalidDevicePointer,
 514  * ::cudaErrorInvalidMemcpyDirection
 515  * \notefnerr
 516  * \note_async
 517  * \note_string_api_deprecation
 518  *
 519  * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
 520  * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
 521  * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
 522  * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
 523  * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
 524  * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
 525  * ::cudaMemcpyFromSymbolAsync
 526  */
 527 template<class T>
 528 __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync(
 529   const T                   &symbol,
 530   const void                *src,
 531         size_t               count,
 532         size_t               offset = 0,
 533         enum cudaMemcpyKind  kind   = cudaMemcpyHostToDevice,
 534         cudaStream_t         stream = 0
 535 )
 536 {
 537   return ::cudaMemcpyToSymbolAsync((const void*)&symbol, src, count, offset, kind, stream);
 538 }
 539
 540 /**
 541  * \brief \hl Copies data from the given symbol on the device
 542  *
 543  * Copies \p count bytes from the memory area \p offset bytes
 544  * from the start of symbol \p symbol to the memory area pointed to by \p dst.
 545  * The memory areas may not overlap. \p symbol is a variable that
 546  * resides in global or constant memory space. \p kind can be either
 547  * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
 548  *
 549  * \param dst    - Destination memory address
 550  * \param symbol - Device symbol reference
 551  * \param count  - Size in bytes to copy
 552  * \param offset - Offset from start of symbol in bytes
 553  * \param kind   - Type of transfer
 554  *
 555  * \return
 556  * ::cudaSuccess,
 557  * ::cudaErrorInvalidValue,
 558  * ::cudaErrorInvalidSymbol,
 559  * ::cudaErrorInvalidDevicePointer,
 560  * ::cudaErrorInvalidMemcpyDirection
 561  * \notefnerr
 562  * \note_sync
 563  * \note_string_api_deprecation
 564  *
 565  * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
 566  * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
 567  * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
 568  * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
 569  * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
 570  * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
 571  * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
 572  */
 573 template<class T>
 574 __inline__ __host__ cudaError_t cudaMemcpyFromSymbol(
 575         void                *dst,
 576   const T                   &symbol,
 577         size_t               count,
 578         size_t               offset = 0,
 579         enum cudaMemcpyKind  kind   = cudaMemcpyDeviceToHost
 580 )
 581 {
 582   return ::cudaMemcpyFromSymbol(dst, (const void*)&symbol, count, offset, kind);
 583 }
 584
 585 /**
 586  * \brief \hl Copies data from the given symbol on the device
 587  *
 588  * Copies \p count bytes from the memory area \p offset bytes
 589  * from the start of symbol \p symbol to the memory area pointed to by \p dst.
 590  * The memory areas may not overlap. \p symbol is a variable that resides in
 591  * global or constant memory space. \p kind can be either
 592  * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
 593  *
 594  * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so
 595  * the call may return before the copy is complete. The copy can optionally be
 596  * associated to a stream by passing a non-zero \p stream argument. If \p kind
 597  * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap
 598  * with operations in other streams.
 599  *
 600  * \param dst    - Destination memory address
 601  * \param symbol - Device symbol reference
 602  * \param count  - Size in bytes to copy
 603  * \param offset - Offset from start of symbol in bytes
 604  * \param kind   - Type of transfer
 605  * \param stream - Stream identifier
 606  *
 607  * \return
 608  * ::cudaSuccess,
 609  * ::cudaErrorInvalidValue,
 610  * ::cudaErrorInvalidSymbol,
 611  * ::cudaErrorInvalidDevicePointer,
 612  * ::cudaErrorInvalidMemcpyDirection
 613  * \notefnerr
 614  * \note_async
 615  * \note_string_api_deprecation
 616  *
 617  * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
 618  * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
 619  * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
 620  * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
 621  * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
 622  * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
 623  * ::cudaMemcpyToSymbolAsync
 624  */
 625 template<class T>
 626 __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync(
 627         void                *dst,
 628   const T                   &symbol,
 629         size_t               count,
 630         size_t               offset = 0,
 631         enum cudaMemcpyKind  kind   = cudaMemcpyDeviceToHost,
 632         cudaStream_t         stream = 0
 633 )
 634 {
 635   return ::cudaMemcpyFromSymbolAsync(dst, (const void*)&symbol, count, offset, kind, stream);
 636 }
 637
 638 /**
 639  * \brief \hl Finds the address associated with a CUDA symbol
 640  *
 641  * Returns in \p *devPtr the address of symbol \p symbol on the device.
 642  * \p symbol can either be a variable that resides in global or constant memory space.
 643  * If \p symbol cannot be found, or if \p symbol is not declared
 644  * in the global or constant memory space, \p *devPtr is unchanged and the error
 645  * ::cudaErrorInvalidSymbol is returned.
 646  *
 647  * \param devPtr - Return device pointer associated with symbol
 648  * \param symbol - Device symbol reference
 649  *
 650  * \return
 651  * ::cudaSuccess,
 652  * ::cudaErrorInvalidSymbol
 653  * \notefnerr
 654  *
 655  * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)",
 656  * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)"
 657  */
 658 template<class T>
 659 __inline__ __host__ cudaError_t cudaGetSymbolAddress(
 660         void **devPtr,
 661   const T     &symbol
 662 )
 663 {
 664   return ::cudaGetSymbolAddress(devPtr, (const void*)&symbol);
 665 }
 666
 667 /**
 668  * \brief \hl Finds the size of the object associated with a CUDA symbol
 669  *
 670  * Returns in \p *size the size of symbol \p symbol. \p symbol must be a
 671  * variable that resides in global or constant memory space.
 672  * If \p symbol cannot be found, or if \p symbol is not declared
 673  * in global or constant memory space, \p *size is unchanged and the error
 674  * ::cudaErrorInvalidSymbol is returned.
 675  *
 676  * \param size   - Size of object associated with symbol
 677  * \param symbol - Device symbol reference
 678  *
 679  * \return
 680  * ::cudaSuccess,
 681  * ::cudaErrorInvalidSymbol
 682  * \notefnerr
 683  *
 684  * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)",
 685  * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)"
 686  */
 687 template<class T>
 688 __inline__ __host__ cudaError_t cudaGetSymbolSize(
 689         size_t *size,
 690   const T      &symbol
 691 )
 692 {
 693   return ::cudaGetSymbolSize(size, (const void*)&symbol);
 694 }
 695
 696 /**
 697  * \brief \hl Binds a memory area to a texture
 698  *
 699  * Binds \p size bytes of the memory area pointed to by \p devPtr to texture
 700  * reference \p tex. \p desc describes how the memory is interpreted when
 701  * fetching values from the texture. The \p offset parameter is an optional
 702  * byte offset as with the low-level
 703  * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()"
 704  * function. Any memory previously bound to \p tex is unbound.
 705  *
 706  * \param offset - Offset in bytes
 707  * \param tex    - Texture to bind
 708  * \param devPtr - Memory area on device
 709  * \param desc   - Channel format
 710  * \param size   - Size of the memory area pointed to by devPtr
 711  *
 712  * \return
 713  * ::cudaSuccess,
 714  * ::cudaErrorInvalidValue,
 715  * ::cudaErrorInvalidDevicePointer,
 716  * ::cudaErrorInvalidTexture
 717  * \notefnerr
 718  *
 719  * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 720  * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 721  * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
 722  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 723  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 724  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 725  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 726  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 727  * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 728  * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 729  */
 730 template<class T, int dim, enum cudaTextureReadMode readMode>
 731 __inline__ __host__ cudaError_t cudaBindTexture(
 732         size_t                           *offset,
 733   const struct texture<T, dim, readMode> &tex,
 734   const void                             *devPtr,
 735   const struct cudaChannelFormatDesc     &desc,
 736         size_t                            size = UINT_MAX
 737 )
 738 {
 739   return ::cudaBindTexture(offset, &tex, devPtr, &desc, size);
 740 }
 741
 742 /**
 743  * \brief \hl Binds a memory area to a texture
 744  *
 745  * Binds \p size bytes of the memory area pointed to by \p devPtr to texture
 746  * reference \p tex. The channel descriptor is inherited from the texture
 747  * reference type. The \p offset parameter is an optional byte offset as with
 748  * the low-level
 749  * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t)
 750  * function. Any memory previously bound to \p tex is unbound.
 751  *
 752  * \param offset - Offset in bytes
 753  * \param tex    - Texture to bind
 754  * \param devPtr - Memory area on device
 755  * \param size   - Size of the memory area pointed to by devPtr
 756  *
 757  * \return
 758  * ::cudaSuccess,
 759  * ::cudaErrorInvalidValue,
 760  * ::cudaErrorInvalidDevicePointer,
 761  * ::cudaErrorInvalidTexture
 762  * \notefnerr
 763  *
 764  * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 765  * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 766  * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
 767  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 768  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 769  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 770  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 771  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 772  * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 773  * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 774  */
 775 template<class T, int dim, enum cudaTextureReadMode readMode>
 776 __inline__ __host__ cudaError_t cudaBindTexture(
 777         size_t                           *offset,
 778   const struct texture<T, dim, readMode> &tex,
 779   const void                             *devPtr,
 780         size_t                            size = UINT_MAX
 781 )
 782 {
 783   return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size);
 784 }
 785
 786 /**
 787  * \brief \hl Binds a 2D memory area to a texture
 788  *
 789  * Binds the 2D memory area pointed to by \p devPtr to the
 790  * texture reference \p tex. The size of the area is constrained by
 791  * \p width in texel units, \p height in texel units, and \p pitch in byte
 792  * units. \p desc describes how the memory is interpreted when fetching values
 793  * from the texture. Any memory previously bound to \p tex is unbound.
 794  *
 795  * Since the hardware enforces an alignment requirement on texture base
 796  * addresses,
 797  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D()"
 798  * returns in \p *offset a byte offset that
 799  * must be applied to texture fetches in order to read from the desired memory.
 800  * This offset must be divided by the texel size and passed to kernels that
 801  * read from the texture so they can be applied to the ::tex2D() function.
 802  * If the device memory pointer was returned from ::cudaMalloc(), the offset is
 803  * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
 804  *
 805  * \param offset - Offset in bytes
 806  * \param tex    - Texture reference to bind
 807  * \param devPtr - 2D memory area on device
 808  * \param desc   - Channel format
 809  * \param width  - Width in texel units
 810  * \param height - Height in texel units
 811  * \param pitch  - Pitch in bytes
 812  *
 813  * \return
 814  * ::cudaSuccess,
 815  * ::cudaErrorInvalidValue,
 816  * ::cudaErrorInvalidDevicePointer,
 817  * ::cudaErrorInvalidTexture
 818  * \notefnerr
 819  *
 820  * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 821  * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 822  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 823  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 824  * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
 825  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 826  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 827  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 828  * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 829  * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 830  */
 831 template<class T, int dim, enum cudaTextureReadMode readMode>
 832 __inline__ __host__ cudaError_t cudaBindTexture2D(
 833         size_t                           *offset,
 834   const struct texture<T, dim, readMode> &tex,
 835   const void                             *devPtr,
 836   const struct cudaChannelFormatDesc     &desc,
 837   size_t                                  width,
 838   size_t                                  height,
 839   size_t                                  pitch
 840 )
 841 {
 842   return ::cudaBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch);
 843 }
 844
 845 /**
 846  * \brief \hl Binds a 2D memory area to a texture
 847  *
 848  * Binds the 2D memory area pointed to by \p devPtr to the
 849  * texture reference \p tex. The size of the area is constrained by
 850  * \p width in texel units, \p height in texel units, and \p pitch in byte
 851  * units. The channel descriptor is inherited from the texture reference
 852  * type. Any memory previously bound to \p tex is unbound.
 853  *
 854  * Since the hardware enforces an alignment requirement on texture base
 855  * addresses,
 856  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D()"
 857  * returns in \p *offset a byte offset that
 858  * must be applied to texture fetches in order to read from the desired memory.
 859  * This offset must be divided by the texel size and passed to kernels that
 860  * read from the texture so they can be applied to the ::tex2D() function.
 861  * If the device memory pointer was returned from ::cudaMalloc(), the offset is
 862  * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
 863  *
 864  * \param offset - Offset in bytes
 865  * \param tex    - Texture reference to bind
 866  * \param devPtr - 2D memory area on device
 867  * \param width  - Width in texel units
 868  * \param height - Height in texel units
 869  * \param pitch  - Pitch in bytes
 870  *
 871  * \return
 872  * ::cudaSuccess,
 873  * ::cudaErrorInvalidValue,
 874  * ::cudaErrorInvalidDevicePointer,
 875  * ::cudaErrorInvalidTexture
 876  * \notefnerr
 877  *
 878  * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 879  * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 880  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 881  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 882  * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
 883  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 884  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 885  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 886  * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 887  * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 888  */
 889 template<class T, int dim, enum cudaTextureReadMode readMode>
 890 __inline__ __host__ cudaError_t cudaBindTexture2D(
 891         size_t                           *offset,
 892   const struct texture<T, dim, readMode> &tex,
 893   const void                             *devPtr,
 894   size_t                                  width,
 895   size_t                                  height,
 896   size_t                                  pitch
 897 )
 898 {
 899   return ::cudaBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch);
 900 }
 901
 902 /**
 903  * \brief \hl Binds an array to a texture
 904  *
 905  * Binds the CUDA array \p array to the texture reference \p tex.
 906  * \p desc describes how the memory is interpreted when fetching values from
 907  * the texture. Any CUDA array previously bound to \p tex is unbound.
 908  *
 909  * \param tex   - Texture to bind
 910  * \param array - Memory array on device
 911  * \param desc  - Channel format
 912  *
 913  * \return
 914  * ::cudaSuccess,
 915  * ::cudaErrorInvalidValue,
 916  * ::cudaErrorInvalidDevicePointer,
 917  * ::cudaErrorInvalidTexture
 918  * \notefnerr
 919  *
 920  * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 921  * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 922  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 923  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 924  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 925  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 926  * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
 927  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 928  * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 929  * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
 930  */
 931 template<class T, int dim, enum cudaTextureReadMode readMode>
 932 __inline__ __host__ cudaError_t cudaBindTextureToArray(
 933   const struct texture<T, dim, readMode> &tex,
 934   cudaArray_const_t                       array,
 935   const struct cudaChannelFormatDesc     &desc
 936 )
 937 {
 938   return ::cudaBindTextureToArray(&tex, array, &desc);
 939 }
 940
 941 /**
 942  * \brief \hl Binds an array to a texture
 943  *
 944  * Binds the CUDA array \p array to the texture reference \p tex.
 945  * The channel descriptor is inherited from the CUDA array. Any CUDA array
 946  * previously bound to \p tex is unbound.
 947  *
 948  * \param tex   - Texture to bind
 949  * \param array - Memory array on device
 950  *
 951  * \return
 952  * ::cudaSuccess,
 953  * ::cudaErrorInvalidValue,
 954  * ::cudaErrorInvalidDevicePointer,
 955  * ::cudaErrorInvalidTexture
 956  * \notefnerr
 957  *
 958  * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 959  * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 960  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 961  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 962  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 963  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 964  * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
 965  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 966  * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 967  * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
 968  */
 969 template<class T, int dim, enum cudaTextureReadMode readMode>
 970 __inline__ __host__ cudaError_t cudaBindTextureToArray(
 971   const struct texture<T, dim, readMode> &tex,
 972   cudaArray_const_t                       array
 973 )
 974 {
 975   struct cudaChannelFormatDesc desc;
 976   cudaError_t                  err = ::cudaGetChannelDesc(&desc, array);
 977
 978   return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : err;
 979 }
 980
 981 /**
 982  * \brief \hl Binds a mipmapped array to a texture
 983  *
 984  * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex.
 985  * \p desc describes how the memory is interpreted when fetching values from
 986  * the texture. Any CUDA mipmapped array previously bound to \p tex is unbound.
 987  *
 988  * \param tex            - Texture to bind
 989  * \param mipmappedArray - Memory mipmapped array on device
 990  * \param desc           - Channel format
 991  *
 992  * \return
 993  * ::cudaSuccess,
 994  * ::cudaErrorInvalidValue,
 995  * ::cudaErrorInvalidDevicePointer,
 996  * ::cudaErrorInvalidTexture
 997  * \notefnerr
 998  *
 999  * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
1000  * ::cudaGetChannelDesc, ::cudaGetTextureReference,
1001  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
1002  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
1003  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
1004  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
1005  * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
1006  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
1007  * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
1008  * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
1009  */
1010 template<class T, int dim, enum cudaTextureReadMode readMode>
1011 __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray(
1012   const struct texture<T, dim, readMode> &tex,
1013   cudaMipmappedArray_const_t              mipmappedArray,
1014   const struct cudaChannelFormatDesc     &desc
1015 )
1016 {
1017   return ::cudaBindTextureToMipmappedArray(&tex, mipmappedArray, &desc);
1018 }
1019
1020 /**
1021  * \brief \hl Binds a mipmapped array to a texture
1022  *
1023  * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex.
1024  * The channel descriptor is inherited from the CUDA array. Any CUDA mipmapped array
1025  * previously bound to \p tex is unbound.
1026  *
1027  * \param tex            - Texture to bind
1028  * \param mipmappedArray - Memory mipmapped array on device
1029  *
1030  * \return
1031  * ::cudaSuccess,
1032  * ::cudaErrorInvalidValue,
1033  * ::cudaErrorInvalidDevicePointer,
1034  * ::cudaErrorInvalidTexture
1035  * \notefnerr
1036  *
1037  * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
1038  * ::cudaGetChannelDesc, ::cudaGetTextureReference,
1039  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
1040  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
1041  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
1042  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
1043  * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
1044  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
1045  * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
1046  * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
1047  */
1048 template<class T, int dim, enum cudaTextureReadMode readMode>
1049 __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray(
1050   const struct texture<T, dim, readMode> &tex,
1051   cudaMipmappedArray_const_t              mipmappedArray
1052 )
1053 {
1054   struct cudaChannelFormatDesc desc;
1055   cudaArray_t                  levelArray;
1056   cudaError_t                  err = ::cudaGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0);
1057
1058   if (err != cudaSuccess) {
1059       return err;
1060   }
1061   err = ::cudaGetChannelDesc(&desc, levelArray);
1062
1063   return err == cudaSuccess ? cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc) : err;
1064 }
1065
1066 /**
1067  * \brief \hl Unbinds a texture
1068  *
1069  * Unbinds the texture bound to \p tex.
1070  *
1071  * \param tex - Texture to unbind
1072  *
1073  * \return ::cudaSuccess
1074  * \notefnerr
1075  *
1076  * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
1077  * ::cudaGetChannelDesc, ::cudaGetTextureReference,
1078  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
1079  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
1080  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
1081  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
1082  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
1083  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
1084  * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
1085  * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
1086  */
1087 template<class T, int dim, enum cudaTextureReadMode readMode>
1088 __inline__ __host__ cudaError_t cudaUnbindTexture(
1089   const struct texture<T, dim, readMode> &tex
1090 )
1091 {
1092   return ::cudaUnbindTexture(&tex);
1093 }
1094
1095 /**
1096  * \brief \hl Get the alignment offset of a texture
1097  *
1098  * Returns in \p *offset the offset that was returned when texture reference
1099  * \p tex was bound.
1100  *
1101  * \param offset - Offset of texture reference in bytes
1102  * \param tex    - Texture to get offset of
1103  *
1104  * \return
1105  * ::cudaSuccess,
1106  * ::cudaErrorInvalidTexture,
1107  * ::cudaErrorInvalidTextureBinding
1108  * \notefnerr
1109  *
1110  * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
1111  * ::cudaGetChannelDesc, ::cudaGetTextureReference,
1112  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
1113  * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
1114  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
1115  * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
1116  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
1117  * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
1118  * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
1119  * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
1120  */
1121 template<class T, int dim, enum cudaTextureReadMode readMode>
1122 __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset(
1123         size_t                           *offset,
1124   const struct texture<T, dim, readMode> &tex
1125 )
1126 {
1127   return ::cudaGetTextureAlignmentOffset(offset, &tex);
1128 }
1129
1130 /**
1131  * \brief \hl Sets the preferred cache configuration for a device function
1132  *
1133  * On devices where the L1 cache and shared memory use the same hardware
1134  * resources, this sets through \p cacheConfig the preferred cache configuration
1135  * for the function specified via \p func. This is only a preference. The
1136  * runtime will use the requested configuration if possible, but it is free to
1137  * choose a different configuration if required to execute \p func.
1138  *
1139  * \p func must be a pointer to a function that executes on the device.
1140  * The parameter specified by \p func must be declared as a \p __global__
1141  * function. If the specified function does not exist,
1142  * then ::cudaErrorInvalidDeviceFunction is returned.
1143  *
1144  * This setting does nothing on devices where the size of the L1 cache and
1145  * shared memory are fixed.
1146  *
1147  * Launching a kernel with a different preference than the most recent
1148  * preference setting may insert a device-side synchronization point.
1149  *
1150  * The supported cache configurations are:
1151  * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
1152  * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
1153  * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
1154  *
1155  * \param func        - device function pointer
1156  * \param cacheConfig - Requested cache configuration
1157  *
1158  * \return
1159  * ::cudaSuccess,
1160  * ::cudaErrorInitializationError,
1161  * ::cudaErrorInvalidDeviceFunction
1162  * \notefnerr
1163  *
1164  * \sa ::cudaConfigureCall,
1165  * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
1166  * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
1167  * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
1168  * ::cudaSetDoubleForDevice,
1169  * ::cudaSetDoubleForHost,
1170  * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)",
1171  * ::cudaThreadGetCacheConfig,
1172  * ::cudaThreadSetCacheConfig
1173  */
1174 template<class T>
1175 __inline__ __host__ cudaError_t cudaFuncSetCacheConfig(
1176   T                  *func,
1177   enum cudaFuncCache  cacheConfig
1178 )
1179 {
1180   return ::cudaFuncSetCacheConfig((const void*)func, cacheConfig);
1181 }
1182
1183 template<class T>
1184 __inline__ __host__ cudaError_t cudaFuncSetSharedMemConfig(
1185   T                        *func,
1186   enum cudaSharedMemConfig  config
1187 )
1188 {
1189   return ::cudaFuncSetSharedMemConfig((const void*)func, config);
1190 }
1191
1192 /**
1193  * \brief Returns occupancy for a device function
1194  *
1195  * Returns in \p *numBlocks the maximum number of active blocks per
1196  * streaming multiprocessor for the device function.
1197  *
1198  * \param numBlocks       - Returned occupancy
1199  * \param func            - Kernel function for which occupancy is calulated
1200  * \param blockSize       - Block size the kernel is intended to be launched with
1201  * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
1202  *
1203  * \return
1204  * ::cudaSuccess,
1205  * ::cudaErrorCudartUnloading,
1206  * ::cudaErrorInitializationError,
1207  * ::cudaErrorInvalidDevice,
1208  * ::cudaErrorInvalidDeviceFunction,
1209  * ::cudaErrorInvalidValue,
1210  * ::cudaErrorUnknown,
1211  * \notefnerr
1212  *
1213  * \sa ::cudaOccupancyMaxPotentialBlockSize
1214  * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
1215  */
1216 template<class T>
1217 __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(
1218     int   *numBlocks,
1219     T      func,
1220     int    blockSize,
1221     size_t dynamicSMemSize)
1222 {
1223   return ::cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, (const void*)func, blockSize, dynamicSMemSize);
1224 }
1225
1226 /**
1227  * Helper functor for cudaOccupancyMaxPotentialBlockSize
1228  */
1229 class __cudaOccupancyB2DHelper {
1230   size_t n;
1231 public:
1232   inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n) : n(n) {}
1233   inline __host__ CUDART_DEVICE size_t operator()(int)
1234   {
1235       return n;
1236   }
1237 };
1238
1239 /**
1240  * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
1241  *
1242  * Returns in \p *minGridSize and \p *blocksize a suggested grid /
1243  * block size pair that achieves the best potential occupancy
1244  * (i.e. the maximum number of active warps with the smallest number
1245  * of blocks).
1246  *
1247  * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
1248  * amount of per-block dynamic shared memory changes with different
1249  * block sizes.
1250  *
1251  * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
1252  * \param blockSize   - Returned block size
1253  * \param func        - Device function symbol
1254  * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
1255  * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
1256  *
1257  * \return
1258  * ::cudaSuccess,
1259  * ::cudaErrorCudartUnloading,
1260  * ::cudaErrorInitializationError,
1261  * ::cudaErrorInvalidDevice,
1262  * ::cudaErrorInvalidDeviceFunction,
1263  * ::cudaErrorInvalidValue,
1264  * ::cudaErrorUnknown,
1265  * \notefnerr
1266  *
1267  * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
1268  * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
1269  */
1270 template<class T>
1271 __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize(
1272     int    *minGridSize,
1273     int    *blockSize,
1274     T       func,
1275     size_t  dynamicSMemSize = 0,
1276     int     blockSizeLimit = 0)
1277 {
1278   return cudaOccupancyMaxPotentialBlockSizeVariableSMem(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit);
1279 }
1280
1281 /**
1282  * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
1283  *
1284  * Returns in \p *minGridSize and \p *blocksize a suggested grid /
1285  * block size pair that achieves the best potential occupancy
1286  * (i.e. the maximum number of active warps with the smallest number
1287  * of blocks).
1288  *
1289  * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
1290  * \param blockSize   - Returned block size
1291  * \param func        - Device function symbol
1292  * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
1293  * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
1294  *
1295  * \return
1296  * ::cudaSuccess,
1297  * ::cudaErrorCudartUnloading,
1298  * ::cudaErrorInitializationError,
1299  * ::cudaErrorInvalidDevice,
1300  * ::cudaErrorInvalidDeviceFunction,
1301  * ::cudaErrorInvalidValue,
1302  * ::cudaErrorUnknown,
1303  * \notefnerr
1304  *
1305  * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
1306  * \sa ::cudaOccupancyMaxPotentialBlockSize
1307  */
1308
1309 template<typename UnaryFunction, class T>
1310 __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem(
1311     int           *minGridSize,
1312     int           *blockSize,
1313     T              func,
1314     UnaryFunction  blockSizeToDynamicSMemSize,
1315     int            blockSizeLimit = 0)
1316 {
1317     cudaError_t status;
1318
1319     // Device and function properties
1320     int                       device;
1321     struct cudaFuncAttributes attr;
1322
1323     // Limits
1324     int maxThreadsPerMultiProcessor;
1325     int warpSize;
1326     int devMaxThreadsPerBlock;
1327     int multiProcessorCount;
1328     int funcMaxThreadsPerBlock;
1329     int occupancyLimit;
1330     int granularity;
1331
1332     // Recorded maximum
1333     int maxBlockSize = 0;
1334     int numBlocks    = 0;
1335     int maxOccupancy = 0;
1336
1337     // Temporary
1338     int blockSizeToTryAligned;
1339     int blockSizeToTry;
1340     int blockSizeLimitAligned;
1341     int occupancyInBlocks;
1342     int occupancyInThreads;
1343     int dynamicSMemSize;
1344
1345     ///////////////////////////
1346     // Check user input
1347     ///////////////////////////
1348
1349     if (!minGridSize || !blockSize || !func) {
1350         return cudaErrorInvalidValue;
1351     }
1352
1353     //////////////////////////////////////////////
1354     // Obtain device and function properties
1355     //////////////////////////////////////////////
1356
1357     status = ::cudaGetDevice(&device);
1358     if (status != cudaSuccess) {
1359         return status;
1360     }
1361
1362     status = cudaDeviceGetAttribute(
1363         &maxThreadsPerMultiProcessor,
1364         cudaDevAttrMaxThreadsPerMultiProcessor,
1365         device);
1366     if (status != cudaSuccess) {
1367         return status;
1368     }
1369
1370     status = cudaDeviceGetAttribute(
1371         &warpSize,
1372         cudaDevAttrWarpSize,
1373         device);
1374     if (status != cudaSuccess) {
1375         return status;
1376     }
1377
1378     status = cudaDeviceGetAttribute(
1379         &devMaxThreadsPerBlock,
1380         cudaDevAttrMaxThreadsPerBlock,
1381         device);
1382     if (status != cudaSuccess) {
1383         return status;
1384     }
1385
1386     status = cudaDeviceGetAttribute(
1387         &multiProcessorCount,
1388         cudaDevAttrMultiProcessorCount,
1389         device);
1390     if (status != cudaSuccess) {
1391         return status;
1392     }
1393
1394     status = cudaFuncGetAttributes(&attr, func);
1395     if (status != cudaSuccess) {
1396         return status;
1397     }
1398
1399     funcMaxThreadsPerBlock = attr.maxThreadsPerBlock;
1400
1401     /////////////////////////////////////////////////////////////////////////////////
1402     // Try each block size, and pick the block size with maximum occupancy
1403     /////////////////////////////////////////////////////////////////////////////////
1404
1405     occupancyLimit = maxThreadsPerMultiProcessor;
1406     granularity    = warpSize;
1407
1408     if (blockSizeLimit == 0) {
1409         blockSizeLimit = devMaxThreadsPerBlock;
1410     }
1411
1412     if (devMaxThreadsPerBlock < blockSizeLimit) {
1413         blockSizeLimit = devMaxThreadsPerBlock;
1414     }
1415
1416     if (funcMaxThreadsPerBlock < blockSizeLimit) {
1417         blockSizeLimit = funcMaxThreadsPerBlock;
1418     }
1419
1420     blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity;
1421
1422     for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
1423         // This is needed for the first iteration, because
1424         // blockSizeLimitAligned could be greater than blockSizeLimit
1425         //
1426         if (blockSizeLimit < blockSizeToTryAligned) {
1427             blockSizeToTry = blockSizeLimit;
1428         } else {
1429             blockSizeToTry = blockSizeToTryAligned;
1430         }
1431
1432         dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
1433
1434         status = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
1435             &occupancyInBlocks,
1436             func,
1437             blockSizeToTry,
1438             dynamicSMemSize);
1439
1440         if (status != cudaSuccess) {
1441             return status;
1442         }
1443
1444         occupancyInThreads = blockSizeToTry * occupancyInBlocks;
1445
1446         if (occupancyInThreads > maxOccupancy) {
1447             maxBlockSize = blockSizeToTry;
1448             numBlocks    = occupancyInBlocks;
1449             maxOccupancy = occupancyInThreads;
1450         }
1451
1452         // Early out if we have reached the maximum
1453         //
1454         if (occupancyLimit == maxOccupancy) {
1455             break;
1456         }
1457     }
1458
1459     ///////////////////////////
1460     // Return best available
1461     ///////////////////////////
1462
1463     // Suggested min grid size to achieve a full machine launch
1464     //
1465     *minGridSize = numBlocks * multiProcessorCount;
1466     *blockSize = maxBlockSize;
1467
1468     return status;
1469 }
1470
1471 /**
1472  * \brief \hl Launches a device function
1473  *
1474  * Launches the function \p entry on the device. The parameter \p entry must
1475  * be a function that executes on the device. The parameter specified by \p entry
1476  * must be declared as a \p __global__ function.
1477  * \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to
1478  * ::cudaConfigureCall() since it pops the data that was pushed by
1479  * ::cudaConfigureCall() from the execution stack.
1480  *
1481  * \param entry - Device function pointer
1482  * to execute
1483  *
1484  * \return
1485  * ::cudaSuccess,
1486  * ::cudaErrorInvalidDeviceFunction,
1487  * ::cudaErrorInvalidConfiguration,
1488  * ::cudaErrorLaunchFailure,
1489  * ::cudaErrorLaunchTimeout,
1490  * ::cudaErrorLaunchOutOfResources,
1491  * ::cudaErrorSharedObjectSymbolNotFound,
1492  * ::cudaErrorSharedObjectInitFailed
1493  * \notefnerr
1494  *
1495  * \sa ::cudaConfigureCall,
1496  * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
1497  * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
1498  * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
1499  * ::cudaSetDoubleForDevice,
1500  * ::cudaSetDoubleForHost,
1501  * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)",
1502  * ::cudaThreadGetCacheConfig,
1503  * ::cudaThreadSetCacheConfig
1504  */
1505 template<class T>
1506 __inline__ __host__ cudaError_t cudaLaunch(
1507   T *func
1508 )
1509 {
1510   return ::cudaLaunch((const void*)func);
1511 }
1512
1513 /**
1514  * \brief \hl Find out attributes for a given function
1515  *
1516  * This function obtains the attributes of a function specified via \p entry.
1517  * The parameter \p entry must be a pointer to a function that executes
1518  * on the device. The parameter specified by \p entry must be declared as a \p __global__
1519  * function. The fetched attributes are placed in \p attr. If the specified
1520  * function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
1521  *
1522  * Note that some function attributes such as
1523  * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock"
1524  * may vary based on the device that is currently being used.
1525  *
1526  * \param attr  - Return pointer to function's attributes
1527  * \param entry - Function to get attributes of
1528  *
1529  * \return
1530  * ::cudaSuccess,
1531  * ::cudaErrorInitializationError,
1532  * ::cudaErrorInvalidDeviceFunction
1533  * \notefnerr
1534  *
1535  * \sa ::cudaConfigureCall,
1536  * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
1537  * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
1538  * \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
1539  * ::cudaSetDoubleForDevice,
1540  * ::cudaSetDoubleForHost,
1541  * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)"
1542  */
1543 template<class T>
1544 __inline__ __host__ cudaError_t cudaFuncGetAttributes(
1545   struct cudaFuncAttributes *attr,
1546   T                         *entry
1547 )
1548 {
1549   return ::cudaFuncGetAttributes(attr, (const void*)entry);
1550 }
1551
1552 /**
1553  * \brief \hl Binds an array to a surface
1554  *
1555  * Binds the CUDA array \p array to the surface reference \p surf.
1556  * \p desc describes how the memory is interpreted when dealing with
1557  * the surface. Any CUDA array previously bound to \p surf is unbound.
1558  *
1559  * \param surf  - Surface to bind
1560  * \param array - Memory array on device
1561  * \param desc  - Channel format
1562  *
1563  * \return
1564  * ::cudaSuccess,
1565  * ::cudaErrorInvalidValue,
1566  * ::cudaErrorInvalidSurface
1567  * \notefnerr
1568  *
1569  * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
1570  * \ref ::cudaBindSurfaceToArray(const struct surface<T, dim>&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)"
1571  */
1572 template<class T, int dim>
1573 __inline__ __host__ cudaError_t cudaBindSurfaceToArray(
1574   const struct surface<T, dim>       &surf,
1575   cudaArray_const_t                   array,
1576   const struct cudaChannelFormatDesc &desc
1577 )
1578 {
1579   return ::cudaBindSurfaceToArray(&surf, array, &desc);
1580 }
1581
1582 /**
1583  * \brief \hl Binds an array to a surface
1584  *
1585  * Binds the CUDA array \p array to the surface reference \p surf.
1586  * The channel descriptor is inherited from the CUDA array. Any CUDA array
1587  * previously bound to \p surf is unbound.
1588  *
1589  * \param surf  - Surface to bind
1590  * \param array - Memory array on device
1591  *
1592  * \return
1593  * ::cudaSuccess,
1594  * ::cudaErrorInvalidValue,
1595  * ::cudaErrorInvalidSurface
1596  * \notefnerr
1597  *
1598  * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
1599  * \ref ::cudaBindSurfaceToArray(const struct surface<T, dim>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)"
1600  */
1601 template<class T, int dim>
1602 __inline__ __host__ cudaError_t cudaBindSurfaceToArray(
1603   const struct surface<T, dim> &surf,
1604   cudaArray_const_t             array
1605 )
1606 {
1607   struct cudaChannelFormatDesc desc;
1608   cudaError_t                  err = ::cudaGetChannelDesc(&desc, array);
1609
1610   return err == cudaSuccess ? cudaBindSurfaceToArray(surf, array, desc) : err;
1611 }
1612
1613 #endif /* __CUDACC__ */
1614
1615 /** @} */ /* END CUDART_HIGHLEVEL */
1616
1617 } // namespace anonymous
1618
1619 #endif /* __cplusplus */
1620
1621 #endif /* !__CUDA_RUNTIME_H__ */