/*
 * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein are PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and are being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee. Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users. These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
 * only as a commercial end item. Consistent with 48 C.F.R. 12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
#if !defined(__CUDA_RUNTIME_H__)
#define __CUDA_RUNTIME_H__

/*******************************************************************************
*                                                                              *
*******************************************************************************/

#include "host_config.h"

/*******************************************************************************
*                                                                              *
*******************************************************************************/

#include "builtin_types.h"
#include "channel_descriptor.h"
#include "cuda_runtime_api.h"
#include "driver_functions.h"
#include "host_defines.h"
#include "vector_functions.h"
#if defined(__CUDACC__)

#include "common_functions.h"
#include "cuda_surface_types.h"
#include "cuda_texture_types.h"
#include "device_functions.h"
#include "device_launch_parameters.h"

#endif /* __CUDACC__ */

#if defined(__cplusplus)
/*******************************************************************************
*                                                                              *
*******************************************************************************/

/**
 * \addtogroup CUDART_HIGHLEVEL
 *
 * @{
 */

/**
 * \brief \hl Configure a device launch
 *
 * Pushes \p size bytes of the argument pointed to by \p arg at \p offset
 * bytes from the start of the parameter passing area, which starts at
 * offset 0. The arguments are stored in the top of the execution stack.
 * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument()" must be preceded
 * by a call to ::cudaConfigureCall().
 *
 * \param arg    - Argument to push for a kernel launch
 * \param offset - Offset in argument stack to push new arg
 *
 * \return
 * ::cudaSuccess
 *
 * \sa ::cudaConfigureCall,
 * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
 * \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
 * ::cudaSetDoubleForDevice,
 * ::cudaSetDoubleForHost,
 * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)"
 */
template<class T>
__inline__ __host__ cudaError_t cudaSetupArgument(T arg, size_t offset)
{
  return ::cudaSetupArgument((const void*)&arg, sizeof(T), offset);
}
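/*
 * A minimal sketch of the manual launch sequence (illustrative; `myKernel`
 * is an assumed __global__ kernel, not part of this header):
 *
 * \code
 * __global__ void myKernel(int x);        // assumed kernel
 *
 * void launchManually()
 * {
 *   cudaConfigureCall(dim3(1), dim3(1));  // push grid/block configuration
 *   cudaSetupArgument(42, 0);             // push int argument at offset 0
 *   cudaLaunch(myKernel);                 // pop configuration and launch
 * }
 * \endcode
 */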
/**
 * \brief \hl Creates an event object with the specified flags
 *
 * Creates an event object with the specified flags. Valid flags include:
 * - ::cudaEventDefault: Default event creation flag.
 * - ::cudaEventBlockingSync: Specifies that event should use blocking
 *   synchronization. A host thread that uses ::cudaEventSynchronize() to wait
 *   on an event created with this flag will block until the event actually
 *   completes.
 * - ::cudaEventDisableTiming: Specifies that the created event does not need
 *   to record timing data. Events created with this flag specified and
 *   the ::cudaEventBlockingSync flag not specified will provide the best
 *   performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
 *
 * \param event - Newly created event
 * \param flags - Flags for new event
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInitializationError,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorLaunchFailure,
 * ::cudaErrorMemoryAllocation
 *
 * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
 * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
 * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
 * ::cudaStreamWaitEvent
 */
static __inline__ __host__ cudaError_t cudaEventCreate(cudaEvent_t *event, unsigned int flags)
{
  return ::cudaEventCreateWithFlags(event, flags);
}
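/*
 * A minimal sketch of creating an event with flags (illustrative):
 *
 * \code
 * cudaEvent_t ev;
 * cudaEventCreate(&ev, cudaEventBlockingSync | cudaEventDisableTiming);
 * cudaEventRecord(ev, 0);     // record into the default stream
 * cudaEventSynchronize(ev);   // host blocks until all prior work completes
 * cudaEventDestroy(ev);
 * \endcode
 */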
/**
 * \brief \hl Allocates page-locked memory on the host
 *
 * Allocates \p size bytes of host memory that is page-locked and accessible
 * to the device. The driver tracks the virtual memory ranges allocated with
 * this function and automatically accelerates calls to functions such as
 * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it
 * can be read or written with much higher bandwidth than pageable memory
 * obtained with functions such as ::malloc(). Allocating excessive amounts of
 * pinned memory may degrade system performance, since it reduces the amount
 * of memory available to the system for paging. As a result, this function is
 * best used sparingly to allocate staging areas for data exchange between host
 * and device.
 *
 * The \p flags parameter enables different options to be specified that affect
 * the allocation, as follows.
 * - ::cudaHostAllocDefault: This flag's value is defined to be 0.
 * - ::cudaHostAllocPortable: The memory returned by this call will be
 *   considered as pinned memory by all CUDA contexts, not just the one that
 *   performed the allocation.
 * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space.
 *   The device pointer to the memory may be obtained by calling
 *   ::cudaHostGetDevicePointer().
 * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC).
 *   WC memory can be transferred across the PCI Express bus more quickly on some
 *   system configurations, but cannot be read efficiently by most CPUs. WC
 *   memory is a good option for buffers that will be written by the CPU and read
 *   by the device via mapped pinned memory or host->device transfers.
 *
 * All of these flags are orthogonal to one another: a developer may allocate
 * memory that is portable, mapped and/or write-combined with no restrictions.
 *
 * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost
 * flag in order for the ::cudaHostAllocMapped flag to have any effect.
 *
 * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices
 * that do not support mapped pinned memory. The failure is deferred to
 * ::cudaHostGetDevicePointer() because the memory may be mapped into other
 * CUDA contexts via the ::cudaHostAllocPortable flag.
 *
 * Memory allocated by this function must be freed with ::cudaFreeHost().
 *
 * \param ptr   - Host pointer to allocated memory
 * \param size  - Requested allocation size in bytes
 * \param flags - Requested properties of allocated memory
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorMemoryAllocation
 *
 * \sa ::cudaSetDeviceFlags,
 * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
 * ::cudaFreeHost, ::cudaHostAlloc
 */
__inline__ __host__ cudaError_t cudaMallocHost(void **ptr, size_t size, unsigned int flags)
{
  return ::cudaHostAlloc(ptr, size, flags);
}

template<class T>
__inline__ __host__ cudaError_t cudaHostAlloc(T **ptr, size_t size, unsigned int flags)
{
  return ::cudaHostAlloc((void**)(void*)ptr, size, flags);
}

template<class T>
__inline__ __host__ cudaError_t cudaHostGetDevicePointer(T **pDevice, void *pHost, unsigned int flags)
{
  return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags);
}
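/*
 * A minimal sketch of a mapped, pinned allocation (illustrative; error
 * checking omitted):
 *
 * \code
 * float *h_buf, *d_alias;
 * cudaSetDeviceFlags(cudaDeviceMapHost);   // required for cudaHostAllocMapped
 * cudaMallocHost(&h_buf, 1024 * sizeof(float), cudaHostAllocMapped);
 * cudaHostGetDevicePointer(&d_alias, h_buf, 0);   // device view of h_buf
 * // ... pass d_alias to kernels, touch h_buf on the host ...
 * cudaFreeHost(h_buf);
 * \endcode
 */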
/**
 * \brief Allocates memory that will be automatically managed by the Unified Memory system
 *
 * Allocates \p size bytes of managed memory on the device and returns in
 * \p *devPtr a pointer to the allocated memory. If the device doesn't support
 * allocating managed memory, ::cudaErrorNotSupported is returned. Support
 * for managed memory can be queried using the device attribute
 * ::cudaDevAttrManagedMemory. The allocated memory is suitably
 * aligned for any kind of variable. The memory is not cleared. If \p size
 * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
 * is valid on the CPU and on all GPUs in the system that support managed memory.
 * All accesses to this pointer must obey the Unified Memory programming model.
 *
 * \p flags specifies the default stream association for this allocation.
 * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
 * default value for \p flags is ::cudaMemAttachGlobal.
 * If ::cudaMemAttachGlobal is specified, then this memory is accessible from
 * any stream on any device. If ::cudaMemAttachHost is specified, then the
 * allocation is created with initial visibility restricted to host access only;
 * an explicit call to ::cudaStreamAttachMemAsync will be required to enable
 * access on the device.
 *
 * If the association is later changed via ::cudaStreamAttachMemAsync to
 * a single stream, the default association, as specified during ::cudaMallocManaged,
 * is restored when that stream is destroyed. For __managed__ variables, the
 * default association is always ::cudaMemAttachGlobal. Note that destroying a
 * stream is an asynchronous operation, and as a result, the change to default
 * association won't happen until all work in the stream has completed.
 *
 * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree.
 *
 * On a multi-GPU system with peer-to-peer support, where multiple GPUs support
 * managed memory, the physical storage is created on the GPU which is active
 * at the time ::cudaMallocManaged is called. All other GPUs will reference the
 * data at reduced bandwidth via peer mappings over the PCIe bus. The Unified
 * Memory management system does not migrate memory between GPUs.
 *
 * On a multi-GPU system where multiple GPUs support managed memory, but not
 * all pairs of such GPUs have peer-to-peer support between them, the physical
 * storage is created in 'zero-copy' or system memory. All GPUs will reference
 * the data at reduced bandwidth over the PCIe bus. In these circumstances,
 * use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
 * restrict CUDA to only use those GPUs that have peer-to-peer support.
 * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero
 * value to force the driver to always use device memory for physical storage.
 * When this environment variable is set to a non-zero value, all devices used in
 * that process that support managed memory have to be peer-to-peer compatible
 * with each other. The error ::cudaErrorInvalidDevice will be returned if a device
 * that supports managed memory is used and it is not peer-to-peer compatible with
 * any of the other managed-memory-supporting devices that were previously used in
 * that process, even if ::cudaDeviceReset has been called on those devices. These
 * environment variables are described in the CUDA programming guide under the
 * "CUDA environment variables" section.
 *
 * \param devPtr - Pointer to allocated device memory
 * \param size   - Requested allocation size in bytes
 * \param flags  - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal)
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorMemoryAllocation,
 * ::cudaErrorNotSupported,
 * ::cudaErrorInvalidValue
 *
 * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
 * ::cudaMalloc3D, ::cudaMalloc3DArray,
 * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
 * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync
 */
template<class T>
__inline__ __host__ cudaError_t cudaMallocManaged(
  T **devPtr,
  size_t size,
  unsigned int flags = cudaMemAttachGlobal
)
{
  return ::cudaMallocManaged((void**)(void*)devPtr, size, flags);
}
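/*
 * A minimal sketch of managed allocation (illustrative; `increment` is an
 * assumed __global__ kernel):
 *
 * \code
 * __global__ void increment(int *p);   // assumed kernel
 *
 * void demo()
 * {
 *   int *data;
 *   cudaMallocManaged(&data, 100 * sizeof(int));   // one pointer, CPU + GPU
 *   data[0] = 41;                                  // direct host access
 *   increment<<<1, 1>>>(data);
 *   cudaDeviceSynchronize();                       // required before CPU reads
 *   cudaFree(data);
 * }
 * \endcode
 */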
/**
 * \brief Attach memory to a stream asynchronously
 *
 * Enqueues an operation in \p stream to specify stream association of
 * \p length bytes of memory starting from \p devPtr. This function is a
 * stream-ordered operation, meaning that it is dependent on, and will
 * only take effect when, previous work in \p stream has completed. Any
 * previous association is automatically replaced.
 *
 * \p devPtr must point to an address within managed memory space declared
 * using the __managed__ keyword or allocated with ::cudaMallocManaged.
 *
 * \p length must be zero, to indicate that the entire allocation's
 * stream association is being changed. Currently, it is not possible
 * to change stream association for a portion of an allocation. The default
 * value for \p length is zero.
 *
 * The stream association is specified using \p flags which must be
 * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
 * The default value for \p flags is ::cudaMemAttachSingle.
 * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
 * by any stream on any device.
 * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
 * that it won't access the memory on the device from any stream.
 * If the ::cudaMemAttachSingle flag is specified, the program makes a guarantee
 * that it will only access the memory on the device from \p stream. It is illegal
 * to attach singly to the NULL stream, because the NULL stream is a virtual global
 * stream and not a specific stream. An error will be returned in this case.
 *
 * When memory is associated with a single stream, the Unified Memory system will
 * allow CPU access to this memory region so long as all operations in \p stream
 * have completed, regardless of whether other streams are active. In effect,
 * this constrains exclusive ownership of the managed memory region by
 * an active GPU to per-stream activity instead of whole-GPU activity.
 *
 * Accessing memory on the device from streams that are not associated with
 * it will produce undefined results. No error checking is performed by the
 * Unified Memory system to ensure that kernels launched into other streams
 * do not access this region.
 *
 * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
 * via events, synchronization or other means to ensure legal access to memory
 * at all times. Data visibility and coherency will be changed appropriately
 * for all kernels which follow a stream-association change.
 *
 * If \p stream is destroyed while data is associated with it, the association is
 * removed and the association reverts to the default visibility of the allocation
 * as specified at ::cudaMallocManaged. For __managed__ variables, the default
 * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an
 * asynchronous operation, and as a result, the change to default association won't
 * happen until all work in the stream has completed.
 *
 * \param stream - Stream in which to enqueue the attach operation
 * \param devPtr - Pointer to memory (must be a pointer to managed memory)
 * \param length - Length of memory (must be zero, defaults to zero)
 * \param flags  - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle)
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorNotReady,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidResourceHandle
 *
 * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged
 */
template<class T>
__inline__ __host__ cudaError_t cudaStreamAttachMemAsync(
  cudaStream_t stream,
  T *devPtr,
  size_t length = 0,
  unsigned int flags = cudaMemAttachSingle
)
{
  return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
}
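/*
 * A minimal sketch of per-stream attachment (illustrative):
 *
 * \code
 * cudaStream_t s;
 * int *data;
 * cudaStreamCreate(&s);
 * cudaMallocManaged(&data, 4096, cudaMemAttachHost);  // host-only at first
 * cudaStreamAttachMemAsync(s, data);                  // now owned by stream s
 * // ... launch kernels into s that use data ...
 * cudaStreamSynchronize(s);                           // CPU may touch data again
 * \endcode
 */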
template<class T>
__inline__ __host__ cudaError_t cudaMalloc(T **devPtr, size_t size)
{
  return ::cudaMalloc((void**)(void*)devPtr, size);
}

template<class T>
__inline__ __host__ cudaError_t cudaMallocHost(
  T **ptr,
  size_t size,
  unsigned int flags = 0
)
{
  return cudaMallocHost((void**)(void*)ptr, size, flags);
}

template<class T>
__inline__ __host__ cudaError_t cudaMallocPitch(T **devPtr, size_t *pitch, size_t width, size_t height)
{
  return ::cudaMallocPitch((void**)(void*)devPtr, pitch, width, height);
}
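/*
 * A minimal sketch of the typed overloads (illustrative): the templates spare
 * the caller the explicit (void**) cast required by the C API.
 *
 * \code
 * float *d_vec, *d_mat;
 * size_t pitch;
 * cudaMalloc(&d_vec, 256 * sizeof(float));
 * cudaMallocPitch(&d_mat, &pitch, 64 * sizeof(float), 64);   // padded rows
 * cudaFree(d_mat);
 * cudaFree(d_vec);
 * \endcode
 */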
#if defined(__CUDACC__)
/**
 * \brief \hl Copies data to the given symbol on the device
 *
 * Copies \p count bytes from the memory area pointed to by \p src
 * to the memory area \p offset bytes from the start of symbol
 * \p symbol. The memory areas may not overlap. \p symbol is a variable that
 * resides in global or constant memory space. \p kind can be either
 * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
 *
 * \param symbol - Device symbol reference
 * \param src    - Source memory address
 * \param count  - Size in bytes to copy
 * \param offset - Offset from start of symbol in bytes
 * \param kind   - Type of transfer
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidSymbol,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidMemcpyDirection
 *
 * \note_string_api_deprecation
 *
 * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
 * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
 * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray,
 * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
 * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
 * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
 * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
 */
template<class T>
__inline__ __host__ cudaError_t cudaMemcpyToSymbol(
  const T &symbol,
  const void *src,
  size_t count,
  size_t offset = 0,
  enum cudaMemcpyKind kind = cudaMemcpyHostToDevice
)
{
  return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind);
}
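/*
 * A minimal sketch (illustrative; `coeffs` is an assumed __constant__
 * symbol):
 *
 * \code
 * __constant__ float coeffs[16];
 *
 * void upload(const float *hostCoeffs)
 * {
 *   cudaMemcpyToSymbol(coeffs, hostCoeffs, 16 * sizeof(float));
 * }
 * \endcode
 */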
/**
 * \brief \hl Copies data to the given symbol on the device
 *
 * Copies \p count bytes from the memory area pointed to by \p src
 * to the memory area \p offset bytes from the start of symbol
 * \p symbol. The memory areas may not overlap. \p symbol is a variable that
 * resides in global or constant memory space. \p kind can be either
 * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
 *
 * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so
 * the call may return before the copy is complete. The copy can optionally
 * be associated to a stream by passing a non-zero \p stream argument. If
 * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy
 * may overlap with operations in other streams.
 *
 * \param symbol - Device symbol reference
 * \param src    - Source memory address
 * \param count  - Size in bytes to copy
 * \param offset - Offset from start of symbol in bytes
 * \param kind   - Type of transfer
 * \param stream - Stream identifier
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidSymbol,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidMemcpyDirection
 *
 * \note_string_api_deprecation
 *
 * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
 * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
 * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
 * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
 * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
 * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
 * ::cudaMemcpyFromSymbolAsync
 */
template<class T>
__inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync(
  const T &symbol,
  const void *src,
  size_t count,
  size_t offset = 0,
  enum cudaMemcpyKind kind = cudaMemcpyHostToDevice,
  cudaStream_t stream = 0
)
{
  return ::cudaMemcpyToSymbolAsync((const void*)&symbol, src, count, offset, kind, stream);
}
/**
 * \brief \hl Copies data from the given symbol on the device
 *
 * Copies \p count bytes from the memory area \p offset bytes
 * from the start of symbol \p symbol to the memory area pointed to by \p dst.
 * The memory areas may not overlap. \p symbol is a variable that
 * resides in global or constant memory space. \p kind can be either
 * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
 *
 * \param dst    - Destination memory address
 * \param symbol - Device symbol reference
 * \param count  - Size in bytes to copy
 * \param offset - Offset from start of symbol in bytes
 * \param kind   - Type of transfer
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidSymbol,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidMemcpyDirection
 *
 * \note_string_api_deprecation
 *
 * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
 * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
 * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
 * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
 * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
 * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
 * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
 */
template<class T>
__inline__ __host__ cudaError_t cudaMemcpyFromSymbol(
  void *dst,
  const T &symbol,
  size_t count,
  size_t offset = 0,
  enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost
)
{
  return ::cudaMemcpyFromSymbol(dst, (const void*)&symbol, count, offset, kind);
}
/**
 * \brief \hl Copies data from the given symbol on the device
 *
 * Copies \p count bytes from the memory area \p offset bytes
 * from the start of symbol \p symbol to the memory area pointed to by \p dst.
 * The memory areas may not overlap. \p symbol is a variable that resides in
 * global or constant memory space. \p kind can be either
 * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
 *
 * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so
 * the call may return before the copy is complete. The copy can optionally be
 * associated to a stream by passing a non-zero \p stream argument. If \p kind
 * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap
 * with operations in other streams.
 *
 * \param dst    - Destination memory address
 * \param symbol - Device symbol reference
 * \param count  - Size in bytes to copy
 * \param offset - Offset from start of symbol in bytes
 * \param kind   - Type of transfer
 * \param stream - Stream identifier
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidSymbol,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidMemcpyDirection
 *
 * \note_string_api_deprecation
 *
 * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
 * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
 * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
 * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
 * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
 * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
 * ::cudaMemcpyToSymbolAsync
 */
template<class T>
__inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync(
  void *dst,
  const T &symbol,
  size_t count,
  size_t offset = 0,
  enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost,
  cudaStream_t stream = 0
)
{
  return ::cudaMemcpyFromSymbolAsync(dst, (const void*)&symbol, count, offset, kind, stream);
}
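/*
 * A minimal sketch of an asynchronous round trip through a device symbol
 * (illustrative; `counter` is an assumed __device__ symbol):
 *
 * \code
 * __device__ int counter;
 *
 * void roundTrip(cudaStream_t s)
 * {
 *   int in = 7, out = 0;
 *   cudaMemcpyToSymbolAsync(counter, &in, sizeof(int), 0,
 *                           cudaMemcpyHostToDevice, s);
 *   cudaMemcpyFromSymbolAsync(&out, counter, sizeof(int), 0,
 *                             cudaMemcpyDeviceToHost, s);
 *   cudaStreamSynchronize(s);   // &in and &out must stay valid until here
 * }
 * \endcode
 */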
/**
 * \brief \hl Finds the address associated with a CUDA symbol
 *
 * Returns in \p *devPtr the address of symbol \p symbol on the device.
 * \p symbol is a variable that resides in global or constant memory space.
 * If \p symbol cannot be found, or if \p symbol is not declared
 * in the global or constant memory space, \p *devPtr is unchanged and the error
 * ::cudaErrorInvalidSymbol is returned.
 *
 * \param devPtr - Return device pointer associated with symbol
 * \param symbol - Device symbol reference
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidSymbol
 *
 * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)",
 * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)"
 */
template<class T>
__inline__ __host__ cudaError_t cudaGetSymbolAddress(void **devPtr, const T &symbol)
{
  return ::cudaGetSymbolAddress(devPtr, (const void*)&symbol);
}
/**
 * \brief \hl Finds the size of the object associated with a CUDA symbol
 *
 * Returns in \p *size the size of symbol \p symbol. \p symbol must be a
 * variable that resides in global or constant memory space.
 * If \p symbol cannot be found, or if \p symbol is not declared
 * in global or constant memory space, \p *size is unchanged and the error
 * ::cudaErrorInvalidSymbol is returned.
 *
 * \param size   - Size of object associated with symbol
 * \param symbol - Device symbol reference
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidSymbol
 *
 * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)",
 * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)"
 */
template<class T>
__inline__ __host__ cudaError_t cudaGetSymbolSize(size_t *size, const T &symbol)
{
  return ::cudaGetSymbolSize(size, (const void*)&symbol);
}
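/*
 * A minimal sketch (illustrative; `table` is an assumed __device__ array):
 *
 * \code
 * __device__ float table[256];
 *
 * void query()
 * {
 *   void *addr;
 *   size_t bytes;
 *   cudaGetSymbolAddress(&addr, table);   // device pointer, usable in memcpy
 *   cudaGetSymbolSize(&bytes, table);     // bytes == 256 * sizeof(float)
 * }
 * \endcode
 */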
/**
 * \brief \hl Binds a memory area to a texture
 *
 * Binds \p size bytes of the memory area pointed to by \p devPtr to texture
 * reference \p tex. \p desc describes how the memory is interpreted when
 * fetching values from the texture. The \p offset parameter is an optional
 * byte offset as with the low-level
 * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()"
 * function. Any memory previously bound to \p tex is unbound.
 *
 * \param offset - Offset in bytes
 * \param tex    - Texture to bind
 * \param devPtr - Memory area on device
 * \param desc   - Channel format
 * \param size   - Size of the memory area pointed to by devPtr
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTexture(
  size_t *offset,
  const struct texture<T, dim, readMode> &tex,
  const void *devPtr,
  const struct cudaChannelFormatDesc &desc,
  size_t size = UINT_MAX
)
{
  return ::cudaBindTexture(offset, &tex, devPtr, &desc, size);
}
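/*
 * A minimal sketch of binding linear device memory to a texture reference
 * (illustrative; `texRef` is an assumed file-scope texture reference):
 *
 * \code
 * texture<float, 1, cudaReadModeElementType> texRef;
 *
 * void bind(const float *d_data, size_t n)
 * {
 *   cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
 *   size_t offset = 0;
 *   cudaBindTexture(&offset, texRef, d_data, desc, n * sizeof(float));
 * }
 * \endcode
 */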
/**
 * \brief \hl Binds a memory area to a texture
 *
 * Binds \p size bytes of the memory area pointed to by \p devPtr to texture
 * reference \p tex. The channel descriptor is inherited from the texture
 * reference type. The \p offset parameter is an optional byte offset as with
 * the low-level
 * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()"
 * function. Any memory previously bound to \p tex is unbound.
 *
 * \param offset - Offset in bytes
 * \param tex    - Texture to bind
 * \param devPtr - Memory area on device
 * \param size   - Size of the memory area pointed to by devPtr
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTexture(
  size_t *offset,
  const struct texture<T, dim, readMode> &tex,
  const void *devPtr,
  size_t size = UINT_MAX
)
{
  return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size);
}
/**
 * \brief \hl Binds a 2D memory area to a texture
 *
 * Binds the 2D memory area pointed to by \p devPtr to the
 * texture reference \p tex. The size of the area is constrained by
 * \p width in texel units, \p height in texel units, and \p pitch in byte
 * units. \p desc describes how the memory is interpreted when fetching values
 * from the texture. Any memory previously bound to \p tex is unbound.
 *
 * Since the hardware enforces an alignment requirement on texture base
 * addresses,
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D()"
 * returns in \p *offset a byte offset that
 * must be applied to texture fetches in order to read from the desired memory.
 * This offset must be divided by the texel size and passed to kernels that
 * read from the texture so it can be applied to the ::tex2D() function.
 * If the device memory pointer was returned from ::cudaMalloc(), the offset is
 * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
 *
 * \param offset - Offset in bytes
 * \param tex    - Texture reference to bind
 * \param devPtr - 2D memory area on device
 * \param desc   - Channel format
 * \param width  - Width in texel units
 * \param height - Height in texel units
 * \param pitch  - Pitch in bytes
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTexture2D(
  size_t *offset,
  const struct texture<T, dim, readMode> &tex,
  const void *devPtr,
  const struct cudaChannelFormatDesc &desc,
  size_t width,
  size_t height,
  size_t pitch
)
{
  return ::cudaBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch);
}
/**
 * \brief \hl Binds a 2D memory area to a texture
 *
 * Binds the 2D memory area pointed to by \p devPtr to the
 * texture reference \p tex. The size of the area is constrained by
 * \p width in texel units, \p height in texel units, and \p pitch in byte
 * units. The channel descriptor is inherited from the texture reference
 * type. Any memory previously bound to \p tex is unbound.
 *
 * Since the hardware enforces an alignment requirement on texture base
 * addresses,
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D()"
 * returns in \p *offset a byte offset that
 * must be applied to texture fetches in order to read from the desired memory.
 * This offset must be divided by the texel size and passed to kernels that
 * read from the texture so it can be applied to the ::tex2D() function.
 * If the device memory pointer was returned from ::cudaMalloc(), the offset is
 * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
 *
 * \param offset - Offset in bytes
 * \param tex    - Texture reference to bind
 * \param devPtr - 2D memory area on device
 * \param width  - Width in texel units
 * \param height - Height in texel units
 * \param pitch  - Pitch in bytes
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTexture2D(
  size_t *offset,
  const struct texture<T, dim, readMode> &tex,
  const void *devPtr,
  size_t width,
  size_t height,
  size_t pitch
)
{
  return ::cudaBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch);
}
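/*
 * A minimal sketch of binding a pitched 2D area (illustrative; NULL is passed
 * for \p offset here on the assumption that the pointer from
 * ::cudaMallocPitch is suitably aligned, as with ::cudaMalloc):
 *
 * \code
 * texture<float, 2, cudaReadModeElementType> tex2dRef;
 *
 * void bind2D(size_t width, size_t height)
 * {
 *   float *d_img;
 *   size_t pitch;
 *   cudaMallocPitch(&d_img, &pitch, width * sizeof(float), height);
 *   cudaBindTexture2D(NULL, tex2dRef, d_img, width, height, pitch);
 * }
 * \endcode
 */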
/**
 * \brief \hl Binds an array to a texture
 *
 * Binds the CUDA array \p array to the texture reference \p tex.
 * \p desc describes how the memory is interpreted when fetching values from
 * the texture. Any CUDA array previously bound to \p tex is unbound.
 *
 * \param tex   - Texture to bind
 * \param array - Memory array on device
 * \param desc  - Channel format
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTextureToArray(
  const struct texture<T, dim, readMode> &tex,
  cudaArray_const_t array,
  const struct cudaChannelFormatDesc &desc
)
{
  return ::cudaBindTextureToArray(&tex, array, &desc);
}
/**
 * \brief \hl Binds an array to a texture
 *
 * Binds the CUDA array \p array to the texture reference \p tex.
 * The channel descriptor is inherited from the CUDA array. Any CUDA array
 * previously bound to \p tex is unbound.
 *
 * \param tex   - Texture to bind
 * \param array - Memory array on device
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTextureToArray(
  const struct texture<T, dim, readMode> &tex,
  cudaArray_const_t array
)
{
  struct cudaChannelFormatDesc desc;
  cudaError_t err = ::cudaGetChannelDesc(&desc, array);

  return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : err;
}
/**
 * \brief \hl Binds a mipmapped array to a texture
 *
 * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex.
 * \p desc describes how the memory is interpreted when fetching values from
 * the texture. Any CUDA mipmapped array previously bound to \p tex is unbound.
 *
 * \param tex            - Texture to bind
 * \param mipmappedArray - Memory mipmapped array on device
 * \param desc           - Channel format
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray(
  const struct texture<T, dim, readMode> &tex,
  cudaMipmappedArray_const_t mipmappedArray,
  const struct cudaChannelFormatDesc &desc
)
{
  return ::cudaBindTextureToMipmappedArray(&tex, mipmappedArray, &desc);
}
/**
 * \brief \hl Binds a mipmapped array to a texture
 *
 * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex.
 * The channel descriptor is inherited from the CUDA array. Any CUDA mipmapped array
 * previously bound to \p tex is unbound.
 *
 * \param tex            - Texture to bind
 * \param mipmappedArray - Memory mipmapped array on device
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray(
  const struct texture<T, dim, readMode> &tex,
  cudaMipmappedArray_const_t mipmappedArray
)
{
  struct cudaChannelFormatDesc desc;
  cudaArray_t levelArray;
  cudaError_t err = ::cudaGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0);

  if (err != cudaSuccess) {
    return err;
  }

  err = ::cudaGetChannelDesc(&desc, levelArray);

  return err == cudaSuccess ? cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc) : err;
}
/**
 * \brief \hl Unbinds a texture
 *
 * Unbinds the texture bound to \p tex.
 *
 * \param tex - Texture to unbind
 *
 * \return ::cudaSuccess
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaUnbindTexture(
  const struct texture<T, dim, readMode> &tex
)
{
  return ::cudaUnbindTexture(&tex);
}
/**
 * \brief \hl Get the alignment offset of a texture
 *
 * Returns in \p *offset the offset that was returned when texture reference
 * \p tex was bound.
 *
 * \param offset - Offset of texture reference in bytes
 * \param tex    - Texture to get offset of
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidTexture,
 * ::cudaErrorInvalidTextureBinding
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset(
  size_t *offset,
  const struct texture<T, dim, readMode> &tex
)
{
  return ::cudaGetTextureAlignmentOffset(offset, &tex);
}
/**
 * \brief \hl Sets the preferred cache configuration for a device function
 *
 * On devices where the L1 cache and shared memory use the same hardware
 * resources, this sets through \p cacheConfig the preferred cache configuration
 * for the function specified via \p func. This is only a preference. The
 * runtime will use the requested configuration if possible, but it is free to
 * choose a different configuration if required to execute \p func.
 *
 * \p func must be a pointer to a function that executes on the device.
 * The parameter specified by \p func must be declared as a \p __global__
 * function. If the specified function does not exist,
 * then ::cudaErrorInvalidDeviceFunction is returned.
 *
 * This setting does nothing on devices where the size of the L1 cache and
 * shared memory are fixed.
 *
 * Launching a kernel with a different preference than the most recent
 * preference setting may insert a device-side synchronization point.
 *
 * The supported cache configurations are:
 * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
 * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
 * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
 *
 * \param func        - Device function pointer
 * \param cacheConfig - Requested cache configuration
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInitializationError,
 * ::cudaErrorInvalidDeviceFunction
 *
 * \sa ::cudaConfigureCall,
 * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
 * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
 * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
 * ::cudaSetDoubleForDevice,
 * ::cudaSetDoubleForHost,
 * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)",
 * ::cudaThreadGetCacheConfig,
 * ::cudaThreadSetCacheConfig
 */
template<class T>
__inline__ __host__ cudaError_t cudaFuncSetCacheConfig(
  T *func,
  enum cudaFuncCache cacheConfig
)
{
  return ::cudaFuncSetCacheConfig((const void*)func, cacheConfig);
}
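/*
 * A minimal sketch (illustrative; `myKernel` is an assumed __global__
 * kernel):
 *
 * \code
 * __global__ void myKernel();   // assumed kernel
 *
 * void configure()
 * {
 *   cudaFuncSetCacheConfig(myKernel, cudaFuncCachePreferShared);
 *   myKernel<<<128, 256>>>();   // runs with the preferred config if possible
 * }
 * \endcode
 */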
template<class T>
__inline__ __host__ cudaError_t cudaFuncSetSharedMemConfig(
  T *func,
  enum cudaSharedMemConfig config
)
{
  return ::cudaFuncSetSharedMemConfig((const void*)func, config);
}
/**
 * \brief Returns occupancy for a device function
 *
 * Returns in \p *numBlocks the maximum number of active blocks per
 * streaming multiprocessor for the device function.
 *
 * \param numBlocks - Returned occupancy
 * \param func      - Kernel function for which occupancy is calculated
 * \param blockSize - Block size the kernel is intended to be launched with
 * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorCudartUnloading,
 * ::cudaErrorInitializationError,
 * ::cudaErrorInvalidDevice,
 * ::cudaErrorInvalidDeviceFunction,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorUnknown
 *
 * \sa ::cudaOccupancyMaxPotentialBlockSize
 * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
 */
template<class T>
__inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(
  int *numBlocks,
  T func,
  int blockSize,
  size_t dynamicSMemSize)
{
  return ::cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, (const void*)func, blockSize, dynamicSMemSize);
}
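/*
 * A minimal sketch of an occupancy query (illustrative; `myKernel` is an
 * assumed __global__ kernel):
 *
 * \code
 * __global__ void myKernel();   // assumed kernel
 *
 * void report()
 * {
 *   int numBlocks = 0;
 *   cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, myKernel, 256, 0);
 *   // numBlocks is now the max resident blocks per multiprocessor at
 *   // block size 256 with no dynamic shared memory
 * }
 * \endcode
 */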
/**
 * Helper functor for cudaOccupancyMaxPotentialBlockSize
 */
class __cudaOccupancyB2DHelper {
  size_t n;
public:
  inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n) : n(n) {}
  inline __host__ CUDART_DEVICE size_t operator()(int)
  {
    return n;
  }
};
/**
 * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
 *
 * Returns in \p *minGridSize and \p *blockSize a suggested grid /
 * block size pair that achieves the best potential occupancy
 * (i.e. the maximum number of active warps with the smallest number
 * of blocks).
 *
 * Use ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
 * amount of per-block dynamic shared memory changes with different
 * block sizes.
 *
 * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
 * \param blockSize   - Returned block size
 * \param func        - Device function symbol
 * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
 * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorCudartUnloading,
 * ::cudaErrorInitializationError,
 * ::cudaErrorInvalidDevice,
 * ::cudaErrorInvalidDeviceFunction,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorUnknown
 *
 * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
 * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
 */
template<class T>
__inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize(
  int *minGridSize,
  int *blockSize,
  T func,
  size_t dynamicSMemSize = 0,
  int blockSizeLimit = 0)
{
  return cudaOccupancyMaxPotentialBlockSizeVariableSMem(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit);
}
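/*
 * A minimal sketch of an occupancy-driven launch (illustrative; `myKernel`
 * and the problem size `n` are assumptions):
 *
 * \code
 * __global__ void myKernel(int n);   // assumed kernel
 *
 * void launchForSize(int n)
 * {
 *   int minGridSize = 0, blockSize = 0;
 *   cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, myKernel);
 *   int gridSize = (n + blockSize - 1) / blockSize;   // cover n elements
 *   myKernel<<<gridSize, blockSize>>>(n);
 * }
 * \endcode
 */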
/**
 * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
 *
 * Returns in \p *minGridSize and \p *blockSize a suggested grid /
 * block size pair that achieves the best potential occupancy
 * (i.e. the maximum number of active warps with the smallest number
 * of blocks).
 *
 * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
 * \param blockSize   - Returned block size
 * \param func        - Device function symbol
 * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
 * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit.
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorCudartUnloading,
 * ::cudaErrorInitializationError,
 * ::cudaErrorInvalidDevice,
 * ::cudaErrorInvalidDeviceFunction,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorUnknown
 *
 * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
 * \sa ::cudaOccupancyMaxPotentialBlockSize
 */
template<typename UnaryFunction, class T>
__inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem(
  int *minGridSize,
  int *blockSize,
  T func,
  UnaryFunction blockSizeToDynamicSMemSize,
  int blockSizeLimit = 0)
{
  cudaError_t status;

  // Device and function properties
  int device;
  struct cudaFuncAttributes attr;

  // Limits
  int maxThreadsPerMultiProcessor;
  int warpSize;
  int devMaxThreadsPerBlock;
  int multiProcessorCount;
  int funcMaxThreadsPerBlock;
  int occupancyLimit;
  int granularity;

  // Recorded maximum
  int maxBlockSize = 0;
  int numBlocks = 0;
  int maxOccupancy = 0;

  // Temporary
  int blockSizeToTryAligned;
  int blockSizeToTry;
  int blockSizeLimitAligned;
  int occupancyInBlocks;
  int occupancyInThreads;
  size_t dynamicSMemSize;
1345 ///////////////////////////
1347 ///////////////////////////
1349 if (!minGridSize || !blockSize || !func) {
1350 return cudaErrorInvalidValue;
1353 //////////////////////////////////////////////
1354 // Obtain device and function properties
1355 //////////////////////////////////////////////
1357 status = ::cudaGetDevice(&device);
1358 if (status != cudaSuccess) {
1362 status = cudaDeviceGetAttribute(
1363 &maxThreadsPerMultiProcessor,
1364 cudaDevAttrMaxThreadsPerMultiProcessor,
1366 if (status != cudaSuccess) {
1370 status = cudaDeviceGetAttribute(
1372 cudaDevAttrWarpSize,
1374 if (status != cudaSuccess) {
1378 status = cudaDeviceGetAttribute(
1379 &devMaxThreadsPerBlock,
1380 cudaDevAttrMaxThreadsPerBlock,
1382 if (status != cudaSuccess) {
1386 status = cudaDeviceGetAttribute(
1387 &multiProcessorCount,
1388 cudaDevAttrMultiProcessorCount,
1390 if (status != cudaSuccess) {
1394 status = cudaFuncGetAttributes(&attr, func);
1395 if (status != cudaSuccess) {
1399 funcMaxThreadsPerBlock = attr.maxThreadsPerBlock;
1401 /////////////////////////////////////////////////////////////////////////////////
1402 // Try each block size, and pick the block size with maximum occupancy
1403 /////////////////////////////////////////////////////////////////////////////////
1405 occupancyLimit = maxThreadsPerMultiProcessor;
1406 granularity = warpSize;
1408 if (blockSizeLimit == 0) {
1409 blockSizeLimit = devMaxThreadsPerBlock;
1412 if (devMaxThreadsPerBlock < blockSizeLimit) {
1413 blockSizeLimit = devMaxThreadsPerBlock;
1416 if (funcMaxThreadsPerBlock < blockSizeLimit) {
1417 blockSizeLimit = funcMaxThreadsPerBlock;
1420 blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity;
1422 for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
1423 // This is needed for the first iteration, because
1424 // blockSizeLimitAligned could be greater than blockSizeLimit
1426 if (blockSizeLimit < blockSizeToTryAligned) {
1427 blockSizeToTry = blockSizeLimit;
1429 blockSizeToTry = blockSizeToTryAligned;
1432 dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
1434 status = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
1440 if (status != cudaSuccess) {
1444 occupancyInThreads = blockSizeToTry * occupancyInBlocks;
1446 if (occupancyInThreads > maxOccupancy) {
1447 maxBlockSize = blockSizeToTry;
1448 numBlocks = occupancyInBlocks;
1449 maxOccupancy = occupancyInThreads;
1452 // Early out if we have reached the maximum
1454 if (occupancyLimit == maxOccupancy) {
1459 ///////////////////////////
1460 // Return best available
1461 ///////////////////////////
1463 // Suggested min grid size to achieve a full machine launch
1465 *minGridSize = numBlocks * multiProcessorCount;
1466 *blockSize = maxBlockSize;
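
/*
 * Usage sketch (editor's addition; `blockSum` and `SMemPerThread` are
 * illustrative, not part of this header): a kernel whose dynamic shared
 * memory grows with the block size, sized through a functor in the same
 * spirit as __cudaOccupancyB2DHelper above.
 *
 * \code
 * __global__ void blockSum(const float *in, float *out, int n)
 * {
 *     extern __shared__ float scratch[];       // one float per thread
 *     int i = blockIdx.x * blockDim.x + threadIdx.x;
 *     scratch[threadIdx.x] = (i < n) ? in[i] : 0.0f;
 *     __syncthreads();
 *     if (threadIdx.x == 0) {
 *         float s = 0.0f;
 *         for (unsigned int t = 0; t < blockDim.x; ++t) s += scratch[t];
 *         out[blockIdx.x] = s;
 *     }
 * }
 *
 * struct SMemPerThread {
 *     __host__ __device__ size_t operator()(int blockSize) const
 *     {
 *         return blockSize * sizeof(float);    // scratch scales with block size
 *     }
 * };
 *
 * void launchBlockSum(const float *d_in, float *d_out, int n)
 * {
 *     int minGridSize = 0, blockSize = 0;
 *     if (cudaOccupancyMaxPotentialBlockSizeVariableSMem(
 *             &minGridSize, &blockSize, blockSum, SMemPerThread()) != cudaSuccess) {
 *         return;
 *     }
 *     int gridSize = (n + blockSize - 1) / blockSize;
 *     blockSum<<<gridSize, blockSize, blockSize * sizeof(float)>>>(d_in, d_out, n);
 * }
 * \endcode
 */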
/**
 * \brief \hl Launches a device function
 *
 * Launches the function \p entry on the device. The parameter \p entry must
 * be a pointer to a function that executes on the device and is declared as
 * a \p __global__ function.
 * \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to
 * ::cudaConfigureCall() since it pops the data that was pushed by
 * ::cudaConfigureCall() from the execution stack.
 *
 * \param entry - Device function pointer
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidDeviceFunction,
 * ::cudaErrorInvalidConfiguration,
 * ::cudaErrorLaunchFailure,
 * ::cudaErrorLaunchTimeout,
 * ::cudaErrorLaunchOutOfResources,
 * ::cudaErrorSharedObjectSymbolNotFound,
 * ::cudaErrorSharedObjectInitFailed
 * \notefnerr
 *
 * \sa ::cudaConfigureCall,
 * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
 * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
 * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
 * ::cudaSetDoubleForDevice,
 * ::cudaSetDoubleForHost,
 * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)",
 * ::cudaThreadGetCacheConfig,
 * ::cudaThreadSetCacheConfig
 */
template<class T>
__inline__ __host__ cudaError_t cudaLaunch(
    T *entry
)
{
    return ::cudaLaunch((const void*)entry);
}
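
/*
 * Usage sketch (editor's addition; `addOne` and the grid/block shapes are
 * illustrative, not part of this header): the legacy configure/setup/launch
 * sequence this wrapper participates in.
 *
 * \code
 * __global__ void addOne(int *p)
 * {
 *     p[threadIdx.x] += 1;
 * }
 *
 * void launchAddOne(int *d_p)
 * {
 *     cudaConfigureCall(dim3(1), dim3(64));   // push grid/block configuration
 *     cudaSetupArgument(d_p, 0);              // push the argument at byte offset 0
 *     cudaLaunch(addOne);                     // pops the configuration and launches
 * }
 * \endcode
 */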
/**
 * \brief \hl Finds out attributes for a given function
 *
 * This function obtains the attributes of a function specified via \p entry.
 * The parameter \p entry must be a pointer to a function that executes
 * on the device and is declared as a \p __global__ function. The fetched
 * attributes are placed in \p attr. If the specified function does not
 * exist, ::cudaErrorInvalidDeviceFunction is returned.
 *
 * Note that some function attributes such as
 * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock"
 * may vary based on the device that is currently being used.
 *
 * \param attr  - Return pointer to function's attributes
 * \param entry - Function to get attributes of
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInitializationError,
 * ::cudaErrorInvalidDeviceFunction
 * \notefnerr
 *
 * \sa ::cudaConfigureCall,
 * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
 * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
 * \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
 * ::cudaSetDoubleForDevice,
 * ::cudaSetDoubleForHost,
 * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)"
 */
template<class T>
__inline__ __host__ cudaError_t cudaFuncGetAttributes(
    struct cudaFuncAttributes *attr,
    T *entry
)
{
    return ::cudaFuncGetAttributes(attr, (const void*)entry);
}
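
/*
 * Usage sketch (editor's addition; `probeKernel` is illustrative): querying
 * per-device limits of a kernel before choosing a launch configuration.
 *
 * \code
 * #include <cstdio>
 *
 * __global__ void probeKernel(int *p)
 * {
 *     p[threadIdx.x] = threadIdx.x;
 * }
 *
 * void printKernelLimits()
 * {
 *     struct cudaFuncAttributes attr;
 *     if (cudaFuncGetAttributes(&attr, probeKernel) == cudaSuccess) {
 *         printf("maxThreadsPerBlock=%d numRegs=%d sharedSizeBytes=%zu\n",
 *                attr.maxThreadsPerBlock, attr.numRegs, attr.sharedSizeBytes);
 *     }
 * }
 * \endcode
 */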
/**
 * \brief \hl Binds an array to a surface
 *
 * Binds the CUDA array \p array to the surface reference \p surf.
 * \p desc describes how the memory is interpreted when dealing with
 * the surface. Any CUDA array previously bound to \p surf is unbound.
 *
 * \param surf  - Surface to bind
 * \param array - Memory array on device
 * \param desc  - Channel format
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidSurface
 * \notefnerr
 *
 * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
 * \ref ::cudaBindSurfaceToArray(const struct surface<T, dim>&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)"
 */
template<class T, int dim>
__inline__ __host__ cudaError_t cudaBindSurfaceToArray(
    const struct surface<T, dim> &surf,
    cudaArray_const_t array,
    const struct cudaChannelFormatDesc &desc
)
{
    return ::cudaBindSurfaceToArray(&surf, array, &desc);
}
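
/*
 * Usage sketch (editor's addition; `outputSurf` and the float format are
 * illustrative, not part of this header): allocating a 2D array with the
 * cudaArraySurfaceLoadStore flag, then binding it to a surface reference
 * with an explicit channel descriptor.
 *
 * \code
 * surface<void, cudaSurfaceType2D> outputSurf;
 *
 * cudaError_t bindFloatSurface(int width, int height, cudaArray_t *arrOut)
 * {
 *     struct cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
 *     cudaError_t err = cudaMallocArray(arrOut, &desc, width, height,
 *                                       cudaArraySurfaceLoadStore);
 *     if (err != cudaSuccess) return err;
 *     return cudaBindSurfaceToArray(outputSurf, *arrOut, desc);
 * }
 * \endcode
 */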
/**
 * \brief \hl Binds an array to a surface
 *
 * Binds the CUDA array \p array to the surface reference \p surf.
 * The channel descriptor is inherited from the CUDA array. Any CUDA array
 * previously bound to \p surf is unbound.
 *
 * \param surf  - Surface to bind
 * \param array - Memory array on device
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidSurface
 * \notefnerr
 *
 * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
 * \ref ::cudaBindSurfaceToArray(const struct surface<T, dim>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)"
 */
template<class T, int dim>
__inline__ __host__ cudaError_t cudaBindSurfaceToArray(
    const struct surface<T, dim> &surf,
    cudaArray_const_t array
)
{
    struct cudaChannelFormatDesc desc;
    cudaError_t err = ::cudaGetChannelDesc(&desc, array);

    return err == cudaSuccess ? cudaBindSurfaceToArray(surf, array, desc) : err;
}
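
/*
 * Usage sketch (editor's addition; `outputSurf` as in the previous example
 * is illustrative): the same binding with the channel descriptor taken
 * from the array itself, so no explicit descriptor is needed.
 *
 * \code
 * surface<void, cudaSurfaceType2D> outputSurf;
 *
 * // arr must have been created with the cudaArraySurfaceLoadStore flag
 * cudaError_t bindInherited(cudaArray_t arr)
 * {
 *     return cudaBindSurfaceToArray(outputSurf, arr);
 * }
 * \endcode
 */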

#endif /* __CUDACC__ */

/** @} */ /* END CUDART_HIGHLEVEL */

} // namespace anonymous

#endif /* __cplusplus */

#endif /* !__CUDA_RUNTIME_H__ */