/*
 * Copyright 2011-2014 NVIDIA Corporation. All rights reserved
 *
 * Sample app to demonstrate use of CUPTI library to obtain profiler
 * event values by sampling.
 */
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#else
#include <pthread.h>
#include <unistd.h>
#endif

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cupti_events.h>
// Abort the process on a CUDA Driver API failure.  Wrapped in
// do/while(0) so each macro expands to a single statement and is safe
// inside an unbraced if/else (the original bare-if form was not).
#define CHECK_CU_ERROR(err, cufunc)                                     \
  do {                                                                  \
    if (err != CUDA_SUCCESS) {                                          \
      printf ("Error %d for CUDA Driver API function '%s'.\n",          \
              err, cufunc);                                             \
      exit(-1);                                                         \
    }                                                                   \
  } while (0)

// Abort the process on a CUPTI failure, printing the human-readable
// error string obtained from cuptiGetResultString.
#define CHECK_CUPTI_ERROR(err, cuptifunc)                               \
  do {                                                                  \
    if (err != CUPTI_SUCCESS) {                                         \
      const char *errstr;                                               \
      cuptiGetResultString(err, &errstr);                               \
      printf ("%s:%d:Error %s for CUPTI API function '%s'.\n",          \
              __FILE__, __LINE__, errstr, cuptifunc);                   \
      exit(-1);                                                         \
    }                                                                   \
  } while (0)
// Default CUPTI event sampled when none is supplied on the command line.
#define EVENT_NAME "inst_executed"
// Vector length for the VecAdd workload (restored: this define sits in
// the gap of the truncated listing and is required by compute()).
#define N 50000
// Kernel-launch counts: Tesla-class (CC 1.x) devices are slower, so run
// fewer iterations; Fermi and newer run more to give the sampler time.
#define TESLA_ITERS 500
#define FERMI_ITERS 2000
// Delay between successive event reads in the sampling thread.
#define SAMPLE_PERIOD_MS 50
// Set (non-zero) by the compute thread when the workload finishes and
// polled by the sampling thread; volatile so the poll loop re-reads it.
static volatile int testComplete = 0;

// Driver-API context and device shared between main (which creates
// them) and the sampling thread (which reads events from them).
static CUcontext context;
static CUdevice device;

// Name of the CUPTI event to sample; set in main (EVENT_NAME default —
// presumably overridable via argv[2], per the usage string; the parsing
// lines are not visible in this listing).
static const char *eventName;
// Element-wise vector add: C[i] = A[i] + B[i].  The add is repeated 100
// times per thread purely to lengthen kernel runtime so the sampling
// thread has something to observe.  Expects a 1-D grid of 1-D blocks
// covering at least 'size' threads; excess threads are masked by the
// bounds check.
__global__ void VecAdd(const int* A, const int* B, int* C, int size)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  for (int n = 0; n < 100; n++) {
    if (i < size)
      C[i] = A[i] + B[i];
  }
}
// Fill vec[0..n-1] with the values 0..n-1 (host-side helper used to
// build the VecAdd inputs so the expected result is simply 2*i).
static void
initVec(int *vec, int n)
{
  for (int i = 0; i < n; i++)
    vec[i] = i;
}
// Sampling thread body: enables the configured CUPTI event on the
// shared context in CONTINUOUS collection mode, then polls and prints
// its value every SAMPLE_PERIOD_MS until the compute thread sets
// testComplete.  The signature matches both CreateThread (Windows) and
// pthread_create (POSIX) start routines; 'arg' is unused.
#ifdef _WIN32
static DWORD WINAPI sampling_func(void *arg)
#else
static void * sampling_func(void *arg)
#endif
{
  CUptiResult cuptiErr;
  CUpti_EventGroup eventGroup;
  CUpti_EventID eventId;
  size_t bytesRead;
  uint64_t eventVal;

  // CONTINUOUS mode permits reading event values while kernels are
  // still executing on the context.
  cuptiErr = cuptiSetEventCollectionMode(context,
                                         CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiSetEventCollectionMode");

  cuptiErr = cuptiEventGroupCreate(context, &eventGroup, 0);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupCreate");

  cuptiErr = cuptiEventGetIdFromName(device, eventName, &eventId);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGetIdFromName");

  cuptiErr = cuptiEventGroupAddEvent(eventGroup, eventId);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupAddEvent");

  cuptiErr = cuptiEventGroupEnable(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupEnable");

  // Poll until the compute thread signals completion via testComplete.
  while (!testComplete) {
    bytesRead = sizeof(eventVal);
    cuptiErr = cuptiEventGroupReadEvent(eventGroup,
                                        CUPTI_EVENT_READ_FLAG_NONE,
                                        eventId, &bytesRead, &eventVal);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");
    if (bytesRead != sizeof(eventVal)) {
      printf("Failed to read value for \"%s\"\n", eventName);
      exit(-1);
    }

    printf("%s: %llu\n", eventName, (unsigned long long)eventVal);

#ifdef _WIN32
    Sleep(SAMPLE_PERIOD_MS);
#else
    usleep(SAMPLE_PERIOD_MS * 1000);
#endif
  }

  cuptiErr = cuptiEventGroupDisable(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDisable");

  cuptiErr = cuptiEventGroupDestroy(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy");

  return 0;
}
// Host-side workload: allocates and initializes the input vectors, runs
// the VecAdd kernel 'iters' times (so the sampling thread has time to
// observe events), copies the result back, and verifies it on the CPU.
// Exits the process on verification failure.  Frees all host and device
// buffers before returning (the original listing leaked them).
static void
compute(int iters)
{
  size_t size = N * sizeof(int);
  int threadsPerBlock = 0;
  int blocksPerGrid = 0;
  int sum, i;
  int *h_A, *h_B, *h_C;
  int *d_A, *d_B, *d_C;

  // Allocate input vectors h_A and h_B in host memory
  h_A = (int*)malloc(size);
  h_B = (int*)malloc(size);
  h_C = (int*)malloc(size);
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    printf("malloc failed\n");
    exit(-1);
  }

  // Initialize input vectors
  initVec(h_A, N);
  initVec(h_B, N);
  memset(h_C, 0, size);

  // Allocate vectors in device memory
  cudaMalloc((void**)&d_A, size);
  cudaMalloc((void**)&d_B, size);
  cudaMalloc((void**)&d_C, size);

  // Copy vectors from host memory to device memory
  cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

  // Invoke kernel (multiple times to make sure we have time for
  // sampling)
  threadsPerBlock = 256;
  blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
  for (i = 0; i < iters; i++) {
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
  }

  // Copy result from device memory to host memory
  // h_C contains the result in host memory
  // (blocking cudaMemcpy also synchronizes with the kernel launches)
  cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

  // Verify result against the CPU-computed sum
  for (i = 0; i < N; ++i) {
    sum = h_A[i] + h_B[i];
    if (h_C[i] != sum) {
      printf("kernel execution FAILED\n");
      exit(-1);
    }
  }

  // Release device and host buffers.
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  free(h_A);
  free(h_B);
  free(h_C);
}
// NOTE(review): this listing is truncated — original line numbers are
// embedded at the start of every line and many lines (return type,
// braces, argv parsing, error exits, cleanup, final return) are
// missing.  The function also continues past the end of this chunk, so
// the code below is kept byte-identical and only annotated.
//
// Entry point: selects a CUDA device (presumably from argv[1]) and an
// event name (presumably argv[2], per the usage string — TODO confirm
// against the full file), validates that the device supports the
// CONTINUOUS event-collection mode, spawns the sampling thread, runs
// the compute workload concurrently, then waits for the sampler.
180 main(int argc, char *argv[])
// Locals for CUPTI status and device properties.
189 CUptiResult cuptiErr;
190 int computeCapabilityMajor=0;
191 int computeCapabilityMinor=0;
195 CUpti_DeviceAttributeDeviceClass deviceClass;
196 size_t deviceClassSize = sizeof deviceClass;
// Usage text — the guarding argc check is not visible in this listing.
198 printf("Usage: %s [device_num] [event_name]\n", argv[0]);
// Initialize the driver API and enumerate devices.
201 CHECK_CU_ERROR(err, "cuInit");
203 err = cuDeviceGetCount(&deviceCount);
204 CHECK_CU_ERROR(err, "cuDeviceGetCount");
206 if (deviceCount == 0) {
207 printf("There is no device supporting CUDA.\n");
// Device selection from the first command-line argument.
212 deviceNum = atoi(argv[1]);
215 printf("CUDA Device Number: %d\n", deviceNum);
217 err = cuDeviceGet(&device, deviceNum);
218 CHECK_CU_ERROR(err, "cuDeviceGet");
220 err = cuDeviceGetName(deviceName, 32, device);
221 CHECK_CU_ERROR(err, "cuDeviceGetName");
223 printf("CUDA Device Name: %s\n", deviceName);
// CONTINUOUS collection mode is only supported on Tesla-class
// (compute) boards, so reject other device classes.
225 cuptiErr = cuptiDeviceGetAttribute(device, CUPTI_DEVICE_ATTR_DEVICE_CLASS, &deviceClassSize, &deviceClass);
226 CHECK_CUPTI_ERROR(cuptiErr, "cuptiDeviceGetAttribute");
228 if (deviceClass != CUPTI_DEVICE_ATTR_DEVICE_CLASS_TESLA) {
229 printf("Sample uses event collection mode _CONTINUOUS which is supported only on Tesla GPUs.\n");
233 err = cuDeviceComputeCapability(&computeCapabilityMajor,
234 &computeCapabilityMinor,
236 CHECK_CU_ERROR(err, "cuDeviceComputeCapability");
// Event sampling requires compute capability >= 2.0 (Fermi+); the
// default event name is only assigned on that path.
242 if (computeCapabilityMajor > 1) {
243 eventName = EVENT_NAME;
246 printf("Event sampling is not supported for Tesla family devices.\n");
// Create the context shared with the sampling thread (stored in the
// file-scope 'context' global).
251 err = cuCtxCreate(&context, 0, device);
252 CHECK_CU_ERROR(err, "cuCtxCreate");
// Spawn the sampler: CreateThread on Windows, pthread_create elsewhere
// (the surrounding #ifdef _WIN32 lines are not visible here).
257 printf("Creating sampling thread\n");
259 hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) sampling_func,
262 printf("CreateThread failed\n");
266 status = pthread_create(&pThread, NULL, sampling_func, NULL);
268 perror("pthread_create");
273 // run kernel while sampling
// Iteration count scales with device generation (see FERMI_ITERS /
// TESLA_ITERS).
274 compute((computeCapabilityMajor > 1) ? FERMI_ITERS : TESLA_ITERS);
276 // "signal" the sampling thread to exit and wait for it
// (the testComplete = 1 store sits in the gap before these waits)
279 WaitForSingleObject(hThread, INFINITE);
281 pthread_join(pThread, NULL);
284 cudaDeviceSynchronize();