util/X86LINUX64/cuda-6.5/extras/CUPTI/sample/event_sampling/event_sampling.cu

   1 /*
   2  * Copyright 2011-2014 NVIDIA Corporation. All rights reserved
   3  *
   4  * Sample app to demonstrate use of CUPTI library to obtain profiler
   5  * event values by sampling.
   6  */
   7
   8
   9 #ifdef _WIN32
  10 #define WIN32_LEAN_AND_MEAN
  11 #endif
  12
  13 #include <stdio.h>
  14 #include <cuda_runtime_api.h>
  15 #include <cupti_events.h>
  16
  17 #ifdef _WIN32
  18 #include <windows.h>
  19 #else
  20 #include <unistd.h>
  21 #include <pthread.h>
  22 #endif
  23
  24 #define CHECK_CU_ERROR(err, cufunc)                                     \
  25   if (err != CUDA_SUCCESS)                                              \
  26     {                                                                   \
  27       printf ("Error %d for CUDA Driver API function '%s'.\n",          \
  28               err, cufunc);                                             \
  29       exit(-1);                                                         \
  30     }
  31
  32 #define CHECK_CUPTI_ERROR(err, cuptifunc)                       \
  33   if (err != CUPTI_SUCCESS)                                     \
  34     {                                                           \
  35       const char *errstr;                                       \
  36       cuptiGetResultString(err, &errstr);                       \
  37       printf ("%s:%d:Error %s for CUPTI API function '%s'.\n",  \
  38               __FILE__, __LINE__, errstr, cuptifunc);           \
  39       exit(-1);                                                 \
  40     }
  41
  42 #define EVENT_NAME "inst_executed"
  43 #define N 100000
  44 #define TESLA_ITERS 500
  45 #define FERMI_ITERS 2000
  46 #define SAMPLE_PERIOD_MS 50
  47
  48 // used to signal from the compute thread to the sampling thread
  49 static volatile int testComplete = 0;
  50
  51 static CUcontext context;
  52 static CUdevice device;
  53 static const char *eventName;
  54
  55
  56 // Device code
  57 __global__ void VecAdd(const int* A, const int* B, int* C, int size)
  58 {
  59   int i = blockDim.x * blockIdx.x + threadIdx.x;
  60   for(int n = 0 ; n < 100; n++) {
  61     if (i < size)
  62       C[i] = A[i] + B[i];
  63   }
  64 }
  65
  66 static void
  67 initVec(int *vec, int n)
  68 {
  69   for (int i=0; i< n; i++)
  70     vec[i] = i;
  71 }
  72
  73 void *
  74 sampling_func(void *arg)
  75 {
  76   CUptiResult cuptiErr;
  77   CUpti_EventGroup eventGroup;
  78   CUpti_EventID eventId;
  79   size_t bytesRead;
  80   uint64_t eventVal;
  81
  82   cuptiErr = cuptiSetEventCollectionMode(context,
  83                                          CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS);
  84   CHECK_CUPTI_ERROR(cuptiErr, "cuptiSetEventCollectionMode");
  85
  86   cuptiErr = cuptiEventGroupCreate(context, &eventGroup, 0);
  87   CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupCreate");
  88
  89   cuptiErr = cuptiEventGetIdFromName(device, eventName, &eventId);
  90   CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGetIdFromName");
  91
  92   cuptiErr = cuptiEventGroupAddEvent(eventGroup, eventId);
  93   CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupAddEvent");
  94
  95   cuptiErr = cuptiEventGroupEnable(eventGroup);
  96   CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupEnable");
  97
  98   while (!testComplete) {
  99     bytesRead = sizeof(eventVal);
 100     cuptiErr = cuptiEventGroupReadEvent(eventGroup,
 101                                         CUPTI_EVENT_READ_FLAG_NONE,
 102                                         eventId, &bytesRead, &eventVal);
 103     CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");
 104     if (bytesRead != sizeof(eventVal)) {
 105       printf("Failed to read value for \"%s\"\n", eventName);
 106       exit(-1);
 107     }
 108
 109     printf("%s: %llu\n", eventName, (unsigned long long)eventVal);
 110
 111 #ifdef _WIN32
 112     Sleep(SAMPLE_PERIOD_MS);
 113 #else
 114     usleep(SAMPLE_PERIOD_MS * 1000);
 115 #endif
 116   }
 117
 118   cuptiErr = cuptiEventGroupDisable(eventGroup);
 119   CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDisable");
 120
 121   cuptiErr = cuptiEventGroupDestroy(eventGroup);
 122   CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy");
 123
 124   return NULL;
 125 }
 126
 127 static void
 128 compute(int iters)
 129 {
 130   size_t size = N * sizeof(int);
 131   int threadsPerBlock = 0;
 132   int blocksPerGrid = 0;
 133   int sum, i;
 134   int *h_A, *h_B, *h_C;
 135   int *d_A, *d_B, *d_C;
 136
 137   // Allocate input vectors h_A and h_B in host memory
 138   h_A = (int*)malloc(size);
 139   h_B = (int*)malloc(size);
 140   h_C = (int*)malloc(size);
 141
 142   // Initialize input vectors
 143   initVec(h_A, N);
 144   initVec(h_B, N);
 145   memset(h_C, 0, size);
 146
 147   // Allocate vectors in device memory
 148   cudaMalloc((void**)&d_A, size);
 149   cudaMalloc((void**)&d_B, size);
 150   cudaMalloc((void**)&d_C, size);
 151
 152   // Copy vectors from host memory to device memory
 153   cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
 154   cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
 155
 156   // Invoke kernel (multiple times to make sure we have time for
 157   // sampling)
 158   threadsPerBlock = 256;
 159   blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
 160   for (i = 0; i < iters; i++) {
 161     VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
 162   }
 163
 164
 165   // Copy result from device memory to host memory
 166   // h_C contains the result in host memory
 167   cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
 168
 169   // Verify result
 170   for (i = 0; i < N; ++i) {
 171     sum = h_A[i] + h_B[i];
 172     if (h_C[i] != sum) {
 173       printf("kernel execution FAILED\n");
 174       exit(-1);
 175     }
 176   }
 177 }
 178
 179 int
 180 main(int argc, char *argv[])
 181 {
 182 #ifdef _WIN32
 183   HANDLE hThread;
 184 #else
 185   int status;
 186   pthread_t pThread;
 187 #endif
 188   CUresult err;
 189   CUptiResult cuptiErr;
 190   int computeCapabilityMajor=0;
 191   int computeCapabilityMinor=0;
 192   int deviceNum;
 193   int deviceCount;
 194   char deviceName[32];
 195   CUpti_DeviceAttributeDeviceClass deviceClass;
 196   size_t deviceClassSize = sizeof deviceClass;
 197
 198   printf("Usage: %s [device_num] [event_name]\n", argv[0]);
 199
 200   err = cuInit(0);
 201   CHECK_CU_ERROR(err, "cuInit");
 202
 203   err = cuDeviceGetCount(&deviceCount);
 204   CHECK_CU_ERROR(err, "cuDeviceGetCount");
 205
 206   if (deviceCount == 0) {
 207     printf("There is no device supporting CUDA.\n");
 208     exit(-1);
 209   }
 210
 211   if (argc > 1)
 212     deviceNum = atoi(argv[1]);
 213   else
 214     deviceNum = 0;
 215   printf("CUDA Device Number: %d\n", deviceNum);
 216
 217   err = cuDeviceGet(&device, deviceNum);
 218   CHECK_CU_ERROR(err, "cuDeviceGet");
 219
 220   err = cuDeviceGetName(deviceName, 32, device);
 221   CHECK_CU_ERROR(err, "cuDeviceGetName");
 222
 223   printf("CUDA Device Name: %s\n", deviceName);
 224
 225   cuptiErr = cuptiDeviceGetAttribute(device, CUPTI_DEVICE_ATTR_DEVICE_CLASS, &deviceClassSize, &deviceClass);
 226   CHECK_CUPTI_ERROR(cuptiErr, "cuptiDeviceGetAttribute");
 227
 228   if (deviceClass != CUPTI_DEVICE_ATTR_DEVICE_CLASS_TESLA) {
 229     printf("Sample uses event collection mode _CONTINUOUS which is supported only on Tesla GPUs.\n");
 230     return 0;
 231   }
 232
 233   err = cuDeviceComputeCapability(&computeCapabilityMajor,
 234                                   &computeCapabilityMinor,
 235                                   device);
 236   CHECK_CU_ERROR(err, "cuDeviceComputeCapability");
 237
 238   if (argc > 2) {
 239     eventName = argv[2];
 240   }
 241   else {
 242     if (computeCapabilityMajor > 1) {
 243       eventName = EVENT_NAME;
 244     }
 245     else {
 246       printf("Event sampling is not supported for Tesla family devices.\n");
 247       return 0;
 248     }
 249   }
 250
 251   err = cuCtxCreate(&context, 0, device);
 252   CHECK_CU_ERROR(err, "cuCtxCreate");
 253
 254
 255   testComplete = 0;
 256
 257   printf("Creating sampling thread\n");
 258 #ifdef _WIN32
 259   hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) sampling_func,
 260                          NULL, 0, NULL );
 261   if (!hThread) {
 262     printf("CreateThread failed\n");
 263     exit(-1);
 264   }
 265 #else
 266   status = pthread_create(&pThread, NULL, sampling_func, NULL);
 267   if (status != 0) {
 268     perror("pthread_create");
 269     exit(-1);
 270   }
 271 #endif
 272
 273   // run kernel while sampling
 274   compute((computeCapabilityMajor > 1) ? FERMI_ITERS : TESLA_ITERS);
 275
 276   // "signal" the sampling thread to exit and wait for it
 277   testComplete = 1;
 278 #ifdef _WIN32
 279   WaitForSingleObject(hThread, INFINITE);
 280 #else
 281   pthread_join(pThread, NULL);
 282 #endif
 283
 284   cudaDeviceSynchronize();
 285   return 0;
 286 }