2 * Copyright 2011-2014 NVIDIA Corporation. All rights reserved
4 * Sample CUPTI app to print a trace of CUDA API and GPU activity
5 * using asynchronous handling of activity buffers.
// Evaluates a CUPTI call and stores its result in _status. When the result is
// not CUPTI_SUCCESS, the error text is looked up with cuptiGetResultString()
// and a message with the file, line, stringified call, and that text is
// written to stderr.
13 #define CUPTI_CALL(call) \
15 CUptiResult _status = call; \
16 if (_status != CUPTI_SUCCESS) { \
18 cuptiGetResultString(_status, &errstr); \
19 fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
20 __FILE__, __LINE__, #call, errstr); \
// Byte count used, together with ALIGN_SIZE, when bufferRequested() allocates
// an activity buffer.
25 #define BUF_SIZE (32 * 1024)
// Alignment in bytes applied to the allocated buffer; the same amount is added
// to the allocation as slack for the alignment adjustment.
26 #define ALIGN_SIZE (8)
// Yields `buffer` advanced to the next multiple of `align`, or `buffer`
// unchanged when its address is already a multiple of `align`. The mask test
// (`& ((align)-1)`) is only correct when `align` is a power of two. Note that
// `buffer` and `align` are each evaluated more than once.
27 #define ALIGN_BUFFER(buffer, align) \
28 (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer))
30 // Timestamp at trace initialization time. Used to normalize other
32 static uint64_t startTimestamp;
// Maps a CUpti_ActivityMemcpyKind value to the text that printActivity()
// prints with %s for MEMCPY records. Each copy direction below has its own
// case label.
35 getMemcpyKindString(CUpti_ActivityMemcpyKind kind)
38 case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
40 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
42 case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
44 case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
46 case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
48 case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
50 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
52 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
54 case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
// Maps a CUpti_ActivityOverheadKind value to the text that printActivity()
// prints with %s for OVERHEAD records, e.g. "BUFFER_FLUSH" for
// CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH and "INSTRUMENTATION" for
// CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION.
64 getActivityOverheadKindString(CUpti_ActivityOverheadKind kind)
67 case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER:
69 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH:
70 return "BUFFER_FLUSH";
71 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION:
72 return "INSTRUMENTATION";
73 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE:
// Maps a CUpti_ActivityObjectKind value (process, thread, device, context, or
// stream) to the text that printActivity() prints with %s for NAME and
// OVERHEAD records.
83 getActivityObjectKindString(CUpti_ActivityObjectKind kind)
86 case CUPTI_ACTIVITY_OBJECT_PROCESS:
88 case CUPTI_ACTIVITY_OBJECT_THREAD:
90 case CUPTI_ACTIVITY_OBJECT_DEVICE:
92 case CUPTI_ACTIVITY_OBJECT_CONTEXT:
94 case CUPTI_ACTIVITY_OBJECT_STREAM:
// Selects the identifier inside `id` that corresponds to `kind`: process and
// thread ids are read from the `pt` member, device, context, and stream ids
// from the `dcs` member. printActivity() prints the result with %u.
104 getActivityObjectKindId(CUpti_ActivityObjectKind kind, CUpti_ActivityObjectKindId *id)
107 case CUPTI_ACTIVITY_OBJECT_PROCESS:
108 return id->pt.processId;
109 case CUPTI_ACTIVITY_OBJECT_THREAD:
110 return id->pt.threadId;
111 case CUPTI_ACTIVITY_OBJECT_DEVICE:
112 return id->dcs.deviceId;
113 case CUPTI_ACTIVITY_OBJECT_CONTEXT:
114 return id->dcs.contextId;
115 case CUPTI_ACTIVITY_OBJECT_STREAM:
116 return id->dcs.streamId;
// Maps a CUpti_ActivityComputeApiKind value (CUDA or CUDA MPS) to the text
// that printActivity() prints with %s for CONTEXT records.
125 getComputeApiKindString(CUpti_ActivityComputeApiKind kind)
128 case CUPTI_ACTIVITY_COMPUTE_API_CUDA:
130 case CUPTI_ACTIVITY_COMPUTE_API_CUDA_MPS:
// Prints a textual description of one activity record to stdout. The record
// is dispatched on record->kind and cast to the CUPTI structure for that kind.
// Start/end times of memcpy, memset, kernel, API, and overhead records are
// printed relative to startTimestamp; MARKER timestamps are printed as-is.
140 printActivity(CUpti_Activity *record)
142 switch (record->kind)
// Device description. Bandwidth and memory size are divided by 1024 twice and
// the clock rate by 1000 before printing.
// NOTE(review): the GB/s, MB, and MHz labels assume the raw fields are in
// KB/s, bytes, and kHz respectively -- confirm against the CUPTI headers.
144 case CUPTI_ACTIVITY_KIND_DEVICE:
146 CUpti_ActivityDevice *device = (CUpti_ActivityDevice *) record;
147 printf("DEVICE %s (%u), capability %u.%u, global memory (bandwidth %u GB/s, size %u MB), "
148 "multiprocessors %u, clock %u MHz\n",
149 device->name, device->id,
150 device->computeCapabilityMajor, device->computeCapabilityMinor,
151 (unsigned int) (device->globalMemoryBandwidth / 1024 / 1024),
152 (unsigned int) (device->globalMemorySize / 1024 / 1024),
153 device->numMultiprocessors, (unsigned int) (device->coreClockRate / 1000));
// Single device attribute: attribute id, device id, and the value's vUint64
// member printed in hexadecimal.
156 case CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE:
158 CUpti_ActivityDeviceAttribute *attribute = (CUpti_ActivityDeviceAttribute *)record;
159 printf("DEVICE_ATTRIBUTE %u, device %u, value=0x%llx\n",
160 attribute->attribute.cupti, attribute->deviceId, (unsigned long long)attribute->value.vUint64);
// Context description: context id, owning device, compute API name, and the
// NULL stream id printed as a signed int.
163 case CUPTI_ACTIVITY_KIND_CONTEXT:
165 CUpti_ActivityContext *context = (CUpti_ActivityContext *) record;
166 printf("CONTEXT %u, device %u, compute API %s, NULL stream %d\n",
167 context->contextId, context->deviceId,
168 getComputeApiKindString((CUpti_ActivityComputeApiKind) context->computeApiKind),
169 (int) context->nullStreamId);
// Memory copy: copy kind, relative start/end, location ids, and both the
// driver and runtime correlation ids.
// NOTE(review): the local name `memcpy` shadows the standard library function
// of the same name within this scope.
172 case CUPTI_ACTIVITY_KIND_MEMCPY:
174 CUpti_ActivityMemcpy *memcpy = (CUpti_ActivityMemcpy *) record;
175 printf("MEMCPY %s [ %llu - %llu ] device %u, context %u, stream %u, correlation %u/r%u\n",
176 getMemcpyKindString((CUpti_ActivityMemcpyKind) memcpy->copyKind),
177 (unsigned long long) (memcpy->start - startTimestamp),
178 (unsigned long long) (memcpy->end - startTimestamp),
179 memcpy->deviceId, memcpy->contextId, memcpy->streamId,
180 memcpy->correlationId, memcpy->runtimeCorrelationId);
// Memory set: same layout of output as MEMCPY, prefixed with the set value.
// NOTE(review): the local name `memset` shadows the standard library function
// of the same name within this scope.
183 case CUPTI_ACTIVITY_KIND_MEMSET:
185 CUpti_ActivityMemset *memset = (CUpti_ActivityMemset *) record;
186 printf("MEMSET value=%u [ %llu - %llu ] device %u, context %u, stream %u, correlation %u/r%u\n",
188 (unsigned long long) (memset->start - startTimestamp),
189 (unsigned long long) (memset->end - startTimestamp),
190 memset->deviceId, memset->contextId, memset->streamId,
191 memset->correlationId, memset->runtimeCorrelationId);
// Kernel execution. Both kernel kinds are read through the
// CUpti_ActivityKernel2 layout; only the printed label differs. A second line
// reports grid/block dimensions and shared memory usage.
194 case CUPTI_ACTIVITY_KIND_KERNEL:
195 case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
197 const char* kindString = (record->kind == CUPTI_ACTIVITY_KIND_KERNEL) ? "KERNEL" : "CONC KERNEL";
198 CUpti_ActivityKernel2 *kernel = (CUpti_ActivityKernel2 *) record;
199 printf("%s \"%s\" [ %llu - %llu ] device %u, context %u, stream %u, correlation %u\n",
202 (unsigned long long) (kernel->start - startTimestamp),
203 (unsigned long long) (kernel->end - startTimestamp),
204 kernel->deviceId, kernel->contextId, kernel->streamId,
205 kernel->correlationId);
206 printf(" grid [%u,%u,%u], block [%u,%u,%u], shared memory (static %u, dynamic %u)\n",
207 kernel->gridX, kernel->gridY, kernel->gridZ,
208 kernel->blockX, kernel->blockY, kernel->blockZ,
209 kernel->staticSharedMemory, kernel->dynamicSharedMemory);
// Driver API call. Uses the same CUpti_ActivityAPI layout as the RUNTIME case;
// only the printed label differs.
212 case CUPTI_ACTIVITY_KIND_DRIVER:
214 CUpti_ActivityAPI *api = (CUpti_ActivityAPI *) record;
215 printf("DRIVER cbid=%u [ %llu - %llu ] process %u, thread %u, correlation %u\n",
217 (unsigned long long) (api->start - startTimestamp),
218 (unsigned long long) (api->end - startTimestamp),
219 api->processId, api->threadId, api->correlationId);
// Runtime API call.
222 case CUPTI_ACTIVITY_KIND_RUNTIME:
224 CUpti_ActivityAPI *api = (CUpti_ActivityAPI *) record;
225 printf("RUNTIME cbid=%u [ %llu - %llu ] process %u, thread %u, correlation %u\n",
227 (unsigned long long) (api->start - startTimestamp),
228 (unsigned long long) (api->end - startTimestamp),
229 api->processId, api->threadId, api->correlationId);
// Object name. The output depends on the kind of object being named; every id
// printed below is read from the same name->objectId value.
232 case CUPTI_ACTIVITY_KIND_NAME:
234 CUpti_ActivityName *name = (CUpti_ActivityName *) record;
235 switch (name->objectKind)
// A context name also reports the device id stored alongside the context id.
237 case CUPTI_ACTIVITY_OBJECT_CONTEXT:
238 printf("NAME %s %u %s id %u, name %s\n",
239 getActivityObjectKindString(name->objectKind),
240 getActivityObjectKindId(name->objectKind, &name->objectId),
241 getActivityObjectKindString(CUPTI_ACTIVITY_OBJECT_DEVICE),
242 getActivityObjectKindId(CUPTI_ACTIVITY_OBJECT_DEVICE, &name->objectId),
// A stream name reports the stream id followed by its context and device ids.
245 case CUPTI_ACTIVITY_OBJECT_STREAM:
246 printf("NAME %s %u %s %u %s id %u, name %s\n",
247 getActivityObjectKindString(name->objectKind),
248 getActivityObjectKindId(name->objectKind, &name->objectId),
249 getActivityObjectKindString(CUPTI_ACTIVITY_OBJECT_CONTEXT),
250 getActivityObjectKindId(CUPTI_ACTIVITY_OBJECT_CONTEXT, &name->objectId),
251 getActivityObjectKindString(CUPTI_ACTIVITY_OBJECT_DEVICE),
252 getActivityObjectKindId(CUPTI_ACTIVITY_OBJECT_DEVICE, &name->objectId),
// NOTE(review): presumably the form used for the remaining object kinds, which
// print only their own id -- confirm.
256 printf("NAME %s id %u, name %s\n",
257 getActivityObjectKindString(name->objectKind),
258 getActivityObjectKindId(name->objectKind, &name->objectId),
// Marker: id, raw timestamp (not made relative to startTimestamp), and name.
264 case CUPTI_ACTIVITY_KIND_MARKER:
266 CUpti_ActivityMarker *marker = (CUpti_ActivityMarker *) record;
267 printf("MARKER id %u [ %llu ], name %s\n",
268 marker->id, (unsigned long long) marker->timestamp, marker->name);
// Marker data: the payload is printed through both its metricValueUint64 and
// metricValueDouble members.
271 case CUPTI_ACTIVITY_KIND_MARKER_DATA:
273 CUpti_ActivityMarkerData *marker = (CUpti_ActivityMarkerData *) record;
274 printf("MARKER_DATA id %u, color 0x%x, category %u, payload %llu/%f\n",
275 marker->id, marker->color, marker->category,
276 (unsigned long long) marker->payload.metricValueUint64,
277 marker->payload.metricValueDouble);
// Overhead: overhead kind, relative start/end, and the affected object.
280 case CUPTI_ACTIVITY_KIND_OVERHEAD:
282 CUpti_ActivityOverhead *overhead = (CUpti_ActivityOverhead *) record;
// NOTE(review): here the cast binds to overhead->start / overhead->end before
// the subtraction, whereas the other cases cast the parenthesised difference.
// The results match when unsigned long long is 64 bits wide; consider
// parenthesising for consistency.
283 printf("OVERHEAD %s [ %llu, %llu ] %s id %u\n",
284 getActivityOverheadKindString(overhead->overheadKind),
285 (unsigned long long) overhead->start - startTimestamp,
286 (unsigned long long) overhead->end - startTimestamp,
287 getActivityObjectKindString(overhead->objectKind),
288 getActivityObjectKindId(overhead->objectKind, &overhead->objectId));
// NOTE(review): presumably the output for record kinds without a case above --
// confirm.
292 printf(" <unknown>\n");
// Buffer-request callback passed to cuptiActivityRegisterCallbacks().
// Allocates BUF_SIZE + ALIGN_SIZE bytes (the extra ALIGN_SIZE bytes leave room
// for the alignment adjustment) and stores the ALIGN_SIZE-aligned address
// inside that allocation in *buffer.
297 void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords)
299 uint8_t *bfr = (uint8_t *) malloc(BUF_SIZE + ALIGN_SIZE);
// NOTE(review): this out-of-memory message is written to stdout, not stderr.
301 printf("Error: out of memory\n");
// NOTE(review): if ALIGN_BUFFER adjusts the address, *buffer no longer equals
// the pointer malloc() returned; whoever releases the buffer must account for
// that.
306 *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE);
// Buffer-completed callback passed to cuptiActivityRegisterCallbacks().
// Prints a start banner (either "global" or one naming the context and
// stream), prints the records found in the first validSize bytes of `buffer`
// via printActivity(), reports the number of dropped records, and prints the
// matching finish banner.
310 void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize)
// NOTE(review): presumably a NULL record makes the first
// cuptiActivityGetNextRecord() call start at the beginning of the buffer --
// confirm in the CUPTI documentation.
313 CUpti_Activity *record = NULL;
317 printf("==== Starting dump for global ====\n");
320 printf("==== Starting dump for context %p, stream %u ====\n", ctx, streamId);
// Each successful call yields a record that is handed to printActivity();
// CUPTI_ERROR_MAX_LIMIT_REACHED is checked as a separate, distinct status.
324 status = cuptiActivityGetNextRecord(buffer, validSize, &record);
325 if (status == CUPTI_SUCCESS) {
326 printActivity(record);
328 else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)
335 // Report any records dropped from the queue for this context/stream.
337 CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
339 printf("Dropped %u activity records\n", (unsigned int) dropped);
343 printf("==== Finished dump for global ====\n");
346 printf("==== Finished dump for context %p, stream %u ====\n", ctx, streamId);
// attrValue receives/supplies the attribute value and attrValueSize its size
// in bytes for the cuptiActivityGetAttribute()/cuptiActivitySetAttribute()
// calls below.
356 size_t attrValue = 0, attrValueSize = sizeof(size_t);
357 // The device activity record is created when CUDA initializes, so we
358 // want to enable it before cuInit() or any CUDA runtime call.
359 CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
360 // Enable the other activity record kinds traced by this sample.
361 CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
362 CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
363 CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
364 CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
365 CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
366 CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME));
367 CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER));
// NOTE(review): CUPTI_ACTIVITY_KIND_KERNEL is enabled here;
// CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, which printActivity() also handles,
// is not part of this list.
368 CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
369 CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
371 // Register callbacks for buffer requests and for buffers completed by CUPTI.
372 CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
374 // Get and set activity attributes.
375 // Attributes can be set by the CUPTI client to change the behavior of the activity API.
376 // Some attributes must be set before any CUDA context is created to be effective,
377 // e.g. to be applied to all device buffer allocations (see documentation).
378 CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue));
379 printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE", (long long unsigned)attrValue);
381 CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue));
383 CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));
384 printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT", (long long unsigned)attrValue);
386 CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));
// Record the reference timestamp that printActivity() subtracts from record
// start/end times.
388 CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));