1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/llvm-utils.h"
38 #include "util/bpf-loader.h"
39 #include "util/trigger.h"
40 #include "util/perf-hooks.h"
41 #include "util/cpu-set-sched.h"
42 #include "util/synthetic-events.h"
43 #include "util/time-utils.h"
44 #include "util/units.h"
45 #include "util/bpf-event.h"
46 #include "asm/bug.h"
47 #include "perf.h"
48
49 #include <errno.h>
50 #include <inttypes.h>
51 #include <locale.h>
52 #include <poll.h>
53 #include <unistd.h>
54 #include <sched.h>
55 #include <signal.h>
56 #include <sys/mman.h>
57 #include <sys/wait.h>
58 #include <sys/types.h>
59 #include <sys/stat.h>
60 #include <fcntl.h>
61 #include <linux/err.h>
62 #include <linux/string.h>
63 #include <linux/time64.h>
64 #include <linux/zalloc.h>
65
66 struct switch_output {
67         bool             enabled;
68         bool             signal;
69         unsigned long    size;
70         unsigned long    time;
71         const char      *str;
72         bool             set;
73         char             **filenames;
74         int              num_files;
75         int              cur_file;
76 };
77
78 struct record {
79         struct perf_tool        tool;
80         struct record_opts      opts;
81         u64                     bytes_written;
82         struct perf_data        data;
83         struct auxtrace_record  *itr;
84         struct evlist   *evlist;
85         struct perf_session     *session;
86         int                     realtime_prio;
87         bool                    no_buildid;
88         bool                    no_buildid_set;
89         bool                    no_buildid_cache;
90         bool                    no_buildid_cache_set;
91         bool                    buildid_all;
92         bool                    timestamp_filename;
93         bool                    timestamp_boundary;
94         struct switch_output    switch_output;
95         unsigned long long      samples;
96         cpu_set_t               affinity_mask;
97         unsigned long           output_max_size;        /* = 0: unlimited */
98 };
99
100 static volatile int done;
101
102 static volatile int auxtrace_record__snapshot_started;
103 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
104 static DEFINE_TRIGGER(switch_output_trigger);
105
106 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
107         "SYS", "NODE", "CPU"
108 };
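
/*
 * Labels for the buffer affinity modes used below.  With --affinity=node or
 * --affinity=cpu (see perf-record(1)), record__adjust_affinity() migrates the
 * recording thread next to the kernel buffer it is about to read, reducing
 * cross-node memory traffic; "SYS" is the default and leaves the thread's
 * affinity untouched.
 */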
109
110 static bool switch_output_signal(struct record *rec)
111 {
112         return rec->switch_output.signal &&
113                trigger_is_ready(&switch_output_trigger);
114 }
115
116 static bool switch_output_size(struct record *rec)
117 {
118         return rec->switch_output.size &&
119                trigger_is_ready(&switch_output_trigger) &&
120                (rec->bytes_written >= rec->switch_output.size);
121 }
122
123 static bool switch_output_time(struct record *rec)
124 {
125         return rec->switch_output.time &&
126                trigger_is_ready(&switch_output_trigger);
127 }
128
129 static bool record__output_max_size_exceeded(struct record *rec)
130 {
131         return rec->output_max_size &&
132                (rec->bytes_written >= rec->output_max_size);
133 }
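
/*
 * The switch_output_*() helpers above implement the --switch-output=signal,
 * --switch-output=<size> and --switch-output=<time> rotation modes, and
 * record__output_max_size_exceeded() implements the overall output size
 * limit (--max-size); see perf-record(1).  For example,
 *
 *   perf record --switch-output=1G -a sleep 600
 *
 * rotates perf.data roughly every gigabyte of recorded data, while
 *
 *   perf record --max-size=100M -a sleep 600
 *
 * stops the session once about 100 MB have been written.
 */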
134
135 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
136                          void *bf, size_t size)
137 {
138         struct perf_data_file *file = &rec->session->data->file;
139
140         if (perf_data_file__write(file, bf, size) < 0) {
141                 pr_err("failed to write perf data, error: %m\n");
142                 return -1;
143         }
144
145         rec->bytes_written += size;
146
147         if (record__output_max_size_exceeded(rec) && !done) {
148                 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
149                                 " stopping session ]\n",
150                                 rec->bytes_written >> 10);
151                 done = 1;
152         }
153
154         if (switch_output_size(rec))
155                 trigger_hit(&switch_output_trigger);
156
157         return 0;
158 }
159
160 static int record__aio_enabled(struct record *rec);
161 static int record__comp_enabled(struct record *rec);
162 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
163                             void *src, size_t src_size);
164
165 #ifdef HAVE_AIO_SUPPORT
166 static int record__aio_write(struct aiocb *cblock, int trace_fd,
167                 void *buf, size_t size, off_t off)
168 {
169         int rc;
170
171         cblock->aio_fildes = trace_fd;
172         cblock->aio_buf    = buf;
173         cblock->aio_nbytes = size;
174         cblock->aio_offset = off;
175         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
176
177         do {
178                 rc = aio_write(cblock);
179                 if (rc == 0) {
180                         break;
181                 } else if (errno != EAGAIN) {
182                         cblock->aio_fildes = -1;
183                         pr_err("failed to queue perf data, error: %m\n");
184                         break;
185                 }
186         } while (1);
187
188         return rc;
189 }
190
191 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
192 {
193         void *rem_buf;
194         off_t rem_off;
195         size_t rem_size;
196         int rc, aio_errno;
197         ssize_t aio_ret, written;
198
199         aio_errno = aio_error(cblock);
200         if (aio_errno == EINPROGRESS)
201                 return 0;
202
203         written = aio_ret = aio_return(cblock);
204         if (aio_ret < 0) {
205                 if (aio_errno != EINTR)
206                         pr_err("failed to write perf data, error: %m\n");
207                 written = 0;
208         }
209
210         rem_size = cblock->aio_nbytes - written;
211
212         if (rem_size == 0) {
213                 cblock->aio_fildes = -1;
214                 /*
215                  * md->refcount is incremented in record__aio_pushfn() for
216                  * every aio write request started in record__aio_push() so
217                  * decrement it because the request is now complete.
218                  */
219                 perf_mmap__put(&md->core);
220                 rc = 1;
221         } else {
222                 /*
223                  * The aio write request may need to be restarted with the
224                  * remainder if the kernel didn't write the whole
225                  * chunk at once.
226                  */
227                 rem_off = cblock->aio_offset + written;
228                 rem_buf = (void *)(cblock->aio_buf + written);
229                 record__aio_write(cblock, cblock->aio_fildes,
230                                 rem_buf, rem_size, rem_off);
231                 rc = 0;
232         }
233
234         return rc;
235 }
236
237 static int record__aio_sync(struct mmap *md, bool sync_all)
238 {
239         struct aiocb **aiocb = md->aio.aiocb;
240         struct aiocb *cblocks = md->aio.cblocks;
241         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
242         int i, do_suspend;
243
244         do {
245                 do_suspend = 0;
246                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
247                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
248                                 if (sync_all)
249                                         aiocb[i] = NULL;
250                                 else
251                                         return i;
252                         } else {
253                                 /*
254                                  * The started aio write is not complete yet,
255                                  * so it has to be waited for before the
256                                  * next allocation.
257                                  */
258                                 aiocb[i] = &cblocks[i];
259                                 do_suspend = 1;
260                         }
261                 }
262                 if (!do_suspend)
263                         return -1;
264
265                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
266                         if (!(errno == EAGAIN || errno == EINTR))
267                                 pr_err("failed to sync perf data, error: %m\n");
268                 }
269         } while (1);
270 }
271
272 struct record_aio {
273         struct record   *rec;
274         void            *data;
275         size_t          size;
276 };
277
278 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
279 {
280         struct record_aio *aio = to;
281
282         /*
283          * map->core.base data pointed to by buf is copied into a free map->aio.data[]
284          * buffer to release space in the kernel buffer as fast as possible, calling
285          * perf_mmap__consume() from the perf_mmap__push() function.
286          *
287          * That lets the kernel proceed with storing more profiling data into
288          * the kernel buffer earlier than other per-cpu kernel buffers are handled.
289          *
290          * Copying can be done in two steps in case the chunk of profiling data
291          * crosses the upper bound of the kernel buffer. In this case we first move
292          * part of the data from map->start till the upper bound and then the remainder
293          * from the beginning of the kernel buffer till the end of the data chunk.
294          */
295
296         if (record__comp_enabled(aio->rec)) {
297                 size = zstd_compress(aio->rec->session, aio->data + aio->size,
298                                      mmap__mmap_len(map) - aio->size,
299                                      buf, size);
300         } else {
301                 memcpy(aio->data + aio->size, buf, size);
302         }
303
304         if (!aio->size) {
305                 /*
306                  * Increment map->refcount to guard map->aio.data[] buffer
307                  * from premature deallocation because map object can be
308                  * released earlier than aio write request started on
309                  * map->aio.data[] buffer is complete.
310                  *
311                  * perf_mmap__put() is done at record__aio_complete()
312                  * after started aio request completion or at record__aio_push()
313                  * if the request failed to start.
314                  */
315                 perf_mmap__get(&map->core);
316         }
317
318         aio->size += size;
319
320         return size;
321 }
322
323 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
324 {
325         int ret, idx;
326         int trace_fd = rec->session->data->file.fd;
327         struct record_aio aio = { .rec = rec, .size = 0 };
328
329         /*
330          * Call record__aio_sync() to wait till map->aio.data[] buffer
331          * becomes available after previous aio write operation.
332          */
333
334         idx = record__aio_sync(map, false);
335         aio.data = map->aio.data[idx];
336         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
337         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
338                 return ret;
339
340         rec->samples++;
341         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
342         if (!ret) {
343                 *off += aio.size;
344                 rec->bytes_written += aio.size;
345                 if (switch_output_size(rec))
346                         trigger_hit(&switch_output_trigger);
347         } else {
348                 /*
349                  * Decrement map->refcount incremented in record__aio_pushfn()
350                  * back if record__aio_write() operation failed to start, otherwise
351                  * map->refcount is decremented in record__aio_complete() after
352                  * aio write operation finishes successfully.
353                  */
354                 perf_mmap__put(&map->core);
355         }
356
357         return ret;
358 }
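
/*
 * Summary of the AIO write path: record__aio_sync() waits for a free
 * aio.data[] buffer, record__aio_pushfn() copies (and optionally compresses)
 * the ring buffer contents into it while holding an extra refcount on the
 * map, and record__aio_write() queues the asynchronous write.  The refcount
 * is dropped in record__aio_complete() when the request finishes, or in
 * record__aio_push() above if queueing the request failed.
 */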
359
360 static off_t record__aio_get_pos(int trace_fd)
361 {
362         return lseek(trace_fd, 0, SEEK_CUR);
363 }
364
365 static void record__aio_set_pos(int trace_fd, off_t pos)
366 {
367         lseek(trace_fd, pos, SEEK_SET);
368 }
369
370 static void record__aio_mmap_read_sync(struct record *rec)
371 {
372         int i;
373         struct evlist *evlist = rec->evlist;
374         struct mmap *maps = evlist->mmap;
375
376         if (!record__aio_enabled(rec))
377                 return;
378
379         for (i = 0; i < evlist->core.nr_mmaps; i++) {
380                 struct mmap *map = &maps[i];
381
382                 if (map->core.base)
383                         record__aio_sync(map, true);
384         }
385 }
386
387 static int nr_cblocks_default = 1;
388 static int nr_cblocks_max = 4;
389
390 static int record__aio_parse(const struct option *opt,
391                              const char *str,
392                              int unset)
393 {
394         struct record_opts *opts = (struct record_opts *)opt->value;
395
396         if (unset) {
397                 opts->nr_cblocks = 0;
398         } else {
399                 if (str)
400                         opts->nr_cblocks = strtol(str, NULL, 0);
401                 if (!opts->nr_cblocks)
402                         opts->nr_cblocks = nr_cblocks_default;
403         }
404
405         return 0;
406 }
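
/*
 * The asynchronous trace writing mode above is enabled with --aio[=n]
 * (see perf-record(1)), where n is the number of in-flight control blocks,
 * up to nr_cblocks_max.  For example:
 *
 *   perf record --aio=2 -e cycles -a sleep 10
 */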
407 #else /* HAVE_AIO_SUPPORT */
408 static int nr_cblocks_max = 0;
409
410 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
411                             off_t *off __maybe_unused)
412 {
413         return -1;
414 }
415
416 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
417 {
418         return -1;
419 }
420
421 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
422 {
423 }
424
425 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
426 {
427 }
428 #endif
429
430 static int record__aio_enabled(struct record *rec)
431 {
432         return rec->opts.nr_cblocks > 0;
433 }
434
435 #define MMAP_FLUSH_DEFAULT 1
436 static int record__mmap_flush_parse(const struct option *opt,
437                                     const char *str,
438                                     int unset)
439 {
440         int flush_max;
441         struct record_opts *opts = (struct record_opts *)opt->value;
442         static struct parse_tag tags[] = {
443                         { .tag  = 'B', .mult = 1       },
444                         { .tag  = 'K', .mult = 1 << 10 },
445                         { .tag  = 'M', .mult = 1 << 20 },
446                         { .tag  = 'G', .mult = 1 << 30 },
447                         { .tag  = 0 },
448         };
449
450         if (unset)
451                 return 0;
452
453         if (str) {
454                 opts->mmap_flush = parse_tag_value(str, tags);
455                 if (opts->mmap_flush == (int)-1)
456                         opts->mmap_flush = strtol(str, NULL, 0);
457         }
458
459         if (!opts->mmap_flush)
460                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
461
462         flush_max = evlist__mmap_size(opts->mmap_pages);
463         flush_max /= 4;
464         if (opts->mmap_flush > flush_max)
465                 opts->mmap_flush = flush_max;
466
467         return 0;
468 }
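
/*
 * The flush threshold parsed above comes from --mmap-flush (see
 * perf-record(1)) and accepts either a plain byte count or a B/K/M/G
 * suffixed value; it is capped at a quarter of the mmap'ed ring buffer.
 * For example:
 *
 *   perf record --mmap-flush=48K -e cycles -a sleep 10
 */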
469
470 #ifdef HAVE_ZSTD_SUPPORT
471 static unsigned int comp_level_default = 1;
472
473 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
474 {
475         struct record_opts *opts = opt->value;
476
477         if (unset) {
478                 opts->comp_level = 0;
479         } else {
480                 if (str)
481                         opts->comp_level = strtol(str, NULL, 0);
482                 if (!opts->comp_level)
483                         opts->comp_level = comp_level_default;
484         }
485
486         return 0;
487 }
488 #endif
489 static unsigned int comp_level_max = 22;
490
491 static int record__comp_enabled(struct record *rec)
492 {
493         return rec->opts.comp_level > 0;
494 }
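
/*
 * Trace compression is requested with -z/--compression-level[=n] (see
 * perf-record(1)) and is only available when perf is built with zstd
 * support; level 1 is the default and comp_level_max (22) the upper bound.
 * For example:
 *
 *   perf record --compression-level=3 -e cycles -a sleep 10
 */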
495
496 static int process_synthesized_event(struct perf_tool *tool,
497                                      union perf_event *event,
498                                      struct perf_sample *sample __maybe_unused,
499                                      struct machine *machine __maybe_unused)
500 {
501         struct record *rec = container_of(tool, struct record, tool);
502         return record__write(rec, NULL, event, event->header.size);
503 }
504
505 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
506 {
507         struct record *rec = to;
508
509         if (record__comp_enabled(rec)) {
510                 size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
511                 bf   = map->data;
512         }
513
514         rec->samples++;
515         return record__write(rec, map, bf, size);
516 }
517
518 static volatile int signr = -1;
519 static volatile int child_finished;
520
521 static void sig_handler(int sig)
522 {
523         if (sig == SIGCHLD)
524                 child_finished = 1;
525         else
526                 signr = sig;
527
528         done = 1;
529 }
530
531 static void sigsegv_handler(int sig)
532 {
533         perf_hooks__recover();
534         sighandler_dump_stack(sig);
535 }
536
537 static void record__sig_exit(void)
538 {
539         if (signr == -1)
540                 return;
541
542         signal(signr, SIG_DFL);
543         raise(signr);
544 }
545
546 #ifdef HAVE_AUXTRACE_SUPPORT
547
548 static int record__process_auxtrace(struct perf_tool *tool,
549                                     struct mmap *map,
550                                     union perf_event *event, void *data1,
551                                     size_t len1, void *data2, size_t len2)
552 {
553         struct record *rec = container_of(tool, struct record, tool);
554         struct perf_data *data = &rec->data;
555         size_t padding;
556         u8 pad[8] = {0};
557
558         if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
559                 off_t file_offset;
560                 int fd = perf_data__fd(data);
561                 int err;
562
563                 file_offset = lseek(fd, 0, SEEK_CUR);
564                 if (file_offset == -1)
565                         return -1;
566                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
567                                                      event, file_offset);
568                 if (err)
569                         return err;
570         }
571
572         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
573         padding = (len1 + len2) & 7;
574         if (padding)
575                 padding = 8 - padding;
576
577         record__write(rec, map, event, event->header.size);
578         record__write(rec, map, data1, len1);
579         if (len2)
580                 record__write(rec, map, data2, len2);
581         record__write(rec, map, &pad, padding);
582
583         return 0;
584 }
585
586 static int record__auxtrace_mmap_read(struct record *rec,
587                                       struct mmap *map)
588 {
589         int ret;
590
591         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
592                                   record__process_auxtrace);
593         if (ret < 0)
594                 return ret;
595
596         if (ret)
597                 rec->samples++;
598
599         return 0;
600 }
601
602 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
603                                                struct mmap *map)
604 {
605         int ret;
606
607         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
608                                            record__process_auxtrace,
609                                            rec->opts.auxtrace_snapshot_size);
610         if (ret < 0)
611                 return ret;
612
613         if (ret)
614                 rec->samples++;
615
616         return 0;
617 }
618
619 static int record__auxtrace_read_snapshot_all(struct record *rec)
620 {
621         int i;
622         int rc = 0;
623
624         for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
625                 struct mmap *map = &rec->evlist->mmap[i];
626
627                 if (!map->auxtrace_mmap.base)
628                         continue;
629
630                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
631                         rc = -1;
632                         goto out;
633                 }
634         }
635 out:
636         return rc;
637 }
638
639 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
640 {
641         pr_debug("Recording AUX area tracing snapshot\n");
642         if (record__auxtrace_read_snapshot_all(rec) < 0) {
643                 trigger_error(&auxtrace_snapshot_trigger);
644         } else {
645                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
646                         trigger_error(&auxtrace_snapshot_trigger);
647                 else
648                         trigger_ready(&auxtrace_snapshot_trigger);
649         }
650 }
651
652 static int record__auxtrace_snapshot_exit(struct record *rec)
653 {
654         if (trigger_is_error(&auxtrace_snapshot_trigger))
655                 return 0;
656
657         if (!auxtrace_record__snapshot_started &&
658             auxtrace_record__snapshot_start(rec->itr))
659                 return -1;
660
661         record__read_auxtrace_snapshot(rec, true);
662         if (trigger_is_error(&auxtrace_snapshot_trigger))
663                 return -1;
664
665         return 0;
666 }
667
668 static int record__auxtrace_init(struct record *rec)
669 {
670         int err;
671
672         if (!rec->itr) {
673                 rec->itr = auxtrace_record__init(rec->evlist, &err);
674                 if (err)
675                         return err;
676         }
677
678         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
679                                               rec->opts.auxtrace_snapshot_opts);
680         if (err)
681                 return err;
682
683         err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
684                                             rec->opts.auxtrace_sample_opts);
685         if (err)
686                 return err;
687
688         return auxtrace_parse_filters(rec->evlist);
689 }
690
691 #else
692
693 static inline
694 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
695                                struct mmap *map __maybe_unused)
696 {
697         return 0;
698 }
699
700 static inline
701 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
702                                     bool on_exit __maybe_unused)
703 {
704 }
705
706 static inline
707 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
708 {
709         return 0;
710 }
711
712 static inline
713 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
714 {
715         return 0;
716 }
717
718 static int record__auxtrace_init(struct record *rec __maybe_unused)
719 {
720         return 0;
721 }
722
723 #endif
724
725 static bool record__kcore_readable(struct machine *machine)
726 {
727         char kcore[PATH_MAX];
728         int fd;
729
730         scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
731
732         fd = open(kcore, O_RDONLY);
733         if (fd < 0)
734                 return false;
735
736         close(fd);
737
738         return true;
739 }
740
741 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
742 {
743         char from_dir[PATH_MAX];
744         char kcore_dir[PATH_MAX];
745         int ret;
746
747         snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
748
749         ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
750         if (ret)
751                 return ret;
752
753         return kcore_copy(from_dir, kcore_dir);
754 }
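
/*
 * With --kcore (see perf-record(1)) the helpers above copy /proc/kcore
 * (together with kallsyms and modules) into a kcore_dir inside the perf.data
 * directory, so that later analysis can see the kernel text exactly as it
 * was while recording.
 */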
755
756 static int record__mmap_evlist(struct record *rec,
757                                struct evlist *evlist)
758 {
759         struct record_opts *opts = &rec->opts;
760         bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
761                                   opts->auxtrace_sample_mode;
762         char msg[512];
763
764         if (opts->affinity != PERF_AFFINITY_SYS)
765                 cpu__setup_cpunode_map();
766
767         if (evlist__mmap_ex(evlist, opts->mmap_pages,
768                                  opts->auxtrace_mmap_pages,
769                                  auxtrace_overwrite,
770                                  opts->nr_cblocks, opts->affinity,
771                                  opts->mmap_flush, opts->comp_level) < 0) {
772                 if (errno == EPERM) {
773                         pr_err("Permission error mapping pages.\n"
774                                "Consider increasing "
775                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
776                                "or try again with a smaller value of -m/--mmap_pages.\n"
777                                "(current value: %u,%u)\n",
778                                opts->mmap_pages, opts->auxtrace_mmap_pages);
779                         return -errno;
780                 } else {
781                         pr_err("failed to mmap with %d (%s)\n", errno,
782                                 str_error_r(errno, msg, sizeof(msg)));
783                         if (errno)
784                                 return -errno;
785                         else
786                                 return -EINVAL;
787                 }
788         }
789         return 0;
790 }
791
792 static int record__mmap(struct record *rec)
793 {
794         return record__mmap_evlist(rec, rec->evlist);
795 }
796
797 static int record__open(struct record *rec)
798 {
799         char msg[BUFSIZ];
800         struct evsel *pos;
801         struct evlist *evlist = rec->evlist;
802         struct perf_session *session = rec->session;
803         struct record_opts *opts = &rec->opts;
804         int rc = 0;
805
806         /*
807          * For initial_delay we need to add a dummy event so that we can track
808          * PERF_RECORD_MMAP while we wait for the initial delay to enable the
809          * real events, the ones asked for by the user.
810          */
811         if (opts->initial_delay) {
812                 if (perf_evlist__add_dummy(evlist))
813                         return -ENOMEM;
814
815                 pos = evlist__first(evlist);
816                 pos->tracking = 0;
817                 pos = evlist__last(evlist);
818                 pos->tracking = 1;
819                 pos->core.attr.enable_on_exec = 1;
820         }
821
822         perf_evlist__config(evlist, opts, &callchain_param);
823
824         evlist__for_each_entry(evlist, pos) {
825 try_again:
826                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
827                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
828                                 if (verbose > 0)
829                                         ui__warning("%s\n", msg);
830                                 goto try_again;
831                         }
832                         if ((errno == EINVAL || errno == EBADF) &&
833                             pos->leader != pos &&
834                             pos->weak_group) {
835                                 pos = perf_evlist__reset_weak_group(evlist, pos);
836                                 goto try_again;
837                         }
838                         rc = -errno;
839                         perf_evsel__open_strerror(pos, &opts->target,
840                                                   errno, msg, sizeof(msg));
841                         ui__error("%s\n", msg);
842                         goto out;
843                 }
844
845                 pos->supported = true;
846         }
847
848         if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
849                 pr_warning(
850 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
851 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
852 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
853 "file is not found in the buildid cache or in the vmlinux path.\n\n"
854 "Samples in kernel modules won't be resolved at all.\n\n"
855 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
856 "even with a suitable vmlinux or kallsyms file.\n\n");
857         }
858
859         if (perf_evlist__apply_filters(evlist, &pos)) {
860                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
861                         pos->filter, perf_evsel__name(pos), errno,
862                         str_error_r(errno, msg, sizeof(msg)));
863                 rc = -1;
864                 goto out;
865         }
866
867         rc = record__mmap(rec);
868         if (rc)
869                 goto out;
870
871         session->evlist = evlist;
872         perf_session__set_id_hdr_size(session);
873 out:
874         return rc;
875 }
876
877 static int process_sample_event(struct perf_tool *tool,
878                                 union perf_event *event,
879                                 struct perf_sample *sample,
880                                 struct evsel *evsel,
881                                 struct machine *machine)
882 {
883         struct record *rec = container_of(tool, struct record, tool);
884
885         if (rec->evlist->first_sample_time == 0)
886                 rec->evlist->first_sample_time = sample->time;
887
888         rec->evlist->last_sample_time = sample->time;
889
890         if (rec->buildid_all)
891                 return 0;
892
893         rec->samples++;
894         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
895 }
896
897 static int process_buildids(struct record *rec)
898 {
899         struct perf_session *session = rec->session;
900
901         if (perf_data__size(&rec->data) == 0)
902                 return 0;
903
904         /*
905          * During this process, it'll load the kernel map and replace the
906          * dso->long_name with a real pathname it found.  In this case
907          * we prefer the vmlinux path like
908          *   /lib/modules/3.16.4/build/vmlinux
909          *
910          * rather than build-id path (in debug directory).
911          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
912          */
913         symbol_conf.ignore_vmlinux_buildid = true;
914
915         /*
916          * If --buildid-all is given, it marks all DSOs regardless of hits,
917          * so there is no need to process samples. But if timestamp_boundary is enabled,
918          * it still needs to walk all samples to get the timestamps of the
919          * first/last samples.
920          */
921         if (rec->buildid_all && !rec->timestamp_boundary)
922                 rec->tool.sample = NULL;
923
924         return perf_session__process_events(session);
925 }
926
927 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
928 {
929         int err;
930         struct perf_tool *tool = data;
931         /*
932          * For the guest kernel, when processing the record & report subcommands,
933          * we arrange the module mmaps prior to the guest kernel mmap and trigger
934          * a DSO preload, because by default guest module symbols are loaded
935          * from guest kallsyms instead of /lib/modules/XXX/XXX. This
936          * avoids missing symbols when the first address falls in a module
937          * instead of in the guest kernel.
938          */
939         err = perf_event__synthesize_modules(tool, process_synthesized_event,
940                                              machine);
941         if (err < 0)
942                 pr_err("Couldn't record guest kernel [%d]'s reference"
943                        " relocation symbol.\n", machine->pid);
944
945         /*
946          * We use _stext for the guest kernel because the guest kernel's
947          * /proc/kallsyms sometimes has no _text.
948          */
949         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
950                                                  machine);
951         if (err < 0)
952                 pr_err("Couldn't record guest kernel [%d]'s reference"
953                        " relocation symbol.\n", machine->pid);
954 }
955
956 static struct perf_event_header finished_round_event = {
957         .size = sizeof(struct perf_event_header),
958         .type = PERF_RECORD_FINISHED_ROUND,
959 };
960
961 static void record__adjust_affinity(struct record *rec, struct mmap *map)
962 {
963         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
964             !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
965                 CPU_ZERO(&rec->affinity_mask);
966                 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
967                 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
968         }
969 }
970
971 static size_t process_comp_header(void *record, size_t increment)
972 {
973         struct perf_record_compressed *event = record;
974         size_t size = sizeof(*event);
975
976         if (increment) {
977                 event->header.size += increment;
978                 return increment;
979         }
980
981         event->header.type = PERF_RECORD_COMPRESSED;
982         event->header.size = size;
983
984         return size;
985 }
986
987 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
988                             void *src, size_t src_size)
989 {
990         size_t compressed;
991         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
992
993         compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
994                                                      max_record_size, process_comp_header);
995
996         session->bytes_transferred += src_size;
997         session->bytes_compressed  += compressed;
998
999         return compressed;
1000 }
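
/*
 * Compressed data is framed as PERF_RECORD_COMPRESSED records bounded by
 * PERF_SAMPLE_MAX_SIZE, with process_comp_header() filling in each record
 * header.  The bytes_transferred/bytes_compressed counters feed the
 * compression ratio printed when the record session ends.
 */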
1001
1002 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1003                                     bool overwrite, bool synch)
1004 {
1005         u64 bytes_written = rec->bytes_written;
1006         int i;
1007         int rc = 0;
1008         struct mmap *maps;
1009         int trace_fd = rec->data.file.fd;
1010         off_t off = 0;
1011
1012         if (!evlist)
1013                 return 0;
1014
1015         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1016         if (!maps)
1017                 return 0;
1018
1019         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1020                 return 0;
1021
1022         if (record__aio_enabled(rec))
1023                 off = record__aio_get_pos(trace_fd);
1024
1025         for (i = 0; i < evlist->core.nr_mmaps; i++) {
1026                 u64 flush = 0;
1027                 struct mmap *map = &maps[i];
1028
1029                 if (map->core.base) {
1030                         record__adjust_affinity(rec, map);
1031                         if (synch) {
1032                                 flush = map->core.flush;
1033                                 map->core.flush = 1;
1034                         }
1035                         if (!record__aio_enabled(rec)) {
1036                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1037                                         if (synch)
1038                                                 map->core.flush = flush;
1039                                         rc = -1;
1040                                         goto out;
1041                                 }
1042                         } else {
1043                                 if (record__aio_push(rec, map, &off) < 0) {
1044                                         record__aio_set_pos(trace_fd, off);
1045                                         if (synch)
1046                                                 map->core.flush = flush;
1047                                         rc = -1;
1048                                         goto out;
1049                                 }
1050                         }
1051                         if (synch)
1052                                 map->core.flush = flush;
1053                 }
1054
1055                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1056                     !rec->opts.auxtrace_sample_mode &&
1057                     record__auxtrace_mmap_read(rec, map) != 0) {
1058                         rc = -1;
1059                         goto out;
1060                 }
1061         }
1062
1063         if (record__aio_enabled(rec))
1064                 record__aio_set_pos(trace_fd, off);
1065
1066         /*
1067          * Mark the round finished in case we wrote
1068          * at least one event.
1069          */
1070         if (bytes_written != rec->bytes_written)
1071                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1072
1073         if (overwrite)
1074                 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1075 out:
1076         return rc;
1077 }
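
/*
 * record__mmap_read_evlist() is called twice per polling round, once for the
 * regular mmaps and once for the overwrite (backward) ones, see
 * record__mmap_read_all() below.  A PERF_RECORD_FINISHED_ROUND marker is
 * appended whenever the pass wrote data, which lets the report side flush
 * its reordering queue at a safe point.
 */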
1078
1079 static int record__mmap_read_all(struct record *rec, bool synch)
1080 {
1081         int err;
1082
1083         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1084         if (err)
1085                 return err;
1086
1087         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1088 }
1089
1090 static void record__init_features(struct record *rec)
1091 {
1092         struct perf_session *session = rec->session;
1093         int feat;
1094
1095         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1096                 perf_header__set_feat(&session->header, feat);
1097
1098         if (rec->no_buildid)
1099                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1100
1101         if (!have_tracepoints(&rec->evlist->core.entries))
1102                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1103
1104         if (!rec->opts.branch_stack)
1105                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1106
1107         if (!rec->opts.full_auxtrace)
1108                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1109
1110         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1111                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1112
1113         perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1114         if (!record__comp_enabled(rec))
1115                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1116
1117         perf_header__clear_feat(&session->header, HEADER_STAT);
1118 }
1119
1120 static void
1121 record__finish_output(struct record *rec)
1122 {
1123         struct perf_data *data = &rec->data;
1124         int fd = perf_data__fd(data);
1125
1126         if (data->is_pipe)
1127                 return;
1128
1129         rec->session->header.data_size += rec->bytes_written;
1130         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1131
1132         if (!rec->no_buildid) {
1133                 process_buildids(rec);
1134
1135                 if (rec->buildid_all)
1136                         dsos__hit_all(rec->session);
1137         }
1138         perf_session__write_header(rec->session, rec->evlist, fd, true);
1139
1140         return;
1141 }
1142
1143 static int record__synthesize_workload(struct record *rec, bool tail)
1144 {
1145         int err;
1146         struct perf_thread_map *thread_map;
1147
1148         if (rec->opts.tail_synthesize != tail)
1149                 return 0;
1150
1151         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1152         if (thread_map == NULL)
1153                 return -1;
1154
1155         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1156                                                  process_synthesized_event,
1157                                                  &rec->session->machines.host,
1158                                                  rec->opts.sample_address);
1159         perf_thread_map__put(thread_map);
1160         return err;
1161 }
1162
1163 static int record__synthesize(struct record *rec, bool tail);
1164
1165 static int
1166 record__switch_output(struct record *rec, bool at_exit)
1167 {
1168         struct perf_data *data = &rec->data;
1169         int fd, err;
1170         char *new_filename;
1171
1172         /* Same size as a real timestamp, e.g. "2015122520103046" */
1173         char timestamp[] = "InvalidTimestamp";
1174
1175         record__aio_mmap_read_sync(rec);
1176
1177         record__synthesize(rec, true);
1178         if (target__none(&rec->opts.target))
1179                 record__synthesize_workload(rec, true);
1180
1181         rec->samples = 0;
1182         record__finish_output(rec);
1183         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1184         if (err) {
1185                 pr_err("Failed to get current timestamp\n");
1186                 return -EINVAL;
1187         }
1188
1189         fd = perf_data__switch(data, timestamp,
1190                                     rec->session->header.data_offset,
1191                                     at_exit, &new_filename);
1192         if (fd >= 0 && !at_exit) {
1193                 rec->bytes_written = 0;
1194                 rec->session->header.data_size = 0;
1195         }
1196
1197         if (!quiet)
1198                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1199                         data->path, timestamp);
1200
1201         if (rec->switch_output.num_files) {
1202                 int n = rec->switch_output.cur_file + 1;
1203
1204                 if (n >= rec->switch_output.num_files)
1205                         n = 0;
1206                 rec->switch_output.cur_file = n;
1207                 if (rec->switch_output.filenames[n]) {
1208                         remove(rec->switch_output.filenames[n]);
1209                         zfree(&rec->switch_output.filenames[n]);
1210                 }
1211                 rec->switch_output.filenames[n] = new_filename;
1212         } else {
1213                 free(new_filename);
1214         }
1215
1216         /* Output tracking events */
1217         if (!at_exit) {
1218                 record__synthesize(rec, false);
1219
1220                 /*
1221                  * In 'perf record --switch-output' without -a,
1222                  * record__synthesize() in record__switch_output() won't
1223                  * generate tracking events because there's no thread_map
1224                  * in the evlist, so the newly created perf.data doesn't
1225                  * contain map and comm information.
1226                  * Create a fake thread_map and call
1227                  * perf_event__synthesize_thread_map() directly for those events.
1228                  */
1229                 if (target__none(&rec->opts.target))
1230                         record__synthesize_workload(rec, false);
1231         }
1232         return fd;
1233 }
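
/*
 * When rotation is combined with a cap on the number of kept files
 * (--switch-max-files in perf-record(1), which sets switch_output.num_files),
 * the filenames[] ring above makes record__switch_output() delete the oldest
 * rotated file before remembering the new one.
 */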
1234
1235 static volatile int workload_exec_errno;
1236
1237 /*
1238  * perf_evlist__prepare_workload will send a SIGUSR1
1239  * if the fork fails, since we asked for that by setting its
1240  * want_signal to true.
1241  */
1242 static void workload_exec_failed_signal(int signo __maybe_unused,
1243                                         siginfo_t *info,
1244                                         void *ucontext __maybe_unused)
1245 {
1246         workload_exec_errno = info->si_value.sival_int;
1247         done = 1;
1248         child_finished = 1;
1249 }
1250
1251 static void snapshot_sig_handler(int sig);
1252 static void alarm_sig_handler(int sig);
1253
1254 static const struct perf_event_mmap_page *
1255 perf_evlist__pick_pc(struct evlist *evlist)
1256 {
1257         if (evlist) {
1258                 if (evlist->mmap && evlist->mmap[0].core.base)
1259                         return evlist->mmap[0].core.base;
1260                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1261                         return evlist->overwrite_mmap[0].core.base;
1262         }
1263         return NULL;
1264 }
1265
1266 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1267 {
1268         const struct perf_event_mmap_page *pc;
1269
1270         pc = perf_evlist__pick_pc(rec->evlist);
1271         if (pc)
1272                 return pc;
1273         return NULL;
1274 }
1275
1276 static int record__synthesize(struct record *rec, bool tail)
1277 {
1278         struct perf_session *session = rec->session;
1279         struct machine *machine = &session->machines.host;
1280         struct perf_data *data = &rec->data;
1281         struct record_opts *opts = &rec->opts;
1282         struct perf_tool *tool = &rec->tool;
1283         int fd = perf_data__fd(data);
1284         int err = 0;
1285
1286         if (rec->opts.tail_synthesize != tail)
1287                 return 0;
1288
1289         if (data->is_pipe) {
1290                 /*
1291                  * We need to synthesize events first, because some
1292                  * features work on top of them (on the report side).
1293                  */
1294                 err = perf_event__synthesize_attrs(tool, rec->evlist,
1295                                                    process_synthesized_event);
1296                 if (err < 0) {
1297                         pr_err("Couldn't synthesize attrs.\n");
1298                         goto out;
1299                 }
1300
1301                 err = perf_event__synthesize_features(tool, session, rec->evlist,
1302                                                       process_synthesized_event);
1303                 if (err < 0) {
1304                         pr_err("Couldn't synthesize features.\n");
1305                         return err;
1306                 }
1307
1308                 if (have_tracepoints(&rec->evlist->core.entries)) {
1309                         /*
1310                          * FIXME err <= 0 here actually means that
1311                          * there were no tracepoints, so it's not really
1312                          * an error, just that we don't need to
1313                          * synthesize anything.  We really have to
1314                          * return this more properly and also
1315                          * propagate errors that are now calling die()
1316                          */
1317                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1318                                                                   process_synthesized_event);
1319                         if (err <= 0) {
1320                                 pr_err("Couldn't record tracing data.\n");
1321                                 goto out;
1322                         }
1323                         rec->bytes_written += err;
1324                 }
1325         }
1326
1327         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1328                                           process_synthesized_event, machine);
1329         if (err)
1330                 goto out;
1331
1332         /* Synthesize id_index before auxtrace_info */
1333         if (rec->opts.auxtrace_sample_mode) {
1334                 err = perf_event__synthesize_id_index(tool,
1335                                                       process_synthesized_event,
1336                                                       session->evlist, machine);
1337                 if (err)
1338                         goto out;
1339         }
1340
1341         if (rec->opts.full_auxtrace) {
1342                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1343                                         session, process_synthesized_event);
1344                 if (err)
1345                         goto out;
1346         }
1347
1348         if (!perf_evlist__exclude_kernel(rec->evlist)) {
1349                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1350                                                          machine);
1351                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1352                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1353                                    "Check /proc/kallsyms permission or run as root.\n");
1354
1355                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1356                                                      machine);
1357                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1358                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1359                                    "Check /proc/modules permission or run as root.\n");
1360         }
1361
1362         if (perf_guest) {
1363                 machines__process_guests(&session->machines,
1364                                          perf_event__synthesize_guest_os, tool);
1365         }
1366
1367         err = perf_event__synthesize_extra_attr(&rec->tool,
1368                                                 rec->evlist,
1369                                                 process_synthesized_event,
1370                                                 data->is_pipe);
1371         if (err)
1372                 goto out;
1373
1374         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1375                                                  process_synthesized_event,
1376                                                 NULL);
1377         if (err < 0) {
1378                 pr_err("Couldn't synthesize thread map.\n");
1379                 return err;
1380         }
1381
1382         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1383                                              process_synthesized_event, NULL);
1384         if (err < 0) {
1385                 pr_err("Couldn't synthesize cpu map.\n");
1386                 return err;
1387         }
1388
1389         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1390                                                 machine, opts);
1391         if (err < 0)
1392                 pr_warning("Couldn't synthesize bpf events.\n");
1393
1394         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1395                                             process_synthesized_event, opts->sample_address,
1396                                             1);
1397 out:
1398         return err;
1399 }
1400
1401 static int __cmd_record(struct record *rec, int argc, const char **argv)
1402 {
1403         int err;
1404         int status = 0;
1405         unsigned long waking = 0;
1406         const bool forks = argc > 0;
1407         struct perf_tool *tool = &rec->tool;
1408         struct record_opts *opts = &rec->opts;
1409         struct perf_data *data = &rec->data;
1410         struct perf_session *session;
1411         bool disabled = false, draining = false;
1412         struct evlist *sb_evlist = NULL;
1413         int fd;
1414         float ratio = 0;
1415
1416         atexit(record__sig_exit);
1417         signal(SIGCHLD, sig_handler);
1418         signal(SIGINT, sig_handler);
1419         signal(SIGTERM, sig_handler);
1420         signal(SIGSEGV, sigsegv_handler);
1421
1422         if (rec->opts.record_namespaces)
1423                 tool->namespace_events = true;
1424
1425         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1426                 signal(SIGUSR2, snapshot_sig_handler);
1427                 if (rec->opts.auxtrace_snapshot_mode)
1428                         trigger_on(&auxtrace_snapshot_trigger);
1429                 if (rec->switch_output.enabled)
1430                         trigger_on(&switch_output_trigger);
1431         } else {
1432                 signal(SIGUSR2, SIG_IGN);
1433         }
1434
1435         session = perf_session__new(data, false, tool);
1436         if (IS_ERR(session)) {
1437                 pr_err("Perf session creation failed.\n");
1438                 return PTR_ERR(session);
1439         }
1440
1441         fd = perf_data__fd(data);
1442         rec->session = session;
1443
1444         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1445                 pr_err("Compression initialization failed.\n");
1446                 return -1;
1447         }
1448
1449         session->header.env.comp_type  = PERF_COMP_ZSTD;
1450         session->header.env.comp_level = rec->opts.comp_level;
1451
1452         if (rec->opts.kcore &&
1453             !record__kcore_readable(&session->machines.host)) {
1454                 pr_err("ERROR: kcore is not readable.\n");
1455                 return -1;
1456         }
1457
1458         record__init_features(rec);
1459
1460         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1461                 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1462
1463         if (forks) {
1464                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1465                                                     argv, data->is_pipe,
1466                                                     workload_exec_failed_signal);
1467                 if (err < 0) {
1468                         pr_err("Couldn't run the workload!\n");
1469                         status = err;
1470                         goto out_delete_session;
1471                 }
1472         }
1473
1474         /*
1475          * If we have just a single event and are sending data
1476          * through a pipe, we need to force id allocation,
1477          * because we synthesize the event name through the pipe
1478          * and need the id for that.
1479          */
1480         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1481                 rec->opts.sample_id = true;
1482
1483         if (record__open(rec) != 0) {
1484                 err = -1;
1485                 goto out_child;
1486         }
1487         session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1488
1489         if (rec->opts.kcore) {
1490                 err = record__kcore_copy(&session->machines.host, data);
1491                 if (err) {
1492                         pr_err("ERROR: Failed to copy kcore\n");
1493                         goto out_child;
1494                 }
1495         }
1496
1497         err = bpf__apply_obj_config();
1498         if (err) {
1499                 char errbuf[BUFSIZ];
1500
1501                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1502                 pr_err("ERROR: Apply config to BPF failed: %s\n",
1503                          errbuf);
1504                 goto out_child;
1505         }
1506
1507         /*
1508          * Normally perf_session__new would do this, but it doesn't have the
1509          * evlist.
1510          */
1511         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1512                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1513                 rec->tool.ordered_events = false;
1514         }
1515
1516         if (!rec->evlist->nr_groups)
1517                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1518
1519         if (data->is_pipe) {
1520                 err = perf_header__write_pipe(fd);
1521                 if (err < 0)
1522                         goto out_child;
1523         } else {
1524                 err = perf_session__write_header(session, rec->evlist, fd, false);
1525                 if (err < 0)
1526                         goto out_child;
1527         }
1528
1529         if (!rec->no_buildid
1530             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1531                 pr_err("Couldn't generate buildids. "
1532                        "Use --no-buildid to profile anyway.\n");
1533                 err = -1;
1534                 goto out_child;
1535         }
1536
1537         if (!opts->no_bpf_event)
1538                 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1539
1540         if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1541                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1542                 opts->no_bpf_event = true;
1543         }
1544
1545         err = record__synthesize(rec, false);
1546         if (err < 0)
1547                 goto out_child;
1548
1549         if (rec->realtime_prio) {
1550                 struct sched_param param;
1551
1552                 param.sched_priority = rec->realtime_prio;
1553                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1554                         pr_err("Could not set realtime priority.\n");
1555                         err = -1;
1556                         goto out_child;
1557                 }
1558         }
1559
1560         /*
1561          * When perf is starting the traced process, all the events
1562          * (apart from group members) have enable_on_exec=1 set,
1563          * so don't spoil it by prematurely enabling them.
1564          */
1565         if (!target__none(&opts->target) && !opts->initial_delay)
1566                 evlist__enable(rec->evlist);
1567
1568         /*
1569          * Let the child rip
1570          */
1571         if (forks) {
1572                 struct machine *machine = &session->machines.host;
1573                 union perf_event *event;
1574                 pid_t tgid;
1575
1576                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1577                 if (event == NULL) {
1578                         err = -ENOMEM;
1579                         goto out_child;
1580                 }
1581
1582                 /*
1583                  * Some H/W events are generated before the COMM event,
1584                  * which is emitted during exec(), so perf script
1585                  * cannot see a correct process name for those events.
1586                  * Synthesize a COMM event to prevent that.
1587                  */
1588                 tgid = perf_event__synthesize_comm(tool, event,
1589                                                    rec->evlist->workload.pid,
1590                                                    process_synthesized_event,
1591                                                    machine);
1592                 free(event);
1593
1594                 if (tgid == -1)
1595                         goto out_child;
1596
1597                 event = malloc(sizeof(event->namespaces) +
1598                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1599                                machine->id_hdr_size);
1600                 if (event == NULL) {
1601                         err = -ENOMEM;
1602                         goto out_child;
1603                 }
1604
1605                 /*
1606                  * Synthesize NAMESPACES event for the command specified.
1607                  */
1608                 perf_event__synthesize_namespaces(tool, event,
1609                                                   rec->evlist->workload.pid,
1610                                                   tgid, process_synthesized_event,
1611                                                   machine);
1612                 free(event);
1613
1614                 perf_evlist__start_workload(rec->evlist);
1615         }
1616
1617         if (opts->initial_delay) {
1618                 usleep(opts->initial_delay * USEC_PER_MSEC);
1619                 evlist__enable(rec->evlist);
1620         }
1621
1622         trigger_ready(&auxtrace_snapshot_trigger);
1623         trigger_ready(&switch_output_trigger);
1624         perf_hooks__invoke_record_start();
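             /*
              * Main recording loop: flush the mmap ring buffers, service the
              * AUX area snapshot and switch-output triggers, and poll for
              * new data until the workload exits or recording is stopped.
              */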
1625         for (;;) {
1626                 unsigned long long hits = rec->samples;
1627
1628                 /*
1629                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1630                  * here: when done == true and hits != rec->samples
1631                  * in the previous round.
1632                  *
1633                  * perf_evlist__toggle_bkw_mmap ensures we never
1634                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1635                  */
1636                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1637                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1638
1639                 if (record__mmap_read_all(rec, false) < 0) {
1640                         trigger_error(&auxtrace_snapshot_trigger);
1641                         trigger_error(&switch_output_trigger);
1642                         err = -1;
1643                         goto out_child;
1644                 }
1645
1646                 if (auxtrace_record__snapshot_started) {
1647                         auxtrace_record__snapshot_started = 0;
1648                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1649                                 record__read_auxtrace_snapshot(rec, false);
1650                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1651                                 pr_err("AUX area tracing snapshot failed\n");
1652                                 err = -1;
1653                                 goto out_child;
1654                         }
1655                 }
1656
1657                 if (trigger_is_hit(&switch_output_trigger)) {
1658                         /*
1659                          * If switch_output_trigger is hit, the data in the
1660                          * overwritable ring buffer should have been collected,
1661                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1662                          *
1663                          * If SIGUSR2 was raised after or during record__mmap_read_all(),
1664                          * record__mmap_read_all() didn't collect data from the
1665                          * overwritable ring buffer, so read again.
1666                          */
1667                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1668                                 continue;
1669                         trigger_ready(&switch_output_trigger);
1670
1671                         /*
1672                          * Reenable events in overwrite ring buffer after
1673                          * record__mmap_read_all(): we should have collected
1674                          * data from it.
1675                          */
1676                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1677
1678                         if (!quiet)
1679                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1680                                         waking);
1681                         waking = 0;
1682                         fd = record__switch_output(rec, false);
1683                         if (fd < 0) {
1684                                 pr_err("Failed to switch to new file\n");
1685                                 trigger_error(&switch_output_trigger);
1686                                 err = fd;
1687                                 goto out_child;
1688                         }
1689
1690                         /* re-arm the alarm */
1691                         if (rec->switch_output.time)
1692                                 alarm(rec->switch_output.time);
1693                 }
1694
1695                 if (hits == rec->samples) {
1696                         if (done || draining)
1697                                 break;
1698                         err = evlist__poll(rec->evlist, -1);
1699                         /*
1700                          * Propagate the error only if there is one. Ignore a positive
1701                          * number of returned events and interrupt errors.
1702                          */
1703                         if (err > 0 || (err < 0 && errno == EINTR))
1704                                 err = 0;
1705                         waking++;
1706
1707                         if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1708                                 draining = true;
1709                 }
1710
1711                 /*
1712                  * When perf started the traced process, the events die with
1713                  * the process at the end and we wait for that, so there is no
1714                  * need to disable the events in this case.
1715                  */
1716                 if (done && !disabled && !target__none(&opts->target)) {
1717                         trigger_off(&auxtrace_snapshot_trigger);
1718                         evlist__disable(rec->evlist);
1719                         disabled = true;
1720                 }
1721         }
1722
1723         trigger_off(&auxtrace_snapshot_trigger);
1724         trigger_off(&switch_output_trigger);
1725
1726         if (opts->auxtrace_snapshot_on_exit)
1727                 record__auxtrace_snapshot_exit(rec);
1728
1729         if (forks && workload_exec_errno) {
1730                 char msg[STRERR_BUFSIZE];
1731                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1732                 pr_err("Workload failed: %s\n", emsg);
1733                 err = -1;
1734                 goto out_child;
1735         }
1736
1737         if (!quiet)
1738                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1739
1740         if (target__none(&rec->opts.target))
1741                 record__synthesize_workload(rec, true);
1742
1743 out_child:
1744         record__mmap_read_all(rec, true);
1745         record__aio_mmap_read_sync(rec);
1746
1747         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1748                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1749                 session->header.env.comp_ratio = ratio + 0.5;
1750         }
1751
1752         if (forks) {
1753                 int exit_status;
1754
1755                 if (!child_finished)
1756                         kill(rec->evlist->workload.pid, SIGTERM);
1757
1758                 wait(&exit_status);
1759
1760                 if (err < 0)
1761                         status = err;
1762                 else if (WIFEXITED(exit_status))
1763                         status = WEXITSTATUS(exit_status);
1764                 else if (WIFSIGNALED(exit_status))
1765                         signr = WTERMSIG(exit_status);
1766         } else
1767                 status = err;
1768
1769         record__synthesize(rec, true);
1770         /* this will be recalculated during process_buildids() */
1771         rec->samples = 0;
1772
1773         if (!err) {
1774                 if (!rec->timestamp_filename) {
1775                         record__finish_output(rec);
1776                 } else {
1777                         fd = record__switch_output(rec, true);
1778                         if (fd < 0) {
1779                                 status = fd;
1780                                 goto out_delete_session;
1781                         }
1782                 }
1783         }
1784
1785         perf_hooks__invoke_record_end();
1786
1787         if (!err && !quiet) {
1788                 char samples[128];
1789                 const char *postfix = rec->timestamp_filename ?
1790                                         ".<timestamp>" : "";
1791
1792                 if (rec->samples && !rec->opts.full_auxtrace)
1793                         scnprintf(samples, sizeof(samples),
1794                                   " (%" PRIu64 " samples)", rec->samples);
1795                 else
1796                         samples[0] = '\0';
1797
1798                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1799                         perf_data__size(data) / 1024.0 / 1024.0,
1800                         data->path, postfix, samples);
1801                 if (ratio) {
1802                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1803                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
1804                                         ratio);
1805                 }
1806                 fprintf(stderr, " ]\n");
1807         }
1808
1809 out_delete_session:
1810         zstd_fini(&session->zstd_data);
1811         perf_session__delete(session);
1812
1813         if (!opts->no_bpf_event)
1814                 perf_evlist__stop_sb_thread(sb_evlist);
1815         return status;
1816 }
1817
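     /*
      * Emit debug output describing the selected callchain record mode and,
      * for DWARF unwinding, the configured stack dump size.
      */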
1818 static void callchain_debug(struct callchain_param *callchain)
1819 {
1820         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1821
1822         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1823
1824         if (callchain->record_mode == CALLCHAIN_DWARF)
1825                 pr_debug("callchain: stack dump size %d\n",
1826                          callchain->dump_size);
1827 }
1828
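     /*
      * Parse a --call-graph argument, e.g. "fp", "dwarf,8192" or "lbr", into
      * callchain_param; DWARF unwinding additionally enables data address
      * sampling. 'unset' corresponds to --no-call-graph.
      */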
1829 int record_opts__parse_callchain(struct record_opts *record,
1830                                  struct callchain_param *callchain,
1831                                  const char *arg, bool unset)
1832 {
1833         int ret;
1834         callchain->enabled = !unset;
1835
1836         /* --no-call-graph */
1837         if (unset) {
1838                 callchain->record_mode = CALLCHAIN_NONE;
1839                 pr_debug("callchain: disabled\n");
1840                 return 0;
1841         }
1842
1843         ret = parse_callchain_record_opt(arg, callchain);
1844         if (!ret) {
1845                 /* Enable data address sampling for DWARF unwind. */
1846                 if (callchain->record_mode == CALLCHAIN_DWARF)
1847                         record->sample_address = true;
1848                 callchain_debug(callchain);
1849         }
1850
1851         return ret;
1852 }
1853
1854 int record_parse_callchain_opt(const struct option *opt,
1855                                const char *arg,
1856                                int unset)
1857 {
1858         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1859 }
1860
1861 int record_callchain_opt(const struct option *opt,
1862                          const char *arg __maybe_unused,
1863                          int unset __maybe_unused)
1864 {
1865         struct callchain_param *callchain = opt->value;
1866
1867         callchain->enabled = true;
1868
1869         if (callchain->record_mode == CALLCHAIN_NONE)
1870                 callchain->record_mode = CALLCHAIN_FP;
1871
1872         callchain_debug(callchain);
1873         return 0;
1874 }
1875
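     /*
      * perf_config() callback: apply 'record.*' settings from the user's
      * config file, i.e. record.build-id (cache|no-cache|skip),
      * record.call-graph (forwarded to the call-graph.record-mode handler)
      * and, when AIO support is built in, record.aio.
      */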
1876 static int perf_record_config(const char *var, const char *value, void *cb)
1877 {
1878         struct record *rec = cb;
1879
1880         if (!strcmp(var, "record.build-id")) {
1881                 if (!strcmp(value, "cache"))
1882                         rec->no_buildid_cache = false;
1883                 else if (!strcmp(value, "no-cache"))
1884                         rec->no_buildid_cache = true;
1885                 else if (!strcmp(value, "skip"))
1886                         rec->no_buildid = true;
1887                 else
1888                         return -1;
1889                 return 0;
1890         }
1891         if (!strcmp(var, "record.call-graph")) {
1892                 var = "call-graph.record-mode";
1893                 return perf_default_config(var, value, cb);
1894         }
1895 #ifdef HAVE_AIO_SUPPORT
1896         if (!strcmp(var, "record.aio")) {
1897                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1898                 if (!rec->opts.nr_cblocks)
1899                         rec->opts.nr_cblocks = nr_cblocks_default;
1900         }
1901 #endif
1902
1903         return 0;
1904 }
1905
1906 struct clockid_map {
1907         const char *name;
1908         int clockid;
1909 };
1910
1911 #define CLOCKID_MAP(n, c)       \
1912         { .name = n, .clockid = (c), }
1913
1914 #define CLOCKID_END     { .name = NULL, }
1915
1916
1917 /*
1918  * Add the missing ones; we need to build on many distros...
1919  */
1920 #ifndef CLOCK_MONOTONIC_RAW
1921 #define CLOCK_MONOTONIC_RAW 4
1922 #endif
1923 #ifndef CLOCK_BOOTTIME
1924 #define CLOCK_BOOTTIME 7
1925 #endif
1926 #ifndef CLOCK_TAI
1927 #define CLOCK_TAI 11
1928 #endif
1929
1930 static const struct clockid_map clockids[] = {
1931         /* available for all events, NMI safe */
1932         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1933         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1934
1935         /* available for some events */
1936         CLOCKID_MAP("realtime", CLOCK_REALTIME),
1937         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1938         CLOCKID_MAP("tai", CLOCK_TAI),
1939
1940         /* available for the lazy */
1941         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1942         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1943         CLOCKID_MAP("real", CLOCK_REALTIME),
1944         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1945
1946         CLOCKID_END,
1947 };
1948
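     /*
      * Query the resolution of the given clock via clock_getres() and return
      * it in nanoseconds through *res_ns; on failure only a warning is
      * printed and the resolution stays 0.
      */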
1949 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1950 {
1951         struct timespec res;
1952
1953         *res_ns = 0;
1954         if (!clock_getres(clk_id, &res))
1955                 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1956         else
1957                 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1958
1959         return 0;
1960 }
1961
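     /*
      * Parse the -k/--clockid argument: either a raw clockid number or a
      * name from the clockids[] table, with an optional "CLOCK_" prefix,
      * e.g. "monotonic", "CLOCK_MONOTONIC_RAW" or "boot".
      */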
1962 static int parse_clockid(const struct option *opt, const char *str, int unset)
1963 {
1964         struct record_opts *opts = (struct record_opts *)opt->value;
1965         const struct clockid_map *cm;
1966         const char *ostr = str;
1967
1968         if (unset) {
1969                 opts->use_clockid = 0;
1970                 return 0;
1971         }
1972
1973         /* no arg passed */
1974         if (!str)
1975                 return 0;
1976
1977         /* no setting it twice */
1978         if (opts->use_clockid)
1979                 return -1;
1980
1981         opts->use_clockid = true;
1982
1983         /* if it's a number, we're done */
1984         if (sscanf(str, "%d", &opts->clockid) == 1)
1985                 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1986
1987         /* allow a "CLOCK_" prefix to the name */
1988         if (!strncasecmp(str, "CLOCK_", 6))
1989                 str += 6;
1990
1991         for (cm = clockids; cm->name; cm++) {
1992                 if (!strcasecmp(str, cm->name)) {
1993                         opts->clockid = cm->clockid;
1994                         return get_clockid_res(opts->clockid,
1995                                                &opts->clockid_res_ns);
1996                 }
1997         }
1998
1999         opts->use_clockid = false;
2000         ui__warning("unknown clockid %s, check man page\n", ostr);
2001         return -1;
2002 }
2003
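     /*
      * Parse --affinity=node|cpu; any other value (or no argument) keeps the
      * default PERF_AFFINITY_SYS behaviour.
      */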
2004 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2005 {
2006         struct record_opts *opts = (struct record_opts *)opt->value;
2007
2008         if (unset || !str)
2009                 return 0;
2010
2011         if (!strcasecmp(str, "node"))
2012                 opts->affinity = PERF_AFFINITY_NODE;
2013         else if (!strcasecmp(str, "cpu"))
2014                 opts->affinity = PERF_AFFINITY_CPU;
2015
2016         return 0;
2017 }
2018
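     /*
      * Parse the --max-size limit, accepting an optional B/K/M/G suffix,
      * e.g. "200M"; when the option is unset the limit is 0, i.e. unlimited.
      */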
2019 static int parse_output_max_size(const struct option *opt,
2020                                  const char *str, int unset)
2021 {
2022         unsigned long *s = (unsigned long *)opt->value;
2023         static struct parse_tag tags_size[] = {
2024                 { .tag  = 'B', .mult = 1       },
2025                 { .tag  = 'K', .mult = 1 << 10 },
2026                 { .tag  = 'M', .mult = 1 << 20 },
2027                 { .tag  = 'G', .mult = 1 << 30 },
2028                 { .tag  = 0 },
2029         };
2030         unsigned long val;
2031
2032         if (unset) {
2033                 *s = 0;
2034                 return 0;
2035         }
2036
2037         val = parse_tag_value(str, tags_size);
2038         if (val != (unsigned long) -1) {
2039                 *s = val;
2040                 return 0;
2041         }
2042
2043         return -1;
2044 }
2045
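     /*
      * Parse -m/--mmap-pages as "pages[,pages]": the first value sizes the
      * data mmaps, the optional second value sizes the AUX area tracing
      * mmaps.
      */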
2046 static int record__parse_mmap_pages(const struct option *opt,
2047                                     const char *str,
2048                                     int unset __maybe_unused)
2049 {
2050         struct record_opts *opts = opt->value;
2051         char *s, *p;
2052         unsigned int mmap_pages;
2053         int ret;
2054
2055         if (!str)
2056                 return -EINVAL;
2057
2058         s = strdup(str);
2059         if (!s)
2060                 return -ENOMEM;
2061
2062         p = strchr(s, ',');
2063         if (p)
2064                 *p = '\0';
2065
2066         if (*s) {
2067                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
2068                 if (ret)
2069                         goto out_free;
2070                 opts->mmap_pages = mmap_pages;
2071         }
2072
2073         if (!p) {
2074                 ret = 0;
2075                 goto out_free;
2076         }
2077
2078         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
2079         if (ret)
2080                 goto out_free;
2081
2082         opts->auxtrace_mmap_pages = mmap_pages;
2083
2084 out_free:
2085         free(s);
2086         return ret;
2087 }
2088
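     /*
      * Warn when the --switch-output size threshold is smaller than half of
      * the mmap wakeup size, since the resulting perf.data files are then
      * likely to come out bigger than requested.
      */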
2089 static void switch_output_size_warn(struct record *rec)
2090 {
2091         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2092         struct switch_output *s = &rec->switch_output;
2093
2094         wakeup_size /= 2;
2095
2096         if (s->size < wakeup_size) {
2097                 char buf[100];
2098
2099                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2100                 pr_warning("WARNING: switch-output data size lower than "
2101                            "wakeup kernel buffer size (%s), "
2102                            "expect bigger perf.data sizes\n", buf);
2103         }
2104 }
2105
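     /*
      * Configure --switch-output from its string argument: "signal" switches
      * on SIGUSR2, a size with a B/K/M/G suffix (e.g. "100M") switches on
      * output size, and a time with an s/m/h/d suffix (e.g. "30s") switches
      * on elapsed time. Any of these implies timestamped output file names.
      */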
2106 static int switch_output_setup(struct record *rec)
2107 {
2108         struct switch_output *s = &rec->switch_output;
2109         static struct parse_tag tags_size[] = {
2110                 { .tag  = 'B', .mult = 1       },
2111                 { .tag  = 'K', .mult = 1 << 10 },
2112                 { .tag  = 'M', .mult = 1 << 20 },
2113                 { .tag  = 'G', .mult = 1 << 30 },
2114                 { .tag  = 0 },
2115         };
2116         static struct parse_tag tags_time[] = {
2117                 { .tag  = 's', .mult = 1        },
2118                 { .tag  = 'm', .mult = 60       },
2119                 { .tag  = 'h', .mult = 60*60    },
2120                 { .tag  = 'd', .mult = 60*60*24 },
2121                 { .tag  = 0 },
2122         };
2123         unsigned long val;
2124
2125         if (!s->set)
2126                 return 0;
2127
2128         if (!strcmp(s->str, "signal")) {
2129                 s->signal = true;
2130                 pr_debug("switch-output with SIGUSR2 signal\n");
2131                 goto enabled;
2132         }
2133
2134         val = parse_tag_value(s->str, tags_size);
2135         if (val != (unsigned long) -1) {
2136                 s->size = val;
2137                 pr_debug("switch-output with %s size threshold\n", s->str);
2138                 goto enabled;
2139         }
2140
2141         val = parse_tag_value(s->str, tags_time);
2142         if (val != (unsigned long) -1) {
2143                 s->time = val;
2144                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2145                          s->str, s->time);
2146                 goto enabled;
2147         }
2148
2149         return -1;
2150
2151 enabled:
2152         rec->timestamp_filename = true;
2153         s->enabled              = true;
2154
2155         if (s->size && !rec->opts.no_buffering)
2156                 switch_output_size_warn(rec);
2157
2158         return 0;
2159 }
2160
2161 static const char * const __record_usage[] = {
2162         "perf record [<options>] [<command>]",
2163         "perf record [<options>] -- <command> [<options>]",
2164         NULL
2165 };
2166 const char * const *record_usage = __record_usage;
2167
2168 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2169                                   struct perf_sample *sample, struct machine *machine)
2170 {
2171         /*
2172          * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2173          * no need to add them twice.
2174          */
2175         if (!(event->header.misc & PERF_RECORD_MISC_USER))
2176                 return 0;
2177         return perf_event__process_mmap(tool, event, sample, machine);
2178 }
2179
2180 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2181                                    struct perf_sample *sample, struct machine *machine)
2182 {
2183         /*
2184          * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2185          * no need to add them twice.
2186          */
2187         if (!(event->header.misc & PERF_RECORD_MISC_USER))
2188                 return 0;
2189
2190         return perf_event__process_mmap2(tool, event, sample, machine);
2191 }
2192
2193 /*
2194  * XXX Ideally this would be local to cmd_record() and passed to a record__new,
2195  * because we need to have access to it in record__exit, which is called
2196  * after cmd_record() exits, but since record_options needs to be accessible to
2197  * builtin-script, leave it here.
2198  *
2199  * At least we don't touch it in all the other functions here directly.
2200  *
2201  * Just say no to tons of global variables, sigh.
2202  */
2203 static struct record record = {
2204         .opts = {
2205                 .sample_time         = true,
2206                 .mmap_pages          = UINT_MAX,
2207                 .user_freq           = UINT_MAX,
2208                 .user_interval       = ULLONG_MAX,
2209                 .freq                = 4000,
2210                 .target              = {
2211                         .uses_mmap   = true,
2212                         .default_per_cpu = true,
2213                 },
2214                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
2215         },
2216         .tool = {
2217                 .sample         = process_sample_event,
2218                 .fork           = perf_event__process_fork,
2219                 .exit           = perf_event__process_exit,
2220                 .comm           = perf_event__process_comm,
2221                 .namespaces     = perf_event__process_namespaces,
2222                 .mmap           = build_id__process_mmap,
2223                 .mmap2          = build_id__process_mmap2,
2224                 .ordered_events = true,
2225         },
2226 };
2227
2228 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2229         "\n\t\t\t\tDefault: fp";
2230
2231 static bool dry_run;
2232
2233 /*
2234  * XXX Will stay a global variable until we fix builtin-script.c to stop messing
2235  * with it and switch to using the library functions in perf_evlist that came
2236  * from builtin-record.c, i.e. use record_opts,
2237  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2238  * using pipes, etc.
2239  */
2240 static struct option __record_options[] = {
2241         OPT_CALLBACK('e', "event", &record.evlist, "event",
2242                      "event selector. use 'perf list' to list available events",
2243                      parse_events_option),
2244         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2245                      "event filter", parse_filter),
2246         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2247                            NULL, "don't record events from perf itself",
2248                            exclude_perf),
2249         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2250                     "record events on existing process id"),
2251         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2252                     "record events on existing thread id"),
2253         OPT_INTEGER('r', "realtime", &record.realtime_prio,
2254                     "collect data with this RT SCHED_FIFO priority"),
2255         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2256                     "collect data without buffering"),
2257         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2258                     "collect raw sample records from all opened counters"),
2259         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2260                             "system-wide collection from all CPUs"),
2261         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2262                     "list of cpus to monitor"),
2263         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2264         OPT_STRING('o', "output", &record.data.path, "file",
2265                     "output file name"),
2266         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2267                         &record.opts.no_inherit_set,
2268                         "child tasks do not inherit counters"),
2269         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2270                     "synthesize non-sample events at the end of output"),
2271         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2272         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2273         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2274                     "Fail if the specified frequency can't be used"),
2275         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2276                      "profile at this frequency",
2277                       record__parse_freq),
2278         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2279                      "number of mmap data pages and AUX area tracing mmap pages",
2280                      record__parse_mmap_pages),
2281         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2282                      "Minimum number of bytes that is extracted from mmap data pages (default: 1)",
2283                      record__mmap_flush_parse),
2284         OPT_BOOLEAN(0, "group", &record.opts.group,
2285                     "put the counters into a counter group"),
2286         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2287                            NULL, "enables call-graph recording" ,
2288                            &record_callchain_opt),
2289         OPT_CALLBACK(0, "call-graph", &record.opts,
2290                      "record_mode[,record_size]", record_callchain_help,
2291                      &record_parse_callchain_opt),
2292         OPT_INCR('v', "verbose", &verbose,
2293                     "be more verbose (show counter open errors, etc)"),
2294         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2295         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2296                     "per thread counts"),
2297         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2298         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2299                     "Record the sample physical addresses"),
2300         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2301         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2302                         &record.opts.sample_time_set,
2303                         "Record the sample timestamps"),
2304         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2305                         "Record the sample period"),
2306         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2307                     "don't sample"),
2308         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2309                         &record.no_buildid_cache_set,
2310                         "do not update the buildid cache"),
2311         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2312                         &record.no_buildid_set,
2313                         "do not collect buildids in perf.data"),
2314         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2315                      "monitor event in cgroup name only",
2316                      parse_cgroups),
2317         OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2318                   "ms to wait before starting measurement after program start"),
2319         OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2320         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2321                    "user to profile"),
2322
2323         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2324                      "branch any", "sample any taken branches",
2325                      parse_branch_stack),
2326
2327         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2328                      "branch filter mask", "branch stack filter modes",
2329                      parse_branch_stack),
2330         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2331                     "sample by weight (on special events only)"),
2332         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2333                     "sample transaction flags (special events only)"),
2334         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2335                     "use per-thread mmaps"),
2336         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2337                     "sample selected machine registers on interrupt,"
2338                     " use '-I?' to list register names", parse_intr_regs),
2339         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2340                     "sample selected machine registers on interrupt,"
2341                     " use '--user-regs=?' to list register names", parse_user_regs),
2342         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2343                     "Record running/enabled time of read (:S) events"),
2344         OPT_CALLBACK('k', "clockid", &record.opts,
2345         "clockid", "clockid to use for events, see clock_gettime()",
2346         parse_clockid),
2347         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2348                           "opts", "AUX area tracing Snapshot Mode", ""),
2349         OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2350                           "opts", "sample AUX area", ""),
2351         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2352                         "per thread proc mmap processing timeout in ms"),
2353         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2354                     "Record namespaces events"),
2355         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2356                     "Record context switch events"),
2357         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2358                          "Configure all used events to run in kernel space.",
2359                          PARSE_OPT_EXCLUSIVE),
2360         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2361                          "Configure all used events to run in user space.",
2362                          PARSE_OPT_EXCLUSIVE),
2363         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2364                     "collect kernel callchains"),
2365         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2366                     "collect user callchains"),
2367         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2368                    "clang binary to use for compiling BPF scriptlets"),
2369         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2370                    "options passed to clang when compiling BPF scriptlets"),
2371         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2372                    "file", "vmlinux pathname"),
2373         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2374                     "Record build-id of all DSOs regardless of hits"),
2375         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2376                     "append timestamp to output filename"),
2377         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2378                     "Record timestamp boundary (time of first/last samples)"),
2379         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2380                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2381                           "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2382                           "signal"),
2383         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2384                    "Limit number of switch output generated files"),
2385         OPT_BOOLEAN(0, "dry-run", &dry_run,
2386                     "Parse options then exit"),
2387 #ifdef HAVE_AIO_SUPPORT
2388         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2389                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2390                      record__aio_parse),
2391 #endif
2392         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2393                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2394                      record__parse_affinity),
2395 #ifdef HAVE_ZSTD_SUPPORT
2396         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2397                             "n", "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2398                             record__parse_comp_level),
2399 #endif
2400         OPT_CALLBACK(0, "max-size", &record.output_max_size,
2401                      "size", "Limit the maximum size of the output file", parse_output_max_size),
2402         OPT_END()
2403 };
2404
2405 struct option *record_options = __record_options;
2406
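     /*
      * Entry point for 'perf record': parse the options, apply config file
      * settings, validate the target and build-id handling, set up the
      * evlist and auxtrace, then hand off to __cmd_record() to do the
      * actual recording.
      */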
2407 int cmd_record(int argc, const char **argv)
2408 {
2409         int err;
2410         struct record *rec = &record;
2411         char errbuf[BUFSIZ];
2412
2413         setlocale(LC_ALL, "");
2414
2415 #ifndef HAVE_LIBBPF_SUPPORT
2416 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2417         set_nobuild('\0', "clang-path", true);
2418         set_nobuild('\0', "clang-opt", true);
2419 # undef set_nobuild
2420 #endif
2421
2422 #ifndef HAVE_BPF_PROLOGUE
2423 # if !defined (HAVE_DWARF_SUPPORT)
2424 #  define REASON  "NO_DWARF=1"
2425 # elif !defined (HAVE_LIBBPF_SUPPORT)
2426 #  define REASON  "NO_LIBBPF=1"
2427 # else
2428 #  define REASON  "this architecture doesn't support BPF prologue"
2429 # endif
2430 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2431         set_nobuild('\0', "vmlinux", true);
2432 # undef set_nobuild
2433 # undef REASON
2434 #endif
2435
2436         CPU_ZERO(&rec->affinity_mask);
2437         rec->opts.affinity = PERF_AFFINITY_SYS;
2438
2439         rec->evlist = evlist__new();
2440         if (rec->evlist == NULL)
2441                 return -ENOMEM;
2442
2443         err = perf_config(perf_record_config, rec);
2444         if (err)
2445                 return err;
2446
2447         argc = parse_options(argc, argv, record_options, record_usage,
2448                             PARSE_OPT_STOP_AT_NON_OPTION);
2449         if (quiet)
2450                 perf_quiet_option();
2451
2452         /* Make system wide (-a) the default target. */
2453         if (!argc && target__none(&rec->opts.target))
2454                 rec->opts.target.system_wide = true;
2455
2456         if (nr_cgroups && !rec->opts.target.system_wide) {
2457                 usage_with_options_msg(record_usage, record_options,
2458                         "cgroup monitoring only available in system-wide mode");
2459
2460         }
2461
2462         if (rec->opts.kcore)
2463                 rec->data.is_dir = true;
2464
2465         if (rec->opts.comp_level != 0) {
2466                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2467                 rec->no_buildid = true;
2468         }
2469
2470         if (rec->opts.record_switch_events &&
2471             !perf_can_record_switch_events()) {
2472                 ui__error("kernel does not support recording context switch events\n");
2473                 parse_options_usage(record_usage, record_options, "switch-events", 0);
2474                 return -EINVAL;
2475         }
2476
2477         if (switch_output_setup(rec)) {
2478                 parse_options_usage(record_usage, record_options, "switch-output", 0);
2479                 return -EINVAL;
2480         }
2481
2482         if (rec->switch_output.time) {
2483                 signal(SIGALRM, alarm_sig_handler);
2484                 alarm(rec->switch_output.time);
2485         }
2486
2487         if (rec->switch_output.num_files) {
2488                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2489                                                       sizeof(char *));
2490                 if (!rec->switch_output.filenames)
2491                         return -EINVAL;
2492         }
2493
2494         /*
2495          * Allow aliases to facilitate the lookup of symbols for address
2496          * filters. Refer to auxtrace_parse_filters().
2497          */
2498         symbol_conf.allow_aliases = true;
2499
2500         symbol__init(NULL);
2501
2502         err = record__auxtrace_init(rec);
2503         if (err)
2504                 goto out;
2505
2506         if (dry_run)
2507                 goto out;
2508
2509         err = bpf__setup_stdout(rec->evlist);
2510         if (err) {
2511                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2512                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2513                          errbuf);
2514                 goto out;
2515         }
2516
2517         err = -ENOMEM;
2518
2519         if (rec->no_buildid_cache || rec->no_buildid) {
2520                 disable_buildid_cache();
2521         } else if (rec->switch_output.enabled) {
2522                 /*
2523                  * In 'perf record --switch-output', disable buildid
2524                  * generation by default to reduce data file switching
2525                  * overhead. Still generate buildids if they are required
2526                  * explicitly using
2527                  *
2528                  *  perf record --switch-output --no-no-buildid \
2529                  *              --no-no-buildid-cache
2530                  *
2531                  * Following code equals to:
2532                  *
2533                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
2534                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2535                  *         disable_buildid_cache();
2536                  */
2537                 bool disable = true;
2538
2539                 if (rec->no_buildid_set && !rec->no_buildid)
2540                         disable = false;
2541                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2542                         disable = false;
2543                 if (disable) {
2544                         rec->no_buildid = true;
2545                         rec->no_buildid_cache = true;
2546                         disable_buildid_cache();
2547                 }
2548         }
2549
2550         if (record.opts.overwrite)
2551                 record.opts.tail_synthesize = true;
2552
2553         if (rec->evlist->core.nr_entries == 0 &&
2554             __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2555                 pr_err("Not enough memory for event selector list\n");
2556                 goto out;
2557         }
2558
2559         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2560                 rec->opts.no_inherit = true;
2561
2562         err = target__validate(&rec->opts.target);
2563         if (err) {
2564                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2565                 ui__warning("%s\n", errbuf);
2566         }
2567
2568         err = target__parse_uid(&rec->opts.target);
2569         if (err) {
2570                 int saved_errno = errno;
2571
2572                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2573                 ui__error("%s", errbuf);
2574
2575                 err = -saved_errno;
2576                 goto out;
2577         }
2578
2579         /* Enable ignoring missing threads when -u/-p option is defined. */
2580         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2581
2582         err = -ENOMEM;
2583         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2584                 usage_with_options(record_usage, record_options);
2585
2586         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2587         if (err)
2588                 goto out;
2589
2590         /*
2591          * We take all buildids when the file contains
2592          * AUX area tracing data because we do not decode the
2593          * trace, as that would take too long.
2594          */
2595         if (rec->opts.full_auxtrace)
2596                 rec->buildid_all = true;
2597
2598         if (record_opts__config(&rec->opts)) {
2599                 err = -EINVAL;
2600                 goto out;
2601         }
2602
2603         if (rec->opts.nr_cblocks > nr_cblocks_max)
2604                 rec->opts.nr_cblocks = nr_cblocks_max;
2605         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2606
2607         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2608         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2609
2610         if (rec->opts.comp_level > comp_level_max)
2611                 rec->opts.comp_level = comp_level_max;
2612         pr_debug("comp level: %d\n", rec->opts.comp_level);
2613
2614         err = __cmd_record(&record, argc, argv);
2615 out:
2616         evlist__delete(rec->evlist);
2617         symbol__exit();
2618         auxtrace_record__free(rec->itr);
2619         return err;
2620 }
2621
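     /*
      * SIGUSR2 handler: take an AUX area tracing snapshot and/or request an
      * output file switch, depending on which triggers are armed.
      */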
2622 static void snapshot_sig_handler(int sig __maybe_unused)
2623 {
2624         struct record *rec = &record;
2625
2626         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2627                 trigger_hit(&auxtrace_snapshot_trigger);
2628                 auxtrace_record__snapshot_started = 1;
2629                 if (auxtrace_record__snapshot_start(record.itr))
2630                         trigger_error(&auxtrace_snapshot_trigger);
2631         }
2632
2633         if (switch_output_signal(rec))
2634                 trigger_hit(&switch_output_trigger);
2635 }
2636
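     /*
      * SIGALRM handler: fires when --switch-output is time based and
      * requests an output file switch.
      */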
2637 static void alarm_sig_handler(int sig __maybe_unused)
2638 {
2639         struct record *rec = &record;
2640
2641         if (switch_output_time(rec))
2642                 trigger_hit(&switch_output_trigger);
2643 }