LLVM OpenMP* Runtime Library
 All Classes Functions Variables Typedefs Enumerations Enumerator Modules Pages
kmp_stats.h
1 #ifndef KMP_STATS_H
2 #define KMP_STATS_H
3 
8 //===----------------------------------------------------------------------===//
9 //
10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11 // See https://llvm.org/LICENSE.txt for license information.
12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "kmp_config.h"
17 #include "kmp_debug.h"
18 
19 #if KMP_STATS_ENABLED
20 /* Statistics accumulator.
21  Accumulates number of samples and computes min, max, mean, standard deviation
22  on the fly.
23 
24  Online variance calculation algorithm from
25  http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26  */
27 
28 #include "kmp_stats_timing.h"
29 #include <limits>
30 #include <math.h>
31 #include <new> // placement new
32 #include <stdint.h>
33 #include <string>
34 #include <vector>
35 
36 /* Enable developer statistics here if you want them. They are more detailed
37  than is useful for application characterisation and are intended for the
38  runtime library developer. */
39 #define KMP_DEVELOPER_STATS 0
40 
41 /* Enable/Disable histogram output */
42 #define KMP_STATS_HIST 0
43 
50  noTotal = 1 << 0,
51  onlyInMaster = 1 << 1,
52  noUnits = 1 << 2,
53  notInMaster = 1 << 3,
54  logEvent = 1 << 4
55 };
57 
64  IDLE,
65  SERIAL_REGION,
66  FORK_JOIN_BARRIER,
67  PLAIN_BARRIER,
68  TASKWAIT,
69  TASKYIELD,
70  TASKGROUP,
71  IMPLICIT_TASK,
72  EXPLICIT_TASK
73 };
74 
// clang-format off
/* X-macro table of every statistics counter.
 *
 * For each entry, `macro` is invoked as macro(name, flags, arg), where
 *   name  - identifier of the counter (pasted after a prefix to build the
 *           counter_e enumerators),
 *   flags - bitwise OR of stats_flags_e values, or 0 for none,
 *   arg   - the caller-supplied argument, forwarded unchanged.
 * Add new counters by appending a line to this list.
 */
#define KMP_FOREACH_COUNTER(macro, arg)                                        \
  macro(OMP_PARALLEL,                                                          \
        stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg)             \
  macro(OMP_NESTED_PARALLEL,   0, arg)                                         \
  macro(OMP_LOOP_STATIC,       0, arg)                                         \
  macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
  macro(OMP_LOOP_DYNAMIC,      0, arg)                                         \
  macro(OMP_DISTRIBUTE,        0, arg)                                         \
  macro(OMP_BARRIER,           0, arg)                                         \
  macro(OMP_CRITICAL,          0, arg)                                         \
  macro(OMP_SINGLE,            0, arg)                                         \
  macro(OMP_MASTER,            0, arg)                                         \
  macro(OMP_TEAMS,             0, arg)                                         \
  macro(OMP_set_lock,          0, arg)                                         \
  macro(OMP_test_lock,         0, arg)                                         \
  macro(REDUCE_wait,           0, arg)                                         \
  macro(REDUCE_nowait,         0, arg)                                         \
  macro(OMP_TASKYIELD,         0, arg)                                         \
  macro(OMP_TASKLOOP,          0, arg)                                         \
  macro(TASK_executed,         0, arg)                                         \
  macro(TASK_cancelled,        0, arg)                                         \
  macro(TASK_stolen,           0, arg)
// clang-format on
116 
// clang-format off
/* X-macro table of every statistics timer.
 *
 * For each entry, `macro` is invoked as macro(name, flags, arg); `flags` is a
 * bitwise OR of stats_flags_e values (0 for none) and `arg` is forwarded
 * unchanged. The list ends by expanding KMP_FOREACH_DEVELOPER_TIMER, so the
 * developer timers (when enabled) follow the regular ones.
 */
#define KMP_FOREACH_TIMER(macro, arg)                                          \
  macro(OMP_worker_thread_life,      stats_flags_e::logEvent, arg)             \
  macro(OMP_parallel,                stats_flags_e::logEvent, arg)             \
  macro(OMP_parallel_overhead,       stats_flags_e::logEvent, arg)             \
  macro(OMP_loop_static,             0, arg)                                   \
  macro(OMP_loop_static_scheduling,  0, arg)                                   \
  macro(OMP_loop_dynamic,            0, arg)                                   \
  macro(OMP_loop_dynamic_scheduling, 0, arg)                                   \
  macro(OMP_critical,                0, arg)                                   \
  macro(OMP_critical_wait,           0, arg)                                   \
  macro(OMP_single,                  0, arg)                                   \
  macro(OMP_master,                  0, arg)                                   \
  macro(OMP_task_immediate,          0, arg)                                   \
  macro(OMP_task_taskwait,           0, arg)                                   \
  macro(OMP_task_taskyield,          0, arg)                                   \
  macro(OMP_task_taskgroup,          0, arg)                                   \
  macro(OMP_task_join_bar,           0, arg)                                   \
  macro(OMP_task_plain_bar,          0, arg)                                   \
  macro(OMP_taskloop_scheduling,     0, arg)                                   \
  macro(OMP_plain_barrier,           stats_flags_e::logEvent, arg)             \
  macro(OMP_idle,                    stats_flags_e::logEvent, arg)             \
  macro(OMP_fork_barrier,            stats_flags_e::logEvent, arg)             \
  macro(OMP_join_barrier,            stats_flags_e::logEvent, arg)             \
  macro(OMP_serial,                  stats_flags_e::logEvent, arg)             \
  macro(OMP_set_numthreads,                                                    \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_PARALLEL_args,                                                     \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_static_iterations,                                            \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_dynamic_iterations,                                           \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
// clang-format on
170 
171 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
172 // initializing OpenMP or being created by a master)
173 // until the thread is destroyed
174 // OMP_parallel -- Time thread spends executing work directly
175 // within a #pragma omp parallel
176 // OMP_parallel_overhead -- Time thread spends setting up a parallel region
177 // OMP_loop_static -- Time thread spends executing loop iterations from
178 // a statically scheduled loop
179 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
180 // from a statically scheduled loop
181 // OMP_loop_dynamic -- Time thread spends executing loop iterations from
182 // a dynamically scheduled loop
183 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
184 // from a dynamically scheduled loop
185 // OMP_critical -- Time thread spends executing critical section
186 // OMP_critical_wait -- Time thread spends waiting to enter
187 // a critical section
188 // OMP_single -- Time spent executing a "single" region
189 // OMP_master -- Time spent executing a "master" region
190 // OMP_task_immediate -- Time spent executing non-deferred tasks
191 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait
192 // construct
193 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield
194 // construct
195 // OMP_task_taskgroup -- Time spent executing tasks inside a taskgroup
196 // construct
197 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier
198 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier
199 // construct
200 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
201 // construct
202 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
203 // inside implicit barrier at end of worksharing
204 // construct
205 // OMP_idle -- Time worker threads spend waiting for next
206 // parallel region
207 // OMP_fork_barrier -- Time spent in the fork barrier surrounding a
208 // parallel region
209 // OMP_join_barrier -- Time spent in the join barrier surrounding a
210 // parallel region
211 // OMP_serial -- Time thread zero spends executing serial code
212 // OMP_set_numthreads -- Values passed to omp_set_num_threads
213 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region
214 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
215 // statically scheduled loops
216 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
217 // dynamically scheduled loops
218 
#if (KMP_DEVELOPER_STATS)
// Timers which are of interest to runtime library developers, not end users.
// These have to be explicitly enabled in addition to the other stats.
//
// NOTE(review): the descriptions below are out of step with the macro list.
// KMP_fork_barrier, KMP_join_barrier, KMP_barrier and KMP_icv_copy are
// described but not listed, while several listed timers (e.g. KMP_fork_call,
// KMP_join_call, KMP_hier_gather, USER_icv_copy) have no description.
//
// KMP_fork_barrier       -- time in __kmp_fork_barrier
// KMP_join_barrier       -- time in __kmp_join_barrier
// KMP_barrier            -- time in __kmp_barrier
// KMP_end_split_barrier  -- time in __kmp_end_split_barrier
// KMP_setup_icv_copy     -- time in __kmp_setup_icv_copy
// KMP_icv_copy           -- start/stop timer for any ICV copying
// KMP_linear_gather      -- time in __kmp_linear_barrier_gather
// KMP_linear_release     -- time in __kmp_linear_barrier_release
// KMP_tree_gather        -- time in __kmp_tree_barrier_gather
// KMP_tree_release       -- time in __kmp_tree_barrier_release
// KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
// KMP_hyper_release      -- time in __kmp_hyper_barrier_release
// clang-format off
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
  macro(KMP_fork_call,         0, arg)                                         \
  macro(KMP_join_call,         0, arg)                                         \
  macro(KMP_end_split_barrier, 0, arg)                                         \
  macro(KMP_hier_gather,       0, arg)                                         \
  macro(KMP_hier_release,      0, arg)                                         \
  macro(KMP_hyper_gather,      0, arg)                                         \
  macro(KMP_hyper_release,     0, arg)                                         \
  macro(KMP_linear_gather,     0, arg)                                         \
  macro(KMP_linear_release,    0, arg)                                         \
  macro(KMP_tree_gather,       0, arg)                                         \
  macro(KMP_tree_release,      0, arg)                                         \
  macro(USER_resume,           0, arg)                                         \
  macro(USER_suspend,          0, arg)                                         \
  macro(KMP_allocate_team,     0, arg)                                         \
  macro(KMP_setup_icv_copy,    0, arg)                                         \
  macro(USER_icv_copy,         0, arg)                                         \
  macro(FOR_static_steal_stolen,                                               \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(FOR_static_steal_chunks,                                               \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
#else
// Developer statistics disabled: the list expands to nothing.
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
#endif
// clang-format on
261 
281 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
282 
283 #define ENUMERATE(name, ignore, prefix) prefix##name,
284 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
285 
286 enum explicit_timer_e {
287  KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
288 };
289 
290 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
291 #undef ENUMERATE
292 
293 /*
294  * A logarithmic histogram. It accumulates the number of values in each power of
295  * ten bin. So 1<=x<10, 10<=x<100, ...
296  * Mostly useful where we have some big outliers and want to see information
297  * about them.
298  */
299 class logHistogram {
300  enum {
301  numBins = 31, /* Number of powers of 10. If this changes you need to change
302  * the initializer for binMax */
303 
304  /*
305  * If you want to use this to analyse values that may be less than 1, (for
306  * instance times in s), then the logOffset gives you negative powers.
307  * In our case here, we're just looking at times in ticks, or counts, so we
308  * can never see values with magnitude < 1 (other than zero), so we can set
309  * it to 0. As above change the initializer if you change this.
310  */
311  logOffset = 0
312  };
313  uint32_t KMP_ALIGN_CACHE zeroCount;
314  struct {
315  uint32_t count;
316  double total;
317  } bins[numBins];
318 
319  static double binMax[numBins];
320 
321 #ifdef KMP_DEBUG
322  uint64_t _total;
323 
324  void check() const {
325  uint64_t t = zeroCount;
326  for (int i = 0; i < numBins; i++)
327  t += bins[i].count;
328  KMP_DEBUG_ASSERT(t == _total);
329  }
330 #else
331  void check() const {}
332 #endif
333 
334 public:
335  logHistogram() { reset(); }
336 
337  logHistogram(logHistogram const &o) {
338  for (int i = 0; i < numBins; i++)
339  bins[i] = o.bins[i];
340 #ifdef KMP_DEBUG
341  _total = o._total;
342 #endif
343  }
344 
345  void reset() {
346  zeroCount = 0;
347  for (int i = 0; i < numBins; i++) {
348  bins[i].count = 0;
349  bins[i].total = 0;
350  }
351 
352 #ifdef KMP_DEBUG
353  _total = 0;
354 #endif
355  }
356  uint32_t count(int b) const { return bins[b + logOffset].count; }
357  double total(int b) const { return bins[b + logOffset].total; }
358  static uint32_t findBin(double sample);
359 
360  logHistogram &operator+=(logHistogram const &o) {
361  zeroCount += o.zeroCount;
362  for (int i = 0; i < numBins; i++) {
363  bins[i].count += o.bins[i].count;
364  bins[i].total += o.bins[i].total;
365  }
366 #ifdef KMP_DEBUG
367  _total += o._total;
368  check();
369 #endif
370 
371  return *this;
372  }
373 
374  void addSample(double sample);
375  int minBin() const;
376  int maxBin() const;
377 
378  std::string format(char) const;
379 };
380 
381 class statistic {
382  double KMP_ALIGN_CACHE minVal;
383  double maxVal;
384  double meanVal;
385  double m2;
386  uint64_t sampleCount;
387  double offset;
388  bool collectingHist;
389  logHistogram hist;
390 
391 public:
392  statistic(bool doHist = bool(KMP_STATS_HIST)) {
393  reset();
394  collectingHist = doHist;
395  }
396  statistic(statistic const &o)
397  : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
398  sampleCount(o.sampleCount), offset(o.offset),
399  collectingHist(o.collectingHist), hist(o.hist) {}
400  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
401  : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
402  sampleCount(sc), offset(0.0), collectingHist(false) {}
403  bool haveHist() const { return collectingHist; }
404  double getMin() const { return minVal; }
405  double getMean() const { return meanVal; }
406  double getMax() const { return maxVal; }
407  uint64_t getCount() const { return sampleCount; }
408  double getSD() const { return sqrt(m2 / sampleCount); }
409  double getTotal() const { return sampleCount * meanVal; }
410  logHistogram const *getHist() const { return &hist; }
411  void setOffset(double d) { offset = d; }
412 
413  void reset() {
414  minVal = std::numeric_limits<double>::max();
415  maxVal = -minVal;
416  meanVal = 0.0;
417  m2 = 0.0;
418  sampleCount = 0;
419  offset = 0.0;
420  hist.reset();
421  }
422  void addSample(double sample);
423  void scale(double factor);
424  void scaleDown(double f) { scale(1. / f); }
425  void forceCount(uint64_t count) { sampleCount = count; }
426  statistic &operator+=(statistic const &other);
427 
428  std::string format(char unit, bool total = false) const;
429  std::string formatHist(char unit) const { return hist.format(unit); }
430 };
431 
// Static description of one statistic (timer or counter).
struct statInfo {
  char const *name; // printable name of the statistic
  uint32_t flags;   // bitwise OR of stats_flags_e values
};
436 
437 class timeStat : public statistic {
438  static statInfo timerInfo[];
439 
440 public:
441  timeStat() : statistic() {}
442  static const char *name(timer_e e) { return timerInfo[e].name; }
443  static bool noTotal(timer_e e) {
444  return timerInfo[e].flags & stats_flags_e::noTotal;
445  }
446  static bool masterOnly(timer_e e) {
447  return timerInfo[e].flags & stats_flags_e::onlyInMaster;
448  }
449  static bool workerOnly(timer_e e) {
450  return timerInfo[e].flags & stats_flags_e::notInMaster;
451  }
452  static bool noUnits(timer_e e) {
453  return timerInfo[e].flags & stats_flags_e::noUnits;
454  }
455  static bool logEvent(timer_e e) {
456  return timerInfo[e].flags & stats_flags_e::logEvent;
457  }
458  static void clearEventFlags() {
459  for (int i = 0; i < TIMER_LAST; i++) {
460  timerInfo[i].flags &= (~(stats_flags_e::logEvent));
461  }
462  }
463 };
464 
465 // Where we need explicitly to start and end the timer, this version can be used
466 // Since these timers normally aren't nicely scoped, so don't have a good place
467 // to live on the stack of the thread, they're more work to use.
468 class explicitTimer {
469  timeStat *stat;
470  timer_e timerEnumValue;
471  tsc_tick_count startTime;
472  tsc_tick_count pauseStartTime;
473  tsc_tick_count::tsc_interval_t totalPauseTime;
474 
475 public:
476  explicitTimer(timeStat *s, timer_e te)
477  : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
478  totalPauseTime() {}
479 
480  // void setStat(timeStat *s) { stat = s; }
481  void start(tsc_tick_count tick);
482  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
483  void resume(tsc_tick_count tick) {
484  totalPauseTime += (tick - pauseStartTime);
485  }
486  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
487  void reset() {
488  startTime = 0;
489  pauseStartTime = 0;
490  totalPauseTime = 0;
491  }
492  timer_e get_type() const { return timerEnumValue; }
493 };
494 
495 // Where you need to partition a threads clock ticks into separate states
496 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
497 // DOING_NOTHING would render these conditions:
498 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
499 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
500 // versa
501 class partitionedTimers {
502 private:
503  std::vector<explicitTimer> timer_stack;
504 
505 public:
506  partitionedTimers();
507  void init(explicitTimer timer);
508  void exchange(explicitTimer timer);
509  void push(explicitTimer timer);
510  void pop();
511  void windup();
512 };
513 
514 // Special wrapper around the partioned timers to aid timing code blocks
515 // It avoids the need to have an explicit end, leaving the scope suffices.
516 class blockPartitionedTimer {
517  partitionedTimers *part_timers;
518 
519 public:
520  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
521  : part_timers(pt) {
522  part_timers->push(timer);
523  }
524  ~blockPartitionedTimer() { part_timers->pop(); }
525 };
526 
527 // Special wrapper around the thread state to aid in keeping state in code
528 // blocks It avoids the need to have an explicit end, leaving the scope
529 // suffices.
530 class blockThreadState {
531  stats_state_e *state_pointer;
532  stats_state_e old_state;
533 
534 public:
535  blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
536  : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
537  *state_pointer = new_state;
538  }
539  ~blockThreadState() { *state_pointer = old_state; }
540 };
541 
542 // If all you want is a count, then you can use this...
543 // The individual per-thread counts will be aggregated into a statistic at
544 // program exit.
545 class counter {
546  uint64_t value;
547  static const statInfo counterInfo[];
548 
549 public:
550  counter() : value(0) {}
551  void increment() { value++; }
552  uint64_t getValue() const { return value; }
553  void reset() { value = 0; }
554  static const char *name(counter_e e) { return counterInfo[e].name; }
555  static bool masterOnly(counter_e e) {
556  return counterInfo[e].flags & stats_flags_e::onlyInMaster;
557  }
558 };
559 
560 /* ****************************************************************
561  Class to implement an event
562 
563  There are four components to an event: start time, stop time
564  nest_level, and timer_name.
565  The start and stop time should be obvious (recorded in clock ticks).
566  The nest_level relates to the bar width in the timeline graph.
567  The timer_name is used to determine which timer event triggered this event.
568 
569  the interface to this class is through four read-only operations:
570  1) getStart() -- returns the start time as 64 bit integer
571  2) getStop() -- returns the stop time as 64 bit integer
572  3) getNestLevel() -- returns the nest level of the event
573  4) getTimerName() -- returns the timer name that triggered event
574 
575  *MORE ON NEST_LEVEL*
576  The nest level is used in the bar graph that represents the timeline.
577  Its main purpose is for showing how events are nested inside each other.
578  For example, say events, A, B, and C are recorded. If the timeline
579  looks like this:
580 
581 Begin -------------------------------------------------------------> Time
582  | | | | | |
583  A B C C B A
584  start start start end end end
585 
586  Then A, B, C will have a nest level of 1, 2, 3 respectively.
587  These values are then used to calculate the barwidth so you can
588  see that inside A, B has occurred, and inside B, C has occurred.
589  Currently, this is shown with A's bar width being larger than B's
590  bar width, and B's bar width being larger than C's bar width.
591 
592 **************************************************************** */
593 class kmp_stats_event {
594  uint64_t start;
595  uint64_t stop;
596  int nest_level;
597  timer_e timer_name;
598 
599 public:
600  kmp_stats_event()
601  : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
602  kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
603  : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
604  inline uint64_t getStart() const { return start; }
605  inline uint64_t getStop() const { return stop; }
606  inline int getNestLevel() const { return nest_level; }
607  inline timer_e getTimerName() const { return timer_name; }
608 };
609 
610 /* ****************************************************************
611  Class to implement a dynamically expandable array of events
612 
613  ---------------------------------------------------------
614  | event 1 | event 2 | event 3 | event 4 | ... | event N |
615  ---------------------------------------------------------
616 
617  An event is pushed onto the back of this array at every
618  explicitTimer->stop() call. The event records the thread #,
619  start time, stop time, and nest level related to the bar width.
620 
621  The event vector starts at size INIT_SIZE and grows (doubles in size)
622  if needed. An implication of this behavior is that log(N)
623  reallocations are needed (where N is number of events). If you want
624  to avoid reallocations, then set INIT_SIZE to a large value.
625 
626  the interface to this class is through six operations:
627  1) reset() -- sets the internal_size back to 0 but does not deallocate any
628  memory
629  2) size() -- returns the number of valid elements in the vector
630  3) push_back(start, stop, nest, timer_name) -- pushes an event onto
631  the back of the array
632  4) deallocate() -- frees all memory associated with the vector
633  5) sort() -- sorts the vector by start time
634  6) operator[index] or at(index) -- returns event reference at that index
635 **************************************************************** */
636 class kmp_stats_event_vector {
637  kmp_stats_event *events;
638  int internal_size;
639  int allocated_size;
640  static const int INIT_SIZE = 1024;
641 
642 public:
643  kmp_stats_event_vector() {
644  events =
645  (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
646  internal_size = 0;
647  allocated_size = INIT_SIZE;
648  }
649  ~kmp_stats_event_vector() {}
650  inline void reset() { internal_size = 0; }
651  inline int size() const { return internal_size; }
652  void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
653  timer_e name) {
654  int i;
655  if (internal_size == allocated_size) {
656  kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
657  sizeof(kmp_stats_event) * allocated_size * 2);
658  for (i = 0; i < internal_size; i++)
659  tmp[i] = events[i];
660  __kmp_free(events);
661  events = tmp;
662  allocated_size *= 2;
663  }
664  events[internal_size] =
665  kmp_stats_event(start_time, stop_time, nest_level, name);
666  internal_size++;
667  return;
668  }
669  void deallocate();
670  void sort();
671  const kmp_stats_event &operator[](int index) const { return events[index]; }
672  kmp_stats_event &operator[](int index) { return events[index]; }
673  const kmp_stats_event &at(int index) const { return events[index]; }
674  kmp_stats_event &at(int index) { return events[index]; }
675 };
676 
677 /* ****************************************************************
678  Class to implement a doubly-linked, circular, statistics list
679 
680  |---| ---> |---| ---> |---| ---> |---| ---> ... next
681  | | | | | | | |
682  |---| <--- |---| <--- |---| <--- |---| <--- ... prev
683  Sentinel first second third
684  Node node node node
685 
686  The Sentinel Node is the user handle on the list.
687  The first node corresponds to thread 0's statistics.
688  The second node corresponds to thread 1's statistics and so on...
689 
690  Each node has a _timers, _counters, and _explicitTimers array to hold that
691  thread's statistics. The _explicitTimers point to the correct _timer and
692  update its statistics at every stop() call. The explicitTimers' pointers are
693  set up in the constructor. Each node also has an event vector to hold that
694  thread's timing events. The event vector expands as necessary and records
695  the start-stop times for each timer.
696 
697  The nestLevel variable is for plotting events and is related
698  to the bar width in the timeline graph.
699 
700  Every thread will have a thread local pointer to its node in
701  the list. The sentinel node is used by the master thread to
702  store "dummy" statistics before __kmp_create_worker() is called.
703 **************************************************************** */
704 class kmp_stats_list {
705  int gtid;
706  timeStat _timers[TIMER_LAST + 1];
707  counter _counters[COUNTER_LAST + 1];
708  explicitTimer thread_life_timer;
709  partitionedTimers _partitionedTimers;
710  int _nestLevel; // one per thread
711  kmp_stats_event_vector _event_vector;
712  kmp_stats_list *next;
713  kmp_stats_list *prev;
714  stats_state_e state;
715  int thread_is_idle_flag;
716 
717 public:
718  kmp_stats_list()
719  : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
720  TIMER_OMP_worker_thread_life),
721  _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
722  thread_is_idle_flag(0) {}
723  ~kmp_stats_list() {}
724  inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
725  inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
726  inline partitionedTimers *getPartitionedTimers() {
727  return &_partitionedTimers;
728  }
729  inline timeStat *getTimers() { return _timers; }
730  inline counter *getCounters() { return _counters; }
731  inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
732  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
733  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
734  inline void resetEventVector() { _event_vector.reset(); }
735  inline void incrementNestValue() { _nestLevel++; }
736  inline int getNestValue() { return _nestLevel; }
737  inline void decrementNestValue() { _nestLevel--; }
738  inline int getGtid() const { return gtid; }
739  inline void setGtid(int newgtid) { gtid = newgtid; }
740  inline void setState(stats_state_e newstate) { state = newstate; }
741  inline stats_state_e getState() const { return state; }
742  inline stats_state_e *getStatePointer() { return &state; }
743  inline bool isIdle() { return thread_is_idle_flag == 1; }
744  inline void setIdleFlag() { thread_is_idle_flag = 1; }
745  inline void resetIdleFlag() { thread_is_idle_flag = 0; }
746  kmp_stats_list *push_back(int gtid); // returns newly created list node
747  inline void push_event(uint64_t start_time, uint64_t stop_time,
748  int nest_level, timer_e name) {
749  _event_vector.push_back(start_time, stop_time, nest_level, name);
750  }
751  void deallocate();
752  class iterator;
753  kmp_stats_list::iterator begin();
754  kmp_stats_list::iterator end();
755  int size();
756  class iterator {
757  kmp_stats_list *ptr;
758  friend kmp_stats_list::iterator kmp_stats_list::begin();
759  friend kmp_stats_list::iterator kmp_stats_list::end();
760 
761  public:
762  iterator();
763  ~iterator();
764  iterator operator++();
765  iterator operator++(int dummy);
766  iterator operator--();
767  iterator operator--(int dummy);
768  bool operator!=(const iterator &rhs);
769  bool operator==(const iterator &rhs);
770  kmp_stats_list *operator*() const; // dereference operator
771  };
772 };
773 
774 /* ****************************************************************
775  Class to encapsulate all output functions and the environment variables
776 
777  This module holds filenames for various outputs (normal stats, events, plot
778  file), as well as coloring information for the plot file.
779 
780  The filenames and flags variables are read from environment variables.
781  These are read once by the constructor of the global variable
782  __kmp_stats_output which calls init().
783 
784  During this init() call, event flags for the timeStat::timerInfo[] global
785  array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
786 
787  The only interface function that is public is outputStats(heading). This
788  function should print out everything it needs to, either to files or stderr,
789  depending on the environment variables described below
790 
791  ENVIRONMENT VARIABLES:
792  KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
793  file, otherwise, print to stderr
794  KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
795  either KMP_STATS_FILE or stderr
796  KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
797  otherwise, the plot file is sent to "events.plt"
798  KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
799  events
800  KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
801  otherwise, output is sent to "events.dat"
802 **************************************************************** */
803 class kmp_stats_output_module {
804 
805 public:
806  struct rgb_color {
807  float r;
808  float g;
809  float b;
810  };
811 
812 private:
813  std::string outputFileName;
814  static const char *eventsFileName;
815  static const char *plotFileName;
816  static int printPerThreadFlag;
817  static int printPerThreadEventsFlag;
818  static const rgb_color globalColorArray[];
819  static rgb_color timerColorInfo[];
820 
821  void init();
822  static void setupEventColors();
823  static void printPloticusFile();
824  static void printHeaderInfo(FILE *statsOut);
825  static void printTimerStats(FILE *statsOut, statistic const *theStats,
826  statistic const *totalStats);
827  static void printCounterStats(FILE *statsOut, statistic const *theStats);
828  static void printCounters(FILE *statsOut, counter const *theCounters);
829  static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
830  int gtid);
831  static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
832  static void windupExplicitTimers();
833  bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
834 
835 public:
836  kmp_stats_output_module() { init(); }
837  void outputStats(const char *heading);
838 };
839 
840 #ifdef __cplusplus
841 extern "C" {
842 #endif
843 void __kmp_stats_init();
844 void __kmp_stats_fini();
845 void __kmp_reset_stats();
846 void __kmp_output_stats(const char *);
847 void __kmp_accumulate_stats_at_exit(void);
848 // thread local pointer to stats node within list
849 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
850 // head to stats list.
851 extern kmp_stats_list *__kmp_stats_list;
852 // lock for __kmp_stats_list
853 extern kmp_tas_lock_t __kmp_stats_lock;
854 // reference start time
855 extern tsc_tick_count __kmp_stats_start_time;
856 // interface to output
857 extern kmp_stats_output_module __kmp_stats_output;
858 
859 #ifdef __cplusplus
860 }
861 #endif
862 
863 // Simple, standard interfaces that drop out completely if stats aren't enabled
864 
// All of the macros below act on the calling thread's stats node, reached
// through __kmp_stats_thread_ptr.

// Add `value` as a sample to the timer statistic TIMER_<name>.
#define KMP_COUNT_VALUE(name, value)                                           \
  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value)

// Increment the counter COUNTER_<name>.
#define KMP_COUNT_BLOCK(name)                                                  \
  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()

// Emit the statistics under the given heading.
#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)

// Initialize the partitioned timers with an explicit timer for TIMER_<name>.
#define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))

// Declare a scope guard that pushes TIMER_<name> now and pops it when the
// enclosing scope ends.
#define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
  blockPartitionedTimer __PBLOCKTIME__(                                        \
      __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))

// Push an explicit timer for TIMER_<name> onto the partitioned timers.
#define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))

// Pop the partitioned timers.
#define KMP_POP_PARTITIONED_TIMER()                                            \
  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()

// Call exchange() on the partitioned timers with a timer for TIMER_<name>.
#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))

// Set / read the thread state.
#define KMP_SET_THREAD_STATE(state_name)                                       \
  __kmp_stats_thread_ptr->setState(state_name)

#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()

// Declare a scope guard that installs `state_name` now and restores the
// previous state when the enclosing scope ends.
#define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
                                    state_name)

// Reset the statistics.
#define KMP_RESET_STATS() __kmp_reset_stats()
956 
// Developer variants of the interface macros: forwarded to the regular macros
// when KMP_DEVELOPER_STATS is on, no-ops otherwise.
#if (KMP_DEVELOPER_STATS)
// NOTE(review): KMP_TIME_BLOCK, KMP_START_EXPLICIT_TIMER and
// KMP_STOP_EXPLICIT_TIMER have no definition in the visible stats-enabled
// part of this header, so the three macros forwarding to them would not
// expand to anything usable -- confirm before enabling developer stats.
#define KMP_TIME_DEVELOPER_BLOCK(n)             KMP_TIME_BLOCK(n)
#define KMP_COUNT_DEVELOPER_VALUE(n, v)         KMP_COUNT_VALUE(n, v)
#define KMP_COUNT_DEVELOPER_BLOCK(n)            KMP_COUNT_BLOCK(n)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n)   KMP_START_EXPLICIT_TIMER(n)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n)    KMP_STOP_EXPLICIT_TIMER(n)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
#else
// Null definitions
#define KMP_TIME_DEVELOPER_BLOCK(n)             ((void)0)
#define KMP_COUNT_DEVELOPER_VALUE(n, v)         ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n)            ((void)0)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n)   ((void)0)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n)    ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#endif
973 
974 #else // KMP_STATS_ENABLED
975 
// Null definitions: with statistics disabled every interface macro expands to
// a no-op expression, so call sites need no #if guards of their own.
#define KMP_TIME_BLOCK(n) ((void)0)
#define KMP_COUNT_VALUE(n, v) ((void)0)
#define KMP_COUNT_BLOCK(n) ((void)0)
#define KMP_START_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_EXPLICIT_TIMER(n) ((void)0)

#define KMP_OUTPUT_STATS(heading_string) ((void)0)
#define KMP_RESET_STATS() ((void)0)

#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
#define KMP_POP_PARTITIONED_TIMER() ((void)0)
// Defined by the stats-enabled branch too, so it needs a no-op counterpart;
// without one an unguarded use fails to compile when stats are disabled.
#define KMP_EXCHANGE_PARTITIONED_TIMER(name) ((void)0)
#define KMP_SET_THREAD_STATE(state_name) ((void)0)
#define KMP_GET_THREAD_STATE() ((void)0)
#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
999 #endif // KMP_STATS_ENABLED
1000 
1001 #endif // KMP_STATS_H
statistic is valid only for master
Definition: kmp_stats.h:51
statistic is valid only for non-master threads
Definition: kmp_stats.h:53
do not show a TOTAL_aggregation for this statistic
Definition: kmp_stats.h:50
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)
Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
Definition: kmp_stats.h:281
statistic doesn't need units printed next to it
Definition: kmp_stats.h:52
stats_flags_e
flags to describe the statistic (timer or counter)
Definition: kmp_stats.h:49
#define KMP_FOREACH_COUNTER(macro, arg)
Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h.
Definition: kmp_stats.h:94
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:63