LLVM OpenMP* Runtime Library
kmp_alloc.cpp
1 /*
2  * kmp_alloc.cpp -- private/shared dynamic memory allocation and management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_io.h"
15 #include "kmp_wrapper_malloc.h"
16 
#if KMP_HWLOC_ENABLED
// Provide a single spelling, KMP_HWLOC_LOCATION_TYPE_CPUSET, for the hwloc
// cpuset location constant regardless of the hwloc API version in use.
#if HWLOC_API_VERSION > 0x00020300
// API versions above 0x00020300: the enumerator is used unqualified.
#define KMP_HWLOC_LOCATION_TYPE_CPUSET HWLOC_LOCATION_TYPE_CPUSET
#elif HWLOC_API_VERSION == 0x00020300
// API version 0x00020300: the enumerator is qualified with hwloc_location.
#define KMP_HWLOC_LOCATION_TYPE_CPUSET \
  hwloc_location::HWLOC_LOCATION_TYPE_CPUSET
#else
// Older API versions: declare the memory attribute ids locally (presumably
// because those hwloc headers do not provide them -- TODO confirm).
enum hwloc_memattr_id_e {
  HWLOC_MEMATTR_ID_BANDWIDTH,
  HWLOC_MEMATTR_ID_CAPACITY
};
#endif
#endif // KMP_HWLOC_ENABLED
30 
31 // Disable bget when it is not used
32 #if KMP_USE_BGET
33 
34 /* Thread private buffer management code */
35 
/* Hook signatures for automatic pool expansion (installed through bectl()). */

/* Compaction hook: receives the size being requested and a sequence number
   that bget() increments on every call; a zero return makes bget() stop
   retrying. */
typedef int (*bget_compact_t)(size_t size, int sequence);

/* Acquisition hook: obtains a block of the given size. */
typedef void *(*bget_acquire_t)(size_t size);

/* Release hook: gives back a block obtained through the acquisition hook. */
typedef void (*bget_release_t)(void *block);
39 
/* NOTE: bufsize must be a signed datatype: the sign of a buffer's size field
   tells allocated buffers (negative) from free ones (positive). */

#if !KMP_OS_WINDOWS
typedef ssize_t bufsize;
#elif KMP_ARCH_X86 || KMP_ARCH_ARM
typedef kmp_int32 bufsize; /* Windows on x86 / ARM */
#else
typedef kmp_int64 bufsize; /* remaining Windows targets */
#endif // KMP_OS_WINDOWS
51 
/* Free-list search policy used by bget(): fifo search, lifo search, or
   best-fit. */
typedef enum bget_mode {
  bget_mode_fifo = 0, /* walk each free list through the forward links */
  bget_mode_lifo,     /* walk each free list through the backward links */
  bget_mode_best      /* pick the smallest buffer in the bin that fits */
} bget_mode_t;
59 
60 static void bpool(kmp_info_t *th, void *buffer, bufsize len);
61 static void *bget(kmp_info_t *th, bufsize size);
62 static void *bgetz(kmp_info_t *th, bufsize size);
63 static void *bgetr(kmp_info_t *th, void *buffer, bufsize newsize);
64 static void brel(kmp_info_t *th, void *buf);
65 static void bectl(kmp_info_t *th, bget_compact_t compact,
66  bget_acquire_t acquire, bget_release_t release,
67  bufsize pool_incr);
68 
69 /* BGET CONFIGURATION */
70 /* Buffer allocation size quantum: all buffers allocated are a
71  multiple of this size. This MUST be a power of two. */
72 
73 /* On some architectures, malloc() does not ensure 16 byte alignment,
74  Solaris/sparc and x86 among them. */
75 
#if !(KMP_ARCH_X86 || KMP_ARCH_SPARC || !KMP_HAVE_QUAD)

/* _Quad is available and the target is neither x86 nor SPARC: use a 16 byte
   quantum. */
#define SizeQuant 16
#define AlignType _Quad

#else

/* x86, SPARC, or no _Quad support: use an 8 byte quantum. */
#define SizeQuant 8
#define AlignType double

#endif
87 
// BufStats: keep allocation statistics (space currently allocated, call
// counters, pool block counters) in the per-thread data. Always enabled.
#define BufStats 1

#if defined(KMP_DEBUG)

// BufDump: enable the bpoold() function which dumps the buffers in a buffer
// pool.
#define BufDump 1

// BufValid: enable the bpoolv() function for validating a buffer pool.
#define BufValid 1

// DumpData: enable the bufdump() function which allows dumping the contents
// of an allocated or free buffer.
#define DumpData 1

#if defined(NOT_USED_NOW)

// FreeWipe: wipe free buffers to a guaranteed pattern of garbage to trip up
// code that keeps using pointers into released buffers.
#define FreeWipe 1

// BestFit: use a best fit algorithm when searching for space for an
// allocation request. This uses memory more efficiently, but allocation will
// be much slower.
#define BestFit 1

#endif /* NOT_USED_NOW */
#endif /* KMP_DEBUG */
119 
120 static bufsize bget_bin_size[] = {
121  0,
122  // 1 << 6, /* .5 Cache line */
123  1 << 7, /* 1 Cache line, new */
124  1 << 8, /* 2 Cache lines */
125  1 << 9, /* 4 Cache lines, new */
126  1 << 10, /* 8 Cache lines */
127  1 << 11, /* 16 Cache lines, new */
128  1 << 12, 1 << 13, /* new */
129  1 << 14, 1 << 15, /* new */
130  1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, /* 1MB */
131  1 << 21, /* 2MB */
132  1 << 22, /* 4MB */
133  1 << 23, /* 8MB */
134  1 << 24, /* 16MB */
135  1 << 25, /* 32MB */
136 };
137 
138 #define MAX_BGET_BINS (int)(sizeof(bget_bin_size) / sizeof(bufsize))
139 
struct bfhead; /* defined below; needed here for the link pointers */

/* Doubly linked free-list links stored in every free buffer and in every
   free-list head. */
typedef struct qlinks {
  struct bfhead *flink; /* Forward link */
  struct bfhead *blink; /* Backward link */
} qlinks_t;
149 
150 /* Header in allocated and free buffers */
151 typedef struct bhead2 {
152  kmp_info_t *bthr; /* The thread which owns the buffer pool */
153  bufsize prevfree; /* Relative link back to previous free buffer in memory or
154  0 if previous buffer is allocated. */
155  bufsize bsize; /* Buffer size: positive if free, negative if allocated. */
156 } bhead2_t;
157 
158 /* Make sure the bhead structure is a multiple of SizeQuant in size. */
159 typedef union bhead {
160  KMP_ALIGN(SizeQuant)
161  AlignType b_align;
162  char b_pad[sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant))];
163  bhead2_t bb;
164 } bhead_t;
165 #define BH(p) ((bhead_t *)(p))
166 
167 /* Header in directly allocated buffers (by acqfcn) */
168 typedef struct bdhead {
169  bufsize tsize; /* Total size, including overhead */
170  bhead_t bh; /* Common header */
171 } bdhead_t;
172 #define BDH(p) ((bdhead_t *)(p))
173 
174 /* Header in free buffers */
175 typedef struct bfhead {
176  bhead_t bh; /* Common allocated/free header */
177  qlinks_t ql; /* Links on free list */
178 } bfhead_t;
179 #define BFH(p) ((bfhead_t *)(p))
180 
181 typedef struct thr_data {
182  bfhead_t freelist[MAX_BGET_BINS];
183 #if BufStats
184  size_t totalloc; /* Total space currently allocated */
185  long numget, numrel; /* Number of bget() and brel() calls */
186  long numpblk; /* Number of pool blocks */
187  long numpget, numprel; /* Number of block gets and rels */
188  long numdget, numdrel; /* Number of direct gets and rels */
189 #endif /* BufStats */
190 
191  /* Automatic expansion block management functions */
192  bget_compact_t compfcn;
193  bget_acquire_t acqfcn;
194  bget_release_t relfcn;
195 
196  bget_mode_t mode; /* what allocation mode to use? */
197 
198  bufsize exp_incr; /* Expansion block size */
199  bufsize pool_len; /* 0: no bpool calls have been made
200  -1: not all pool blocks are the same size
201  >0: (common) block size for all bpool calls made so far
202  */
203  bfhead_t *last_pool; /* Last pool owned by this thread (delay deallocation) */
204 } thr_data_t;
205 
/* Minimum allocation quantum: the larger of SizeQuant and the space needed
   for the free-list links. */
#define QLSize (sizeof(qlinks_t))
#define SizeQ ((SizeQuant > QLSize) ? SizeQuant : QLSize)

/* Maximum for the requested size: every bit of a bufsize set, except the
   sign bit and the low bits covered by (SizeQuant - 1). */
#define MaxSize \
  (bufsize)( \
      ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1)))

/* End sentinel: value placed in bsize field of dummy block delimiting
   end of pool block. The most negative number which will fit in a
   bufsize, defined in a way that the compiler will accept. */
#define ESent \
  ((bufsize)(-(((((bufsize)1) << ((int)sizeof(bufsize) * 8 - 2)) - 1) * 2) - 2))
220 
221 /* Thread Data management routines */
222 static int bget_get_bin(bufsize size) {
223  // binary chop bins
224  int lo = 0, hi = MAX_BGET_BINS - 1;
225 
226  KMP_DEBUG_ASSERT(size > 0);
227 
228  while ((hi - lo) > 1) {
229  int mid = (lo + hi) >> 1;
230  if (size < bget_bin_size[mid])
231  hi = mid - 1;
232  else
233  lo = mid;
234  }
235 
236  KMP_DEBUG_ASSERT((lo >= 0) && (lo < MAX_BGET_BINS));
237 
238  return lo;
239 }
240 
241 static void set_thr_data(kmp_info_t *th) {
242  int i;
243  thr_data_t *data;
244 
245  data = (thr_data_t *)((!th->th.th_local.bget_data)
246  ? __kmp_allocate(sizeof(*data))
247  : th->th.th_local.bget_data);
248 
249  memset(data, '\0', sizeof(*data));
250 
251  for (i = 0; i < MAX_BGET_BINS; ++i) {
252  data->freelist[i].ql.flink = &data->freelist[i];
253  data->freelist[i].ql.blink = &data->freelist[i];
254  }
255 
256  th->th.th_local.bget_data = data;
257  th->th.th_local.bget_list = 0;
258 #if !USE_CMP_XCHG_FOR_BGET
259 #ifdef USE_QUEUING_LOCK_FOR_BGET
260  __kmp_init_lock(&th->th.th_local.bget_lock);
261 #else
262  __kmp_init_bootstrap_lock(&th->th.th_local.bget_lock);
263 #endif /* USE_LOCK_FOR_BGET */
264 #endif /* ! USE_CMP_XCHG_FOR_BGET */
265 }
266 
267 static thr_data_t *get_thr_data(kmp_info_t *th) {
268  thr_data_t *data;
269 
270  data = (thr_data_t *)th->th.th_local.bget_data;
271 
272  KMP_DEBUG_ASSERT(data != 0);
273 
274  return data;
275 }
276 
277 /* Walk the free list and release the enqueued buffers */
278 static void __kmp_bget_dequeue(kmp_info_t *th) {
279  void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);
280 
281  if (p != 0) {
282 #if USE_CMP_XCHG_FOR_BGET
283  {
284  volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
285  while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
286  CCAST(void *, old_value), nullptr)) {
287  KMP_CPU_PAUSE();
288  old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
289  }
290  p = CCAST(void *, old_value);
291  }
292 #else /* ! USE_CMP_XCHG_FOR_BGET */
293 #ifdef USE_QUEUING_LOCK_FOR_BGET
294  __kmp_acquire_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
295 #else
296  __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
297 #endif /* USE_QUEUING_LOCK_FOR_BGET */
298 
299  p = (void *)th->th.th_local.bget_list;
300  th->th.th_local.bget_list = 0;
301 
302 #ifdef USE_QUEUING_LOCK_FOR_BGET
303  __kmp_release_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
304 #else
305  __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
306 #endif
307 #endif /* USE_CMP_XCHG_FOR_BGET */
308 
309  /* Check again to make sure the list is not empty */
310  while (p != 0) {
311  void *buf = p;
312  bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));
313 
314  KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
315  KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
316  (kmp_uintptr_t)th); // clear possible mark
317  KMP_DEBUG_ASSERT(b->ql.blink == 0);
318 
319  p = (void *)b->ql.flink;
320 
321  brel(th, buf);
322  }
323  }
324 }
325 
326 /* Chain together the free buffers by using the thread owner field */
327 static void __kmp_bget_enqueue(kmp_info_t *th, void *buf
328 #ifdef USE_QUEUING_LOCK_FOR_BGET
329  ,
330  kmp_int32 rel_gtid
331 #endif
332 ) {
333  bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t));
334 
335  KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
336  KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
337  (kmp_uintptr_t)th); // clear possible mark
338 
339  b->ql.blink = 0;
340 
341  KC_TRACE(10, ("__kmp_bget_enqueue: moving buffer to T#%d list\n",
342  __kmp_gtid_from_thread(th)));
343 
344 #if USE_CMP_XCHG_FOR_BGET
345  {
346  volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);
347  /* the next pointer must be set before setting bget_list to buf to avoid
348  exposing a broken list to other threads, even for an instant. */
349  b->ql.flink = BFH(CCAST(void *, old_value));
350 
351  while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
352  CCAST(void *, old_value), buf)) {
353  KMP_CPU_PAUSE();
354  old_value = TCR_PTR(th->th.th_local.bget_list);
355  /* the next pointer must be set before setting bget_list to buf to avoid
356  exposing a broken list to other threads, even for an instant. */
357  b->ql.flink = BFH(CCAST(void *, old_value));
358  }
359  }
360 #else /* ! USE_CMP_XCHG_FOR_BGET */
361 #ifdef USE_QUEUING_LOCK_FOR_BGET
362  __kmp_acquire_lock(&th->th.th_local.bget_lock, rel_gtid);
363 #else
364  __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
365 #endif
366 
367  b->ql.flink = BFH(th->th.th_local.bget_list);
368  th->th.th_local.bget_list = (void *)buf;
369 
370 #ifdef USE_QUEUING_LOCK_FOR_BGET
371  __kmp_release_lock(&th->th.th_local.bget_lock, rel_gtid);
372 #else
373  __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
374 #endif
375 #endif /* USE_CMP_XCHG_FOR_BGET */
376 }
377 
378 /* insert buffer back onto a new freelist */
379 static void __kmp_bget_insert_into_freelist(thr_data_t *thr, bfhead_t *b) {
380  int bin;
381 
382  KMP_DEBUG_ASSERT(((size_t)b) % SizeQuant == 0);
383  KMP_DEBUG_ASSERT(b->bh.bb.bsize % SizeQuant == 0);
384 
385  bin = bget_get_bin(b->bh.bb.bsize);
386 
387  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.blink->ql.flink ==
388  &thr->freelist[bin]);
389  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.flink->ql.blink ==
390  &thr->freelist[bin]);
391 
392  b->ql.flink = &thr->freelist[bin];
393  b->ql.blink = thr->freelist[bin].ql.blink;
394 
395  thr->freelist[bin].ql.blink = b;
396  b->ql.blink->ql.flink = b;
397 }
398 
399 /* unlink the buffer from the old freelist */
400 static void __kmp_bget_remove_from_freelist(bfhead_t *b) {
401  KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
402  KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
403 
404  b->ql.blink->ql.flink = b->ql.flink;
405  b->ql.flink->ql.blink = b->ql.blink;
406 }
407 
408 /* GET STATS -- check info on free list */
409 static void bcheck(kmp_info_t *th, bufsize *max_free, bufsize *total_free) {
410  thr_data_t *thr = get_thr_data(th);
411  int bin;
412 
413  *total_free = *max_free = 0;
414 
415  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
416  bfhead_t *b, *best;
417 
418  best = &thr->freelist[bin];
419  b = best->ql.flink;
420 
421  while (b != &thr->freelist[bin]) {
422  *total_free += (b->bh.bb.bsize - sizeof(bhead_t));
423  if ((best == &thr->freelist[bin]) || (b->bh.bb.bsize < best->bh.bb.bsize))
424  best = b;
425 
426  /* Link to next buffer */
427  b = b->ql.flink;
428  }
429 
430  if (*max_free < best->bh.bb.bsize)
431  *max_free = best->bh.bb.bsize;
432  }
433 
434  if (*max_free > (bufsize)sizeof(bhead_t))
435  *max_free -= sizeof(bhead_t);
436 }
437 
438 /* BGET -- Allocate a buffer. */
439 static void *bget(kmp_info_t *th, bufsize requested_size) {
440  thr_data_t *thr = get_thr_data(th);
441  bufsize size = requested_size;
442  bfhead_t *b;
443  void *buf;
444  int compactseq = 0;
445  int use_blink = 0;
446  /* For BestFit */
447  bfhead_t *best;
448 
449  if (size < 0 || size + sizeof(bhead_t) > MaxSize) {
450  return NULL;
451  }
452 
453  __kmp_bget_dequeue(th); /* Release any queued buffers */
454 
455  if (size < (bufsize)SizeQ) { // Need at least room for the queue links.
456  size = SizeQ;
457  }
458 #if defined(SizeQuant) && (SizeQuant > 1)
459  size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
460 #endif
461 
462  size += sizeof(bhead_t); // Add overhead in allocated buffer to size required.
463  KMP_DEBUG_ASSERT(size >= 0);
464  KMP_DEBUG_ASSERT(size % SizeQuant == 0);
465 
466  use_blink = (thr->mode == bget_mode_lifo);
467 
468  /* If a compact function was provided in the call to bectl(), wrap
469  a loop around the allocation process to allow compaction to
470  intervene in case we don't find a suitable buffer in the chain. */
471 
472  for (;;) {
473  int bin;
474 
475  for (bin = bget_get_bin(size); bin < MAX_BGET_BINS; ++bin) {
476  /* Link to next buffer */
477  b = (use_blink ? thr->freelist[bin].ql.blink
478  : thr->freelist[bin].ql.flink);
479 
480  if (thr->mode == bget_mode_best) {
481  best = &thr->freelist[bin];
482 
483  /* Scan the free list searching for the first buffer big enough
484  to hold the requested size buffer. */
485  while (b != &thr->freelist[bin]) {
486  if (b->bh.bb.bsize >= (bufsize)size) {
487  if ((best == &thr->freelist[bin]) ||
488  (b->bh.bb.bsize < best->bh.bb.bsize)) {
489  best = b;
490  }
491  }
492 
493  /* Link to next buffer */
494  b = (use_blink ? b->ql.blink : b->ql.flink);
495  }
496  b = best;
497  }
498 
499  while (b != &thr->freelist[bin]) {
500  if ((bufsize)b->bh.bb.bsize >= (bufsize)size) {
501 
502  // Buffer is big enough to satisfy the request. Allocate it to the
503  // caller. We must decide whether the buffer is large enough to split
504  // into the part given to the caller and a free buffer that remains
505  // on the free list, or whether the entire buffer should be removed
506  // from the free list and given to the caller in its entirety. We
507  // only split the buffer if enough room remains for a header plus the
508  // minimum quantum of allocation.
509  if ((b->bh.bb.bsize - (bufsize)size) >
510  (bufsize)(SizeQ + (sizeof(bhead_t)))) {
511  bhead_t *ba, *bn;
512 
513  ba = BH(((char *)b) + (b->bh.bb.bsize - (bufsize)size));
514  bn = BH(((char *)ba) + size);
515 
516  KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);
517 
518  /* Subtract size from length of free block. */
519  b->bh.bb.bsize -= (bufsize)size;
520 
521  /* Link allocated buffer to the previous free buffer. */
522  ba->bb.prevfree = b->bh.bb.bsize;
523 
524  /* Plug negative size into user buffer. */
525  ba->bb.bsize = -size;
526 
527  /* Mark this buffer as owned by this thread. */
528  TCW_PTR(ba->bb.bthr,
529  th); // not an allocated address (do not mark it)
530  /* Mark buffer after this one not preceded by free block. */
531  bn->bb.prevfree = 0;
532 
533  // unlink buffer from old freelist, and reinsert into new freelist
534  __kmp_bget_remove_from_freelist(b);
535  __kmp_bget_insert_into_freelist(thr, b);
536 #if BufStats
537  thr->totalloc += (size_t)size;
538  thr->numget++; /* Increment number of bget() calls */
539 #endif
540  buf = (void *)((((char *)ba) + sizeof(bhead_t)));
541  KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
542  return buf;
543  } else {
544  bhead_t *ba;
545 
546  ba = BH(((char *)b) + b->bh.bb.bsize);
547 
548  KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);
549 
550  /* The buffer isn't big enough to split. Give the whole
551  shebang to the caller and remove it from the free list. */
552 
553  __kmp_bget_remove_from_freelist(b);
554 #if BufStats
555  thr->totalloc += (size_t)b->bh.bb.bsize;
556  thr->numget++; /* Increment number of bget() calls */
557 #endif
558  /* Negate size to mark buffer allocated. */
559  b->bh.bb.bsize = -(b->bh.bb.bsize);
560 
561  /* Mark this buffer as owned by this thread. */
562  TCW_PTR(ba->bb.bthr, th); // not an allocated address (do not mark)
563  /* Zero the back pointer in the next buffer in memory
564  to indicate that this buffer is allocated. */
565  ba->bb.prevfree = 0;
566 
567  /* Give user buffer starting at queue links. */
568  buf = (void *)&(b->ql);
569  KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
570  return buf;
571  }
572  }
573 
574  /* Link to next buffer */
575  b = (use_blink ? b->ql.blink : b->ql.flink);
576  }
577  }
578 
579  /* We failed to find a buffer. If there's a compact function defined,
580  notify it of the size requested. If it returns TRUE, try the allocation
581  again. */
582 
583  if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {
584  break;
585  }
586  }
587 
588  /* No buffer available with requested size free. */
589 
590  /* Don't give up yet -- look in the reserve supply. */
591  if (thr->acqfcn != 0) {
592  if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) {
593  /* Request is too large to fit in a single expansion block.
594  Try to satisfy it by a direct buffer acquisition. */
595  bdhead_t *bdh;
596 
597  size += sizeof(bdhead_t) - sizeof(bhead_t);
598 
599  KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", (int)size));
600 
601  /* richryan */
602  bdh = BDH((*thr->acqfcn)((bufsize)size));
603  if (bdh != NULL) {
604 
605  // Mark the buffer special by setting size field of its header to zero.
606  bdh->bh.bb.bsize = 0;
607 
608  /* Mark this buffer as owned by this thread. */
609  TCW_PTR(bdh->bh.bb.bthr, th); // don't mark buffer as allocated,
610  // because direct buffer never goes to free list
611  bdh->bh.bb.prevfree = 0;
612  bdh->tsize = size;
613 #if BufStats
614  thr->totalloc += (size_t)size;
615  thr->numget++; /* Increment number of bget() calls */
616  thr->numdget++; /* Direct bget() call count */
617 #endif
618  buf = (void *)(bdh + 1);
619  KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
620  return buf;
621  }
622 
623  } else {
624 
625  /* Try to obtain a new expansion block */
626  void *newpool;
627 
628  KE_TRACE(10, ("%%%%%% MALLOCB( %d )\n", (int)thr->exp_incr));
629 
630  /* richryan */
631  newpool = (*thr->acqfcn)((bufsize)thr->exp_incr);
632  KMP_DEBUG_ASSERT(((size_t)newpool) % SizeQuant == 0);
633  if (newpool != NULL) {
634  bpool(th, newpool, thr->exp_incr);
635  buf = bget(
636  th, requested_size); /* This can't, I say, can't get into a loop. */
637  return buf;
638  }
639  }
640  }
641 
642  /* Still no buffer available */
643 
644  return NULL;
645 }
646 
647 /* BGETZ -- Allocate a buffer and clear its contents to zero. We clear
648  the entire contents of the buffer to zero, not just the
649  region requested by the caller. */
650 
651 static void *bgetz(kmp_info_t *th, bufsize size) {
652  char *buf = (char *)bget(th, size);
653 
654  if (buf != NULL) {
655  bhead_t *b;
656  bufsize rsize;
657 
658  b = BH(buf - sizeof(bhead_t));
659  rsize = -(b->bb.bsize);
660  if (rsize == 0) {
661  bdhead_t *bd;
662 
663  bd = BDH(buf - sizeof(bdhead_t));
664  rsize = bd->tsize - (bufsize)sizeof(bdhead_t);
665  } else {
666  rsize -= sizeof(bhead_t);
667  }
668 
669  KMP_DEBUG_ASSERT(rsize >= size);
670 
671  (void)memset(buf, 0, (bufsize)rsize);
672  }
673  return ((void *)buf);
674 }
675 
676 /* BGETR -- Reallocate a buffer. This is a minimal implementation,
677  simply in terms of brel() and bget(). It could be
678  enhanced to allow the buffer to grow into adjacent free
679  blocks and to avoid moving data unnecessarily. */
680 
681 static void *bgetr(kmp_info_t *th, void *buf, bufsize size) {
682  void *nbuf;
683  bufsize osize; /* Old size of buffer */
684  bhead_t *b;
685 
686  nbuf = bget(th, size);
687  if (nbuf == NULL) { /* Acquire new buffer */
688  return NULL;
689  }
690  if (buf == NULL) {
691  return nbuf;
692  }
693  b = BH(((char *)buf) - sizeof(bhead_t));
694  osize = -b->bb.bsize;
695  if (osize == 0) {
696  /* Buffer acquired directly through acqfcn. */
697  bdhead_t *bd;
698 
699  bd = BDH(((char *)buf) - sizeof(bdhead_t));
700  osize = bd->tsize - (bufsize)sizeof(bdhead_t);
701  } else {
702  osize -= sizeof(bhead_t);
703  }
704 
705  KMP_DEBUG_ASSERT(osize > 0);
706 
707  (void)KMP_MEMCPY((char *)nbuf, (char *)buf, /* Copy the data */
708  (size_t)((size < osize) ? size : osize));
709  brel(th, buf);
710 
711  return nbuf;
712 }
713 
714 /* BREL -- Release a buffer. */
715 static void brel(kmp_info_t *th, void *buf) {
716  thr_data_t *thr = get_thr_data(th);
717  bfhead_t *b, *bn;
718  kmp_info_t *bth;
719 
720  KMP_DEBUG_ASSERT(buf != NULL);
721  KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
722 
723  b = BFH(((char *)buf) - sizeof(bhead_t));
724 
725  if (b->bh.bb.bsize == 0) { /* Directly-acquired buffer? */
726  bdhead_t *bdh;
727 
728  bdh = BDH(((char *)buf) - sizeof(bdhead_t));
729  KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
730 #if BufStats
731  thr->totalloc -= (size_t)bdh->tsize;
732  thr->numdrel++; /* Number of direct releases */
733  thr->numrel++; /* Increment number of brel() calls */
734 #endif /* BufStats */
735 #ifdef FreeWipe
736  (void)memset((char *)buf, 0x55, (size_t)(bdh->tsize - sizeof(bdhead_t)));
737 #endif /* FreeWipe */
738 
739  KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)bdh));
740 
741  KMP_DEBUG_ASSERT(thr->relfcn != 0);
742  (*thr->relfcn)((void *)bdh); /* Release it directly. */
743  return;
744  }
745 
746  bth = (kmp_info_t *)((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) &
747  ~1); // clear possible mark before comparison
748  if (bth != th) {
749  /* Add this buffer to be released by the owning thread later */
750  __kmp_bget_enqueue(bth, buf
751 #ifdef USE_QUEUING_LOCK_FOR_BGET
752  ,
753  __kmp_gtid_from_thread(th)
754 #endif
755  );
756  return;
757  }
758 
759  /* Buffer size must be negative, indicating that the buffer is allocated. */
760  if (b->bh.bb.bsize >= 0) {
761  bn = NULL;
762  }
763  KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);
764 
765  /* Back pointer in next buffer must be zero, indicating the same thing: */
766 
767  KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.bsize)->bb.prevfree == 0);
768 
769 #if BufStats
770  thr->numrel++; /* Increment number of brel() calls */
771  thr->totalloc += (size_t)b->bh.bb.bsize;
772 #endif
773 
774  /* If the back link is nonzero, the previous buffer is free. */
775 
776  if (b->bh.bb.prevfree != 0) {
777  /* The previous buffer is free. Consolidate this buffer with it by adding
778  the length of this buffer to the previous free buffer. Note that we
779  subtract the size in the buffer being released, since it's negative to
780  indicate that the buffer is allocated. */
781  bufsize size = b->bh.bb.bsize;
782 
783  /* Make the previous buffer the one we're working on. */
784  KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.prevfree)->bb.bsize ==
785  b->bh.bb.prevfree);
786  b = BFH(((char *)b) - b->bh.bb.prevfree);
787  b->bh.bb.bsize -= size;
788 
789  /* unlink the buffer from the old freelist */
790  __kmp_bget_remove_from_freelist(b);
791  } else {
792  /* The previous buffer isn't allocated. Mark this buffer size as positive
793  (i.e. free) and fall through to place the buffer on the free list as an
794  isolated free block. */
795  b->bh.bb.bsize = -b->bh.bb.bsize;
796  }
797 
798  /* insert buffer back onto a new freelist */
799  __kmp_bget_insert_into_freelist(thr, b);
800 
801  /* Now we look at the next buffer in memory, located by advancing from
802  the start of this buffer by its size, to see if that buffer is
803  free. If it is, we combine this buffer with the next one in
804  memory, dechaining the second buffer from the free list. */
805  bn = BFH(((char *)b) + b->bh.bb.bsize);
806  if (bn->bh.bb.bsize > 0) {
807 
808  /* The buffer is free. Remove it from the free list and add
809  its size to that of our buffer. */
810  KMP_DEBUG_ASSERT(BH((char *)bn + bn->bh.bb.bsize)->bb.prevfree ==
811  bn->bh.bb.bsize);
812 
813  __kmp_bget_remove_from_freelist(bn);
814 
815  b->bh.bb.bsize += bn->bh.bb.bsize;
816 
817  /* unlink the buffer from the old freelist, and reinsert it into the new
818  * freelist */
819  __kmp_bget_remove_from_freelist(b);
820  __kmp_bget_insert_into_freelist(thr, b);
821 
822  /* Finally, advance to the buffer that follows the newly
823  consolidated free block. We must set its backpointer to the
824  head of the consolidated free block. We know the next block
825  must be an allocated block because the process of recombination
826  guarantees that two free blocks will never be contiguous in
827  memory. */
828  bn = BFH(((char *)b) + b->bh.bb.bsize);
829  }
830 #ifdef FreeWipe
831  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
832  (size_t)(b->bh.bb.bsize - sizeof(bfhead_t)));
833 #endif
834  KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);
835 
836  /* The next buffer is allocated. Set the backpointer in it to point
837  to this buffer; the previous free buffer in memory. */
838 
839  bn->bh.bb.prevfree = b->bh.bb.bsize;
840 
841  /* If a block-release function is defined, and this free buffer
842  constitutes the entire block, release it. Note that pool_len
843  is defined in such a way that the test will fail unless all
844  pool blocks are the same size. */
845  if (thr->relfcn != 0 &&
846  b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
847 #if BufStats
848  if (thr->numpblk !=
849  1) { /* Do not release the last buffer until finalization time */
850 #endif
851 
852  KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
853  KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
854  KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
855  b->bh.bb.bsize);
856 
857  /* Unlink the buffer from the free list */
858  __kmp_bget_remove_from_freelist(b);
859 
860  KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
861 
862  (*thr->relfcn)(b);
863 #if BufStats
864  thr->numprel++; /* Nr of expansion block releases */
865  thr->numpblk--; /* Total number of blocks */
866  KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
867 
868  // avoid leaving stale last_pool pointer around if it is being dealloced
869  if (thr->last_pool == b)
870  thr->last_pool = 0;
871  } else {
872  thr->last_pool = b;
873  }
874 #endif /* BufStats */
875  }
876 }
877 
878 /* BECTL -- Establish automatic pool expansion control */
879 static void bectl(kmp_info_t *th, bget_compact_t compact,
880  bget_acquire_t acquire, bget_release_t release,
881  bufsize pool_incr) {
882  thr_data_t *thr = get_thr_data(th);
883 
884  thr->compfcn = compact;
885  thr->acqfcn = acquire;
886  thr->relfcn = release;
887  thr->exp_incr = pool_incr;
888 }
889 
890 /* BPOOL -- Add a region of memory to the buffer pool. */
891 static void bpool(kmp_info_t *th, void *buf, bufsize len) {
892  /* int bin = 0; */
893  thr_data_t *thr = get_thr_data(th);
894  bfhead_t *b = BFH(buf);
895  bhead_t *bn;
896 
897  __kmp_bget_dequeue(th); /* Release any queued buffers */
898 
899 #ifdef SizeQuant
900  len &= ~((bufsize)(SizeQuant - 1));
901 #endif
902  if (thr->pool_len == 0) {
903  thr->pool_len = len;
904  } else if (len != thr->pool_len) {
905  thr->pool_len = -1;
906  }
907 #if BufStats
908  thr->numpget++; /* Number of block acquisitions */
909  thr->numpblk++; /* Number of blocks total */
910  KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
911 #endif /* BufStats */
912 
913  /* Since the block is initially occupied by a single free buffer,
914  it had better not be (much) larger than the largest buffer
915  whose size we can store in bhead.bb.bsize. */
916  KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize)ESent + 1));
917 
918  /* Clear the backpointer at the start of the block to indicate that
919  there is no free block prior to this one. That blocks
920  recombination when the first block in memory is released. */
921  b->bh.bb.prevfree = 0;
922 
923  /* Create a dummy allocated buffer at the end of the pool. This dummy
924  buffer is seen when a buffer at the end of the pool is released and
925  blocks recombination of the last buffer with the dummy buffer at
926  the end. The length in the dummy buffer is set to the largest
927  negative number to denote the end of the pool for diagnostic
928  routines (this specific value is not counted on by the actual
929  allocation and release functions). */
930  len -= sizeof(bhead_t);
931  b->bh.bb.bsize = (bufsize)len;
932  /* Set the owner of this buffer */
933  TCW_PTR(b->bh.bb.bthr,
934  (kmp_info_t *)((kmp_uintptr_t)th |
935  1)); // mark the buffer as allocated address
936 
937  /* Chain the new block to the free list. */
938  __kmp_bget_insert_into_freelist(thr, b);
939 
940 #ifdef FreeWipe
941  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
942  (size_t)(len - sizeof(bfhead_t)));
943 #endif
944  bn = BH(((char *)b) + len);
945  bn->bb.prevfree = (bufsize)len;
946  /* Definition of ESent assumes two's complement! */
947  KMP_DEBUG_ASSERT((~0) == -1 && (bn != 0));
948 
949  bn->bb.bsize = ESent;
950 }
951 
952 /* BFREED -- Dump the free lists for this thread. */
953 static void bfreed(kmp_info_t *th) {
954  int bin = 0, count = 0;
955  int gtid = __kmp_gtid_from_thread(th);
956  thr_data_t *thr = get_thr_data(th);
957 
958 #if BufStats
959  __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC
960  " get=%" KMP_INT64_SPEC " rel=%" KMP_INT64_SPEC
961  " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC
962  " prel=%" KMP_INT64_SPEC " dget=%" KMP_INT64_SPEC
963  " drel=%" KMP_INT64_SPEC "\n",
964  gtid, (kmp_uint64)thr->totalloc, (kmp_int64)thr->numget,
965  (kmp_int64)thr->numrel, (kmp_int64)thr->numpblk,
966  (kmp_int64)thr->numpget, (kmp_int64)thr->numprel,
967  (kmp_int64)thr->numdget, (kmp_int64)thr->numdrel);
968 #endif
969 
970  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
971  bfhead_t *b;
972 
973  for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin];
974  b = b->ql.flink) {
975  bufsize bs = b->bh.bb.bsize;
976 
977  KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
978  KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
979  KMP_DEBUG_ASSERT(bs > 0);
980 
981  count += 1;
982 
983  __kmp_printf_no_lock(
984  "__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b,
985  (long)bs);
986 #ifdef FreeWipe
987  {
988  char *lerr = ((char *)b) + sizeof(bfhead_t);
989  if ((bs > sizeof(bfhead_t)) &&
990  ((*lerr != 0x55) ||
991  (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
992  0))) {
993  __kmp_printf_no_lock("__kmp_printpool: T#%d (Contents of above "
994  "free block have been overstored.)\n",
995  gtid);
996  }
997  }
998 #endif
999  }
1000  }
1001 
1002  if (count == 0)
1003  __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid);
1004 }
1005 
1006 void __kmp_initialize_bget(kmp_info_t *th) {
1007  KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0));
1008 
1009  set_thr_data(th);
1010 
1011  bectl(th, (bget_compact_t)0, (bget_acquire_t)malloc, (bget_release_t)free,
1012  (bufsize)__kmp_malloc_pool_incr);
1013 }
1014 
1015 void __kmp_finalize_bget(kmp_info_t *th) {
1016  thr_data_t *thr;
1017  bfhead_t *b;
1018 
1019  KMP_DEBUG_ASSERT(th != 0);
1020 
1021 #if BufStats
1022  thr = (thr_data_t *)th->th.th_local.bget_data;
1023  KMP_DEBUG_ASSERT(thr != NULL);
1024  b = thr->last_pool;
1025 
1026  /* If a block-release function is defined, and this free buffer constitutes
1027  the entire block, release it. Note that pool_len is defined in such a way
1028  that the test will fail unless all pool blocks are the same size. */
1029 
1030  // Deallocate the last pool if one exists because we no longer do it in brel()
1031  if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&
1032  b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
1033  KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
1034  KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
1035  KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
1036  b->bh.bb.bsize);
1037 
1038  /* Unlink the buffer from the free list */
1039  __kmp_bget_remove_from_freelist(b);
1040 
1041  KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
1042 
1043  (*thr->relfcn)(b);
1044  thr->numprel++; /* Nr of expansion block releases */
1045  thr->numpblk--; /* Total number of blocks */
1046  KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
1047  }
1048 #endif /* BufStats */
1049 
1050  /* Deallocate bget_data */
1051  if (th->th.th_local.bget_data != NULL) {
1052  __kmp_free(th->th.th_local.bget_data);
1053  th->th.th_local.bget_data = NULL;
1054  }
1055 }
1056 
1057 void kmpc_set_poolsize(size_t size) {
1058  bectl(__kmp_get_thread(), (bget_compact_t)0, (bget_acquire_t)malloc,
1059  (bget_release_t)free, (bufsize)size);
1060 }
1061 
1062 size_t kmpc_get_poolsize(void) {
1063  thr_data_t *p;
1064 
1065  p = get_thr_data(__kmp_get_thread());
1066 
1067  return p->exp_incr;
1068 }
1069 
1070 void kmpc_set_poolmode(int mode) {
1071  thr_data_t *p;
1072 
1073  if (mode == bget_mode_fifo || mode == bget_mode_lifo ||
1074  mode == bget_mode_best) {
1075  p = get_thr_data(__kmp_get_thread());
1076  p->mode = (bget_mode_t)mode;
1077  }
1078 }
1079 
1080 int kmpc_get_poolmode(void) {
1081  thr_data_t *p;
1082 
1083  p = get_thr_data(__kmp_get_thread());
1084 
1085  return p->mode;
1086 }
1087 
1088 void kmpc_get_poolstat(size_t *maxmem, size_t *allmem) {
1089  kmp_info_t *th = __kmp_get_thread();
1090  bufsize a, b;
1091 
1092  __kmp_bget_dequeue(th); /* Release any queued buffers */
1093 
1094  bcheck(th, &a, &b);
1095 
1096  *maxmem = a;
1097  *allmem = b;
1098 }
1099 
1100 void kmpc_poolprint(void) {
1101  kmp_info_t *th = __kmp_get_thread();
1102 
1103  __kmp_bget_dequeue(th); /* Release any queued buffers */
1104 
1105  bfreed(th);
1106 }
1107 
1108 #endif // #if KMP_USE_BGET
1109 
1110 void *kmpc_malloc(size_t size) {
1111  void *ptr;
1112  ptr = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
1113  if (ptr != NULL) {
1114  // save allocated pointer just before one returned to user
1115  *(void **)ptr = ptr;
1116  ptr = (void **)ptr + 1;
1117  }
1118  return ptr;
1119 }
1120 
1121 #define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)
1122 
1123 void *kmpc_aligned_malloc(size_t size, size_t alignment) {
1124  void *ptr;
1125  void *ptr_allocated;
1126  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big
1127  if (!IS_POWER_OF_TWO(alignment)) {
1128  // AC: do we need to issue a warning here?
1129  errno = EINVAL;
1130  return NULL;
1131  }
1132  size = size + sizeof(void *) + alignment;
1133  ptr_allocated = bget(__kmp_entry_thread(), (bufsize)size);
1134  if (ptr_allocated != NULL) {
1135  // save allocated pointer just before one returned to user
1136  ptr = (void *)(((kmp_uintptr_t)ptr_allocated + sizeof(void *) + alignment) &
1137  ~(alignment - 1));
1138  *((void **)ptr - 1) = ptr_allocated;
1139  } else {
1140  ptr = NULL;
1141  }
1142  return ptr;
1143 }
1144 
1145 void *kmpc_calloc(size_t nelem, size_t elsize) {
1146  void *ptr;
1147  ptr = bgetz(__kmp_entry_thread(), (bufsize)(nelem * elsize + sizeof(ptr)));
1148  if (ptr != NULL) {
1149  // save allocated pointer just before one returned to user
1150  *(void **)ptr = ptr;
1151  ptr = (void **)ptr + 1;
1152  }
1153  return ptr;
1154 }
1155 
1156 void *kmpc_realloc(void *ptr, size_t size) {
1157  void *result = NULL;
1158  if (ptr == NULL) {
1159  // If pointer is NULL, realloc behaves like malloc.
1160  result = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
1161  // save allocated pointer just before one returned to user
1162  if (result != NULL) {
1163  *(void **)result = result;
1164  result = (void **)result + 1;
1165  }
1166  } else if (size == 0) {
1167  // If size is 0, realloc behaves like free.
1168  // The thread must be registered by the call to kmpc_malloc() or
1169  // kmpc_calloc() before.
1170  // So it should be safe to call __kmp_get_thread(), not
1171  // __kmp_entry_thread().
1172  KMP_ASSERT(*((void **)ptr - 1));
1173  brel(__kmp_get_thread(), *((void **)ptr - 1));
1174  } else {
1175  result = bgetr(__kmp_entry_thread(), *((void **)ptr - 1),
1176  (bufsize)(size + sizeof(ptr)));
1177  if (result != NULL) {
1178  *(void **)result = result;
1179  result = (void **)result + 1;
1180  }
1181  }
1182  return result;
1183 }
1184 
1185 // NOTE: the library must have already been initialized by a previous allocate
1186 void kmpc_free(void *ptr) {
1187  if (!__kmp_init_serial) {
1188  return;
1189  }
1190  if (ptr != NULL) {
1191  kmp_info_t *th = __kmp_get_thread();
1192  __kmp_bget_dequeue(th); /* Release any queued buffers */
1193  // extract allocated pointer and free it
1194  KMP_ASSERT(*((void **)ptr - 1));
1195  brel(th, *((void **)ptr - 1));
1196  }
1197 }
1198 
1199 void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL) {
1200  void *ptr;
1201  KE_TRACE(30, ("-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n", th,
1202  (int)size KMP_SRC_LOC_PARM));
1203  ptr = bget(th, (bufsize)size);
1204  KE_TRACE(30, ("<- __kmp_thread_malloc() returns %p\n", ptr));
1205  return ptr;
1206 }
1207 
1208 void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,
1209  size_t elsize KMP_SRC_LOC_DECL) {
1210  void *ptr;
1211  KE_TRACE(30, ("-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n", th,
1212  (int)nelem, (int)elsize KMP_SRC_LOC_PARM));
1213  ptr = bgetz(th, (bufsize)(nelem * elsize));
1214  KE_TRACE(30, ("<- __kmp_thread_calloc() returns %p\n", ptr));
1215  return ptr;
1216 }
1217 
1218 void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,
1219  size_t size KMP_SRC_LOC_DECL) {
1220  KE_TRACE(30, ("-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n", th,
1221  ptr, (int)size KMP_SRC_LOC_PARM));
1222  ptr = bgetr(th, ptr, (bufsize)size);
1223  KE_TRACE(30, ("<- __kmp_thread_realloc() returns %p\n", ptr));
1224  return ptr;
1225 }
1226 
1227 void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL) {
1228  KE_TRACE(30, ("-> __kmp_thread_free( %p, %p ) called from %s:%d\n", th,
1229  ptr KMP_SRC_LOC_PARM));
1230  if (ptr != NULL) {
1231  __kmp_bget_dequeue(th); /* Release any queued buffers */
1232  brel(th, ptr);
1233  }
1234  KE_TRACE(30, ("<- __kmp_thread_free()\n"));
1235 }
1236 
/* OMP 5.0 Memory Management support */
/* memkind experimental API: the pointers below are filled in by
   __kmp_init_memkind() via dlsym() and are NULL when memkind is unavailable. */
// resolved from the "memkind_malloc" symbol
static void *(*kmp_mk_alloc)(void *k, size_t sz);
// resolved from the "memkind_free" symbol
static void (*kmp_mk_free)(void *kind, void *ptr);
// kinds we are going to use; each points at the library's exported kind
// variable (e.g. MEMKIND_DEFAULT) or is NULL when that kind is not usable
static void **mk_default;
static void **mk_interleave;
static void **mk_hbw_interleave;
static void **mk_hbw_preferred;
static void **mk_dax_kmem;
static void **mk_dax_kmem_all;
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
// name passed to dlopen() and the handle it returned
static const char *kmp_mk_lib_name;
static void *h_memkind;
// resolved from the "memkind_check_available" symbol; a non-zero result is
// treated as "kind not available or error"
static int (*kmp_mk_check)(void *kind);
// additional kinds that are only looked up in this configuration
static void **mk_hbw;
static void **mk_hugetlb;
static void **mk_hbw_hugetlb;
static void **mk_hbw_preferred_hugetlb;
static void **mk_dax_kmem_preferred;
#endif
// Offload-runtime entry points, resolved by __kmp_init_target_mem() through
// KMP_DLSYM from the "llvm_omp_target_*" symbols of the same names.
static void *(*kmp_target_alloc_host)(size_t size, int device);
static void *(*kmp_target_alloc_shared)(size_t size, int device);
static void *(*kmp_target_alloc_device)(size_t size, int device);
static void *(*kmp_target_lock_mem)(void *ptr, size_t size, int device);
static void *(*kmp_target_unlock_mem)(void *ptr, int device);
static void *(*kmp_target_free_host)(void *ptr, int device);
static void *(*kmp_target_free_shared)(void *ptr, int device);
static void *(*kmp_target_free_device)(void *ptr, int device);
// true only when all six alloc/free entry points above were found (the
// lock/unlock entries are not part of this condition)
static bool __kmp_target_mem_available;
1270 
// Classification helpers for memory-space / allocator handles.
// Every use of the argument is parenthesized so that an argument expression
// containing operators of lower precedence than `==` (e.g. `x & mask`,
// `c ? a : b`) is compared as a whole. The argument is evaluated more than
// once, so it must not have side effects.

// True when MS is one of the three llvm_omp_target_* memory spaces.
#define KMP_IS_TARGET_MEM_SPACE(MS)                                            \
  ((MS) == llvm_omp_target_host_mem_space ||                                   \
   (MS) == llvm_omp_target_shared_mem_space ||                                 \
   (MS) == llvm_omp_target_device_mem_space)

// True when MA is one of the three llvm_omp_target_* allocators.
#define KMP_IS_TARGET_MEM_ALLOC(MA)                                            \
  ((MA) == llvm_omp_target_host_mem_alloc ||                                   \
   (MA) == llvm_omp_target_shared_mem_alloc ||                                 \
   (MA) == llvm_omp_target_device_mem_alloc)

// True when MS is the null space, one of the predefined OpenMP memory spaces,
// or one of the llvm_omp_target_* memory spaces.
#define KMP_IS_PREDEF_MEM_SPACE(MS)                                            \
  ((MS) == omp_null_mem_space || (MS) == omp_default_mem_space ||              \
   (MS) == omp_large_cap_mem_space || (MS) == omp_const_mem_space ||           \
   (MS) == omp_high_bw_mem_space || (MS) == omp_low_lat_mem_space ||           \
   KMP_IS_TARGET_MEM_SPACE(MS))
1286 
  // Set by init(): true only when all three offload entry points were found.
  bool supported = false;
  // Signatures of the offload-runtime entry points looked up in init().
  using get_mem_resources_t = int (*)(int, const int *, int,
                                      omp_memspace_handle_t, int *);
  using omp_alloc_t = void *(*)(size_t, omp_allocator_handle_t);
  using omp_free_t = void (*)(void *, omp_allocator_handle_t);
  // Resolved entry points; nullptr until init() finds the symbol.
  get_mem_resources_t tgt_get_mem_resources = nullptr;
  omp_alloc_t tgt_omp_alloc = nullptr;
  omp_free_t tgt_omp_free = nullptr;

1313 public:
1315  void init() {
1316  tgt_get_mem_resources =
1317  (get_mem_resources_t)KMP_DLSYM("__tgt_get_mem_resources");
1318  tgt_omp_alloc = (omp_alloc_t)KMP_DLSYM("__tgt_omp_alloc");
1319  tgt_omp_free = (omp_free_t)KMP_DLSYM("__tgt_omp_free");
1320  supported = tgt_get_mem_resources && tgt_omp_alloc && tgt_omp_free;
1321  }
1324  int get_mem_resources(int ndevs, const int *devs, int host,
1325  omp_memspace_handle_t memspace, int *resources) {
1326  if (supported)
1327  return tgt_get_mem_resources(ndevs, devs, host, memspace, resources);
1328  return 0;
1329  }
1331  void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
1332  if (supported)
1333  return tgt_omp_alloc(size, allocator);
1334  return nullptr;
1335  }
1337  void omp_free(void *ptr, omp_allocator_handle_t allocator) {
1338  if (supported)
1339  tgt_omp_free(ptr, allocator);
1340  }
1341 } __kmp_tgt_allocator;
1342 
1343 extern "C" int omp_get_num_devices(void);
1344 
  // Head of the singly linked list (via kmp_memspace_t::next) of memory spaces
  // created so far; new entries are pushed at the front by get().
  kmp_memspace_t *memspace_list = nullptr;
  // Lock taken by get() around lookup and insertion.
  KMP_LOCK_INIT(mtx);
1352  kmp_memspace_t *find(int num_resources, const int *resources,
1353  omp_memspace_handle_t memspace) {
1354  kmp_memspace_t *ms = memspace_list;
1355  while (ms) {
1356  if (ms->num_resources == num_resources && ms->memspace == memspace &&
1357  !memcmp(ms->resources, resources, sizeof(int) * num_resources))
1358  break;
1359  ms = ms->next;
1360  }
1361  return ms;
1362  }
1366  omp_memspace_handle_t get(int num_resources, const int *resources,
1367  omp_memspace_handle_t memspace) {
1368  int gtid = __kmp_entry_gtid();
1369  __kmp_acquire_lock(&mtx, gtid);
1370  // Sort absolute IDs in the resource list
1371  int *sorted_resources = (int *)__kmp_allocate(sizeof(int) * num_resources);
1372  KMP_MEMCPY(sorted_resources, resources, num_resources * sizeof(int));
1373  qsort(sorted_resources, (size_t)num_resources, sizeof(int),
1374  [](const void *a, const void *b) {
1375  const int val_a = *(const int *)a;
1376  const int val_b = *(const int *)b;
1377  return (val_a > val_b) ? 1 : ((val_a < val_b) ? -1 : 0);
1378  });
1379  kmp_memspace_t *ms = find(num_resources, sorted_resources, memspace);
1380  if (ms) {
1381  __kmp_free(sorted_resources);
1382  __kmp_release_lock(&mtx, gtid);
1383  return ms;
1384  }
1385  ms = (kmp_memspace_t *)__kmp_allocate(sizeof(kmp_memspace_t));
1386  ms->memspace = memspace;
1387  ms->num_resources = num_resources;
1388  ms->resources = sorted_resources;
1389  ms->next = memspace_list;
1390  memspace_list = ms;
1391  __kmp_release_lock(&mtx, gtid);
1392  return ms;
1393  }
1394 
1395 public:
1397  void init() { __kmp_init_lock(&mtx); }
1399  void fini() {
1400  kmp_memspace_t *ms = memspace_list;
1401  while (ms) {
1402  if (ms->resources)
1403  __kmp_free(ms->resources);
1404  kmp_memspace_t *tmp = ms;
1405  ms = ms->next;
1406  __kmp_free(tmp);
1407  }
1408  __kmp_destroy_lock(&mtx);
1409  }
1411  omp_memspace_handle_t get_memspace(int num_devices, const int *devices,
1412  int host_access,
1413  omp_memspace_handle_t memspace) {
1414  int actual_num_devices = num_devices;
1415  int *actual_devices = const_cast<int *>(devices);
1416  if (actual_num_devices == 0) {
1417  actual_num_devices = omp_get_num_devices();
1418  if (actual_num_devices <= 0)
1419  return omp_null_mem_space;
1420  }
1421  if (actual_devices == NULL) {
1422  // Prepare list of all devices in this case.
1423  actual_devices = (int *)__kmp_allocate(sizeof(int) * actual_num_devices);
1424  for (int i = 0; i < actual_num_devices; i++)
1425  actual_devices[i] = i;
1426  }
1427  // Get the number of available resources first
1428  int num_resources = __kmp_tgt_allocator.get_mem_resources(
1429  actual_num_devices, actual_devices, host_access, memspace, NULL);
1430  if (num_resources <= 0)
1431  return omp_null_mem_space; // No available resources
1432 
1433  omp_memspace_handle_t ms = omp_null_mem_space;
1434  if (num_resources > 0) {
1435  int *resources = (int *)__kmp_allocate(sizeof(int) * num_resources);
1436  // Let offload runtime write the resource IDs
1437  num_resources = __kmp_tgt_allocator.get_mem_resources(
1438  actual_num_devices, actual_devices, host_access, memspace, resources);
1439  ms = get(num_resources, resources, memspace);
1440  __kmp_free(resources);
1441  }
1442  if (!devices && actual_devices)
1443  __kmp_free(actual_devices);
1444  return ms;
1445  }
1447  omp_memspace_handle_t get_memspace(int num_resources, const int *resources,
1448  omp_memspace_handle_t parent) {
1449  kmp_memspace_t *ms = (kmp_memspace_t *)parent;
1450  return get(num_resources, resources, ms->memspace);
1451  }
1452 } __kmp_tgt_memspace_list;
1453 
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
// Validate one looked-up memkind kind: if the symbol was found but
// kmp_mk_check() reports a non-zero status (kind not available or error),
// reset the slot to NULL so the kind is treated as absent.
static inline void chk_kind(void ***pkind) {
  KMP_DEBUG_ASSERT(pkind);
  const bool symbol_found = (*pkind != NULL);
  if (symbol_found && kmp_mk_check(**pkind))
    *pkind = NULL;
}
#endif
1462 
1463 void __kmp_init_memkind() {
1464 // as of 2018-07-31 memkind does not support Windows*, exclude it for now
1465 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
1466  // use of statically linked memkind is problematic, as it depends on libnuma
1467  kmp_mk_lib_name = "libmemkind.so";
1468  h_memkind = dlopen(kmp_mk_lib_name, RTLD_LAZY);
1469  if (h_memkind) {
1470  kmp_mk_check = (int (*)(void *))dlsym(h_memkind, "memkind_check_available");
1471  kmp_mk_alloc =
1472  (void *(*)(void *, size_t))dlsym(h_memkind, "memkind_malloc");
1473  kmp_mk_free = (void (*)(void *, void *))dlsym(h_memkind, "memkind_free");
1474  mk_default = (void **)dlsym(h_memkind, "MEMKIND_DEFAULT");
1475  if (kmp_mk_check && kmp_mk_alloc && kmp_mk_free && mk_default &&
1476  !kmp_mk_check(*mk_default)) {
1477  __kmp_memkind_available = 1;
1478  mk_interleave = (void **)dlsym(h_memkind, "MEMKIND_INTERLEAVE");
1479  chk_kind(&mk_interleave);
1480  mk_hbw = (void **)dlsym(h_memkind, "MEMKIND_HBW");
1481  chk_kind(&mk_hbw);
1482  mk_hbw_interleave = (void **)dlsym(h_memkind, "MEMKIND_HBW_INTERLEAVE");
1483  chk_kind(&mk_hbw_interleave);
1484  mk_hbw_preferred = (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED");
1485  chk_kind(&mk_hbw_preferred);
1486  mk_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HUGETLB");
1487  chk_kind(&mk_hugetlb);
1488  mk_hbw_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HBW_HUGETLB");
1489  chk_kind(&mk_hbw_hugetlb);
1490  mk_hbw_preferred_hugetlb =
1491  (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED_HUGETLB");
1492  chk_kind(&mk_hbw_preferred_hugetlb);
1493  mk_dax_kmem = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM");
1494  chk_kind(&mk_dax_kmem);
1495  mk_dax_kmem_all = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_ALL");
1496  chk_kind(&mk_dax_kmem_all);
1497  mk_dax_kmem_preferred =
1498  (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_PREFERRED");
1499  chk_kind(&mk_dax_kmem_preferred);
1500  KE_TRACE(25, ("__kmp_init_memkind: memkind library initialized\n"));
1501  return; // success
1502  }
1503  dlclose(h_memkind); // failure
1504  }
1505  h_memkind = NULL;
1506  kmp_mk_check = NULL;
1507  mk_hbw = NULL;
1508  mk_hugetlb = NULL;
1509  mk_hbw_hugetlb = NULL;
1510  mk_hbw_preferred_hugetlb = NULL;
1511  mk_dax_kmem_preferred = NULL;
1512  kmp_mk_lib_name = "";
1513 #endif // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN)
1514  kmp_mk_alloc = NULL;
1515  kmp_mk_free = NULL;
1516  mk_default = NULL;
1517  mk_interleave = NULL;
1518  mk_hbw_interleave = NULL;
1519  mk_hbw_preferred = NULL;
1520  mk_dax_kmem = NULL;
1521  mk_dax_kmem_all = NULL;
1522 }
1523 
// Release the memkind library (if it was loaded) and reset the memkind
// function and kind pointers to NULL.
void __kmp_fini_memkind() {
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
  if (__kmp_memkind_available)
    KE_TRACE(25, ("__kmp_fini_memkind: finalize memkind library\n"));
  if (h_memkind) {
    dlclose(h_memkind);
    h_memkind = NULL;
  }
  kmp_mk_check = NULL;
  mk_hbw = mk_hugetlb = mk_hbw_hugetlb = mk_hbw_preferred_hugetlb =
      mk_dax_kmem_preferred = NULL;
#endif
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
  kmp_mk_alloc = NULL;
  kmp_mk_free = NULL;
  mk_default = mk_interleave = mk_hbw_interleave = mk_hbw_preferred =
      mk_dax_kmem = mk_dax_kmem_all = NULL;
#endif
}
1550 
1551 #if KMP_HWLOC_ENABLED
1552 static bool __kmp_is_hwloc_membind_supported(hwloc_membind_policy_t policy) {
1553 #if HWLOC_API_VERSION >= 0x00020300
1554  const hwloc_topology_support *support;
1555  support = hwloc_topology_get_support(__kmp_hwloc_topology);
1556  if (support) {
1557  if (policy == HWLOC_MEMBIND_BIND)
1558  return (support->membind->alloc_membind &&
1559  support->membind->bind_membind);
1560  if (policy == HWLOC_MEMBIND_INTERLEAVE)
1561  return (support->membind->alloc_membind &&
1562  support->membind->interleave_membind);
1563  }
1564  return false;
1565 #else
1566  return false;
1567 #endif // KMP_HWLOC_ENABLED
1568 }
1569 
1570 void *__kmp_hwloc_alloc_membind(hwloc_memattr_id_e attr, size_t size,
1571  hwloc_membind_policy_t policy) {
1572 #if HWLOC_API_VERSION >= 0x00020300
1573  void *ptr = NULL;
1574  hwloc_obj_t node;
1575  struct hwloc_location initiator;
1576  int ret;
1577  // TODO: We should make this more efficient by getting rid of the OS syscall
1578  // 'hwloc_bitmap_alloc' and 'hwloc_get_cpubind' to get affinity and instead
1579  // use th_affin_mask field when it's capable of getting the underlying
1580  // mask implementation.
1581  hwloc_cpuset_t mask = hwloc_bitmap_alloc();
1582  ret = hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
1583  if (ret < 0) {
1584  hwloc_bitmap_free(mask);
1585  return ptr;
1586  }
1587  initiator.type = KMP_HWLOC_LOCATION_TYPE_CPUSET;
1588  initiator.location.cpuset = mask;
1589  ret = hwloc_memattr_get_best_target(__kmp_hwloc_topology, attr, &initiator, 0,
1590  &node, NULL);
1591  if (ret < 0) {
1592  return ptr;
1593  }
1594  return hwloc_alloc_membind(__kmp_hwloc_topology, size, node->nodeset, policy,
1595  HWLOC_MEMBIND_BYNODESET);
1596 #else
1597  return NULL;
1598 #endif
1599 }
1600 
1601 void *__kmp_hwloc_membind_policy(omp_memspace_handle_t ms, size_t size,
1602  hwloc_membind_policy_t policy) {
1603 #if HWLOC_API_VERSION >= 0x00020300
1604  void *ptr = NULL;
1605  if (ms == omp_high_bw_mem_space) {
1606  ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_BANDWIDTH, size, policy);
1607  } else if (ms == omp_large_cap_mem_space) {
1608  ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_CAPACITY, size, policy);
1609  } else {
1610  ptr = hwloc_alloc(__kmp_hwloc_topology, size);
1611  }
1612  return ptr;
1613 #else
1614  return NULL;
1615 #endif
1616 }
1617 #endif // KMP_HWLOC_ENABLED
1618 
1619 void __kmp_init_target_mem() {
1620  *(void **)(&kmp_target_alloc_host) = KMP_DLSYM("llvm_omp_target_alloc_host");
1621  *(void **)(&kmp_target_alloc_shared) =
1622  KMP_DLSYM("llvm_omp_target_alloc_shared");
1623  *(void **)(&kmp_target_alloc_device) =
1624  KMP_DLSYM("llvm_omp_target_alloc_device");
1625  *(void **)(&kmp_target_free_host) = KMP_DLSYM("llvm_omp_target_free_host");
1626  *(void **)(&kmp_target_free_shared) =
1627  KMP_DLSYM("llvm_omp_target_free_shared");
1628  *(void **)(&kmp_target_free_device) =
1629  KMP_DLSYM("llvm_omp_target_free_device");
1630  __kmp_target_mem_available =
1631  kmp_target_alloc_host && kmp_target_alloc_shared &&
1632  kmp_target_alloc_device && kmp_target_free_host &&
1633  kmp_target_free_shared && kmp_target_free_device;
1634  // lock/pin and unlock/unpin target calls
1635  *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem");
1636  *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem");
1637  __kmp_tgt_allocator.init();
1638  __kmp_tgt_memspace_list.init();
1639 }
1640 
1642 void __kmp_fini_target_mem() { __kmp_tgt_memspace_list.fini(); }
1643 
1644 omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
1645  int ntraits,
1646  omp_alloctrait_t traits[]) {
1647  kmp_allocator_t *al;
1648  int i;
1649  al = (kmp_allocator_t *)__kmp_allocate(sizeof(kmp_allocator_t)); // zeroed
1650  al->memspace = ms; // not used currently
1651 
1652  // Assign default values if applicable
1653  al->alignment = 1;
1654  al->pinned = false;
1655  al->partition = omp_atv_environment;
1656  al->pin_device = -1;
1657  al->preferred_device = -1;
1658  al->target_access = omp_atv_single;
1659  al->atomic_scope = omp_atv_device;
1660 
1661  for (i = 0; i < ntraits; ++i) {
1662  switch (traits[i].key) {
1663  case omp_atk_sync_hint:
1664  case omp_atk_access:
1665  break;
1666  case omp_atk_pinned:
1667  al->pinned = true;
1668  break;
1669  case omp_atk_alignment:
1670  __kmp_type_convert(traits[i].value, &(al->alignment));
1671  KMP_ASSERT(IS_POWER_OF_TWO(al->alignment));
1672  break;
1673  case omp_atk_pool_size:
1674  al->pool_size = traits[i].value;
1675  break;
1676  case omp_atk_fallback:
1677  al->fb = (omp_alloctrait_value_t)traits[i].value;
1678  KMP_DEBUG_ASSERT(
1679  al->fb == omp_atv_default_mem_fb || al->fb == omp_atv_null_fb ||
1680  al->fb == omp_atv_abort_fb || al->fb == omp_atv_allocator_fb);
1681  break;
1682  case omp_atk_fb_data:
1683  al->fb_data = RCAST(kmp_allocator_t *, traits[i].value);
1684  break;
1685  case omp_atk_partition:
1686 #if KMP_HWLOC_ENABLED
1687  al->membind = (omp_alloctrait_value_t)traits[i].value;
1688  KMP_DEBUG_ASSERT(al->membind == omp_atv_environment ||
1689  al->membind == omp_atv_nearest ||
1690  al->membind == omp_atv_blocked ||
1691  al->membind == omp_atv_interleaved);
1692 #endif // KMP_HWLOC_ENABLED
1693  al->memkind = RCAST(void **, traits[i].value);
1694  break;
1695  case omp_atk_pin_device:
1696  __kmp_type_convert(traits[i].value, &(al->pin_device));
1697  break;
1698  case omp_atk_preferred_device:
1699  __kmp_type_convert(traits[i].value, &(al->preferred_device));
1700  break;
1701  case omp_atk_target_access:
1702  al->target_access = (omp_alloctrait_value_t)traits[i].value;
1703  break;
1704  case omp_atk_atomic_scope:
1705  al->atomic_scope = (omp_alloctrait_value_t)traits[i].value;
1706  break;
1707  case omp_atk_part_size:
1708  __kmp_type_convert(traits[i].value, &(al->part_size));
1709  break;
1710  default:
1711  KMP_ASSERT2(0, "Unexpected allocator trait");
1712  }
1713  }
1714 
1715  if (al->memspace > kmp_max_mem_space) {
1716  // Memory space has been allocated for targets.
1717  return (omp_allocator_handle_t)al;
1718  }
1719 
1720  KMP_DEBUG_ASSERT(KMP_IS_PREDEF_MEM_SPACE(al->memspace));
1721 
1722  if (al->fb == 0) {
1723  // set default allocator
1724  al->fb = omp_atv_default_mem_fb;
1725  al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;
1726  } else if (al->fb == omp_atv_allocator_fb) {
1727  KMP_ASSERT(al->fb_data != NULL);
1728  } else if (al->fb == omp_atv_default_mem_fb) {
1729  al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;
1730  }
1731  if (__kmp_memkind_available) {
1732  // Let's use memkind library if available
1733  if (ms == omp_high_bw_mem_space) {
1734  if (al->memkind == (void *)omp_atv_interleaved && mk_hbw_interleave) {
1735  al->memkind = mk_hbw_interleave;
1736  } else if (mk_hbw_preferred) {
1737  // AC: do not try to use MEMKIND_HBW for now, because memkind library
1738  // cannot reliably detect exhaustion of HBW memory.
1739  // It could be possible using hbw_verify_memory_region() but memkind
1740  // manual says: "Using this function in production code may result in
1741  // serious performance penalty".
1742  al->memkind = mk_hbw_preferred;
1743  } else {
1744  // HBW is requested but not available --> return NULL allocator
1745  __kmp_free(al);
1746  return omp_null_allocator;
1747  }
1748  } else if (ms == omp_large_cap_mem_space) {
1749  if (mk_dax_kmem_all) {
1750  // All pmem nodes are visited
1751  al->memkind = mk_dax_kmem_all;
1752  } else if (mk_dax_kmem) {
1753  // Only closest pmem node is visited
1754  al->memkind = mk_dax_kmem;
1755  } else {
1756  __kmp_free(al);
1757  return omp_null_allocator;
1758  }
1759  } else {
1760  if (al->memkind == (void *)omp_atv_interleaved && mk_interleave) {
1761  al->memkind = mk_interleave;
1762  } else {
1763  al->memkind = mk_default;
1764  }
1765  }
1766  } else if (KMP_IS_TARGET_MEM_SPACE(ms) && !__kmp_target_mem_available) {
1767  __kmp_free(al);
1768  return omp_null_allocator;
1769  } else {
1770  if (!__kmp_hwloc_available &&
1771  (ms == omp_high_bw_mem_space || ms == omp_large_cap_mem_space)) {
1772  // cannot detect HBW memory presence without memkind library
1773  __kmp_free(al);
1774  return omp_null_allocator;
1775  }
1776  }
1777  return (omp_allocator_handle_t)al;
1778 }
1779 
1780 void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t allocator) {
1781  if (allocator > kmp_max_mem_alloc)
1782  __kmp_free(allocator);
1783 }
1784 
1785 void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t allocator) {
1786  if (allocator == omp_null_allocator)
1787  allocator = omp_default_mem_alloc;
1788  __kmp_threads[gtid]->th.th_def_allocator = allocator;
1789 }
1790 
1791 omp_allocator_handle_t __kmpc_get_default_allocator(int gtid) {
1792  return __kmp_threads[gtid]->th.th_def_allocator;
1793 }
1794 
1795 omp_memspace_handle_t __kmp_get_devices_memspace(int ndevs, const int *devs,
1796  omp_memspace_handle_t memspace,
1797  int host) {
1798  if (!__kmp_init_serial)
1799  __kmp_serial_initialize();
1800  // Only accept valid device description and predefined memory space
1801  if (ndevs < 0 || (ndevs > 0 && !devs) || memspace > kmp_max_mem_space)
1802  return omp_null_mem_space;
1803 
1804  return __kmp_tgt_memspace_list.get_memspace(ndevs, devs, host, memspace);
1805 }
1806 
1807 omp_allocator_handle_t
1808 __kmp_get_devices_allocator(int ndevs, const int *devs,
1809  omp_memspace_handle_t memspace, int host) {
1810  if (!__kmp_init_serial)
1811  __kmp_serial_initialize();
1812  // Only accept valid device description and predefined memory space
1813  if (ndevs < 0 || (ndevs > 0 && !devs) || memspace > kmp_max_mem_space)
1814  return omp_null_allocator;
1815 
1816  omp_memspace_handle_t mspace =
1817  __kmp_get_devices_memspace(ndevs, devs, memspace, host);
1818  if (mspace == omp_null_mem_space)
1819  return omp_null_allocator;
1820 
1821  return __kmpc_init_allocator(__kmp_entry_gtid(), mspace, 0, NULL);
1822 }
1823 
1824 int __kmp_get_memspace_num_resources(omp_memspace_handle_t memspace) {
1825  if (!__kmp_init_serial)
1826  __kmp_serial_initialize();
1827  if (memspace == omp_null_mem_space)
1828  return 0;
1829  if (memspace < kmp_max_mem_space)
1830  return 1; // return 1 for predefined memory space
1831  kmp_memspace_t *ms = (kmp_memspace_t *)memspace;
1832  return ms->num_resources;
1833 }
1834 
1835 omp_memspace_handle_t __kmp_get_submemspace(omp_memspace_handle_t memspace,
1836  int num_resources, int *resources) {
1837  if (!__kmp_init_serial)
1838  __kmp_serial_initialize();
1839  if (memspace == omp_null_mem_space || memspace < kmp_max_mem_space)
1840  return memspace; // return input memory space for predefined memory space
1841  kmp_memspace_t *ms = (kmp_memspace_t *)memspace;
1842  if (num_resources == 0 || ms->num_resources < num_resources || !resources)
1843  return omp_null_mem_space; // input memory space cannot satisfy the request
1844 
1845  // The stored resource ID is an absolute ID only known to the offload backend,
1846  // and the returned memory space will still keep the property.
1847  int *resources_abs = (int *)__kmp_allocate(sizeof(int) * num_resources);
1848 
1849  // Collect absolute resource ID from the relative ID
1850  for (int i = 0; i < num_resources; i++)
1851  resources_abs[i] = ms->resources[resources[i]];
1852 
1853  omp_memspace_handle_t submemspace = __kmp_tgt_memspace_list.get_memspace(
1854  num_resources, resources_abs, memspace);
1855  __kmp_free(resources_abs);
1856 
1857  return submemspace;
1858 }
1859 
// Bookkeeping record describing one allocation made through the OpenMP
// allocator interface.
typedef struct kmp_mem_desc { // Memory block descriptor
  void *ptr_alloc; // Pointer returned by allocator
  size_t size_a; // Size of allocated memory block (initial+descriptor+align)
  size_t size_orig; // Original size requested
  void *ptr_align; // Pointer to aligned memory, returned
  kmp_allocator_t *allocator; // allocator
} kmp_mem_desc_t;
// File-scope alignment constant, equal to SizeQuant.
// NOTE(review): presumably the default alignment applied when the caller does
// not request one -- confirm against its uses in __kmp_alloc().
constexpr size_t alignment = SizeQuant;
1868 
1869 // external interfaces are wrappers over internal implementation
1870 void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
1871  KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator));
1872  void *ptr = __kmp_alloc(gtid, 0, size, allocator);
1873  KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", ptr, gtid));
1874  return ptr;
1875 }
1876 
1877 void *__kmpc_aligned_alloc(int gtid, size_t algn, size_t size,
1878  omp_allocator_handle_t allocator) {
1879  KE_TRACE(25, ("__kmpc_aligned_alloc: T#%d (%d, %d, %p)\n", gtid, (int)algn,
1880  (int)size, allocator));
1881  void *ptr = __kmp_alloc(gtid, algn, size, allocator);
1882  KE_TRACE(25, ("__kmpc_aligned_alloc returns %p, T#%d\n", ptr, gtid));
1883  return ptr;
1884 }
1885 
1886 void *__kmpc_calloc(int gtid, size_t nmemb, size_t size,
1887  omp_allocator_handle_t allocator) {
1888  KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb,
1889  (int)size, allocator));
1890  void *ptr = __kmp_calloc(gtid, 0, nmemb, size, allocator);
1891  KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid));
1892  return ptr;
1893 }
1894 
1895 void *__kmpc_realloc(int gtid, void *ptr, size_t size,
1896  omp_allocator_handle_t allocator,
1897  omp_allocator_handle_t free_allocator) {
1898  KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size,
1899  allocator, free_allocator));
1900  void *nptr = __kmp_realloc(gtid, ptr, size, allocator, free_allocator);
1901  KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid));
1902  return nptr;
1903 }
1904 
1905 void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
1906  KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator));
1907  ___kmpc_free(gtid, ptr, allocator);
1908  KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, ptr, allocator));
1909  return;
1910 }
1911 
1912 // internal implementation, called from inside the library
1913 void *__kmp_alloc(int gtid, size_t algn, size_t size,
1914  omp_allocator_handle_t allocator) {
1915  void *ptr = NULL;
1916  kmp_allocator_t *al;
1917  KMP_DEBUG_ASSERT(__kmp_init_serial);
1918  if (size == 0)
1919  return NULL;
1920  if (allocator == omp_null_allocator)
1921  allocator = __kmp_threads[gtid]->th.th_def_allocator;
1922  kmp_int32 default_device =
1923  __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1924 
1925  al = RCAST(kmp_allocator_t *, allocator);
1926 
1927  int sz_desc = sizeof(kmp_mem_desc_t);
1928  kmp_mem_desc_t desc;
1929  kmp_uintptr_t addr; // address returned by allocator
1930  kmp_uintptr_t addr_align; // address to return to caller
1931  kmp_uintptr_t addr_descr; // address of memory block descriptor
1932  size_t align = alignment; // default alignment
1933  if (allocator > kmp_max_mem_alloc && al->alignment > align)
1934  align = al->alignment; // alignment required by allocator trait
1935  if (align < algn)
1936  align = algn; // max of allocator trait, parameter and sizeof(void*)
1937  desc.size_orig = size;
1938  desc.size_a = size + sz_desc + align;
1939  bool is_pinned = false;
1940  if (allocator > kmp_max_mem_alloc)
1941  is_pinned = al->pinned;
1942 
1943  // Use default allocator if hwloc and libmemkind are not available
1944  int use_default_allocator =
1945  (!__kmp_hwloc_available && !__kmp_memkind_available);
1946 
1947  if (al > kmp_max_mem_alloc && al->memspace > kmp_max_mem_space) {
1948  // Memspace has been allocated for targets.
1949  return __kmp_tgt_allocator.omp_alloc(size, allocator);
1950  }
1951 
1952  if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {
1953  // Use size input directly as the memory may not be accessible on host.
1954  // Use default device for now.
1955  if (__kmp_target_mem_available) {
1956  kmp_int32 device =
1957  __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1958  if (allocator == llvm_omp_target_host_mem_alloc)
1959  ptr = kmp_target_alloc_host(size, device);
1960  else if (allocator == llvm_omp_target_shared_mem_alloc)
1961  ptr = kmp_target_alloc_shared(size, device);
1962  else // allocator == llvm_omp_target_device_mem_alloc
1963  ptr = kmp_target_alloc_device(size, device);
1964  return ptr;
1965  } else {
1966  KMP_INFORM(TargetMemNotAvailable);
1967  }
1968  }
1969 
1970  if (allocator >= kmp_max_mem_alloc && KMP_IS_TARGET_MEM_SPACE(al->memspace)) {
1971  if (__kmp_target_mem_available) {
1972  kmp_int32 device =
1973  __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1974  if (al->memspace == llvm_omp_target_host_mem_space)
1975  ptr = kmp_target_alloc_host(size, device);
1976  else if (al->memspace == llvm_omp_target_shared_mem_space)
1977  ptr = kmp_target_alloc_shared(size, device);
1978  else // al->memspace == llvm_omp_target_device_mem_space
1979  ptr = kmp_target_alloc_device(size, device);
1980  return ptr;
1981  } else {
1982  KMP_INFORM(TargetMemNotAvailable);
1983  }
1984  }
1985 
1986 #if KMP_HWLOC_ENABLED
1987  if (__kmp_hwloc_available) {
1988  if (__kmp_is_hwloc_membind_supported(HWLOC_MEMBIND_BIND)) {
1989  if (allocator < kmp_max_mem_alloc) {
1990  // pre-defined allocator
1991  if (allocator == omp_high_bw_mem_alloc) {
1992  ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_BANDWIDTH,
1993  desc.size_a, HWLOC_MEMBIND_BIND);
1994  if (ptr == NULL)
1995  use_default_allocator = true;
1996  } else if (allocator == omp_large_cap_mem_alloc) {
1997  ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_CAPACITY,
1998  desc.size_a, HWLOC_MEMBIND_BIND);
1999  if (ptr == NULL)
2000  use_default_allocator = true;
2001  } else {
2002  use_default_allocator = true;
2003  }
2004  if (use_default_allocator) {
2005  ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2006  }
2007  } else if (al->pool_size > 0) {
2008  // custom allocator with pool size requested
2009  kmp_uint64 used =
2010  KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
2011  if (used + desc.size_a > al->pool_size) {
2012  // not enough space, need to go fallback path
2013  KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2014  if (al->fb == omp_atv_default_mem_fb) {
2015  al = (kmp_allocator_t *)omp_default_mem_alloc;
2016  ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2017  } else if (al->fb == omp_atv_abort_fb) {
2018  KMP_ASSERT(0); // abort fallback requested
2019  } else if (al->fb == omp_atv_allocator_fb) {
2020  KMP_ASSERT(al != al->fb_data);
2021  al = al->fb_data;
2022  return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2023  } // else ptr == NULL;
2024  } else {
2025  // pool has enough space
2026  if (al->membind == omp_atv_interleaved) {
2027  if (__kmp_is_hwloc_membind_supported(HWLOC_MEMBIND_INTERLEAVE)) {
2028  ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a,
2029  HWLOC_MEMBIND_INTERLEAVE);
2030  }
2031  } else if (al->membind == omp_atv_environment) {
2032  ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a,
2033  HWLOC_MEMBIND_DEFAULT);
2034  } else {
2035  ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2036  }
2037  if (ptr == NULL) {
2038  if (al->fb == omp_atv_default_mem_fb) {
2039  al = (kmp_allocator_t *)omp_default_mem_alloc;
2040  ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2041  } else if (al->fb == omp_atv_abort_fb) {
2042  KMP_ASSERT(0); // abort fallback requested
2043  } else if (al->fb == omp_atv_allocator_fb) {
2044  KMP_ASSERT(al != al->fb_data);
2045  al = al->fb_data;
2046  return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2047  }
2048  }
2049  }
2050  } else {
2051  // custom allocator, pool size not requested
2052  if (al->membind == omp_atv_interleaved) {
2053  if (__kmp_is_hwloc_membind_supported(HWLOC_MEMBIND_INTERLEAVE)) {
2054  ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a,
2055  HWLOC_MEMBIND_INTERLEAVE);
2056  }
2057  } else if (al->membind == omp_atv_environment) {
2058  ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a,
2059  HWLOC_MEMBIND_DEFAULT);
2060  } else {
2061  ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2062  }
2063  if (ptr == NULL) {
2064  if (al->fb == omp_atv_default_mem_fb) {
2065  al = (kmp_allocator_t *)omp_default_mem_alloc;
2066  ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2067  } else if (al->fb == omp_atv_abort_fb) {
2068  KMP_ASSERT(0); // abort fallback requested
2069  } else if (al->fb == omp_atv_allocator_fb) {
2070  KMP_ASSERT(al != al->fb_data);
2071  al = al->fb_data;
2072  return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2073  }
2074  }
2075  }
2076  } else { // alloc membind not supported, use hwloc_alloc
2077  ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2078  }
2079  } else {
2080 #endif // KMP_HWLOC_ENABLED
2081  if (__kmp_memkind_available) {
2082  if (allocator < kmp_max_mem_alloc) {
2083  // pre-defined allocator
2084  if (allocator == omp_high_bw_mem_alloc && mk_hbw_preferred) {
2085  ptr = kmp_mk_alloc(*mk_hbw_preferred, desc.size_a);
2086  } else if (allocator == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
2087  ptr = kmp_mk_alloc(*mk_dax_kmem_all, desc.size_a);
2088  } else {
2089  ptr = kmp_mk_alloc(*mk_default, desc.size_a);
2090  }
2091  } else if (al->pool_size > 0) {
2092  // custom allocator with pool size requested
2093  kmp_uint64 used =
2094  KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
2095  if (used + desc.size_a > al->pool_size) {
2096  // not enough space, need to go fallback path
2097  KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2098  if (al->fb == omp_atv_default_mem_fb) {
2099  al = (kmp_allocator_t *)omp_default_mem_alloc;
2100  ptr = kmp_mk_alloc(*mk_default, desc.size_a);
2101  } else if (al->fb == omp_atv_abort_fb) {
2102  KMP_ASSERT(0); // abort fallback requested
2103  } else if (al->fb == omp_atv_allocator_fb) {
2104  KMP_ASSERT(al != al->fb_data);
2105  al = al->fb_data;
2106  ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2107  if (is_pinned && kmp_target_lock_mem)
2108  kmp_target_lock_mem(ptr, size, default_device);
2109  return ptr;
2110  } // else ptr == NULL;
2111  } else {
2112  // pool has enough space
2113  ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
2114  if (ptr == NULL) {
2115  if (al->fb == omp_atv_default_mem_fb) {
2116  al = (kmp_allocator_t *)omp_default_mem_alloc;
2117  ptr = kmp_mk_alloc(*mk_default, desc.size_a);
2118  } else if (al->fb == omp_atv_abort_fb) {
2119  KMP_ASSERT(0); // abort fallback requested
2120  } else if (al->fb == omp_atv_allocator_fb) {
2121  KMP_ASSERT(al != al->fb_data);
2122  al = al->fb_data;
2123  ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2124  if (is_pinned && kmp_target_lock_mem)
2125  kmp_target_lock_mem(ptr, size, default_device);
2126  return ptr;
2127  }
2128  }
2129  }
2130  } else {
2131  // custom allocator, pool size not requested
2132  ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
2133  if (ptr == NULL) {
2134  if (al->fb == omp_atv_default_mem_fb) {
2135  al = (kmp_allocator_t *)omp_default_mem_alloc;
2136  ptr = kmp_mk_alloc(*mk_default, desc.size_a);
2137  } else if (al->fb == omp_atv_abort_fb) {
2138  KMP_ASSERT(0); // abort fallback requested
2139  } else if (al->fb == omp_atv_allocator_fb) {
2140  KMP_ASSERT(al != al->fb_data);
2141  al = al->fb_data;
2142  ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2143  if (is_pinned && kmp_target_lock_mem)
2144  kmp_target_lock_mem(ptr, size, default_device);
2145  return ptr;
2146  }
2147  }
2148  }
2149  } else if (allocator < kmp_max_mem_alloc) {
2150  // pre-defined allocator
2151  if (allocator == omp_high_bw_mem_alloc) {
2152  KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc");
2153  } else if (allocator == omp_large_cap_mem_alloc) {
2154  KMP_WARNING(OmpNoAllocator, "omp_large_cap_mem_alloc");
2155  } else if (allocator == omp_const_mem_alloc) {
2156  KMP_WARNING(OmpNoAllocator, "omp_const_mem_alloc");
2157  } else if (allocator == omp_low_lat_mem_alloc) {
2158  KMP_WARNING(OmpNoAllocator, "omp_low_lat_mem_alloc");
2159  } else if (allocator == omp_cgroup_mem_alloc) {
2160  KMP_WARNING(OmpNoAllocator, "omp_cgroup_mem_alloc");
2161  } else if (allocator == omp_pteam_mem_alloc) {
2162  KMP_WARNING(OmpNoAllocator, "omp_pteam_mem_alloc");
2163  } else if (allocator == omp_thread_mem_alloc) {
2164  KMP_WARNING(OmpNoAllocator, "omp_thread_mem_alloc");
2165  } else { // default allocator requested
2166  use_default_allocator = true;
2167  }
2168  if (use_default_allocator) {
2169  ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
2170  use_default_allocator = false;
2171  }
2172  } else if (al->pool_size > 0) {
2173  // custom allocator with pool size requested
2174  kmp_uint64 used =
2175  KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
2176  if (used + desc.size_a > al->pool_size) {
2177  // not enough space, need to go fallback path
2178  KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2179  if (al->fb == omp_atv_default_mem_fb) {
2180  al = (kmp_allocator_t *)omp_default_mem_alloc;
2181  ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
2182  } else if (al->fb == omp_atv_abort_fb) {
2183  KMP_ASSERT(0); // abort fallback requested
2184  } else if (al->fb == omp_atv_allocator_fb) {
2185  KMP_ASSERT(al != al->fb_data);
2186  al = al->fb_data;
2187  ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2188  if (is_pinned && kmp_target_lock_mem)
2189  kmp_target_lock_mem(ptr, size, default_device);
2190  return ptr;
2191  } // else ptr == NULL
2192  } else {
2193  // pool has enough space
2194  ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
2195  if (ptr == NULL && al->fb == omp_atv_abort_fb) {
2196  KMP_ASSERT(0); // abort fallback requested
2197  } // no sense to look for another fallback because of same internal
2198  // alloc
2199  }
2200  } else {
2201  // custom allocator, pool size not requested
2202  ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
2203  if (ptr == NULL && al->fb == omp_atv_abort_fb) {
2204  KMP_ASSERT(0); // abort fallback requested
2205  } // no sense to look for another fallback because of same internal alloc
2206  }
2207 #if KMP_HWLOC_ENABLED
2208  }
2209 #endif // KMP_HWLOC_ENABLED
2210  KE_TRACE(10, ("__kmp_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a));
2211  if (ptr == NULL)
2212  return NULL;
2213 
2214  if (is_pinned && kmp_target_lock_mem)
2215  kmp_target_lock_mem(ptr, desc.size_a, default_device);
2216 
2217  addr = (kmp_uintptr_t)ptr;
2218  addr_align = (addr + sz_desc + align - 1) & ~(align - 1);
2219  addr_descr = addr_align - sz_desc;
2220 
2221  desc.ptr_alloc = ptr;
2222  desc.ptr_align = (void *)addr_align;
2223  desc.allocator = al;
2224  *((kmp_mem_desc_t *)addr_descr) = desc; // save descriptor contents
2225  KMP_MB();
2226 
2227  return desc.ptr_align;
2228 }
2229 
2230 void *__kmp_calloc(int gtid, size_t algn, size_t nmemb, size_t size,
2231  omp_allocator_handle_t allocator) {
2232  void *ptr = NULL;
2233  kmp_allocator_t *al;
2234  KMP_DEBUG_ASSERT(__kmp_init_serial);
2235 
2236  if (allocator == omp_null_allocator)
2237  allocator = __kmp_threads[gtid]->th.th_def_allocator;
2238 
2239  al = RCAST(kmp_allocator_t *, allocator);
2240 
2241  if (nmemb == 0 || size == 0)
2242  return ptr;
2243 
2244  if ((SIZE_MAX - sizeof(kmp_mem_desc_t)) / size < nmemb) {
2245  if (al->fb == omp_atv_abort_fb) {
2246  KMP_ASSERT(0);
2247  }
2248  return ptr;
2249  }
2250 
2251  ptr = __kmp_alloc(gtid, algn, nmemb * size, allocator);
2252 
2253  if (ptr) {
2254  memset(ptr, 0x00, nmemb * size);
2255  }
2256  return ptr;
2257 }
2258 
2259 void *__kmp_realloc(int gtid, void *ptr, size_t size,
2260  omp_allocator_handle_t allocator,
2261  omp_allocator_handle_t free_allocator) {
2262  void *nptr = NULL;
2263  KMP_DEBUG_ASSERT(__kmp_init_serial);
2264 
2265  if (size == 0) {
2266  if (ptr != NULL)
2267  ___kmpc_free(gtid, ptr, free_allocator);
2268  return nptr;
2269  }
2270 
2271  nptr = __kmp_alloc(gtid, 0, size, allocator);
2272 
2273  if (nptr != NULL && ptr != NULL) {
2274  kmp_mem_desc_t desc;
2275  kmp_uintptr_t addr_align; // address to return to caller
2276  kmp_uintptr_t addr_descr; // address of memory block descriptor
2277 
2278  addr_align = (kmp_uintptr_t)ptr;
2279  addr_descr = addr_align - sizeof(kmp_mem_desc_t);
2280  desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor
2281 
2282  KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
2283  KMP_DEBUG_ASSERT(desc.size_orig > 0);
2284  KMP_DEBUG_ASSERT(desc.size_orig < desc.size_a);
2285  KMP_MEMCPY((char *)nptr, (char *)ptr,
2286  (size_t)((size < desc.size_orig) ? size : desc.size_orig));
2287  }
2288 
2289  if (nptr != NULL) {
2290  ___kmpc_free(gtid, ptr, free_allocator);
2291  }
2292 
2293  return nptr;
2294 }
2295 
2296 void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
2297  if (ptr == NULL)
2298  return;
2299 
2300  kmp_allocator_t *al;
2301  omp_allocator_handle_t oal;
2302  al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator));
2303  kmp_mem_desc_t desc;
2304  kmp_uintptr_t addr_align; // address to return to caller
2305  kmp_uintptr_t addr_descr; // address of memory block descriptor
2306 
2307  if (al > kmp_max_mem_alloc && al->memspace > kmp_max_mem_space) {
2308  __kmp_tgt_allocator.omp_free(ptr, allocator);
2309  return;
2310  }
2311 
2312  if (__kmp_target_mem_available && (KMP_IS_TARGET_MEM_ALLOC(allocator) ||
2313  (allocator > kmp_max_mem_alloc &&
2314  KMP_IS_TARGET_MEM_SPACE(al->memspace)))) {
2315  kmp_int32 device =
2316  __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
2317  if (allocator == llvm_omp_target_host_mem_alloc) {
2318  kmp_target_free_host(ptr, device);
2319  } else if (allocator == llvm_omp_target_shared_mem_alloc) {
2320  kmp_target_free_shared(ptr, device);
2321  } else if (allocator == llvm_omp_target_device_mem_alloc) {
2322  kmp_target_free_device(ptr, device);
2323  }
2324  return;
2325  }
2326 
2327  addr_align = (kmp_uintptr_t)ptr;
2328  addr_descr = addr_align - sizeof(kmp_mem_desc_t);
2329  desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor
2330 
2331  KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
2332  if (allocator) {
2333  KMP_DEBUG_ASSERT(desc.allocator == al || desc.allocator == al->fb_data);
2334  }
2335  al = desc.allocator;
2336  oal = (omp_allocator_handle_t)al; // cast to void* for comparisons
2337  KMP_DEBUG_ASSERT(al);
2338 
2339  if (allocator > kmp_max_mem_alloc && kmp_target_unlock_mem && al->pinned) {
2340  kmp_int32 device =
2341  __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
2342  kmp_target_unlock_mem(desc.ptr_alloc, device);
2343  }
2344 
2345 #if KMP_HWLOC_ENABLED
2346  if (__kmp_hwloc_available) {
2347  if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
2348  kmp_uint64 used =
2349  KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2350  (void)used; // to suppress compiler warning
2351  KMP_DEBUG_ASSERT(used >= desc.size_a);
2352  }
2353  hwloc_free(__kmp_hwloc_topology, desc.ptr_alloc, desc.size_a);
2354  } else {
2355 #endif // KMP_HWLOC_ENABLED
2356  if (__kmp_memkind_available) {
2357  if (oal < kmp_max_mem_alloc) {
2358  // pre-defined allocator
2359  if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) {
2360  kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc);
2361  } else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
2362  kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc);
2363  } else {
2364  kmp_mk_free(*mk_default, desc.ptr_alloc);
2365  }
2366  } else {
2367  if (al->pool_size > 0) { // custom allocator with pool size requested
2368  kmp_uint64 used =
2369  KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2370  (void)used; // to suppress compiler warning
2371  KMP_DEBUG_ASSERT(used >= desc.size_a);
2372  }
2373  kmp_mk_free(*al->memkind, desc.ptr_alloc);
2374  }
2375  } else {
2376  if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
2377  kmp_uint64 used =
2378  KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2379  (void)used; // to suppress compiler warning
2380  KMP_DEBUG_ASSERT(used >= desc.size_a);
2381  }
2382  __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
2383  }
2384 #if KMP_HWLOC_ENABLED
2385  }
2386 #endif // KMP_HWLOC_ENABLED
2387 }
2388 
2389 /* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
2390  memory leaks, but it may be useful for debugging memory corruptions, used
2391  freed pointers, etc. */
2392 /* #define LEAK_MEMORY */
// Memory block descriptor written in front of every aligned block.
typedef struct kmp_mem_descr {
  // Pointer returned by malloc(); this is what has to be passed to free().
  void *ptr_allocated;
  // Size of the whole allocated memory block.
  size_t size_allocated;
  // Pointer to the aligned memory that client code uses.
  void *ptr_aligned;
  // Size of the aligned memory block.
  size_t size_aligned;
} kmp_mem_descr_t;
2400 
2401 /* Allocate memory on requested boundary, fill allocated memory with 0x00.
2402  NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
2403  error. Must use __kmp_free when freeing memory allocated by this routine! */
2404 static void *___kmp_allocate_align(size_t size,
2405  size_t alignment KMP_SRC_LOC_DECL) {
2406  /* __kmp_allocate() allocates (by call to malloc()) bigger memory block than
2407  requested to return properly aligned pointer. Original pointer returned
2408  by malloc() and size of allocated block is saved in descriptor just
2409  before the aligned pointer. This information used by __kmp_free() -- it
2410  has to pass to free() original pointer, not aligned one.
2411 
2412  +---------+------------+-----------------------------------+---------+
2413  | padding | descriptor | aligned block | padding |
2414  +---------+------------+-----------------------------------+---------+
2415  ^ ^
2416  | |
2417  | +- Aligned pointer returned to caller
2418  +- Pointer returned by malloc()
2419 
2420  Aligned block is filled with zeros, paddings are filled with 0xEF. */
2421 
2422  kmp_mem_descr_t descr;
2423  kmp_uintptr_t addr_allocated; // Address returned by malloc().
2424  kmp_uintptr_t addr_aligned; // Aligned address to return to caller.
2425  kmp_uintptr_t addr_descr; // Address of memory block descriptor.
2426 
2427  KE_TRACE(25, ("-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
2428  (int)size, (int)alignment KMP_SRC_LOC_PARM));
2429 
2430  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too
2431  KMP_DEBUG_ASSERT(sizeof(void *) <= sizeof(kmp_uintptr_t));
2432  // Make sure kmp_uintptr_t is enough to store addresses.
2433 
2434  descr.size_aligned = size;
2435  descr.size_allocated =
2436  descr.size_aligned + sizeof(kmp_mem_descr_t) + alignment;
2437 
2438 #if KMP_DEBUG
2439  descr.ptr_allocated = _malloc_src_loc(descr.size_allocated, _file_, _line_);
2440 #else
2441  descr.ptr_allocated = malloc_src_loc(descr.size_allocated KMP_SRC_LOC_PARM);
2442 #endif
2443  KE_TRACE(10, (" malloc( %d ) returned %p\n", (int)descr.size_allocated,
2444  descr.ptr_allocated));
2445  if (descr.ptr_allocated == NULL) {
2446  KMP_FATAL(OutOfHeapMemory);
2447  }
2448 
2449  addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
2450  addr_aligned =
2451  (addr_allocated + sizeof(kmp_mem_descr_t) + alignment) & ~(alignment - 1);
2452  addr_descr = addr_aligned - sizeof(kmp_mem_descr_t);
2453 
2454  descr.ptr_aligned = (void *)addr_aligned;
2455 
2456  KE_TRACE(26, (" ___kmp_allocate_align: "
2457  "ptr_allocated=%p, size_allocated=%d, "
2458  "ptr_aligned=%p, size_aligned=%d\n",
2459  descr.ptr_allocated, (int)descr.size_allocated,
2460  descr.ptr_aligned, (int)descr.size_aligned));
2461 
2462  KMP_DEBUG_ASSERT(addr_allocated <= addr_descr);
2463  KMP_DEBUG_ASSERT(addr_descr + sizeof(kmp_mem_descr_t) == addr_aligned);
2464  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
2465  addr_allocated + descr.size_allocated);
2466  KMP_DEBUG_ASSERT(addr_aligned % alignment == 0);
2467 #ifdef KMP_DEBUG
2468  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
2469 // Fill allocated memory block with 0xEF.
2470 #endif
2471  memset(descr.ptr_aligned, 0x00, descr.size_aligned);
2472  // Fill the aligned memory block (which is intended for using by caller) with
2473  // 0x00. Do not
2474  // put this filling under KMP_DEBUG condition! Many callers expect zeroed
2475  // memory. (Padding
2476  // bytes remain filled with 0xEF in debugging library.)
2477  *((kmp_mem_descr_t *)addr_descr) = descr;
2478 
2479  KMP_MB();
2480 
2481  KE_TRACE(25, ("<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned));
2482  return descr.ptr_aligned;
2483 } // func ___kmp_allocate_align
2484 
2485 /* Allocate memory on cache line boundary, fill allocated memory with 0x00.
2486  Do not call this func directly! Use __kmp_allocate macro instead.
2487  NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
2488  error. Must use __kmp_free when freeing memory allocated by this routine! */
2489 void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL) {
2490  void *ptr;
2491  KE_TRACE(25, ("-> __kmp_allocate( %d ) called from %s:%d\n",
2492  (int)size KMP_SRC_LOC_PARM));
2493  ptr = ___kmp_allocate_align(size, __kmp_align_alloc KMP_SRC_LOC_PARM);
2494  KE_TRACE(25, ("<- __kmp_allocate() returns %p\n", ptr));
2495  return ptr;
2496 } // func ___kmp_allocate
2497 
2498 /* Allocate memory on page boundary, fill allocated memory with 0x00.
2499  Does not call this func directly! Use __kmp_page_allocate macro instead.
2500  NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
2501  error. Must use __kmp_free when freeing memory allocated by this routine! */
2502 void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) {
2503  int page_size = 8 * 1024;
2504  void *ptr;
2505 
2506  KE_TRACE(25, ("-> __kmp_page_allocate( %d ) called from %s:%d\n",
2507  (int)size KMP_SRC_LOC_PARM));
2508  ptr = ___kmp_allocate_align(size, page_size KMP_SRC_LOC_PARM);
2509  KE_TRACE(25, ("<- __kmp_page_allocate( %d ) returns %p\n", (int)size, ptr));
2510  return ptr;
2511 } // ___kmp_page_allocate
2512 
2513 /* Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
2514  In debug mode, fill the memory block with 0xEF before call to free(). */
2515 void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
2516  kmp_mem_descr_t descr;
2517 #if KMP_DEBUG
2518  kmp_uintptr_t addr_allocated; // Address returned by malloc().
2519  kmp_uintptr_t addr_aligned; // Aligned address passed by caller.
2520 #endif
2521  KE_TRACE(25,
2522  ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM));
2523  KMP_ASSERT(ptr != NULL);
2524 
2525  descr = *(kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t));
2526 
2527  KE_TRACE(26, (" __kmp_free: "
2528  "ptr_allocated=%p, size_allocated=%d, "
2529  "ptr_aligned=%p, size_aligned=%d\n",
2530  descr.ptr_allocated, (int)descr.size_allocated,
2531  descr.ptr_aligned, (int)descr.size_aligned));
2532 #if KMP_DEBUG
2533  addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
2534  addr_aligned = (kmp_uintptr_t)descr.ptr_aligned;
2535  KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0);
2536  KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr);
2537  KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned);
2538  KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated);
2539  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
2540  addr_allocated + descr.size_allocated);
2541  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
2542 // Fill memory block with 0xEF, it helps catch using freed memory.
2543 #endif
2544 
2545 #ifndef LEAK_MEMORY
2546  KE_TRACE(10, (" free( %p )\n", descr.ptr_allocated));
2547 #ifdef KMP_DEBUG
2548  _free_src_loc(descr.ptr_allocated, _file_, _line_);
2549 #else
2550  free_src_loc(descr.ptr_allocated KMP_SRC_LOC_PARM);
2551 #endif
2552 #endif
2553  KMP_MB();
2554  KE_TRACE(25, ("<- __kmp_free() returns\n"));
2555 } // func ___kmp_free
2556 
2557 #if USE_FAST_MEMORY == 3
2558 // Allocate fast memory by first scanning the thread's free lists
2559 // If a chunk the right size exists, grab it off the free list.
2560 // Otherwise allocate normally using kmp_thread_malloc.
2561 
2562 // AC: How to choose the limit? Just get 16 for now...
2563 #define KMP_FREE_LIST_LIMIT 16
2564 
2565 // Always use 128 bytes for determining buckets for caching memory blocks
2566 #define DCACHE_LINE 128
2567 
2568 void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
2569  void *ptr;
2570  size_t num_lines, idx;
2571  int index;
2572  void *alloc_ptr;
2573  size_t alloc_size;
2574  kmp_mem_descr_t *descr;
2575 
2576  KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
2577  __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM));
2578 
2579  num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE;
2580  idx = num_lines - 1;
2581  KMP_DEBUG_ASSERT(idx >= 0);
2582  if (idx < 2) {
2583  index = 0; // idx is [ 0, 1 ], use first free list
2584  num_lines = 2; // 1, 2 cache lines or less than cache line
2585  } else if ((idx >>= 2) == 0) {
2586  index = 1; // idx is [ 2, 3 ], use second free list
2587  num_lines = 4; // 3, 4 cache lines
2588  } else if ((idx >>= 2) == 0) {
2589  index = 2; // idx is [ 4, 15 ], use third free list
2590  num_lines = 16; // 5, 6, ..., 16 cache lines
2591  } else if ((idx >>= 2) == 0) {
2592  index = 3; // idx is [ 16, 63 ], use fourth free list
2593  num_lines = 64; // 17, 18, ..., 64 cache lines
2594  } else {
2595  goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
2596  }
2597 
2598  ptr = this_thr->th.th_free_lists[index].th_free_list_self;
2599  if (ptr != NULL) {
2600  // pop the head of no-sync free list
2601  this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
2602  KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
2603  sizeof(kmp_mem_descr_t)))
2604  ->ptr_aligned);
2605  goto end;
2606  }
2607  ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
2608  if (ptr != NULL) {
2609  // no-sync free list is empty, use sync free list (filled in by other
2610  // threads only)
2611  // pop the head of the sync free list, push NULL instead
2612  while (!KMP_COMPARE_AND_STORE_PTR(
2613  &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) {
2614  KMP_CPU_PAUSE();
2615  ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
2616  }
2617  // push the rest of chain into no-sync free list (can be NULL if there was
2618  // the only block)
2619  this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
2620  KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
2621  sizeof(kmp_mem_descr_t)))
2622  ->ptr_aligned);
2623  goto end;
2624  }
2625 
2626 alloc_call:
2627  // haven't found block in the free lists, thus allocate it
2628  size = num_lines * DCACHE_LINE;
2629 
2630  alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE;
2631  KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with "
2632  "alloc_size %d\n",
2633  __kmp_gtid_from_thread(this_thr), alloc_size));
2634  alloc_ptr = bget(this_thr, (bufsize)alloc_size);
2635 
2636  // align ptr to DCACHE_LINE
2637  ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) +
2638  DCACHE_LINE) &
2639  ~(DCACHE_LINE - 1));
2640  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
2641 
2642  descr->ptr_allocated = alloc_ptr; // remember allocated pointer
2643  // we don't need size_allocated
2644  descr->ptr_aligned = (void *)this_thr; // remember allocating thread
2645  // (it is already saved in bget buffer,
2646  // but we may want to use another allocator in future)
2647  descr->size_aligned = size;
2648 
2649 end:
2650  KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n",
2651  __kmp_gtid_from_thread(this_thr), ptr));
2652  return ptr;
2653 } // func __kmp_fast_allocate
2654 
2655 // Free fast memory and place it on the thread's free list if it is of
2656 // the correct size.
2657 void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) {
2658  kmp_mem_descr_t *descr;
2659  kmp_info_t *alloc_thr;
2660  size_t size;
2661  size_t idx;
2662  int index;
2663 
2664  KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
2665  __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM));
2666  KMP_ASSERT(ptr != NULL);
2667 
2668  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
2669 
2670  KE_TRACE(26, (" __kmp_fast_free: size_aligned=%d\n",
2671  (int)descr->size_aligned));
2672 
2673  size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines
2674 
2675  idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block
2676  if (idx == size) {
2677  index = 0; // 2 cache lines
2678  } else if ((idx <<= 1) == size) {
2679  index = 1; // 4 cache lines
2680  } else if ((idx <<= 2) == size) {
2681  index = 2; // 16 cache lines
2682  } else if ((idx <<= 2) == size) {
2683  index = 3; // 64 cache lines
2684  } else {
2685  KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64);
2686  goto free_call; // 65 or more cache lines ( > 8KB )
2687  }
2688 
2689  alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
2690  if (alloc_thr == this_thr) {
2691  // push block to self no-sync free list, linking previous head (LIFO)
2692  *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
2693  this_thr->th.th_free_lists[index].th_free_list_self = ptr;
2694  } else {
2695  void *head = this_thr->th.th_free_lists[index].th_free_list_other;
2696  if (head == NULL) {
2697  // Create new free list
2698  this_thr->th.th_free_lists[index].th_free_list_other = ptr;
2699  *((void **)ptr) = NULL; // mark the tail of the list
2700  descr->size_allocated = (size_t)1; // head of the list keeps its length
2701  } else {
2702  // need to check existed "other" list's owner thread and size of queue
2703  kmp_mem_descr_t *dsc =
2704  (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t));
2705  // allocating thread, same for all queue nodes
2706  kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned);
2707  size_t q_sz =
2708  dsc->size_allocated + 1; // new size in case we add current task
2709  if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) {
2710  // we can add current task to "other" list, no sync needed
2711  *((void **)ptr) = head;
2712  descr->size_allocated = q_sz;
2713  this_thr->th.th_free_lists[index].th_free_list_other = ptr;
2714  } else {
2715  // either queue blocks owner is changing or size limit exceeded
2716  // return old queue to allocating thread (q_th) synchronously,
2717  // and start new list for alloc_thr's tasks
2718  void *old_ptr;
2719  void *tail = head;
2720  void *next = *((void **)head);
2721  while (next != NULL) {
2722  KMP_DEBUG_ASSERT(
2723  // queue size should decrease by 1 each step through the list
2724  ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t)))
2725  ->size_allocated +
2726  1 ==
2727  ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t)))
2728  ->size_allocated);
2729  tail = next; // remember tail node
2730  next = *((void **)next);
2731  }
2732  KMP_DEBUG_ASSERT(q_th != NULL);
2733  // push block to owner's sync free list
2734  old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
2735  /* the next pointer must be set before setting free_list to ptr to avoid
2736  exposing a broken list to other threads, even for an instant. */
2737  *((void **)tail) = old_ptr;
2738 
2739  while (!KMP_COMPARE_AND_STORE_PTR(
2740  &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) {
2741  KMP_CPU_PAUSE();
2742  old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
2743  *((void **)tail) = old_ptr;
2744  }
2745 
2746  // start new list of not-selt tasks
2747  this_thr->th.th_free_lists[index].th_free_list_other = ptr;
2748  *((void **)ptr) = NULL;
2749  descr->size_allocated = (size_t)1; // head of queue keeps its length
2750  }
2751  }
2752  }
2753  goto end;
2754 
2755 free_call:
2756  KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n",
2757  __kmp_gtid_from_thread(this_thr), size));
2758  __kmp_bget_dequeue(this_thr); /* Release any queued buffers */
2759  brel(this_thr, descr->ptr_allocated);
2760 
2761 end:
2762  KE_TRACE(25, ("<- __kmp_fast_free() returns\n"));
2763 
2764 } // func __kmp_fast_free
2765 
2766 // Initialize the thread free lists related to fast memory
2767 // Only do this when a thread is initially created.
2768 void __kmp_initialize_fast_memory(kmp_info_t *this_thr) {
2769  KE_TRACE(10, ("__kmp_initialize_fast_memory: Called from th %p\n", this_thr));
2770 
2771  memset(this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof(kmp_free_list_t));
2772 }
2773 
2774 // Free the memory in the thread free lists related to fast memory
2775 // Only do this when a thread is being reaped (destroyed).
2776 void __kmp_free_fast_memory(kmp_info_t *th) {
2777  // Suppose we use BGET underlying allocator, walk through its structures...
2778  int bin;
2779  thr_data_t *thr = get_thr_data(th);
2780  void **lst = NULL;
2781 
2782  KE_TRACE(
2783  5, ("__kmp_free_fast_memory: Called T#%d\n", __kmp_gtid_from_thread(th)));
2784 
2785  __kmp_bget_dequeue(th); // Release any queued buffers
2786 
2787  // Dig through free lists and extract all allocated blocks
2788  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
2789  bfhead_t *b = thr->freelist[bin].ql.flink;
2790  while (b != &thr->freelist[bin]) {
2791  if ((kmp_uintptr_t)b->bh.bb.bthr & 1) { // the buffer is allocated address
2792  *((void **)b) =
2793  lst; // link the list (override bthr, but keep flink yet)
2794  lst = (void **)b; // push b into lst
2795  }
2796  b = b->ql.flink; // get next buffer
2797  }
2798  }
2799  while (lst != NULL) {
2800  void *next = *lst;
2801  KE_TRACE(10, ("__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
2802  lst, next, th, __kmp_gtid_from_thread(th)));
2803  (*thr->relfcn)(lst);
2804 #if BufStats
2805  // count blocks to prevent problems in __kmp_finalize_bget()
2806  thr->numprel++; /* Nr of expansion block releases */
2807  thr->numpblk--; /* Total number of blocks */
2808 #endif
2809  lst = (void **)next;
2810  }
2811 
2812  KE_TRACE(
2813  5, ("__kmp_free_fast_memory: Freed T#%d\n", __kmp_gtid_from_thread(th)));
2814 }
2815 
2816 #endif // USE_FAST_MEMORY
void * omp_alloc(size_t size, omp_allocator_handle_t allocator)
Invoke offload runtime's memory allocation routine.
Definition: kmp_alloc.cpp:1331
void init()
Initialize interface with offload runtime.
Definition: kmp_alloc.cpp:1315
int get_mem_resources(int ndevs, const int *devs, int host, omp_memspace_handle_t memspace, int *resources)
Definition: kmp_alloc.cpp:1324
void omp_free(void *ptr, omp_allocator_handle_t allocator)
Invoke offload runtime's memory deallocation routine.
Definition: kmp_alloc.cpp:1337
omp_memspace_handle_t get_memspace(int num_resources, const int *resources, omp_memspace_handle_t parent)
Return sub memory space from the parent memory space.
Definition: kmp_alloc.cpp:1447
omp_memspace_handle_t get(int num_resources, const int *resources, omp_memspace_handle_t memspace)
Definition: kmp_alloc.cpp:1366
kmp_memspace_t * find(int num_resources, const int *resources, omp_memspace_handle_t memspace)
Find memory space that matches the provided input.
Definition: kmp_alloc.cpp:1352
void init()
Initialize memory space list.
Definition: kmp_alloc.cpp:1397
omp_memspace_handle_t get_memspace(int num_devices, const int *devices, int host_access, omp_memspace_handle_t memspace)
Return memory space for the provided input.
Definition: kmp_alloc.cpp:1411
void fini()
Release resources for the memory space list.
Definition: kmp_alloc.cpp:1399
Memory allocator information is shared with offload runtime.
Definition: kmp.h:1104
Memory space information is shared with offload runtime.
Definition: kmp.h:1096