/*-------------------------------------------------------------------------
 *
 * pg_numa.c
 *    Basic NUMA portability routines
 *
 *
 * Copyright (c) 2025, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *    src/port/pg_numa.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"
#include <unistd.h>
#include "port/pg_numa.h"
/*
* At this point we provide support only for Linux thanks to libnuma, but in
* future support for other platforms e.g. Win32 or FreeBSD might be possible
* too. For Win32 NUMA APIs see
* https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
*/
#ifdef USE_LIBNUMA

#include <numa.h>
#include <numaif.h>

/*
 * numa_move_pages() chunk size.  This has to be <= 16 to work around a
 * kernel bug in do_pages_stat() (which processes requests in chunks of
 * DO_PAGES_STAT_CHUNK_NR).  By using the same chunk size, we make it work
 * even on unfixed kernels.
 *
 * 64-bit systems are not affected by the bug, and so use much larger chunks.
 */
#if SIZEOF_SIZE_T == 4
#define NUMA_QUERY_CHUNK_SIZE 16
#else
#define NUMA_QUERY_CHUNK_SIZE 1024
#endif

/* libnuma requires initialization as per numa(3) on Linux */
int
pg_numa_init(void)
{
    int     r = numa_available();

    return r;
}

/*
 * We use the move_pages(2) syscall here, instead of get_mempolicy(2), as the
 * former allows us to batch and query many memory pages in a single system
 * call, which is much faster.
 *
 * We call numa_move_pages() on smaller chunks of the whole array.  The first
 * reason is to work around a kernel bug, but chunking also allows the caller
 * to interrupt the query between calls (with many pointers, processing the
 * whole array can take a long time).
 */
int
pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
{
    unsigned long next = 0;
    int     ret = 0;

    /*
     * Pass the pointers to numa_move_pages() in chunks of at most
     * NUMA_QUERY_CHUNK_SIZE items, to work around a kernel bug in
     * do_pages_stat().
     */
    while (next < count)
    {
        unsigned long count_chunk = Min(count - next,
                                        NUMA_QUERY_CHUNK_SIZE);

        /*
         * Bail out if any of the chunks errors out (ret < 0).  We ignore
         * positive return values (the number of non-migrated pages), since
         * we're not migrating any pages here.
         */
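        /*
         * Passing NULL for the "nodes" argument puts numa_move_pages() in
         * query-only mode: no pages are moved, and for each page its current
         * NUMA node (or a negative errno, e.g. for unmapped pages) is stored
         * in the corresponding status[] slot.
         */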
        ret = numa_move_pages(pid, count_chunk, &pages[next], NULL,
                              &status[next], 0);

        if (ret < 0)
        {
            /* plain error, return as is */
            return ret;
        }

        next += count_chunk;
    }

    /* should have consumed the input array exactly */
    Assert(next == count);

    return 0;
}
int
pg_numa_get_max_node(void)
{
    return numa_max_node();
}

#else

/* Empty wrappers for platforms without NUMA support */
int
pg_numa_init(void)
{
    /* We state that NUMA is not available */
    return -1;
}

int
pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
{
    /* Nothing to query without NUMA support */
    return 0;
}

int
pg_numa_get_max_node(void)
{
    return 0;
}

#endif
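
/*
 * Illustrative usage sketch (kept as a comment; not part of this file's API,
 * and names like "buf_ptr", "os_page_size" and "pages_count" are
 * hypothetical): a caller that asks which NUMA node each page of a buffer
 * resides on.  Passing pid 0 queries the calling process.
 *
 *    void  **pages;
 *    int    *status;
 *    char   *ptr = buf_ptr;
 *
 *    if (pg_numa_init() == -1)
 *        elog(ERROR, "NUMA is not available on this platform");
 *
 *    pages = palloc(sizeof(void *) * pages_count);
 *    status = palloc(sizeof(int) * pages_count);
 *    for (uint64 i = 0; i < pages_count; i++)
 *        pages[i] = ptr + i * os_page_size;
 *
 *    if (pg_numa_query_pages(0, pages_count, pages, status) != 0)
 *        elog(ERROR, "failed NUMA pages inquiry");
 *
 *    (afterwards, status[i] holds the NUMA node of page i, or a negative
 *    errno for pages that could not be queried)
 */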