// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000, 2003 Silicon Graphics, Inc. All rights reserved.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
 * Copyright (c) 2002 NEC Corp.
 * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
 * Copyright (c) 2004 Silicon Graphics, Inc
 *	Russ Anderson <rja@sgi.com>
 *	Jesse Barnes <jbarnes@sgi.com>
 *	Jack Steiner <steiner@sgi.com>
 */

/*
 * Platform initialization for Discontig Memory
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/nodemask.h>
#include <linux/slab.h>
#include <asm/tlb.h>
#include <asm/meminit.h>
#include <asm/numa.h>
#include <asm/sections.h>

/*
 * Track per-node information needed to setup the boot memory allocator, the
 * per-node areas, and the real VM.
 */
struct early_node_data {
	struct ia64_node_data *node_data;
	unsigned long pernode_addr;
	unsigned long pernode_size;
	unsigned long min_pfn;
	unsigned long max_pfn;
};

static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
static nodemask_t memory_less_mask __initdata;

pg_data_t *pgdat_list[MAX_NUMNODES];

/*
 * To prevent cache aliasing effects, align per-node structures so that they
 * start at addresses that are strided by node number.
 */
#define MAX_NODE_ALIGN_OFFSET	(32 * 1024 * 1024)
#define NODEDATA_ALIGN(addr, node)					\
	((((addr) + 1024*1024-1) & ~(1024*1024-1)) +			\
	     (((node)*PERCPU_PAGE_SIZE) & (MAX_NODE_ALIGN_OFFSET - 1)))
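/*
 * For example, node N's per-node area starts
 * (N * PERCPU_PAGE_SIZE) % MAX_NODE_ALIGN_OFFSET bytes past the first 1MB
 * boundary at or above @addr, so different nodes land at distinct offsets.
 */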

/**
 * build_node_maps - callback to setup mem_data structs for each node
 * @start: physical start of range
 * @len: length of range
 * @node: node where this range resides
 *
 * Detect extents of each piece of memory that we wish to
 * treat as a virtually contiguous block (i.e. each node). Each such block
 * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down
 * if necessary.  Any non-existent pages will simply be part of the virtual
 * memmap.
 */
static int __init build_node_maps(unsigned long start, unsigned long len,
				  int node)
{
	unsigned long spfn, epfn, end = start + len;

	epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT;
	spfn = GRANULEROUNDDOWN(start) >> PAGE_SHIFT;

	if (!mem_data[node].min_pfn) {
		mem_data[node].min_pfn = spfn;
		mem_data[node].max_pfn = epfn;
	} else {
		mem_data[node].min_pfn = min(spfn, mem_data[node].min_pfn);
		mem_data[node].max_pfn = max(epfn, mem_data[node].max_pfn);
	}

	return 0;
}

/**
 * early_nr_cpus_node - return number of cpus on a given node
 * @node: node to check
 *
 * Count the number of cpus on @node.  We can't use nr_cpus_node() yet because
 * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
 * called yet.  Note that node 0 will also count all non-existent cpus.
 */
static int early_nr_cpus_node(int node)
{
	int cpu, n = 0;

	for_each_possible_early_cpu(cpu)
		if (node == node_cpuid[cpu].nid)
			n++;

	return n;
}

/**
 * compute_pernodesize - compute size of pernode data
 * @node: the node id.
 */
static unsigned long compute_pernodesize(int node)
{
	unsigned long pernodesize = 0, cpus;

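	/*
	 * The layout sized below must stay in sync with fill_pernode():
	 * per-cpu pages for each cpu on this node, a node-strided pad,
	 * the node's pg_data_t, its ia64_node_data, and a trailing
	 * pg_data_t-sized pad.
	 */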
	cpus = early_nr_cpus_node(node);
	pernodesize += PERCPU_PAGE_SIZE * cpus;
	pernodesize += node * L1_CACHE_BYTES;
	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
	pernodesize = PAGE_ALIGN(pernodesize);
	return pernodesize;
}

/**
 * per_cpu_node_setup - setup per-cpu areas on each node
 * @cpu_data: per-cpu area on this node
 * @node: node to setup
 *
 * Copy the static per-cpu data into the region we just set aside and then
 * setup __per_cpu_offset for each CPU on this node.  Return a pointer to
 * the end of the area.
 */
static void *per_cpu_node_setup(void *cpu_data, int node)
{
#ifdef CONFIG_SMP
	int cpu;

	for_each_possible_early_cpu(cpu) {
		void *src = cpu == 0 ? __cpu0_per_cpu : __phys_per_cpu_start;

		if (node != node_cpuid[cpu].nid)
			continue;

		memcpy(__va(cpu_data), src, __per_cpu_end - __per_cpu_start);
		__per_cpu_offset[cpu] = (char *)__va(cpu_data) -
			__per_cpu_start;

		/*
		 * The percpu area for cpu0 is moved from the __init area
		 * which is set up by head.S and used until this point.
		 * Update ar.k3.  This move ensures that the percpu area
		 * for cpu0 is on the correct node and that its virtual
		 * address isn't insanely far from other percpu areas,
		 * which is important for the congruent percpu allocator.
		 */
		if (cpu == 0)
			ia64_set_kr(IA64_KR_PER_CPU_DATA,
				    (unsigned long)cpu_data -
				    (unsigned long)__per_cpu_start);

		cpu_data += PERCPU_PAGE_SIZE;
	}
#endif
	return cpu_data;
}

#ifdef CONFIG_SMP
/**
 * setup_per_cpu_areas - setup percpu areas
 *
 * Arch code has already allocated and initialized percpu areas.  All
 * this function has to do is to teach the determined layout to the
 * dynamic percpu allocator, which happens to be more complex than
 * creating whole new ones using helpers.
 */
void __init setup_per_cpu_areas(void)
{
	struct pcpu_alloc_info *ai;
	struct pcpu_group_info *gi;
	unsigned int *cpu_map;
	void *base;
	unsigned long base_offset;
	unsigned int cpu;
	ssize_t static_size, reserved_size, dyn_size;
	int node, prev_node, unit, nr_units;

	ai = pcpu_alloc_alloc_info(MAX_NUMNODES, nr_cpu_ids);
	if (!ai)
		panic("failed to allocate pcpu_alloc_info");
	cpu_map = ai->groups[0].cpu_map;

	/* determine base */
	base = (void *)ULONG_MAX;
	for_each_possible_cpu(cpu)
		base = min(base,
			   (void *)(__per_cpu_offset[cpu] + __per_cpu_start));
	base_offset = (void *)__per_cpu_start - base;
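	/*
	 * base is the lowest address at which any cpu's static percpu copy
	 * was placed; base_offset converts a __per_cpu_offset[] value into
	 * an offset from that base for the group descriptors below.
	 */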

	/* build cpu_map, units are grouped by node */
	unit = 0;
	for_each_node(node)
		for_each_possible_cpu(cpu)
			if (node == node_cpuid[cpu].nid)
				cpu_map[unit++] = cpu;
	nr_units = unit;

	/* set basic parameters */
	static_size = __per_cpu_end - __per_cpu_start;
	reserved_size = PERCPU_MODULE_RESERVE;
	dyn_size = PERCPU_PAGE_SIZE - static_size - reserved_size;
	if (dyn_size < 0)
		panic("percpu area overflow static=%zd reserved=%zd\n",
		      static_size, reserved_size);

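	/*
	 * Each cpu gets exactly one PERCPU_PAGE_SIZE unit: the static copy,
	 * the module reserve, and whatever dynamic space remains.
	 */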
	ai->static_size		= static_size;
	ai->reserved_size	= reserved_size;
	ai->dyn_size		= dyn_size;
	ai->unit_size		= PERCPU_PAGE_SIZE;
	ai->atom_size		= PAGE_SIZE;
	ai->alloc_size		= PERCPU_PAGE_SIZE;

	/*
	 * CPUs are put into groups according to node.  Walk cpu_map
	 * and create new groups at node boundaries.
	 */
	prev_node = NUMA_NO_NODE;
	ai->nr_groups = 0;
	for (unit = 0; unit < nr_units; unit++) {
		cpu = cpu_map[unit];
		node = node_cpuid[cpu].nid;

		if (node == prev_node) {
			gi->nr_units++;
			continue;
		}
		prev_node = node;

		gi = &ai->groups[ai->nr_groups++];
		gi->nr_units		= 1;
		gi->base_offset		= __per_cpu_offset[cpu] + base_offset;
		gi->cpu_map		= &cpu_map[unit];
	}

	pcpu_setup_first_chunk(ai, base);
	pcpu_free_alloc_info(ai);
}
#endif

/**
 * fill_pernode - initialize pernode data.
 * @node: the node id.
 * @pernode: physical address of pernode data
 * @pernodesize: size of the pernode data
 */
static void __init fill_pernode(int node, unsigned long pernode,
				unsigned long pernodesize)
{
	void *cpu_data;
	int cpus = early_nr_cpus_node(node);

	mem_data[node].pernode_addr = pernode;
	mem_data[node].pernode_size = pernodesize;
	memset(__va(pernode), 0, pernodesize);

	cpu_data = (void *)pernode;
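	/*
	 * Skip past the per-cpu pages, then stagger the remaining structures
	 * by node * L1_CACHE_BYTES so the pg_data_t and ia64_node_data of
	 * different nodes don't alias in the cache (see compute_pernodesize()).
	 */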
	pernode += PERCPU_PAGE_SIZE * cpus;
	pernode += node * L1_CACHE_BYTES;

	pgdat_list[node] = __va(pernode);
	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));

	mem_data[node].node_data = __va(pernode);
	pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));

	cpu_data = per_cpu_node_setup(cpu_data, node);

	return;
}

/**
 * find_pernode_space - allocate memory for memory map and per-node structures
 * @start: physical start of range
 * @len: length of range
 * @node: node where this range resides
 *
 * This routine reserves space for the per-cpu data struct, the list of
 * pg_data_ts and the per-node data struct.  Each node will have something like
 * the following in the first chunk of addr. space large enough to hold it.
 *
 *    ________________________
 *   |                        |
 *   |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
 *   |    PERCPU_PAGE_SIZE *  |     start and length big enough
 *   |    cpus_on_this_node   |     (node 0 also has entries for all
 *   |                        |      non-existent cpus)
 *   |------------------------|
 *   |   local pg_data_t *    |
 *   |------------------------|
 *   |  local ia64_node_data  |
 *   |------------------------|
 *   |          ???           |
 *   |________________________|
 *
 * Once this space has been set aside, it is reserved from the memblock
 * allocator by reserve_pernode_space() so the real mem maps never reuse it.
 * We could probably move the allocation of the per-cpu and ia64_node_data
 * space outside of this function and use a plain memblock allocation, but
 * doing it here is straightforward and we get the alignments we want so...
 */
static int __init find_pernode_space(unsigned long start, unsigned long len,
				     int node)
{
	unsigned long spfn, epfn;
	unsigned long pernodesize = 0, pernode;

	spfn = start >> PAGE_SHIFT;
	epfn = (start + len) >> PAGE_SHIFT;

	/*
	 * Make sure this memory falls within this node's usable memory
	 * since we may have thrown some away in build_node_maps().
	 */
	if (spfn < mem_data[node].min_pfn || epfn > mem_data[node].max_pfn)
		return 0;

	/* Don't setup this node's local space twice... */
	if (mem_data[node].pernode_addr)
		return 0;

	/*
	 * Calculate total size needed, incl. what's necessary
	 * for good alignment and alias prevention.
	 */
	pernodesize = compute_pernodesize(node);
	pernode = NODEDATA_ALIGN(start, node);

	/* Is this range big enough for what we want to store here? */
	if (start + len > (pernode + pernodesize))
		fill_pernode(node, pernode, pernodesize);

	return 0;
}

/**
 * reserve_pernode_space - reserve memory for per-node space
 *
 * Reserve the space used by the per-node structures in the memblock
 * allocator so that when we actually create the real mem maps we don't
 * use their memory.
 */
static void __init reserve_pernode_space(void)
{
	unsigned long base, size;
	int node;

	for_each_online_node(node) {
		if (node_isset(node, memory_less_mask))
			continue;

		/* Now the per-node space */
		size = mem_data[node].pernode_size;
		base = __pa(mem_data[node].pernode_addr);
		memblock_reserve(base, size);
	}
}

static void scatter_node_data(void)
{
	pg_data_t **dst;
	int node;

	/*
	 * for_each_online_node() can't be used here.  node_online_map is
	 * not yet set for hot-added nodes at this point, because we are
	 * halfway through initialization of the new node's structures.
	 * If for_each_online_node() were used, a new node's pg_data_ptrs
	 * would not be initialized.  Instead, check pgdat_list[].
	 */
	for_each_node(node) {
		if (pgdat_list[node]) {
			dst = LOCAL_DATA_ADDR(pgdat_list[node])->pg_data_ptrs;
			memcpy(dst, pgdat_list, sizeof(pgdat_list));
		}
	}
}

/**
 * initialize_pernode_data - fixup per-cpu & per-node pointers
 *
 * Each node's per-node area has a copy of the global pg_data_t list, so
 * we copy that to each node here, as well as setting the per-cpu pointer
 * to the local node data structure.
 */
static void __init initialize_pernode_data(void)
{
	int cpu, node;

	scatter_node_data();

#ifdef CONFIG_SMP
	/* Set the node_data pointer for each per-cpu struct */
	for_each_possible_early_cpu(cpu) {
		node = node_cpuid[cpu].nid;
		per_cpu(ia64_cpu_info, cpu).node_data =
			mem_data[node].node_data;
	}
#else
	{
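		/*
		 * With !CONFIG_SMP the per-cpu data still lives in the
		 * boot-time copy at __phys_per_cpu_start, so locate cpu0's
		 * cpuinfo by adding ia64_cpu_info's offset within the
		 * per-cpu section to that physical base.
		 */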
		struct cpuinfo_ia64 *cpu0_cpu_info;
		cpu = 0;
		node = node_cpuid[cpu].nid;
		cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start +
			((char *)&ia64_cpu_info - __per_cpu_start));
		cpu0_cpu_info->node_data = mem_data[node].node_data;
	}
#endif /* CONFIG_SMP */
}

/**
 * memory_less_node_alloc - attempt to allocate memory on the best NUMA SLIT
 *	node, but fall back to any other node when the allocation for the
 *	best node fails.
 * @nid: node id
 * @pernodesize: size of this node's pernode data
 */
static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
{
	void *ptr = NULL;
	u8 best = 0xff;
	int bestnode = NUMA_NO_NODE, node, anynode = 0;

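	/*
	 * Pick the node with memory that is closest by SLIT distance;
	 * remember the last node with memory seen as a fallback in case
	 * no distance beats the initial 0xff.
	 */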
	for_each_online_node(node) {
		if (node_isset(node, memory_less_mask))
			continue;
		else if (node_distance(nid, node) < best) {
			best = node_distance(nid, node);
			bestnode = node;
		}
		anynode = node;
	}

	if (bestnode == NUMA_NO_NODE)
		bestnode = anynode;

	ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE,
				     __pa(MAX_DMA_ADDRESS),
				     MEMBLOCK_ALLOC_ACCESSIBLE,
				     bestnode);
	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%lx\n",
		      __func__, pernodesize, PERCPU_PAGE_SIZE, bestnode,
		      __pa(MAX_DMA_ADDRESS));

	return ptr;
}

/**
 * memory_less_nodes - allocate and initialize the pernode information for
 *	CPU-only (memory-less) nodes.
 */
static void __init memory_less_nodes(void)
{
	unsigned long pernodesize;
	void *pernode;
	int node;

	for_each_node_mask(node, memory_less_mask) {
		pernodesize = compute_pernodesize(node);
		pernode = memory_less_node_alloc(node, pernodesize);
		fill_pernode(node, __pa(pernode), pernodesize);
	}

	return;
}

/**
 * find_memory - walk the EFI memory map and set up the boot memory allocator
 *
 * Called early in boot to set up the boot memory allocator, and to
 * allocate the per-cpu and per-node structures.
 */
void __init find_memory(void)
{
	int node;

	reserve_memory();
	efi_memmap_walk(filter_memory, register_active_ranges);

	if (num_online_nodes() == 0) {
		printk(KERN_ERR "node info missing!\n");
		node_set_online(0);
	}

	nodes_or(memory_less_mask, memory_less_mask, node_online_map);
	min_low_pfn = -1;
	max_low_pfn = 0;

	/* These actually end up getting called by call_pernode_memory() */
	efi_memmap_walk(filter_rsvd_memory, build_node_maps);
	efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
	efi_memmap_walk(find_max_min_low_pfn, NULL);

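	/*
	 * Nodes that still have no usable memory remain in memory_less_mask
	 * and get their per-node area allocated from another node below.
	 */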
	for_each_online_node(node)
		if (mem_data[node].min_pfn)
			node_clear(node, memory_less_mask);

	reserve_pernode_space();
	memory_less_nodes();
	initialize_pernode_data();

	max_pfn = max_low_pfn;

	find_initrd();
}

#ifdef CONFIG_SMP
/**
 * per_cpu_init - setup per-cpu variables
 *
 * find_pernode_space() does most of this already, we just need to set
 * local_per_cpu_offset
 */
void *per_cpu_init(void)
{
	int cpu;
	static int first_time = 1;

	if (first_time) {
		first_time = 0;
		for_each_possible_early_cpu(cpu)
			per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
	}

	return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
}
#endif /* CONFIG_SMP */

/**
 * call_pernode_memory - use SRAT to call callback functions with node info
 * @start: physical start of range
 * @len: length of range
 * @arg: function to call for each range
 *
 * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
 * out to which node a block of memory belongs.  Ignore memory that we cannot
 * identify, and split blocks that run across multiple nodes.
 *
 * Take this opportunity to round the start address up and the end address
 * down to page boundaries.
 */
void call_pernode_memory(unsigned long start, unsigned long len, void *arg)
{
	unsigned long rs, re, end = start + len;
	void (*func)(unsigned long, unsigned long, int);
	int i;

	start = PAGE_ALIGN(start);
	end &= PAGE_MASK;
	if (start >= end)
		return;

	func = arg;

	if (!num_node_memblks) {
		/* No SRAT table, so assume one node (node 0) */
		if (start < end)
			(*func)(start, end - start, 0);
		return;
	}

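	/*
	 * Clamp [start, end) against each SRAT memory block and hand the
	 * overlapping piece, if any, to the callback with that block's nid.
	 */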
	for (i = 0; i < num_node_memblks; i++) {
		rs = max(start, node_memblk[i].start_paddr);
		re = min(end, node_memblk[i].start_paddr +
			 node_memblk[i].size);

		if (rs < re)
			(*func)(rs, re - rs, node_memblk[i].nid);

		if (re == end)
			break;
	}
}

/**
 * paging_init - setup page tables
 *
 * paging_init() sets up the page tables for each node of the system and
 * initializes the node memory zones.
 */
void __init paging_init(void)
{
	unsigned long max_dma;
	unsigned long pfn_offset = 0;
	unsigned long max_pfn = 0;
	int node;
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;

	sparse_init();

#ifdef CONFIG_VIRTUAL_MEM_MAP
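	/*
	 * Carve space for a single virtually mapped mem_map (one struct page
	 * per pfn up to max_low_pfn) out of the top of the vmalloc area, and
	 * only create page tables for the parts that back actual memory.
	 */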
	VMALLOC_END -= PAGE_ALIGN(ALIGN(max_low_pfn, MAX_ORDER_NR_PAGES) *
		sizeof(struct page));
	vmem_map = (struct page *) VMALLOC_END;
	efi_memmap_walk(create_mem_map_page_table, NULL);
	printk("Virtual mem_map starts at 0x%p\n", vmem_map);
#endif

	for_each_online_node(node) {
		pfn_offset = mem_data[node].min_pfn;

#ifdef CONFIG_VIRTUAL_MEM_MAP
		NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
#endif
		if (mem_data[node].max_pfn > max_pfn)
			max_pfn = mem_data[node].max_pfn;
	}

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = max_dma;
#endif
	max_zone_pfns[ZONE_NORMAL] = max_pfn;
	free_area_init(max_zone_pfns);

	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
}

#ifdef CONFIG_MEMORY_HOTPLUG
pg_data_t *arch_alloc_nodedata(int nid)
{
	unsigned long size = compute_pernodesize(nid);

	return kzalloc(size, GFP_KERNEL);
}

void arch_free_nodedata(pg_data_t *pgdat)
{
	kfree(pgdat);
}

void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
{
	pgdat_list[update_node] = update_pgdat;
	scatter_node_data();
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	return vmemmap_populate_basepages(start, end, node, NULL);
}

void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
}
#endif