1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-or-later
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun * Copyright (C) IBM Corporation, 2014, 2017
4*4882a593Smuzhiyun * Anton Blanchard, Rashmica Gupta.
5*4882a593Smuzhiyun */
6*4882a593Smuzhiyun
7*4882a593Smuzhiyun #define pr_fmt(fmt) "memtrace: " fmt
8*4882a593Smuzhiyun
9*4882a593Smuzhiyun #include <linux/bitops.h>
10*4882a593Smuzhiyun #include <linux/string.h>
11*4882a593Smuzhiyun #include <linux/memblock.h>
12*4882a593Smuzhiyun #include <linux/init.h>
13*4882a593Smuzhiyun #include <linux/moduleparam.h>
14*4882a593Smuzhiyun #include <linux/fs.h>
15*4882a593Smuzhiyun #include <linux/debugfs.h>
16*4882a593Smuzhiyun #include <linux/slab.h>
17*4882a593Smuzhiyun #include <linux/memory.h>
18*4882a593Smuzhiyun #include <linux/memory_hotplug.h>
19*4882a593Smuzhiyun #include <linux/numa.h>
20*4882a593Smuzhiyun #include <asm/machdep.h>
21*4882a593Smuzhiyun #include <asm/debugfs.h>
22*4882a593Smuzhiyun
/* This enables us to keep track of the memory removed from each node. */
struct memtrace_entry {
	void *mem;		/* ioremap()ed view of the removed range */
	u64 start;		/* physical base address of the removed range */
	u64 size;		/* size of the removed range in bytes */
	u32 nid;		/* node the memory was taken from */
	struct dentry *dir;	/* per-node debugfs directory */
	char name[16];		/* debugfs directory name (nid printed in hex) */
};
32*4882a593Smuzhiyun
/* Serializes enable/disable requests arriving via the debugfs "enable" file. */
static DEFINE_MUTEX(memtrace_mutex);
/* Per-node trace size currently reserved; 0 when memtrace is disabled. */
static u64 memtrace_size;

/* One entry per node that memory was successfully removed from. */
static struct memtrace_entry *memtrace_array;
static unsigned int memtrace_array_nr;
38*4882a593Smuzhiyun
39*4882a593Smuzhiyun
memtrace_read(struct file * filp,char __user * ubuf,size_t count,loff_t * ppos)40*4882a593Smuzhiyun static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
41*4882a593Smuzhiyun size_t count, loff_t *ppos)
42*4882a593Smuzhiyun {
43*4882a593Smuzhiyun struct memtrace_entry *ent = filp->private_data;
44*4882a593Smuzhiyun
45*4882a593Smuzhiyun return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
46*4882a593Smuzhiyun }
47*4882a593Smuzhiyun
/* File operations for the per-node debugfs "trace" file (read-only). */
static const struct file_operations memtrace_fops = {
	.llseek = default_llseek,
	.read = memtrace_read,
	.open = simple_open,
};
53*4882a593Smuzhiyun
check_memblock_online(struct memory_block * mem,void * arg)54*4882a593Smuzhiyun static int check_memblock_online(struct memory_block *mem, void *arg)
55*4882a593Smuzhiyun {
56*4882a593Smuzhiyun if (mem->state != MEM_ONLINE)
57*4882a593Smuzhiyun return -1;
58*4882a593Smuzhiyun
59*4882a593Smuzhiyun return 0;
60*4882a593Smuzhiyun }
61*4882a593Smuzhiyun
change_memblock_state(struct memory_block * mem,void * arg)62*4882a593Smuzhiyun static int change_memblock_state(struct memory_block *mem, void *arg)
63*4882a593Smuzhiyun {
64*4882a593Smuzhiyun unsigned long state = (unsigned long)arg;
65*4882a593Smuzhiyun
66*4882a593Smuzhiyun mem->state = state;
67*4882a593Smuzhiyun
68*4882a593Smuzhiyun return 0;
69*4882a593Smuzhiyun }
70*4882a593Smuzhiyun
memtrace_clear_range(unsigned long start_pfn,unsigned long nr_pages)71*4882a593Smuzhiyun static void memtrace_clear_range(unsigned long start_pfn,
72*4882a593Smuzhiyun unsigned long nr_pages)
73*4882a593Smuzhiyun {
74*4882a593Smuzhiyun unsigned long pfn;
75*4882a593Smuzhiyun
76*4882a593Smuzhiyun /*
77*4882a593Smuzhiyun * As pages are offline, we cannot trust the memmap anymore. As HIGHMEM
78*4882a593Smuzhiyun * does not apply, avoid passing around "struct page" and use
79*4882a593Smuzhiyun * clear_page() instead directly.
80*4882a593Smuzhiyun */
81*4882a593Smuzhiyun for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
82*4882a593Smuzhiyun if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
83*4882a593Smuzhiyun cond_resched();
84*4882a593Smuzhiyun clear_page(__va(PFN_PHYS(pfn)));
85*4882a593Smuzhiyun }
86*4882a593Smuzhiyun }
87*4882a593Smuzhiyun
/* called with device_hotplug_lock held */
static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
{
	const unsigned long start = PFN_PHYS(start_pfn);
	const unsigned long size = PFN_PHYS(nr_pages);

	/* Bail out unless every memory block in the range is fully online. */
	if (walk_memory_blocks(start, size, NULL, check_memblock_online))
		return false;

	/* Mark the blocks as transitioning before calling offline_pages(). */
	walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
			   change_memblock_state);

	if (offline_pages(start_pfn, nr_pages)) {
		/* Offlining failed: roll the block states back to online. */
		walk_memory_blocks(start, size, (void *)MEM_ONLINE,
				   change_memblock_state);
		return false;
	}

	/* Success: record the final offline state on every block. */
	walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
			   change_memblock_state);


	return true;
}
112*4882a593Smuzhiyun
/*
 * Carve @size bytes of physically contiguous, size-aligned memory out of
 * node @nid by offlining and removing it from the kernel.
 *
 * Returns the physical base address of the removed chunk, or 0 if no
 * suitable range on the node could be offlined.
 */
static u64 memtrace_alloc_node(u32 nid, u64 size)
{
	u64 start_pfn, end_pfn, nr_pages, pfn;
	u64 base_pfn;
	u64 bytes = memory_block_size_bytes();

	/* Nothing to take from a memoryless node. */
	if (!node_spanned_pages(nid))
		return 0;

	start_pfn = node_start_pfn(nid);
	end_pfn = node_end_pfn(nid);
	nr_pages = size >> PAGE_SHIFT;

	/* Trace memory needs to be aligned to the size */
	end_pfn = round_down(end_pfn - nr_pages, nr_pages);

	lock_device_hotplug();
	/* Scan candidate ranges from the top of the node downwards. */
	for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
		if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
			/*
			 * Clear the range while we still have a linear
			 * mapping.
			 */
			memtrace_clear_range(base_pfn, nr_pages);
			/*
			 * Remove memory in memory block size chunks so that
			 * iomem resources are always split to the same size and
			 * we never try to remove memory that spans two iomem
			 * resources.
			 */
			end_pfn = base_pfn + nr_pages;
			for (pfn = base_pfn; pfn < end_pfn; pfn += bytes >> PAGE_SHIFT) {
				__remove_memory(nid, pfn << PAGE_SHIFT, bytes);
			}
			unlock_device_hotplug();
			return base_pfn << PAGE_SHIFT;
		}
	}
	unlock_device_hotplug();

	return 0;
}
155*4882a593Smuzhiyun
/*
 * Allocate the per-node bookkeeping array and remove @size bytes of memory
 * from every online node that can spare it.
 *
 * Returns 0 on success (even if some nodes could not contribute memory),
 * -ENOMEM if the bookkeeping array itself cannot be allocated.
 */
static int memtrace_init_regions_runtime(u64 size)
{
	u32 nid;
	u64 m;

	memtrace_array = kcalloc(num_online_nodes(),
				 sizeof(struct memtrace_entry), GFP_KERNEL);
	if (!memtrace_array) {
		pr_err("Failed to allocate memtrace_array\n");
		/* Allocation failure, not an invalid argument: -ENOMEM. */
		return -ENOMEM;
	}

	for_each_online_node(nid) {
		m = memtrace_alloc_node(nid, size);

		/*
		 * A node might not have any local memory, so warn but
		 * continue on.
		 */
		if (!m) {
			pr_err("Failed to allocate trace memory on node %d\n", nid);
			continue;
		}

		pr_info("Allocated trace memory on node %d at 0x%016llx\n", nid, m);

		memtrace_array[memtrace_array_nr].start = m;
		memtrace_array[memtrace_array_nr].size = size;
		memtrace_array[memtrace_array_nr].nid = nid;
		memtrace_array_nr++;
	}

	return 0;
}
190*4882a593Smuzhiyun
191*4882a593Smuzhiyun static struct dentry *memtrace_debugfs_dir;
192*4882a593Smuzhiyun
memtrace_init_debugfs(void)193*4882a593Smuzhiyun static int memtrace_init_debugfs(void)
194*4882a593Smuzhiyun {
195*4882a593Smuzhiyun int ret = 0;
196*4882a593Smuzhiyun int i;
197*4882a593Smuzhiyun
198*4882a593Smuzhiyun for (i = 0; i < memtrace_array_nr; i++) {
199*4882a593Smuzhiyun struct dentry *dir;
200*4882a593Smuzhiyun struct memtrace_entry *ent = &memtrace_array[i];
201*4882a593Smuzhiyun
202*4882a593Smuzhiyun ent->mem = ioremap(ent->start, ent->size);
203*4882a593Smuzhiyun /* Warn but continue on */
204*4882a593Smuzhiyun if (!ent->mem) {
205*4882a593Smuzhiyun pr_err("Failed to map trace memory at 0x%llx\n",
206*4882a593Smuzhiyun ent->start);
207*4882a593Smuzhiyun ret = -1;
208*4882a593Smuzhiyun continue;
209*4882a593Smuzhiyun }
210*4882a593Smuzhiyun
211*4882a593Smuzhiyun snprintf(ent->name, 16, "%08x", ent->nid);
212*4882a593Smuzhiyun dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir);
213*4882a593Smuzhiyun
214*4882a593Smuzhiyun ent->dir = dir;
215*4882a593Smuzhiyun debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops);
216*4882a593Smuzhiyun debugfs_create_x64("start", 0400, dir, &ent->start);
217*4882a593Smuzhiyun debugfs_create_x64("size", 0400, dir, &ent->size);
218*4882a593Smuzhiyun }
219*4882a593Smuzhiyun
220*4882a593Smuzhiyun return ret;
221*4882a593Smuzhiyun }
222*4882a593Smuzhiyun
online_mem_block(struct memory_block * mem,void * arg)223*4882a593Smuzhiyun static int online_mem_block(struct memory_block *mem, void *arg)
224*4882a593Smuzhiyun {
225*4882a593Smuzhiyun return device_online(&mem->dev);
226*4882a593Smuzhiyun }
227*4882a593Smuzhiyun
228*4882a593Smuzhiyun /*
229*4882a593Smuzhiyun * Iterate through the chunks of memory we have removed from the kernel
230*4882a593Smuzhiyun * and attempt to add them back to the kernel.
231*4882a593Smuzhiyun */
memtrace_online(void)232*4882a593Smuzhiyun static int memtrace_online(void)
233*4882a593Smuzhiyun {
234*4882a593Smuzhiyun int i, ret = 0;
235*4882a593Smuzhiyun struct memtrace_entry *ent;
236*4882a593Smuzhiyun
237*4882a593Smuzhiyun for (i = memtrace_array_nr - 1; i >= 0; i--) {
238*4882a593Smuzhiyun ent = &memtrace_array[i];
239*4882a593Smuzhiyun
240*4882a593Smuzhiyun /* We have onlined this chunk previously */
241*4882a593Smuzhiyun if (ent->nid == NUMA_NO_NODE)
242*4882a593Smuzhiyun continue;
243*4882a593Smuzhiyun
244*4882a593Smuzhiyun /* Remove from io mappings */
245*4882a593Smuzhiyun if (ent->mem) {
246*4882a593Smuzhiyun iounmap(ent->mem);
247*4882a593Smuzhiyun ent->mem = 0;
248*4882a593Smuzhiyun }
249*4882a593Smuzhiyun
250*4882a593Smuzhiyun if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) {
251*4882a593Smuzhiyun pr_err("Failed to add trace memory to node %d\n",
252*4882a593Smuzhiyun ent->nid);
253*4882a593Smuzhiyun ret += 1;
254*4882a593Smuzhiyun continue;
255*4882a593Smuzhiyun }
256*4882a593Smuzhiyun
257*4882a593Smuzhiyun lock_device_hotplug();
258*4882a593Smuzhiyun walk_memory_blocks(ent->start, ent->size, NULL,
259*4882a593Smuzhiyun online_mem_block);
260*4882a593Smuzhiyun unlock_device_hotplug();
261*4882a593Smuzhiyun
262*4882a593Smuzhiyun /*
263*4882a593Smuzhiyun * Memory was added successfully so clean up references to it
264*4882a593Smuzhiyun * so on reentry we can tell that this chunk was added.
265*4882a593Smuzhiyun */
266*4882a593Smuzhiyun debugfs_remove_recursive(ent->dir);
267*4882a593Smuzhiyun pr_info("Added trace memory back to node %d\n", ent->nid);
268*4882a593Smuzhiyun ent->size = ent->start = ent->nid = NUMA_NO_NODE;
269*4882a593Smuzhiyun }
270*4882a593Smuzhiyun if (ret)
271*4882a593Smuzhiyun return ret;
272*4882a593Smuzhiyun
273*4882a593Smuzhiyun /* If all chunks of memory were added successfully, reset globals */
274*4882a593Smuzhiyun kfree(memtrace_array);
275*4882a593Smuzhiyun memtrace_array = NULL;
276*4882a593Smuzhiyun memtrace_size = 0;
277*4882a593Smuzhiyun memtrace_array_nr = 0;
278*4882a593Smuzhiyun return 0;
279*4882a593Smuzhiyun }
280*4882a593Smuzhiyun
/*
 * debugfs "enable" write handler: @val is the per-node trace size to
 * reserve, or 0 to release everything. Any previously reserved memory is
 * always given back to the kernel before a new reservation is attempted.
 *
 * Returns 0 on success, -EINVAL for a misaligned size, -EAGAIN if
 * re-onlining or carving out memory fails.
 */
static int memtrace_enable_set(void *data, u64 val)
{
	int rc = -EAGAIN;
	u64 bytes;

	/*
	 * Don't attempt to do anything if size isn't aligned to a memory
	 * block or equal to zero.
	 */
	bytes = memory_block_size_bytes();
	if (val & (bytes - 1)) {
		pr_err("Value must be aligned with 0x%llx\n", bytes);
		return -EINVAL;
	}

	mutex_lock(&memtrace_mutex);

	/* Re-add/online previously removed/offlined memory */
	if (memtrace_size) {
		if (memtrace_online())
			goto out_unlock;
	}

	/* val == 0 means "disable": everything is back online, done. */
	if (!val) {
		rc = 0;
		goto out_unlock;
	}

	/* Offline and remove memory */
	if (memtrace_init_regions_runtime(val))
		goto out_unlock;

	if (memtrace_init_debugfs())
		goto out_unlock;

	memtrace_size = val;
	rc = 0;
out_unlock:
	mutex_unlock(&memtrace_mutex);
	return rc;
}
322*4882a593Smuzhiyun
/* debugfs "enable" read handler: report the currently reserved size. */
static int memtrace_enable_get(void *data, u64 *val)
{
	*val = memtrace_size;
	return 0;
}

/* Bind get/set handlers to the "enable" attribute, printed in hex. */
DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get,
			memtrace_enable_set, "0x%016llx\n");
331*4882a593Smuzhiyun
memtrace_init(void)332*4882a593Smuzhiyun static int memtrace_init(void)
333*4882a593Smuzhiyun {
334*4882a593Smuzhiyun memtrace_debugfs_dir = debugfs_create_dir("memtrace",
335*4882a593Smuzhiyun powerpc_debugfs_root);
336*4882a593Smuzhiyun
337*4882a593Smuzhiyun debugfs_create_file("enable", 0600, memtrace_debugfs_dir,
338*4882a593Smuzhiyun NULL, &memtrace_init_fops);
339*4882a593Smuzhiyun
340*4882a593Smuzhiyun return 0;
341*4882a593Smuzhiyun }
342*4882a593Smuzhiyun machine_device_initcall(powernv, memtrace_init);
343