xref: /OK3568_Linux_fs/kernel/mm/page_pinner.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/debugfs.h>
3 #include <linux/mm.h>
4 #include <linux/slab.h>
5 #include <linux/uaccess.h>
6 #include <linux/memblock.h>
7 #include <linux/stacktrace.h>
8 #include <linux/page_pinner.h>
9 #include <linux/jump_label.h>
10 #include <linux/migrate.h>
11 #include <linux/stackdepot.h>
12 #include <linux/seq_file.h>
13 #include <linux/sched/clock.h>
14 
15 #include "internal.h"
16 
17 #define PAGE_PINNER_STACK_DEPTH 16
18 #define LONGTERM_PIN_BUCKETS	4096
19 
/*
 * Per-page (per page_ext) pin tracking state.
 */
struct page_pinner {
	/* stackdepot handle of the most recent pinning call path */
	depot_stack_handle_t handle;
	/* boottime timestamp (us) of the most recent pin */
	s64 ts_usec;
	/* number of outstanding tracked pins */
	atomic_t count;
};
25 
/*
 * A snapshot of one pinning event, copied into a ring buffer so it can
 * be reported later even after the page's own state has changed.
 */
struct captured_pinner {
	/* stackdepot handle of the pinning call path */
	depot_stack_handle_t handle;
	union {
		/* absolute boottime timestamp (us) — alloc_contig failures */
		s64 ts_usec;
		/* pin duration (us) — longterm-pin records */
		s64 elapsed;
	};

	/* struct page fields captured at record time */
	unsigned long pfn;
	int count;
	int mapcount;
	struct address_space *mapping;
	unsigned long flags;
};
40 
/*
 * Fixed-size ring buffer of captured pin records; @index is the next
 * slot to overwrite and wraps at LONGTERM_PIN_BUCKETS. @lock protects
 * both @index and @pinner.
 */
struct longterm_pinner {
	spinlock_t lock;
	unsigned int index;
	struct captured_pinner pinner[LONGTERM_PIN_BUCKETS];
};
46 
/* ring buffer of pins held longer than threshold_usec */
static struct longterm_pinner lt_pinner = {
	.lock = __SPIN_LOCK_UNLOCKED(lt_pinner.lock),
};

/* minimum pin duration (us) worth recording; tunable via debugfs */
static s64 threshold_usec = 300000;

/* alloc_contig failed pinner */
static struct longterm_pinner acf_pinner = {
	.lock = __SPIN_LOCK_UNLOCKED(acf_pinner.lock),
};

/* set by the "page_pinner" early param; checked once at page_ext init */
static bool page_pinner_enabled;
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);

/* gates migration-failure tracking; toggled via debugfs */
DEFINE_STATIC_KEY_TRUE(failure_tracking);
EXPORT_SYMBOL(failure_tracking);

/* fallback stack handle used when stack_depot_save() fails */
static depot_stack_handle_t failure_handle;
65 
/* "page_pinner" on the kernel command line enables the feature. */
static int __init early_page_pinner_param(char *buf)
{
	page_pinner_enabled = true;
	return 0;
}
early_param("page_pinner", early_page_pinner_param);
72 
/* page_ext callback: reserve per-page storage only when enabled at boot. */
static bool need_page_pinner(void)
{
	return page_pinner_enabled;
}
77 
register_failure_stack(void)78 static noinline void register_failure_stack(void)
79 {
80 	unsigned long entries[4];
81 	unsigned int nr_entries;
82 
83 	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
84 	failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
85 }
86 
init_page_pinner(void)87 static void init_page_pinner(void)
88 {
89 	if (!page_pinner_enabled)
90 		return;
91 
92 	register_failure_stack();
93 	static_branch_enable(&page_pinner_inited);
94 }
95 
/* Registration with the page_ext framework (size/need/init contract). */
struct page_ext_operations page_pinner_ops = {
	.size = sizeof(struct page_pinner),
	.need = need_page_pinner,
	.init = init_page_pinner,
};
101 
/* Locate this module's slice inside a page's page_ext record. */
static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
{
	return (void *)page_ext + page_pinner_ops.offset;
}
106 
save_stack(gfp_t flags)107 static noinline depot_stack_handle_t save_stack(gfp_t flags)
108 {
109 	unsigned long entries[PAGE_PINNER_STACK_DEPTH];
110 	depot_stack_handle_t handle;
111 	unsigned int nr_entries;
112 
113 	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
114 	handle = stack_depot_save(entries, nr_entries, flags);
115 	if (!handle)
116 		handle = failure_handle;
117 
118 	return handle;
119 }
120 
capture_page_state(struct page * page,struct captured_pinner * record)121 static void capture_page_state(struct page *page,
122 			       struct captured_pinner *record)
123 {
124 	record->flags = page->flags;
125 	record->mapping = page_mapping(page);
126 	record->pfn = page_to_pfn(page);
127 	record->count = page_count(page);
128 	record->mapcount = page_mapcount(page);
129 }
130 
check_longterm_pin(struct page_pinner * page_pinner,struct page * page)131 static void check_longterm_pin(struct page_pinner *page_pinner,
132 			      struct page *page)
133 {
134 	s64 now, delta = 0;
135 	unsigned long flags;
136 	unsigned int idx;
137 	struct captured_pinner record;
138 
139 	now = ktime_to_us(ktime_get_boottime());
140 
141 	/* get/put_page can be raced. Ignore that case */
142 	if (page_pinner->ts_usec < now)
143 		delta = now - page_pinner->ts_usec;
144 
145 	if (delta <= threshold_usec)
146 		return;
147 
148 	record.handle = page_pinner->handle;
149 	record.elapsed = delta;
150 	capture_page_state(page, &record);
151 
152 	spin_lock_irqsave(&lt_pinner.lock, flags);
153 	idx = lt_pinner.index++;
154 	lt_pinner.index %= LONGTERM_PIN_BUCKETS;
155 	lt_pinner.pinner[idx] = record;
156 	spin_unlock_irqrestore(&lt_pinner.lock, flags);
157 }
158 
/*
 * Clear pin-tracking state for @page (covering 1 << @order sub-pages).
 * @free: true when the page is being freed (also records the free path
 *        for pages previously marked PAGE_EXT_PINNER_MIGRATION_FAILED);
 *        false when a pin is dropped, in which case long-held pins are
 *        reported via check_longterm_pin().
 *
 * Fix: the original advanced page_ext only at the bottom of the loop
 * body, so the `continue` for sub-pages with neither bit set skipped
 * page_ext_next() and every later iteration re-examined the same
 * page_ext. Advance in the for-header instead so each iteration looks
 * at its own sub-page.
 */
void __reset_page_pinner(struct page *page, unsigned int order, bool free)
{
	struct page_pinner *page_pinner;
	struct page_ext *page_ext;
	int i;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	for (i = 0; i < (1 << order);
	     i++, page_ext = page_ext_next(page_ext)) {
		if (!test_bit(PAGE_EXT_GET, &page_ext->flags) &&
			!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED,
				  &page_ext->flags))
			continue;

		page_pinner = get_page_pinner(page_ext);
		if (free) {
			/*
			 * record page free call path
			 * NOTE(review): this passes the head @page for every
			 * sub-page — presumably intentional; confirm for
			 * order > 0 callers.
			 */
			__page_pinner_migration_failed(page);
			atomic_set(&page_pinner->count, 0);
			__clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED,
				    &page_ext->flags);
		} else {
			check_longterm_pin(page_pinner, page);
		}
		clear_bit(PAGE_EXT_GET, &page_ext->flags);
	}
	/* page_ext_put() only drops the RCU read lock; any page_ext is fine */
	page_ext_put(page_ext);
}
189 
/*
 * Stamp every sub-page of a (possibly compound) page with the pinning
 * stack handle and the current boottime timestamp, mark it tracked and
 * bump its pin count.
 */
static inline void __set_page_pinner_handle(struct page *page,
	struct page_ext *page_ext, depot_stack_handle_t handle,
	unsigned int order)
{
	struct page_pinner *pinner;
	s64 now_usec = ktime_to_us(ktime_get_boottime());
	int idx;

	for (idx = 0; idx < (1 << order); idx++) {
		pinner = get_page_pinner(page_ext);
		pinner->handle = handle;
		pinner->ts_usec = now_usec;
		set_bit(PAGE_EXT_GET, &page_ext->flags);
		atomic_inc(&pinner->count);
		page_ext = page_ext_next(page_ext);
	}
}
207 
__set_page_pinner(struct page * page,unsigned int order)208 noinline void __set_page_pinner(struct page *page, unsigned int order)
209 {
210 	struct page_ext *page_ext;
211 	depot_stack_handle_t handle;
212 
213 	handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
214 
215 	page_ext = page_ext_get(page);
216 	if (unlikely(!page_ext))
217 		return;
218 	__set_page_pinner_handle(page, page_ext, handle, order);
219 	page_ext_put(page_ext);
220 }
221 
/*
 * Format one captured record into a kernel buffer and copy it to
 * userspace. @longterm selects the header line: elapsed-duration form
 * (longterm_pinner file) vs absolute-timestamp form (alloc_contig file).
 *
 * Returns the number of bytes copied, -EFAULT on copy_to_user failure,
 * or -ENOMEM on allocation failure *and* on output truncation (any
 * "ret >= count" below) — truncation deliberately reuses -ENOMEM so a
 * userspace reader retries with a bigger buffer.
 */
static ssize_t
print_page_pinner(bool longterm, char __user *buf, size_t count, struct captured_pinner *record)
{
	int ret;
	unsigned long *entries;
	unsigned int nr_entries;
	char *kbuf;

	/* one record never needs more than a page of text */
	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	if (longterm) {
		ret = snprintf(kbuf, count, "Page pinned for %lld us\n",
			       record->elapsed);
	} else {
		/* split us timestamp into seconds + microsecond remainder */
		u64 ts_usec = record->ts_usec;
		unsigned long rem_usec = do_div(ts_usec, 1000000);

		ret = snprintf(kbuf, count,
			       "Page pinned ts [%5lu.%06lu]\n",
			       (unsigned long)ts_usec, rem_usec);
	}

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	ret += snprintf(kbuf + ret, count - ret,
			"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
			record->pfn,
			record->pfn >> pageblock_order,
			record->count, record->mapcount,
			record->mapping,
			record->flags, &record->flags);

	if (ret >= count)
		goto err;

	nr_entries = stack_depot_fetch(record->handle, &entries);
	ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
				   nr_entries, 0);
	if (ret >= count)
		goto err;

	/* blank line separates records in the concatenated output */
	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}
282 
__dump_page_pinner(struct page * page)283 void __dump_page_pinner(struct page *page)
284 {
285 	struct page_ext *page_ext = page_ext_get(page);
286 	struct page_pinner *page_pinner;
287 	depot_stack_handle_t handle;
288 	unsigned long *entries;
289 	unsigned int nr_entries;
290 	int pageblock_mt;
291 	unsigned long pfn;
292 	int count;
293 	unsigned long rem_usec;
294 	u64 ts_usec;
295 
296 	if (unlikely(!page_ext)) {
297 		pr_alert("There is not page extension available.\n");
298 		return;
299 	}
300 
301 	page_pinner = get_page_pinner(page_ext);
302 
303 	count = atomic_read(&page_pinner->count);
304 	if (!count) {
305 		pr_alert("page_pinner info is not present (never set?)\n");
306 		page_ext_put(page_ext);
307 		return;
308 	}
309 
310 	pfn = page_to_pfn(page);
311 	ts_usec = page_pinner->ts_usec;
312 	rem_usec = do_div(ts_usec, 1000000);
313 	pr_alert("page last pinned %5lu.%06lu] count %d\n",
314 		 (unsigned long)ts_usec, rem_usec, count);
315 
316 	pageblock_mt = get_pageblock_migratetype(page);
317 	pr_alert("PFN %lu Block %lu type %s Flags %#lx(%pGp)\n",
318 			pfn,
319 			pfn >> pageblock_order,
320 			migratetype_names[pageblock_mt],
321 			page->flags, &page->flags);
322 
323 	handle = READ_ONCE(page_pinner->handle);
324 	if (!handle) {
325 		pr_alert("page_pinner allocation stack trace missing\n");
326 	} else {
327 		nr_entries = stack_depot_fetch(handle, &entries);
328 		stack_trace_print(entries, nr_entries, 0);
329 	}
330 	page_ext_put(page_ext);
331 }
332 
__page_pinner_migration_failed(struct page * page)333 void __page_pinner_migration_failed(struct page *page)
334 {
335 	struct page_ext *page_ext = page_ext_get(page);
336 	struct captured_pinner record;
337 	unsigned long flags;
338 	unsigned int idx;
339 
340 	if (unlikely(!page_ext))
341 		return;
342 
343 	if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
344 		page_ext_put(page_ext);
345 		return;
346 	}
347 
348 	page_ext_put(page_ext);
349 	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
350 	record.ts_usec = ktime_to_us(ktime_get_boottime());
351 	capture_page_state(page, &record);
352 
353 	spin_lock_irqsave(&acf_pinner.lock, flags);
354 	idx = acf_pinner.index++;
355 	acf_pinner.index %= LONGTERM_PIN_BUCKETS;
356 	acf_pinner.pinner[idx] = record;
357 	spin_unlock_irqrestore(&acf_pinner.lock, flags);
358 }
359 EXPORT_SYMBOL(__page_pinner_migration_failed);
360 
/*
 * Mark every still-pinned page on @page_list as having failed migration
 * and record the current call path for each via
 * __page_pinner_migration_failed().
 */
void __page_pinner_mark_migration_failed_pages(struct list_head *page_list)
{
	struct page *page;
	struct page_ext *page_ext;

	list_for_each_entry(page, page_list, lru) {
		/* The page will be freed by putback_movable_pages soon */
		if (page_count(page) == 1)
			continue;
		page_ext = page_ext_get(page);
		if (unlikely(!page_ext))
			continue;
		/* non-atomic __set_bit: no concurrent flag writers expected here */
		__set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
		page_ext_put(page_ext);
		__page_pinner_migration_failed(page);
	}
}
378 
379 static ssize_t
read_longterm_page_pinner(struct file * file,char __user * buf,size_t count,loff_t * ppos)380 read_longterm_page_pinner(struct file *file, char __user *buf, size_t count,
381 			  loff_t *ppos)
382 {
383 	loff_t i, idx;
384 	struct captured_pinner record;
385 	unsigned long flags;
386 
387 	if (!static_branch_unlikely(&page_pinner_inited))
388 		return -EINVAL;
389 
390 	if (*ppos >= LONGTERM_PIN_BUCKETS)
391 		return 0;
392 
393 	i = *ppos;
394 	*ppos = i + 1;
395 
396 	/*
397 	 * reading the records in the reverse order with newest one
398 	 * being read first followed by older ones
399 	 */
400 	idx = (lt_pinner.index - 1 - i + LONGTERM_PIN_BUCKETS) %
401 	       LONGTERM_PIN_BUCKETS;
402 	spin_lock_irqsave(&lt_pinner.lock, flags);
403 	record = lt_pinner.pinner[idx];
404 	spin_unlock_irqrestore(&lt_pinner.lock, flags);
405 	if (!record.handle)
406 		return 0;
407 
408 	return print_page_pinner(true, buf, count, &record);
409 }
410 
/* file_operations for the read-only "longterm_pinner" debugfs file */
static const struct file_operations proc_longterm_pinner_operations = {
	.read		= read_longterm_page_pinner,
};
414 
read_alloc_contig_failed(struct file * file,char __user * buf,size_t count,loff_t * ppos)415 static ssize_t read_alloc_contig_failed(struct file *file, char __user *buf,
416 					size_t count, loff_t *ppos)
417 {
418 	loff_t i, idx;
419 	struct captured_pinner record;
420 	unsigned long flags;
421 
422 	if (!static_branch_unlikely(&failure_tracking))
423 		return -EINVAL;
424 
425 	if (*ppos >= LONGTERM_PIN_BUCKETS)
426 		return 0;
427 
428 	i = *ppos;
429 	*ppos = i + 1;
430 
431 	/*
432 	 * reading the records in the reverse order with newest one
433 	 * being read first followed by older ones
434 	 */
435 	idx = (acf_pinner.index - 1 - i + LONGTERM_PIN_BUCKETS) %
436 	       LONGTERM_PIN_BUCKETS;
437 
438 	spin_lock_irqsave(&acf_pinner.lock, flags);
439 	record = acf_pinner.pinner[idx];
440 	spin_unlock_irqrestore(&acf_pinner.lock, flags);
441 	if (!record.handle)
442 		return 0;
443 
444 	return print_page_pinner(false, buf, count, &record);
445 }
446 
/* file_operations for the read-only "alloc_contig_failed" debugfs file */
static const struct file_operations proc_alloc_contig_failed_operations = {
	.read		= read_alloc_contig_failed,
};
450 
pp_threshold_set(void * data,unsigned long long val)451 static int pp_threshold_set(void *data, unsigned long long val)
452 {
453 	unsigned long flags;
454 
455 	threshold_usec = (s64)val;
456 
457 	spin_lock_irqsave(&lt_pinner.lock, flags);
458 	memset(lt_pinner.pinner, 0,
459 	       sizeof(struct captured_pinner) * LONGTERM_PIN_BUCKETS);
460 	lt_pinner.index = 0;
461 	spin_unlock_irqrestore(&lt_pinner.lock, flags);
462 	return 0;
463 }
464 
/* debugfs "threshold" reader: report the current threshold in us. */
static int pp_threshold_get(void *data, unsigned long long *val)
{
	*val = (unsigned long long)threshold_usec;

	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(pp_threshold_fops, pp_threshold_get,
			 pp_threshold_set, "%lld\n");
473 
/* debugfs "failure_tracking" writer: any non-zero value enables it. */
static int failure_tracking_set(void *data, u64 val)
{
	if (val)
		static_branch_enable(&failure_tracking);
	else
		static_branch_disable(&failure_tracking);
	return 0;
}
485 
/* debugfs "failure_tracking" reader: 1 if tracking is enabled, else 0. */
static int failure_tracking_get(void *data, u64 *val)
{
	*val = static_branch_unlikely(&failure_tracking);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
			 failure_tracking_get,
			 failure_tracking_set, "%llu\n");
494 
/*
 * Create the debugfs interface (page_pinner/{longterm_pinner,threshold,
 * alloc_contig_failed,failure_tracking}) once the feature was enabled
 * at boot. debugfs creation errors are deliberately not checked, per
 * debugfs convention.
 */
static int __init page_pinner_init(void)
{
	struct dentry *pp_debugfs_root;

	if (!static_branch_unlikely(&page_pinner_inited))
		return 0;

	pr_info("page_pinner enabled\n");

	pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);

	debugfs_create_file("longterm_pinner", 0444, pp_debugfs_root, NULL,
			    &proc_longterm_pinner_operations);

	debugfs_create_file("threshold", 0644, pp_debugfs_root, NULL,
			    &pp_threshold_fops);

	debugfs_create_file("alloc_contig_failed", 0444,
			    pp_debugfs_root, NULL,
			    &proc_alloc_contig_failed_operations);

	debugfs_create_file("failure_tracking", 0644,
			    pp_debugfs_root, NULL,
			    &failure_tracking_fops);
	return 0;
}
late_initcall(page_pinner_init)
522