1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/debugfs.h>
3 #include <linux/mm.h>
4 #include <linux/slab.h>
5 #include <linux/uaccess.h>
6 #include <linux/memblock.h>
7 #include <linux/stacktrace.h>
8 #include <linux/page_pinner.h>
9 #include <linux/jump_label.h>
10 #include <linux/migrate.h>
11 #include <linux/stackdepot.h>
12 #include <linux/seq_file.h>
13 #include <linux/sched/clock.h>
14
15 #include "internal.h"
16
17 #define PAGE_PINNER_STACK_DEPTH 16
18 #define LONGTERM_PIN_BUCKETS 4096
19
/*
 * Per-page tracking data stored in the page_ext area: the stack of the
 * pinning call site, when the pin happened, and how many pin events
 * have been recorded (reset to 0 when the page is freed).
 */
struct page_pinner {
	depot_stack_handle_t handle;	/* stack depot handle of pinning site */
	s64 ts_usec;			/* pin timestamp, usec since boot */
	atomic_t count;			/* recorded pin events */
};
25
/*
 * Snapshot of a pinned page, copied into one of the ring buffers
 * (lt_pinner / acf_pinner) for later reporting through debugfs.
 */
struct captured_pinner {
	depot_stack_handle_t handle;	/* stack trace of the relevant event */
	union {
		s64 ts_usec;	/* capture time (alloc_contig-failure records) */
		s64 elapsed;	/* pin duration (longterm-pin records) */
	};

	/* struct page fields, sampled at capture time */
	unsigned long pfn;
	int count;
	int mapcount;
	struct address_space *mapping;
	unsigned long flags;
};
40
/*
 * Fixed-size ring buffer of captured records. 'index' is the next slot
 * to overwrite and wraps at LONGTERM_PIN_BUCKETS; 'lock' protects both
 * 'index' and the 'pinner' array.
 */
struct longterm_pinner {
	spinlock_t lock;
	unsigned int index;
	struct captured_pinner pinner[LONGTERM_PIN_BUCKETS];
};
46
/* Ring of pages that stayed pinned longer than threshold_usec. */
static struct longterm_pinner lt_pinner = {
	.lock = __SPIN_LOCK_UNLOCKED(lt_pinner.lock),
};

/* Minimum pin duration (usec) before a page is recorded; default 300ms. */
static s64 threshold_usec = 300000;

/* alloc_contig failed pinner */
static struct longterm_pinner acf_pinner = {
	.lock = __SPIN_LOCK_UNLOCKED(acf_pinner.lock),
};
57
/* Set by the "page_pinner" early param; decides whether page_ext space is reserved. */
static bool page_pinner_enabled;
/* Flipped on in init_page_pinner(); fast-path gate for the tracking hooks. */
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);

/* Runtime toggle for migration-failure tracking, controlled via debugfs. */
DEFINE_STATIC_KEY_TRUE(failure_tracking);
EXPORT_SYMBOL(failure_tracking);

/* Fallback depot handle used when stack_depot_save() returns 0. */
static depot_stack_handle_t failure_handle;
65
/*
 * "page_pinner" kernel command line parameter: its mere presence enables
 * the feature (any value after '=' is ignored).
 */
static int __init early_page_pinner_param(char *buf)
{
	page_pinner_enabled = true;
	return 0;
}
early_param("page_pinner", early_page_pinner_param);
72
/* page_ext callback: reserve per-page space only when the feature is on. */
static bool need_page_pinner(void)
{
	return page_pinner_enabled;
}
77
register_failure_stack(void)78 static noinline void register_failure_stack(void)
79 {
80 unsigned long entries[4];
81 unsigned int nr_entries;
82
83 nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
84 failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
85 }
86
init_page_pinner(void)87 static void init_page_pinner(void)
88 {
89 if (!page_pinner_enabled)
90 return;
91
92 register_failure_stack();
93 static_branch_enable(&page_pinner_inited);
94 }
95
/* Registration with the page_ext framework; .offset is filled in by it. */
struct page_ext_operations page_pinner_ops = {
	.size = sizeof(struct page_pinner),
	.need = need_page_pinner,
	.init = init_page_pinner,
};
101
get_page_pinner(struct page_ext * page_ext)102 static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
103 {
104 return (void *)page_ext + page_pinner_ops.offset;
105 }
106
save_stack(gfp_t flags)107 static noinline depot_stack_handle_t save_stack(gfp_t flags)
108 {
109 unsigned long entries[PAGE_PINNER_STACK_DEPTH];
110 depot_stack_handle_t handle;
111 unsigned int nr_entries;
112
113 nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
114 handle = stack_depot_save(entries, nr_entries, flags);
115 if (!handle)
116 handle = failure_handle;
117
118 return handle;
119 }
120
capture_page_state(struct page * page,struct captured_pinner * record)121 static void capture_page_state(struct page *page,
122 struct captured_pinner *record)
123 {
124 record->flags = page->flags;
125 record->mapping = page_mapping(page);
126 record->pfn = page_to_pfn(page);
127 record->count = page_count(page);
128 record->mapcount = page_mapcount(page);
129 }
130
check_longterm_pin(struct page_pinner * page_pinner,struct page * page)131 static void check_longterm_pin(struct page_pinner *page_pinner,
132 struct page *page)
133 {
134 s64 now, delta = 0;
135 unsigned long flags;
136 unsigned int idx;
137 struct captured_pinner record;
138
139 now = ktime_to_us(ktime_get_boottime());
140
141 /* get/put_page can be raced. Ignore that case */
142 if (page_pinner->ts_usec < now)
143 delta = now - page_pinner->ts_usec;
144
145 if (delta <= threshold_usec)
146 return;
147
148 record.handle = page_pinner->handle;
149 record.elapsed = delta;
150 capture_page_state(page, &record);
151
152 spin_lock_irqsave(<_pinner.lock, flags);
153 idx = lt_pinner.index++;
154 lt_pinner.index %= LONGTERM_PIN_BUCKETS;
155 lt_pinner.pinner[idx] = record;
156 spin_unlock_irqrestore(<_pinner.lock, flags);
157 }
158
/*
 * Hook from the page free / unpin paths. Walks the (1 << order)
 * sub-pages; for each one that is being tracked, either records the
 * free call path (free == true) or checks for a long-term pin.
 *
 * Fix: the original 'continue' on an untracked sub-page skipped the
 * page_ext_next() advance, so every later iteration kept re-examining
 * the same stale page_ext. Advance the cursor on every iteration.
 */
void __reset_page_pinner(struct page *page, unsigned int order, bool free)
{
	struct page_pinner *page_pinner;
	struct page_ext *page_ext;
	int i;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	for (i = 0; i < (1 << order); i++) {
		if (test_bit(PAGE_EXT_GET, &page_ext->flags) ||
		    test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED,
			     &page_ext->flags)) {
			page_pinner = get_page_pinner(page_ext);
			if (free) {
				/* record page free call path */
				__page_pinner_migration_failed(page);
				atomic_set(&page_pinner->count, 0);
				__clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED,
					    &page_ext->flags);
			} else {
				check_longterm_pin(page_pinner, page);
			}
			clear_bit(PAGE_EXT_GET, &page_ext->flags);
		}
		page_ext = page_ext_next(page_ext);
	}
	page_ext_put(page_ext);
}
189
/*
 * Stamp every sub-page's page_pinner with the pinning stack and the
 * current boottime, mark it tracked, and bump its pin counter.
 */
static inline void __set_page_pinner_handle(struct page *page,
		struct page_ext *page_ext, depot_stack_handle_t handle,
		unsigned int order)
{
	struct page_pinner *page_pinner;
	s64 now_usec = ktime_to_us(ktime_get_boottime());
	int i;

	for (i = 0; i < (1 << order); i++, page_ext = page_ext_next(page_ext)) {
		page_pinner = get_page_pinner(page_ext);
		page_pinner->handle = handle;
		page_pinner->ts_usec = now_usec;
		set_bit(PAGE_EXT_GET, &page_ext->flags);
		atomic_inc(&page_pinner->count);
	}
}
207
__set_page_pinner(struct page * page,unsigned int order)208 noinline void __set_page_pinner(struct page *page, unsigned int order)
209 {
210 struct page_ext *page_ext;
211 depot_stack_handle_t handle;
212
213 handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
214
215 page_ext = page_ext_get(page);
216 if (unlikely(!page_ext))
217 return;
218 __set_page_pinner_handle(page, page_ext, handle, order);
219 page_ext_put(page_ext);
220 }
221
/*
 * Format one captured record into a kernel buffer and copy it to
 * userspace. @longterm selects the header line: elapsed pin time
 * (lt_pinner records) vs absolute capture timestamp (acf_pinner
 * records). Returns the number of bytes copied, -EFAULT if the copy
 * to @buf fails, or -ENOMEM on allocation failure or if the output
 * would not fit in @count bytes.
 */
static ssize_t
print_page_pinner(bool longterm, char __user *buf, size_t count, struct captured_pinner *record)
{
	int ret;
	unsigned long *entries;
	unsigned int nr_entries;
	char *kbuf;

	/* Emit at most one page of text per read call */
	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	if (longterm) {
		ret = snprintf(kbuf, count, "Page pinned for %lld us\n",
			       record->elapsed);
	} else {
		/* Split usec-since-boot into seconds + remainder */
		u64 ts_usec = record->ts_usec;
		unsigned long rem_usec = do_div(ts_usec, 1000000);

		ret = snprintf(kbuf, count,
			       "Page pinned ts [%5lu.%06lu]\n",
			       (unsigned long)ts_usec, rem_usec);
	}

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	ret += snprintf(kbuf + ret, count - ret,
			"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
			record->pfn,
			record->pfn >> pageblock_order,
			record->count, record->mapcount,
			record->mapping,
			record->flags, &record->flags);

	if (ret >= count)
		goto err;

	/* Decode the depot handle and append the stored stack trace */
	nr_entries = stack_depot_fetch(record->handle, &entries);
	ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
				   nr_entries, 0);
	if (ret >= count)
		goto err;

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	/* Output truncated: report -ENOMEM rather than partial text */
	kfree(kbuf);
	return -ENOMEM;
}
282
/*
 * Dump a page's pinner state to the kernel log (pr_alert): last pin
 * time and count, pageblock info, and the pinning stack trace if one
 * was stored. Intended for debugging from dump_page()-style paths.
 */
void __dump_page_pinner(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);
	struct page_pinner *page_pinner;
	depot_stack_handle_t handle;
	unsigned long *entries;
	unsigned int nr_entries;
	int pageblock_mt;
	unsigned long pfn;
	int count;
	unsigned long rem_usec;
	u64 ts_usec;

	if (unlikely(!page_ext)) {
		pr_alert("There is not page extension available.\n");
		return;
	}

	page_pinner = get_page_pinner(page_ext);

	/* count == 0 means no pin was ever recorded for this page */
	count = atomic_read(&page_pinner->count);
	if (!count) {
		pr_alert("page_pinner info is not present (never set?)\n");
		page_ext_put(page_ext);
		return;
	}

	pfn = page_to_pfn(page);
	/* Split usec-since-boot into seconds + remainder for printing */
	ts_usec = page_pinner->ts_usec;
	rem_usec = do_div(ts_usec, 1000000);
	pr_alert("page last pinned %5lu.%06lu] count %d\n",
		 (unsigned long)ts_usec, rem_usec, count);

	pageblock_mt = get_pageblock_migratetype(page);
	pr_alert("PFN %lu Block %lu type %s Flags %#lx(%pGp)\n",
		 pfn,
		 pfn >> pageblock_order,
		 migratetype_names[pageblock_mt],
		 page->flags, &page->flags);

	handle = READ_ONCE(page_pinner->handle);
	if (!handle) {
		pr_alert("page_pinner allocation stack trace missing\n");
	} else {
		nr_entries = stack_depot_fetch(handle, &entries);
		stack_trace_print(entries, nr_entries, 0);
	}
	page_ext_put(page_ext);
}
332
__page_pinner_migration_failed(struct page * page)333 void __page_pinner_migration_failed(struct page *page)
334 {
335 struct page_ext *page_ext = page_ext_get(page);
336 struct captured_pinner record;
337 unsigned long flags;
338 unsigned int idx;
339
340 if (unlikely(!page_ext))
341 return;
342
343 if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
344 page_ext_put(page_ext);
345 return;
346 }
347
348 page_ext_put(page_ext);
349 record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
350 record.ts_usec = ktime_to_us(ktime_get_boottime());
351 capture_page_state(page, &record);
352
353 spin_lock_irqsave(&acf_pinner.lock, flags);
354 idx = acf_pinner.index++;
355 acf_pinner.index %= LONGTERM_PIN_BUCKETS;
356 acf_pinner.pinner[idx] = record;
357 spin_unlock_irqrestore(&acf_pinner.lock, flags);
358 }
359 EXPORT_SYMBOL(__page_pinner_migration_failed);
360
/*
 * For each page on @page_list that is still pinned by someone else,
 * set the migration-failed flag and immediately record the failure
 * into the acf_pinner ring.
 */
void __page_pinner_mark_migration_failed_pages(struct list_head *page_list)
{
	struct page *page;
	struct page_ext *page_ext;

	list_for_each_entry(page, page_list, lru) {
		/* The page will be freed by putback_movable_pages soon */
		if (page_count(page) == 1)
			continue;
		page_ext = page_ext_get(page);
		if (unlikely(!page_ext))
			continue;
		/* Flag must be set before recording, see the test in
		 * __page_pinner_migration_failed().
		 */
		__set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
		page_ext_put(page_ext);
		__page_pinner_migration_failed(page);
	}
}
378
379 static ssize_t
read_longterm_page_pinner(struct file * file,char __user * buf,size_t count,loff_t * ppos)380 read_longterm_page_pinner(struct file *file, char __user *buf, size_t count,
381 loff_t *ppos)
382 {
383 loff_t i, idx;
384 struct captured_pinner record;
385 unsigned long flags;
386
387 if (!static_branch_unlikely(&page_pinner_inited))
388 return -EINVAL;
389
390 if (*ppos >= LONGTERM_PIN_BUCKETS)
391 return 0;
392
393 i = *ppos;
394 *ppos = i + 1;
395
396 /*
397 * reading the records in the reverse order with newest one
398 * being read first followed by older ones
399 */
400 idx = (lt_pinner.index - 1 - i + LONGTERM_PIN_BUCKETS) %
401 LONGTERM_PIN_BUCKETS;
402 spin_lock_irqsave(<_pinner.lock, flags);
403 record = lt_pinner.pinner[idx];
404 spin_unlock_irqrestore(<_pinner.lock, flags);
405 if (!record.handle)
406 return 0;
407
408 return print_page_pinner(true, buf, count, &record);
409 }
410
/* debugfs fops for "longterm_pinner"; read-only, one record per read. */
static const struct file_operations proc_longterm_pinner_operations = {
	.read = read_longterm_page_pinner,
};
414
read_alloc_contig_failed(struct file * file,char __user * buf,size_t count,loff_t * ppos)415 static ssize_t read_alloc_contig_failed(struct file *file, char __user *buf,
416 size_t count, loff_t *ppos)
417 {
418 loff_t i, idx;
419 struct captured_pinner record;
420 unsigned long flags;
421
422 if (!static_branch_unlikely(&failure_tracking))
423 return -EINVAL;
424
425 if (*ppos >= LONGTERM_PIN_BUCKETS)
426 return 0;
427
428 i = *ppos;
429 *ppos = i + 1;
430
431 /*
432 * reading the records in the reverse order with newest one
433 * being read first followed by older ones
434 */
435 idx = (acf_pinner.index - 1 - i + LONGTERM_PIN_BUCKETS) %
436 LONGTERM_PIN_BUCKETS;
437
438 spin_lock_irqsave(&acf_pinner.lock, flags);
439 record = acf_pinner.pinner[idx];
440 spin_unlock_irqrestore(&acf_pinner.lock, flags);
441 if (!record.handle)
442 return 0;
443
444 return print_page_pinner(false, buf, count, &record);
445 }
446
/* debugfs fops for "alloc_contig_failed"; read-only, one record per read. */
static const struct file_operations proc_alloc_contig_failed_operations = {
	.read = read_alloc_contig_failed,
};
450
pp_threshold_set(void * data,unsigned long long val)451 static int pp_threshold_set(void *data, unsigned long long val)
452 {
453 unsigned long flags;
454
455 threshold_usec = (s64)val;
456
457 spin_lock_irqsave(<_pinner.lock, flags);
458 memset(lt_pinner.pinner, 0,
459 sizeof(struct captured_pinner) * LONGTERM_PIN_BUCKETS);
460 lt_pinner.index = 0;
461 spin_unlock_irqrestore(<_pinner.lock, flags);
462 return 0;
463 }
464
/* debugfs "threshold" getter: report the current long-term pin threshold. */
static int pp_threshold_get(void *data, unsigned long long *val)
{
	*val = (unsigned long long)threshold_usec;

	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(pp_threshold_fops, pp_threshold_get,
			 pp_threshold_set, "%lld\n");
473
/*
 * debugfs "failure_tracking" setter: any non-zero value enables the
 * migration-failure tracking static key, zero disables it.
 */
static int failure_tracking_set(void *data, u64 val)
{
	if (val)
		static_branch_enable(&failure_tracking);
	else
		static_branch_disable(&failure_tracking);
	return 0;
}
485
/* debugfs "failure_tracking" getter: 1 if tracking is enabled, else 0. */
static int failure_tracking_get(void *data, u64 *val)
{
	*val = static_branch_unlikely(&failure_tracking);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
			 failure_tracking_get,
			 failure_tracking_set, "%llu\n");
494
page_pinner_init(void)495 static int __init page_pinner_init(void)
496 {
497 struct dentry *pp_debugfs_root;
498
499 if (!static_branch_unlikely(&page_pinner_inited))
500 return 0;
501
502 pr_info("page_pinner enabled\n");
503
504 pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);
505
506 debugfs_create_file("longterm_pinner", 0444, pp_debugfs_root, NULL,
507 &proc_longterm_pinner_operations);
508
509 debugfs_create_file("threshold", 0644, pp_debugfs_root, NULL,
510 &pp_threshold_fops);
511
512 debugfs_create_file("alloc_contig_failed", 0444,
513 pp_debugfs_root, NULL,
514 &proc_alloc_contig_failed_operations);
515
516 debugfs_create_file("failure_tracking", 0644,
517 pp_debugfs_root, NULL,
518 &failure_tracking_fops);
519 return 0;
520 }
521 late_initcall(page_pinner_init)
522