// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/swap.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/admin-guide/sysctl/vm.rst.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>
#include <linux/buffer_head.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

/* Protecting only lru_rotate.pvec which requires disabling interrupts */
struct lru_rotate {
	local_lock_t lock;
	struct pagevec pvec;
};
static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

/*
 * The following struct pagevec are grouped together because they are protected
 * by disabling preemption (and interrupts remain enabled).
 */
struct lru_pvecs {
	local_lock_t lock;
	struct pagevec lru_add;
	struct pagevec lru_deactivate_file;
	struct pagevec lru_deactivate;
	struct pagevec lru_lazyfree;
	struct pagevec lru_lazyfree_movetail;
#ifdef CONFIG_SMP
	struct pagevec activate_page;
#endif
};
static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		pg_data_t *pgdat = page_pgdat(page);
		struct lruvec *lruvec;
		unsigned long flags;

		spin_lock_irqsave(&pgdat->lru_lock, flags);
		lruvec = mem_cgroup_page_lruvec(page, pgdat);
		VM_BUG_ON_PAGE(!PageLRU(page), page);
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
	}
	__ClearPageWaiters(page);
}

static void __put_single_page(struct page *page)
{
	__page_cache_release(page);
	mem_cgroup_uncharge(page);
	free_unref_page(page);
}

static void __put_compound_page(struct page *page)
{
	/*
	 * __page_cache_release() is supposed to be called for thp, not for
	 * hugetlb. This is because a hugetlb page never has PageLRU set (it
	 * is never added to any LRU list) and no memcg routines should be
	 * called for hugetlb (it has a separate hugetlb_cgroup.)
	 */
	if (!PageHuge(page))
		__page_cache_release(page);
	destroy_compound_page(page);
}

void __put_page(struct page *page)
{
	if (is_zone_device_page(page)) {
		put_dev_pagemap(page->pgmap);

		/*
		 * The page belongs to the device that created pgmap. Do
		 * not return it to page allocator.
		 */
		return;
	}

	if (unlikely(PageCompound(page)))
		__put_compound_page(page);
	else
		__put_single_page(page);
}
EXPORT_SYMBOL(__put_page);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = lru_to_page(pages);
		list_del(&victim->lru);
		put_page(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);
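
/*
 * Illustrative sketch (not part of the original source): a typical caller
 * builds a private list threaded on page->lru, each page holding a
 * reference, and hands the whole list to put_pages_list(), which drops
 * one reference per page:
 *
 *	LIST_HEAD(pages);
 *
 *	// ... take a reference on each page and
 *	// list_add(&page->lru, &pages) ...
 *	put_pages_list(&pages);
 */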

/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:	An array of struct kvec structures
 * @nr_segs:	number of segments to pin
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_segs long.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_segs is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
		struct page **pages)
{
	int seg;

	for (seg = 0; seg < nr_segs; seg++) {
		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
			return seg;

		pages[seg] = kmap_to_page(kiov[seg].iov_base);
		get_page(pages[seg]);
	}

	return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:	starting kernel address
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives a pointer to the page pinned.
 *		Must have space for at least one page pointer.
 *
 * Returns 1 if the page is pinned. If the page was not pinned, returns
 * -errno. The page returned must be released with a put_page() call
 * when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
	const struct kvec kiov = {
		.iov_base = (void *)start,
		.iov_len = PAGE_SIZE
	};

	return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);
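
/*
 * Illustrative sketch (not part of the original source): pinning the page
 * backing one page-aligned kernel virtual address and releasing it again.
 * "kbuf" here is a hypothetical page-aligned kernel buffer.
 *
 *	struct page *page;
 *
 *	if (get_kernel_page((unsigned long)kbuf, 0, &page) == 1) {
 *		// ... use the page ...
 *		put_page(page);
 *	}
 */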

static void pagevec_lru_move_fn(struct pagevec *pvec,
	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
	void *arg)
{
	int i;
	struct pglist_data *pgdat = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct pglist_data *pagepgdat = page_pgdat(page);

		if (pagepgdat != pgdat) {
			if (pgdat)
				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
			pgdat = pagepgdat;
			spin_lock_irqsave(&pgdat->lru_lock, flags);
		}

		lruvec = mem_cgroup_page_lruvec(page, pgdat);
		(*move_fn)(page, lruvec, arg);
	}
	if (pgdat)
		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
	release_pages(pvec->pages, pvec->nr);
	pagevec_reinit(pvec);
}

static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int *pgmoved = arg;

	if (PageLRU(page) && !PageUnevictable(page)) {
		del_page_from_lru_list(page, lruvec, page_lru(page));
		ClearPageActive(page);
		add_page_to_lru_list_tail(page, lruvec, page_lru(page));
		(*pgmoved) += thp_nr_pages(page);
	}
}

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int pgmoved = 0;

	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
	__count_vm_events(PGROTATED, pgmoved);
}

/* return true if pagevec needs to drain */
static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
{
	bool ret = false;

	if (!pagevec_add(pvec, page) || PageCompound(page) ||
			lru_cache_disabled())
		ret = true;

	return ret;
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		get_page(page);
		local_lock_irqsave(&lru_rotate.lock, flags);
		pvec = this_cpu_ptr(&lru_rotate.pvec);
		if (pagevec_add_and_need_flush(pvec, page))
			pagevec_move_tail(pvec);
		local_unlock_irqrestore(&lru_rotate.lock, flags);
	}
}

void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
{
	do {
		unsigned long lrusize;

		/* Record cost event */
		if (file)
			lruvec->file_cost += nr_pages;
		else
			lruvec->anon_cost += nr_pages;

		/*
		 * Decay previous events
		 *
		 * Because workloads change over time (and to avoid
		 * overflow) we keep these statistics as a floating
		 * average, which ends up weighing recent refaults
		 * more than old ones.
		 */
		lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
			  lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
			  lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
			  lruvec_page_state(lruvec, NR_ACTIVE_FILE);

		if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
			lruvec->file_cost /= 2;
			lruvec->anon_cost /= 2;
		}
	} while ((lruvec = parent_lruvec(lruvec)));
}
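
/*
 * Worked example with illustrative numbers (not from the original source):
 * with lrusize = 4000 pages, the costs are halved once
 * file_cost + anon_cost exceeds lrusize / 4 = 1000. A burst of 600 file
 * refaults on top of an existing anon_cost of 500 gives 1100 > 1000, so
 * both are halved to 300 and 250, which keeps recent refaults weighted
 * more heavily than old ones.
 */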

void lru_note_cost_page(struct page *page)
{
	lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
		      page_is_file_lru(page), thp_nr_pages(page));
}

static void __activate_page(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int lru = page_lru_base_type(page);
		int nr_pages = thp_nr_pages(page);

		del_page_from_lru_list(page, lruvec, lru);
		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(page, lruvec, lru);
		trace_mm_lru_activate(page);

		__count_vm_events(PGACTIVATE, nr_pages);
		__count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
				     nr_pages);
	}
}

#ifdef CONFIG_SMP
static void activate_page_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);

	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

static bool need_activate_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
}

static void activate_page(struct page *page)
{
	page = compound_head(page);
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec;

		local_lock(&lru_pvecs.lock);
		pvec = this_cpu_ptr(&lru_pvecs.activate_page);
		get_page(page);
		if (pagevec_add_and_need_flush(pvec, page))
			pagevec_lru_move_fn(pvec, __activate_page, NULL);
		local_unlock(&lru_pvecs.lock);
	}
}

#else
static inline void activate_page_drain(int cpu)
{
}

static void activate_page(struct page *page)
{
	pg_data_t *pgdat = page_pgdat(page);

	page = compound_head(page);
	spin_lock_irq(&pgdat->lru_lock);
	__activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
	spin_unlock_irq(&pgdat->lru_lock);
}
#endif

static void __lru_cache_activate_page(struct page *page)
{
	struct pagevec *pvec;
	int i;

	local_lock(&lru_pvecs.lock);
	pvec = this_cpu_ptr(&lru_pvecs.lru_add);

	/*
	 * Search backwards on the optimistic assumption that the page being
	 * activated has just been added to this pagevec. Note that only
	 * the local pagevec is examined as a !PageLRU page could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * pagevec that is currently being drained. Furthermore, marking
	 * a remote pagevec's page PageActive potentially hits a race where
	 * a page is marked PageActive just after it is added to the inactive
	 * list causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
		struct page *pagevec_page = pvec->pages[i];

		if (pagevec_page == page) {
			SetPageActive(page);
			break;
		}
	}

	local_unlock(&lru_pvecs.lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 *
 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
 */
void mark_page_accessed(struct page *page)
{
	page = compound_head(page);

	trace_android_vh_mark_page_accessed(page);
	if (!PageReferenced(page)) {
		SetPageReferenced(page);
	} else if (PageUnevictable(page)) {
		/*
		 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
		 * this list is never rotated or maintained, so marking an
		 * unevictable page accessed has no effect.
		 */
	} else if (!PageActive(page)) {
		/*
		 * If the page is on the LRU, queue it for activation via
		 * lru_pvecs.activate_page. Otherwise, assume the page is on a
		 * pagevec, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (PageLRU(page))
			activate_page(page);
		else
			__lru_cache_activate_page(page);
		ClearPageReferenced(page);
		workingset_activation(page);
	}
	if (page_is_idle(page))
		clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);
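
/*
 * Illustrative sketch (not part of the original source): a read path that
 * looks up a page cache page would typically note the access like this,
 * so that repeated references eventually promote the page to the active
 * LRU. "mapping" and "index" are assumed locals of the caller.
 *
 *	page = find_get_page(mapping, index);
 *	if (page) {
 *		mark_page_accessed(page);
 *		// ... copy data out, then put_page(page) ...
 *	}
 */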

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 *
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives a chance for the caller of lru_cache_add()
 * to have the page added to the active list using mark_page_accessed().
 */
void lru_cache_add(struct page *page)
{
	struct pagevec *pvec;

	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);

	get_page(page);
	local_lock(&lru_pvecs.lock);
	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
	if (pagevec_add_and_need_flush(pvec, page))
		__pagevec_lru_add(pvec);
	local_unlock(&lru_pvecs.lock);
}
EXPORT_SYMBOL(lru_cache_add);
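
/*
 * Illustrative sketch (not part of the original source): after a newly
 * allocated page has been inserted into the page cache, the caller puts
 * it on the LRU; the per-CPU pagevec above batches the actual list
 * manipulation until the next drain. "mapping", "index" and "gfp" are
 * assumed locals of the caller, and add_to_page_cache_lru() normally
 * wraps this pattern.
 *
 *	page = __page_cache_alloc(gfp);
 *	if (page && !add_to_page_cache_locked(page, mapping, index, gfp))
 *		lru_cache_add(page);
 */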

/**
 * __lru_cache_add_inactive_or_unevictable
 * @page: the page to be added to LRU
 * @vma_flags: the vm_flags of the VMA in which the page is mapped, used to
 *	       determine its evictability
 *
 * Place @page on the inactive or unevictable LRU list, depending on its
 * evictability.
 */
void __lru_cache_add_inactive_or_unevictable(struct page *page,
					     unsigned long vma_flags)
{
	bool unevictable;

	VM_BUG_ON_PAGE(PageLRU(page), page);

	unevictable = (vma_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
	if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);
		/*
		 * We use the irq-unsafe __mod_zone_page_state because this
		 * counter is not modified from interrupt context, and the pte
		 * lock is held (a spinlock), which implies preemption is
		 * disabled.
		 */
		__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
	}
	lru_cache_add(page);
}

/*
 * If the page can not be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the page isn't mapped and is dirty/under writeback, it can be
 * reclaimed ASAP by using PG_reclaim.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4 the page is moved to the head of the inactive list because
 * the VM expects it to be written out by the flusher threads, which is
 * much more effective than the single-page writeout from reclaim.
 */
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
				   void *arg)
{
	int lru;
	bool active;
	int nr_pages = thp_nr_pages(page);

	if (!PageLRU(page))
		return;

	if (PageUnevictable(page))
		return;

	/* Some processes are using the page */
	if (page_mapped(page))
		return;

	active = PageActive(page);
	lru = page_lru_base_type(page);

	del_page_from_lru_list(page, lruvec, lru + active);
	ClearPageActive(page);
	ClearPageReferenced(page);

	if (PageWriteback(page) || PageDirty(page)) {
		/*
		 * PG_reclaim could race with end_page_writeback(), which
		 * can confuse readahead. But the race window is _really_
		 * small and it's a non-critical problem.
		 */
		add_page_to_lru_list(page, lruvec, lru);
		SetPageReclaim(page);
	} else {
		/*
		 * The page's writeback ended while it was in the pagevec,
		 * so move the page to the tail of the inactive list.
		 */
		add_page_to_lru_list_tail(page, lruvec, lru);
		__count_vm_events(PGROTATED, nr_pages);
	}

	if (active) {
		__count_vm_events(PGDEACTIVATE, nr_pages);
		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
				     nr_pages);
	}
}

static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
			      void *arg)
{
	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
		int lru = page_lru_base_type(page);
		int nr_pages = thp_nr_pages(page);

		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
		ClearPageActive(page);
		ClearPageReferenced(page);
		add_page_to_lru_list(page, lruvec, lru);

		__count_vm_events(PGDEACTIVATE, nr_pages);
		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
				     nr_pages);
	}
}

static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
	    !PageSwapCache(page) && !PageUnevictable(page)) {
		bool active = PageActive(page);
		int nr_pages = thp_nr_pages(page);

		del_page_from_lru_list(page, lruvec,
				       LRU_INACTIVE_ANON + active);
		ClearPageActive(page);
		ClearPageReferenced(page);
		/*
		 * Lazyfree pages are clean anonymous pages. They have
		 * PG_swapbacked flag cleared, to distinguish them from normal
		 * anonymous pages
		 */
		ClearPageSwapBacked(page);
		add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);

		__count_vm_events(PGLAZYFREE, nr_pages);
		__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
				     nr_pages);
	}
}

static void lru_lazyfree_movetail_fn(struct page *page, struct lruvec *lruvec,
				     void *arg)
{
	bool *add_to_tail = (bool *)arg;

	if (PageLRU(page) && !PageUnevictable(page) && PageSwapBacked(page) &&
	    !PageSwapCache(page)) {
		bool active = PageActive(page);

		del_page_from_lru_list(page, lruvec,
				       LRU_INACTIVE_ANON + active);
		ClearPageActive(page);
		ClearPageReferenced(page);
		if (add_to_tail && *add_to_tail)
			add_page_to_lru_list_tail(page, lruvec, LRU_INACTIVE_FILE);
		else
			add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
	}
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);

	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_rotate.pvec, cpu);
	/* Disabling interrupts below acts as a compiler barrier. */
	if (data_race(pagevec_count(pvec))) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_lock_irqsave(&lru_rotate.lock, flags);
		pagevec_move_tail(pvec);
		local_unlock_irqrestore(&lru_rotate.lock, flags);
	}

	pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);

	pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);

	pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);

	pvec = &per_cpu(lru_pvecs.lru_lazyfree_movetail, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_lazyfree_movetail_fn, NULL);

	activate_page_drain(cpu);
}

/**
 * deactivate_file_page - forcefully deactivate a file page
 * @page: page to deactivate
 *
 * This function hints the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 */
void deactivate_file_page(struct page *page)
{
	/*
	 * In a workload with many unevictable pages (such as mprotect),
	 * deactivating unevictable pages to accelerate reclaim is pointless.
	 */
	if (PageUnevictable(page))
		return;

	if (likely(get_page_unless_zero(page))) {
		struct pagevec *pvec;

		local_lock(&lru_pvecs.lock);
		pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);

		if (pagevec_add_and_need_flush(pvec, page))
			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
		local_unlock(&lru_pvecs.lock);
	}
}
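
/*
 * Illustrative sketch (not part of the original source): a cache
 * invalidation path that fails to drop a page, for example because it is
 * dirty or under writeback, can hint that the page should be reclaimed
 * soon:
 *
 *	if (!invalidate_inode_page(page))
 *		deactivate_file_page(page);
 */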

/*
 * deactivate_page - deactivate a page
 * @page: page to deactivate
 *
 * deactivate_page() moves @page to the inactive list if @page was on the active
 * list and was not an unevictable page.  This is done to accelerate the reclaim
 * of @page.
 */
void deactivate_page(struct page *page)
{
	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec;

		local_lock(&lru_pvecs.lock);
		pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
		get_page(page);
		if (pagevec_add_and_need_flush(pvec, page))
			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
		local_unlock(&lru_pvecs.lock);
	}
}

/**
 * mark_page_lazyfree - make an anon page lazyfree
 * @page: page to deactivate
 *
 * mark_page_lazyfree() moves @page to the inactive file list.
 * This is done to accelerate the reclaim of @page.
 */
void mark_page_lazyfree(struct page *page)
{
	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
	    !PageSwapCache(page) && !PageUnevictable(page)) {
		struct pagevec *pvec;

		local_lock(&lru_pvecs.lock);
		pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
		get_page(page);
		if (pagevec_add_and_need_flush(pvec, page))
			pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
		local_unlock(&lru_pvecs.lock);
	}
}

/**
 * mark_page_lazyfree_movetail - make a swapbacked page lazyfree
 * @page: page to deactivate
 * @tail: add the page to the tail of the inactive file list when true
 *
 * mark_page_lazyfree_movetail() moves @page to the inactive file list,
 * optionally to its tail.  This is done to accelerate the reclaim of @page.
 */
void mark_page_lazyfree_movetail(struct page *page, bool tail)
{
	if (PageLRU(page) && !PageUnevictable(page) && PageSwapBacked(page) &&
	    !PageSwapCache(page)) {
		struct pagevec *pvec;

		local_lock(&lru_pvecs.lock);
		pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree_movetail);
		get_page(page);
		if (pagevec_add_and_need_flush(pvec, page))
			pagevec_lru_move_fn(pvec,
					    lru_lazyfree_movetail_fn, &tail);
		local_unlock(&lru_pvecs.lock);
	}
}

void lru_add_drain(void)
{
	local_lock(&lru_pvecs.lock);
	lru_add_drain_cpu(smp_processor_id());
	local_unlock(&lru_pvecs.lock);
}

/*
 * It's called from per-cpu workqueue context in the SMP case, so
 * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on
 * the same cpu. It isn't a problem in the !SMP case, since there is
 * only one core and the locks disable preemption.
 */
static void lru_add_and_bh_lrus_drain(void)
{
	local_lock(&lru_pvecs.lock);
	lru_add_drain_cpu(smp_processor_id());
	local_unlock(&lru_pvecs.lock);
	invalidate_bh_lrus_cpu();
}

void lru_add_drain_cpu_zone(struct zone *zone)
{
	local_lock(&lru_pvecs.lock);
	lru_add_drain_cpu(smp_processor_id());
	drain_local_pages(zone);
	local_unlock(&lru_pvecs.lock);
}

#ifdef CONFIG_SMP

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_and_bh_lrus_drain();
}

/*
 * Doesn't need any cpu hotplug locking because we do rely on per-cpu
 * kworkers being shut down before our page_alloc_cpu_dead callback is
 * executed on the offlined cpu.
 * Calling this function with cpu hotplug locks held can actually lead
 * to obscure indirect dependencies via WQ context.
 */
inline void __lru_add_drain_all(bool force_all_cpus)
{
	/*
	 * lru_drain_gen - Global pages generation number
	 *
	 * (A) Definition: global lru_drain_gen = x implies that all generations
	 *     0 < n <= x are already *scheduled* for draining.
	 *
	 * This is an optimization for the highly-contended use case where a
	 * user space workload keeps constantly generating a flow of pages for
	 * each CPU.
	 */
	static unsigned int lru_drain_gen;
	static struct cpumask has_work;
	static DEFINE_MUTEX(lock);
	unsigned cpu, this_gen;

	/*
	 * Make sure nobody triggers this path before mm_percpu_wq is fully
	 * initialized.
	 */
	if (WARN_ON(!mm_percpu_wq))
		return;

	/*
	 * Guarantee pagevec counter stores visible by this CPU are visible to
	 * other CPUs before loading the current drain generation.
	 */
	smp_mb();

	/*
	 * (B) Locally cache global LRU draining generation number
	 *
	 * The read barrier ensures that the counter is loaded before the mutex
	 * is taken. It pairs with smp_mb() inside the mutex critical section
	 * at (D).
	 */
	this_gen = smp_load_acquire(&lru_drain_gen);

	mutex_lock(&lock);

	/*
	 * (C) Exit the draining operation if a newer generation, from another
	 * lru_add_drain_all(), was already scheduled for draining. Check (A).
	 */
	if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
		goto done;

	/*
	 * (D) Increment global generation number
	 *
	 * Pairs with smp_load_acquire() at (B), outside of the critical
	 * section. Use a full memory barrier to guarantee that the new global
	 * drain generation number is stored before loading pagevec counters.
	 *
	 * This pairing must be done here, before the for_each_online_cpu loop
	 * below which drains the page vectors.
	 *
	 * Let x, y, and z represent some system CPU numbers, where x < y < z.
	 * Assume CPU #z is in the middle of the for_each_online_cpu loop
	 * below and has already reached CPU #y's per-cpu data. CPU #x comes
	 * along, adds some pages to its per-cpu vectors, then calls
	 * lru_add_drain_all().
	 *
	 * If the paired barrier is done at any later step, e.g. after the
	 * loop, CPU #x will just exit at (C) and miss flushing out all of its
	 * added pages.
	 */
	WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
	smp_mb();

	cpumask_clear(&has_work);
	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (force_all_cpus ||
		    pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
		    data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
		    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
		    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
		    pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
		    pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree_movetail, cpu)) ||
		    need_activate_page_drain(cpu) ||
		    has_bh_in_lru(cpu, NULL)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			queue_work_on(cpu, mm_percpu_wq, work);
			__cpumask_set_cpu(cpu, &has_work);
		}
	}

	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

done:
	mutex_unlock(&lock);
}

void lru_add_drain_all(void)
{
	__lru_add_drain_all(false);
}
#else
void lru_add_drain_all(void)
{
	lru_add_drain();
}
#endif /* CONFIG_SMP */

static atomic_t lru_disable_count = ATOMIC_INIT(0);

bool lru_cache_disabled(void)
{
	return atomic_read(&lru_disable_count) != 0;
}

void lru_cache_enable(void)
{
	atomic_dec(&lru_disable_count);
}
EXPORT_SYMBOL_GPL(lru_cache_enable);

/*
 * lru_cache_disable() needs to be called before we start compiling
 * a list of pages to be migrated using isolate_lru_page().
 * It drains the pages on the LRU cache and then disables the cache on
 * all CPUs until lru_cache_enable() is called.
 *
 * Must be paired with a call to lru_cache_enable().
 */
void lru_cache_disable(void)
{
	/*
	 * If someone has already disabled the lru cache, just return after
	 * incrementing the lru_disable_count.
	 */
	if (atomic_inc_not_zero(&lru_disable_count))
		return;
#ifdef CONFIG_SMP
	/*
	 * lru_add_drain_all in the force mode will schedule draining on
	 * all online CPUs, so any calls of lru_cache_disabled wrapped by
	 * local_lock or preemption disabled would be ordered by that.
	 * The atomic operation doesn't need to have stronger ordering
	 * requirements because that is enforced by the scheduling
	 * guarantees.
	 */
	__lru_add_drain_all(true);
#else
	lru_add_and_bh_lrus_drain();
#endif
	atomic_inc(&lru_disable_count);
}
EXPORT_SYMBOL_GPL(lru_cache_disable);
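
/*
 * Illustrative sketch (not part of the original source): a migration-style
 * caller pairs the two helpers around the phase that isolates pages, so
 * that no new pages get parked in the per-CPU pagevecs while the list of
 * pages to migrate is being built:
 *
 *	lru_cache_disable();
 *	// ... isolate_lru_page() on each candidate page, migrate ...
 *	lru_cache_enable();
 */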

/**
 * release_pages - batched put_page()
 * @pages: array of pages to release
 * @nr: number of pages
 *
 * Decrement the reference count on all the pages in @pages.  If it
 * fell to zero, remove the page from the LRU and free it.
 */
void release_pages(struct page **pages, int nr)
{
	int i;
	LIST_HEAD(pages_to_free);
	struct pglist_data *locked_pgdat = NULL;
	struct lruvec *lruvec;
	unsigned long flags;
	unsigned int lock_batch;

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		/*
		 * Make sure the IRQ-safe lock-holding time does not get
		 * excessive with a continuous string of pages from the
		 * same pgdat. The lock is held only if pgdat != NULL.
		 */
		if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
			spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
			locked_pgdat = NULL;
		}

		page = compound_head(page);
		if (is_huge_zero_page(page))
			continue;

		if (is_zone_device_page(page)) {
			if (locked_pgdat) {
				spin_unlock_irqrestore(&locked_pgdat->lru_lock,
						       flags);
				locked_pgdat = NULL;
			}
			/*
			 * ZONE_DEVICE pages that return 'false' from
			 * page_is_devmap_managed() do not require special
			 * processing, and instead, expect a call to
			 * put_page_testzero().
			 */
			if (page_is_devmap_managed(page)) {
				put_devmap_managed_page(page);
				continue;
			}
		}

		if (!put_page_testzero(page))
			continue;

		if (PageCompound(page)) {
			if (locked_pgdat) {
				spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
				locked_pgdat = NULL;
			}
			__put_compound_page(page);
			continue;
		}

		if (PageLRU(page)) {
			struct pglist_data *pgdat = page_pgdat(page);

			if (pgdat != locked_pgdat) {
				if (locked_pgdat)
					spin_unlock_irqrestore(&locked_pgdat->lru_lock,
							       flags);
				lock_batch = 0;
				locked_pgdat = pgdat;
				spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
			}

			lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
			VM_BUG_ON_PAGE(!PageLRU(page), page);
			__ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_off_lru(page));
		}

		__ClearPageWaiters(page);

		list_add(&page->lru, &pages_to_free);
	}
	if (locked_pgdat)
		spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);

	mem_cgroup_uncharge_list(&pages_to_free);
	free_unref_page_list(&pages_to_free);
}
EXPORT_SYMBOL(release_pages);
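
/*
 * Illustrative sketch (not part of the original source): dropping the
 * references held on a batch of pages with one call instead of looping
 * over put_page(), so the LRU lock is only taken once per run of pages
 * from the same pgdat:
 *
 *	struct page *pages[16];
 *	int nr = 0;
 *
 *	// ... fill pages[0..nr-1] with referenced pages ...
 *	release_pages(pages, nr);
 */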

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	if (!pvec->percpu_pvec_drained) {
		lru_add_drain();
		pvec->percpu_pvec_drained = true;
	}
	release_pages(pvec->pages, pagevec_count(pvec));
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
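
/*
 * Illustrative sketch (not part of the original source): a caller that
 * accumulates page references in a pagevec and releases them in batches
 * via pagevec_release(), the wrapper around __pagevec_release() used
 * throughout the page cache code. next_page_somehow() is a hypothetical
 * source of referenced pages.
 *
 *	struct pagevec pvec;
 *	struct page *page;
 *
 *	pagevec_init(&pvec);
 *	while ((page = next_page_somehow()) != NULL) {
 *		if (!pagevec_add(&pvec, page))
 *			pagevec_release(&pvec);
 *	}
 *	pagevec_release(&pvec);
 */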

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
		       struct lruvec *lruvec, struct list_head *list)
{
	VM_BUG_ON_PAGE(!PageHead(page), page);
	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
	lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);

	if (!list)
		SetPageLRU(page_tail);

	if (likely(PageLRU(page)))
		list_add_tail(&page_tail->lru, &page->lru);
	else if (list) {
		/* page reclaim is reclaiming a huge page */
		get_page(page_tail);
		list_add_tail(&page_tail->lru, list);
	} else {
		/*
		 * The head page has not yet been counted as an hpage,
		 * so we must account for each subpage individually.
		 *
		 * Put page_tail on the list at the correct position
		 * so they all end up in order.
		 */
		add_page_to_lru_list_tail(page_tail, lruvec,
					  page_lru(page_tail));
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	enum lru_list lru;
	int was_unevictable = TestClearPageUnevictable(page);
	int nr_pages = thp_nr_pages(page);

	VM_BUG_ON_PAGE(PageLRU(page), page);

	/*
	 * Page becomes evictable in two ways:
	 * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
	 * 2) Before acquiring LRU lock to put the page on the correct LRU
	 *    and then
	 *    a) do PageLRU check with lock [check_move_unevictable_pages]
	 *    b) do PageLRU check before lock [clear_page_mlock]
	 *
	 * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
	 * the following strict ordering:
	 *
	 * #0: __pagevec_lru_add_fn		#1: clear_page_mlock
	 *
	 * SetPageLRU()				TestClearPageMlocked()
	 * smp_mb() // explicit ordering	// above provides strict
	 *					// ordering
	 * PageMlocked()			PageLRU()
	 *
	 *
	 * if '#1' does not observe setting of PG_lru by '#0' and fails
	 * isolation, the explicit barrier will make sure that the
	 * page_evictable check puts the page on the correct LRU. Without
	 * smp_mb(), SetPageLRU can be reordered after the PageMlocked check
	 * and can make '#1' fail to isolate the page whose Mlocked bit was
	 * just cleared (#0 is also looking at the same page), and the
	 * evictable page will be stranded on an unevictable LRU.
	 */
	SetPageLRU(page);
	smp_mb__after_atomic();

	if (page_evictable(page)) {
		lru = page_lru(page);
		if (was_unevictable)
			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
	} else {
		lru = LRU_UNEVICTABLE;
		ClearPageActive(page);
		SetPageUnevictable(page);
		if (!was_unevictable)
			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
	}

	add_page_to_lru_list(page, lruvec, lru);
	trace_mm_lru_insertion(page, lru);
}
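
/*
 * Editor's note: the ordering argument in __pagevec_lru_add_fn() above is
 * the classic store-buffering pattern.  A minimal sketch, with "lru" and
 * "mlocked" standing in for PG_lru and PG_mlocked:
 *
 *	CPU 0 (__pagevec_lru_add_fn)	CPU 1 (clear_page_mlock)
 *	WRITE lru = 1			WRITE mlocked = 0
 *	smp_mb()			(ordering provided by the atomic
 *					 test-and-clear, as noted above)
 *	READ mlocked			READ lru
 *
 * With both barriers present at least one CPU must observe the other's
 * store: either CPU 1 sees PG_lru and isolates the page to fix up its
 * placement, or CPU 0 sees PG_mlocked already cleared and files the page
 * on an evictable list, so the page cannot be stranded as unevictable.
 */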

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them. Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
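
/*
 * Editor's illustrative sketch (not in the kernel sources): the usual
 * producer of these pagevecs is lru_cache_add() earlier in this file,
 * which batches pages into the per-CPU lru_add pagevec and only falls
 * back to __pagevec_lru_add() once the pagevec fills up (or for compound
 * pages), roughly:
 *
 *	get_page(page);
 *	local_lock(&lru_pvecs.lock);
 *	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
 *	if (!pagevec_add(pvec, page) || PageCompound(page))
 *		__pagevec_lru_add(pvec);
 *	local_unlock(&lru_pvecs.lock);
 */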

/**
 * pagevec_lookup_entries - gang pagecache lookup
 * @pvec: Where the resulting entries are placed
 * @mapping: The address_space to search
 * @start: The starting entry index
 * @nr_entries: The maximum number of entries
 * @indices: The cache indices corresponding to the entries in @pvec
 *
 * pagevec_lookup_entries() will search for and return a group of up
 * to @nr_entries pages and shadow entries in the mapping. All
 * entries are placed in @pvec. pagevec_lookup_entries() takes a
 * reference against actual pages in @pvec.
 *
 * The search returns a group of mapping-contiguous entries with
 * ascending indexes. There may be holes in the indices due to
 * not-present entries.
 *
 * Only one subpage of a Transparent Huge Page is returned in one call:
 * allowing truncate_inode_pages_range() to evict the whole THP without
 * cycling through a pagevec of extra references.
 *
 * pagevec_lookup_entries() returns the number of entries which were
 * found.
 */
unsigned pagevec_lookup_entries(struct pagevec *pvec,
				struct address_space *mapping,
				pgoff_t start, unsigned nr_entries,
				pgoff_t *indices)
{
	pvec->nr = find_get_entries(mapping, start, nr_entries,
				    pvec->pages, indices);
	return pagevec_count(pvec);
}
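
/*
 * Editor's illustrative sketch (not in the kernel sources): a
 * truncate-style walk over a mapping.  Shadow entries carry no page
 * reference and must be skipped (and later pruned) before the pagevec is
 * released.  "mapping" is assumed to be provided by the caller:
 *
 *	pgoff_t indices[PAGEVEC_SIZE];
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *	unsigned i;
 *
 *	pagevec_init(&pvec);
 *	while (pagevec_lookup_entries(&pvec, mapping, index,
 *				      PAGEVEC_SIZE, indices)) {
 *		for (i = 0; i < pagevec_count(&pvec); i++) {
 *			struct page *page = pvec.pages[i];
 *
 *			index = indices[i];
 *			if (xa_is_value(page))
 *				continue;	// shadow entry, not a page
 *			// ... work on the real page here ...
 *		}
 *		pagevec_remove_exceptionals(&pvec);
 *		pagevec_release(&pvec);
 *		cond_resched();
 *		index++;
 *	}
 */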

/**
 * pagevec_remove_exceptionals - pagevec exceptionals pruning
 * @pvec: The pagevec to prune
 *
 * pagevec_lookup_entries() fills both pages and exceptional radix
 * tree entries into the pagevec. This function prunes all
 * exceptionals from @pvec without leaving holes, so that it can be
 * passed on to page-only pagevec operations.
 */
void pagevec_remove_exceptionals(struct pagevec *pvec)
{
	int i, j;

	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		if (!xa_is_value(page))
			pvec->pages[j++] = page;
	}
	pvec->nr = j;
}

/**
 * pagevec_lookup_range - gang pagecache lookup
 * @pvec: Where the resulting pages are placed
 * @mapping: The address_space to search
 * @start: The starting page index
 * @end: The final page index
 *
 * pagevec_lookup_range() will search for and return a group of up to
 * PAGEVEC_SIZE pages in the mapping starting from index @start and up to
 * index @end (inclusive). The pages are placed in @pvec.
 * pagevec_lookup_range() takes a reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes. There may be holes in the indices due to not-present pages. We
 * also update @start to index the next page for the traversal.
 *
 * pagevec_lookup_range() returns the number of pages which were found. If this
 * number is smaller than PAGEVEC_SIZE, the end of the specified range has been
 * reached.
 */
unsigned pagevec_lookup_range(struct pagevec *pvec,
		struct address_space *mapping, pgoff_t *start, pgoff_t end)
{
	pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
					pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_range);
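
/*
 * Editor's illustrative sketch (not in the kernel sources): walking every
 * page currently present in a range of a mapping.  "mapping", "start" and
 * "end" are assumed to be provided by the caller; @start is advanced by
 * the lookup itself, so the loop simply repeats until nothing is found:
 *
 *	struct pagevec pvec;
 *	pgoff_t index = start;
 *	unsigned i, nr;
 *
 *	pagevec_init(&pvec);
 *	while ((nr = pagevec_lookup_range(&pvec, mapping, &index, end))) {
 *		for (i = 0; i < nr; i++) {
 *			struct page *page = pvec.pages[i];
 *			// ... a reference is held on each page ...
 *		}
 *		pagevec_release(&pvec);
 *		cond_resched();
 *	}
 */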

unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
		struct address_space *mapping, pgoff_t *index, pgoff_t end,
		xa_mark_t tag)
{
	pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
					PAGEVEC_SIZE, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_range_tag);
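
/*
 * Editor's illustrative sketch (not in the kernel sources): a
 * writeback-style walk over the pages in a range that carry a given
 * xarray mark, here PAGECACHE_TAG_DIRTY.  "mapping" and "end" are assumed
 * to be provided by the caller:
 *
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *	unsigned i, nr;
 *
 *	pagevec_init(&pvec);
 *	while ((nr = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
 *					      PAGECACHE_TAG_DIRTY))) {
 *		for (i = 0; i < nr; i++) {
 *			struct page *page = pvec.pages[i];
 *
 *			lock_page(page);
 *			// ... start writeback on the page ...
 *			unlock_page(page);
 *		}
 *		pagevec_release(&pvec);
 *		cond_resched();
 *	}
 */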

unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
		struct address_space *mapping, pgoff_t *index, pgoff_t end,
		xa_mark_t tag, unsigned max_pages)
{
	pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
		min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
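
/*
 * Editor's note: pagevec_lookup_range_nr_tag() behaves like
 * pagevec_lookup_range_tag() but caps the result at
 * min(max_pages, PAGEVEC_SIZE).  A one-line sketch, with the surrounding
 * variables assumed as in the example above:
 *
 *	nr = pagevec_lookup_range_nr_tag(&pvec, mapping, &index, end,
 *					 PAGECACHE_TAG_TOWRITE, 5);
 */
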
/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more
	 */
}
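
/*
 * Editor's note: page_cluster is the log2 of the swap readahead batch, so
 * with 4KiB pages the values chosen above work out to:
 *
 *	megs < 16:  page_cluster = 2  ->  1 << 2 = 4 pages  (16KiB per batch)
 *	otherwise:  page_cluster = 3  ->  1 << 3 = 8 pages  (32KiB per batch)
 *
 * The value can be tuned at runtime through the vm.page-cluster sysctl
 * (see Documentation/admin-guide/sysctl/vm.rst).
 */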

#ifdef CONFIG_DEV_PAGEMAP_OPS
void put_devmap_managed_page(struct page *page)
{
	int count;

	if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
		return;

	count = page_ref_dec_return(page);

	/*
	 * devmap page refcounts are 1-based, rather than 0-based: if
	 * refcount is 1, then the page is free and the refcount is
	 * stable because nobody holds a reference on the page.
	 */
	if (count == 1)
		free_devmap_managed_page(page);
	else if (!count)
		__put_page(page);
}
EXPORT_SYMBOL(put_devmap_managed_page);
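
/*
 * Editor's note, a worked example of the 1-based refcount above: an idle
 * devmap-managed page sits at refcount 1 rather than 0.  A user taking
 * get_page() moves it to 2; when that user drops the reference,
 * page_ref_dec_return() returns 1, so the page is handed to
 * free_devmap_managed_page() instead of the normal freeing path.  A
 * return value of 0 can only happen once the base reference itself is
 * gone, in which case the page falls back to __put_page().
 */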
#endif