// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>

#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
#include <trace/hooks/mm.h>

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* Protected with swap_lock; reading in vm_swap_full() doesn't need the lock. */
long total_swap_pages;
static int least_priority = -1;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
PLIST_HEAD(swap_active_head);

/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by get_swap_page() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but get_swap_page() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);

struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);

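/*
 * Map a swap type to its swap_info_struct, or NULL if no such swap
 * device has been set up yet.  The smp_rmb() pairs with the smp_wmb()
 * in alloc_swap_info(), so a reader that sees the updated nr_swapfiles
 * also sees the initialised swap_info[type] pointer.
 */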
struct swap_info_struct *swap_type_to_swap_info(int type)
{
	if (type >= READ_ONCE(nr_swapfiles))
		return NULL;

	smp_rmb();	/* Pairs with smp_wmb in alloc_swap_info. */
	return READ_ONCE(swap_info[type]);
}
EXPORT_SYMBOL_GPL(swap_type_to_swap_info);

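/*
 * Strip the SWAP_HAS_CACHE flag from a swap_map entry, leaving only the
 * reference count (which may still include the COUNT_CONTINUED flag).
 */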
static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
}

/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY		0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED		0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL		0x4

/* returns 1 if swap entry is freed */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
				 unsigned long offset, unsigned long flags)
{
	swp_entry_t entry = swp_entry(si->type, offset);
	struct page *page;
	int ret = 0;

	page = find_get_page(swap_address_space(entry), offset);
	if (!page)
		return 0;
	/*
	 * This function is called from scan_swap_map_slots() and from
	 * vmscan.c when reclaiming pages, so a lock may already be held
	 * on a page here.  We have to use trylock to avoid deadlock.
	 * This is a special case; in usual operations you should use
	 * try_to_free_swap() with an explicit lock_page().
	 */
	if (trylock_page(page)) {
		if ((flags & TTRS_ANYWAY) ||
		    ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
		    ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
			ret = try_to_free_swap(page);
		unlock_page(page);
	}
	put_page(page);
	return ret;
}

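/*
 * Helpers to walk the rbtree of swap extents in page order: first_se()
 * returns the lowest extent, next_se() the in-order successor (or NULL).
 */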
static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
	struct rb_node *rb = rb_first(&sis->swap_extent_root);
	return rb_entry(rb, struct swap_extent, rb_node);
}

static inline struct swap_extent *next_se(struct swap_extent *se)
{
	struct rb_node *rb = rb_next(&se->rb_node);
	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
}

/*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page! */
	se = first_se(si);
	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
	if (nr_blocks) {
		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			return err;
		cond_resched();
	}

	for (se = next_se(se); se; se = next_se(se)) {
		start_block = se->start_block << (PAGE_SHIFT - 9);
		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}

static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
{
	struct swap_extent *se;
	struct rb_node *rb;

	rb = sis->swap_extent_root.rb_node;
	while (rb) {
		se = rb_entry(rb, struct swap_extent, rb_node);
		if (offset < se->start_page)
			rb = rb->rb_left;
		else if (offset >= se->start_page + se->nr_pages)
			rb = rb->rb_right;
		else
			return se;
	}
	/* It *must* be present */
	BUG();
}

sector_t swap_page_sector(struct page *page)
{
	struct swap_info_struct *sis = page_swap_info(page);
	struct swap_extent *se;
	sector_t sector;
	pgoff_t offset;

	offset = __page_file_index(page);
	se = offset_to_swap_extent(sis, offset);
	sector = se->start_block + (offset - se->start_page);
	return sector << (PAGE_SHIFT - 9);
}

/*
 * swap allocation tells the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = offset_to_swap_extent(si, start_page);

	while (nr_pages) {
		pgoff_t offset = start_page - se->start_page;
		sector_t start_block = se->start_block + offset;
		sector_t nr_blocks = se->nr_pages - offset;

		if (nr_blocks > nr_pages)
			nr_blocks = nr_pages;
		start_page += nr_blocks;
		nr_pages -= nr_blocks;

		start_block <<= PAGE_SHIFT - 9;
		nr_blocks <<= PAGE_SHIFT - 9;
		if (blkdev_issue_discard(si->bdev, start_block,
					nr_blocks, GFP_NOIO, 0))
			break;

		se = next_se(se);
	}
}

#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR

#define swap_entry_size(size)	(size)
#else
#define SWAPFILE_CLUSTER	256

/*
 * Define swap_entry_size() as constant to let the compiler optimize
 * out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_size(size)	1
#endif
#define LATENCY_LIMIT		256

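/*
 * A swap_cluster_info packs a small flags field and a data field.  For a
 * node on the free or discard list, data holds the index of the next
 * cluster; for an allocated cluster it holds the count of in-use entries.
 * The accessors below keep those two interpretations apart.
 */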
static inline void cluster_set_flag(struct swap_cluster_info *info,
				    unsigned int flag)
{
	info->flags = flag;
}

static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_count(struct swap_cluster_info *info,
				     unsigned int c)
{
	info->data = c;
}

static inline void cluster_set_count_flag(struct swap_cluster_info *info,
					  unsigned int c, unsigned int f)
{
	info->flags = f;
	info->data = c;
}

static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_next(struct swap_cluster_info *info,
				    unsigned int n)
{
	info->data = n;
}

static inline void cluster_set_next_flag(struct swap_cluster_info *info,
					 unsigned int n, unsigned int f)
{
	info->flags = f;
	info->data = n;
}

static inline bool cluster_is_free(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_FREE;
}

static inline bool cluster_is_null(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_NEXT_NULL;
}

static inline void cluster_set_null(struct swap_cluster_info *info)
{
	info->flags = CLUSTER_FLAG_NEXT_NULL;
	info->data = 0;
}

static inline bool cluster_is_huge(struct swap_cluster_info *info)
{
	if (IS_ENABLED(CONFIG_THP_SWAP))
		return info->flags & CLUSTER_FLAG_HUGE;
	return false;
}

static inline void cluster_clear_huge(struct swap_cluster_info *info)
{
	info->flags &= ~CLUSTER_FLAG_HUGE;
}

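/*
 * Lock the cluster that contains @offset, if this device uses per-cluster
 * (SSD-style) locking.  Returns the locked cluster, or NULL when there is
 * no cluster_info and the caller must rely on si->lock instead.
 */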
static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
						     unsigned long offset)
{
	struct swap_cluster_info *ci;

	ci = si->cluster_info;
	if (ci) {
		ci += offset / SWAPFILE_CLUSTER;
		spin_lock(&ci->lock);
	}
	return ci;
}

static inline void unlock_cluster(struct swap_cluster_info *ci)
{
	if (ci)
		spin_unlock(&ci->lock);
}

/*
 * Determine the locking method in use for this device.  Return
 * swap_cluster_info if SSD-style cluster-based locking is in place.
 */
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
		struct swap_info_struct *si, unsigned long offset)
{
	struct swap_cluster_info *ci;

	/* Try to use fine-grained SSD-style locking if available: */
	ci = lock_cluster(si, offset);
	/* Otherwise, fall back to traditional, coarse locking: */
	if (!ci)
		spin_lock(&si->lock);

	return ci;
}

static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
					       struct swap_cluster_info *ci)
{
	if (ci)
		unlock_cluster(ci);
	else
		spin_unlock(&si->lock);
}

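/*
 * Typical usage of the pair above (a sketch, not a callable helper):
 *
 *	ci = lock_cluster_or_swap_info(si, offset);
 *	... inspect or update si->swap_map[offset] ...
 *	unlock_cluster_or_swap_info(si, ci);
 *
 * so callers take the fine-grained cluster lock when it exists and fall
 * back to si->lock transparently otherwise.
 */

/*
 * Simple singly-linked list of clusters, threaded through the clusters'
 * data fields via cluster_next()/cluster_set_next().  Used for the free
 * and discard cluster lists.
 */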
static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
	return cluster_is_null(&list->head);
}

static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
	return cluster_next(&list->head);
}

static void cluster_list_init(struct swap_cluster_list *list)
{
	cluster_set_null(&list->head);
	cluster_set_null(&list->tail);
}

static void cluster_list_add_tail(struct swap_cluster_list *list,
				  struct swap_cluster_info *ci,
				  unsigned int idx)
{
	if (cluster_list_empty(list)) {
		cluster_set_next_flag(&list->head, idx, 0);
		cluster_set_next_flag(&list->tail, idx, 0);
	} else {
		struct swap_cluster_info *ci_tail;
		unsigned int tail = cluster_next(&list->tail);

		/*
		 * Nested cluster lock, but both cluster locks are
		 * only acquired while we hold swap_info_struct->lock
		 */
		ci_tail = ci + tail;
		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
		cluster_set_next(ci_tail, idx);
		spin_unlock(&ci_tail->lock);
		cluster_set_next_flag(&list->tail, idx, 0);
	}
}

static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
					   struct swap_cluster_info *ci)
{
	unsigned int idx;

	idx = cluster_next(&list->head);
	if (cluster_next(&list->tail) == idx) {
		cluster_set_null(&list->head);
		cluster_set_null(&list->tail);
	} else
		cluster_set_next_flag(&list->head,
				      cluster_next(&ci[idx]), 0);

	return idx;
}

/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		unsigned int idx)
{
	/*
	 * If scan_swap_map() can't find a free cluster, it will check
	 * si->swap_map directly.  To make sure the discarding cluster isn't
	 * taken by scan_swap_map(), mark the swap entries bad (occupied).
	 * They will be cleared after discard.
	 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
			SWAP_MAP_BAD, SWAPFILE_CLUSTER);

	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

	schedule_work(&si->discard_work);
}

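/*
 * Mark the cluster at @idx free and put it on the tail of the free
 * cluster list.  The caller handles any discard policy; see the
 * discard-aware free_cluster() below.
 */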
static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
	cluster_list_add_tail(&si->free_clusters, ci, idx);
}

/*
 * Do the scheduled discards.  After a cluster discard is finished, the
 * cluster will be added to the free cluster list.  The caller should hold
 * si->lock.
 */
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *info, *ci;
	unsigned int idx;

	info = si->cluster_info;

	while (!cluster_list_empty(&si->discard_clusters)) {
		idx = cluster_list_del_first(&si->discard_clusters, info);
		spin_unlock(&si->lock);

		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);

		spin_lock(&si->lock);
		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
		__free_cluster(si, idx);
		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
				0, SWAPFILE_CLUSTER);
		unlock_cluster(ci);
	}
}

static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, discard_work);

	spin_lock(&si->lock);
	swap_do_scheduled_discard(si);
	spin_unlock(&si->lock);
}

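/*
 * Take the first cluster off the free list and reset its count and flags.
 * The caller passes the index it just read from the head of the list; the
 * VM_BUG_ON() guards against the list changing underneath it.
 */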
static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
	cluster_list_del_first(&si->free_clusters, ci);
	cluster_set_count_flag(ci + idx, 0, 0);
}

static void free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info + idx;

	VM_BUG_ON(cluster_count(ci) != 0);
	/*
	 * If the swap is discardable, prepare a discard of the cluster
	 * instead of freeing it immediately.  The cluster will be freed
	 * after discard.
	 */
	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
		swap_cluster_schedule_discard(si, idx);
		return;
	}

	__free_cluster(si, idx);
}

/*
 * The cluster corresponding to page_nr will be used.  The cluster will be
 * removed from the free cluster list and its usage counter will be
 * increased.
 */
static void inc_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;
	if (cluster_is_free(&cluster_info[idx]))
		alloc_cluster(p, idx);

	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) + 1);
}

/*
 * The cluster corresponding to page_nr decreases one usage.  If the usage
 * counter becomes 0, which means no page in the cluster is in use, we can
 * optionally discard the cluster and add it to the free cluster list.
 */
static void dec_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;

	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) - 1);

	if (cluster_count(&cluster_info[idx]) == 0)
		free_cluster(p, idx);
}

/*
 * It's possible for scan_swap_map() to use a free cluster in the middle of
 * the free cluster list.  Avoid such abuse to prevent list corruption.
 */
static bool
scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
	unsigned long offset)
{
	struct percpu_cluster *percpu_cluster;
	bool conflict;

	offset /= SWAPFILE_CLUSTER;
	conflict = !cluster_list_empty(&si->free_clusters) &&
		offset != cluster_list_first(&si->free_clusters) &&
		cluster_is_free(&si->cluster_info[offset]);

	if (!conflict)
		return false;

	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
	cluster_set_null(&percpu_cluster->index);
	return true;
}

/*
 * Try to get a swap entry from the current cpu's swap entry pool (a
 * cluster).  This might involve allocating a new cluster for the current
 * CPU too.
 */
static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
	unsigned long *offset, unsigned long *scan_base)
{
	struct percpu_cluster *cluster;
	struct swap_cluster_info *ci;
	unsigned long tmp, max;

new_cluster:
	cluster = this_cpu_ptr(si->percpu_cluster);
	if (cluster_is_null(&cluster->index)) {
		if (!cluster_list_empty(&si->free_clusters)) {
			cluster->index = si->free_clusters.head;
			cluster->next = cluster_next(&cluster->index) *
					SWAPFILE_CLUSTER;
		} else if (!cluster_list_empty(&si->discard_clusters)) {
			/*
			 * We don't have a free cluster but have some
			 * clusters being discarded; do the discard now and
			 * reclaim them, then reread cluster_next_cpu since
			 * we dropped si->lock.
			 */
			swap_do_scheduled_discard(si);
			*scan_base = this_cpu_read(*si->cluster_next_cpu);
			*offset = *scan_base;
			goto new_cluster;
		} else
			return false;
	}

	/*
	 * Other CPUs can use our cluster if they can't find a free cluster;
	 * check if there is still a free entry in the cluster.
	 */
	tmp = cluster->next;
	max = min_t(unsigned long, si->max,
		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
	if (tmp < max) {
		ci = lock_cluster(si, tmp);
		while (tmp < max) {
			if (!si->swap_map[tmp])
				break;
			tmp++;
		}
		unlock_cluster(ci);
	}
	if (tmp >= max) {
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	cluster->next = tmp + 1;
	*offset = tmp;
	*scan_base = tmp;
	return true;
}

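/*
 * Remove a swap device from the per-node lists of devices with free swap.
 * __del_from_avail_list() expects swap_avail_lock to be held already;
 * del_from_avail_list() takes it itself.
 */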
static void __del_from_avail_list(struct swap_info_struct *p)
{
	int nid;

	for_each_node(nid)
		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}

static void del_from_avail_list(struct swap_info_struct *p)
{
	spin_lock(&swap_avail_lock);
	__del_from_avail_list(p);
	spin_unlock(&swap_avail_lock);
}

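/*
 * Account nr_entries slots starting at offset as allocated: adjust the
 * lowest_bit/highest_bit scan hints, and if the device just became full,
 * drop it from the avail lists.  Called with si->lock held.
 */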
static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
			     unsigned int nr_entries)
{
	unsigned int end = offset + nr_entries - 1;

	if (offset == si->lowest_bit)
		si->lowest_bit += nr_entries;
	if (end == si->highest_bit)
		WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
	si->inuse_pages += nr_entries;
	if (si->inuse_pages == si->pages) {
		si->lowest_bit = si->max;
		si->highest_bit = 0;
		del_from_avail_list(si);
	}
}

static void add_to_avail_list(struct swap_info_struct *p)
{
	int nid;

	spin_lock(&swap_avail_lock);
	for_each_node(nid) {
		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
	}
	spin_unlock(&swap_avail_lock);
}

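/*
 * Return nr_entries slots starting at offset to the free pool: widen the
 * scan hints, re-add a previously full device to the avail lists, give the
 * slots back to nr_swap_pages, notify the arch, frontswap and the block
 * driver for each freed slot, then clear the swap-cache shadow entries.
 */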
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
			    unsigned int nr_entries)
{
	unsigned long begin = offset;
	unsigned long end = offset + nr_entries - 1;
	void (*swap_slot_free_notify)(struct block_device *, unsigned long);
	bool skip = false;

	if (offset < si->lowest_bit)
		si->lowest_bit = offset;
	if (end > si->highest_bit) {
		bool was_full = !si->highest_bit;

		WRITE_ONCE(si->highest_bit, end);
		if (was_full && (si->flags & SWP_WRITEOK))
			add_to_avail_list(si);
	}
	trace_android_vh_account_swap_pages(si, &skip);
	if (!skip)
		atomic_long_add(nr_entries, &nr_swap_pages);
	si->inuse_pages -= nr_entries;
	if (si->flags & SWP_BLKDEV)
		swap_slot_free_notify =
			si->bdev->bd_disk->fops->swap_slot_free_notify;
	else
		swap_slot_free_notify = NULL;
	while (offset <= end) {
		arch_swap_invalidate_page(si->type, offset);
		frontswap_invalidate_page(si->type, offset);
		if (swap_slot_free_notify)
			swap_slot_free_notify(si->bdev, offset);
		offset++;
	}
	clear_shadow_from_swap_cache(si->type, begin, end);
}

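/*
 * Record where the next slot scan should start.  On rotational devices
 * this is the single si->cluster_next hint; on SSDs each CPU keeps its own
 * hint, occasionally rerandomised across SWAP_ADDRESS_SPACE_PAGES sized
 * trunks to spread lock contention over the swap address spaces.
 */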
static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
{
	unsigned long prev;

	if (!(si->flags & SWP_SOLIDSTATE)) {
		si->cluster_next = next;
		return;
	}

	prev = this_cpu_read(*si->cluster_next_cpu);
	/*
	 * If we have crossed into another swap address space sized,
	 * aligned trunk, choose a new trunk at random to avoid lock
	 * contention on the swap address space if possible.
	 */
	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
	    (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
		/* No free swap slots available */
		if (si->highest_bit <= si->lowest_bit)
			return;
		next = si->lowest_bit +
			prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
		next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
		next = max_t(unsigned int, next, si->lowest_bit);
	}
	this_cpu_write(*si->cluster_next_cpu, next);
}

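/*
 * Scan the swap map for up to @nr free slots, marking each one with
 * @usage and storing the resulting swap entries in @slots[].  Returns the
 * number of slots actually allocated, which may be zero.  The caller must
 * hold si->lock; the scan may temporarily drop and retake it.
 */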
int scan_swap_map_slots(struct swap_info_struct *si,
			unsigned char usage, int nr,
			swp_entry_t slots[])
{
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int n_ret = 0;
	bool scanned_many = false;

	/*
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
	 * way, however, we resort to first-free allocation, starting
	 * a new cluster.  This prevents us from scattering swap pages
	 * all over the entire swap partition, so that we reduce
	 * overall disk seek times between swap pages.  -- sct
	 * But we do now try to find an empty cluster.  -Andrea
	 * And we let swap pages go all over an SSD partition.  Hugh
	 */

	si->flags += SWP_SCANNING;
	/*
	 * Use percpu scan base for SSD to reduce lock contention on
	 * cluster and swap cache.  For HDD, sequential access is more
	 * important.
	 */
	if (si->flags & SWP_SOLIDSTATE)
		scan_base = this_cpu_read(*si->cluster_next_cpu);
	else
		scan_base = si->cluster_next;
	offset = scan_base;

	/* SSD algorithm */
	if (si->cluster_info) {
		if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto scan;
	} else if (unlikely(!si->cluster_nr--)) {
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}

		spin_unlock(&si->lock);

		/*
		 * If seek is expensive, start searching for new cluster from
		 * start of partition, to minimize the span of allocated swap.
		 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
		 * case, just handled by scan_swap_map_try_ssd_cluster() above.
		 */
		scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&si->lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = scan_base;
		spin_lock(&si->lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
	}

checks:
	if (si->cluster_info) {
		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
			/* take a break if we already got some slots */
			if (n_ret)
				goto done;
			if (!scan_swap_map_try_ssd_cluster(si, &offset,
							   &scan_base))
				goto scan;
		}
	}
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	ci = lock_cluster(si, offset);
	/* reuse swap entry of cache-only swap if not busy. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;
		unlock_cluster(ci);
		spin_unlock(&si->lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
		spin_lock(&si->lock);
		/* entry was freed successfully, try to use this again */
		if (swap_was_freed)
			goto checks;
		goto scan; /* check next one */
	}

	if (si->swap_map[offset]) {
		unlock_cluster(ci);
		if (!n_ret)
			goto scan;
		else
			goto done;
	}
	WRITE_ONCE(si->swap_map[offset], usage);
	inc_cluster_info_page(si, si->cluster_info, offset);
	unlock_cluster(ci);

	swap_range_alloc(si, offset, 1);
	slots[n_ret++] = swp_entry(si->type, offset);

	/* got enough slots or reach max slots? */
	if ((n_ret == nr) || (offset >= si->highest_bit))
		goto done;

	/* search for next available slot */

	/* time to take a break? */
	if (unlikely(--latency_ration < 0)) {
		if (n_ret)
			goto done;
		spin_unlock(&si->lock);
		cond_resched();
		spin_lock(&si->lock);
		latency_ration = LATENCY_LIMIT;
	}

	/* try to get more slots in cluster */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
	} else if (si->cluster_nr && !si->swap_map[++offset]) {
		/* non-ssd case, still more slots in cluster? */
		--si->cluster_nr;
		goto checks;
	}

	/*
	 * Even if there are no free clusters available (fragmented),
	 * try to scan a little more quickly with lock held unless we
	 * have scanned too many slots already.
	 */
	if (!scanned_many) {
		unsigned long scan_limit;

		if (offset < scan_base)
			scan_limit = scan_base;
		else
			scan_limit = si->highest_bit;
		for (; offset <= scan_limit && --latency_ration > 0;
		     offset++) {
			if (!si->swap_map[offset])
				goto checks;
		}
	}

done:
	set_cluster_next(si, offset + 1);
	si->flags -= SWP_SCANNING;
	return n_ret;

scan:
	spin_unlock(&si->lock);
	while (++offset <= READ_ONCE(si->highest_bit)) {
		if (data_race(!si->swap_map[offset])) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() &&
		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
			scanned_many = true;
		}
	}
	offset = si->lowest_bit;
	while (offset < scan_base) {
		if (data_race(!si->swap_map[offset])) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() &&
		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
			scanned_many = true;
		}
		offset++;
	}
	spin_lock(&si->lock);

no_page:
	si->flags -= SWP_SCANNING;
	return n_ret;
}
EXPORT_SYMBOL_GPL(scan_swap_map_slots);

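/*
 * Allocate a whole cluster as one huge-page-sized swap entry.  Every slot
 * in the cluster is marked SWAP_HAS_CACHE and the cluster is flagged huge.
 * Returns 1 on success, 0 if THP swap is disabled or no free cluster is
 * available.  The caller must hold si->lock.
 */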
int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
	unsigned long idx;
	struct swap_cluster_info *ci;
	unsigned long offset, i;
	unsigned char *map;

	/*
	 * Should not even be attempting cluster allocations when huge
	 * page swap is disabled.  Warn and fail the allocation.
	 */
	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
		VM_WARN_ON_ONCE(1);
		return 0;
	}

	if (cluster_list_empty(&si->free_clusters))
		return 0;

	idx = cluster_list_first(&si->free_clusters);
	offset = idx * SWAPFILE_CLUSTER;
	ci = lock_cluster(si, offset);
	alloc_cluster(si, idx);
	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);

	map = si->swap_map + offset;
	for (i = 0; i < SWAPFILE_CLUSTER; i++)
		map[i] = SWAP_HAS_CACHE;
	unlock_cluster(ci);
	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
	*slot = swp_entry(si->type, offset);

	return 1;
}
EXPORT_SYMBOL_GPL(swap_alloc_cluster);

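/*
 * Free a whole cluster previously allocated by swap_alloc_cluster():
 * clear its swap_map range, reset the cluster and return it to the free
 * list, then release the slot range.
 */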
static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	unsigned long offset = idx * SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;

	ci = lock_cluster(si, offset);
	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
	cluster_set_count_flag(ci, 0, 0);
	free_cluster(si, idx);
	unlock_cluster(ci);
	swap_range_free(si, offset, SWAPFILE_CLUSTER);
}

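/*
 * Single-slot wrapper around scan_swap_map_slots(): returns the allocated
 * swap offset, or 0 on failure (offset 0 holds the swap header, so it can
 * never be a valid allocation).
 */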
static unsigned long scan_swap_map(struct swap_info_struct *si,
				   unsigned char usage)
{
	swp_entry_t entry;
	int n_ret;

	n_ret = scan_swap_map_slots(si, usage, 1, &entry);

	if (n_ret)
		return swp_offset(entry);
	else
		return 0;
}

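/*
 * Allocate up to @n_goal swap entries of @entry_size pages each, trying
 * the highest-priority devices on this node's avail list first.  Returns
 * the number of entries written to @swp_entries[]; nr_swap_pages is
 * charged up front and refunded for any shortfall.
 */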
get_swap_pages(int n_goal,swp_entry_t swp_entries[],int entry_size)1057*4882a593Smuzhiyun int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
1058*4882a593Smuzhiyun {
1059*4882a593Smuzhiyun unsigned long size = swap_entry_size(entry_size);
1060*4882a593Smuzhiyun struct swap_info_struct *si, *next;
1061*4882a593Smuzhiyun long avail_pgs;
1062*4882a593Smuzhiyun int n_ret = 0;
1063*4882a593Smuzhiyun int node;
1064*4882a593Smuzhiyun
1065*4882a593Smuzhiyun /* Only single cluster request supported */
1066*4882a593Smuzhiyun WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
1067*4882a593Smuzhiyun
1068*4882a593Smuzhiyun spin_lock(&swap_avail_lock);
1069*4882a593Smuzhiyun
1070*4882a593Smuzhiyun avail_pgs = atomic_long_read(&nr_swap_pages) / size;
1071*4882a593Smuzhiyun if (avail_pgs <= 0) {
1072*4882a593Smuzhiyun spin_unlock(&swap_avail_lock);
1073*4882a593Smuzhiyun goto noswap;
1074*4882a593Smuzhiyun }
1075*4882a593Smuzhiyun
1076*4882a593Smuzhiyun n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
1077*4882a593Smuzhiyun
1078*4882a593Smuzhiyun atomic_long_sub(n_goal * size, &nr_swap_pages);
1079*4882a593Smuzhiyun
1080*4882a593Smuzhiyun start_over:
1081*4882a593Smuzhiyun node = numa_node_id();
1082*4882a593Smuzhiyun plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
1083*4882a593Smuzhiyun /* requeue si to after same-priority siblings */
1084*4882a593Smuzhiyun plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
1085*4882a593Smuzhiyun spin_unlock(&swap_avail_lock);
1086*4882a593Smuzhiyun spin_lock(&si->lock);
1087*4882a593Smuzhiyun if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
1088*4882a593Smuzhiyun spin_lock(&swap_avail_lock);
1089*4882a593Smuzhiyun if (plist_node_empty(&si->avail_lists[node])) {
1090*4882a593Smuzhiyun spin_unlock(&si->lock);
1091*4882a593Smuzhiyun goto nextsi;
1092*4882a593Smuzhiyun }
1093*4882a593Smuzhiyun WARN(!si->highest_bit,
1094*4882a593Smuzhiyun "swap_info %d in list but !highest_bit\n",
1095*4882a593Smuzhiyun si->type);
1096*4882a593Smuzhiyun WARN(!(si->flags & SWP_WRITEOK),
1097*4882a593Smuzhiyun "swap_info %d in list but !SWP_WRITEOK\n",
1098*4882a593Smuzhiyun si->type);
1099*4882a593Smuzhiyun __del_from_avail_list(si);
1100*4882a593Smuzhiyun spin_unlock(&si->lock);
1101*4882a593Smuzhiyun goto nextsi;
1102*4882a593Smuzhiyun }
1103*4882a593Smuzhiyun if (size == SWAPFILE_CLUSTER) {
1104*4882a593Smuzhiyun if (si->flags & SWP_BLKDEV)
1105*4882a593Smuzhiyun n_ret = swap_alloc_cluster(si, swp_entries);
1106*4882a593Smuzhiyun } else
1107*4882a593Smuzhiyun n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
1108*4882a593Smuzhiyun n_goal, swp_entries);
1109*4882a593Smuzhiyun spin_unlock(&si->lock);
1110*4882a593Smuzhiyun if (n_ret || size == SWAPFILE_CLUSTER)
1111*4882a593Smuzhiyun goto check_out;
1112*4882a593Smuzhiyun pr_debug("scan_swap_map of si %d failed to find offset\n",
1113*4882a593Smuzhiyun si->type);
1114*4882a593Smuzhiyun
1115*4882a593Smuzhiyun spin_lock(&swap_avail_lock);
1116*4882a593Smuzhiyun nextsi:
1117*4882a593Smuzhiyun /*
1118*4882a593Smuzhiyun * if we got here, it's likely that si was almost full before,
1119*4882a593Smuzhiyun * and since scan_swap_map() can drop the si->lock, multiple
1120*4882a593Smuzhiyun * callers probably all tried to get a page from the same si
1121*4882a593Smuzhiyun * and it filled up before we could get one; or, the si filled
1122*4882a593Smuzhiyun * up between us dropping swap_avail_lock and taking si->lock.
1123*4882a593Smuzhiyun * Since we dropped the swap_avail_lock, the swap_avail_head
1124*4882a593Smuzhiyun * list may have been modified; so if next is still in the
1125*4882a593Smuzhiyun * swap_avail_head list then try it, otherwise start over
1126*4882a593Smuzhiyun * if we have not gotten any slots.
1127*4882a593Smuzhiyun */
1128*4882a593Smuzhiyun if (plist_node_empty(&next->avail_lists[node]))
1129*4882a593Smuzhiyun goto start_over;
1130*4882a593Smuzhiyun }
1131*4882a593Smuzhiyun
1132*4882a593Smuzhiyun spin_unlock(&swap_avail_lock);
1133*4882a593Smuzhiyun
1134*4882a593Smuzhiyun check_out:
1135*4882a593Smuzhiyun if (n_ret < n_goal)
1136*4882a593Smuzhiyun atomic_long_add((long)(n_goal - n_ret) * size,
1137*4882a593Smuzhiyun &nr_swap_pages);
1138*4882a593Smuzhiyun noswap:
1139*4882a593Smuzhiyun return n_ret;
1140*4882a593Smuzhiyun }
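
/*
 * Hedged usage sketch (mirrors mm/swap_slots.c, not a new API): the
 * swap slots cache refills itself with a batch of order-0 entries,
 * assuming the usual get_swap_pages(n_goal, swp_entries[], entry_size)
 * signature:
 *
 *	swp_entry_t slots[SWAP_BATCH];
 *	int got = get_swap_pages(SWAP_BATCH, slots, 1);
 *
 * slots[0..got-1] then hold entries reserved with SWAP_HAS_CACHE;
 * slots the cache later gives back are freed in one batch via
 * swapcache_free_entries().
 */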
1141*4882a593Smuzhiyun
1142*4882a593Smuzhiyun /* The only caller of this function is now the suspend routine */
1143*4882a593Smuzhiyun swp_entry_t get_swap_page_of_type(int type)
1144*4882a593Smuzhiyun {
1145*4882a593Smuzhiyun struct swap_info_struct *si = swap_type_to_swap_info(type);
1146*4882a593Smuzhiyun pgoff_t offset;
1147*4882a593Smuzhiyun bool skip = false;
1148*4882a593Smuzhiyun
1149*4882a593Smuzhiyun if (!si)
1150*4882a593Smuzhiyun goto fail;
1151*4882a593Smuzhiyun
1152*4882a593Smuzhiyun spin_lock(&si->lock);
1153*4882a593Smuzhiyun if (si->flags & SWP_WRITEOK) {
1154*4882a593Smuzhiyun /* This is called for allocating swap entry, not cache */
1155*4882a593Smuzhiyun offset = scan_swap_map(si, 1);
1156*4882a593Smuzhiyun if (offset) {
1157*4882a593Smuzhiyun trace_android_vh_account_swap_pages(si, &skip);
1158*4882a593Smuzhiyun if (!skip)
1159*4882a593Smuzhiyun atomic_long_dec(&nr_swap_pages);
1160*4882a593Smuzhiyun spin_unlock(&si->lock);
1161*4882a593Smuzhiyun return swp_entry(type, offset);
1162*4882a593Smuzhiyun }
1163*4882a593Smuzhiyun }
1164*4882a593Smuzhiyun spin_unlock(&si->lock);
1165*4882a593Smuzhiyun fail:
1166*4882a593Smuzhiyun return (swp_entry_t) {0};
1167*4882a593Smuzhiyun }
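
/*
 * Hedged usage sketch: the hibernation code (kernel/power/swap.c)
 * draws image blocks from a chosen swap area one entry at a time,
 * roughly:
 *
 *	swp_entry_t entry = get_swap_page_of_type(swap);
 *	if (entry.val)
 *		offset = swp_offset(entry);	// block to write to
 *
 * and gives unused entries back with swap_free().
 */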
1168*4882a593Smuzhiyun
1169*4882a593Smuzhiyun static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
1170*4882a593Smuzhiyun {
1171*4882a593Smuzhiyun struct swap_info_struct *p;
1172*4882a593Smuzhiyun unsigned long offset;
1173*4882a593Smuzhiyun
1174*4882a593Smuzhiyun if (!entry.val)
1175*4882a593Smuzhiyun goto out;
1176*4882a593Smuzhiyun p = swp_swap_info(entry);
1177*4882a593Smuzhiyun if (!p)
1178*4882a593Smuzhiyun goto bad_nofile;
1179*4882a593Smuzhiyun if (data_race(!(p->flags & SWP_USED)))
1180*4882a593Smuzhiyun goto bad_device;
1181*4882a593Smuzhiyun offset = swp_offset(entry);
1182*4882a593Smuzhiyun if (offset >= p->max)
1183*4882a593Smuzhiyun goto bad_offset;
1184*4882a593Smuzhiyun return p;
1185*4882a593Smuzhiyun
1186*4882a593Smuzhiyun bad_offset:
1187*4882a593Smuzhiyun pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
1188*4882a593Smuzhiyun goto out;
1189*4882a593Smuzhiyun bad_device:
1190*4882a593Smuzhiyun pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
1191*4882a593Smuzhiyun goto out;
1192*4882a593Smuzhiyun bad_nofile:
1193*4882a593Smuzhiyun pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
1194*4882a593Smuzhiyun out:
1195*4882a593Smuzhiyun return NULL;
1196*4882a593Smuzhiyun }
1197*4882a593Smuzhiyun
1198*4882a593Smuzhiyun static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
1199*4882a593Smuzhiyun {
1200*4882a593Smuzhiyun struct swap_info_struct *p;
1201*4882a593Smuzhiyun
1202*4882a593Smuzhiyun p = __swap_info_get(entry);
1203*4882a593Smuzhiyun if (!p)
1204*4882a593Smuzhiyun goto out;
1205*4882a593Smuzhiyun if (data_race(!p->swap_map[swp_offset(entry)]))
1206*4882a593Smuzhiyun goto bad_free;
1207*4882a593Smuzhiyun return p;
1208*4882a593Smuzhiyun
1209*4882a593Smuzhiyun bad_free:
1210*4882a593Smuzhiyun pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
1211*4882a593Smuzhiyun out:
1212*4882a593Smuzhiyun return NULL;
1213*4882a593Smuzhiyun }
1214*4882a593Smuzhiyun
1215*4882a593Smuzhiyun static struct swap_info_struct *swap_info_get(swp_entry_t entry)
1216*4882a593Smuzhiyun {
1217*4882a593Smuzhiyun struct swap_info_struct *p;
1218*4882a593Smuzhiyun
1219*4882a593Smuzhiyun p = _swap_info_get(entry);
1220*4882a593Smuzhiyun if (p)
1221*4882a593Smuzhiyun spin_lock(&p->lock);
1222*4882a593Smuzhiyun return p;
1223*4882a593Smuzhiyun }
1224*4882a593Smuzhiyun
1225*4882a593Smuzhiyun static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
1226*4882a593Smuzhiyun struct swap_info_struct *q)
1227*4882a593Smuzhiyun {
1228*4882a593Smuzhiyun struct swap_info_struct *p;
1229*4882a593Smuzhiyun
1230*4882a593Smuzhiyun p = _swap_info_get(entry);
1231*4882a593Smuzhiyun
1232*4882a593Smuzhiyun if (p != q) {
1233*4882a593Smuzhiyun if (q != NULL)
1234*4882a593Smuzhiyun spin_unlock(&q->lock);
1235*4882a593Smuzhiyun if (p != NULL)
1236*4882a593Smuzhiyun spin_lock(&p->lock);
1237*4882a593Smuzhiyun }
1238*4882a593Smuzhiyun return p;
1239*4882a593Smuzhiyun }
1240*4882a593Smuzhiyun
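/*
 * Drop one reference of kind @usage (1 for a real swap count,
 * SWAP_HAS_CACHE for the swap cache reference) from the swap_map slot
 * at @offset and return the remaining usage byte. A return of 0 means
 * no users are left; the slot is then parked as SWAP_HAS_CACHE so it
 * cannot be reallocated until the caller actually frees it.
 */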
1241*4882a593Smuzhiyun static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
1242*4882a593Smuzhiyun unsigned long offset,
1243*4882a593Smuzhiyun unsigned char usage)
1244*4882a593Smuzhiyun {
1245*4882a593Smuzhiyun unsigned char count;
1246*4882a593Smuzhiyun unsigned char has_cache;
1247*4882a593Smuzhiyun
1248*4882a593Smuzhiyun count = p->swap_map[offset];
1249*4882a593Smuzhiyun
1250*4882a593Smuzhiyun has_cache = count & SWAP_HAS_CACHE;
1251*4882a593Smuzhiyun count &= ~SWAP_HAS_CACHE;
1252*4882a593Smuzhiyun
1253*4882a593Smuzhiyun if (usage == SWAP_HAS_CACHE) {
1254*4882a593Smuzhiyun VM_BUG_ON(!has_cache);
1255*4882a593Smuzhiyun has_cache = 0;
1256*4882a593Smuzhiyun } else if (count == SWAP_MAP_SHMEM) {
1257*4882a593Smuzhiyun /*
1258*4882a593Smuzhiyun * Or we could insist on shmem.c using a special
1259*4882a593Smuzhiyun * swap_shmem_free() and free_shmem_swap_and_cache()...
1260*4882a593Smuzhiyun */
1261*4882a593Smuzhiyun count = 0;
1262*4882a593Smuzhiyun } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
1263*4882a593Smuzhiyun if (count == COUNT_CONTINUED) {
1264*4882a593Smuzhiyun if (swap_count_continued(p, offset, count))
1265*4882a593Smuzhiyun count = SWAP_MAP_MAX | COUNT_CONTINUED;
1266*4882a593Smuzhiyun else
1267*4882a593Smuzhiyun count = SWAP_MAP_MAX;
1268*4882a593Smuzhiyun } else
1269*4882a593Smuzhiyun count--;
1270*4882a593Smuzhiyun }
1271*4882a593Smuzhiyun
1272*4882a593Smuzhiyun usage = count | has_cache;
1273*4882a593Smuzhiyun if (usage)
1274*4882a593Smuzhiyun WRITE_ONCE(p->swap_map[offset], usage);
1275*4882a593Smuzhiyun else
1276*4882a593Smuzhiyun WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
1277*4882a593Smuzhiyun
1278*4882a593Smuzhiyun return usage;
1279*4882a593Smuzhiyun }
1280*4882a593Smuzhiyun
1281*4882a593Smuzhiyun /*
1282*4882a593Smuzhiyun * Check whether swap entry is valid in the swap device. If so,
1283*4882a593Smuzhiyun * return pointer to swap_info_struct, and keep the swap entry valid
1284*4882a593Smuzhiyun * by preventing the swap device from being swapped off, until
1285*4882a593Smuzhiyun * put_swap_device() is called. Otherwise return NULL.
1286*4882a593Smuzhiyun *
1287*4882a593Smuzhiyun * The entirety of the RCU read critical section must come before the
1288*4882a593Smuzhiyun * return from or after the call to synchronize_rcu() in
1289*4882a593Smuzhiyun * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
1290*4882a593Smuzhiyun * true, the si->swap_map, si->cluster_info, etc. must be valid in the
1291*4882a593Smuzhiyun * critical section.
1292*4882a593Smuzhiyun *
1293*4882a593Smuzhiyun * Notice that swapoff or swapoff+swapon can still happen before the
1294*4882a593Smuzhiyun * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
1295*4882a593Smuzhiyun * in put_swap_device() if there isn't any other way to prevent
1296*4882a593Smuzhiyun * swapoff, such as page lock, page table lock, etc. The caller must
1297*4882a593Smuzhiyun * be prepared for that. For example, the following situation is
1298*4882a593Smuzhiyun * possible.
1299*4882a593Smuzhiyun *
1300*4882a593Smuzhiyun * CPU1 CPU2
1301*4882a593Smuzhiyun * do_swap_page()
1302*4882a593Smuzhiyun * ... swapoff+swapon
1303*4882a593Smuzhiyun * __read_swap_cache_async()
1304*4882a593Smuzhiyun * swapcache_prepare()
1305*4882a593Smuzhiyun * __swap_duplicate()
1306*4882a593Smuzhiyun * // check swap_map
1307*4882a593Smuzhiyun * // verify PTE not changed
1308*4882a593Smuzhiyun *
1309*4882a593Smuzhiyun * In __swap_duplicate(), the swap_map needs to be checked before being
1310*4882a593Smuzhiyun * changed, partly because the specified swap entry may be for another
1311*4882a593Smuzhiyun * swap device which has been swapped off. And in do_swap_page(), after
1312*4882a593Smuzhiyun * the page is read from the swap device, the PTE is verified not to
1313*4882a593Smuzhiyun * have changed, with the page table locked, to check whether the swap
1314*4882a593Smuzhiyun * device has been swapped off or swapped off and then back on.
1315*4882a593Smuzhiyun */
1316*4882a593Smuzhiyun struct swap_info_struct *get_swap_device(swp_entry_t entry)
1317*4882a593Smuzhiyun {
1318*4882a593Smuzhiyun struct swap_info_struct *si;
1319*4882a593Smuzhiyun unsigned long offset;
1320*4882a593Smuzhiyun
1321*4882a593Smuzhiyun if (!entry.val)
1322*4882a593Smuzhiyun goto out;
1323*4882a593Smuzhiyun si = swp_swap_info(entry);
1324*4882a593Smuzhiyun if (!si)
1325*4882a593Smuzhiyun goto bad_nofile;
1326*4882a593Smuzhiyun
1327*4882a593Smuzhiyun rcu_read_lock();
1328*4882a593Smuzhiyun if (data_race(!(si->flags & SWP_VALID)))
1329*4882a593Smuzhiyun goto unlock_out;
1330*4882a593Smuzhiyun offset = swp_offset(entry);
1331*4882a593Smuzhiyun if (offset >= si->max)
1332*4882a593Smuzhiyun goto unlock_out;
1333*4882a593Smuzhiyun
1334*4882a593Smuzhiyun return si;
1335*4882a593Smuzhiyun bad_nofile:
1336*4882a593Smuzhiyun pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1337*4882a593Smuzhiyun out:
1338*4882a593Smuzhiyun return NULL;
1339*4882a593Smuzhiyun unlock_out:
1340*4882a593Smuzhiyun rcu_read_unlock();
1341*4882a593Smuzhiyun return NULL;
1342*4882a593Smuzhiyun }
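
/*
 * Typical calling pattern, as in __swap_count() below; a hedged
 * sketch, not an additional requirement beyond the comment above:
 *
 *	si = get_swap_device(entry);
 *	if (si) {
 *		... read si->swap_map, si->cluster_info, etc. ...
 *		put_swap_device(si);
 *	}
 */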
1343*4882a593Smuzhiyun
1344*4882a593Smuzhiyun static unsigned char __swap_entry_free(struct swap_info_struct *p,
1345*4882a593Smuzhiyun swp_entry_t entry)
1346*4882a593Smuzhiyun {
1347*4882a593Smuzhiyun struct swap_cluster_info *ci;
1348*4882a593Smuzhiyun unsigned long offset = swp_offset(entry);
1349*4882a593Smuzhiyun unsigned char usage;
1350*4882a593Smuzhiyun
1351*4882a593Smuzhiyun ci = lock_cluster_or_swap_info(p, offset);
1352*4882a593Smuzhiyun usage = __swap_entry_free_locked(p, offset, 1);
1353*4882a593Smuzhiyun unlock_cluster_or_swap_info(p, ci);
1354*4882a593Smuzhiyun if (!usage)
1355*4882a593Smuzhiyun free_swap_slot(entry);
1356*4882a593Smuzhiyun
1357*4882a593Smuzhiyun return usage;
1358*4882a593Smuzhiyun }
1359*4882a593Smuzhiyun
1360*4882a593Smuzhiyun static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
1361*4882a593Smuzhiyun {
1362*4882a593Smuzhiyun struct swap_cluster_info *ci;
1363*4882a593Smuzhiyun unsigned long offset = swp_offset(entry);
1364*4882a593Smuzhiyun unsigned char count;
1365*4882a593Smuzhiyun
1366*4882a593Smuzhiyun ci = lock_cluster(p, offset);
1367*4882a593Smuzhiyun count = p->swap_map[offset];
1368*4882a593Smuzhiyun VM_BUG_ON(count != SWAP_HAS_CACHE);
1369*4882a593Smuzhiyun p->swap_map[offset] = 0;
1370*4882a593Smuzhiyun dec_cluster_info_page(p, p->cluster_info, offset);
1371*4882a593Smuzhiyun unlock_cluster(ci);
1372*4882a593Smuzhiyun
1373*4882a593Smuzhiyun mem_cgroup_uncharge_swap(entry, 1);
1374*4882a593Smuzhiyun swap_range_free(p, offset, 1);
1375*4882a593Smuzhiyun }
1376*4882a593Smuzhiyun
1377*4882a593Smuzhiyun /*
1378*4882a593Smuzhiyun * Caller has made sure that the swap device corresponding to entry
1379*4882a593Smuzhiyun * is still around or has not been recycled.
1380*4882a593Smuzhiyun */
1381*4882a593Smuzhiyun void swap_free(swp_entry_t entry)
1382*4882a593Smuzhiyun {
1383*4882a593Smuzhiyun struct swap_info_struct *p;
1384*4882a593Smuzhiyun
1385*4882a593Smuzhiyun p = _swap_info_get(entry);
1386*4882a593Smuzhiyun if (p)
1387*4882a593Smuzhiyun __swap_entry_free(p, entry);
1388*4882a593Smuzhiyun }
1389*4882a593Smuzhiyun
1390*4882a593Smuzhiyun /*
1391*4882a593Smuzhiyun * Called after dropping swapcache to decrease refcnt to swap entries.
1392*4882a593Smuzhiyun */
1393*4882a593Smuzhiyun void put_swap_page(struct page *page, swp_entry_t entry)
1394*4882a593Smuzhiyun {
1395*4882a593Smuzhiyun unsigned long offset = swp_offset(entry);
1396*4882a593Smuzhiyun unsigned long idx = offset / SWAPFILE_CLUSTER;
1397*4882a593Smuzhiyun struct swap_cluster_info *ci;
1398*4882a593Smuzhiyun struct swap_info_struct *si;
1399*4882a593Smuzhiyun unsigned char *map;
1400*4882a593Smuzhiyun unsigned int i, free_entries = 0;
1401*4882a593Smuzhiyun unsigned char val;
1402*4882a593Smuzhiyun int size = swap_entry_size(thp_nr_pages(page));
1403*4882a593Smuzhiyun
1404*4882a593Smuzhiyun si = _swap_info_get(entry);
1405*4882a593Smuzhiyun if (!si)
1406*4882a593Smuzhiyun return;
1407*4882a593Smuzhiyun
1408*4882a593Smuzhiyun ci = lock_cluster_or_swap_info(si, offset);
1409*4882a593Smuzhiyun if (size == SWAPFILE_CLUSTER) {
1410*4882a593Smuzhiyun VM_BUG_ON(!cluster_is_huge(ci));
1411*4882a593Smuzhiyun map = si->swap_map + offset;
1412*4882a593Smuzhiyun for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1413*4882a593Smuzhiyun val = map[i];
1414*4882a593Smuzhiyun VM_BUG_ON(!(val & SWAP_HAS_CACHE));
1415*4882a593Smuzhiyun if (val == SWAP_HAS_CACHE)
1416*4882a593Smuzhiyun free_entries++;
1417*4882a593Smuzhiyun }
1418*4882a593Smuzhiyun cluster_clear_huge(ci);
1419*4882a593Smuzhiyun if (free_entries == SWAPFILE_CLUSTER) {
1420*4882a593Smuzhiyun unlock_cluster_or_swap_info(si, ci);
1421*4882a593Smuzhiyun spin_lock(&si->lock);
1422*4882a593Smuzhiyun mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
1423*4882a593Smuzhiyun swap_free_cluster(si, idx);
1424*4882a593Smuzhiyun spin_unlock(&si->lock);
1425*4882a593Smuzhiyun return;
1426*4882a593Smuzhiyun }
1427*4882a593Smuzhiyun }
1428*4882a593Smuzhiyun for (i = 0; i < size; i++, entry.val++) {
1429*4882a593Smuzhiyun if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
1430*4882a593Smuzhiyun unlock_cluster_or_swap_info(si, ci);
1431*4882a593Smuzhiyun free_swap_slot(entry);
1432*4882a593Smuzhiyun if (i == size - 1)
1433*4882a593Smuzhiyun return;
1434*4882a593Smuzhiyun lock_cluster_or_swap_info(si, offset);
1435*4882a593Smuzhiyun }
1436*4882a593Smuzhiyun }
1437*4882a593Smuzhiyun unlock_cluster_or_swap_info(si, ci);
1438*4882a593Smuzhiyun }
1439*4882a593Smuzhiyun
1440*4882a593Smuzhiyun #ifdef CONFIG_THP_SWAP
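/*
 * Hedged caller sketch: when a THP in the swap cache is split, the
 * backing swap cluster must stop being treated as one huge entry;
 * assuming the __split_huge_page() call site, this looks roughly like
 *
 *	if (PageSwapCache(head)) {
 *		swp_entry_t entry = { .val = page_private(head) };
 *		split_swap_cluster(entry);
 *	}
 */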
1441*4882a593Smuzhiyun int split_swap_cluster(swp_entry_t entry)
1442*4882a593Smuzhiyun {
1443*4882a593Smuzhiyun struct swap_info_struct *si;
1444*4882a593Smuzhiyun struct swap_cluster_info *ci;
1445*4882a593Smuzhiyun unsigned long offset = swp_offset(entry);
1446*4882a593Smuzhiyun
1447*4882a593Smuzhiyun si = _swap_info_get(entry);
1448*4882a593Smuzhiyun if (!si)
1449*4882a593Smuzhiyun return -EBUSY;
1450*4882a593Smuzhiyun ci = lock_cluster(si, offset);
1451*4882a593Smuzhiyun cluster_clear_huge(ci);
1452*4882a593Smuzhiyun unlock_cluster(ci);
1453*4882a593Smuzhiyun return 0;
1454*4882a593Smuzhiyun }
1455*4882a593Smuzhiyun #endif
1456*4882a593Smuzhiyun
1457*4882a593Smuzhiyun static int swp_entry_cmp(const void *ent1, const void *ent2)
1458*4882a593Smuzhiyun {
1459*4882a593Smuzhiyun const swp_entry_t *e1 = ent1, *e2 = ent2;
1460*4882a593Smuzhiyun
1461*4882a593Smuzhiyun return (int)swp_type(*e1) - (int)swp_type(*e2);
1462*4882a593Smuzhiyun }
1463*4882a593Smuzhiyun
1464*4882a593Smuzhiyun void swapcache_free_entries(swp_entry_t *entries, int n)
1465*4882a593Smuzhiyun {
1466*4882a593Smuzhiyun struct swap_info_struct *p, *prev;
1467*4882a593Smuzhiyun int i;
1468*4882a593Smuzhiyun
1469*4882a593Smuzhiyun if (n <= 0)
1470*4882a593Smuzhiyun return;
1471*4882a593Smuzhiyun
1472*4882a593Smuzhiyun prev = NULL;
1473*4882a593Smuzhiyun p = NULL;
1474*4882a593Smuzhiyun
1475*4882a593Smuzhiyun /*
1476*4882a593Smuzhiyun * Sort swap entries by swap device, so each lock is only taken once.
1477*4882a593Smuzhiyun * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
1478*4882a593Smuzhiyun * so low that it isn't necessary to optimize further.
1479*4882a593Smuzhiyun */
1480*4882a593Smuzhiyun if (nr_swapfiles > 1)
1481*4882a593Smuzhiyun sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
1482*4882a593Smuzhiyun for (i = 0; i < n; ++i) {
1483*4882a593Smuzhiyun p = swap_info_get_cont(entries[i], prev);
1484*4882a593Smuzhiyun if (p)
1485*4882a593Smuzhiyun swap_entry_free(p, entries[i]);
1486*4882a593Smuzhiyun prev = p;
1487*4882a593Smuzhiyun }
1488*4882a593Smuzhiyun if (p)
1489*4882a593Smuzhiyun spin_unlock(&p->lock);
1490*4882a593Smuzhiyun }
1491*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(swapcache_free_entries);
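
/*
 * Hedged usage note: the swap slots cache (mm/swap_slots.c) batches
 * entries whose last reference has been dropped and returns them here
 * in one call, roughly:
 *
 *	swapcache_free_entries(cache->slots_ret, cache->n_ret);
 */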
1492*4882a593Smuzhiyun
1493*4882a593Smuzhiyun /*
1494*4882a593Smuzhiyun * How many references to page are currently swapped out?
1495*4882a593Smuzhiyun * This does not give an exact answer when swap count is continued,
1496*4882a593Smuzhiyun * but does include the high COUNT_CONTINUED flag to allow for that.
1497*4882a593Smuzhiyun */
1498*4882a593Smuzhiyun int page_swapcount(struct page *page)
1499*4882a593Smuzhiyun {
1500*4882a593Smuzhiyun int count = 0;
1501*4882a593Smuzhiyun struct swap_info_struct *p;
1502*4882a593Smuzhiyun struct swap_cluster_info *ci;
1503*4882a593Smuzhiyun swp_entry_t entry;
1504*4882a593Smuzhiyun unsigned long offset;
1505*4882a593Smuzhiyun
1506*4882a593Smuzhiyun entry.val = page_private(page);
1507*4882a593Smuzhiyun p = _swap_info_get(entry);
1508*4882a593Smuzhiyun if (p) {
1509*4882a593Smuzhiyun offset = swp_offset(entry);
1510*4882a593Smuzhiyun ci = lock_cluster_or_swap_info(p, offset);
1511*4882a593Smuzhiyun count = swap_count(p->swap_map[offset]);
1512*4882a593Smuzhiyun unlock_cluster_or_swap_info(p, ci);
1513*4882a593Smuzhiyun }
1514*4882a593Smuzhiyun return count;
1515*4882a593Smuzhiyun }
1516*4882a593Smuzhiyun
1517*4882a593Smuzhiyun int __swap_count(swp_entry_t entry)
1518*4882a593Smuzhiyun {
1519*4882a593Smuzhiyun struct swap_info_struct *si;
1520*4882a593Smuzhiyun pgoff_t offset = swp_offset(entry);
1521*4882a593Smuzhiyun int count = 0;
1522*4882a593Smuzhiyun
1523*4882a593Smuzhiyun si = get_swap_device(entry);
1524*4882a593Smuzhiyun if (si) {
1525*4882a593Smuzhiyun count = swap_count(si->swap_map[offset]);
1526*4882a593Smuzhiyun put_swap_device(si);
1527*4882a593Smuzhiyun }
1528*4882a593Smuzhiyun return count;
1529*4882a593Smuzhiyun }
1530*4882a593Smuzhiyun
1531*4882a593Smuzhiyun static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1532*4882a593Smuzhiyun {
1533*4882a593Smuzhiyun int count = 0;
1534*4882a593Smuzhiyun pgoff_t offset = swp_offset(entry);
1535*4882a593Smuzhiyun struct swap_cluster_info *ci;
1536*4882a593Smuzhiyun
1537*4882a593Smuzhiyun ci = lock_cluster_or_swap_info(si, offset);
1538*4882a593Smuzhiyun count = swap_count(si->swap_map[offset]);
1539*4882a593Smuzhiyun unlock_cluster_or_swap_info(si, ci);
1540*4882a593Smuzhiyun return count;
1541*4882a593Smuzhiyun }
1542*4882a593Smuzhiyun
1543*4882a593Smuzhiyun /*
1544*4882a593Smuzhiyun * How many references to @entry are currently swapped out?
1545*4882a593Smuzhiyun * This does not give an exact answer when swap count is continued,
1546*4882a593Smuzhiyun * but does include the high COUNT_CONTINUED flag to allow for that.
1547*4882a593Smuzhiyun */
1548*4882a593Smuzhiyun int __swp_swapcount(swp_entry_t entry)
1549*4882a593Smuzhiyun {
1550*4882a593Smuzhiyun int count = 0;
1551*4882a593Smuzhiyun struct swap_info_struct *si;
1552*4882a593Smuzhiyun
1553*4882a593Smuzhiyun si = get_swap_device(entry);
1554*4882a593Smuzhiyun if (si) {
1555*4882a593Smuzhiyun count = swap_swapcount(si, entry);
1556*4882a593Smuzhiyun put_swap_device(si);
1557*4882a593Smuzhiyun }
1558*4882a593Smuzhiyun return count;
1559*4882a593Smuzhiyun }
1560*4882a593Smuzhiyun
1561*4882a593Smuzhiyun /*
1562*4882a593Smuzhiyun * How many references to @entry are currently swapped out?
1563*4882a593Smuzhiyun * This considers COUNT_CONTINUED so it returns exact answer.
1564*4882a593Smuzhiyun */
1565*4882a593Smuzhiyun int swp_swapcount(swp_entry_t entry)
1566*4882a593Smuzhiyun {
1567*4882a593Smuzhiyun int count, tmp_count, n;
1568*4882a593Smuzhiyun struct swap_info_struct *p;
1569*4882a593Smuzhiyun struct swap_cluster_info *ci;
1570*4882a593Smuzhiyun struct page *page;
1571*4882a593Smuzhiyun pgoff_t offset;
1572*4882a593Smuzhiyun unsigned char *map;
1573*4882a593Smuzhiyun
1574*4882a593Smuzhiyun p = _swap_info_get(entry);
1575*4882a593Smuzhiyun if (!p)
1576*4882a593Smuzhiyun return 0;
1577*4882a593Smuzhiyun
1578*4882a593Smuzhiyun offset = swp_offset(entry);
1579*4882a593Smuzhiyun
1580*4882a593Smuzhiyun ci = lock_cluster_or_swap_info(p, offset);
1581*4882a593Smuzhiyun
1582*4882a593Smuzhiyun count = swap_count(p->swap_map[offset]);
1583*4882a593Smuzhiyun if (!(count & COUNT_CONTINUED))
1584*4882a593Smuzhiyun goto out;
1585*4882a593Smuzhiyun
1586*4882a593Smuzhiyun count &= ~COUNT_CONTINUED;
1587*4882a593Smuzhiyun n = SWAP_MAP_MAX + 1;
1588*4882a593Smuzhiyun
1589*4882a593Smuzhiyun page = vmalloc_to_page(p->swap_map + offset);
1590*4882a593Smuzhiyun offset &= ~PAGE_MASK;
1591*4882a593Smuzhiyun VM_BUG_ON(page_private(page) != SWP_CONTINUED);
1592*4882a593Smuzhiyun
1593*4882a593Smuzhiyun do {
1594*4882a593Smuzhiyun page = list_next_entry(page, lru);
1595*4882a593Smuzhiyun map = kmap_atomic(page);
1596*4882a593Smuzhiyun tmp_count = map[offset];
1597*4882a593Smuzhiyun kunmap_atomic(map);
1598*4882a593Smuzhiyun
1599*4882a593Smuzhiyun count += (tmp_count & ~COUNT_CONTINUED) * n;
1600*4882a593Smuzhiyun n *= (SWAP_CONT_MAX + 1);
1601*4882a593Smuzhiyun } while (tmp_count & COUNT_CONTINUED);
1602*4882a593Smuzhiyun out:
1603*4882a593Smuzhiyun unlock_cluster_or_swap_info(p, ci);
1604*4882a593Smuzhiyun return count;
1605*4882a593Smuzhiyun }
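
/*
 * Worked example of the continuation arithmetic above, assuming the
 * usual SWAP_MAP_MAX == 0x3e and SWAP_CONT_MAX == 0x7f: the base map
 * byte stores up to 0x3e (62) references, so a base byte of
 * COUNT_CONTINUED | SWAP_MAP_MAX plus one continuation byte holding 2
 * yields 62 + 2 * (SWAP_MAP_MAX + 1) = 62 + 126 = 188. Each further
 * continuation level weighs its digit by another factor of
 * (SWAP_CONT_MAX + 1) == 128.
 */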
1606*4882a593Smuzhiyun
1607*4882a593Smuzhiyun static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1608*4882a593Smuzhiyun swp_entry_t entry)
1609*4882a593Smuzhiyun {
1610*4882a593Smuzhiyun struct swap_cluster_info *ci;
1611*4882a593Smuzhiyun unsigned char *map = si->swap_map;
1612*4882a593Smuzhiyun unsigned long roffset = swp_offset(entry);
1613*4882a593Smuzhiyun unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
1614*4882a593Smuzhiyun int i;
1615*4882a593Smuzhiyun bool ret = false;
1616*4882a593Smuzhiyun
1617*4882a593Smuzhiyun ci = lock_cluster_or_swap_info(si, offset);
1618*4882a593Smuzhiyun if (!ci || !cluster_is_huge(ci)) {
1619*4882a593Smuzhiyun if (swap_count(map[roffset]))
1620*4882a593Smuzhiyun ret = true;
1621*4882a593Smuzhiyun goto unlock_out;
1622*4882a593Smuzhiyun }
1623*4882a593Smuzhiyun for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1624*4882a593Smuzhiyun if (swap_count(map[offset + i])) {
1625*4882a593Smuzhiyun ret = true;
1626*4882a593Smuzhiyun break;
1627*4882a593Smuzhiyun }
1628*4882a593Smuzhiyun }
1629*4882a593Smuzhiyun unlock_out:
1630*4882a593Smuzhiyun unlock_cluster_or_swap_info(si, ci);
1631*4882a593Smuzhiyun return ret;
1632*4882a593Smuzhiyun }
1633*4882a593Smuzhiyun
1634*4882a593Smuzhiyun static bool page_swapped(struct page *page)
1635*4882a593Smuzhiyun {
1636*4882a593Smuzhiyun swp_entry_t entry;
1637*4882a593Smuzhiyun struct swap_info_struct *si;
1638*4882a593Smuzhiyun
1639*4882a593Smuzhiyun if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
1640*4882a593Smuzhiyun return page_swapcount(page) != 0;
1641*4882a593Smuzhiyun
1642*4882a593Smuzhiyun page = compound_head(page);
1643*4882a593Smuzhiyun entry.val = page_private(page);
1644*4882a593Smuzhiyun si = _swap_info_get(entry);
1645*4882a593Smuzhiyun if (si)
1646*4882a593Smuzhiyun return swap_page_trans_huge_swapped(si, entry);
1647*4882a593Smuzhiyun return false;
1648*4882a593Smuzhiyun }
1649*4882a593Smuzhiyun
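/*
 * Return the maximum, over all subpages of a (possibly huge) page, of
 * that subpage's mapcount + swapcount, i.e. the largest number of
 * references to any single subpage; optionally also fill in the
 * totals across the whole page.
 */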
1650*4882a593Smuzhiyun static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
1651*4882a593Smuzhiyun int *total_swapcount)
1652*4882a593Smuzhiyun {
1653*4882a593Smuzhiyun int i, map_swapcount, _total_mapcount, _total_swapcount;
1654*4882a593Smuzhiyun unsigned long offset = 0;
1655*4882a593Smuzhiyun struct swap_info_struct *si;
1656*4882a593Smuzhiyun struct swap_cluster_info *ci = NULL;
1657*4882a593Smuzhiyun unsigned char *map = NULL;
1658*4882a593Smuzhiyun int mapcount, swapcount = 0;
1659*4882a593Smuzhiyun
1660*4882a593Smuzhiyun /* hugetlbfs shouldn't call it */
1661*4882a593Smuzhiyun VM_BUG_ON_PAGE(PageHuge(page), page);
1662*4882a593Smuzhiyun
1663*4882a593Smuzhiyun if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
1664*4882a593Smuzhiyun mapcount = page_trans_huge_mapcount(page, total_mapcount);
1665*4882a593Smuzhiyun if (PageSwapCache(page))
1666*4882a593Smuzhiyun swapcount = page_swapcount(page);
1667*4882a593Smuzhiyun if (total_swapcount)
1668*4882a593Smuzhiyun *total_swapcount = swapcount;
1669*4882a593Smuzhiyun return mapcount + swapcount;
1670*4882a593Smuzhiyun }
1671*4882a593Smuzhiyun
1672*4882a593Smuzhiyun page = compound_head(page);
1673*4882a593Smuzhiyun
1674*4882a593Smuzhiyun _total_mapcount = _total_swapcount = map_swapcount = 0;
1675*4882a593Smuzhiyun if (PageSwapCache(page)) {
1676*4882a593Smuzhiyun swp_entry_t entry;
1677*4882a593Smuzhiyun
1678*4882a593Smuzhiyun entry.val = page_private(page);
1679*4882a593Smuzhiyun si = _swap_info_get(entry);
1680*4882a593Smuzhiyun if (si) {
1681*4882a593Smuzhiyun map = si->swap_map;
1682*4882a593Smuzhiyun offset = swp_offset(entry);
1683*4882a593Smuzhiyun }
1684*4882a593Smuzhiyun }
1685*4882a593Smuzhiyun if (map)
1686*4882a593Smuzhiyun ci = lock_cluster(si, offset);
1687*4882a593Smuzhiyun for (i = 0; i < HPAGE_PMD_NR; i++) {
1688*4882a593Smuzhiyun mapcount = atomic_read(&page[i]._mapcount) + 1;
1689*4882a593Smuzhiyun _total_mapcount += mapcount;
1690*4882a593Smuzhiyun if (map) {
1691*4882a593Smuzhiyun swapcount = swap_count(map[offset + i]);
1692*4882a593Smuzhiyun _total_swapcount += swapcount;
1693*4882a593Smuzhiyun }
1694*4882a593Smuzhiyun map_swapcount = max(map_swapcount, mapcount + swapcount);
1695*4882a593Smuzhiyun }
1696*4882a593Smuzhiyun unlock_cluster(ci);
1697*4882a593Smuzhiyun if (PageDoubleMap(page)) {
1698*4882a593Smuzhiyun map_swapcount -= 1;
1699*4882a593Smuzhiyun _total_mapcount -= HPAGE_PMD_NR;
1700*4882a593Smuzhiyun }
1701*4882a593Smuzhiyun mapcount = compound_mapcount(page);
1702*4882a593Smuzhiyun map_swapcount += mapcount;
1703*4882a593Smuzhiyun _total_mapcount += mapcount;
1704*4882a593Smuzhiyun if (total_mapcount)
1705*4882a593Smuzhiyun *total_mapcount = _total_mapcount;
1706*4882a593Smuzhiyun if (total_swapcount)
1707*4882a593Smuzhiyun *total_swapcount = _total_swapcount;
1708*4882a593Smuzhiyun
1709*4882a593Smuzhiyun return map_swapcount;
1710*4882a593Smuzhiyun }
1711*4882a593Smuzhiyun
1712*4882a593Smuzhiyun /*
1713*4882a593Smuzhiyun * We can write to an anon page without COW if there are no other references
1714*4882a593Smuzhiyun * to it. And as a side-effect, free up its swap: because the old content
1715*4882a593Smuzhiyun * on disk will never be read, and seeking back there to write new content
1716*4882a593Smuzhiyun * later would only waste time away from clustering.
1717*4882a593Smuzhiyun *
1718*4882a593Smuzhiyun * NOTE: total_map_swapcount should not be relied upon by the caller if
1719*4882a593Smuzhiyun * reuse_swap_page() returns false, but it may always be overwritten
1720*4882a593Smuzhiyun * (see the other implementation for CONFIG_SWAP=n).
1721*4882a593Smuzhiyun */
1722*4882a593Smuzhiyun bool reuse_swap_page(struct page *page, int *total_map_swapcount)
1723*4882a593Smuzhiyun {
1724*4882a593Smuzhiyun int count, total_mapcount, total_swapcount;
1725*4882a593Smuzhiyun
1726*4882a593Smuzhiyun VM_BUG_ON_PAGE(!PageLocked(page), page);
1727*4882a593Smuzhiyun if (unlikely(PageKsm(page)))
1728*4882a593Smuzhiyun return false;
1729*4882a593Smuzhiyun count = page_trans_huge_map_swapcount(page, &total_mapcount,
1730*4882a593Smuzhiyun &total_swapcount);
1731*4882a593Smuzhiyun if (total_map_swapcount)
1732*4882a593Smuzhiyun *total_map_swapcount = total_mapcount + total_swapcount;
1733*4882a593Smuzhiyun if (count == 1 && PageSwapCache(page) &&
1734*4882a593Smuzhiyun (likely(!PageTransCompound(page)) ||
1735*4882a593Smuzhiyun /* The remaining swap count will be freed soon */
1736*4882a593Smuzhiyun total_swapcount == page_swapcount(page))) {
1737*4882a593Smuzhiyun if (!PageWriteback(page)) {
1738*4882a593Smuzhiyun page = compound_head(page);
1739*4882a593Smuzhiyun delete_from_swap_cache(page);
1740*4882a593Smuzhiyun SetPageDirty(page);
1741*4882a593Smuzhiyun } else {
1742*4882a593Smuzhiyun swp_entry_t entry;
1743*4882a593Smuzhiyun struct swap_info_struct *p;
1744*4882a593Smuzhiyun
1745*4882a593Smuzhiyun entry.val = page_private(page);
1746*4882a593Smuzhiyun p = swap_info_get(entry);
1747*4882a593Smuzhiyun if (p->flags & SWP_STABLE_WRITES) {
1748*4882a593Smuzhiyun spin_unlock(&p->lock);
1749*4882a593Smuzhiyun return false;
1750*4882a593Smuzhiyun }
1751*4882a593Smuzhiyun spin_unlock(&p->lock);
1752*4882a593Smuzhiyun }
1753*4882a593Smuzhiyun }
1754*4882a593Smuzhiyun
1755*4882a593Smuzhiyun return count <= 1;
1756*4882a593Smuzhiyun }
1757*4882a593Smuzhiyun
1758*4882a593Smuzhiyun /*
1759*4882a593Smuzhiyun * If swap is getting full, or if there are no more mappings of this page,
1760*4882a593Smuzhiyun * then try_to_free_swap is called to free its swap space.
1761*4882a593Smuzhiyun */
1762*4882a593Smuzhiyun int try_to_free_swap(struct page *page)
1763*4882a593Smuzhiyun {
1764*4882a593Smuzhiyun VM_BUG_ON_PAGE(!PageLocked(page), page);
1765*4882a593Smuzhiyun
1766*4882a593Smuzhiyun if (!PageSwapCache(page))
1767*4882a593Smuzhiyun return 0;
1768*4882a593Smuzhiyun if (PageWriteback(page))
1769*4882a593Smuzhiyun return 0;
1770*4882a593Smuzhiyun if (page_swapped(page))
1771*4882a593Smuzhiyun return 0;
1772*4882a593Smuzhiyun
1773*4882a593Smuzhiyun /*
1774*4882a593Smuzhiyun * Once hibernation has begun to create its image of memory,
1775*4882a593Smuzhiyun * there's a danger that one of the calls to try_to_free_swap()
1776*4882a593Smuzhiyun * - most probably a call from __try_to_reclaim_swap() while
1777*4882a593Smuzhiyun * hibernation is allocating its own swap pages for the image,
1778*4882a593Smuzhiyun * but conceivably even a call from memory reclaim - will free
1779*4882a593Smuzhiyun * the swap from a page which has already been recorded in the
1780*4882a593Smuzhiyun * image as a clean swapcache page, and then reuse its swap for
1781*4882a593Smuzhiyun * another page of the image. On waking from hibernation, the
1782*4882a593Smuzhiyun * original page might be freed under memory pressure, then
1783*4882a593Smuzhiyun * later read back in from swap, now with the wrong data.
1784*4882a593Smuzhiyun *
1785*4882a593Smuzhiyun * Hibernation suspends storage while it is writing the image
1786*4882a593Smuzhiyun * to disk so check that here.
1787*4882a593Smuzhiyun */
1788*4882a593Smuzhiyun if (pm_suspended_storage())
1789*4882a593Smuzhiyun return 0;
1790*4882a593Smuzhiyun
1791*4882a593Smuzhiyun page = compound_head(page);
1792*4882a593Smuzhiyun delete_from_swap_cache(page);
1793*4882a593Smuzhiyun SetPageDirty(page);
1794*4882a593Smuzhiyun return 1;
1795*4882a593Smuzhiyun }
1796*4882a593Smuzhiyun
1797*4882a593Smuzhiyun /*
1798*4882a593Smuzhiyun * Free the swap entry like above, but also try to
1799*4882a593Smuzhiyun * free the page cache entry if it is the last user.
1800*4882a593Smuzhiyun */
1801*4882a593Smuzhiyun int free_swap_and_cache(swp_entry_t entry)
1802*4882a593Smuzhiyun {
1803*4882a593Smuzhiyun struct swap_info_struct *p;
1804*4882a593Smuzhiyun unsigned char count;
1805*4882a593Smuzhiyun
1806*4882a593Smuzhiyun if (non_swap_entry(entry))
1807*4882a593Smuzhiyun return 1;
1808*4882a593Smuzhiyun
1809*4882a593Smuzhiyun p = _swap_info_get(entry);
1810*4882a593Smuzhiyun if (p) {
1811*4882a593Smuzhiyun count = __swap_entry_free(p, entry);
1812*4882a593Smuzhiyun if (count == SWAP_HAS_CACHE &&
1813*4882a593Smuzhiyun !swap_page_trans_huge_swapped(p, entry))
1814*4882a593Smuzhiyun __try_to_reclaim_swap(p, swp_offset(entry),
1815*4882a593Smuzhiyun TTRS_UNMAPPED | TTRS_FULL);
1816*4882a593Smuzhiyun }
1817*4882a593Smuzhiyun return p != NULL;
1818*4882a593Smuzhiyun }
1819*4882a593Smuzhiyun
1820*4882a593Smuzhiyun #ifdef CONFIG_HIBERNATION
1821*4882a593Smuzhiyun /*
1822*4882a593Smuzhiyun * Find the swap type that corresponds to given device (if any).
1823*4882a593Smuzhiyun *
1824*4882a593Smuzhiyun * @offset - number of the PAGE_SIZE-sized block of the device, starting
1825*4882a593Smuzhiyun * from 0, in which the swap header is expected to be located.
1826*4882a593Smuzhiyun *
1827*4882a593Smuzhiyun * This is needed for the suspend to disk (aka swsusp).
1828*4882a593Smuzhiyun */
1829*4882a593Smuzhiyun int swap_type_of(dev_t device, sector_t offset)
1830*4882a593Smuzhiyun {
1831*4882a593Smuzhiyun int type;
1832*4882a593Smuzhiyun
1833*4882a593Smuzhiyun if (!device)
1834*4882a593Smuzhiyun return -1;
1835*4882a593Smuzhiyun
1836*4882a593Smuzhiyun spin_lock(&swap_lock);
1837*4882a593Smuzhiyun for (type = 0; type < nr_swapfiles; type++) {
1838*4882a593Smuzhiyun struct swap_info_struct *sis = swap_info[type];
1839*4882a593Smuzhiyun
1840*4882a593Smuzhiyun if (!(sis->flags & SWP_WRITEOK))
1841*4882a593Smuzhiyun continue;
1842*4882a593Smuzhiyun
1843*4882a593Smuzhiyun if (device == sis->bdev->bd_dev) {
1844*4882a593Smuzhiyun struct swap_extent *se = first_se(sis);
1845*4882a593Smuzhiyun
1846*4882a593Smuzhiyun if (se->start_block == offset) {
1847*4882a593Smuzhiyun spin_unlock(&swap_lock);
1848*4882a593Smuzhiyun return type;
1849*4882a593Smuzhiyun }
1850*4882a593Smuzhiyun }
1851*4882a593Smuzhiyun }
1852*4882a593Smuzhiyun spin_unlock(&swap_lock);
1853*4882a593Smuzhiyun return -ENODEV;
1854*4882a593Smuzhiyun }
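
/*
 * Hedged example: on resume, the kernel/power code maps the configured
 * resume device and header offset back to a swap type, roughly
 *
 *	type = swap_type_of(swsusp_resume_device, swsusp_resume_block);
 *
 * where a negative return means no matching active swap area.
 */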
1855*4882a593Smuzhiyun
1856*4882a593Smuzhiyun int find_first_swap(dev_t *device)
1857*4882a593Smuzhiyun {
1858*4882a593Smuzhiyun int type;
1859*4882a593Smuzhiyun
1860*4882a593Smuzhiyun spin_lock(&swap_lock);
1861*4882a593Smuzhiyun for (type = 0; type < nr_swapfiles; type++) {
1862*4882a593Smuzhiyun struct swap_info_struct *sis = swap_info[type];
1863*4882a593Smuzhiyun
1864*4882a593Smuzhiyun if (!(sis->flags & SWP_WRITEOK))
1865*4882a593Smuzhiyun continue;
1866*4882a593Smuzhiyun *device = sis->bdev->bd_dev;
1867*4882a593Smuzhiyun spin_unlock(&swap_lock);
1868*4882a593Smuzhiyun return type;
1869*4882a593Smuzhiyun }
1870*4882a593Smuzhiyun spin_unlock(&swap_lock);
1871*4882a593Smuzhiyun return -ENODEV;
1872*4882a593Smuzhiyun }
1873*4882a593Smuzhiyun
1874*4882a593Smuzhiyun /*
1875*4882a593Smuzhiyun * Get the (PAGE_SIZE) block corresponding to the given offset on the
1876*4882a593Smuzhiyun * swap device identified by the given swap_info index (swap type).
1877*4882a593Smuzhiyun */
1878*4882a593Smuzhiyun sector_t swapdev_block(int type, pgoff_t offset)
1879*4882a593Smuzhiyun {
1880*4882a593Smuzhiyun struct block_device *bdev;
1881*4882a593Smuzhiyun struct swap_info_struct *si = swap_type_to_swap_info(type);
1882*4882a593Smuzhiyun
1883*4882a593Smuzhiyun if (!si || !(si->flags & SWP_WRITEOK))
1884*4882a593Smuzhiyun return 0;
1885*4882a593Smuzhiyun return map_swap_entry(swp_entry(type, offset), &bdev);
1886*4882a593Smuzhiyun }
1887*4882a593Smuzhiyun
1888*4882a593Smuzhiyun /*
1889*4882a593Smuzhiyun * Return either the total number of swap pages of given type, or the number
1890*4882a593Smuzhiyun * of free pages of that type (depending on @free)
1891*4882a593Smuzhiyun *
1892*4882a593Smuzhiyun * This is needed for software suspend
1893*4882a593Smuzhiyun */
1894*4882a593Smuzhiyun unsigned int count_swap_pages(int type, int free)
1895*4882a593Smuzhiyun {
1896*4882a593Smuzhiyun unsigned int n = 0;
1897*4882a593Smuzhiyun
1898*4882a593Smuzhiyun spin_lock(&swap_lock);
1899*4882a593Smuzhiyun if ((unsigned int)type < nr_swapfiles) {
1900*4882a593Smuzhiyun struct swap_info_struct *sis = swap_info[type];
1901*4882a593Smuzhiyun
1902*4882a593Smuzhiyun spin_lock(&sis->lock);
1903*4882a593Smuzhiyun if (sis->flags & SWP_WRITEOK) {
1904*4882a593Smuzhiyun n = sis->pages;
1905*4882a593Smuzhiyun if (free)
1906*4882a593Smuzhiyun n -= sis->inuse_pages;
1907*4882a593Smuzhiyun }
1908*4882a593Smuzhiyun spin_unlock(&sis->lock);
1909*4882a593Smuzhiyun }
1910*4882a593Smuzhiyun spin_unlock(&swap_lock);
1911*4882a593Smuzhiyun return n;
1912*4882a593Smuzhiyun }
1913*4882a593Smuzhiyun #endif /* CONFIG_HIBERNATION */
1914*4882a593Smuzhiyun
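/*
 * Compare a pte against the expected swap pte while ignoring pte flag
 * bits (e.g. soft-dirty or uffd-wp markers) that may legitimately have
 * been set on the installed swap pte since the entry was created.
 */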
1915*4882a593Smuzhiyun static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1916*4882a593Smuzhiyun {
1917*4882a593Smuzhiyun return pte_same(pte_swp_clear_flags(pte), swp_pte);
1918*4882a593Smuzhiyun }
1919*4882a593Smuzhiyun
1920*4882a593Smuzhiyun /*
1921*4882a593Smuzhiyun * No need to decide whether this PTE shares the swap entry with others,
1922*4882a593Smuzhiyun * just let do_wp_page work it out if a write is requested later - to
1923*4882a593Smuzhiyun * force COW, vm_page_prot omits write permission from any private vma.
1924*4882a593Smuzhiyun */
1925*4882a593Smuzhiyun static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1926*4882a593Smuzhiyun unsigned long addr, swp_entry_t entry, struct page *page)
1927*4882a593Smuzhiyun {
1928*4882a593Smuzhiyun struct page *swapcache;
1929*4882a593Smuzhiyun spinlock_t *ptl;
1930*4882a593Smuzhiyun pte_t *pte;
1931*4882a593Smuzhiyun int ret = 1;
1932*4882a593Smuzhiyun
1933*4882a593Smuzhiyun swapcache = page;
1934*4882a593Smuzhiyun page = ksm_might_need_to_copy(page, vma, addr);
1935*4882a593Smuzhiyun if (unlikely(!page))
1936*4882a593Smuzhiyun return -ENOMEM;
1937*4882a593Smuzhiyun
1938*4882a593Smuzhiyun pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1939*4882a593Smuzhiyun if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1940*4882a593Smuzhiyun ret = 0;
1941*4882a593Smuzhiyun goto out;
1942*4882a593Smuzhiyun }
1943*4882a593Smuzhiyun
1944*4882a593Smuzhiyun dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
1945*4882a593Smuzhiyun inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
1946*4882a593Smuzhiyun get_page(page);
1947*4882a593Smuzhiyun set_pte_at(vma->vm_mm, addr, pte,
1948*4882a593Smuzhiyun pte_mkold(mk_pte(page, vma->vm_page_prot)));
1949*4882a593Smuzhiyun if (page == swapcache) {
1950*4882a593Smuzhiyun page_add_anon_rmap(page, vma, addr, false);
1951*4882a593Smuzhiyun } else { /* ksm created a completely new copy */
1952*4882a593Smuzhiyun page_add_new_anon_rmap(page, vma, addr, false);
1953*4882a593Smuzhiyun lru_cache_add_inactive_or_unevictable(page, vma);
1954*4882a593Smuzhiyun }
1955*4882a593Smuzhiyun swap_free(entry);
1956*4882a593Smuzhiyun out:
1957*4882a593Smuzhiyun pte_unmap_unlock(pte, ptl);
1958*4882a593Smuzhiyun if (page != swapcache) {
1959*4882a593Smuzhiyun unlock_page(page);
1960*4882a593Smuzhiyun put_page(page);
1961*4882a593Smuzhiyun }
1962*4882a593Smuzhiyun return ret;
1963*4882a593Smuzhiyun }
1964*4882a593Smuzhiyun
1965*4882a593Smuzhiyun static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1966*4882a593Smuzhiyun unsigned long addr, unsigned long end,
1967*4882a593Smuzhiyun unsigned int type, bool frontswap,
1968*4882a593Smuzhiyun unsigned long *fs_pages_to_unuse)
1969*4882a593Smuzhiyun {
1970*4882a593Smuzhiyun struct page *page;
1971*4882a593Smuzhiyun swp_entry_t entry;
1972*4882a593Smuzhiyun pte_t *pte;
1973*4882a593Smuzhiyun struct swap_info_struct *si;
1974*4882a593Smuzhiyun unsigned long offset;
1975*4882a593Smuzhiyun int ret = 0;
1976*4882a593Smuzhiyun volatile unsigned char *swap_map;
1977*4882a593Smuzhiyun
1978*4882a593Smuzhiyun si = swap_info[type];
1979*4882a593Smuzhiyun pte = pte_offset_map(pmd, addr);
1980*4882a593Smuzhiyun do {
1981*4882a593Smuzhiyun if (!is_swap_pte(*pte))
1982*4882a593Smuzhiyun continue;
1983*4882a593Smuzhiyun
1984*4882a593Smuzhiyun entry = pte_to_swp_entry(*pte);
1985*4882a593Smuzhiyun if (swp_type(entry) != type)
1986*4882a593Smuzhiyun continue;
1987*4882a593Smuzhiyun
1988*4882a593Smuzhiyun offset = swp_offset(entry);
1989*4882a593Smuzhiyun if (frontswap && !frontswap_test(si, offset))
1990*4882a593Smuzhiyun continue;
1991*4882a593Smuzhiyun
1992*4882a593Smuzhiyun pte_unmap(pte);
1993*4882a593Smuzhiyun swap_map = &si->swap_map[offset];
1994*4882a593Smuzhiyun page = lookup_swap_cache(entry, vma, addr);
1995*4882a593Smuzhiyun if (!page) {
1996*4882a593Smuzhiyun struct vm_fault vmf = {
1997*4882a593Smuzhiyun .vma = vma,
1998*4882a593Smuzhiyun .address = addr,
1999*4882a593Smuzhiyun .pmd = pmd,
2000*4882a593Smuzhiyun };
2001*4882a593Smuzhiyun
2002*4882a593Smuzhiyun page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2003*4882a593Smuzhiyun &vmf);
2004*4882a593Smuzhiyun }
2005*4882a593Smuzhiyun if (!page) {
2006*4882a593Smuzhiyun if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
2007*4882a593Smuzhiyun goto try_next;
2008*4882a593Smuzhiyun return -ENOMEM;
2009*4882a593Smuzhiyun }
2010*4882a593Smuzhiyun
2011*4882a593Smuzhiyun lock_page(page);
2012*4882a593Smuzhiyun wait_on_page_writeback(page);
2013*4882a593Smuzhiyun ret = unuse_pte(vma, pmd, addr, entry, page);
2014*4882a593Smuzhiyun if (ret < 0) {
2015*4882a593Smuzhiyun unlock_page(page);
2016*4882a593Smuzhiyun put_page(page);
2017*4882a593Smuzhiyun goto out;
2018*4882a593Smuzhiyun }
2019*4882a593Smuzhiyun
2020*4882a593Smuzhiyun try_to_free_swap(page);
2021*4882a593Smuzhiyun trace_android_vh_unuse_swap_page(si, page);
2022*4882a593Smuzhiyun unlock_page(page);
2023*4882a593Smuzhiyun put_page(page);
2024*4882a593Smuzhiyun
2025*4882a593Smuzhiyun if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
2026*4882a593Smuzhiyun ret = FRONTSWAP_PAGES_UNUSED;
2027*4882a593Smuzhiyun goto out;
2028*4882a593Smuzhiyun }
2029*4882a593Smuzhiyun try_next:
2030*4882a593Smuzhiyun pte = pte_offset_map(pmd, addr);
2031*4882a593Smuzhiyun } while (pte++, addr += PAGE_SIZE, addr != end);
2032*4882a593Smuzhiyun pte_unmap(pte - 1);
2033*4882a593Smuzhiyun
2034*4882a593Smuzhiyun ret = 0;
2035*4882a593Smuzhiyun out:
2036*4882a593Smuzhiyun return ret;
2037*4882a593Smuzhiyun }
2038*4882a593Smuzhiyun
2039*4882a593Smuzhiyun static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
2040*4882a593Smuzhiyun unsigned long addr, unsigned long end,
2041*4882a593Smuzhiyun unsigned int type, bool frontswap,
2042*4882a593Smuzhiyun unsigned long *fs_pages_to_unuse)
2043*4882a593Smuzhiyun {
2044*4882a593Smuzhiyun pmd_t *pmd;
2045*4882a593Smuzhiyun unsigned long next;
2046*4882a593Smuzhiyun int ret;
2047*4882a593Smuzhiyun
2048*4882a593Smuzhiyun pmd = pmd_offset(pud, addr);
2049*4882a593Smuzhiyun do {
2050*4882a593Smuzhiyun cond_resched();
2051*4882a593Smuzhiyun next = pmd_addr_end(addr, end);
2052*4882a593Smuzhiyun if (pmd_none_or_trans_huge_or_clear_bad(pmd))
2053*4882a593Smuzhiyun continue;
2054*4882a593Smuzhiyun ret = unuse_pte_range(vma, pmd, addr, next, type,
2055*4882a593Smuzhiyun frontswap, fs_pages_to_unuse);
2056*4882a593Smuzhiyun if (ret)
2057*4882a593Smuzhiyun return ret;
2058*4882a593Smuzhiyun } while (pmd++, addr = next, addr != end);
2059*4882a593Smuzhiyun return 0;
2060*4882a593Smuzhiyun }
2061*4882a593Smuzhiyun
2062*4882a593Smuzhiyun static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
2063*4882a593Smuzhiyun unsigned long addr, unsigned long end,
2064*4882a593Smuzhiyun unsigned int type, bool frontswap,
2065*4882a593Smuzhiyun unsigned long *fs_pages_to_unuse)
2066*4882a593Smuzhiyun {
2067*4882a593Smuzhiyun pud_t *pud;
2068*4882a593Smuzhiyun unsigned long next;
2069*4882a593Smuzhiyun int ret;
2070*4882a593Smuzhiyun
2071*4882a593Smuzhiyun pud = pud_offset(p4d, addr);
2072*4882a593Smuzhiyun do {
2073*4882a593Smuzhiyun next = pud_addr_end(addr, end);
2074*4882a593Smuzhiyun if (pud_none_or_clear_bad(pud))
2075*4882a593Smuzhiyun continue;
2076*4882a593Smuzhiyun ret = unuse_pmd_range(vma, pud, addr, next, type,
2077*4882a593Smuzhiyun frontswap, fs_pages_to_unuse);
2078*4882a593Smuzhiyun if (ret)
2079*4882a593Smuzhiyun return ret;
2080*4882a593Smuzhiyun } while (pud++, addr = next, addr != end);
2081*4882a593Smuzhiyun return 0;
2082*4882a593Smuzhiyun }
2083*4882a593Smuzhiyun
2084*4882a593Smuzhiyun static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
2085*4882a593Smuzhiyun unsigned long addr, unsigned long end,
2086*4882a593Smuzhiyun unsigned int type, bool frontswap,
2087*4882a593Smuzhiyun unsigned long *fs_pages_to_unuse)
2088*4882a593Smuzhiyun {
2089*4882a593Smuzhiyun p4d_t *p4d;
2090*4882a593Smuzhiyun unsigned long next;
2091*4882a593Smuzhiyun int ret;
2092*4882a593Smuzhiyun
2093*4882a593Smuzhiyun p4d = p4d_offset(pgd, addr);
2094*4882a593Smuzhiyun do {
2095*4882a593Smuzhiyun next = p4d_addr_end(addr, end);
2096*4882a593Smuzhiyun if (p4d_none_or_clear_bad(p4d))
2097*4882a593Smuzhiyun continue;
2098*4882a593Smuzhiyun ret = unuse_pud_range(vma, p4d, addr, next, type,
2099*4882a593Smuzhiyun frontswap, fs_pages_to_unuse);
2100*4882a593Smuzhiyun if (ret)
2101*4882a593Smuzhiyun return ret;
2102*4882a593Smuzhiyun } while (p4d++, addr = next, addr != end);
2103*4882a593Smuzhiyun return 0;
2104*4882a593Smuzhiyun }
2105*4882a593Smuzhiyun
2106*4882a593Smuzhiyun static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
2107*4882a593Smuzhiyun bool frontswap, unsigned long *fs_pages_to_unuse)
2108*4882a593Smuzhiyun {
2109*4882a593Smuzhiyun pgd_t *pgd;
2110*4882a593Smuzhiyun unsigned long addr, end, next;
2111*4882a593Smuzhiyun int ret;
2112*4882a593Smuzhiyun
2113*4882a593Smuzhiyun addr = vma->vm_start;
2114*4882a593Smuzhiyun end = vma->vm_end;
2115*4882a593Smuzhiyun
2116*4882a593Smuzhiyun pgd = pgd_offset(vma->vm_mm, addr);
2117*4882a593Smuzhiyun do {
2118*4882a593Smuzhiyun next = pgd_addr_end(addr, end);
2119*4882a593Smuzhiyun if (pgd_none_or_clear_bad(pgd))
2120*4882a593Smuzhiyun continue;
2121*4882a593Smuzhiyun ret = unuse_p4d_range(vma, pgd, addr, next, type,
2122*4882a593Smuzhiyun frontswap, fs_pages_to_unuse);
2123*4882a593Smuzhiyun if (ret)
2124*4882a593Smuzhiyun return ret;
2125*4882a593Smuzhiyun } while (pgd++, addr = next, addr != end);
2126*4882a593Smuzhiyun return 0;
2127*4882a593Smuzhiyun }
2128*4882a593Smuzhiyun
2129*4882a593Smuzhiyun static int unuse_mm(struct mm_struct *mm, unsigned int type,
2130*4882a593Smuzhiyun bool frontswap, unsigned long *fs_pages_to_unuse)
2131*4882a593Smuzhiyun {
2132*4882a593Smuzhiyun struct vm_area_struct *vma;
2133*4882a593Smuzhiyun int ret = 0;
2134*4882a593Smuzhiyun
2135*4882a593Smuzhiyun mmap_read_lock(mm);
2136*4882a593Smuzhiyun for (vma = mm->mmap; vma; vma = vma->vm_next) {
2137*4882a593Smuzhiyun if (vma->anon_vma) {
2138*4882a593Smuzhiyun ret = unuse_vma(vma, type, frontswap,
2139*4882a593Smuzhiyun fs_pages_to_unuse);
2140*4882a593Smuzhiyun if (ret)
2141*4882a593Smuzhiyun break;
2142*4882a593Smuzhiyun }
2143*4882a593Smuzhiyun cond_resched();
2144*4882a593Smuzhiyun }
2145*4882a593Smuzhiyun mmap_read_unlock(mm);
2146*4882a593Smuzhiyun return ret;
2147*4882a593Smuzhiyun }
2148*4882a593Smuzhiyun
2149*4882a593Smuzhiyun /*
2150*4882a593Smuzhiyun * Scan swap_map (or frontswap_map if the frontswap parameter is true)
2151*4882a593Smuzhiyun * from the current position to the next entry still in use. Return 0
2152*4882a593Smuzhiyun * if there are no in-use entries after prev through the end of the map.
2153*4882a593Smuzhiyun */
2154*4882a593Smuzhiyun static unsigned int find_next_to_unuse(struct swap_info_struct *si,
2155*4882a593Smuzhiyun unsigned int prev, bool frontswap)
2156*4882a593Smuzhiyun {
2157*4882a593Smuzhiyun unsigned int i;
2158*4882a593Smuzhiyun unsigned char count;
2159*4882a593Smuzhiyun
2160*4882a593Smuzhiyun /*
2161*4882a593Smuzhiyun * No need for swap_lock here: we're just looking
2162*4882a593Smuzhiyun * for whether an entry is in use, not modifying it; false
2163*4882a593Smuzhiyun * hits are okay, and sys_swapoff() has already prevented new
2164*4882a593Smuzhiyun * allocations from this area (while holding swap_lock).
2165*4882a593Smuzhiyun */
2166*4882a593Smuzhiyun for (i = prev + 1; i < si->max; i++) {
2167*4882a593Smuzhiyun count = READ_ONCE(si->swap_map[i]);
2168*4882a593Smuzhiyun if (count && swap_count(count) != SWAP_MAP_BAD)
2169*4882a593Smuzhiyun if (!frontswap || frontswap_test(si, i))
2170*4882a593Smuzhiyun break;
2171*4882a593Smuzhiyun if ((i % LATENCY_LIMIT) == 0)
2172*4882a593Smuzhiyun cond_resched();
2173*4882a593Smuzhiyun }
2174*4882a593Smuzhiyun
2175*4882a593Smuzhiyun if (i == si->max)
2176*4882a593Smuzhiyun i = 0;
2177*4882a593Smuzhiyun
2178*4882a593Smuzhiyun return i;
2179*4882a593Smuzhiyun }
2180*4882a593Smuzhiyun
2181*4882a593Smuzhiyun /*
2182*4882a593Smuzhiyun * If the boolean frontswap is true, unuse at most pages_to_unuse pages;
2183*4882a593Smuzhiyun * pages_to_unuse == 0 means all pages (ignored if frontswap is false)
2184*4882a593Smuzhiyun */
2185*4882a593Smuzhiyun int try_to_unuse(unsigned int type, bool frontswap,
2186*4882a593Smuzhiyun unsigned long pages_to_unuse)
2187*4882a593Smuzhiyun {
2188*4882a593Smuzhiyun struct mm_struct *prev_mm;
2189*4882a593Smuzhiyun struct mm_struct *mm;
2190*4882a593Smuzhiyun struct list_head *p;
2191*4882a593Smuzhiyun int retval = 0;
2192*4882a593Smuzhiyun struct swap_info_struct *si = swap_info[type];
2193*4882a593Smuzhiyun struct page *page;
2194*4882a593Smuzhiyun swp_entry_t entry;
2195*4882a593Smuzhiyun unsigned int i;
2196*4882a593Smuzhiyun
2197*4882a593Smuzhiyun if (!READ_ONCE(si->inuse_pages))
2198*4882a593Smuzhiyun return 0;
2199*4882a593Smuzhiyun
2200*4882a593Smuzhiyun if (!frontswap)
2201*4882a593Smuzhiyun pages_to_unuse = 0;
2202*4882a593Smuzhiyun
2203*4882a593Smuzhiyun retry:
2204*4882a593Smuzhiyun retval = shmem_unuse(type, frontswap, &pages_to_unuse);
2205*4882a593Smuzhiyun if (retval)
2206*4882a593Smuzhiyun goto out;
2207*4882a593Smuzhiyun
2208*4882a593Smuzhiyun prev_mm = &init_mm;
2209*4882a593Smuzhiyun mmget(prev_mm);
2210*4882a593Smuzhiyun
2211*4882a593Smuzhiyun spin_lock(&mmlist_lock);
2212*4882a593Smuzhiyun p = &init_mm.mmlist;
2213*4882a593Smuzhiyun while (READ_ONCE(si->inuse_pages) &&
2214*4882a593Smuzhiyun !signal_pending(current) &&
2215*4882a593Smuzhiyun (p = p->next) != &init_mm.mmlist) {
2216*4882a593Smuzhiyun
2217*4882a593Smuzhiyun mm = list_entry(p, struct mm_struct, mmlist);
2218*4882a593Smuzhiyun if (!mmget_not_zero(mm))
2219*4882a593Smuzhiyun continue;
2220*4882a593Smuzhiyun spin_unlock(&mmlist_lock);
2221*4882a593Smuzhiyun mmput(prev_mm);
2222*4882a593Smuzhiyun prev_mm = mm;
2223*4882a593Smuzhiyun retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
2224*4882a593Smuzhiyun
2225*4882a593Smuzhiyun if (retval) {
2226*4882a593Smuzhiyun mmput(prev_mm);
2227*4882a593Smuzhiyun goto out;
2228*4882a593Smuzhiyun }
2229*4882a593Smuzhiyun
2230*4882a593Smuzhiyun /*
2231*4882a593Smuzhiyun * Make sure that we aren't completely killing
2232*4882a593Smuzhiyun * interactive performance.
2233*4882a593Smuzhiyun */
2234*4882a593Smuzhiyun cond_resched();
2235*4882a593Smuzhiyun spin_lock(&mmlist_lock);
2236*4882a593Smuzhiyun }
2237*4882a593Smuzhiyun spin_unlock(&mmlist_lock);
2238*4882a593Smuzhiyun
2239*4882a593Smuzhiyun mmput(prev_mm);
2240*4882a593Smuzhiyun
2241*4882a593Smuzhiyun i = 0;
2242*4882a593Smuzhiyun while (READ_ONCE(si->inuse_pages) &&
2243*4882a593Smuzhiyun !signal_pending(current) &&
2244*4882a593Smuzhiyun (i = find_next_to_unuse(si, i, frontswap)) != 0) {
2245*4882a593Smuzhiyun
2246*4882a593Smuzhiyun entry = swp_entry(type, i);
2247*4882a593Smuzhiyun page = find_get_page(swap_address_space(entry), i);
2248*4882a593Smuzhiyun if (!page)
2249*4882a593Smuzhiyun continue;
2250*4882a593Smuzhiyun
2251*4882a593Smuzhiyun /*
2252*4882a593Smuzhiyun * It is conceivable that a racing task removed this page from
2253*4882a593Smuzhiyun * swap cache just before we acquired the page lock. The page
2254*4882a593Smuzhiyun * might even be back in swap cache on another swap area. But
2255*4882a593Smuzhiyun * that is okay, try_to_free_swap() only removes stale pages.
2256*4882a593Smuzhiyun */
2257*4882a593Smuzhiyun lock_page(page);
2258*4882a593Smuzhiyun wait_on_page_writeback(page);
2259*4882a593Smuzhiyun try_to_free_swap(page);
2260*4882a593Smuzhiyun trace_android_vh_unuse_swap_page(si, page);
2261*4882a593Smuzhiyun unlock_page(page);
2262*4882a593Smuzhiyun put_page(page);
2263*4882a593Smuzhiyun
2264*4882a593Smuzhiyun /*
2265*4882a593Smuzhiyun * For frontswap, we just need to unuse pages_to_unuse, if
2266*4882a593Smuzhiyun * it was specified. Need not check frontswap again here as
2267*4882a593Smuzhiyun * we already zeroed out pages_to_unuse if not frontswap.
2268*4882a593Smuzhiyun */
2269*4882a593Smuzhiyun if (pages_to_unuse && --pages_to_unuse == 0)
2270*4882a593Smuzhiyun goto out;
2271*4882a593Smuzhiyun }
2272*4882a593Smuzhiyun
2273*4882a593Smuzhiyun /*
2274*4882a593Smuzhiyun * Let's check again to see if there are still swap entries in the map.
2275*4882a593Smuzhiyun * If so, we need to retry the unuse logic.
2276*4882a593Smuzhiyun * Under global memory pressure, swap entries can be reinserted back
2277*4882a593Smuzhiyun * into process space after the mmlist loop above passes over them.
2278*4882a593Smuzhiyun *
2279*4882a593Smuzhiyun * Limit the number of retries? No: when mmget_not_zero() above fails,
2280*4882a593Smuzhiyun * that mm is likely to be freeing swap from exit_mmap(), which proceeds
2281*4882a593Smuzhiyun * at its own independent pace; and even shmem_writepage() could have
2282*4882a593Smuzhiyun * been preempted after get_swap_page(), temporarily hiding that swap.
2283*4882a593Smuzhiyun * It's easy and robust (though cpu-intensive) just to keep retrying.
2284*4882a593Smuzhiyun */
2285*4882a593Smuzhiyun if (READ_ONCE(si->inuse_pages)) {
2286*4882a593Smuzhiyun if (!signal_pending(current))
2287*4882a593Smuzhiyun goto retry;
2288*4882a593Smuzhiyun retval = -EINTR;
2289*4882a593Smuzhiyun }
2290*4882a593Smuzhiyun out:
2291*4882a593Smuzhiyun return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
2292*4882a593Smuzhiyun }
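
/*
 * Hedged usage sketch: sys_swapoff() drains a whole swap area with
 *
 *	err = try_to_unuse(type, false, 0);
 *
 * while frontswap_shrink() passes frontswap == true together with a
 * nonzero page budget to pull back only part of the frontswap store.
 */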
2293*4882a593Smuzhiyun
2294*4882a593Smuzhiyun /*
2295*4882a593Smuzhiyun * After a successful try_to_unuse, if no swap is now in use, we know
2296*4882a593Smuzhiyun * we can empty the mmlist. swap_lock must be held on entry and exit.
2297*4882a593Smuzhiyun * Note that mmlist_lock nests inside swap_lock, and an mm must be
2298*4882a593Smuzhiyun * added to the mmlist just after page_duplicate - before would be racy.
2299*4882a593Smuzhiyun */
2300*4882a593Smuzhiyun static void drain_mmlist(void)
2301*4882a593Smuzhiyun {
2302*4882a593Smuzhiyun struct list_head *p, *next;
2303*4882a593Smuzhiyun unsigned int type;
2304*4882a593Smuzhiyun
2305*4882a593Smuzhiyun for (type = 0; type < nr_swapfiles; type++)
2306*4882a593Smuzhiyun if (swap_info[type]->inuse_pages)
2307*4882a593Smuzhiyun return;
2308*4882a593Smuzhiyun spin_lock(&mmlist_lock);
2309*4882a593Smuzhiyun list_for_each_safe(p, next, &init_mm.mmlist)
2310*4882a593Smuzhiyun list_del_init(p);
2311*4882a593Smuzhiyun spin_unlock(&mmlist_lock);
2312*4882a593Smuzhiyun }
2313*4882a593Smuzhiyun
2314*4882a593Smuzhiyun /*
2315*4882a593Smuzhiyun * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
2316*4882a593Smuzhiyun * corresponds to page offset for the specified swap entry.
2317*4882a593Smuzhiyun * Note that the return type of this function is sector_t, but it returns a
2318*4882a593Smuzhiyun * page offset into the bdev, not a sector offset.
2319*4882a593Smuzhiyun */
2320*4882a593Smuzhiyun static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
2321*4882a593Smuzhiyun {
2322*4882a593Smuzhiyun struct swap_info_struct *sis;
2323*4882a593Smuzhiyun struct swap_extent *se;
2324*4882a593Smuzhiyun pgoff_t offset;
2325*4882a593Smuzhiyun
2326*4882a593Smuzhiyun sis = swp_swap_info(entry);
2327*4882a593Smuzhiyun *bdev = sis->bdev;
2328*4882a593Smuzhiyun
2329*4882a593Smuzhiyun offset = swp_offset(entry);
2330*4882a593Smuzhiyun se = offset_to_swap_extent(sis, offset);
2331*4882a593Smuzhiyun return se->start_block + (offset - se->start_page);
2332*4882a593Smuzhiyun }
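
/*
 * Illustrative sketch, not part of the original file: a hypothetical helper
 * showing how a caller could turn the page offset returned by
 * map_swap_entry() into a byte position on the backing device.
 * swap_entry_to_byte_pos() is an assumed name, not an existing kernel API.
 */
static inline loff_t swap_entry_to_byte_pos(swp_entry_t entry,
					    struct block_device **bdev)
{
	sector_t pgoff = map_swap_entry(entry, bdev);

	/* each swap page covers PAGE_SIZE bytes on the bdev */
	return (loff_t)pgoff << PAGE_SHIFT;
}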
2333*4882a593Smuzhiyun
2334*4882a593Smuzhiyun /*
2335*4882a593Smuzhiyun * Returns the page offset into bdev for the specified page's swap entry.
2336*4882a593Smuzhiyun */
2337*4882a593Smuzhiyun sector_t map_swap_page(struct page *page, struct block_device **bdev)
2338*4882a593Smuzhiyun {
2339*4882a593Smuzhiyun swp_entry_t entry;
2340*4882a593Smuzhiyun entry.val = page_private(page);
2341*4882a593Smuzhiyun return map_swap_entry(entry, bdev);
2342*4882a593Smuzhiyun }
2343*4882a593Smuzhiyun
2344*4882a593Smuzhiyun /*
2345*4882a593Smuzhiyun * Free all of a swapdev's extent information
2346*4882a593Smuzhiyun */
2347*4882a593Smuzhiyun static void destroy_swap_extents(struct swap_info_struct *sis)
2348*4882a593Smuzhiyun {
2349*4882a593Smuzhiyun while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
2350*4882a593Smuzhiyun struct rb_node *rb = sis->swap_extent_root.rb_node;
2351*4882a593Smuzhiyun struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
2352*4882a593Smuzhiyun
2353*4882a593Smuzhiyun rb_erase(rb, &sis->swap_extent_root);
2354*4882a593Smuzhiyun kfree(se);
2355*4882a593Smuzhiyun }
2356*4882a593Smuzhiyun
2357*4882a593Smuzhiyun if (sis->flags & SWP_ACTIVATED) {
2358*4882a593Smuzhiyun struct file *swap_file = sis->swap_file;
2359*4882a593Smuzhiyun struct address_space *mapping = swap_file->f_mapping;
2360*4882a593Smuzhiyun
2361*4882a593Smuzhiyun sis->flags &= ~SWP_ACTIVATED;
2362*4882a593Smuzhiyun if (mapping->a_ops->swap_deactivate)
2363*4882a593Smuzhiyun mapping->a_ops->swap_deactivate(swap_file);
2364*4882a593Smuzhiyun }
2365*4882a593Smuzhiyun }
2366*4882a593Smuzhiyun
2367*4882a593Smuzhiyun /*
2368*4882a593Smuzhiyun * Add a block range (and the corresponding page range) into this swapdev's
2369*4882a593Smuzhiyun * extent tree.
2370*4882a593Smuzhiyun *
2371*4882a593Smuzhiyun * This function rather assumes that it is called in ascending page order.
2372*4882a593Smuzhiyun */
2373*4882a593Smuzhiyun int
2374*4882a593Smuzhiyun add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
2375*4882a593Smuzhiyun unsigned long nr_pages, sector_t start_block)
2376*4882a593Smuzhiyun {
2377*4882a593Smuzhiyun struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
2378*4882a593Smuzhiyun struct swap_extent *se;
2379*4882a593Smuzhiyun struct swap_extent *new_se;
2380*4882a593Smuzhiyun
2381*4882a593Smuzhiyun /*
2382*4882a593Smuzhiyun * place the new node at the rightmost, since the
2383*4882a593Smuzhiyun * function is called in ascending page order.
2384*4882a593Smuzhiyun */
2385*4882a593Smuzhiyun while (*link) {
2386*4882a593Smuzhiyun parent = *link;
2387*4882a593Smuzhiyun link = &parent->rb_right;
2388*4882a593Smuzhiyun }
2389*4882a593Smuzhiyun
2390*4882a593Smuzhiyun if (parent) {
2391*4882a593Smuzhiyun se = rb_entry(parent, struct swap_extent, rb_node);
2392*4882a593Smuzhiyun BUG_ON(se->start_page + se->nr_pages != start_page);
2393*4882a593Smuzhiyun if (se->start_block + se->nr_pages == start_block) {
2394*4882a593Smuzhiyun /* Merge it */
2395*4882a593Smuzhiyun se->nr_pages += nr_pages;
2396*4882a593Smuzhiyun return 0;
2397*4882a593Smuzhiyun }
2398*4882a593Smuzhiyun }
2399*4882a593Smuzhiyun
2400*4882a593Smuzhiyun /* No merge, insert a new extent. */
2401*4882a593Smuzhiyun new_se = kmalloc(sizeof(*se), GFP_KERNEL);
2402*4882a593Smuzhiyun if (new_se == NULL)
2403*4882a593Smuzhiyun return -ENOMEM;
2404*4882a593Smuzhiyun new_se->start_page = start_page;
2405*4882a593Smuzhiyun new_se->nr_pages = nr_pages;
2406*4882a593Smuzhiyun new_se->start_block = start_block;
2407*4882a593Smuzhiyun
2408*4882a593Smuzhiyun rb_link_node(&new_se->rb_node, parent, link);
2409*4882a593Smuzhiyun rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
2410*4882a593Smuzhiyun return 1;
2411*4882a593Smuzhiyun }
2412*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(add_swap_extent);
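
/*
 * Illustrative sketch, not part of the original file: how successive
 * add_swap_extent() calls behave. Physically contiguous ranges are merged
 * into the rightmost extent; a discontiguous range gets a new rbtree node.
 * extent_merge_example() is an assumed name used only for illustration.
 */
static void __maybe_unused extent_merge_example(struct swap_info_struct *sis)
{
	/* pages 0-15 at blocks 100-115: new extent, returns 1 */
	add_swap_extent(sis, 0, 16, 100);
	/* pages 16-31 at blocks 116-131: contiguous, merged, returns 0 */
	add_swap_extent(sis, 16, 16, 116);
	/* pages 32-47 at blocks 500-515: gap in blocks, new extent again */
	add_swap_extent(sis, 32, 16, 500);
}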
2413*4882a593Smuzhiyun
2414*4882a593Smuzhiyun /*
2415*4882a593Smuzhiyun * A `swap extent' is a simple structure which maps a contiguous range of pages
2416*4882a593Smuzhiyun * onto a contiguous range of disk blocks. An ordered list of swap extents
2417*4882a593Smuzhiyun * is built at swapon time and is then used at swap_writepage/swap_readpage
2418*4882a593Smuzhiyun * time for locating where on disk a page belongs.
2419*4882a593Smuzhiyun *
2420*4882a593Smuzhiyun * If the swapfile is an S_ISBLK block device, a single extent is installed.
2421*4882a593Smuzhiyun * This is done so that the main operating code can treat S_ISBLK and S_ISREG
2422*4882a593Smuzhiyun * swap files identically.
2423*4882a593Smuzhiyun *
2424*4882a593Smuzhiyun * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
2425*4882a593Smuzhiyun * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
2426*4882a593Smuzhiyun * swapfiles are handled *identically* after swapon time.
2427*4882a593Smuzhiyun *
2428*4882a593Smuzhiyun * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
2429*4882a593Smuzhiyun * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
2430*4882a593Smuzhiyun * some stray blocks are found which do not fall within the PAGE_SIZE alignment
2431*4882a593Smuzhiyun * requirements, they are simply tossed out - we will never use those blocks
2432*4882a593Smuzhiyun * for swapping.
2433*4882a593Smuzhiyun *
2434*4882a593Smuzhiyun * For all swap devices we set S_SWAPFILE across the life of the swapon. This
2435*4882a593Smuzhiyun * prevents users from writing to the swap device, which will corrupt memory.
2436*4882a593Smuzhiyun *
2437*4882a593Smuzhiyun * The amount of disk space which a single swap extent represents varies.
2438*4882a593Smuzhiyun * Typically it is in the 1-4 megabyte range. So we can have hundreds of
2439*4882a593Smuzhiyun * extents. To keep lookups fast, the extents are kept in an rbtree
2440*4882a593Smuzhiyun * (swap_extent_root) ordered by start_page, and offset_to_swap_extent()
2441*4882a593Smuzhiyun * finds the extent covering a given offset in logarithmic time instead
2442*4882a593Smuzhiyun * of walking a list.
2443*4882a593Smuzhiyun */
2444*4882a593Smuzhiyun static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2445*4882a593Smuzhiyun {
2446*4882a593Smuzhiyun struct file *swap_file = sis->swap_file;
2447*4882a593Smuzhiyun struct address_space *mapping = swap_file->f_mapping;
2448*4882a593Smuzhiyun struct inode *inode = mapping->host;
2449*4882a593Smuzhiyun int ret;
2450*4882a593Smuzhiyun
2451*4882a593Smuzhiyun if (S_ISBLK(inode->i_mode)) {
2452*4882a593Smuzhiyun ret = add_swap_extent(sis, 0, sis->max, 0);
2453*4882a593Smuzhiyun *span = sis->pages;
2454*4882a593Smuzhiyun return ret;
2455*4882a593Smuzhiyun }
2456*4882a593Smuzhiyun
2457*4882a593Smuzhiyun if (mapping->a_ops->swap_activate) {
2458*4882a593Smuzhiyun ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2459*4882a593Smuzhiyun if (ret >= 0)
2460*4882a593Smuzhiyun sis->flags |= SWP_ACTIVATED;
2461*4882a593Smuzhiyun if (!ret) {
2462*4882a593Smuzhiyun sis->flags |= SWP_FS_OPS;
2463*4882a593Smuzhiyun ret = add_swap_extent(sis, 0, sis->max, 0);
2464*4882a593Smuzhiyun *span = sis->pages;
2465*4882a593Smuzhiyun }
2466*4882a593Smuzhiyun return ret;
2467*4882a593Smuzhiyun }
2468*4882a593Smuzhiyun
2469*4882a593Smuzhiyun return generic_swapfile_activate(sis, swap_file, span);
2470*4882a593Smuzhiyun }
2471*4882a593Smuzhiyun
2472*4882a593Smuzhiyun static int swap_node(struct swap_info_struct *p)
2473*4882a593Smuzhiyun {
2474*4882a593Smuzhiyun struct block_device *bdev;
2475*4882a593Smuzhiyun
2476*4882a593Smuzhiyun if (p->bdev)
2477*4882a593Smuzhiyun bdev = p->bdev;
2478*4882a593Smuzhiyun else
2479*4882a593Smuzhiyun bdev = p->swap_file->f_inode->i_sb->s_bdev;
2480*4882a593Smuzhiyun
2481*4882a593Smuzhiyun return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2482*4882a593Smuzhiyun }
2483*4882a593Smuzhiyun
2484*4882a593Smuzhiyun static void setup_swap_info(struct swap_info_struct *p, int prio,
2485*4882a593Smuzhiyun unsigned char *swap_map,
2486*4882a593Smuzhiyun struct swap_cluster_info *cluster_info)
2487*4882a593Smuzhiyun {
2488*4882a593Smuzhiyun int i;
2489*4882a593Smuzhiyun
2490*4882a593Smuzhiyun if (prio >= 0)
2491*4882a593Smuzhiyun p->prio = prio;
2492*4882a593Smuzhiyun else
2493*4882a593Smuzhiyun p->prio = --least_priority;
2494*4882a593Smuzhiyun /*
2495*4882a593Smuzhiyun * the plist prio is negated because plist ordering is
2496*4882a593Smuzhiyun * low-to-high, while swap ordering is high-to-low
2497*4882a593Smuzhiyun */
2498*4882a593Smuzhiyun p->list.prio = -p->prio;
2499*4882a593Smuzhiyun for_each_node(i) {
2500*4882a593Smuzhiyun if (p->prio >= 0)
2501*4882a593Smuzhiyun p->avail_lists[i].prio = -p->prio;
2502*4882a593Smuzhiyun else {
2503*4882a593Smuzhiyun if (swap_node(p) == i)
2504*4882a593Smuzhiyun p->avail_lists[i].prio = 1;
2505*4882a593Smuzhiyun else
2506*4882a593Smuzhiyun p->avail_lists[i].prio = -p->prio;
2507*4882a593Smuzhiyun }
2508*4882a593Smuzhiyun }
2509*4882a593Smuzhiyun p->swap_map = swap_map;
2510*4882a593Smuzhiyun p->cluster_info = cluster_info;
2511*4882a593Smuzhiyun }
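
/*
 * Illustrative example, not part of the original file: with the negation
 * above, a device of swap priority 5 gets plist prio -5 and one of
 * priority -2 gets plist prio 2, so the low-to-high plist sort yields the
 * intended high-to-low swap order. For an auto-assigned (negative)
 * priority such as -2, the device's own NUMA node sees avail_lists prio 1
 * while every other node sees 2, so allocations prefer node-local swap.
 */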
2512*4882a593Smuzhiyun
2513*4882a593Smuzhiyun static void _enable_swap_info(struct swap_info_struct *p)
2514*4882a593Smuzhiyun {
2515*4882a593Smuzhiyun bool skip = false;
2516*4882a593Smuzhiyun
2517*4882a593Smuzhiyun p->flags |= SWP_WRITEOK | SWP_VALID;
2518*4882a593Smuzhiyun trace_android_vh_account_swap_pages(p, &skip);
2519*4882a593Smuzhiyun if (!skip) {
2520*4882a593Smuzhiyun atomic_long_add(p->pages, &nr_swap_pages);
2521*4882a593Smuzhiyun total_swap_pages += p->pages;
2522*4882a593Smuzhiyun }
2523*4882a593Smuzhiyun assert_spin_locked(&swap_lock);
2524*4882a593Smuzhiyun /*
2525*4882a593Smuzhiyun * both lists are plists, and thus priority ordered.
2526*4882a593Smuzhiyun * swap_active_head needs to be priority ordered for swapoff(),
2527*4882a593Smuzhiyun * which on removal of any swap_info_struct with an auto-assigned
2528*4882a593Smuzhiyun * (i.e. negative) priority increments the auto-assigned priority
2529*4882a593Smuzhiyun * of any lower-priority swap_info_structs.
2530*4882a593Smuzhiyun * swap_avail_head needs to be priority ordered for get_swap_page(),
2531*4882a593Smuzhiyun * which allocates swap pages from the highest available priority
2532*4882a593Smuzhiyun * swap_info_struct.
2533*4882a593Smuzhiyun */
2534*4882a593Smuzhiyun plist_add(&p->list, &swap_active_head);
2535*4882a593Smuzhiyun add_to_avail_list(p);
2536*4882a593Smuzhiyun }
2537*4882a593Smuzhiyun
2538*4882a593Smuzhiyun static void enable_swap_info(struct swap_info_struct *p, int prio,
2539*4882a593Smuzhiyun unsigned char *swap_map,
2540*4882a593Smuzhiyun struct swap_cluster_info *cluster_info,
2541*4882a593Smuzhiyun unsigned long *frontswap_map)
2542*4882a593Smuzhiyun {
2543*4882a593Smuzhiyun frontswap_init(p->type, frontswap_map);
2544*4882a593Smuzhiyun spin_lock(&swap_lock);
2545*4882a593Smuzhiyun spin_lock(&p->lock);
2546*4882a593Smuzhiyun setup_swap_info(p, prio, swap_map, cluster_info);
2547*4882a593Smuzhiyun spin_unlock(&p->lock);
2548*4882a593Smuzhiyun spin_unlock(&swap_lock);
2549*4882a593Smuzhiyun /*
2550*4882a593Smuzhiyun * Guarantee swap_map, cluster_info, etc. fields are valid
2551*4882a593Smuzhiyun * between get/put_swap_device() if SWP_VALID bit is set
2552*4882a593Smuzhiyun */
2553*4882a593Smuzhiyun synchronize_rcu();
2554*4882a593Smuzhiyun spin_lock(&swap_lock);
2555*4882a593Smuzhiyun spin_lock(&p->lock);
2556*4882a593Smuzhiyun _enable_swap_info(p);
2557*4882a593Smuzhiyun spin_unlock(&p->lock);
2558*4882a593Smuzhiyun spin_unlock(&swap_lock);
2559*4882a593Smuzhiyun }
2560*4882a593Smuzhiyun
2561*4882a593Smuzhiyun static void reinsert_swap_info(struct swap_info_struct *p)
2562*4882a593Smuzhiyun {
2563*4882a593Smuzhiyun spin_lock(&swap_lock);
2564*4882a593Smuzhiyun spin_lock(&p->lock);
2565*4882a593Smuzhiyun setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2566*4882a593Smuzhiyun _enable_swap_info(p);
2567*4882a593Smuzhiyun spin_unlock(&p->lock);
2568*4882a593Smuzhiyun spin_unlock(&swap_lock);
2569*4882a593Smuzhiyun }
2570*4882a593Smuzhiyun
2571*4882a593Smuzhiyun bool has_usable_swap(void)
2572*4882a593Smuzhiyun {
2573*4882a593Smuzhiyun bool ret = true;
2574*4882a593Smuzhiyun
2575*4882a593Smuzhiyun spin_lock(&swap_lock);
2576*4882a593Smuzhiyun if (plist_head_empty(&swap_active_head))
2577*4882a593Smuzhiyun ret = false;
2578*4882a593Smuzhiyun spin_unlock(&swap_lock);
2579*4882a593Smuzhiyun return ret;
2580*4882a593Smuzhiyun }
2581*4882a593Smuzhiyun
2582*4882a593Smuzhiyun SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2583*4882a593Smuzhiyun {
2584*4882a593Smuzhiyun struct swap_info_struct *p = NULL;
2585*4882a593Smuzhiyun unsigned char *swap_map;
2586*4882a593Smuzhiyun struct swap_cluster_info *cluster_info;
2587*4882a593Smuzhiyun unsigned long *frontswap_map;
2588*4882a593Smuzhiyun struct file *swap_file, *victim;
2589*4882a593Smuzhiyun struct address_space *mapping;
2590*4882a593Smuzhiyun struct inode *inode;
2591*4882a593Smuzhiyun struct filename *pathname;
2592*4882a593Smuzhiyun int err, found = 0;
2593*4882a593Smuzhiyun unsigned int old_block_size;
2594*4882a593Smuzhiyun bool skip = false;
2595*4882a593Smuzhiyun
2596*4882a593Smuzhiyun if (!capable(CAP_SYS_ADMIN))
2597*4882a593Smuzhiyun return -EPERM;
2598*4882a593Smuzhiyun
2599*4882a593Smuzhiyun BUG_ON(!current->mm);
2600*4882a593Smuzhiyun
2601*4882a593Smuzhiyun pathname = getname(specialfile);
2602*4882a593Smuzhiyun if (IS_ERR(pathname))
2603*4882a593Smuzhiyun return PTR_ERR(pathname);
2604*4882a593Smuzhiyun
2605*4882a593Smuzhiyun victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
2606*4882a593Smuzhiyun err = PTR_ERR(victim);
2607*4882a593Smuzhiyun if (IS_ERR(victim))
2608*4882a593Smuzhiyun goto out;
2609*4882a593Smuzhiyun
2610*4882a593Smuzhiyun mapping = victim->f_mapping;
2611*4882a593Smuzhiyun spin_lock(&swap_lock);
2612*4882a593Smuzhiyun plist_for_each_entry(p, &swap_active_head, list) {
2613*4882a593Smuzhiyun if (p->flags & SWP_WRITEOK) {
2614*4882a593Smuzhiyun if (p->swap_file->f_mapping == mapping) {
2615*4882a593Smuzhiyun found = 1;
2616*4882a593Smuzhiyun break;
2617*4882a593Smuzhiyun }
2618*4882a593Smuzhiyun }
2619*4882a593Smuzhiyun }
2620*4882a593Smuzhiyun if (!found) {
2621*4882a593Smuzhiyun err = -EINVAL;
2622*4882a593Smuzhiyun spin_unlock(&swap_lock);
2623*4882a593Smuzhiyun goto out_dput;
2624*4882a593Smuzhiyun }
2625*4882a593Smuzhiyun if (!security_vm_enough_memory_mm(current->mm, p->pages))
2626*4882a593Smuzhiyun vm_unacct_memory(p->pages);
2627*4882a593Smuzhiyun else {
2628*4882a593Smuzhiyun err = -ENOMEM;
2629*4882a593Smuzhiyun spin_unlock(&swap_lock);
2630*4882a593Smuzhiyun goto out_dput;
2631*4882a593Smuzhiyun }
2632*4882a593Smuzhiyun del_from_avail_list(p);
2633*4882a593Smuzhiyun spin_lock(&p->lock);
2634*4882a593Smuzhiyun if (p->prio < 0) {
2635*4882a593Smuzhiyun struct swap_info_struct *si = p;
2636*4882a593Smuzhiyun int nid;
2637*4882a593Smuzhiyun
2638*4882a593Smuzhiyun plist_for_each_entry_continue(si, &swap_active_head, list) {
2639*4882a593Smuzhiyun si->prio++;
2640*4882a593Smuzhiyun si->list.prio--;
2641*4882a593Smuzhiyun for_each_node(nid) {
2642*4882a593Smuzhiyun if (si->avail_lists[nid].prio != 1)
2643*4882a593Smuzhiyun si->avail_lists[nid].prio--;
2644*4882a593Smuzhiyun }
2645*4882a593Smuzhiyun }
2646*4882a593Smuzhiyun least_priority++;
2647*4882a593Smuzhiyun }
2648*4882a593Smuzhiyun plist_del(&p->list, &swap_active_head);
2649*4882a593Smuzhiyun trace_android_vh_account_swap_pages(p, &skip);
2650*4882a593Smuzhiyun if (!skip) {
2651*4882a593Smuzhiyun atomic_long_sub(p->pages, &nr_swap_pages);
2652*4882a593Smuzhiyun total_swap_pages -= p->pages;
2653*4882a593Smuzhiyun }
2654*4882a593Smuzhiyun p->flags &= ~SWP_WRITEOK;
2655*4882a593Smuzhiyun spin_unlock(&p->lock);
2656*4882a593Smuzhiyun spin_unlock(&swap_lock);
2657*4882a593Smuzhiyun
2658*4882a593Smuzhiyun disable_swap_slots_cache_lock();
2659*4882a593Smuzhiyun
2660*4882a593Smuzhiyun set_current_oom_origin();
2661*4882a593Smuzhiyun err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
2662*4882a593Smuzhiyun clear_current_oom_origin();
2663*4882a593Smuzhiyun
2664*4882a593Smuzhiyun if (err) {
2665*4882a593Smuzhiyun /* re-insert swap space back into swap_list */
2666*4882a593Smuzhiyun reinsert_swap_info(p);
2667*4882a593Smuzhiyun reenable_swap_slots_cache_unlock();
2668*4882a593Smuzhiyun goto out_dput;
2669*4882a593Smuzhiyun }
2670*4882a593Smuzhiyun
2671*4882a593Smuzhiyun reenable_swap_slots_cache_unlock();
2672*4882a593Smuzhiyun
2673*4882a593Smuzhiyun spin_lock(&swap_lock);
2674*4882a593Smuzhiyun spin_lock(&p->lock);
2675*4882a593Smuzhiyun p->flags &= ~SWP_VALID; /* mark swap device as invalid */
2676*4882a593Smuzhiyun spin_unlock(&p->lock);
2677*4882a593Smuzhiyun spin_unlock(&swap_lock);
2678*4882a593Smuzhiyun /*
2679*4882a593Smuzhiyun * wait for swap operations protected by get/put_swap_device()
2680*4882a593Smuzhiyun * to complete
2681*4882a593Smuzhiyun */
2682*4882a593Smuzhiyun synchronize_rcu();
2683*4882a593Smuzhiyun
2684*4882a593Smuzhiyun flush_work(&p->discard_work);
2685*4882a593Smuzhiyun
2686*4882a593Smuzhiyun destroy_swap_extents(p);
2687*4882a593Smuzhiyun if (p->flags & SWP_CONTINUED)
2688*4882a593Smuzhiyun free_swap_count_continuations(p);
2689*4882a593Smuzhiyun
2690*4882a593Smuzhiyun if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
2691*4882a593Smuzhiyun atomic_dec(&nr_rotate_swap);
2692*4882a593Smuzhiyun
2693*4882a593Smuzhiyun mutex_lock(&swapon_mutex);
2694*4882a593Smuzhiyun spin_lock(&swap_lock);
2695*4882a593Smuzhiyun spin_lock(&p->lock);
2696*4882a593Smuzhiyun drain_mmlist();
2697*4882a593Smuzhiyun
2698*4882a593Smuzhiyun /* wait for anyone still in scan_swap_map */
2699*4882a593Smuzhiyun p->highest_bit = 0; /* cuts scans short */
2700*4882a593Smuzhiyun while (p->flags >= SWP_SCANNING) {
2701*4882a593Smuzhiyun spin_unlock(&p->lock);
2702*4882a593Smuzhiyun spin_unlock(&swap_lock);
2703*4882a593Smuzhiyun schedule_timeout_uninterruptible(1);
2704*4882a593Smuzhiyun spin_lock(&swap_lock);
2705*4882a593Smuzhiyun spin_lock(&p->lock);
2706*4882a593Smuzhiyun }
2707*4882a593Smuzhiyun
2708*4882a593Smuzhiyun swap_file = p->swap_file;
2709*4882a593Smuzhiyun old_block_size = p->old_block_size;
2710*4882a593Smuzhiyun p->swap_file = NULL;
2711*4882a593Smuzhiyun p->max = 0;
2712*4882a593Smuzhiyun swap_map = p->swap_map;
2713*4882a593Smuzhiyun p->swap_map = NULL;
2714*4882a593Smuzhiyun cluster_info = p->cluster_info;
2715*4882a593Smuzhiyun p->cluster_info = NULL;
2716*4882a593Smuzhiyun frontswap_map = frontswap_map_get(p);
2717*4882a593Smuzhiyun spin_unlock(&p->lock);
2718*4882a593Smuzhiyun spin_unlock(&swap_lock);
2719*4882a593Smuzhiyun arch_swap_invalidate_area(p->type);
2720*4882a593Smuzhiyun frontswap_invalidate_area(p->type);
2721*4882a593Smuzhiyun frontswap_map_set(p, NULL);
2722*4882a593Smuzhiyun mutex_unlock(&swapon_mutex);
2723*4882a593Smuzhiyun free_percpu(p->percpu_cluster);
2724*4882a593Smuzhiyun p->percpu_cluster = NULL;
2725*4882a593Smuzhiyun free_percpu(p->cluster_next_cpu);
2726*4882a593Smuzhiyun p->cluster_next_cpu = NULL;
2727*4882a593Smuzhiyun vfree(swap_map);
2728*4882a593Smuzhiyun kvfree(cluster_info);
2729*4882a593Smuzhiyun kvfree(frontswap_map);
2730*4882a593Smuzhiyun /* Destroy swap account information */
2731*4882a593Smuzhiyun swap_cgroup_swapoff(p->type);
2732*4882a593Smuzhiyun exit_swap_address_space(p->type);
2733*4882a593Smuzhiyun
2734*4882a593Smuzhiyun inode = mapping->host;
2735*4882a593Smuzhiyun if (S_ISBLK(inode->i_mode)) {
2736*4882a593Smuzhiyun struct block_device *bdev = I_BDEV(inode);
2737*4882a593Smuzhiyun
2738*4882a593Smuzhiyun set_blocksize(bdev, old_block_size);
2739*4882a593Smuzhiyun blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2740*4882a593Smuzhiyun }
2741*4882a593Smuzhiyun
2742*4882a593Smuzhiyun inode_lock(inode);
2743*4882a593Smuzhiyun inode->i_flags &= ~S_SWAPFILE;
2744*4882a593Smuzhiyun inode_unlock(inode);
2745*4882a593Smuzhiyun filp_close(swap_file, NULL);
2746*4882a593Smuzhiyun
2747*4882a593Smuzhiyun /*
2748*4882a593Smuzhiyun * Clear the SWP_USED flag after all resources are freed so that swapon
2749*4882a593Smuzhiyun * can reuse this swap_info in alloc_swap_info() safely. It is ok to
2750*4882a593Smuzhiyun * not hold p->lock after we cleared its SWP_WRITEOK.
2751*4882a593Smuzhiyun */
2752*4882a593Smuzhiyun spin_lock(&swap_lock);
2753*4882a593Smuzhiyun p->flags = 0;
2754*4882a593Smuzhiyun spin_unlock(&swap_lock);
2755*4882a593Smuzhiyun
2756*4882a593Smuzhiyun err = 0;
2757*4882a593Smuzhiyun atomic_inc(&proc_poll_event);
2758*4882a593Smuzhiyun wake_up_interruptible(&proc_poll_wait);
2759*4882a593Smuzhiyun
2760*4882a593Smuzhiyun out_dput:
2761*4882a593Smuzhiyun filp_close(victim, NULL);
2762*4882a593Smuzhiyun out:
2763*4882a593Smuzhiyun putname(pathname);
2764*4882a593Smuzhiyun return err;
2765*4882a593Smuzhiyun }
2766*4882a593Smuzhiyun
2767*4882a593Smuzhiyun #ifdef CONFIG_PROC_FS
2768*4882a593Smuzhiyun static __poll_t swaps_poll(struct file *file, poll_table *wait)
2769*4882a593Smuzhiyun {
2770*4882a593Smuzhiyun struct seq_file *seq = file->private_data;
2771*4882a593Smuzhiyun
2772*4882a593Smuzhiyun poll_wait(file, &proc_poll_wait, wait);
2773*4882a593Smuzhiyun
2774*4882a593Smuzhiyun if (seq->poll_event != atomic_read(&proc_poll_event)) {
2775*4882a593Smuzhiyun seq->poll_event = atomic_read(&proc_poll_event);
2776*4882a593Smuzhiyun return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
2777*4882a593Smuzhiyun }
2778*4882a593Smuzhiyun
2779*4882a593Smuzhiyun return EPOLLIN | EPOLLRDNORM;
2780*4882a593Smuzhiyun }
2781*4882a593Smuzhiyun
2782*4882a593Smuzhiyun /* iterator */
2783*4882a593Smuzhiyun static void *swap_start(struct seq_file *swap, loff_t *pos)
2784*4882a593Smuzhiyun {
2785*4882a593Smuzhiyun struct swap_info_struct *si;
2786*4882a593Smuzhiyun int type;
2787*4882a593Smuzhiyun loff_t l = *pos;
2788*4882a593Smuzhiyun
2789*4882a593Smuzhiyun mutex_lock(&swapon_mutex);
2790*4882a593Smuzhiyun
2791*4882a593Smuzhiyun if (!l)
2792*4882a593Smuzhiyun return SEQ_START_TOKEN;
2793*4882a593Smuzhiyun
2794*4882a593Smuzhiyun for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
2795*4882a593Smuzhiyun if (!(si->flags & SWP_USED) || !si->swap_map)
2796*4882a593Smuzhiyun continue;
2797*4882a593Smuzhiyun if (!--l)
2798*4882a593Smuzhiyun return si;
2799*4882a593Smuzhiyun }
2800*4882a593Smuzhiyun
2801*4882a593Smuzhiyun return NULL;
2802*4882a593Smuzhiyun }
2803*4882a593Smuzhiyun
2804*4882a593Smuzhiyun static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2805*4882a593Smuzhiyun {
2806*4882a593Smuzhiyun struct swap_info_struct *si = v;
2807*4882a593Smuzhiyun int type;
2808*4882a593Smuzhiyun
2809*4882a593Smuzhiyun if (v == SEQ_START_TOKEN)
2810*4882a593Smuzhiyun type = 0;
2811*4882a593Smuzhiyun else
2812*4882a593Smuzhiyun type = si->type + 1;
2813*4882a593Smuzhiyun
2814*4882a593Smuzhiyun ++(*pos);
2815*4882a593Smuzhiyun for (; (si = swap_type_to_swap_info(type)); type++) {
2816*4882a593Smuzhiyun if (!(si->flags & SWP_USED) || !si->swap_map)
2817*4882a593Smuzhiyun continue;
2818*4882a593Smuzhiyun return si;
2819*4882a593Smuzhiyun }
2820*4882a593Smuzhiyun
2821*4882a593Smuzhiyun return NULL;
2822*4882a593Smuzhiyun }
2823*4882a593Smuzhiyun
2824*4882a593Smuzhiyun static void swap_stop(struct seq_file *swap, void *v)
2825*4882a593Smuzhiyun {
2826*4882a593Smuzhiyun mutex_unlock(&swapon_mutex);
2827*4882a593Smuzhiyun }
2828*4882a593Smuzhiyun
2829*4882a593Smuzhiyun static int swap_show(struct seq_file *swap, void *v)
2830*4882a593Smuzhiyun {
2831*4882a593Smuzhiyun struct swap_info_struct *si = v;
2832*4882a593Smuzhiyun struct file *file;
2833*4882a593Smuzhiyun int len;
2834*4882a593Smuzhiyun unsigned int bytes, inuse;
2835*4882a593Smuzhiyun
2836*4882a593Smuzhiyun if (si == SEQ_START_TOKEN) {
2837*4882a593Smuzhiyun seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
2838*4882a593Smuzhiyun return 0;
2839*4882a593Smuzhiyun }
2840*4882a593Smuzhiyun
2841*4882a593Smuzhiyun bytes = si->pages << (PAGE_SHIFT - 10);
2842*4882a593Smuzhiyun inuse = si->inuse_pages << (PAGE_SHIFT - 10);
2843*4882a593Smuzhiyun
2844*4882a593Smuzhiyun file = si->swap_file;
2845*4882a593Smuzhiyun len = seq_file_path(swap, file, " \t\n\\");
2846*4882a593Smuzhiyun seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
2847*4882a593Smuzhiyun len < 40 ? 40 - len : 1, " ",
2848*4882a593Smuzhiyun S_ISBLK(file_inode(file)->i_mode) ?
2849*4882a593Smuzhiyun "partition" : "file\t",
2850*4882a593Smuzhiyun bytes, bytes < 10000000 ? "\t" : "",
2851*4882a593Smuzhiyun inuse, inuse < 10000000 ? "\t" : "",
2852*4882a593Smuzhiyun si->prio);
2853*4882a593Smuzhiyun return 0;
2854*4882a593Smuzhiyun }
2855*4882a593Smuzhiyun
2856*4882a593Smuzhiyun static const struct seq_operations swaps_op = {
2857*4882a593Smuzhiyun .start = swap_start,
2858*4882a593Smuzhiyun .next = swap_next,
2859*4882a593Smuzhiyun .stop = swap_stop,
2860*4882a593Smuzhiyun .show = swap_show
2861*4882a593Smuzhiyun };
2862*4882a593Smuzhiyun
2863*4882a593Smuzhiyun static int swaps_open(struct inode *inode, struct file *file)
2864*4882a593Smuzhiyun {
2865*4882a593Smuzhiyun struct seq_file *seq;
2866*4882a593Smuzhiyun int ret;
2867*4882a593Smuzhiyun
2868*4882a593Smuzhiyun ret = seq_open(file, &swaps_op);
2869*4882a593Smuzhiyun if (ret)
2870*4882a593Smuzhiyun return ret;
2871*4882a593Smuzhiyun
2872*4882a593Smuzhiyun seq = file->private_data;
2873*4882a593Smuzhiyun seq->poll_event = atomic_read(&proc_poll_event);
2874*4882a593Smuzhiyun return 0;
2875*4882a593Smuzhiyun }
2876*4882a593Smuzhiyun
2877*4882a593Smuzhiyun static const struct proc_ops swaps_proc_ops = {
2878*4882a593Smuzhiyun .proc_flags = PROC_ENTRY_PERMANENT,
2879*4882a593Smuzhiyun .proc_open = swaps_open,
2880*4882a593Smuzhiyun .proc_read = seq_read,
2881*4882a593Smuzhiyun .proc_lseek = seq_lseek,
2882*4882a593Smuzhiyun .proc_release = seq_release,
2883*4882a593Smuzhiyun .proc_poll = swaps_poll,
2884*4882a593Smuzhiyun };
2885*4882a593Smuzhiyun
2886*4882a593Smuzhiyun static int __init procswaps_init(void)
2887*4882a593Smuzhiyun {
2888*4882a593Smuzhiyun proc_create("swaps", 0, NULL, &swaps_proc_ops);
2889*4882a593Smuzhiyun return 0;
2890*4882a593Smuzhiyun }
2891*4882a593Smuzhiyun __initcall(procswaps_init);
2892*4882a593Smuzhiyun #endif /* CONFIG_PROC_FS */
2893*4882a593Smuzhiyun
2894*4882a593Smuzhiyun #ifdef MAX_SWAPFILES_CHECK
2895*4882a593Smuzhiyun static int __init max_swapfiles_check(void)
2896*4882a593Smuzhiyun {
2897*4882a593Smuzhiyun MAX_SWAPFILES_CHECK();
2898*4882a593Smuzhiyun return 0;
2899*4882a593Smuzhiyun }
2900*4882a593Smuzhiyun late_initcall(max_swapfiles_check);
2901*4882a593Smuzhiyun #endif
2902*4882a593Smuzhiyun
2903*4882a593Smuzhiyun static struct swap_info_struct *alloc_swap_info(void)
2904*4882a593Smuzhiyun {
2905*4882a593Smuzhiyun struct swap_info_struct *p = NULL;
2906*4882a593Smuzhiyun struct swap_info_struct *defer = NULL;
2907*4882a593Smuzhiyun unsigned int type;
2908*4882a593Smuzhiyun int i;
2909*4882a593Smuzhiyun bool skip = false;
2910*4882a593Smuzhiyun
2911*4882a593Smuzhiyun trace_android_rvh_alloc_si(&p, &skip);
2912*4882a593Smuzhiyun trace_android_vh_alloc_si(&p, &skip);
2913*4882a593Smuzhiyun if (!skip)
2914*4882a593Smuzhiyun p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
2915*4882a593Smuzhiyun if (!p)
2916*4882a593Smuzhiyun return ERR_PTR(-ENOMEM);
2917*4882a593Smuzhiyun
2918*4882a593Smuzhiyun spin_lock(&swap_lock);
2919*4882a593Smuzhiyun for (type = 0; type < nr_swapfiles; type++) {
2920*4882a593Smuzhiyun if (!(swap_info[type]->flags & SWP_USED))
2921*4882a593Smuzhiyun break;
2922*4882a593Smuzhiyun }
2923*4882a593Smuzhiyun if (type >= MAX_SWAPFILES) {
2924*4882a593Smuzhiyun spin_unlock(&swap_lock);
2925*4882a593Smuzhiyun kvfree(p);
2926*4882a593Smuzhiyun return ERR_PTR(-EPERM);
2927*4882a593Smuzhiyun }
2928*4882a593Smuzhiyun if (type >= nr_swapfiles) {
2929*4882a593Smuzhiyun p->type = type;
2930*4882a593Smuzhiyun WRITE_ONCE(swap_info[type], p);
2931*4882a593Smuzhiyun /*
2932*4882a593Smuzhiyun * Write swap_info[type] before nr_swapfiles, in case a
2933*4882a593Smuzhiyun * racing procfs swap_start() or swap_next() is reading them.
2934*4882a593Smuzhiyun * (We never shrink nr_swapfiles, we never free this entry.)
2935*4882a593Smuzhiyun */
2936*4882a593Smuzhiyun smp_wmb();
2937*4882a593Smuzhiyun WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
2938*4882a593Smuzhiyun } else {
2939*4882a593Smuzhiyun defer = p;
2940*4882a593Smuzhiyun p = swap_info[type];
2941*4882a593Smuzhiyun /*
2942*4882a593Smuzhiyun * Do not memset this entry: a racing procfs swap_next()
2943*4882a593Smuzhiyun * would be relying on p->type to remain valid.
2944*4882a593Smuzhiyun */
2945*4882a593Smuzhiyun }
2946*4882a593Smuzhiyun p->swap_extent_root = RB_ROOT;
2947*4882a593Smuzhiyun plist_node_init(&p->list, 0);
2948*4882a593Smuzhiyun for_each_node(i)
2949*4882a593Smuzhiyun plist_node_init(&p->avail_lists[i], 0);
2950*4882a593Smuzhiyun p->flags = SWP_USED;
2951*4882a593Smuzhiyun spin_unlock(&swap_lock);
2952*4882a593Smuzhiyun kvfree(defer);
2953*4882a593Smuzhiyun spin_lock_init(&p->lock);
2954*4882a593Smuzhiyun spin_lock_init(&p->cont_lock);
2955*4882a593Smuzhiyun
2956*4882a593Smuzhiyun return p;
2957*4882a593Smuzhiyun }
2958*4882a593Smuzhiyun
2959*4882a593Smuzhiyun static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
2960*4882a593Smuzhiyun {
2961*4882a593Smuzhiyun int error;
2962*4882a593Smuzhiyun
2963*4882a593Smuzhiyun if (S_ISBLK(inode->i_mode)) {
2964*4882a593Smuzhiyun p->bdev = blkdev_get_by_dev(inode->i_rdev,
2965*4882a593Smuzhiyun FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2966*4882a593Smuzhiyun if (IS_ERR(p->bdev)) {
2967*4882a593Smuzhiyun error = PTR_ERR(p->bdev);
2968*4882a593Smuzhiyun p->bdev = NULL;
2969*4882a593Smuzhiyun return error;
2970*4882a593Smuzhiyun }
2971*4882a593Smuzhiyun p->old_block_size = block_size(p->bdev);
2972*4882a593Smuzhiyun error = set_blocksize(p->bdev, PAGE_SIZE);
2973*4882a593Smuzhiyun if (error < 0)
2974*4882a593Smuzhiyun return error;
2975*4882a593Smuzhiyun /*
2976*4882a593Smuzhiyun * Zoned block devices contain zones that have a sequential
2977*4882a593Smuzhiyun * write only restriction. Hence zoned block devices are not
2978*4882a593Smuzhiyun * suitable for swapping. Disallow them here.
2979*4882a593Smuzhiyun */
2980*4882a593Smuzhiyun if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
2981*4882a593Smuzhiyun return -EINVAL;
2982*4882a593Smuzhiyun p->flags |= SWP_BLKDEV;
2983*4882a593Smuzhiyun } else if (S_ISREG(inode->i_mode)) {
2984*4882a593Smuzhiyun p->bdev = inode->i_sb->s_bdev;
2985*4882a593Smuzhiyun }
2986*4882a593Smuzhiyun
2987*4882a593Smuzhiyun return 0;
2988*4882a593Smuzhiyun }
2989*4882a593Smuzhiyun
2990*4882a593Smuzhiyun
2991*4882a593Smuzhiyun /*
2992*4882a593Smuzhiyun * Find out how many pages are allowed for a single swap device. There
2993*4882a593Smuzhiyun * are two limiting factors:
2994*4882a593Smuzhiyun * 1) the number of bits for the swap offset in the swp_entry_t type, and
2995*4882a593Smuzhiyun * 2) the number of bits in the swap pte, as defined by the different
2996*4882a593Smuzhiyun * architectures.
2997*4882a593Smuzhiyun *
2998*4882a593Smuzhiyun * In order to find the largest possible bit mask, a swap entry with
2999*4882a593Smuzhiyun * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
3000*4882a593Smuzhiyun * decoded to a swp_entry_t again, and finally the swap offset is
3001*4882a593Smuzhiyun * extracted.
3002*4882a593Smuzhiyun *
3003*4882a593Smuzhiyun * This will mask all the bits from the initial ~0UL mask that can't
3004*4882a593Smuzhiyun * be encoded in either the swp_entry_t or the architecture definition
3005*4882a593Smuzhiyun * of a swap pte.
3006*4882a593Smuzhiyun */
3007*4882a593Smuzhiyun unsigned long generic_max_swapfile_size(void)
3008*4882a593Smuzhiyun {
3009*4882a593Smuzhiyun return swp_offset(pte_to_swp_entry(
3010*4882a593Smuzhiyun swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
3011*4882a593Smuzhiyun }
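
/*
 * Illustrative sketch, not part of the original file: the round-trip above,
 * spelled out step by step. max_swapfile_size_steps() is an assumed name;
 * the 50-bit figure in the comment is a made-up example - the real width
 * depends on the architecture's swap pte layout.
 */
static unsigned long __maybe_unused max_swapfile_size_steps(void)
{
	swp_entry_t probe = swp_entry(0, ~0UL);	  /* all offset bits set */
	pte_t pte = swp_entry_to_pte(probe);	  /* arch encoding drops bits */
	swp_entry_t back = pte_to_swp_entry(pte); /* keep what survived */

	/* e.g. a 50-bit pte offset field yields 2^50 as the page limit */
	return swp_offset(back) + 1;
}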
3012*4882a593Smuzhiyun
3013*4882a593Smuzhiyun /* Can be overridden by an architecture for additional checks. */
3014*4882a593Smuzhiyun __weak unsigned long max_swapfile_size(void)
3015*4882a593Smuzhiyun {
3016*4882a593Smuzhiyun return generic_max_swapfile_size();
3017*4882a593Smuzhiyun }
3018*4882a593Smuzhiyun
3019*4882a593Smuzhiyun static unsigned long read_swap_header(struct swap_info_struct *p,
3020*4882a593Smuzhiyun union swap_header *swap_header,
3021*4882a593Smuzhiyun struct inode *inode)
3022*4882a593Smuzhiyun {
3023*4882a593Smuzhiyun int i;
3024*4882a593Smuzhiyun unsigned long maxpages;
3025*4882a593Smuzhiyun unsigned long swapfilepages;
3026*4882a593Smuzhiyun unsigned long last_page;
3027*4882a593Smuzhiyun
3028*4882a593Smuzhiyun if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
3029*4882a593Smuzhiyun pr_err("Unable to find swap-space signature\n");
3030*4882a593Smuzhiyun return 0;
3031*4882a593Smuzhiyun }
3032*4882a593Smuzhiyun
3033*4882a593Smuzhiyun /* swap partition endianness hack... */
3034*4882a593Smuzhiyun if (swab32(swap_header->info.version) == 1) {
3035*4882a593Smuzhiyun swab32s(&swap_header->info.version);
3036*4882a593Smuzhiyun swab32s(&swap_header->info.last_page);
3037*4882a593Smuzhiyun swab32s(&swap_header->info.nr_badpages);
3038*4882a593Smuzhiyun if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
3039*4882a593Smuzhiyun return 0;
3040*4882a593Smuzhiyun for (i = 0; i < swap_header->info.nr_badpages; i++)
3041*4882a593Smuzhiyun swab32s(&swap_header->info.badpages[i]);
3042*4882a593Smuzhiyun }
3043*4882a593Smuzhiyun /* Check the swap header's sub-version */
3044*4882a593Smuzhiyun if (swap_header->info.version != 1) {
3045*4882a593Smuzhiyun pr_warn("Unable to handle swap header version %d\n",
3046*4882a593Smuzhiyun swap_header->info.version);
3047*4882a593Smuzhiyun return 0;
3048*4882a593Smuzhiyun }
3049*4882a593Smuzhiyun
3050*4882a593Smuzhiyun p->lowest_bit = 1;
3051*4882a593Smuzhiyun p->cluster_next = 1;
3052*4882a593Smuzhiyun p->cluster_nr = 0;
3053*4882a593Smuzhiyun
3054*4882a593Smuzhiyun maxpages = max_swapfile_size();
3055*4882a593Smuzhiyun last_page = swap_header->info.last_page;
3056*4882a593Smuzhiyun if (!last_page) {
3057*4882a593Smuzhiyun pr_warn("Empty swap-file\n");
3058*4882a593Smuzhiyun return 0;
3059*4882a593Smuzhiyun }
3060*4882a593Smuzhiyun if (last_page > maxpages) {
3061*4882a593Smuzhiyun pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
3062*4882a593Smuzhiyun maxpages << (PAGE_SHIFT - 10),
3063*4882a593Smuzhiyun last_page << (PAGE_SHIFT - 10));
3064*4882a593Smuzhiyun }
3065*4882a593Smuzhiyun if (maxpages > last_page) {
3066*4882a593Smuzhiyun maxpages = last_page + 1;
3067*4882a593Smuzhiyun /* p->max is an unsigned int: don't overflow it */
3068*4882a593Smuzhiyun if ((unsigned int)maxpages == 0)
3069*4882a593Smuzhiyun maxpages = UINT_MAX;
3070*4882a593Smuzhiyun }
3071*4882a593Smuzhiyun p->highest_bit = maxpages - 1;
3072*4882a593Smuzhiyun
3073*4882a593Smuzhiyun if (!maxpages)
3074*4882a593Smuzhiyun return 0;
3075*4882a593Smuzhiyun swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
3076*4882a593Smuzhiyun if (swapfilepages && maxpages > swapfilepages) {
3077*4882a593Smuzhiyun pr_warn("Swap area shorter than signature indicates\n");
3078*4882a593Smuzhiyun return 0;
3079*4882a593Smuzhiyun }
3080*4882a593Smuzhiyun if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
3081*4882a593Smuzhiyun return 0;
3082*4882a593Smuzhiyun if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
3083*4882a593Smuzhiyun return 0;
3084*4882a593Smuzhiyun
3085*4882a593Smuzhiyun return maxpages;
3086*4882a593Smuzhiyun }
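
/*
 * Illustrative sketch, not part of the original file: the endianness
 * detection used above. A version field written as 1 on a foreign-endian
 * machine reads back as 0x01000000, so swab32() restoring it to 1 is the
 * tell. swap_header_foreign_endian() is an assumed name for illustration.
 */
static bool __maybe_unused swap_header_foreign_endian(u32 raw_version)
{
	return raw_version != 1 && swab32(raw_version) == 1;
}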
3087*4882a593Smuzhiyun
3088*4882a593Smuzhiyun #define SWAP_CLUSTER_INFO_COLS \
3089*4882a593Smuzhiyun DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
3090*4882a593Smuzhiyun #define SWAP_CLUSTER_SPACE_COLS \
3091*4882a593Smuzhiyun DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
3092*4882a593Smuzhiyun #define SWAP_CLUSTER_COLS \
3093*4882a593Smuzhiyun max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
3094*4882a593Smuzhiyun
3095*4882a593Smuzhiyun static int setup_swap_map_and_extents(struct swap_info_struct *p,
3096*4882a593Smuzhiyun union swap_header *swap_header,
3097*4882a593Smuzhiyun unsigned char *swap_map,
3098*4882a593Smuzhiyun struct swap_cluster_info *cluster_info,
3099*4882a593Smuzhiyun unsigned long maxpages,
3100*4882a593Smuzhiyun sector_t *span)
3101*4882a593Smuzhiyun {
3102*4882a593Smuzhiyun unsigned int j, k;
3103*4882a593Smuzhiyun unsigned int nr_good_pages;
3104*4882a593Smuzhiyun int nr_extents;
3105*4882a593Smuzhiyun unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3106*4882a593Smuzhiyun unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
3107*4882a593Smuzhiyun unsigned long i, idx;
3108*4882a593Smuzhiyun
3109*4882a593Smuzhiyun nr_good_pages = maxpages - 1; /* omit header page */
3110*4882a593Smuzhiyun
3111*4882a593Smuzhiyun cluster_list_init(&p->free_clusters);
3112*4882a593Smuzhiyun cluster_list_init(&p->discard_clusters);
3113*4882a593Smuzhiyun
3114*4882a593Smuzhiyun for (i = 0; i < swap_header->info.nr_badpages; i++) {
3115*4882a593Smuzhiyun unsigned int page_nr = swap_header->info.badpages[i];
3116*4882a593Smuzhiyun if (page_nr == 0 || page_nr > swap_header->info.last_page)
3117*4882a593Smuzhiyun return -EINVAL;
3118*4882a593Smuzhiyun if (page_nr < maxpages) {
3119*4882a593Smuzhiyun swap_map[page_nr] = SWAP_MAP_BAD;
3120*4882a593Smuzhiyun nr_good_pages--;
3121*4882a593Smuzhiyun /*
3122*4882a593Smuzhiyun * Haven't marked the cluster free yet, no list
3123*4882a593Smuzhiyun * operation involved
3124*4882a593Smuzhiyun */
3125*4882a593Smuzhiyun inc_cluster_info_page(p, cluster_info, page_nr);
3126*4882a593Smuzhiyun }
3127*4882a593Smuzhiyun }
3128*4882a593Smuzhiyun
3129*4882a593Smuzhiyun /* Haven't marked the cluster free yet, no list operation involved */
3130*4882a593Smuzhiyun for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
3131*4882a593Smuzhiyun inc_cluster_info_page(p, cluster_info, i);
3132*4882a593Smuzhiyun
3133*4882a593Smuzhiyun if (nr_good_pages) {
3134*4882a593Smuzhiyun swap_map[0] = SWAP_MAP_BAD;
3135*4882a593Smuzhiyun /*
3136*4882a593Smuzhiyun * Haven't marked the cluster free yet, no list
3137*4882a593Smuzhiyun * operation involved
3138*4882a593Smuzhiyun */
3139*4882a593Smuzhiyun inc_cluster_info_page(p, cluster_info, 0);
3140*4882a593Smuzhiyun p->max = maxpages;
3141*4882a593Smuzhiyun p->pages = nr_good_pages;
3142*4882a593Smuzhiyun nr_extents = setup_swap_extents(p, span);
3143*4882a593Smuzhiyun if (nr_extents < 0)
3144*4882a593Smuzhiyun return nr_extents;
3145*4882a593Smuzhiyun nr_good_pages = p->pages;
3146*4882a593Smuzhiyun }
3147*4882a593Smuzhiyun if (!nr_good_pages) {
3148*4882a593Smuzhiyun pr_warn("Empty swap-file\n");
3149*4882a593Smuzhiyun return -EINVAL;
3150*4882a593Smuzhiyun }
3151*4882a593Smuzhiyun
3152*4882a593Smuzhiyun if (!cluster_info)
3153*4882a593Smuzhiyun return nr_extents;
3154*4882a593Smuzhiyun
3155*4882a593Smuzhiyun
3156*4882a593Smuzhiyun /*
3157*4882a593Smuzhiyun * Reduce false cache line sharing between cluster_info entries
3158*4882a593Smuzhiyun * that share the same swap address space.
3159*4882a593Smuzhiyun */
3160*4882a593Smuzhiyun for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
3161*4882a593Smuzhiyun j = (k + col) % SWAP_CLUSTER_COLS;
3162*4882a593Smuzhiyun for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
3163*4882a593Smuzhiyun idx = i * SWAP_CLUSTER_COLS + j;
3164*4882a593Smuzhiyun if (idx >= nr_clusters)
3165*4882a593Smuzhiyun continue;
3166*4882a593Smuzhiyun if (cluster_count(&cluster_info[idx]))
3167*4882a593Smuzhiyun continue;
3168*4882a593Smuzhiyun cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
3169*4882a593Smuzhiyun cluster_list_add_tail(&p->free_clusters, cluster_info,
3170*4882a593Smuzhiyun idx);
3171*4882a593Smuzhiyun }
3172*4882a593Smuzhiyun }
3173*4882a593Smuzhiyun return nr_extents;
3174*4882a593Smuzhiyun }
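
/*
 * Illustrative example, not part of the original file: the
 * column-interleaved walk above, for made-up numbers. Assuming all ten
 * clusters are free, with nr_clusters = 10 and SWAP_CLUSTER_COLS = 4
 * (col = 0), free clusters are queued in the order 0,4,8, 1,5,9, 2,6,
 * 3,7 - so cluster_info entries that are adjacent in memory end up far
 * apart on the free list, which is what reduces false cache line sharing.
 */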
3175*4882a593Smuzhiyun
3176*4882a593Smuzhiyun /*
3177*4882a593Smuzhiyun * Helper for sys_swapon() to determine whether a given swap
3178*4882a593Smuzhiyun * backing device's queue supports DISCARD operations.
3179*4882a593Smuzhiyun */
3180*4882a593Smuzhiyun static bool swap_discardable(struct swap_info_struct *si)
3181*4882a593Smuzhiyun {
3182*4882a593Smuzhiyun struct request_queue *q = bdev_get_queue(si->bdev);
3183*4882a593Smuzhiyun
3184*4882a593Smuzhiyun if (!q || !blk_queue_discard(q))
3185*4882a593Smuzhiyun return false;
3186*4882a593Smuzhiyun
3187*4882a593Smuzhiyun return true;
3188*4882a593Smuzhiyun }
3189*4882a593Smuzhiyun
3190*4882a593Smuzhiyun SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
3191*4882a593Smuzhiyun {
3192*4882a593Smuzhiyun struct swap_info_struct *p;
3193*4882a593Smuzhiyun struct filename *name;
3194*4882a593Smuzhiyun struct file *swap_file = NULL;
3195*4882a593Smuzhiyun struct address_space *mapping;
3196*4882a593Smuzhiyun int prio;
3197*4882a593Smuzhiyun int error;
3198*4882a593Smuzhiyun union swap_header *swap_header;
3199*4882a593Smuzhiyun int nr_extents;
3200*4882a593Smuzhiyun sector_t span;
3201*4882a593Smuzhiyun unsigned long maxpages;
3202*4882a593Smuzhiyun unsigned char *swap_map = NULL;
3203*4882a593Smuzhiyun struct swap_cluster_info *cluster_info = NULL;
3204*4882a593Smuzhiyun unsigned long *frontswap_map = NULL;
3205*4882a593Smuzhiyun struct page *page = NULL;
3206*4882a593Smuzhiyun struct inode *inode = NULL;
3207*4882a593Smuzhiyun bool inced_nr_rotate_swap = false;
3208*4882a593Smuzhiyun
3209*4882a593Smuzhiyun if (swap_flags & ~SWAP_FLAGS_VALID)
3210*4882a593Smuzhiyun return -EINVAL;
3211*4882a593Smuzhiyun
3212*4882a593Smuzhiyun if (!capable(CAP_SYS_ADMIN))
3213*4882a593Smuzhiyun return -EPERM;
3214*4882a593Smuzhiyun
3215*4882a593Smuzhiyun if (!swap_avail_heads)
3216*4882a593Smuzhiyun return -ENOMEM;
3217*4882a593Smuzhiyun
3218*4882a593Smuzhiyun p = alloc_swap_info();
3219*4882a593Smuzhiyun if (IS_ERR(p))
3220*4882a593Smuzhiyun return PTR_ERR(p);
3221*4882a593Smuzhiyun
3222*4882a593Smuzhiyun INIT_WORK(&p->discard_work, swap_discard_work);
3223*4882a593Smuzhiyun
3224*4882a593Smuzhiyun name = getname(specialfile);
3225*4882a593Smuzhiyun if (IS_ERR(name)) {
3226*4882a593Smuzhiyun error = PTR_ERR(name);
3227*4882a593Smuzhiyun name = NULL;
3228*4882a593Smuzhiyun goto bad_swap;
3229*4882a593Smuzhiyun }
3230*4882a593Smuzhiyun swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
3231*4882a593Smuzhiyun if (IS_ERR(swap_file)) {
3232*4882a593Smuzhiyun error = PTR_ERR(swap_file);
3233*4882a593Smuzhiyun swap_file = NULL;
3234*4882a593Smuzhiyun goto bad_swap;
3235*4882a593Smuzhiyun }
3236*4882a593Smuzhiyun
3237*4882a593Smuzhiyun p->swap_file = swap_file;
3238*4882a593Smuzhiyun mapping = swap_file->f_mapping;
3239*4882a593Smuzhiyun inode = mapping->host;
3240*4882a593Smuzhiyun
3241*4882a593Smuzhiyun error = claim_swapfile(p, inode);
3242*4882a593Smuzhiyun if (unlikely(error))
3243*4882a593Smuzhiyun goto bad_swap;
3244*4882a593Smuzhiyun
3245*4882a593Smuzhiyun inode_lock(inode);
3246*4882a593Smuzhiyun if (IS_SWAPFILE(inode)) {
3247*4882a593Smuzhiyun error = -EBUSY;
3248*4882a593Smuzhiyun goto bad_swap_unlock_inode;
3249*4882a593Smuzhiyun }
3250*4882a593Smuzhiyun
3251*4882a593Smuzhiyun /*
3252*4882a593Smuzhiyun * Read the swap header.
3253*4882a593Smuzhiyun */
3254*4882a593Smuzhiyun if (!mapping->a_ops->readpage) {
3255*4882a593Smuzhiyun error = -EINVAL;
3256*4882a593Smuzhiyun goto bad_swap_unlock_inode;
3257*4882a593Smuzhiyun }
3258*4882a593Smuzhiyun page = read_mapping_page(mapping, 0, swap_file);
3259*4882a593Smuzhiyun if (IS_ERR(page)) {
3260*4882a593Smuzhiyun error = PTR_ERR(page);
3261*4882a593Smuzhiyun goto bad_swap_unlock_inode;
3262*4882a593Smuzhiyun }
3263*4882a593Smuzhiyun swap_header = kmap(page);
3264*4882a593Smuzhiyun
3265*4882a593Smuzhiyun maxpages = read_swap_header(p, swap_header, inode);
3266*4882a593Smuzhiyun if (unlikely(!maxpages)) {
3267*4882a593Smuzhiyun error = -EINVAL;
3268*4882a593Smuzhiyun goto bad_swap_unlock_inode;
3269*4882a593Smuzhiyun }
3270*4882a593Smuzhiyun
3271*4882a593Smuzhiyun /* OK, set up the swap map and apply the bad block list */
3272*4882a593Smuzhiyun swap_map = vzalloc(maxpages);
3273*4882a593Smuzhiyun if (!swap_map) {
3274*4882a593Smuzhiyun error = -ENOMEM;
3275*4882a593Smuzhiyun goto bad_swap_unlock_inode;
3276*4882a593Smuzhiyun }
3277*4882a593Smuzhiyun
3278*4882a593Smuzhiyun if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
3279*4882a593Smuzhiyun p->flags |= SWP_STABLE_WRITES;
3280*4882a593Smuzhiyun
3281*4882a593Smuzhiyun if (p->bdev && p->bdev->bd_disk->fops->rw_page)
3282*4882a593Smuzhiyun p->flags |= SWP_SYNCHRONOUS_IO;
3283*4882a593Smuzhiyun
3284*4882a593Smuzhiyun if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
3285*4882a593Smuzhiyun int cpu;
3286*4882a593Smuzhiyun unsigned long ci, nr_cluster;
3287*4882a593Smuzhiyun
3288*4882a593Smuzhiyun p->flags |= SWP_SOLIDSTATE;
3289*4882a593Smuzhiyun p->cluster_next_cpu = alloc_percpu(unsigned int);
3290*4882a593Smuzhiyun if (!p->cluster_next_cpu) {
3291*4882a593Smuzhiyun error = -ENOMEM;
3292*4882a593Smuzhiyun goto bad_swap_unlock_inode;
3293*4882a593Smuzhiyun }
3294*4882a593Smuzhiyun /*
3295*4882a593Smuzhiyun * select a random position to start with, to help with wear
3296*4882a593Smuzhiyun * leveling on SSDs
3297*4882a593Smuzhiyun */
3298*4882a593Smuzhiyun for_each_possible_cpu(cpu) {
3299*4882a593Smuzhiyun per_cpu(*p->cluster_next_cpu, cpu) =
3300*4882a593Smuzhiyun 1 + prandom_u32_max(p->highest_bit);
3301*4882a593Smuzhiyun }
3302*4882a593Smuzhiyun nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3303*4882a593Smuzhiyun
3304*4882a593Smuzhiyun cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
3305*4882a593Smuzhiyun GFP_KERNEL);
3306*4882a593Smuzhiyun if (!cluster_info) {
3307*4882a593Smuzhiyun error = -ENOMEM;
3308*4882a593Smuzhiyun goto bad_swap_unlock_inode;
3309*4882a593Smuzhiyun }
3310*4882a593Smuzhiyun
3311*4882a593Smuzhiyun for (ci = 0; ci < nr_cluster; ci++)
3312*4882a593Smuzhiyun spin_lock_init(&((cluster_info + ci)->lock));
3313*4882a593Smuzhiyun
3314*4882a593Smuzhiyun p->percpu_cluster = alloc_percpu(struct percpu_cluster);
3315*4882a593Smuzhiyun if (!p->percpu_cluster) {
3316*4882a593Smuzhiyun error = -ENOMEM;
3317*4882a593Smuzhiyun goto bad_swap_unlock_inode;
3318*4882a593Smuzhiyun }
3319*4882a593Smuzhiyun for_each_possible_cpu(cpu) {
3320*4882a593Smuzhiyun struct percpu_cluster *cluster;
3321*4882a593Smuzhiyun cluster = per_cpu_ptr(p->percpu_cluster, cpu);
3322*4882a593Smuzhiyun cluster_set_null(&cluster->index);
3323*4882a593Smuzhiyun }
3324*4882a593Smuzhiyun } else {
3325*4882a593Smuzhiyun atomic_inc(&nr_rotate_swap);
3326*4882a593Smuzhiyun inced_nr_rotate_swap = true;
3327*4882a593Smuzhiyun }
3328*4882a593Smuzhiyun
3329*4882a593Smuzhiyun error = swap_cgroup_swapon(p->type, maxpages);
3330*4882a593Smuzhiyun if (error)
3331*4882a593Smuzhiyun goto bad_swap_unlock_inode;
3332*4882a593Smuzhiyun
3333*4882a593Smuzhiyun nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
3334*4882a593Smuzhiyun cluster_info, maxpages, &span);
3335*4882a593Smuzhiyun if (unlikely(nr_extents < 0)) {
3336*4882a593Smuzhiyun error = nr_extents;
3337*4882a593Smuzhiyun goto bad_swap_unlock_inode;
3338*4882a593Smuzhiyun }
3339*4882a593Smuzhiyun /* frontswap enabled? set up bit-per-page map for frontswap */
3340*4882a593Smuzhiyun if (IS_ENABLED(CONFIG_FRONTSWAP))
3341*4882a593Smuzhiyun frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
3342*4882a593Smuzhiyun sizeof(long),
3343*4882a593Smuzhiyun GFP_KERNEL);
3344*4882a593Smuzhiyun
3345*4882a593Smuzhiyun if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
3346*4882a593Smuzhiyun /*
3347*4882a593Smuzhiyun * When discard is enabled for swap with no particular
3348*4882a593Smuzhiyun * policy flagged, we set all swap discard flags here in
3349*4882a593Smuzhiyun * order to sustain backward compatibility with older
3350*4882a593Smuzhiyun * swapon(8) releases.
3351*4882a593Smuzhiyun */
3352*4882a593Smuzhiyun p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
3353*4882a593Smuzhiyun SWP_PAGE_DISCARD);
3354*4882a593Smuzhiyun
3355*4882a593Smuzhiyun /*
3356*4882a593Smuzhiyun * By flagging sys_swapon, a sysadmin can tell us to
3357*4882a593Smuzhiyun * either do single-time area discards only, or to just
3358*4882a593Smuzhiyun * perform discards for released swap page-clusters.
3359*4882a593Smuzhiyun * Now it's time to adjust the p->flags accordingly.
3360*4882a593Smuzhiyun */
3361*4882a593Smuzhiyun if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
3362*4882a593Smuzhiyun p->flags &= ~SWP_PAGE_DISCARD;
3363*4882a593Smuzhiyun else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
3364*4882a593Smuzhiyun p->flags &= ~SWP_AREA_DISCARD;
3365*4882a593Smuzhiyun
3366*4882a593Smuzhiyun /* issue a swapon-time discard if it's still required */
3367*4882a593Smuzhiyun if (p->flags & SWP_AREA_DISCARD) {
3368*4882a593Smuzhiyun int err = discard_swap(p);
3369*4882a593Smuzhiyun if (unlikely(err))
3370*4882a593Smuzhiyun pr_err("swapon: discard_swap(%p): %d\n",
3371*4882a593Smuzhiyun p, err);
3372*4882a593Smuzhiyun }
3373*4882a593Smuzhiyun }
3374*4882a593Smuzhiyun
3375*4882a593Smuzhiyun error = init_swap_address_space(p->type, maxpages);
3376*4882a593Smuzhiyun if (error)
3377*4882a593Smuzhiyun goto bad_swap_unlock_inode;
3378*4882a593Smuzhiyun
3379*4882a593Smuzhiyun /*
3380*4882a593Smuzhiyun * Flush any pending IO and dirty mappings before we start using this
3381*4882a593Smuzhiyun * swap device.
3382*4882a593Smuzhiyun */
3383*4882a593Smuzhiyun inode->i_flags |= S_SWAPFILE;
3384*4882a593Smuzhiyun error = inode_drain_writes(inode);
3385*4882a593Smuzhiyun if (error) {
3386*4882a593Smuzhiyun inode->i_flags &= ~S_SWAPFILE;
3387*4882a593Smuzhiyun goto free_swap_address_space;
3388*4882a593Smuzhiyun }
3389*4882a593Smuzhiyun
3390*4882a593Smuzhiyun mutex_lock(&swapon_mutex);
3391*4882a593Smuzhiyun prio = -1;
3392*4882a593Smuzhiyun if (swap_flags & SWAP_FLAG_PREFER)
3393*4882a593Smuzhiyun prio =
3394*4882a593Smuzhiyun (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
3395*4882a593Smuzhiyun enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
3396*4882a593Smuzhiyun
3397*4882a593Smuzhiyun trace_android_vh_init_swap_info_struct(p, swap_avail_heads);
3398*4882a593Smuzhiyun pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
3399*4882a593Smuzhiyun p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
3400*4882a593Smuzhiyun nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
3401*4882a593Smuzhiyun (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
3402*4882a593Smuzhiyun (p->flags & SWP_DISCARDABLE) ? "D" : "",
3403*4882a593Smuzhiyun (p->flags & SWP_AREA_DISCARD) ? "s" : "",
3404*4882a593Smuzhiyun (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
3405*4882a593Smuzhiyun (frontswap_map) ? "FS" : "");
3406*4882a593Smuzhiyun
3407*4882a593Smuzhiyun mutex_unlock(&swapon_mutex);
3408*4882a593Smuzhiyun atomic_inc(&proc_poll_event);
3409*4882a593Smuzhiyun wake_up_interruptible(&proc_poll_wait);
3410*4882a593Smuzhiyun
3411*4882a593Smuzhiyun error = 0;
3412*4882a593Smuzhiyun goto out;
3413*4882a593Smuzhiyun free_swap_address_space:
3414*4882a593Smuzhiyun exit_swap_address_space(p->type);
3415*4882a593Smuzhiyun bad_swap_unlock_inode:
3416*4882a593Smuzhiyun inode_unlock(inode);
3417*4882a593Smuzhiyun bad_swap:
3418*4882a593Smuzhiyun free_percpu(p->percpu_cluster);
3419*4882a593Smuzhiyun p->percpu_cluster = NULL;
3420*4882a593Smuzhiyun free_percpu(p->cluster_next_cpu);
3421*4882a593Smuzhiyun p->cluster_next_cpu = NULL;
3422*4882a593Smuzhiyun if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
3423*4882a593Smuzhiyun set_blocksize(p->bdev, p->old_block_size);
3424*4882a593Smuzhiyun blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3425*4882a593Smuzhiyun }
3426*4882a593Smuzhiyun inode = NULL;
3427*4882a593Smuzhiyun destroy_swap_extents(p);
3428*4882a593Smuzhiyun swap_cgroup_swapoff(p->type);
3429*4882a593Smuzhiyun spin_lock(&swap_lock);
3430*4882a593Smuzhiyun p->swap_file = NULL;
3431*4882a593Smuzhiyun p->flags = 0;
3432*4882a593Smuzhiyun spin_unlock(&swap_lock);
3433*4882a593Smuzhiyun vfree(swap_map);
3434*4882a593Smuzhiyun kvfree(cluster_info);
3435*4882a593Smuzhiyun kvfree(frontswap_map);
3436*4882a593Smuzhiyun if (inced_nr_rotate_swap)
3437*4882a593Smuzhiyun atomic_dec(&nr_rotate_swap);
3438*4882a593Smuzhiyun if (swap_file)
3439*4882a593Smuzhiyun filp_close(swap_file, NULL);
3440*4882a593Smuzhiyun out:
3441*4882a593Smuzhiyun if (page && !IS_ERR(page)) {
3442*4882a593Smuzhiyun kunmap(page);
3443*4882a593Smuzhiyun put_page(page);
3444*4882a593Smuzhiyun }
3445*4882a593Smuzhiyun if (name)
3446*4882a593Smuzhiyun putname(name);
3447*4882a593Smuzhiyun if (inode)
3448*4882a593Smuzhiyun inode_unlock(inode);
3449*4882a593Smuzhiyun if (!error)
3450*4882a593Smuzhiyun enable_swap_slots_cache();
3451*4882a593Smuzhiyun return error;
3452*4882a593Smuzhiyun }
3453*4882a593Smuzhiyun
3454*4882a593Smuzhiyun void si_swapinfo(struct sysinfo *val)
3455*4882a593Smuzhiyun {
3456*4882a593Smuzhiyun unsigned int type;
3457*4882a593Smuzhiyun unsigned long nr_to_be_unused = 0;
3458*4882a593Smuzhiyun
3459*4882a593Smuzhiyun spin_lock(&swap_lock);
3460*4882a593Smuzhiyun for (type = 0; type < nr_swapfiles; type++) {
3461*4882a593Smuzhiyun struct swap_info_struct *si = swap_info[type];
3462*4882a593Smuzhiyun bool skip = false;
3463*4882a593Smuzhiyun
3464*4882a593Smuzhiyun trace_android_vh_si_swapinfo(si, &skip);
3465*4882a593Smuzhiyun if (!skip && (si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3466*4882a593Smuzhiyun nr_to_be_unused += si->inuse_pages;
3467*4882a593Smuzhiyun }
3468*4882a593Smuzhiyun val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
3469*4882a593Smuzhiyun val->totalswap = total_swap_pages + nr_to_be_unused;
3470*4882a593Smuzhiyun spin_unlock(&swap_lock);
3471*4882a593Smuzhiyun }
3472*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(si_swapinfo);

/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Returns 0 on success, or an error code as follows:
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is a migration entry -> EINVAL
 * - a swap-cache reference is requested but there is already one -> EEXIST
 * - a swap-cache reference is requested but the entry is not used -> ENOENT
 * - a swap-mapped reference is requested but needs continued swap count -> ENOMEM
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
        struct swap_info_struct *p;
        struct swap_cluster_info *ci;
        unsigned long offset;
        unsigned char count;
        unsigned char has_cache;
        int err = -EINVAL;

        p = get_swap_device(entry);
        if (!p)
                goto out;

        offset = swp_offset(entry);
        ci = lock_cluster_or_swap_info(p, offset);

        count = p->swap_map[offset];

        /*
         * swapin_readahead() doesn't check if a swap entry is valid, so the
         * swap entry could be SWAP_MAP_BAD. Check here with lock held.
         */
        if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
                err = -ENOENT;
                goto unlock_out;
        }

        has_cache = count & SWAP_HAS_CACHE;
        count &= ~SWAP_HAS_CACHE;
        err = 0;

        if (usage == SWAP_HAS_CACHE) {

                /* set SWAP_HAS_CACHE if there is no cache and entry is used */
                if (!has_cache && count)
                        has_cache = SWAP_HAS_CACHE;
                else if (has_cache)             /* someone else added cache */
                        err = -EEXIST;
                else                            /* no users remaining */
                        err = -ENOENT;

        } else if (count || has_cache) {

                if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
                        count += usage;
                else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
                        err = -EINVAL;
                else if (swap_count_continued(p, offset, count))
                        count = COUNT_CONTINUED;
                else
                        err = -ENOMEM;
        } else
                err = -ENOENT;                  /* unused swap entry */

        WRITE_ONCE(p->swap_map[offset], count | has_cache);

unlock_out:
        unlock_cluster_or_swap_info(p, ci);
out:
        if (p)
                put_swap_device(p);
        return err;
}

/*
 * Help swapoff by noting that a swap entry belongs to shmem/tmpfs
 * (in which case its reference count is never incremented).
 */
void swap_shmem_alloc(swp_entry_t entry)
{
        __swap_duplicate(entry, SWAP_MAP_SHMEM);
}

/*
 * Increase reference count of swap entry by 1.
 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
 * but could not be atomically allocated. Returns 0, just as if it succeeded,
 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
 * might occur if a page table entry has got corrupted.
 */
int swap_duplicate(swp_entry_t entry)
{
        int err = 0;

        while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
                err = add_swap_count_continuation(entry, GFP_ATOMIC);
        return err;
}
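
/*
 * Illustrative sketch (not part of the original file): the two-phase
 * retry that the add_swap_count_continuation() comment further below
 * describes. A caller holding a page table lock lets swap_duplicate()
 * try GFP_ATOMIC first, then drops its locks and retries with
 * GFP_KERNEL. The label and lock variable are hypothetical names.
 *
 *      again:
 *              spin_lock(ptl);
 *              if (swap_duplicate(entry) == -ENOMEM) {
 *                      spin_unlock(ptl);
 *                      if (add_swap_count_continuation(entry, GFP_KERNEL))
 *                              return -ENOMEM;  (truly out of memory)
 *                      goto again;              (retake locks, retry)
 *              }
 *              ...
 *              spin_unlock(ptl);
 */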

/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for an existing swap entry.
 * Returns 0 on success, or an error code:
 * -EEXIST means there is already a swap cache for this entry.
 * Note: return code is different from swap_duplicate().
 */
int swapcache_prepare(swp_entry_t entry)
{
        return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
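
/*
 * Illustrative sketch (not part of the original file): how a swap-cache
 * allocator typically interprets swapcache_prepare()'s return codes, in
 * the style of __read_swap_cache_async(). This is a sketch of the
 * pattern, not a copy of that function.
 *
 *      err = swapcache_prepare(entry);
 *      if (!err)
 *              ;  (we own SWAP_HAS_CACHE: add the page to swap cache)
 *      else if (err == -EEXIST)
 *              ;  (someone else is adding it: back off and retry)
 *      else
 *              ;  (e.g. -ENOENT, the entry went away: give up)
 */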

struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
        return swap_type_to_swap_info(swp_type(entry));
}
EXPORT_SYMBOL_GPL(swp_swap_info);

struct swap_info_struct *page_swap_info(struct page *page)
{
        swp_entry_t entry = { .val = page_private(page) };
        return swp_swap_info(entry);
}

/*
 * out-of-line __page_file_ methods to avoid include hell.
 */
struct address_space *__page_file_mapping(struct page *page)
{
        return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);

pgoff_t __page_file_index(struct page *page)
{
        swp_entry_t swap = { .val = page_private(page) };
        return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);
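
/*
 * Illustrative sketch (not part of the original file): combining the
 * accessors above to locate a swapped page's backing store. For a page
 * in swap cache, page_private() holds its swp_entry_t, so:
 *
 *      struct address_space *mapping = __page_file_mapping(page);
 *      loff_t pos = (loff_t)__page_file_index(page) << PAGE_SHIFT;
 *
 * gives the swap file's mapping and the byte offset of the page's slot
 * within it (slot 0 is reserved for the swap header, so it never holds
 * page data).
 */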

/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX: it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
 * again when a count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
 * can be called after dropping locks.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
        struct swap_info_struct *si;
        struct swap_cluster_info *ci;
        struct page *head;
        struct page *page;
        struct page *list_page;
        pgoff_t offset;
        unsigned char count;
        int ret = 0;

        /*
         * When debugging, it's easier to use __GFP_ZERO here; but it's better
         * for latency not to zero a page while GFP_ATOMIC and holding locks.
         */
        page = alloc_page(gfp_mask | __GFP_HIGHMEM);

        si = get_swap_device(entry);
        if (!si) {
                /*
                 * An acceptable race has occurred since the failing
                 * __swap_duplicate(): the swap device may have been
                 * swapped off.
                 */
                goto outer;
        }
        spin_lock(&si->lock);

        offset = swp_offset(entry);

        ci = lock_cluster(si, offset);

        count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

        if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
                /*
                 * The higher the swap count, the more likely it is that tasks
                 * will race to add swap count continuation: we need to avoid
                 * over-provisioning.
                 */
                goto out;
        }

        if (!page) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * We are fortunate that although vmalloc_to_page uses pte_offset_map,
         * no architecture is using highmem pages for kernel page tables: so it
         * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
         */
        head = vmalloc_to_page(si->swap_map + offset);
        offset &= ~PAGE_MASK;

        spin_lock(&si->cont_lock);
        /*
         * Page allocation does not initialize the page's lru field,
         * but it does always reset its private field.
         */
        if (!page_private(head)) {
                BUG_ON(count & COUNT_CONTINUED);
                INIT_LIST_HEAD(&head->lru);
                set_page_private(head, SWP_CONTINUED);
                si->flags |= SWP_CONTINUED;
        }

        list_for_each_entry(list_page, &head->lru, lru) {
                unsigned char *map;

                /*
                 * If the previous map said no continuation, but we've found
                 * a continuation page, free our allocation and use this one.
                 */
                if (!(count & COUNT_CONTINUED))
                        goto out_unlock_cont;

                map = kmap_atomic(list_page) + offset;
                count = *map;
                kunmap_atomic(map);

                /*
                 * If this continuation count now has some space in it,
                 * free our allocation and use this one.
                 */
                if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
                        goto out_unlock_cont;
        }

        list_add_tail(&page->lru, &head->lru);
        page = NULL;                    /* now it's attached, don't free it */
out_unlock_cont:
        spin_unlock(&si->cont_lock);
out:
        unlock_cluster(ci);
        spin_unlock(&si->lock);
        put_swap_device(si);
outer:
        if (page)
                __free_page(page);
        return ret;
}
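
/*
 * Illustrative sketch (not part of the original file): where the extra
 * "digits" live. For a swap entry at @offset, the primary count byte is
 * swap_map[offset]; vmalloc_to_page(si->swap_map + offset) names the
 * swap_map page holding it, and every continuation page chained on that
 * head page's lru list carries one more byte for the same entry at the
 * same (offset & ~PAGE_MASK) position:
 *
 *      head = vmalloc_to_page(si->swap_map + offset);
 *      list_for_each_entry(cont, &head->lru, lru) {
 *              map = kmap_atomic(cont) + (offset & ~PAGE_MASK);
 *              (*map is the next higher digit of this entry's count)
 *              kunmap_atomic(map);
 *      }
 *
 * So one continuation page extends the counts of all PAGE_SIZE entries
 * covered by that swap_map page at once.
 */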

/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
 * lock.
 */
static bool swap_count_continued(struct swap_info_struct *si,
                                 pgoff_t offset, unsigned char count)
{
        struct page *head;
        struct page *page;
        unsigned char *map;
        bool ret;

        head = vmalloc_to_page(si->swap_map + offset);
        if (page_private(head) != SWP_CONTINUED) {
                BUG_ON(count & COUNT_CONTINUED);
                return false;           /* need to add count continuation */
        }

        spin_lock(&si->cont_lock);
        offset &= ~PAGE_MASK;
        page = list_next_entry(head, lru);
        map = kmap_atomic(page) + offset;

        if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
                goto init_map;          /* jump over SWAP_CONT_MAX checks */

        if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
                /*
                 * Think of how you add 1 to 999
                 */
                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
                        kunmap_atomic(map);
                        page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_atomic(page) + offset;
                }
                if (*map == SWAP_CONT_MAX) {
                        kunmap_atomic(map);
                        page = list_next_entry(page, lru);
                        if (page == head) {
                                ret = false;    /* add count continuation */
                                goto out;
                        }
                        map = kmap_atomic(page) + offset;
init_map:               *map = 0;               /* we didn't zero the page */
                }
                *map += 1;
                kunmap_atomic(map);
                while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_atomic(page) + offset;
                        *map = COUNT_CONTINUED;
                        kunmap_atomic(map);
                }
                ret = true;                     /* incremented */

        } else {                                /* decrementing */
                /*
                 * Think of how you subtract 1 from 1000
                 */
                BUG_ON(count != COUNT_CONTINUED);
                while (*map == COUNT_CONTINUED) {
                        kunmap_atomic(map);
                        page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_atomic(page) + offset;
                }
                BUG_ON(*map == 0);
                *map -= 1;
                if (*map == 0)
                        count = 0;
                kunmap_atomic(map);
                while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_atomic(page) + offset;
                        *map = SWAP_CONT_MAX | count;
                        count = COUNT_CONTINUED;
                        kunmap_atomic(map);
                }
                ret = count == COUNT_CONTINUED;
        }
out:
        spin_unlock(&si->cont_lock);
        return ret;
}
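
/*
 * Illustrative worked example (not part of the original file), assuming
 * the usual <linux/swap.h> values SWAP_MAP_MAX = 0x3e and
 * SWAP_CONT_MAX = 0x7f. The count is kept like a multi-digit number:
 * the swap_map byte is the low digit, each continuation page one higher
 * digit. Incrementing past full low digits behaves like 999 + 1:
 *
 *      swap_map[offset] == SWAP_MAP_MAX | COUNT_CONTINUED   (low digit full)
 *      cont1[offset]    == SWAP_CONT_MAX | COUNT_CONTINUED  (also full)
 *      cont2[offset]    == 2
 *
 * one more reference carries through both full digits:
 *
 *      swap_map[offset] -> COUNT_CONTINUED      (wraps to 0, flag kept)
 *      cont1[offset]    -> COUNT_CONTINUED      (wraps to 0, flag kept)
 *      cont2[offset]    -> 3                    (receives the carry)
 *
 * and decrementing that count back borrows in the opposite direction,
 * exactly as swap_count_continued() implements above.
 */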

/*
 * free_swap_count_continuations - called by swapoff to free all the
 * continuation pages appended to the swap_map, after swap_map is quiesced,
 * before vfree'ing it.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
        pgoff_t offset;

        for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
                struct page *head;

                head = vmalloc_to_page(si->swap_map + offset);
                if (page_private(head)) {
                        struct page *page, *next;

                        list_for_each_entry_safe(page, next, &head->lru, lru) {
                                list_del(&page->lru);
                                __free_page(page);
                        }
                }
        }
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
        struct swap_info_struct *si, *next;
        int nid = page_to_nid(page);

        if (!(gfp_mask & __GFP_IO))
                return;

        if (!blk_cgroup_congested())
                return;

        /*
         * We've already scheduled a throttle, avoid taking the global swap
         * lock.
         */
        if (current->throttle_queue)
                return;

        spin_lock(&swap_avail_lock);
        plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
                                  avail_lists[nid]) {
                if (si->bdev) {
                        blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
                        break;
                }
        }
        spin_unlock(&swap_avail_lock);
}
#endif

static int __init swapfile_init(void)
{
        int nid;

        swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
                                         GFP_KERNEL);
        if (!swap_avail_heads) {
                pr_emerg("Not enough memory for swap heads, swap is disabled\n");
                return -ENOMEM;
        }

        for_each_node(nid)
                plist_head_init(&swap_avail_heads[nid]);

        return 0;
}
subsys_initcall(swapfile_init);