xref: /OK3568_Linux_fs/kernel/mm/readahead.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	Andrew Morton
 *		Initial version.
 */
10*4882a593Smuzhiyun 
11*4882a593Smuzhiyun #include <linux/kernel.h>
12*4882a593Smuzhiyun #include <linux/dax.h>
13*4882a593Smuzhiyun #include <linux/gfp.h>
14*4882a593Smuzhiyun #include <linux/export.h>
15*4882a593Smuzhiyun #include <linux/blkdev.h>
16*4882a593Smuzhiyun #include <linux/backing-dev.h>
17*4882a593Smuzhiyun #include <linux/task_io_accounting_ops.h>
18*4882a593Smuzhiyun #include <linux/pagevec.h>
19*4882a593Smuzhiyun #include <linux/pagemap.h>
20*4882a593Smuzhiyun #include <linux/syscalls.h>
21*4882a593Smuzhiyun #include <linux/file.h>
22*4882a593Smuzhiyun #include <linux/mm_inline.h>
23*4882a593Smuzhiyun #include <linux/blk-cgroup.h>
24*4882a593Smuzhiyun #include <linux/fadvise.h>
25*4882a593Smuzhiyun #include <linux/sched/mm.h>
26*4882a593Smuzhiyun #include <trace/hooks/mm.h>
27*4882a593Smuzhiyun 
28*4882a593Smuzhiyun #include "internal.h"
29*4882a593Smuzhiyun 
30*4882a593Smuzhiyun #if defined(CONFIG_ARCH_ROCKCHIP) && defined(CONFIG_NO_GKI)
31*4882a593Smuzhiyun #include <linux/fscrypt.h>
32*4882a593Smuzhiyun #endif
33*4882a593Smuzhiyun 
34*4882a593Smuzhiyun /*
35*4882a593Smuzhiyun  * Initialise a struct file's readahead state.  Assumes that the caller has
36*4882a593Smuzhiyun  * memset *ra to zero.
37*4882a593Smuzhiyun  */
38*4882a593Smuzhiyun void
file_ra_state_init(struct file_ra_state * ra,struct address_space * mapping)39*4882a593Smuzhiyun file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
40*4882a593Smuzhiyun {
41*4882a593Smuzhiyun 	ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
42*4882a593Smuzhiyun 	ra->prev_pos = -1;
43*4882a593Smuzhiyun }
44*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(file_ra_state_init);
45*4882a593Smuzhiyun 
46*4882a593Smuzhiyun /*
47*4882a593Smuzhiyun  * see if a page needs releasing upon read_cache_pages() failure
48*4882a593Smuzhiyun  * - the caller of read_cache_pages() may have set PG_private or PG_fscache
49*4882a593Smuzhiyun  *   before calling, such as the NFS fs marking pages that are cached locally
50*4882a593Smuzhiyun  *   on disk, thus we need to give the fs a chance to clean up in the event of
51*4882a593Smuzhiyun  *   an error
52*4882a593Smuzhiyun  */
read_cache_pages_invalidate_page(struct address_space * mapping,struct page * page)53*4882a593Smuzhiyun static void read_cache_pages_invalidate_page(struct address_space *mapping,
54*4882a593Smuzhiyun 					     struct page *page)
55*4882a593Smuzhiyun {
56*4882a593Smuzhiyun 	if (page_has_private(page)) {
57*4882a593Smuzhiyun 		if (!trylock_page(page))
58*4882a593Smuzhiyun 			BUG();
59*4882a593Smuzhiyun 		page->mapping = mapping;
60*4882a593Smuzhiyun 		do_invalidatepage(page, 0, PAGE_SIZE);
61*4882a593Smuzhiyun 		page->mapping = NULL;
62*4882a593Smuzhiyun 		unlock_page(page);
63*4882a593Smuzhiyun 	}
64*4882a593Smuzhiyun 	put_page(page);
65*4882a593Smuzhiyun }
66*4882a593Smuzhiyun 
67*4882a593Smuzhiyun /*
68*4882a593Smuzhiyun  * release a list of pages, invalidating them first if need be
69*4882a593Smuzhiyun  */
read_cache_pages_invalidate_pages(struct address_space * mapping,struct list_head * pages)70*4882a593Smuzhiyun static void read_cache_pages_invalidate_pages(struct address_space *mapping,
71*4882a593Smuzhiyun 					      struct list_head *pages)
72*4882a593Smuzhiyun {
73*4882a593Smuzhiyun 	struct page *victim;
74*4882a593Smuzhiyun 
75*4882a593Smuzhiyun 	while (!list_empty(pages)) {
76*4882a593Smuzhiyun 		victim = lru_to_page(pages);
77*4882a593Smuzhiyun 		list_del(&victim->lru);
78*4882a593Smuzhiyun 		read_cache_pages_invalidate_page(mapping, victim);
79*4882a593Smuzhiyun 	}
80*4882a593Smuzhiyun }
81*4882a593Smuzhiyun 
82*4882a593Smuzhiyun /**
83*4882a593Smuzhiyun  * read_cache_pages - populate an address space with some pages & start reads against them
84*4882a593Smuzhiyun  * @mapping: the address_space
85*4882a593Smuzhiyun  * @pages: The address of a list_head which contains the target pages.  These
86*4882a593Smuzhiyun  *   pages have their ->index populated and are otherwise uninitialised.
87*4882a593Smuzhiyun  * @filler: callback routine for filling a single page.
88*4882a593Smuzhiyun  * @data: private data for the callback routine.
89*4882a593Smuzhiyun  *
90*4882a593Smuzhiyun  * Hides the details of the LRU cache etc from the filesystems.
91*4882a593Smuzhiyun  *
92*4882a593Smuzhiyun  * Returns: %0 on success, error return by @filler otherwise
93*4882a593Smuzhiyun  */
read_cache_pages(struct address_space * mapping,struct list_head * pages,int (* filler)(void *,struct page *),void * data)94*4882a593Smuzhiyun int read_cache_pages(struct address_space *mapping, struct list_head *pages,
95*4882a593Smuzhiyun 			int (*filler)(void *, struct page *), void *data)
96*4882a593Smuzhiyun {
97*4882a593Smuzhiyun 	struct page *page;
98*4882a593Smuzhiyun 	int ret = 0;
99*4882a593Smuzhiyun 
100*4882a593Smuzhiyun 	while (!list_empty(pages)) {
101*4882a593Smuzhiyun 		page = lru_to_page(pages);
102*4882a593Smuzhiyun 		list_del(&page->lru);
103*4882a593Smuzhiyun 		if (add_to_page_cache_lru(page, mapping, page->index,
104*4882a593Smuzhiyun 				readahead_gfp_mask(mapping))) {
105*4882a593Smuzhiyun 			read_cache_pages_invalidate_page(mapping, page);
106*4882a593Smuzhiyun 			continue;
107*4882a593Smuzhiyun 		}
108*4882a593Smuzhiyun 		put_page(page);
109*4882a593Smuzhiyun 
110*4882a593Smuzhiyun 		ret = filler(data, page);
111*4882a593Smuzhiyun 		if (unlikely(ret)) {
112*4882a593Smuzhiyun 			read_cache_pages_invalidate_pages(mapping, pages);
113*4882a593Smuzhiyun 			break;
114*4882a593Smuzhiyun 		}
115*4882a593Smuzhiyun 		task_io_account_read(PAGE_SIZE);
116*4882a593Smuzhiyun 	}
117*4882a593Smuzhiyun 	return ret;
118*4882a593Smuzhiyun }
119*4882a593Smuzhiyun 
120*4882a593Smuzhiyun EXPORT_SYMBOL(read_cache_pages);
121*4882a593Smuzhiyun 
readahead_gfp_mask(struct address_space * x)122*4882a593Smuzhiyun gfp_t readahead_gfp_mask(struct address_space *x)
123*4882a593Smuzhiyun {
124*4882a593Smuzhiyun 	gfp_t mask = mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
125*4882a593Smuzhiyun 
126*4882a593Smuzhiyun 	trace_android_rvh_set_readahead_gfp_mask(&mask);
127*4882a593Smuzhiyun 	return mask;
128*4882a593Smuzhiyun }
129*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(readahead_gfp_mask);
130*4882a593Smuzhiyun 
/*
 * Submit the pages currently described by @rac for reading.
 *
 * @rac:       readahead control describing the batch to read
 * @pages:     list of not-yet-inserted pages; only consumed by the
 *             ->readpages path and required to be empty on exit
 * @skip_page: when true, advance rac->_index by one after the batch
 *             (also done when the batch is empty)
 *
 * The mapping's operations are tried in this order: ->readahead,
 * ->readpages, then ->readpage for each page individually.  Return values
 * from the filesystem callbacks are not checked here.
 */
static void read_pages(struct readahead_control *rac, struct list_head *pages,
		bool skip_page)
{
	const struct address_space_operations *aops = rac->mapping->a_ops;
	struct page *page;
	struct blk_plug plug;

	if (!readahead_count(rac))
		goto out;

	blk_start_plug(&plug);

	if (aops->readahead) {
		aops->readahead(rac);
		/* Clean up the remaining pages */
		while ((page = readahead_page(rac))) {
			unlock_page(page);
			put_page(page);
		}
	} else if (aops->readpages) {
		aops->readpages(rac->file, rac->mapping, pages,
				readahead_count(rac));
		/* Clean up the remaining pages */
		put_pages_list(pages);
		/*
		 * Mark the whole window as consumed so that the
		 * readahead_count() check below holds.
		 */
		rac->_index += rac->_nr_pages;
		rac->_nr_pages = 0;
	} else {
		while ((page = readahead_page(rac))) {
			aops->readpage(rac->file, page);
			put_page(page);
		}
	}

	blk_finish_plug(&plug);

	/* Every path above must have consumed the whole batch. */
	BUG_ON(!list_empty(pages));
	BUG_ON(readahead_count(rac));

out:
	if (skip_page)
		rac->_index++;
}
173*4882a593Smuzhiyun 
174*4882a593Smuzhiyun /**
175*4882a593Smuzhiyun  * page_cache_ra_unbounded - Start unchecked readahead.
176*4882a593Smuzhiyun  * @ractl: Readahead control.
177*4882a593Smuzhiyun  * @nr_to_read: The number of pages to read.
178*4882a593Smuzhiyun  * @lookahead_size: Where to start the next readahead.
179*4882a593Smuzhiyun  *
180*4882a593Smuzhiyun  * This function is for filesystems to call when they want to start
181*4882a593Smuzhiyun  * readahead beyond a file's stated i_size.  This is almost certainly
182*4882a593Smuzhiyun  * not the function you want to call.  Use page_cache_async_readahead()
183*4882a593Smuzhiyun  * or page_cache_sync_readahead() instead.
184*4882a593Smuzhiyun  *
185*4882a593Smuzhiyun  * Context: File is referenced by caller.  Mutexes may be held by caller.
186*4882a593Smuzhiyun  * May sleep, but will not reenter filesystem to reclaim memory.
187*4882a593Smuzhiyun  */
page_cache_ra_unbounded(struct readahead_control * ractl,unsigned long nr_to_read,unsigned long lookahead_size)188*4882a593Smuzhiyun void page_cache_ra_unbounded(struct readahead_control *ractl,
189*4882a593Smuzhiyun 		unsigned long nr_to_read, unsigned long lookahead_size)
190*4882a593Smuzhiyun {
191*4882a593Smuzhiyun 	struct address_space *mapping = ractl->mapping;
192*4882a593Smuzhiyun 	unsigned long index = readahead_index(ractl);
193*4882a593Smuzhiyun 	LIST_HEAD(page_pool);
194*4882a593Smuzhiyun 	gfp_t gfp_mask = readahead_gfp_mask(mapping);
195*4882a593Smuzhiyun 	unsigned long i;
196*4882a593Smuzhiyun 
197*4882a593Smuzhiyun 	/*
198*4882a593Smuzhiyun 	 * Partway through the readahead operation, we will have added
199*4882a593Smuzhiyun 	 * locked pages to the page cache, but will not yet have submitted
200*4882a593Smuzhiyun 	 * them for I/O.  Adding another page may need to allocate memory,
201*4882a593Smuzhiyun 	 * which can trigger memory reclaim.  Telling the VM we're in
202*4882a593Smuzhiyun 	 * the middle of a filesystem operation will cause it to not
203*4882a593Smuzhiyun 	 * touch file-backed pages, preventing a deadlock.  Most (all?)
204*4882a593Smuzhiyun 	 * filesystems already specify __GFP_NOFS in their mapping's
205*4882a593Smuzhiyun 	 * gfp_mask, but let's be explicit here.
206*4882a593Smuzhiyun 	 */
207*4882a593Smuzhiyun 	unsigned int nofs = memalloc_nofs_save();
208*4882a593Smuzhiyun 
209*4882a593Smuzhiyun 	/*
210*4882a593Smuzhiyun 	 * Preallocate as many pages as we will need.
211*4882a593Smuzhiyun 	 */
212*4882a593Smuzhiyun 	for (i = 0; i < nr_to_read; i++) {
213*4882a593Smuzhiyun 		struct page *page = xa_load(&mapping->i_pages, index + i);
214*4882a593Smuzhiyun 
215*4882a593Smuzhiyun 		BUG_ON(index + i != ractl->_index + ractl->_nr_pages);
216*4882a593Smuzhiyun 
217*4882a593Smuzhiyun 		if (page && !xa_is_value(page)) {
218*4882a593Smuzhiyun 			/*
219*4882a593Smuzhiyun 			 * Page already present?  Kick off the current batch
220*4882a593Smuzhiyun 			 * of contiguous pages before continuing with the
221*4882a593Smuzhiyun 			 * next batch.  This page may be the one we would
222*4882a593Smuzhiyun 			 * have intended to mark as Readahead, but we don't
223*4882a593Smuzhiyun 			 * have a stable reference to this page, and it's
224*4882a593Smuzhiyun 			 * not worth getting one just for that.
225*4882a593Smuzhiyun 			 */
226*4882a593Smuzhiyun 			read_pages(ractl, &page_pool, true);
227*4882a593Smuzhiyun 			continue;
228*4882a593Smuzhiyun 		}
229*4882a593Smuzhiyun 
230*4882a593Smuzhiyun 		page = __page_cache_alloc(gfp_mask);
231*4882a593Smuzhiyun 		if (!page)
232*4882a593Smuzhiyun 			break;
233*4882a593Smuzhiyun 		if (mapping->a_ops->readpages) {
234*4882a593Smuzhiyun 			page->index = index + i;
235*4882a593Smuzhiyun 			list_add(&page->lru, &page_pool);
236*4882a593Smuzhiyun 		} else if (add_to_page_cache_lru(page, mapping, index + i,
237*4882a593Smuzhiyun 					gfp_mask) < 0) {
238*4882a593Smuzhiyun 			put_page(page);
239*4882a593Smuzhiyun 			read_pages(ractl, &page_pool, true);
240*4882a593Smuzhiyun 			continue;
241*4882a593Smuzhiyun 		}
242*4882a593Smuzhiyun 		if (i == nr_to_read - lookahead_size)
243*4882a593Smuzhiyun 			SetPageReadahead(page);
244*4882a593Smuzhiyun 		ractl->_nr_pages++;
245*4882a593Smuzhiyun 	}
246*4882a593Smuzhiyun 
247*4882a593Smuzhiyun 	/*
248*4882a593Smuzhiyun 	 * Now start the IO.  We ignore I/O errors - if the page is not
249*4882a593Smuzhiyun 	 * uptodate then the caller will launch readpage again, and
250*4882a593Smuzhiyun 	 * will then handle the error.
251*4882a593Smuzhiyun 	 */
252*4882a593Smuzhiyun 	read_pages(ractl, &page_pool, false);
253*4882a593Smuzhiyun 	memalloc_nofs_restore(nofs);
254*4882a593Smuzhiyun }
255*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
256*4882a593Smuzhiyun 
257*4882a593Smuzhiyun /*
258*4882a593Smuzhiyun  * do_page_cache_ra() actually reads a chunk of disk.  It allocates
259*4882a593Smuzhiyun  * the pages first, then submits them for I/O. This avoids the very bad
260*4882a593Smuzhiyun  * behaviour which would occur if page allocations are causing VM writeback.
261*4882a593Smuzhiyun  * We really don't want to intermingle reads and writes like that.
262*4882a593Smuzhiyun  */
do_page_cache_ra(struct readahead_control * ractl,unsigned long nr_to_read,unsigned long lookahead_size)263*4882a593Smuzhiyun void do_page_cache_ra(struct readahead_control *ractl,
264*4882a593Smuzhiyun 		unsigned long nr_to_read, unsigned long lookahead_size)
265*4882a593Smuzhiyun {
266*4882a593Smuzhiyun 	struct inode *inode = ractl->mapping->host;
267*4882a593Smuzhiyun 	unsigned long index = readahead_index(ractl);
268*4882a593Smuzhiyun 	loff_t isize = i_size_read(inode);
269*4882a593Smuzhiyun 	pgoff_t end_index;	/* The last page we want to read */
270*4882a593Smuzhiyun 
271*4882a593Smuzhiyun 	if (isize == 0)
272*4882a593Smuzhiyun 		return;
273*4882a593Smuzhiyun 
274*4882a593Smuzhiyun 	end_index = (isize - 1) >> PAGE_SHIFT;
275*4882a593Smuzhiyun 	if (index > end_index)
276*4882a593Smuzhiyun 		return;
277*4882a593Smuzhiyun 	/* Don't read past the page containing the last byte of the file */
278*4882a593Smuzhiyun 	if (nr_to_read > end_index - index)
279*4882a593Smuzhiyun 		nr_to_read = end_index - index + 1;
280*4882a593Smuzhiyun 
281*4882a593Smuzhiyun 	page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
282*4882a593Smuzhiyun }
283*4882a593Smuzhiyun 
284*4882a593Smuzhiyun /*
285*4882a593Smuzhiyun  * Chunk the readahead into 2 megabyte units, so that we don't pin too much
286*4882a593Smuzhiyun  * memory at once.
287*4882a593Smuzhiyun  */
force_page_cache_ra(struct readahead_control * ractl,struct file_ra_state * ra,unsigned long nr_to_read)288*4882a593Smuzhiyun void force_page_cache_ra(struct readahead_control *ractl,
289*4882a593Smuzhiyun 		struct file_ra_state *ra, unsigned long nr_to_read)
290*4882a593Smuzhiyun {
291*4882a593Smuzhiyun 	struct address_space *mapping = ractl->mapping;
292*4882a593Smuzhiyun 	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
293*4882a593Smuzhiyun 	unsigned long max_pages, index;
294*4882a593Smuzhiyun #if defined(CONFIG_ARCH_ROCKCHIP) && defined(CONFIG_NO_GKI)
295*4882a593Smuzhiyun 	bool force_lookahead = false;
296*4882a593Smuzhiyun #endif
297*4882a593Smuzhiyun 
298*4882a593Smuzhiyun 	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
299*4882a593Smuzhiyun 			!mapping->a_ops->readahead))
300*4882a593Smuzhiyun 		return;
301*4882a593Smuzhiyun 
302*4882a593Smuzhiyun 	/*
303*4882a593Smuzhiyun 	 * If the request exceeds the readahead window, allow the read to
304*4882a593Smuzhiyun 	 * be up to the optimal hardware IO size
305*4882a593Smuzhiyun 	 */
306*4882a593Smuzhiyun 	index = readahead_index(ractl);
307*4882a593Smuzhiyun 	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
308*4882a593Smuzhiyun #if defined(CONFIG_ARCH_ROCKCHIP) && defined(CONFIG_NO_GKI)
309*4882a593Smuzhiyun 	/* For files with fscrypt enabled, to allow IO and the encryption
310*4882a593Smuzhiyun 	 * or decryption process to ping-pong, lookahead is forcibly enabled.
311*4882a593Smuzhiyun 	 */
312*4882a593Smuzhiyun 	if (nr_to_read > max_pages && fscrypt_inode_uses_fs_layer_crypto(mapping->host))
313*4882a593Smuzhiyun 		force_lookahead = true;
314*4882a593Smuzhiyun #endif
315*4882a593Smuzhiyun 	nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
316*4882a593Smuzhiyun 	while (nr_to_read) {
317*4882a593Smuzhiyun 		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
318*4882a593Smuzhiyun 
319*4882a593Smuzhiyun 		if (this_chunk > nr_to_read)
320*4882a593Smuzhiyun 			this_chunk = nr_to_read;
321*4882a593Smuzhiyun 		ractl->_index = index;
322*4882a593Smuzhiyun #if defined(CONFIG_ARCH_ROCKCHIP) && defined(CONFIG_NO_GKI)
323*4882a593Smuzhiyun 		if (force_lookahead)
324*4882a593Smuzhiyun 			do_page_cache_ra(ractl, this_chunk, this_chunk / 2);
325*4882a593Smuzhiyun 		else
326*4882a593Smuzhiyun 			do_page_cache_ra(ractl, this_chunk, 0);
327*4882a593Smuzhiyun #else
328*4882a593Smuzhiyun 		do_page_cache_ra(ractl, this_chunk, 0);
329*4882a593Smuzhiyun #endif
330*4882a593Smuzhiyun 
331*4882a593Smuzhiyun 		index += this_chunk;
332*4882a593Smuzhiyun 		nr_to_read -= this_chunk;
333*4882a593Smuzhiyun 	}
334*4882a593Smuzhiyun }
335*4882a593Smuzhiyun 
/*
 * Set the initial window size.
 *
 * @size: request size in pages
 * @max:  upper bound for the window, in pages
 *
 * @size is rounded up to the next power of two and then scaled:
 *   - rounded size <= max / 32:  x 4
 *   - rounded size <= max / 4:   x 2
 *   - otherwise:                 max
 *
 * Example for a 32 page max: a 1 page request gives 4 pages, requests of
 * 2-8 pages give twice their rounded size (4-16 pages), and anything
 * larger gives 32 pages.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}
355*4882a593Smuzhiyun 
356*4882a593Smuzhiyun /*
357*4882a593Smuzhiyun  *  Get the previous window size, ramp it up, and
358*4882a593Smuzhiyun  *  return it as the new window size.
359*4882a593Smuzhiyun  */
get_next_ra_size(struct file_ra_state * ra,unsigned long max)360*4882a593Smuzhiyun static unsigned long get_next_ra_size(struct file_ra_state *ra,
361*4882a593Smuzhiyun 				      unsigned long max)
362*4882a593Smuzhiyun {
363*4882a593Smuzhiyun 	unsigned long cur = ra->size;
364*4882a593Smuzhiyun 
365*4882a593Smuzhiyun 	if (cur < max / 16)
366*4882a593Smuzhiyun 		return 4 * cur;
367*4882a593Smuzhiyun 	if (cur <= max / 2)
368*4882a593Smuzhiyun 		return 2 * cur;
369*4882a593Smuzhiyun 	return max;
370*4882a593Smuzhiyun }
371*4882a593Smuzhiyun 
/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial size
 * based on I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */
410*4882a593Smuzhiyun 
411*4882a593Smuzhiyun /*
412*4882a593Smuzhiyun  * Count contiguously cached pages from @index-1 to @index-@max,
413*4882a593Smuzhiyun  * this count is a conservative estimation of
414*4882a593Smuzhiyun  * 	- length of the sequential read sequence, or
415*4882a593Smuzhiyun  * 	- thrashing threshold in memory tight systems
416*4882a593Smuzhiyun  */
count_history_pages(struct address_space * mapping,pgoff_t index,unsigned long max)417*4882a593Smuzhiyun static pgoff_t count_history_pages(struct address_space *mapping,
418*4882a593Smuzhiyun 				   pgoff_t index, unsigned long max)
419*4882a593Smuzhiyun {
420*4882a593Smuzhiyun 	pgoff_t head;
421*4882a593Smuzhiyun 
422*4882a593Smuzhiyun 	rcu_read_lock();
423*4882a593Smuzhiyun 	head = page_cache_prev_miss(mapping, index - 1, max);
424*4882a593Smuzhiyun 	rcu_read_unlock();
425*4882a593Smuzhiyun 
426*4882a593Smuzhiyun 	return index - 1 - head;
427*4882a593Smuzhiyun }
428*4882a593Smuzhiyun 
429*4882a593Smuzhiyun /*
430*4882a593Smuzhiyun  * page cache context based read-ahead
431*4882a593Smuzhiyun  */
try_context_readahead(struct address_space * mapping,struct file_ra_state * ra,pgoff_t index,unsigned long req_size,unsigned long max)432*4882a593Smuzhiyun static int try_context_readahead(struct address_space *mapping,
433*4882a593Smuzhiyun 				 struct file_ra_state *ra,
434*4882a593Smuzhiyun 				 pgoff_t index,
435*4882a593Smuzhiyun 				 unsigned long req_size,
436*4882a593Smuzhiyun 				 unsigned long max)
437*4882a593Smuzhiyun {
438*4882a593Smuzhiyun 	pgoff_t size;
439*4882a593Smuzhiyun 
440*4882a593Smuzhiyun 	size = count_history_pages(mapping, index, max);
441*4882a593Smuzhiyun 
442*4882a593Smuzhiyun 	/*
443*4882a593Smuzhiyun 	 * not enough history pages:
444*4882a593Smuzhiyun 	 * it could be a random read
445*4882a593Smuzhiyun 	 */
446*4882a593Smuzhiyun 	if (size <= req_size)
447*4882a593Smuzhiyun 		return 0;
448*4882a593Smuzhiyun 
449*4882a593Smuzhiyun 	/*
450*4882a593Smuzhiyun 	 * starts from beginning of file:
451*4882a593Smuzhiyun 	 * it is a strong indication of long-run stream (or whole-file-read)
452*4882a593Smuzhiyun 	 */
453*4882a593Smuzhiyun 	if (size >= index)
454*4882a593Smuzhiyun 		size *= 2;
455*4882a593Smuzhiyun 
456*4882a593Smuzhiyun 	ra->start = index;
457*4882a593Smuzhiyun 	ra->size = min(size + req_size, max);
458*4882a593Smuzhiyun 	ra->async_size = 1;
459*4882a593Smuzhiyun 
460*4882a593Smuzhiyun 	return 1;
461*4882a593Smuzhiyun }
462*4882a593Smuzhiyun 
463*4882a593Smuzhiyun /*
464*4882a593Smuzhiyun  * A minimal readahead algorithm for trivial sequential/random reads.
465*4882a593Smuzhiyun  */
ondemand_readahead(struct readahead_control * ractl,struct file_ra_state * ra,bool hit_readahead_marker,unsigned long req_size)466*4882a593Smuzhiyun static void ondemand_readahead(struct readahead_control *ractl,
467*4882a593Smuzhiyun 		struct file_ra_state *ra, bool hit_readahead_marker,
468*4882a593Smuzhiyun 		unsigned long req_size)
469*4882a593Smuzhiyun {
470*4882a593Smuzhiyun 	struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
471*4882a593Smuzhiyun 	unsigned long max_pages = ra->ra_pages;
472*4882a593Smuzhiyun 	unsigned long add_pages;
473*4882a593Smuzhiyun 	unsigned long index = readahead_index(ractl);
474*4882a593Smuzhiyun 	pgoff_t prev_index;
475*4882a593Smuzhiyun 
476*4882a593Smuzhiyun 	/*
477*4882a593Smuzhiyun 	 * If the request exceeds the readahead window, allow the read to
478*4882a593Smuzhiyun 	 * be up to the optimal hardware IO size
479*4882a593Smuzhiyun 	 */
480*4882a593Smuzhiyun 	if (req_size > max_pages && bdi->io_pages > max_pages)
481*4882a593Smuzhiyun 		max_pages = min(req_size, bdi->io_pages);
482*4882a593Smuzhiyun 
483*4882a593Smuzhiyun 	trace_android_vh_ra_tuning_max_page(ractl, &max_pages);
484*4882a593Smuzhiyun 
485*4882a593Smuzhiyun 	/*
486*4882a593Smuzhiyun 	 * start of file
487*4882a593Smuzhiyun 	 */
488*4882a593Smuzhiyun 	if (!index)
489*4882a593Smuzhiyun 		goto initial_readahead;
490*4882a593Smuzhiyun 
491*4882a593Smuzhiyun 	/*
492*4882a593Smuzhiyun 	 * It's the expected callback index, assume sequential access.
493*4882a593Smuzhiyun 	 * Ramp up sizes, and push forward the readahead window.
494*4882a593Smuzhiyun 	 */
495*4882a593Smuzhiyun 	if ((index == (ra->start + ra->size - ra->async_size) ||
496*4882a593Smuzhiyun 	     index == (ra->start + ra->size))) {
497*4882a593Smuzhiyun 		ra->start += ra->size;
498*4882a593Smuzhiyun 		ra->size = get_next_ra_size(ra, max_pages);
499*4882a593Smuzhiyun 		ra->async_size = ra->size;
500*4882a593Smuzhiyun 		goto readit;
501*4882a593Smuzhiyun 	}
502*4882a593Smuzhiyun 
503*4882a593Smuzhiyun 	/*
504*4882a593Smuzhiyun 	 * Hit a marked page without valid readahead state.
505*4882a593Smuzhiyun 	 * E.g. interleaved reads.
506*4882a593Smuzhiyun 	 * Query the pagecache for async_size, which normally equals to
507*4882a593Smuzhiyun 	 * readahead size. Ramp it up and use it as the new readahead size.
508*4882a593Smuzhiyun 	 */
509*4882a593Smuzhiyun 	if (hit_readahead_marker) {
510*4882a593Smuzhiyun 		pgoff_t start;
511*4882a593Smuzhiyun 
512*4882a593Smuzhiyun 		rcu_read_lock();
513*4882a593Smuzhiyun 		start = page_cache_next_miss(ractl->mapping, index + 1,
514*4882a593Smuzhiyun 				max_pages);
515*4882a593Smuzhiyun 		rcu_read_unlock();
516*4882a593Smuzhiyun 
517*4882a593Smuzhiyun 		if (!start || start - index > max_pages)
518*4882a593Smuzhiyun 			return;
519*4882a593Smuzhiyun 
520*4882a593Smuzhiyun 		ra->start = start;
521*4882a593Smuzhiyun 		ra->size = start - index;	/* old async_size */
522*4882a593Smuzhiyun 		ra->size += req_size;
523*4882a593Smuzhiyun 		ra->size = get_next_ra_size(ra, max_pages);
524*4882a593Smuzhiyun 		ra->async_size = ra->size;
525*4882a593Smuzhiyun 		goto readit;
526*4882a593Smuzhiyun 	}
527*4882a593Smuzhiyun 
528*4882a593Smuzhiyun 	/*
529*4882a593Smuzhiyun 	 * oversize read
530*4882a593Smuzhiyun 	 */
531*4882a593Smuzhiyun 	if (req_size > max_pages)
532*4882a593Smuzhiyun 		goto initial_readahead;
533*4882a593Smuzhiyun 
534*4882a593Smuzhiyun 	/*
535*4882a593Smuzhiyun 	 * sequential cache miss
536*4882a593Smuzhiyun 	 * trivial case: (index - prev_index) == 1
537*4882a593Smuzhiyun 	 * unaligned reads: (index - prev_index) == 0
538*4882a593Smuzhiyun 	 */
539*4882a593Smuzhiyun 	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
540*4882a593Smuzhiyun 	if (index - prev_index <= 1UL)
541*4882a593Smuzhiyun 		goto initial_readahead;
542*4882a593Smuzhiyun 
543*4882a593Smuzhiyun 	/*
544*4882a593Smuzhiyun 	 * Query the page cache and look for the traces(cached history pages)
545*4882a593Smuzhiyun 	 * that a sequential stream would leave behind.
546*4882a593Smuzhiyun 	 */
547*4882a593Smuzhiyun 	if (try_context_readahead(ractl->mapping, ra, index, req_size,
548*4882a593Smuzhiyun 			max_pages))
549*4882a593Smuzhiyun 		goto readit;
550*4882a593Smuzhiyun 
551*4882a593Smuzhiyun 	/*
552*4882a593Smuzhiyun 	 * standalone, small random read
553*4882a593Smuzhiyun 	 * Read as is, and do not pollute the readahead state.
554*4882a593Smuzhiyun 	 */
555*4882a593Smuzhiyun 	do_page_cache_ra(ractl, req_size, 0);
556*4882a593Smuzhiyun 	return;
557*4882a593Smuzhiyun 
558*4882a593Smuzhiyun initial_readahead:
559*4882a593Smuzhiyun 	ra->start = index;
560*4882a593Smuzhiyun 	ra->size = get_init_ra_size(req_size, max_pages);
561*4882a593Smuzhiyun 	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
562*4882a593Smuzhiyun 
563*4882a593Smuzhiyun readit:
564*4882a593Smuzhiyun 	/*
565*4882a593Smuzhiyun 	 * Will this read hit the readahead marker made by itself?
566*4882a593Smuzhiyun 	 * If so, trigger the readahead marker hit now, and merge
567*4882a593Smuzhiyun 	 * the resulted next readahead window into the current one.
568*4882a593Smuzhiyun 	 * Take care of maximum IO pages as above.
569*4882a593Smuzhiyun 	 */
570*4882a593Smuzhiyun 	if (index == ra->start && ra->size == ra->async_size) {
571*4882a593Smuzhiyun 		add_pages = get_next_ra_size(ra, max_pages);
572*4882a593Smuzhiyun 		if (ra->size + add_pages <= max_pages) {
573*4882a593Smuzhiyun 			ra->async_size = add_pages;
574*4882a593Smuzhiyun 			ra->size += add_pages;
575*4882a593Smuzhiyun 		} else {
576*4882a593Smuzhiyun 			ra->size = max_pages;
577*4882a593Smuzhiyun 			ra->async_size = max_pages >> 1;
578*4882a593Smuzhiyun 		}
579*4882a593Smuzhiyun 	}
580*4882a593Smuzhiyun 
581*4882a593Smuzhiyun 	ractl->_index = ra->start;
582*4882a593Smuzhiyun 	do_page_cache_ra(ractl, ra->size, ra->async_size);
583*4882a593Smuzhiyun }
584*4882a593Smuzhiyun 
page_cache_sync_ra(struct readahead_control * ractl,struct file_ra_state * ra,unsigned long req_count)585*4882a593Smuzhiyun void page_cache_sync_ra(struct readahead_control *ractl,
586*4882a593Smuzhiyun 		struct file_ra_state *ra, unsigned long req_count)
587*4882a593Smuzhiyun {
588*4882a593Smuzhiyun 	bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
589*4882a593Smuzhiyun 
590*4882a593Smuzhiyun 	/*
591*4882a593Smuzhiyun 	 * Even if read-ahead is disabled, issue this request as read-ahead
592*4882a593Smuzhiyun 	 * as we'll need it to satisfy the requested range. The forced
593*4882a593Smuzhiyun 	 * read-ahead will do the right thing and limit the read to just the
594*4882a593Smuzhiyun 	 * requested range, which we'll set to 1 page for this case.
595*4882a593Smuzhiyun 	 */
596*4882a593Smuzhiyun 	if (!ra->ra_pages || blk_cgroup_congested()) {
597*4882a593Smuzhiyun 		if (!ractl->file)
598*4882a593Smuzhiyun 			return;
599*4882a593Smuzhiyun 		req_count = 1;
600*4882a593Smuzhiyun 		do_forced_ra = true;
601*4882a593Smuzhiyun 	}
602*4882a593Smuzhiyun 
603*4882a593Smuzhiyun 	/* be dumb */
604*4882a593Smuzhiyun 	if (do_forced_ra) {
605*4882a593Smuzhiyun 		force_page_cache_ra(ractl, ra, req_count);
606*4882a593Smuzhiyun 		return;
607*4882a593Smuzhiyun 	}
608*4882a593Smuzhiyun 
609*4882a593Smuzhiyun 	/* do read-ahead */
610*4882a593Smuzhiyun 	ondemand_readahead(ractl, ra, false, req_count);
611*4882a593Smuzhiyun }
612*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(page_cache_sync_ra);
613*4882a593Smuzhiyun 
page_cache_async_ra(struct readahead_control * ractl,struct file_ra_state * ra,struct page * page,unsigned long req_count)614*4882a593Smuzhiyun void page_cache_async_ra(struct readahead_control *ractl,
615*4882a593Smuzhiyun 		struct file_ra_state *ra, struct page *page,
616*4882a593Smuzhiyun 		unsigned long req_count)
617*4882a593Smuzhiyun {
618*4882a593Smuzhiyun 	/* no read-ahead */
619*4882a593Smuzhiyun 	if (!ra->ra_pages)
620*4882a593Smuzhiyun 		return;
621*4882a593Smuzhiyun 
622*4882a593Smuzhiyun 	/*
623*4882a593Smuzhiyun 	 * Same bit is used for PG_readahead and PG_reclaim.
624*4882a593Smuzhiyun 	 */
625*4882a593Smuzhiyun 	if (PageWriteback(page))
626*4882a593Smuzhiyun 		return;
627*4882a593Smuzhiyun 
628*4882a593Smuzhiyun 	ClearPageReadahead(page);
629*4882a593Smuzhiyun 
630*4882a593Smuzhiyun 	/*
631*4882a593Smuzhiyun 	 * Defer asynchronous read-ahead on IO congestion.
632*4882a593Smuzhiyun 	 */
633*4882a593Smuzhiyun 	if (inode_read_congested(ractl->mapping->host))
634*4882a593Smuzhiyun 		return;
635*4882a593Smuzhiyun 
636*4882a593Smuzhiyun 	if (blk_cgroup_congested())
637*4882a593Smuzhiyun 		return;
638*4882a593Smuzhiyun 
639*4882a593Smuzhiyun 	/* do read-ahead */
640*4882a593Smuzhiyun 	ondemand_readahead(ractl, ra, true, req_count);
641*4882a593Smuzhiyun }
642*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(page_cache_async_ra);
643*4882a593Smuzhiyun 
ksys_readahead(int fd,loff_t offset,size_t count)644*4882a593Smuzhiyun ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
645*4882a593Smuzhiyun {
646*4882a593Smuzhiyun 	ssize_t ret;
647*4882a593Smuzhiyun 	struct fd f;
648*4882a593Smuzhiyun 
649*4882a593Smuzhiyun 	ret = -EBADF;
650*4882a593Smuzhiyun 	f = fdget(fd);
651*4882a593Smuzhiyun 	if (!f.file || !(f.file->f_mode & FMODE_READ))
652*4882a593Smuzhiyun 		goto out;
653*4882a593Smuzhiyun 
654*4882a593Smuzhiyun 	/*
655*4882a593Smuzhiyun 	 * The readahead() syscall is intended to run only on files
656*4882a593Smuzhiyun 	 * that can execute readahead. If readahead is not possible
657*4882a593Smuzhiyun 	 * on this file, then we must return -EINVAL.
658*4882a593Smuzhiyun 	 */
659*4882a593Smuzhiyun 	ret = -EINVAL;
660*4882a593Smuzhiyun 	if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
661*4882a593Smuzhiyun 	    !S_ISREG(file_inode(f.file)->i_mode))
662*4882a593Smuzhiyun 		goto out;
663*4882a593Smuzhiyun 
664*4882a593Smuzhiyun 	ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
665*4882a593Smuzhiyun out:
666*4882a593Smuzhiyun 	fdput(f);
667*4882a593Smuzhiyun 	return ret;
668*4882a593Smuzhiyun }
669*4882a593Smuzhiyun 
/* readahead() system call: forwards its arguments to ksys_readahead(). */
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
	return ksys_readahead(fd, offset, count);
}
674