// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
#include <trace/hooks/mm.h>

#include "internal.h"

#if defined(CONFIG_ARCH_ROCKCHIP) && defined(CONFIG_NO_GKI)
#include <linux/fscrypt.h>
#endif

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
	ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

/*
 * see if a page needs releasing upon read_cache_pages() failure
 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
 *   before calling, such as the NFS fs marking pages that are cached locally
 *   on disk, thus we need to give the fs a chance to clean up in the event of
 *   an error
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
					     struct page *page)
{
	if (page_has_private(page)) {
		if (!trylock_page(page))
			BUG();
		page->mapping = mapping;
		do_invalidatepage(page, 0, PAGE_SIZE);
		page->mapping = NULL;
		unlock_page(page);
	}
	put_page(page);
}

/*
 * release a list of pages, invalidating them first if need be
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
					      struct list_head *pages)
{
	struct page *victim;

	while (!list_empty(pages)) {
		victim = lru_to_page(pages);
		list_del(&victim->lru);
		read_cache_pages_invalidate_page(mapping, victim);
	}
}

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 *
 * Return: %0 on success, error return by @filler otherwise
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
			int (*filler)(void *, struct page *), void *data)
{
	struct page *page;
	int ret = 0;

	while (!list_empty(pages)) {
		page = lru_to_page(pages);
		list_del(&page->lru);
		if (add_to_page_cache_lru(page, mapping, page->index,
				readahead_gfp_mask(mapping))) {
			read_cache_pages_invalidate_page(mapping, page);
			continue;
		}
		put_page(page);

		ret = filler(data, page);
		if (unlikely(ret)) {
			read_cache_pages_invalidate_pages(mapping, pages);
			break;
		}
		task_io_account_read(PAGE_SIZE);
	}
	return ret;
}

EXPORT_SYMBOL(read_cache_pages);
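
/*
 * Typical use, as a sketch (my_filler and my_fs_do_readpage are
 * hypothetical, not part of this file): a filesystem hands its
 * readpage-style callback to read_cache_pages() and lets the core
 * handle page cache insertion, LRU bookkeeping and error cleanup:
 *
 *	static int my_filler(void *data, struct page *page)
 *	{
 *		struct file *file = data;
 *
 *		return my_fs_do_readpage(file, page);
 *	}
 *
 *	err = read_cache_pages(mapping, &page_list, my_filler, file);
 */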

gfp_t readahead_gfp_mask(struct address_space *x)
{
	gfp_t mask = mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;

	trace_android_rvh_set_readahead_gfp_mask(&mask);
	return mask;
}
EXPORT_SYMBOL_GPL(readahead_gfp_mask);

static void read_pages(struct readahead_control *rac, struct list_head *pages,
		bool skip_page)
{
	const struct address_space_operations *aops = rac->mapping->a_ops;
	struct page *page;
	struct blk_plug plug;

	if (!readahead_count(rac))
		goto out;

	blk_start_plug(&plug);

	if (aops->readahead) {
		aops->readahead(rac);
		/* Clean up the remaining pages */
		while ((page = readahead_page(rac))) {
			unlock_page(page);
			put_page(page);
		}
	} else if (aops->readpages) {
		aops->readpages(rac->file, rac->mapping, pages,
				readahead_count(rac));
		/* Clean up the remaining pages */
		put_pages_list(pages);
		rac->_index += rac->_nr_pages;
		rac->_nr_pages = 0;
	} else {
		while ((page = readahead_page(rac))) {
			aops->readpage(rac->file, page);
			put_page(page);
		}
	}

	blk_finish_plug(&plug);

	BUG_ON(!list_empty(pages));
	BUG_ON(readahead_count(rac));

out:
	if (skip_page)
		rac->_index++;
}

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct address_space *mapping = ractl->mapping;
	unsigned long index = readahead_index(ractl);
	LIST_HEAD(page_pool);
	gfp_t gfp_mask = readahead_gfp_mask(mapping);
	unsigned long i;

	/*
	 * Partway through the readahead operation, we will have added
	 * locked pages to the page cache, but will not yet have submitted
	 * them for I/O.  Adding another page may need to allocate memory,
	 * which can trigger memory reclaim.  Telling the VM we're in
	 * the middle of a filesystem operation will cause it to not
	 * touch file-backed pages, preventing a deadlock.  Most (all?)
	 * filesystems already specify __GFP_NOFS in their mapping's
	 * gfp_mask, but let's be explicit here.
	 */
	unsigned int nofs = memalloc_nofs_save();

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (i = 0; i < nr_to_read; i++) {
		struct page *page = xa_load(&mapping->i_pages, index + i);

		BUG_ON(index + i != ractl->_index + ractl->_nr_pages);

		if (page && !xa_is_value(page)) {
			/*
			 * Page already present?  Kick off the current batch
			 * of contiguous pages before continuing with the
			 * next batch.  This page may be the one we would
			 * have intended to mark as Readahead, but we don't
			 * have a stable reference to this page, and it's
			 * not worth getting one just for that.
			 */
			read_pages(ractl, &page_pool, true);
			continue;
		}

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			break;
		if (mapping->a_ops->readpages) {
			page->index = index + i;
			list_add(&page->lru, &page_pool);
		} else if (add_to_page_cache_lru(page, mapping, index + i,
					gfp_mask) < 0) {
			put_page(page);
			read_pages(ractl, &page_pool, true);
			continue;
		}
		if (i == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		ractl->_nr_pages++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	read_pages(ractl, &page_pool, false);
	memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);

/*
 * do_page_cache_ra() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O.  This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 */
void do_page_cache_ra(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct inode *inode = ractl->mapping->host;
	unsigned long index = readahead_index(ractl);
	loff_t isize = i_size_read(inode);
	pgoff_t end_index;	/* The last page we want to read */

	if (isize == 0)
		return;

	end_index = (isize - 1) >> PAGE_SHIFT;
	if (index > end_index)
		return;
	/* Don't read past the page containing the last byte of the file */
	if (nr_to_read > end_index - index)
		nr_to_read = end_index - index + 1;

	page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}
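
/*
 * Worked example of the clamp above (an illustration, assuming 4K pages):
 * for a 9000-byte file, end_index = (9000 - 1) >> 12 = 2, so only pages
 * 0..2 exist.  A request for 10 pages starting at index 1 satisfies
 * nr_to_read > end_index - index (10 > 1) and is therefore trimmed to
 * end_index - index + 1 = 2 pages.
 */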

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
void force_page_cache_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, unsigned long nr_to_read)
{
	struct address_space *mapping = ractl->mapping;
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	unsigned long max_pages, index;
#if defined(CONFIG_ARCH_ROCKCHIP) && defined(CONFIG_NO_GKI)
	bool force_lookahead = false;
#endif

	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
			!mapping->a_ops->readahead))
		return;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	index = readahead_index(ractl);
	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
#if defined(CONFIG_ARCH_ROCKCHIP) && defined(CONFIG_NO_GKI)
	/*
	 * For files encrypted at the filesystem layer, force lookahead so
	 * that I/O and the encryption/decryption work can ping-pong
	 * (overlap) instead of serialising on each chunk.
	 */
	if (nr_to_read > max_pages && fscrypt_inode_uses_fs_layer_crypto(mapping->host))
		force_lookahead = true;
#endif
	nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
	while (nr_to_read) {
		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;

		if (this_chunk > nr_to_read)
			this_chunk = nr_to_read;
		ractl->_index = index;
#if defined(CONFIG_ARCH_ROCKCHIP) && defined(CONFIG_NO_GKI)
		if (force_lookahead)
			do_page_cache_ra(ractl, this_chunk, this_chunk / 2);
		else
			do_page_cache_ra(ractl, this_chunk, 0);
#else
		do_page_cache_ra(ractl, this_chunk, 0);
#endif

		index += this_chunk;
		nr_to_read -= this_chunk;
	}
}

/*
 * Set the initial window size: round the request up to the next power of
 * 2, then quadruple it for small requests, double it for medium ones, and
 * cap it at @max for large ones.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}
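
/*
 * A worked example of get_init_ra_size() (an illustration, not from the
 * original file): with max = 32 pages (128K of 4K pages), a 1-page request
 * rounds to 1 <= max/32 and quadruples to 4 pages; a 3-page request rounds
 * up to 4, which is > max/32 but <= max/4, so it doubles to 8 pages; a
 * 10-page request rounds to 16 > max/4 and goes straight to the 32-page max.
 */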

/*
 * Get the previous window size, ramp it up, and
 * return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
				      unsigned long max)
{
	unsigned long cur = ra->size;

	if (cur < max / 16)
		return 4 * cur;
	if (cur <= max / 2)
		return 2 * cur;
	return max;
}
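
/*
 * Example ramp (an illustration, assuming max = 128 pages): starting from
 * a 4-page window, successive sequential hits grow it
 * 4 -> 16 -> 32 -> 64 -> 128, i.e. quadrupling while below max/16 and
 * doubling thereafter until the maximum is reached.
 */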

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window.  Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state.  So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator.  The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads.  Note that the readahead algorithm checks loosely
 * for sequential patterns.  Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial
 * size based on the I/O request size and max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

/*
 * Count contiguously cached pages from @index-1 to @index-@max.  This count
 * is a conservative estimation of
 *	- length of the sequential read sequence, or
 *	- thrashing threshold in memory tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
				   pgoff_t index, unsigned long max)
{
	pgoff_t head;

	rcu_read_lock();
	head = page_cache_prev_miss(mapping, index - 1, max);
	rcu_read_unlock();

	return index - 1 - head;
}

/*
 * page cache context based read-ahead
 */
static int try_context_readahead(struct address_space *mapping,
				 struct file_ra_state *ra,
				 pgoff_t index,
				 unsigned long req_size,
				 unsigned long max)
{
	pgoff_t size;

	size = count_history_pages(mapping, index, max);

	/*
	 * not enough history pages:
	 * it could be a random read
	 */
	if (size <= req_size)
		return 0;

	/*
	 * starts from beginning of file:
	 * it is a strong indication of long-run stream (or whole-file-read)
	 */
	if (size >= index)
		size *= 2;

	ra->start = index;
	ra->size = min(size + req_size, max);
	ra->async_size = 1;

	return 1;
}

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct readahead_control *ractl,
		struct file_ra_state *ra, bool hit_readahead_marker,
		unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	unsigned long index = readahead_index(ractl);
	pgoff_t prev_index;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

	trace_android_vh_ra_tuning_max_page(ractl, &max_pages);

	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;

	/*
	 * It's the expected callback index, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if ((index == (ra->start + ra->size - ra->async_size) ||
	     index == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals the
	 * readahead size.  Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_miss(ractl->mapping, index + 1,
				max_pages);
		rcu_read_unlock();

		if (!start || start - index > max_pages)
			return;

		ra->start = start;
		ra->size = start - index;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max_pages)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (index - prev_index) == 1
	 * unaligned reads: (index - prev_index) == 0
	 */
	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
	if (index - prev_index <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces (cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(ractl->mapping, ra, index, req_size,
			max_pages))
		goto readit;

	/*
	 * standalone, small random read
	 * Read as-is, and do not pollute the readahead state.
	 */
	do_page_cache_ra(ractl, req_size, 0);
	return;

initial_readahead:
	ra->start = index;
	ra->size = get_init_ra_size(req_size, max_pages);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulting next readahead window into the current one.
	 * Take care of maximum IO pages as above.
	 */
	if (index == ra->start && ra->size == ra->async_size) {
		add_pages = get_next_ra_size(ra, max_pages);
		if (ra->size + add_pages <= max_pages) {
			ra->async_size = add_pages;
			ra->size += add_pages;
		} else {
			ra->size = max_pages;
			ra->async_size = max_pages >> 1;
		}
	}

	ractl->_index = ra->start;
	do_page_cache_ra(ractl, ra->size, ra->async_size);
}

void page_cache_sync_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, unsigned long req_count)
{
	bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);

	/*
	 * Even if read-ahead is disabled, issue this request as read-ahead
	 * as we'll need it to satisfy the requested range.  The forced
	 * read-ahead will do the right thing and limit the read to just the
	 * requested range, which we'll set to 1 page for this case.
	 */
	if (!ra->ra_pages || blk_cgroup_congested()) {
		if (!ractl->file)
			return;
		req_count = 1;
		do_forced_ra = true;
	}

	/* be dumb */
	if (do_forced_ra) {
		force_page_cache_ra(ractl, ra, req_count);
		return;
	}

	/* do read-ahead */
	ondemand_readahead(ractl, ra, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
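
/*
 * For reference, a sketch of how callers typically reach this function:
 * the page_cache_sync_readahead() wrapper (declared in linux/pagemap.h in
 * kernels of this vintage) builds a readahead_control on the stack and
 * calls page_cache_sync_ra().  Roughly:
 *
 *	DEFINE_READAHEAD(ractl, file, mapping, index);
 *	page_cache_sync_ra(&ractl, ra, req_count);
 */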

void page_cache_async_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, struct page *page,
		unsigned long req_count)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return;

	/*
	 * Same bit is used for PG_readahead and PG_reclaim.
	 */
	if (PageWriteback(page))
		return;

	ClearPageReadahead(page);

	/*
	 * Defer asynchronous read-ahead on IO congestion.
	 */
	if (inode_read_congested(ractl->mapping->host))
		return;

	if (blk_cgroup_congested())
		return;

	/* do read-ahead */
	ondemand_readahead(ractl, ra, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);

ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
	ssize_t ret;
	struct fd f;

	ret = -EBADF;
	f = fdget(fd);
	if (!f.file || !(f.file->f_mode & FMODE_READ))
		goto out;

	/*
	 * The readahead() syscall is intended to run only on files
	 * that can execute readahead.  If readahead is not possible
	 * on this file, then we must return -EINVAL.
	 */
	ret = -EINVAL;
	if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
	    !S_ISREG(file_inode(f.file)->i_mode))
		goto out;

	ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
out:
	fdput(f);
	return ret;
}

SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
	return ksys_readahead(fd, offset, count);
}