Lines Matching +full:fine +full:- +full:granular

1 // SPDX-License-Identifier: GPL-2.0-only
50 #include <linux/backing-dev.h>
51 #include <linux/fault-inject.h>
52 #include <linux/page-isolation.h>
84 /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
93 * reporting it and marking it "reported" - it only skips notifying
102 * page shuffling (relevant code - e.g., memory onlining - is expected to
105 * Note: No code should rely on this flag for correctness - it's purely
113 * Don't poison memory with KASAN (only for the tag-based modes).
114 * During boot, all non-reserved memblock memory is exposed to page_alloc.
117 * This is only done for the tag-based KASAN modes, as those are able to
123 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
145 /* work_structs for global per-cpu drains */
211 * other index - this ensures that it will be put on the correct CMA freelist.
215 return page->index; in get_pcppage_migratetype()
220 page->index = migratetype; in set_pcppage_migratetype()
270 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
271 * 1G machine -> (16M dma, 784M normal, 224M high)
276 * TBD: should special case ZONE_DMA32 machines here - in those we normally
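The 1G example above pairs with the ratio arithmetic used later in setup_per_zone_lowmem_reserve() (zone->lowmem_reserve[j] = managed_pages / ratio, line 8084 below). A minimal userspace sketch of that calculation, assuming 4 KiB pages and a lowmem_reserve_ratio of 256 for ZONE_DMA; all numbers are illustrative, not taken from this file:

#include <stdio.h>

int main(void)
{
        unsigned long page_kib = 4;                             /* assumed 4 KiB pages */
        unsigned long normal_pages = 784UL * 1024 / page_kib;   /* the "784M normal" zone */
        unsigned long dma_ratio = 256;                          /* assumed default ratio for ZONE_DMA */

        /* ZONE_DMA pages effectively held back from allocations that could
         * have been satisfied from ZONE_NORMAL instead. */
        unsigned long reserve = normal_pages / dma_ratio;

        printf("ZONE_DMA holds back %lu pages (%lu KiB)\n",
               reserve, reserve * page_kib);
        return 0;
}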
340 int user_min_free_kbytes = -1;
347 * many cases very high-order allocations like THP are likely to be
348 * unsupported and the premature reclaim offsets the advantage of long-term
392 * During boot we initialize deferred pages on-demand, as needed, but once
408 * on-demand allocation and then freed again before the deferred pages
424 if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) in early_page_uninitialised()
448 /* Always populate low zones for address-constrained allocations */ in defer_init()
452 if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) in defer_init()
460 (pfn & (PAGES_PER_SECTION - 1)) == 0) { in defer_init()
461 NODE_DATA(nid)->first_deferred_pfn = pfn; in defer_init()
492 return page_zone(page)->pageblock_flags; in get_pageblock_bitmap()
499 pfn &= (PAGES_PER_SECTION-1); in pfn_to_bitidx()
501 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); in pfn_to_bitidx()
507 …* get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block o…
526 bitidx &= (BITS_PER_LONG-1); in __get_pfnblock_flags_mask()
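A minimal userspace sketch of the indexing performed by pfn_to_bitidx() and __get_pfnblock_flags_mask() above: the pfn is reduced to an offset inside its section, turned into a pageblock index, scaled by the number of flag bits per pageblock, then split into a word index plus a bit offset. pageblock_order = 9, 4 flag bits per block and a 32768-page section are assumptions for illustration only:

#include <stdio.h>

#define BITS_PER_LONG           (8 * sizeof(unsigned long))
#define PAGEBLOCK_ORDER         9                       /* assumed */
#define NR_PAGEBLOCK_BITS       4                       /* assumed */
#define PAGES_PER_SECTION       (1UL << 15)             /* assumed SPARSEMEM section size */

static unsigned long get_flags(const unsigned long *bitmap, unsigned long pfn,
                               unsigned long mask)
{
        unsigned long bitidx, word_bitidx, word;

        pfn &= PAGES_PER_SECTION - 1;                   /* offset inside the section */
        bitidx = (pfn >> PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
        word_bitidx = bitidx / BITS_PER_LONG;           /* which long in the bitmap */
        bitidx &= BITS_PER_LONG - 1;                    /* bit offset inside that long */

        word = bitmap[word_bitidx];
        return (word >> bitidx) & mask;
}

int main(void)
{
        unsigned long bitmap[64] = { 0 };

        bitmap[0] = 0x2UL << 4;         /* pretend pageblock 1 has migratetype 2 */
        printf("flags for pfn 0x%x: %lu\n", 0x200, get_flags(bitmap, 0x200, 0x7));
        return 0;
}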
544 return -EINVAL; in isolate_anon_lru_page()
547 return -EINVAL; in isolate_anon_lru_page()
562 …* set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pag…
582 bitidx &= (BITS_PER_LONG-1); in set_pfnblock_flags_mask()
618 start_pfn = zone->zone_start_pfn; in page_outside_zone_boundaries()
619 sp = zone->spanned_pages; in page_outside_zone_boundaries()
625 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", in page_outside_zone_boundaries()
626 pfn, zone_to_nid(zone), zone->name, in page_outside_zone_boundaries()
687 current->comm, page_to_pfn(page)); in bad_page()
700 * Higher-order pages are called "compound pages". They are structured thusly:
705 * in bit 0 of page->compound_head. The rest of the bits is a pointer to the head page.
707 * The first tail page's ->compound_dtor holds the offset in array of compound
710 * The first tail page's ->compound_order holds the order of allocation.
711 * This usage means that zero-order pages may not be compound.
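The bit-0 encoding described above can be sketched in a few lines of userspace C; struct fake_page and the pointer arithmetic are stand-ins for illustration, not the kernel's struct page layout:

#include <stdio.h>
#include <stdint.h>

struct fake_page {
        uintptr_t compound_head;        /* bit 0 set => tail page, rest = head pointer */
};

static struct fake_page *compound_head(struct fake_page *p)
{
        if (p->compound_head & 1)
                return (struct fake_page *)(p->compound_head - 1);
        return p;                       /* head (or non-compound) page */
}

int main(void)
{
        struct fake_page pages[4] = { { 0 }, { 0 }, { 0 }, { 0 } }; /* pretend order-2 compound page */
        int i;

        for (i = 1; i < 4; i++)         /* tail pages point at the head, tagged with bit 0 */
                pages[i].compound_head = (uintptr_t)&pages[0] | 1;

        printf("tail 3 resolves to head %p (expected %p)\n",
               (void *)compound_head(&pages[3]), (void *)&pages[0]);
        return 0;
}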
729 p->mapping = TAIL_MAPPING; in prep_compound_page()
735 atomic_set(compound_mapcount_ptr(page), -1); in prep_compound_page()
781 INIT_LIST_HEAD(&page->lru); in set_page_guard()
784 __mod_zone_freepage_state(zone, -(1 << order), migratetype); in set_page_guard()
833 pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " in init_mem_debugging_and_hardening()
840 pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " in init_mem_debugging_and_hardening()
874 * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
902 struct capture_control *capc = current->capture_control; in task_capc()
905 !(current->flags & PF_KTHREAD) && in task_capc()
906 !capc->page && in task_capc()
907 capc->cc->zone == zone ? capc : NULL; in task_capc()
914 if (!capc || order != capc->cc->order) in compaction_capture()
925 * and vice-versa but no more than normal fallback logic which can in compaction_capture()
926 * have trouble finding a high-order free page. in compaction_capture()
931 capc->page = page; in compaction_capture()
953 struct free_area *area = &zone->free_area[order]; in add_to_free_list()
955 list_add(&page->lru, &area->free_list[migratetype]); in add_to_free_list()
956 area->nr_free++; in add_to_free_list()
963 struct free_area *area = &zone->free_area[order]; in add_to_free_list_tail()
965 list_add_tail(&page->lru, &area->free_list[migratetype]); in add_to_free_list_tail()
966 area->nr_free++; in add_to_free_list_tail()
971 * of the list - so the moved pages won't immediately be considered for
977 struct free_area *area = &zone->free_area[order]; in move_to_free_list()
979 list_move_tail(&page->lru, &area->free_list[migratetype]); in move_to_free_list()
989 list_del(&page->lru); in del_page_from_free_list()
992 zone->free_area[order].nr_free--; in del_page_from_free_list()
997 * of the next-highest order is free. If it is, it's possible
1010 if (order >= MAX_ORDER - 2) in buddy_merge_likely()
1017 higher_page = page + (combined_pfn - pfn); in buddy_merge_likely()
1019 higher_buddy = higher_page + (buddy_pfn - combined_pfn); in buddy_merge_likely()
1028 * The concept of a buddy system is to maintain direct-mapped table
1046 * -- nyc
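The "direct-mapped table" remark above boils down to a single XOR: a block's buddy at a given order is found by flipping bit 'order' of its pfn, and the merged block begins at the common prefix of the two pfns (pfn & buddy_pfn). A small sketch of that arithmetic with an arbitrary pfn:

#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
        return pfn ^ (1UL << order);            /* flip bit 'order' */
}

int main(void)
{
        unsigned long pfn = 0x1234;             /* hypothetical order-2 block */
        unsigned int order = 2;
        unsigned long buddy = find_buddy_pfn(pfn, order);
        unsigned long combined = pfn & buddy;   /* pfn of the merged order-3 block */

        printf("pfn 0x%lx order %u -> buddy 0x%lx, merged 0x%lx\n",
               pfn, order, buddy, combined);
        return 0;
}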
1061 max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); in __free_one_page()
1064 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); in __free_one_page()
1066 VM_BUG_ON(migratetype == -1); in __free_one_page()
1070 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); in __free_one_page()
1076 __mod_zone_freepage_state(zone, -(1 << order), in __free_one_page()
1081 buddy = page + (buddy_pfn - pfn); in __free_one_page()
1096 page = page + (combined_pfn - pfn); in __free_one_page()
1100 if (order < MAX_ORDER - 1) { in __free_one_page()
1107 * low-order merging. in __free_one_page()
1113 buddy = page + (buddy_pfn - pfn); in __free_one_page()
1153 if (unlikely(atomic_read(&page->_mapcount) != -1)) in page_expected_state()
1156 if (unlikely((unsigned long)page->mapping | in page_expected_state()
1159 (unsigned long)page->mem_cgroup | in page_expected_state()
1161 (page->flags & check_flags))) in page_expected_state()
1171 if (unlikely(atomic_read(&page->_mapcount) != -1)) in page_bad_reason()
1173 if (unlikely(page->mapping != NULL)) in page_bad_reason()
1174 bad_reason = "non-NULL mapping"; in page_bad_reason()
1177 if (unlikely(page->flags & flags)) { in page_bad_reason()
1184 if (unlikely(page->mem_cgroup)) in page_bad_reason()
1211 * We rely on page->lru.next never having bit 0 set, unless the page in free_tail_pages_check()
1212 * is PageTail(). Let's make sure that's true even for poisoned ->lru. in free_tail_pages_check()
1220 switch (page - head_page) { in free_tail_pages_check()
1222 /* the first tail page: ->mapping may be compound_mapcount() */ in free_tail_pages_check()
1230 * the second tail page: ->mapping is in free_tail_pages_check()
1231 * deferred_list.next -- ignore value. in free_tail_pages_check()
1235 if (page->mapping != TAIL_MAPPING) { in free_tail_pages_check()
1251 page->mapping = NULL; in free_tail_pages_check()
1301 * avoid checking PageCompound for order-0 pages. in free_pages_prepare()
1318 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; in free_pages_prepare()
1322 page->mapping = NULL; in free_pages_prepare()
1331 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; in free_pages_prepare()
1349 * With hardware tag-based KASAN, memory tags must be set before the in free_pages_prepare()
1378 * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
1396 * With DEBUG_VM disabled, order-0 pages being freed are checked only when
1419 struct page *buddy = page + (buddy_pfn - pfn); in prefetch_buddy()
1449 count = min(pcp->count, count); in free_pcppages_bulk()
1454 * Remove pages from lists in a round-robin fashion. A in free_pcppages_bulk()
1464 list = &pcp->lists[migratetype]; in free_pcppages_bulk()
1467 /* This is the only non-empty list. Free them all. */ in free_pcppages_bulk()
1474 list_del(&page->lru); in free_pcppages_bulk()
1475 pcp->count--; in free_pcppages_bulk()
1480 list_add_tail(&page->lru, &head); in free_pcppages_bulk()
1485 * under zone->lock. It is believed the overhead of in free_pcppages_bulk()
1489 * prefetch buddy for the first pcp->batch nr of pages. in free_pcppages_bulk()
1491 if (prefetch_nr++ < pcp->batch) in free_pcppages_bulk()
1493 } while (--count && --batch_free && !list_empty(list)); in free_pcppages_bulk()
1496 spin_lock(&zone->lock); in free_pcppages_bulk()
1501 * page->lru.next will not point to original list. in free_pcppages_bulk()
1514 spin_unlock(&zone->lock); in free_pcppages_bulk()
1522 spin_lock(&zone->lock); in free_one_page()
1528 spin_unlock(&zone->lock); in free_one_page()
1547 INIT_LIST_HEAD(&page->lru); in __init_single_page()
1568 struct zone *zone = &pgdat->node_zones[zid]; in init_reserved_page()
1570 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) in init_reserved_page()
1598 /* Avoid false-positive PageTail() */ in reserve_bootmem_region()
1599 INIT_LIST_HEAD(&page->lru); in reserve_bootmem_region()
1641 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { in __free_pages_core()
1649 atomic_long_add(nr_pages, &page_zone(page)->managed_pages); in __free_pages_core()
1673 if (state->last_start <= pfn && pfn < state->last_end) in __early_pfn_to_nid()
1674 return state->last_nid; in __early_pfn_to_nid()
1678 state->last_start = start_pfn; in __early_pfn_to_nid()
1679 state->last_end = end_pfn; in __early_pfn_to_nid()
1680 state->last_nid = nid; in __early_pfn_to_nid()
1734 end_pfn--; in __pageblock_pfn_to_page()
1757 unsigned long block_start_pfn = zone->zone_start_pfn; in set_zone_contiguous()
1774 zone->contiguous = true; in set_zone_contiguous()
1779 zone->contiguous = false; in clear_zone_contiguous()
1794 /* Free a large naturally-aligned chunk if possible */ in deferred_free_range()
1796 (pfn & (pageblock_nr_pages - 1)) == 0) { in deferred_free_range()
1803 if ((pfn & (pageblock_nr_pages - 1)) == 0) in deferred_free_range()
1833 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) in deferred_pfn_valid()
1845 unsigned long nr_pgmask = pageblock_nr_pages - 1; in deferred_free_pages()
1850 deferred_free_range(pfn - nr_free, nr_free); in deferred_free_pages()
1853 deferred_free_range(pfn - nr_free, nr_free); in deferred_free_pages()
1860 deferred_free_range(pfn - nr_free, nr_free); in deferred_free_pages()
1872 unsigned long nr_pgmask = pageblock_nr_pages - 1; in deferred_init_pages()
1894 * This function is meant to pre-load the iterator for the zone init.
2008 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); in deferred_init_memmap()
2021 first_init_pfn = pgdat->first_deferred_pfn; in deferred_init_memmap()
2029 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); in deferred_init_memmap()
2030 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); in deferred_init_memmap()
2031 pgdat->first_deferred_pfn = ULONG_MAX; in deferred_init_memmap()
2036 * pre-grown prior to start of deferred page initialization. in deferred_init_memmap()
2042 zone = pgdat->node_zones + zid; in deferred_init_memmap()
2060 .size = epfn_align - spfn, in deferred_init_memmap()
2075 pgdat->node_id, jiffies_to_msecs(jiffies - start)); in deferred_init_memmap()
2100 pg_data_t *pgdat = zone->zone_pgdat; in deferred_grow_zone()
2101 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; in deferred_grow_zone()
2116 if (first_deferred_pfn != pgdat->first_deferred_pfn) { in deferred_grow_zone()
2124 pgdat->first_deferred_pfn = ULONG_MAX; in deferred_grow_zone()
2151 pgdat->first_deferred_pfn = spfn; in deferred_grow_zone()
2197 * on-demand struct page initialization. in page_alloc_init_late()
2225 } while (++p, --i); in init_cma_reserved_pageblock()
2234 __free_pages(p, MAX_ORDER - 1); in init_cma_reserved_pageblock()
2236 } while (i -= MAX_ORDER_NR_PAGES); in init_cma_reserved_pageblock()
2243 page_zone(page)->cma_pages += pageblock_nr_pages; in init_cma_reserved_pageblock()
2259 * -- nyc
2267 high--; in expand()
2287 if (unlikely(page->flags & __PG_HWPOISON)) { in check_new_page_bad()
2312 * With DEBUG_VM enabled, order-0 pages are checked for expected state when
2330 * With DEBUG_VM disabled, free order-0 pages are checked for expected state
2430 area = &(zone->free_area[current_order]); in __rmqueue_smallest()
2527 start_pfn = start_pfn & ~(pageblock_nr_pages-1); in move_freepages_block()
2529 end_page = start_page + pageblock_nr_pages - 1; in move_freepages_block()
2530 end_pfn = start_pfn + pageblock_nr_pages - 1; in move_freepages_block()
2545 int nr_pageblocks = 1 << (start_order - pageblock_order); in change_pageblock_range()
2547 while (nr_pageblocks--) { in change_pageblock_range()
2601 max_boost = mult_frac(zone->_watermark[WMARK_HIGH], in boost_watermark()
2617 zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, in boost_watermark()
2626 * pageblock to our migratetype and determine how many already-allocated pages
2659 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); in steal_suitable_fallback()
2677 * to MOVABLE pageblock, consider all non-movable pages as in steal_suitable_fallback()
2680 * exact migratetype of non-movable pages. in steal_suitable_fallback()
2684 - (free_pages + movable_pages); in steal_suitable_fallback()
2697 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || in steal_suitable_fallback()
2719 if (area->nr_free == 0) in find_suitable_fallback()
2720 return -1; in find_suitable_fallback()
2741 return -1; in find_suitable_fallback()
2745 * Reserve a pageblock for exclusive use of high-order atomic allocations if
2756 * Check is race-prone but harmless. in reserve_highatomic_pageblock()
2759 if (zone->nr_reserved_highatomic >= max_managed) in reserve_highatomic_pageblock()
2762 spin_lock_irqsave(&zone->lock, flags); in reserve_highatomic_pageblock()
2765 if (zone->nr_reserved_highatomic >= max_managed) in reserve_highatomic_pageblock()
2772 zone->nr_reserved_highatomic += pageblock_nr_pages; in reserve_highatomic_pageblock()
2778 spin_unlock_irqrestore(&zone->lock, flags); in reserve_highatomic_pageblock()
2783 * potentially hurts the reliability of high-order allocations when under
2793 struct zonelist *zonelist = ac->zonelist; in unreserve_highatomic_pageblock()
2801 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, in unreserve_highatomic_pageblock()
2802 ac->nodemask) { in unreserve_highatomic_pageblock()
2807 if (!force && zone->nr_reserved_highatomic <= in unreserve_highatomic_pageblock()
2811 spin_lock_irqsave(&zone->lock, flags); in unreserve_highatomic_pageblock()
2813 struct free_area *area = &(zone->free_area[order]); in unreserve_highatomic_pageblock()
2823 * from highatomic to ac->migratetype. So we should in unreserve_highatomic_pageblock()
2829 * locking could inadvertently allow a per-cpu in unreserve_highatomic_pageblock()
2834 zone->nr_reserved_highatomic -= min( in unreserve_highatomic_pageblock()
2836 zone->nr_reserved_highatomic); in unreserve_highatomic_pageblock()
2840 * Convert to ac->migratetype and avoid the normal in unreserve_highatomic_pageblock()
2848 set_pageblock_migratetype(page, ac->migratetype); in unreserve_highatomic_pageblock()
2849 ret = move_freepages_block(zone, page, ac->migratetype, in unreserve_highatomic_pageblock()
2852 spin_unlock_irqrestore(&zone->lock, flags); in unreserve_highatomic_pageblock()
2856 spin_unlock_irqrestore(&zone->lock, flags); in unreserve_highatomic_pageblock()
2896 for (current_order = MAX_ORDER - 1; current_order >= min_order; in __rmqueue_fallback()
2897 --current_order) { in __rmqueue_fallback()
2898 area = &(zone->free_area[current_order]); in __rmqueue_fallback()
2901 if (fallback_mt == -1) in __rmqueue_fallback()
2924 area = &(zone->free_area[current_order]); in __rmqueue_fallback()
2927 if (fallback_mt != -1) in __rmqueue_fallback()
2932 * This should not happen - we already found a suitable fallback in __rmqueue_fallback()
2952 * Call me with the zone->lock already held.
3000 spin_lock(&zone->lock); in rmqueue_bulk()
3026 list_add_tail(&page->lru, list); in rmqueue_bulk()
3030 -(1 << order)); in rmqueue_bulk()
3039 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); in rmqueue_bulk()
3040 spin_unlock(&zone->lock); in rmqueue_bulk()
3053 struct list_head *list = &pcp->lists[migratetype]; in get_populated_pcp_list()
3056 pcp->count += rmqueue_bulk(zone, order, in get_populated_pcp_list()
3057 pcp->batch, list, in get_populated_pcp_list()
3081 batch = READ_ONCE(pcp->batch); in drain_zone_pages()
3082 to_drain = min(pcp->count, batch); in drain_zone_pages()
3103 pset = per_cpu_ptr(zone->pageset, cpu); in drain_pages_zone()
3105 pcp = &pset->pcp; in drain_pages_zone()
3106 if (pcp->count) in drain_pages_zone()
3107 free_pcppages_bulk(zone, pcp->count, pcp); in drain_pages_zone()
3128 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
3130 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
3157 drain_local_pages(drain->zone); in drain_local_pages_wq()
3162 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
3164 * When zone parameter is non-NULL, spill just the single zone's pages.
3208 pcp = per_cpu_ptr(zone->pageset, cpu); in drain_all_pages()
3209 if (pcp->pcp.count) in drain_all_pages()
3213 pcp = per_cpu_ptr(z->pageset, cpu); in drain_all_pages()
3214 if (pcp->pcp.count) { in drain_all_pages()
3230 drain->zone = zone; in drain_all_pages()
3231 INIT_WORK(&drain->work, drain_local_pages_wq); in drain_all_pages()
3232 queue_work_on(cpu, mm_percpu_wq, &drain->work); in drain_all_pages()
3235 flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); in drain_all_pages()
3257 spin_lock_irqsave(&zone->lock, flags); in mark_free_pages()
3260 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) in mark_free_pages()
3264 if (!--page_count) { in mark_free_pages()
3278 &zone->free_area[order].free_list[t], lru) { in mark_free_pages()
3283 if (!--page_count) { in mark_free_pages()
3291 spin_unlock_irqrestore(&zone->lock, flags); in mark_free_pages()
3336 pcp = &this_cpu_ptr(zone->pageset)->pcp; in free_unref_page_commit()
3337 list_add(&page->lru, &pcp->lists[migratetype]); in free_unref_page_commit()
3338 pcp->count++; in free_unref_page_commit()
3339 if (pcp->count >= pcp->high) { in free_unref_page_commit()
3340 unsigned long batch = READ_ONCE(pcp->batch); in free_unref_page_commit()
3346 * Free a 0-order page
3362 * Free a list of 0-order pages
3374 list_del(&page->lru); in free_unref_page_list()
3400 * split_page takes a non-compound higher-order page, and splits it into
3401 * n (1<<order) sub-pages: page[0..n]
3402 * Each sub-page must be freed individually.
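A kernel-context sketch of the contract spelled out above (illustrative code for a module, not taken from this file): after split_page(), every one of the 1 << order sub-pages is an independent order-0 page and must be freed on its own:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static int split_example(void)
{
        unsigned int order = 2;
        struct page *page = alloc_pages(GFP_KERNEL, order);
        int i;

        if (!page)
                return -ENOMEM;

        split_page(page, order);                /* page[0..3] are now plain order-0 pages */

        for (i = 0; i < (1 << order); i++)
                __free_page(page + i);          /* each sub-page freed individually */
        return 0;
}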
3435 * emulate a high-order watermark check with a raised order-0 in __isolate_free_page()
3436 * watermark, because we already know our high-order page in __isolate_free_page()
3439 watermark = zone->_watermark[WMARK_MIN] + (1UL << order); in __isolate_free_page()
3443 __mod_zone_freepage_state(zone, -(1UL << order), mt); in __isolate_free_page()
3454 if (order >= pageblock_order - 1) { in __isolate_free_page()
3455 struct page *endpage = page + (1 << order) - 1; in __isolate_free_page()
3470 * __putback_isolated_page - Return a now-isolated page back where we got it
3483 lockdep_assert_held(&zone->lock); in __putback_isolated_page()
3517 /* Remove page from the per-cpu list, caller must protect the list */
3547 list_del(&page->lru); in __rmqueue_pcplist()
3548 pcp->count--; in __rmqueue_pcplist()
3554 /* Lock and remove page from the per-cpu list */
3564 pcp = &this_cpu_ptr(zone->pageset)->pcp; in rmqueue_pcplist()
3576 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
3595 * allocate greater than order-1 page units with __GFP_NOFAIL. in rmqueue()
3598 spin_lock_irqsave(&zone->lock, flags); in rmqueue()
3603 * order-0 request can reach here when the pcplist is skipped in rmqueue()
3604 * due to non-CMA allocation context. HIGHATOMIC area is in rmqueue()
3605 * reserved for high-order atomic allocation, so order-0 in rmqueue()
3623 spin_unlock(&zone->lock); in rmqueue()
3626 __mod_zone_freepage_state(zone, -(1 << order), in rmqueue()
3637 if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { in rmqueue()
3638 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); in rmqueue()
3696 debugfs_create_bool("ignore-gfp-wait", mode, dir, in fail_page_alloc_debugfs()
3698 debugfs_create_bool("ignore-gfp-highmem", mode, dir, in fail_page_alloc_debugfs()
3700 debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); in fail_page_alloc_debugfs()
3728 long unusable_free = (1 << order) - 1; in __zone_watermark_unusable_free()
3732 * the high-atomic reserves. This will over-estimate the size of the in __zone_watermark_unusable_free()
3736 unusable_free += z->nr_reserved_highatomic; in __zone_watermark_unusable_free()
3748 * Return true if free base pages are above 'mark'. For high-order checks it
3749 * will return true if the order-0 watermark is reached and there is at least
3761 /* free_pages may go negative - that's OK */ in __zone_watermark_ok()
3762 free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); in __zone_watermark_ok()
3765 min -= min / 2; in __zone_watermark_ok()
3772 * makes during the free path will be small and short-lived. in __zone_watermark_ok()
3775 min -= min / 2; in __zone_watermark_ok()
3777 min -= min / 4; in __zone_watermark_ok()
3781 * Check watermarks for an order-0 allocation request. If these in __zone_watermark_ok()
3782 * are not met, then a high-order request also cannot go ahead in __zone_watermark_ok()
3785 if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) in __zone_watermark_ok()
3788 /* If this is an order-0 request then the watermark is fine */ in __zone_watermark_ok()
3792 /* For a high-order request, check at least one suitable page is free */ in __zone_watermark_ok()
3794 struct free_area *area = &z->free_area[o]; in __zone_watermark_ok()
3797 if (!area->nr_free) in __zone_watermark_ok()
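A userspace sketch of the order-0 arithmetic in __zone_watermark_ok() as shown above: the unusable (1 << order) - 1 pages are subtracted, ALLOC_HIGH and ALLOC_HARDER each shave the effective minimum, and the lowmem reserve for the requested zone index is added on top. The zone numbers are invented, and the ALLOC_OOM variant is left out:

#include <stdbool.h>
#include <stdio.h>

static bool watermark_ok(long free_pages, long mark, long lowmem_reserve,
                         unsigned int order, bool alloc_high, bool alloc_harder)
{
        long min = mark;

        free_pages -= (1L << order) - 1;        /* pages unusable for this request */
        if (alloc_high)
                min -= min / 2;                 /* __GFP_HIGH may dig deeper */
        if (alloc_harder)
                min -= min / 4;                 /* atomic callers dig a little more */

        return free_pages > min + lowmem_reserve;
}

int main(void)
{
        /* Hypothetical zone: WMARK_MIN of 800 pages, lowmem reserve of 100. */
        printf("normal order-0, 850 free: %d\n",
               watermark_ok(850, 800, 100, 0, false, false));
        printf("atomic order-0, 850 free: %d\n",
               watermark_ok(850, 800, 100, 0, true, true));
        return 0;
}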
3842 * Fast check for order-0 only. If this fails then the reserves in zone_watermark_fast()
3852 /* reserved may overestimate high-atomic reserves. */ in zone_watermark_fast()
3853 usable_free -= min(usable_free, reserved); in zone_watermark_fast()
3854 if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) in zone_watermark_fast()
3862 * Ignore watermark boosting for GFP_ATOMIC order-0 allocations in zone_watermark_fast()
3867 if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost in zone_watermark_fast()
3869 mark = z->_watermark[WMARK_MIN]; in zone_watermark_fast()
3882 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) in zone_watermark_ok_safe()
3931 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume in alloc_flags_nofragment()
3934 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); in alloc_flags_nofragment()
3935 if (nr_online_nodes > 1 && !populated_zone(--zone)) in alloc_flags_nofragment()
3947 unsigned int pflags = current->flags; in current_alloc_flags()
3977 z = ac->preferred_zoneref; in get_page_from_freelist()
3978 for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, in get_page_from_freelist()
3979 ac->nodemask) { in get_page_from_freelist()
3998 * exceed the per-node dirty limit in the slowpath in get_page_from_freelist()
4004 * dirty-throttling and the flusher threads. in get_page_from_freelist()
4006 if (ac->spread_dirty_pages) { in get_page_from_freelist()
4007 if (last_pgdat_dirty_limit == zone->zone_pgdat) in get_page_from_freelist()
4010 if (!node_dirty_ok(zone->zone_pgdat)) { in get_page_from_freelist()
4011 last_pgdat_dirty_limit = zone->zone_pgdat; in get_page_from_freelist()
4017 zone != ac->preferred_zoneref->zone) { in get_page_from_freelist()
4025 local_nid = zone_to_nid(ac->preferred_zoneref->zone); in get_page_from_freelist()
4034 ac->highest_zoneidx, alloc_flags, in get_page_from_freelist()
4054 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) in get_page_from_freelist()
4057 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); in get_page_from_freelist()
4068 ac->highest_zoneidx, alloc_flags)) in get_page_from_freelist()
4076 page = rmqueue(ac->preferred_zoneref->zone, zone, order, in get_page_from_freelist()
4077 gfp_mask, alloc_flags, ac->migratetype); in get_page_from_freelist()
4082 * If this is a high-order atomic allocation then check in get_page_from_freelist()
4123 (current->flags & (PF_MEMALLOC | PF_EXITING))) in warn_alloc_show_mem()
4146 current->comm, &vaf, gfp_mask, &gfp_mask, in warn_alloc()
4181 .zonelist = ac->zonelist, in __alloc_pages_may_oom()
4182 .nodemask = ac->nodemask, in __alloc_pages_may_oom()
4215 if (current->flags & PF_DUMPCORE) in __alloc_pages_may_oom()
4231 if (ac->highest_zoneidx < ZONE_NORMAL) in __alloc_pages_may_oom()
4250 * Help non-failing allocations by giving them access to memory in __alloc_pages_may_oom()
4269 /* Try memory compaction for high-order allocations before reclaim */
4308 zone->compact_blockskip_flush = false; in __alloc_pages_direct_compact()
4352 * compaction was skipped because there are not enough order-0 pages in should_compact_retry()
4394 (*compact_priority)--; in should_compact_retry()
4427 * Let's give them some hope and keep retrying while the order-0 in should_compact_retry()
4430 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, in should_compact_retry()
4431 ac->highest_zoneidx, ac->nodemask) { in should_compact_retry()
4433 ac->highest_zoneidx, alloc_flags)) in should_compact_retry()
4453 if (current->flags & PF_MEMALLOC) in __need_fs_reclaim()
4530 progress = try_to_free_pages(ac->zonelist, order, gfp_mask, in __perform_reclaim()
4531 ac->nodemask); in __perform_reclaim()
4562 * pages are pinned on the per-cpu lists or in high alloc reserves. in __alloc_pages_direct_reclaim()
4568 alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain); in __alloc_pages_direct_reclaim()
4586 enum zone_type highest_zoneidx = ac->highest_zoneidx; in wake_all_kswapds()
4588 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, in wake_all_kswapds()
4589 ac->nodemask) { in wake_all_kswapds()
4590 if (last_pgdat != zone->zone_pgdat) in wake_all_kswapds()
4592 last_pgdat = zone->zone_pgdat; in wake_all_kswapds()
4663 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) in __gfp_pfmemalloc_flags()
4666 if (current->flags & PF_MEMALLOC) in __gfp_pfmemalloc_flags()
4724 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, in should_reclaim_retry()
4725 ac->highest_zoneidx, ac->nodemask) { in should_reclaim_retry()
4739 ac->highest_zoneidx, alloc_flags, available); in should_reclaim_retry()
4774 if (current->flags & PF_WQ_WORKER) in should_reclaim_retry()
4790 * This assumes that for all allocations, ac->nodemask can come only in check_retry_cpuset()
4795 if (cpusets_enabled() && ac->nodemask && in check_retry_cpuset()
4796 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { in check_retry_cpuset()
4797 ac->nodemask = NULL; in check_retry_cpuset()
4858 * there was a cpuset modification and we are retrying - otherwise we in __alloc_pages_slowpath()
4859 * could end up iterating over non-eligible zones endlessly. in __alloc_pages_slowpath()
4861 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, in __alloc_pages_slowpath()
4862 ac->highest_zoneidx, ac->nodemask); in __alloc_pages_slowpath()
4863 if (!ac->preferred_zoneref->zone) in __alloc_pages_slowpath()
4879 * that we have enough base pages and don't need to reclaim. For non- in __alloc_pages_slowpath()
4880 * movable high-order allocations, do that as well, as compaction will in __alloc_pages_slowpath()
4888 (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) in __alloc_pages_slowpath()
4910 * - potentially very expensive because zones are far in __alloc_pages_slowpath()
4913 * - not guaranteed to help because isolate_freepages() in __alloc_pages_slowpath()
4916 * - unlikely to make entire pageblocks free on its in __alloc_pages_slowpath()
4947 ac->nodemask = NULL; in __alloc_pages_slowpath()
4948 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, in __alloc_pages_slowpath()
4949 ac->highest_zoneidx, ac->nodemask); in __alloc_pages_slowpath()
4962 if (current->flags & PF_MEMALLOC) in __alloc_pages_slowpath()
4966 alloc_flags, ac->migratetype, &page); in __alloc_pages_slowpath()
4999 * It doesn't make any sense to retry for the compaction if the order-0 in __alloc_pages_slowpath()
5062 WARN_ON_ONCE(current->flags & PF_MEMALLOC); in __alloc_pages_slowpath()
5073 * Help non-failing allocations by giving them access to memory in __alloc_pages_slowpath()
5087 alloc_flags, ac->migratetype, &page); in __alloc_pages_slowpath()
5091 warn_alloc(gfp_mask, ac->nodemask, in __alloc_pages_slowpath()
5103 ac->highest_zoneidx = gfp_zone(gfp_mask); in prepare_alloc_pages()
5104 ac->zonelist = node_zonelist(preferred_nid, gfp_mask); in prepare_alloc_pages()
5105 ac->nodemask = nodemask; in prepare_alloc_pages()
5106 ac->migratetype = gfp_migratetype(gfp_mask); in prepare_alloc_pages()
5114 if (!in_interrupt() && !ac->nodemask) in prepare_alloc_pages()
5115 ac->nodemask = &cpuset_current_mems_allowed; in prepare_alloc_pages()
5131 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); in prepare_alloc_pages()
5138 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, in prepare_alloc_pages()
5139 ac->highest_zoneidx, ac->nodemask); in prepare_alloc_pages()
5174 alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); in __alloc_pages_nodemask()
5192 * &cpuset_current_mems_allowed to optimize the fast-path attempt. in __alloc_pages_nodemask()
5247 while (order-- > 0) in __free_pages()
5264 * An arbitrary-length arbitrary-offset area of memory which resides
5271 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
5284 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; in __page_frag_cache_refill()
5289 nc->va = page ? page_address(page) : NULL; in __page_frag_cache_refill()
5310 if (unlikely(!nc->va)) { in page_frag_alloc()
5318 size = nc->size; in page_frag_alloc()
5326 nc->pfmemalloc = page_is_pfmemalloc(page); in page_frag_alloc()
5327 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; in page_frag_alloc()
5328 nc->offset = size; in page_frag_alloc()
5331 offset = nc->offset - fragsz; in page_frag_alloc()
5333 page = virt_to_page(nc->va); in page_frag_alloc()
5335 if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) in page_frag_alloc()
5338 if (unlikely(nc->pfmemalloc)) { in page_frag_alloc()
5345 size = nc->size; in page_frag_alloc()
5351 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; in page_frag_alloc()
5352 offset = size - fragsz; in page_frag_alloc()
5367 nc->pagecnt_bias--; in page_frag_alloc()
5368 nc->offset = offset; in page_frag_alloc()
5370 return nc->va + offset; in page_frag_alloc()
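The fragment allocator above carves a page from the end towards the beginning: offset starts at the page size and each allocation simply moves it down by fragsz, with the refcount bias deciding when the page can be reused. A stripped-down userspace sketch in which refill and recycling are omitted:

#include <stdio.h>
#include <stdlib.h>

struct frag_cache {
        char *va;
        long size;
        long offset;            /* next fragment ends here; starts at size */
};

static void *frag_alloc(struct frag_cache *nc, long fragsz)
{
        long offset = nc->offset - fragsz;

        if (offset < 0)
                return NULL;    /* the real code would refill with a fresh page */
        nc->offset = offset;
        return nc->va + offset;
}

int main(void)
{
        struct frag_cache nc = { malloc(4096), 4096, 4096 };

        if (!nc.va)
                return 1;
        printf("frag A starts at offset %ld\n", (char *)frag_alloc(&nc, 256) - nc.va);
        printf("frag B starts at offset %ld\n", (char *)frag_alloc(&nc, 512) - nc.va);
        free(nc.va);
        return 0;
}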
5403 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
5409 * allocate memory in power-of-two pages.
5431 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
5457 * free_pages_exact - release memory allocated via alloc_pages_exact()
5476 * nr_free_zone_pages - count number of pages beyond high watermark
5483 * nr_free_zone_pages = managed_pages - high_pages
5501 sum += size - high; in nr_free_zone_pages()
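The identity quoted above (nr_free_zone_pages = managed_pages - high_pages) is just a per-zone sum; a toy version with invented zone sizes:

#include <stdio.h>

struct fake_zone { unsigned long managed, high_wmark; };

int main(void)
{
        struct fake_zone zones[] = {
                { 4096,    128 },       /* pretend ZONE_DMA    */
                { 262144, 2048 },       /* pretend ZONE_NORMAL */
        };
        unsigned long sum = 0;

        for (unsigned int i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
                unsigned long size = zones[i].managed;
                unsigned long high = zones[i].high_wmark;

                if (size > high)
                        sum += size - high;     /* pages "beyond the high watermark" */
        }
        printf("nr_free_zone_pages ~ %lu\n", sum);
        return 0;
}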
5508 * nr_free_buffer_pages - count number of pages beyond high watermark
5548 available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; in si_mem_available()
5556 pagecache -= min(pagecache / 2, wmark_low); in si_mem_available()
5566 available += reclaimable - min(reclaimable / 2, wmark_low); in si_mem_available()
5576 val->totalram = totalram_pages(); in si_meminfo()
5577 val->sharedram = global_node_page_state(NR_SHMEM); in si_meminfo()
5578 val->freeram = global_zone_page_state(NR_FREE_PAGES); in si_meminfo()
5579 val->bufferram = nr_blockdev_pages(); in si_meminfo()
5580 val->totalhigh = totalhigh_pages(); in si_meminfo()
5581 val->freehigh = nr_free_highpages(); in si_meminfo()
5582 val->mem_unit = PAGE_SIZE; in si_meminfo()
5597 managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); in si_meminfo_node()
5598 val->totalram = managed_pages; in si_meminfo_node()
5599 val->sharedram = node_page_state(pgdat, NR_SHMEM); in si_meminfo_node()
5600 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); in si_meminfo_node()
5603 struct zone *zone = &pgdat->node_zones[zone_type]; in si_meminfo_node()
5610 val->totalhigh = managed_highpages; in si_meminfo_node()
5611 val->freehigh = free_highpages; in si_meminfo_node()
5613 val->totalhigh = managed_highpages; in si_meminfo_node()
5614 val->freehigh = free_highpages; in si_meminfo_node()
5616 val->mem_unit = PAGE_SIZE; in si_meminfo_node()
5630 * no node mask - aka implicit memory numa policy. Do not bother with in show_mem_node_skip()
5631 * the synchronization - read_mems_allowed_begin - because we do not in show_mem_node_skip()
5640 #define K(x) ((x) << (PAGE_SHIFT-10))
5670 * Show free area list (used inside shift_scroll-lock stuff)
5690 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; in show_free_areas()
5720 if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) in show_free_areas()
5747 pgdat->node_id, in show_free_areas()
5770 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? in show_free_areas()
5782 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; in show_free_areas()
5807 zone->name, in show_free_areas()
5812 K(zone->nr_reserved_highatomic), in show_free_areas()
5819 K(zone->present_pages), in show_free_areas()
5825 K(this_cpu_read(zone->pageset->pcp.count)), in show_free_areas()
5829 printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); in show_free_areas()
5841 printk(KERN_CONT "%s: ", zone->name); in show_free_areas()
5843 spin_lock_irqsave(&zone->lock, flags); in show_free_areas()
5845 struct free_area *area = &zone->free_area[order]; in show_free_areas()
5848 nr[order] = area->nr_free; in show_free_areas()
5857 spin_unlock_irqrestore(&zone->lock, flags); in show_free_areas()
5876 zoneref->zone = zone; in zoneref_set_zone()
5877 zoneref->zone_idx = zone_idx(zone); in zoneref_set_zone()
5892 zone_type--; in build_zonerefs_node()
5893 zone = pgdat->node_zones + zone_type; in build_zonerefs_node()
5915 return -EINVAL; in __parse_numa_zonelist_order()
5938 * find_next_best_node - find the next node that should appear in a given node's fallback list
5999 * This results in maximum locality--normal zone overflows into local
6000 * DMA zone, if any--but risks exhausting DMA zone.
6008 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; in build_zonelists_in_node_order()
6018 zonerefs->zone = NULL; in build_zonelists_in_node_order()
6019 zonerefs->zone_idx = 0; in build_zonelists_in_node_order()
6030 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; in build_thisnode_zonelists()
6033 zonerefs->zone = NULL; in build_thisnode_zonelists()
6034 zonerefs->zone_idx = 0; in build_thisnode_zonelists()
6051 /* NUMA-aware ordering of nodes */ in build_zonelists()
6052 local_node = pgdat->node_id; in build_zonelists()
6061 * distance group to make it round-robin. in build_zonelists()
6069 load--; in build_zonelists()
6090 return zone_to_nid(z->zone); in local_memory_node()
6104 local_node = pgdat->node_id; in build_zonelists()
6106 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; in build_zonelists()
6131 zonerefs->zone = NULL; in build_zonelists()
6132 zonerefs->zone_idx = 0; in build_zonelists()
6170 * building zonelists is fine - no need to touch other nodes. in __build_all_zonelists()
6172 if (self && !node_online(self->node_id)) { in __build_all_zonelists()
6183 * We now know the "local memory node" for each node-- in __build_all_zonelists()
6185 * Set up numa_mem percpu variable for on-line cpus. During in __build_all_zonelists()
6186 * boot, only the boot cpu should be on-line; we'll init the in __build_all_zonelists()
6187 * secondary cpus' numa_mem as they come on-line. During in __build_all_zonelists()
6188 * node/memory hotplug, we'll fixup all on-line cpus. in __build_all_zonelists()
6216 * (a chicken-egg dilemma). in build_all_zonelists_init()
6246 * more accurate, but expensive to check per-zone. This check is in build_all_zonelists()
6247 * made on memory-hotadd so a system can start with mobility in build_all_zonelists()
6287 * Initially all pages are reserved - free ones are freed
6289 * done. Non-atomic initialization, single-pass.
6303 if (highest_memmap_pfn < end_pfn - 1) in memmap_init_zone()
6304 highest_memmap_pfn = end_pfn - 1; in memmap_init_zone()
6318 if (start_pfn == altmap->base_pfn) in memmap_init_zone()
6319 start_pfn += altmap->reserve; in memmap_init_zone()
6320 end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); in memmap_init_zone()
6331 * There can be holes in boot-time mem_map[]s handed to this in memmap_init_zone()
6366 struct pglist_data *pgdat = zone->zone_pgdat; in memmap_init_zone_device()
6370 int nid = pgdat->node_id; in memmap_init_zone_device()
6381 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); in memmap_init_zone_device()
6382 nr_pages = end_pfn - start_pfn; in memmap_init_zone_device()
6394 * We can use the non-atomic __set_bit operation for setting in memmap_init_zone_device()
6400 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer in memmap_init_zone_device()
6402 * ever freed or placed on a driver-private list. in memmap_init_zone_device()
6404 page->pgmap = pgmap; in memmap_init_zone_device()
6405 page->zone_device_data = NULL; in memmap_init_zone_device()
6411 * the address space during boot when many long-lived in memmap_init_zone_device()
6424 nr_pages, jiffies_to_msecs(jiffies - start)); in memmap_init_zone_device()
6432 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); in zone_init_free_lists()
6433 zone->free_area[order].nr_free = 0; in zone_init_free_lists()
6444 * - physical memory bank size is not necessarily the exact multiple of the
6446 * - early reserved memory may not be listed in memblock.memory
6447 * - memory layouts defined with memmap= kernel parameter may not align
6451 * - PG_Reserved is set
6452 * - zone and node links point to zone and node that span the page if the
6454 * - zone and node links point to adjacent zone/node if the hole falls on
6469 + pageblock_nr_pages - 1; in init_unavailable_range()
6487 unsigned long zone_start_pfn = zone->zone_start_pfn; in memmap_init_zone_range()
6488 unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; in memmap_init_zone_range()
6497 memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn, in memmap_init_zone_range()
6516 struct zone *zone = node->node_zones + j; in memmap_init()
6543 /* A stub for backwards compatibility with custom implementation on IA-64 */
6556 * The per-cpu-pages pools are set to around 1000th of the in zone_batchsize()
6568 * Clamp the batch to a 2^n - 1 value. Having a power in zone_batchsize()
6577 batch = rounddown_pow_of_two(batch + batch/2) - 1; in zone_batchsize()
6593 * fragmented and becoming unavailable for high-order allocations. in zone_batchsize()
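A userspace restatement of the heuristic above, assuming 4 KiB pages: roughly 1/1024 of the zone, capped at one megabyte's worth of pages, quartered (it is scaled back up when pcp->high is derived), and finally clamped to a 2^n - 1 value:

#include <stdio.h>

#define PAGE_SIZE 4096UL        /* assumed */

static unsigned long rounddown_pow_of_two(unsigned long n)
{
        while (n & (n - 1))
                n &= n - 1;     /* clear the lowest set bit until a power of two remains */
        return n;
}

static unsigned long zone_batchsize(unsigned long managed_pages)
{
        unsigned long batch = managed_pages / 1024;

        if (batch * PAGE_SIZE > 1024 * 1024)    /* no more than a meg */
                batch = (1024 * 1024) / PAGE_SIZE;
        batch /= 4;                             /* effectively multiplied back later */
        if (batch < 1)
                batch = 1;

        return rounddown_pow_of_two(batch + batch / 2) - 1;
}

int main(void)
{
        /* A hypothetical 4 GiB zone: 1048576 pages. */
        printf("pcp batch = %lu\n", zone_batchsize(1048576UL));
        return 0;
}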
6600 * pcp->high and pcp->batch values are related and dependent on one another:
6601 * ->batch must never be higher than ->high.
6605 * Any new users of pcp->batch and pcp->high should ensure they can cope with
6616 pcp->batch = 1; in pageset_update()
6620 pcp->high = high; in pageset_update()
6623 pcp->batch = batch; in pageset_update()
6629 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); in pageset_set_batch()
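The three stores above implement the invariant stated in the comment (batch must never exceed high) without a lock: drop batch to a safe value first, publish the new high, then install the real batch. A reduced userspace sketch; the kernel's smp_wmb() barriers are only hinted at in comments:

#include <stdio.h>

struct fake_pcp { unsigned long high, batch; };

static void pageset_update(struct fake_pcp *pcp, unsigned long high,
                           unsigned long batch)
{
        pcp->batch = 1;         /* fail-safe: batch <= any plausible high */
        /* smp_wmb() in the kernel orders this store before the next one */
        pcp->high = high;
        /* smp_wmb() again before publishing the real batch */
        pcp->batch = batch;
}

int main(void)
{
        struct fake_pcp pcp = { 0, 0 };
        unsigned long batch = 63;       /* e.g. the value from zone_batchsize() */

        pageset_update(&pcp, 6 * batch, batch); /* mirrors pageset_set_batch() */
        printf("high=%lu batch=%lu\n", pcp.high, pcp.batch);
        return 0;
}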
6639 pcp = &p->pcp; in pageset_init()
6641 INIT_LIST_HEAD(&pcp->lists[migratetype]); in pageset_init()
6661 pageset_update(&p->pcp, high, batch); in pageset_set_high()
6677 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); in zone_pageset_init()
6686 zone->pageset = alloc_percpu(struct per_cpu_pageset); in setup_zone_pageset()
6713 memset(pcp->vm_numa_stat_diff, 0, in setup_per_cpu_pageset()
6714 sizeof(pcp->vm_numa_stat_diff)); in setup_per_cpu_pageset()
6719 pgdat->per_cpu_nodestats = in setup_per_cpu_pageset()
6730 zone->pageset = &boot_pageset; in zone_pcp_init()
6734 zone->name, zone->present_pages, in zone_pcp_init()
6742 struct pglist_data *pgdat = zone->zone_pgdat; in init_currently_empty_zone()
6745 if (zone_idx > pgdat->nr_zones) in init_currently_empty_zone()
6746 pgdat->nr_zones = zone_idx; in init_currently_empty_zone()
6748 zone->zone_start_pfn = zone_start_pfn; in init_currently_empty_zone()
6751 "Initialising map node %d zone %lu pfns %lu -> %lu\n", in init_currently_empty_zone()
6752 pgdat->node_id, in init_currently_empty_zone()
6757 zone->initialized = 1; in init_currently_empty_zone()
6761 * get_pfn_range_for_nid - Return the start and end page frames for a node
6777 *start_pfn = -1UL; in get_pfn_range_for_nid()
6785 if (*start_pfn == -1UL) in get_pfn_range_for_nid()
6797 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { in find_usable_zone_for_movable()
6806 VM_BUG_ON(zone_index == -1); in find_usable_zone_for_movable()
6849 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
6880 return *zone_end_pfn - *zone_start_pfn; in zone_spanned_pages_in_node()
6891 unsigned long nr_absent = range_end_pfn - range_start_pfn; in __absent_pages_in_range()
6898 nr_absent -= end_pfn - start_pfn; in __absent_pages_in_range()
6904 * absent_pages_in_range - Return number of page frames in holes within a range
6956 nr_absent += end_pfn - start_pfn; in zone_absent_pages_in_node()
6960 nr_absent += end_pfn - start_pfn; in zone_absent_pages_in_node()
6975 struct zone *zone = pgdat->node_zones + i; in calculate_node_totalpages()
6980 spanned = zone_spanned_pages_in_node(pgdat->node_id, i, in calculate_node_totalpages()
6985 absent = zone_absent_pages_in_node(pgdat->node_id, i, in calculate_node_totalpages()
6990 real_size = size - absent; in calculate_node_totalpages()
6993 zone->zone_start_pfn = zone_start_pfn; in calculate_node_totalpages()
6995 zone->zone_start_pfn = 0; in calculate_node_totalpages()
6996 zone->spanned_pages = size; in calculate_node_totalpages()
6997 zone->present_pages = real_size; in calculate_node_totalpages()
7003 pgdat->node_spanned_pages = totalpages; in calculate_node_totalpages()
7004 pgdat->node_present_pages = realtotalpages; in calculate_node_totalpages()
7005 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, in calculate_node_totalpages()
7011 * Calculate the size of the zone->blockflags rounded to an unsigned long
7021 zonesize += zone_start_pfn & (pageblock_nr_pages-1); in usemap_size()
7036 zone->pageblock_flags = NULL; in setup_usemap()
7038 zone->pageblock_flags = in setup_usemap()
7040 pgdat->node_id); in setup_usemap()
7041 if (!zone->pageblock_flags) in setup_usemap()
7043 usemapsize, zone->name, pgdat->node_id); in setup_usemap()
7065 order = MAX_ORDER - 1; in set_pageblock_order()
7078 * is unused as pageblock_order is set at compile-time. See
7079 * include/linux/pageblock-flags.h for the values of pageblock_order based on
7111 struct deferred_split *ds_queue = &pgdat->deferred_split_queue; in pgdat_init_split_queue()
7113 spin_lock_init(&ds_queue->split_queue_lock); in pgdat_init_split_queue()
7114 INIT_LIST_HEAD(&ds_queue->split_queue); in pgdat_init_split_queue()
7115 ds_queue->split_queue_len = 0; in pgdat_init_split_queue()
7124 init_waitqueue_head(&pgdat->kcompactd_wait); in pgdat_init_kcompactd()
7137 init_waitqueue_head(&pgdat->kswapd_wait); in pgdat_init_internals()
7138 init_waitqueue_head(&pgdat->pfmemalloc_wait); in pgdat_init_internals()
7141 spin_lock_init(&pgdat->lru_lock); in pgdat_init_internals()
7142 lruvec_init(&pgdat->__lruvec); in pgdat_init_internals()
7148 atomic_long_set(&zone->managed_pages, remaining_pages); in zone_init_internals()
7150 zone->name = zone_names[idx]; in zone_init_internals()
7151 zone->zone_pgdat = NODE_DATA(nid); in zone_init_internals()
7152 spin_lock_init(&zone->lock); in zone_init_internals()
7159 * - init pgdat internals
7160 * - init all zones belonging to this node
7172 zone_init_internals(&pgdat->node_zones[z], z, nid, 0); in free_area_init_core_hotplug()
7178 * - mark all pages reserved
7179 * - mark all memory queues empty
7180 * - clear the memory bitmaps
7188 int nid = pgdat->node_id; in free_area_init_core()
7191 pgdat->per_cpu_nodestats = &boot_nodestats; in free_area_init_core()
7194 struct zone *zone = pgdat->node_zones + j; in free_area_init_core()
7196 unsigned long zone_start_pfn = zone->zone_start_pfn; in free_area_init_core()
7198 size = zone->spanned_pages; in free_area_init_core()
7199 freesize = zone->present_pages; in free_area_init_core()
7204 * and per-cpu initialisations in free_area_init_core()
7209 freesize -= memmap_pages; in free_area_init_core()
7221 freesize -= dma_reserve; in free_area_init_core()
7230 nr_kernel_pages -= memmap_pages; in free_area_init_core()
7257 if (!pgdat->node_spanned_pages) in alloc_node_mem_map()
7260 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); in alloc_node_mem_map()
7261 offset = pgdat->node_start_pfn - start; in alloc_node_mem_map()
7263 if (!pgdat->node_mem_map) { in alloc_node_mem_map()
7274 size = (end - start) * sizeof(struct page); in alloc_node_mem_map()
7276 pgdat->node_id); in alloc_node_mem_map()
7279 size, pgdat->node_id); in alloc_node_mem_map()
7280 pgdat->node_mem_map = map + offset; in alloc_node_mem_map()
7283 __func__, pgdat->node_id, (unsigned long)pgdat, in alloc_node_mem_map()
7284 (unsigned long)pgdat->node_mem_map); in alloc_node_mem_map()
7290 mem_map = NODE_DATA(0)->node_mem_map; in alloc_node_mem_map()
7291 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) in alloc_node_mem_map()
7292 mem_map -= offset; in alloc_node_mem_map()
7303 pgdat->first_deferred_pfn = ULONG_MAX; in pgdat_set_deferred_range()
7316 WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); in free_area_init_node()
7320 pgdat->node_id = nid; in free_area_init_node()
7321 pgdat->node_start_pfn = start_pfn; in free_area_init_node()
7322 pgdat->per_cpu_nodestats = NULL; in free_area_init_node()
7324 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, in free_area_init_node()
7326 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); in free_area_init_node()
7354 * node_map_pfn_alignment - determine the maximum internode alignment
7361 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
7365 * This is used to test whether pfn -> nid mapping of the chosen memory
7366 * model has fine enough granularity to avoid incorrect mapping for the
7387 * Start with a mask granular enough to pin-point to the in node_map_pfn_alignment()
7388 * start pfn and tick off bits one-by-one until it becomes in node_map_pfn_alignment()
7391 mask = ~((1 << __ffs(start)) - 1); in node_map_pfn_alignment()
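The mask widening described above can be exercised in isolation: start from the natural alignment of the boundary pfn where the node changes, and keep doubling the granule while the previous node still fits entirely below the coarser boundary; the result, converted back to pages, is the finest granularity the pfn -> nid model must resolve. The node layout below is invented, and __builtin_ctzl() stands in for __ffs():

#include <stdio.h>

int main(void)
{
        unsigned long last_end = 0x180000;      /* node 0 ends at this pfn (hypothetical) */
        unsigned long start = 0x180000;         /* node 1 starts here */
        unsigned long mask;

        mask = ~((1UL << __builtin_ctzl(start)) - 1);   /* alignment of the boundary itself */
        while (mask && last_end <= (start & (mask << 1)))
                mask <<= 1;                     /* widen while node 0 stays below the boundary */

        /* ~mask + 1 pages is the coarsest granule that still separates the nodes */
        printf("internode alignment: %lu pages\n", ~mask + 1);
        return 0;
}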
7404 * find_min_pfn_with_active_regions - Find the minimum PFN registered
7426 unsigned long pages = end_pfn - start_pfn; in early_calculate_totalpages()
7466 usable_startpfn = PFN_DOWN(r->base); in find_zone_movable_pfns_for_nodes()
7528 * Round-up so that ZONE_MOVABLE is at least as large as what in find_zone_movable_pfns_for_nodes()
7534 corepages = totalpages - required_movablecore; in find_zone_movable_pfns_for_nodes()
7582 - start_pfn; in find_zone_movable_pfns_for_nodes()
7584 kernelcore_remaining -= min(kernel_pages, in find_zone_movable_pfns_for_nodes()
7586 required_kernelcore -= min(kernel_pages, in find_zone_movable_pfns_for_nodes()
7606 * start_pfn->end_pfn. Calculate size_pages as the in find_zone_movable_pfns_for_nodes()
7609 size_pages = end_pfn - start_pfn; in find_zone_movable_pfns_for_nodes()
7619 required_kernelcore -= min(required_kernelcore, in find_zone_movable_pfns_for_nodes()
7621 kernelcore_remaining -= size_pages; in find_zone_movable_pfns_for_nodes()
7633 usable_nodes--; in find_zone_movable_pfns_for_nodes()
7660 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { in check_for_memory()
7661 struct zone *zone = &pgdat->node_zones[zone_type]; in check_for_memory()
7682 * free_area_init - Initialise all pg_data_t and zone data
7711 zone = MAX_NR_ZONES - i - 1; in free_area_init()
7734 pr_info(" %-8s ", zone_names[i]); in free_area_init()
7739 pr_cont("[mem %#018Lx-%#018Lx]\n", in free_area_init()
7743 << PAGE_SHIFT) - 1); in free_area_init()
7756 * subsection-map relative to active online memory ranges to in free_area_init()
7757 * enable future "sub-section" extensions of the memory map. in free_area_init()
7761 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, in free_area_init()
7763 ((u64)end_pfn << PAGE_SHIFT) - 1); in free_area_init()
7764 subsection_map_init(start_pfn, end_pfn - start_pfn); in free_area_init()
7775 if (pgdat->node_present_pages) in free_area_init()
7790 return -EINVAL; in cmdline_parse_core()
7841 atomic_long_add(count, &page_zone(page)->managed_pages); in adjust_managed_page_count()
7870 * Perform a kasan-unchecked memset() since this memory in free_reserved_area()
7882 s, pages << (PAGE_SHIFT - 10)); in free_reserved_area()
7892 atomic_long_inc(&page_zone(page)->managed_pages); in free_highmem_page()
7904 codesize = _etext - _stext; in mem_init_print_info()
7905 datasize = _edata - _sdata; in mem_init_print_info()
7906 rosize = __end_rodata - __start_rodata; in mem_init_print_info()
7907 bss_size = __bss_stop - __bss_start; in mem_init_print_info()
7908 init_data_size = __init_end - __init_begin; in mem_init_print_info()
7909 init_code_size = _einittext - _sinittext; in mem_init_print_info()
7921 size -= adj; \ in mem_init_print_info()
7933 …(%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" in mem_init_print_info()
7938 nr_free_pages() << (PAGE_SHIFT - 10), in mem_init_print_info()
7939 physpages << (PAGE_SHIFT - 10), in mem_init_print_info()
7942 (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), in mem_init_print_info()
7943 totalcma_pages << (PAGE_SHIFT - 10), in mem_init_print_info()
7945 totalhigh_pages() << (PAGE_SHIFT - 10), in mem_init_print_info()
7951 * set_dma_reserve - set the specified number of pages reserved in the first zone
7954 * The per-cpu batchsize and zone watermarks are determined by managed_pages.
7959 * smaller per-cpu batchsize.
8020 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
8031 pgdat->totalreserve_pages = 0; in calculate_totalreserve_pages()
8034 struct zone *zone = pgdat->node_zones + i; in calculate_totalreserve_pages()
8040 if (zone->lowmem_reserve[j] > max) in calculate_totalreserve_pages()
8041 max = zone->lowmem_reserve[j]; in calculate_totalreserve_pages()
8050 pgdat->totalreserve_pages += max; in calculate_totalreserve_pages()
8059 * setup_per_zone_lowmem_reserve - called whenever
8070 for (i = 0; i < MAX_NR_ZONES - 1; i++) { in setup_per_zone_lowmem_reserve()
8071 struct zone *zone = &pgdat->node_zones[i]; in setup_per_zone_lowmem_reserve()
8077 struct zone *upper_zone = &pgdat->node_zones[j]; in setup_per_zone_lowmem_reserve()
8082 zone->lowmem_reserve[j] = 0; in setup_per_zone_lowmem_reserve()
8084 zone->lowmem_reserve[j] = managed_pages / ratio; in setup_per_zone_lowmem_reserve()
8095 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); in __setup_per_zone_wmarks()
8096 unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10); in __setup_per_zone_wmarks()
8110 spin_lock_irqsave(&zone->lock, flags); in __setup_per_zone_wmarks()
8121 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) in __setup_per_zone_wmarks()
8129 zone->_watermark[WMARK_MIN] = min_pages; in __setup_per_zone_wmarks()
8135 zone->_watermark[WMARK_MIN] = tmp; in __setup_per_zone_wmarks()
8147 zone->watermark_boost = 0; in __setup_per_zone_wmarks()
8148 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp; in __setup_per_zone_wmarks()
8149 zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2; in __setup_per_zone_wmarks()
8151 spin_unlock_irqrestore(&zone->lock, flags); in __setup_per_zone_wmarks()
8159 * setup_per_zone_wmarks - called when min_free_kbytes changes
8160 * or when memory is hot-{added|removed}
8232 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so in postcore_initcall()
8274 pgdat->min_unmapped_pages = 0; in setup_min_unmapped_ratio()
8277 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * in setup_min_unmapped_ratio()
8302 pgdat->min_slab_pages = 0; in setup_min_slab_ratio()
8305 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * in setup_min_slab_ratio()
8325 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
8355 per_cpu_ptr(zone->pageset, cpu)); in __zone_pcp_update()
8359 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
8381 ret = -EINVAL; in percpu_pagelist_fraction_sysctl_handler()
8413 * Because 32-bit systems cannot have large physical memory, where this scaling
8424 * - it is assumed that the hash table must contain an exact power-of-2
8426 * - limit is the number of hash buckets, not the total allocation size
8448 numentries -= arch_reserved_kernel_pages(); in alloc_large_system_hash()
8466 numentries >>= (scale - PAGE_SHIFT); in alloc_large_system_hash()
8468 numentries <<= (PAGE_SHIFT - scale); in alloc_large_system_hash()
8470 /* Make sure we've got at least a 0-order allocation. */ in alloc_large_system_hash()
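A userspace sketch of the sizing arithmetic around the fragments above: with 'scale' meaning one bucket per 2^scale bytes of low memory, the page count is shifted by its difference against PAGE_SHIFT and rounded up to a power of two, which is what makes the final hash mask a clean 2^n - 1. The memory size and scale are hypothetical:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed 4 KiB pages */

static unsigned long roundup_pow_of_two(unsigned long n)
{
        unsigned long p = 1;

        while (p < n)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned long nr_kernel_pages = 8UL << (30 - PAGE_SHIFT);       /* assume 8 GiB of low memory */
        unsigned int scale = 17;                /* one bucket per 128 KiB, hypothetical */
        unsigned long numentries = nr_kernel_pages;
        unsigned long mask;

        if (scale > PAGE_SHIFT)
                numentries >>= (scale - PAGE_SHIFT);
        else
                numentries <<= (PAGE_SHIFT - scale);

        numentries = roundup_pow_of_two(numentries);
        mask = numentries - 1;
        printf("%lu buckets, hash mask 0x%lx\n", numentries, mask);
        return 0;
}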
8512 * If bucketsize is not a power-of-two, we may free in alloc_large_system_hash()
8519 } while (!table && size > PAGE_SIZE && --log2qty); in alloc_large_system_hash()
8525 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, in alloc_large_system_hash()
8531 *_hash_mask = (1 << log2qty) - 1; in alloc_large_system_hash()
8541 * check without lock_page also may miss some movable non-lru pages at
8568 for (; iter < pageblock_nr_pages - offset; iter++) { in has_unmovable_pages()
8608 skip_pages = compound_nr(head) - (page - head); in has_unmovable_pages()
8609 iter += skip_pages - 1; in has_unmovable_pages()
8617 * because their page->_refcount is zero at all times. in has_unmovable_pages()
8621 iter += (1 << buddy_order(page)) - 1; in has_unmovable_pages()
8662 pageblock_nr_pages) - 1); in pfn_max_align_down()
8673 /* Usage: See admin-guide/dynamic-debug-howto.rst */
8715 .nid = zone_to_nid(cc->zone), in __alloc_contig_migrate_range()
8719 if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC) in __alloc_contig_migrate_range()
8724 while (pfn < end || !list_empty(&cc->migratepages)) { in __alloc_contig_migrate_range()
8726 ret = -EINTR; in __alloc_contig_migrate_range()
8730 if (list_empty(&cc->migratepages)) { in __alloc_contig_migrate_range()
8731 cc->nr_migratepages = 0; in __alloc_contig_migrate_range()
8734 ret = -EINTR; in __alloc_contig_migrate_range()
8739 ret = ret < 0 ? ret : -EBUSY; in __alloc_contig_migrate_range()
8743 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, in __alloc_contig_migrate_range()
8744 &cc->migratepages); in __alloc_contig_migrate_range()
8745 info->nr_reclaimed += nr_reclaimed; in __alloc_contig_migrate_range()
8746 cc->nr_migratepages -= nr_reclaimed; in __alloc_contig_migrate_range()
8748 list_for_each_entry(page, &cc->migratepages, lru) in __alloc_contig_migrate_range()
8749 info->nr_mapped += page_mapcount(page); in __alloc_contig_migrate_range()
8751 ret = migrate_pages(&cc->migratepages, alloc_migration_target, in __alloc_contig_migrate_range()
8752 NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); in __alloc_contig_migrate_range()
8754 info->nr_migrated += cc->nr_migratepages; in __alloc_contig_migrate_range()
8759 if (ret == -EBUSY) { in __alloc_contig_migrate_range()
8760 alloc_contig_dump_pages(&cc->migratepages); in __alloc_contig_migrate_range()
8761 page_pinner_mark_migration_failed_pages(&cc->migratepages); in __alloc_contig_migrate_range()
8764 if (!list_empty(&cc->migratepages)) { in __alloc_contig_migrate_range()
8765 page = list_first_entry(&cc->migratepages, struct page, lru); in __alloc_contig_migrate_range()
8766 info->failed_pfn = page_to_pfn(page); in __alloc_contig_migrate_range()
8769 putback_movable_pages(&cc->migratepages); in __alloc_contig_migrate_range()
8770 info->err |= ACR_ERR_MIGRATE; in __alloc_contig_migrate_range()
8777 * alloc_contig_range() -- tries to allocate given range of pages
8779 * @end: one-past-the-last PFN to allocate
8808 .order = -1, in alloc_contig_range()
8844 &info->failed_pfn); in alloc_contig_range()
8846 info->err |= ACR_ERR_ISOLATE; in alloc_contig_range()
8856 * In case of -EBUSY, we'd like to know which page causes the problem. in alloc_contig_range()
8863 * -EBUSY is not accidentally used or returned to caller. in alloc_contig_range()
8866 if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY))) in alloc_contig_range()
8883 * We don't have to hold zone->lock here because the pages are in alloc_contig_range()
8911 if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) { in alloc_contig_range()
8914 ret = -EBUSY; in alloc_contig_range()
8915 info->err |= ACR_ERR_TEST; in alloc_contig_range()
8922 ret = -EBUSY; in alloc_contig_range()
8928 free_contig_range(outer_start, start - outer_start); in alloc_contig_range()
8930 free_contig_range(end, outer_end - end); in alloc_contig_range()
8978 unsigned long last_pfn = start_pfn + nr_pages - 1; in zone_spans_last_pfn()
8984 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
9015 spin_lock_irqsave(&zone->lock, flags); in alloc_contig_pages()
9017 pfn = ALIGN(zone->zone_start_pfn, nr_pages); in alloc_contig_pages()
9027 spin_unlock_irqrestore(&zone->lock, flags); in alloc_contig_pages()
9032 spin_lock_irqsave(&zone->lock, flags); in alloc_contig_pages()
9036 spin_unlock_irqrestore(&zone->lock, flags); in alloc_contig_pages()
9046 for (; nr_pages--; pfn++) { in free_contig_range()
9075 if (zone->pageset != &boot_pageset) { in zone_pcp_reset()
9077 pset = per_cpu_ptr(zone->pageset, cpu); in zone_pcp_reset()
9080 free_percpu(zone->pageset); in zone_pcp_reset()
9081 zone->pageset = &boot_pageset; in zone_pcp_reset()
9101 spin_lock_irqsave(&zone->lock, flags); in __offline_isolated_pages()
9129 spin_unlock_irqrestore(&zone->lock, flags); in __offline_isolated_pages()
9140 spin_lock_irqsave(&zone->lock, flags); in is_free_buddy_page()
9142 struct page *page_head = page - (pfn & ((1 << order) - 1)); in is_free_buddy_page()
9147 spin_unlock_irqrestore(&zone->lock, flags); in is_free_buddy_page()
9154 * Break down a higher-order page into sub-pages, and keep our target out of
9165 high--; in break_down_buddy_pages()
9198 spin_lock_irqsave(&zone->lock, flags); in take_page_off_buddy()
9200 struct page *page_head = page - (pfn & ((1 << order) - 1)); in take_page_off_buddy()
9212 __mod_zone_freepage_state(zone, -1, migratetype); in take_page_off_buddy()
9219 spin_unlock_irqrestore(&zone->lock, flags); in take_page_off_buddy()
9230 struct zone *zone = &pgdat->node_zones[ZONE_DMA]; in has_managed_dma()