// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>
#include <trace/hooks/mm.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
bool cgroup_memory_noswap __read_mostly;
#else
#define cgroup_memory_noswap 1
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
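
/*
 * Both targets are page-event counts, not time intervals: roughly every
 * THRESHOLDS_EVENTS_TARGET charge/uncharge events the threshold notifiers
 * are re-evaluated, and roughly every SOFTLIMIT_EVENTS_TARGET events the
 * soft-limit tree position is refreshed (see mem_cgroup_event_ratelimit()
 * and memcg_check_events() below).
 */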

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal. This callback must be set,
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t lock; /* for from, to */
	struct mm_struct *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)
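
/*
 * For example, cft->private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL)
 * packs the res_type into the upper 16 bits and the attribute into the
 * lower 16 bits; MEMFILE_TYPE() and MEMFILE_ATTR() recover the two halves.
 */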

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
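
/*
 * memcg_expand_shrinker_maps() below is one example of a premature exit:
 * it calls mem_cgroup_iter_break() before jumping out of
 * for_each_mem_cgroup() when expanding a map fails.
 */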

static inline bool task_is_dying(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

#ifdef CONFIG_MEMCG_KMEM
static DEFINE_SPINLOCK(objcg_lock);

static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	struct mem_cgroup *memcg;
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *          PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *          the stock is flushed,
	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we do release this object,
	 *          92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *          92 bytes are added to objcg->nr_charged_bytes
	 *
	 * As a result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	spin_lock_irqsave(&objcg_lock, flags);
	memcg = obj_cgroup_memcg(objcg);
	if (nr_pages)
		__memcg_kmem_uncharge(memcg, nr_pages);
	list_del(&objcg->list);
	mem_cgroup_put(memcg);
	spin_unlock_irqrestore(&objcg_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}

static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}

static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&objcg_lock);

	/* Move active objcg to the parent's list */
	xchg(&objcg->memcg, parent);
	css_get(&parent->css);
	list_add(&objcg->list, &parent->objcg_list);

	/* Move already reparented objcgs to the parent's list */
	list_for_each_entry(iter, &memcg->objcg_list, list) {
		css_get(&parent->css);
		xchg(&iter->memcg, parent);
		css_put(&memcg->css);
	}
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&objcg_lock);

	percpu_ref_kill(&objcg->refcnt);
}

/*
 * This will be used as a shrinker list's index.
 * The main reason for not using cgroup id for this:
 * this works better in sparse environments, where we have a lot of memcgs,
 * but only a few kmem-limited. Or also, if we have, for instance, 200
 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
 * 200-entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different than 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif

static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);

static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
{
	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
}

static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
					 int size, int old_size)
{
	struct memcg_shrinker_map *new, *old;
	int nid;

	lockdep_assert_held(&memcg_shrinker_map_mutex);

	for_each_node(nid) {
		old = rcu_dereference_protected(
			mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
		/* Not yet online memcg */
		if (!old)
			return 0;

		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		/* Set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_size);
		memset((void *)new->map + old_size, 0, size - old_size);

		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
	}

	return 0;
}

static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct memcg_shrinker_map *map;
	int nid;

	if (mem_cgroup_is_root(memcg))
		return;

	for_each_node(nid) {
		pn = mem_cgroup_nodeinfo(memcg, nid);
		map = rcu_dereference_protected(pn->shrinker_map, true);
		if (map)
			kvfree(map);
		rcu_assign_pointer(pn->shrinker_map, NULL);
	}
}

static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
{
	struct memcg_shrinker_map *map;
	int nid, size, ret = 0;

	if (mem_cgroup_is_root(memcg))
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	size = memcg_shrinker_map_size;
	for_each_node(nid) {
		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
		if (!map) {
			memcg_free_shrinker_maps(memcg);
			ret = -ENOMEM;
			break;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
	}
	mutex_unlock(&memcg_shrinker_map_mutex);

	return ret;
}

int memcg_expand_shrinker_maps(int new_id)
{
	int size, old_size, ret = 0;
	struct mem_cgroup *memcg;

	size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
	old_size = memcg_shrinker_map_size;
	if (size <= old_size)
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	if (!root_mem_cgroup)
		goto unlock;

	for_each_mem_cgroup(memcg) {
		if (mem_cgroup_is_root(memcg))
			continue;
		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto unlock;
		}
	}
unlock:
	if (!ret)
		memcg_shrinker_map_size = size;
	mutex_unlock(&memcg_shrinker_map_mutex);
	return ret;
}

void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct memcg_shrinker_map *map;

		rcu_read_lock();
		map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
		/* Pairs with smp mb in shrink_slab() */
		smp_mb__before_atomic();
		set_bit(shrinker_id, map->map);
		rcu_read_unlock();
	}
}

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned. The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = page->mem_cgroup;

	/*
	 * The lowest bit set means that memcg isn't a valid
	 * memcg pointer, but an obj_cgroups pointer.
	 * In this case the page is shared and doesn't belong
	 * to any specific memory cgroup.
	 */
	if ((unsigned long) memcg & 0x1UL)
		memcg = NULL;

	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);

	return memcg->nodeinfo[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	return soft_limit_tree.rb_tree_per_node[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);

	return soft_limit_tree.rb_tree_per_node[nid];
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
				   tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		}

		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when the hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	long x, threshold = MEMCG_CHARGE_BATCH;

	if (mem_cgroup_disabled())
		return;

	if (memcg_stat_item_in_bytes(idx))
		threshold <<= PAGE_SHIFT;

	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
	if (unlikely(abs(x) > threshold)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmstats[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
}

static struct mem_cgroup_per_node *
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
{
	struct mem_cgroup *parent;

	parent = parent_mem_cgroup(pn->memcg);
	if (!parent)
		return NULL;
	return mem_cgroup_nodeinfo(parent, nid);
}

void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;
	long x, threshold = MEMCG_CHARGE_BATCH;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/* Update memcg */
	__mod_memcg_state(memcg, idx, val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);

	if (vmstat_item_in_bytes(idx))
		threshold <<= PAGE_SHIFT;

	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
	if (unlikely(abs(x) > threshold)) {
		pg_data_t *pgdat = lruvec_pgdat(lruvec);
		struct mem_cgroup_per_node *pi;

		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
			atomic_long_add(x, &pi->lruvec_stat[idx]);
		x = 0;
	}
	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled())
		__mod_memcg_lruvec_state(lruvec, idx, val);
}

void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

void mod_memcg_obj_state(void *p, int idx, int val)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_obj(p);
	if (memcg)
		mod_memcg_state(memcg, idx, val);
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	unsigned long x;

	if (mem_cgroup_disabled())
		return;

	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->events[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmevents[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
}

static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	return atomic_long_read(&memcg->vmevents[event]);
}

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	long x = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_local->events[event], cpu);
	return x;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
 * returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/**
 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
 * @page: page from which memcg should be extracted.
 *
 * Obtain a reference on page->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *memcg = page->mem_cgroup;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	/* Page should not get uncharged and freed memcg under us. */
	if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
		memcg = root_mem_cgroup;
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);

static __always_inline struct mem_cgroup *active_memcg(void)
{
	if (in_interrupt())
		return this_cpu_read(int_active_memcg);
	else
		return current->active_memcg;
}

static __always_inline struct mem_cgroup *get_active_memcg(void)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = active_memcg();
	/* remote memcg must hold a ref. */
	if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
		memcg = root_mem_cgroup;
	rcu_read_unlock();

	return memcg;
}

static __always_inline bool memcg_kmem_bypass(void)
{
	/* Allow remote memcg charging from any context. */
	if (unlikely(active_memcg()))
		return false;

	/* Memcg to charge can't be determined. */
	if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
		return true;

	return false;
}

/**
 * If active memcg is set, do not fall back to current->mm->memcg.
 */
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
	if (memcg_kmem_bypass())
		return NULL;

	if (unlikely(active_memcg()))
		return get_active_memcg();

	return get_mem_cgroup_from_mm(current->mm);
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
		iter = &mz->iter;

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference. The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
1228*4882a593Smuzhiyun */
1229*4882a593Smuzhiyun (void)cmpxchg(&iter->position, pos, memcg);
1230*4882a593Smuzhiyun
1231*4882a593Smuzhiyun if (pos)
1232*4882a593Smuzhiyun css_put(&pos->css);
1233*4882a593Smuzhiyun
1234*4882a593Smuzhiyun if (!memcg)
1235*4882a593Smuzhiyun iter->generation++;
1236*4882a593Smuzhiyun else if (!prev)
1237*4882a593Smuzhiyun reclaim->generation = iter->generation;
1238*4882a593Smuzhiyun }
1239*4882a593Smuzhiyun
1240*4882a593Smuzhiyun out_unlock:
1241*4882a593Smuzhiyun rcu_read_unlock();
1242*4882a593Smuzhiyun out:
1243*4882a593Smuzhiyun if (prev && prev != root)
1244*4882a593Smuzhiyun css_put(&prev->css);
1245*4882a593Smuzhiyun
1246*4882a593Smuzhiyun return memcg;
1247*4882a593Smuzhiyun }
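
/*
 * Example (illustrative sketch): the canonical full hierarchy walk.
 * The previously returned memcg must be fed back as @prev so css
 * references stay balanced; process() is a hypothetical helper whose
 * non-zero return value ends the walk early.
 *
 *	struct mem_cgroup *memcg = mem_cgroup_iter(root, NULL, NULL);
 *
 *	while (memcg) {
 *		if (process(memcg)) {
 *			mem_cgroup_iter_break(root, memcg);
 *			break;
 *		}
 *		memcg = mem_cgroup_iter(root, memcg, NULL);
 *	}
 */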
1248*4882a593Smuzhiyun
1249*4882a593Smuzhiyun /**
1250*4882a593Smuzhiyun * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1251*4882a593Smuzhiyun * @root: hierarchy root
1252*4882a593Smuzhiyun * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1253*4882a593Smuzhiyun */
1254*4882a593Smuzhiyun void mem_cgroup_iter_break(struct mem_cgroup *root,
1255*4882a593Smuzhiyun struct mem_cgroup *prev)
1256*4882a593Smuzhiyun {
1257*4882a593Smuzhiyun if (!root)
1258*4882a593Smuzhiyun root = root_mem_cgroup;
1259*4882a593Smuzhiyun if (prev && prev != root)
1260*4882a593Smuzhiyun css_put(&prev->css);
1261*4882a593Smuzhiyun }
1262*4882a593Smuzhiyun
1263*4882a593Smuzhiyun static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1264*4882a593Smuzhiyun struct mem_cgroup *dead_memcg)
1265*4882a593Smuzhiyun {
1266*4882a593Smuzhiyun struct mem_cgroup_reclaim_iter *iter;
1267*4882a593Smuzhiyun struct mem_cgroup_per_node *mz;
1268*4882a593Smuzhiyun int nid;
1269*4882a593Smuzhiyun
1270*4882a593Smuzhiyun for_each_node(nid) {
1271*4882a593Smuzhiyun mz = mem_cgroup_nodeinfo(from, nid);
1272*4882a593Smuzhiyun iter = &mz->iter;
1273*4882a593Smuzhiyun cmpxchg(&iter->position, dead_memcg, NULL);
1274*4882a593Smuzhiyun }
1275*4882a593Smuzhiyun }
1276*4882a593Smuzhiyun
1277*4882a593Smuzhiyun static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1278*4882a593Smuzhiyun {
1279*4882a593Smuzhiyun struct mem_cgroup *memcg = dead_memcg;
1280*4882a593Smuzhiyun struct mem_cgroup *last;
1281*4882a593Smuzhiyun
1282*4882a593Smuzhiyun do {
1283*4882a593Smuzhiyun __invalidate_reclaim_iterators(memcg, dead_memcg);
1284*4882a593Smuzhiyun last = memcg;
1285*4882a593Smuzhiyun } while ((memcg = parent_mem_cgroup(memcg)));
1286*4882a593Smuzhiyun
1287*4882a593Smuzhiyun /*
1288*4882a593Smuzhiyun * When cgroup1 non-hierarchy mode is used,
1289*4882a593Smuzhiyun * parent_mem_cgroup() does not walk all the way up to the
1290*4882a593Smuzhiyun * cgroup root (root_mem_cgroup). So we have to handle
1291*4882a593Smuzhiyun * dead_memcg from cgroup root separately.
1292*4882a593Smuzhiyun */
1293*4882a593Smuzhiyun if (last != root_mem_cgroup)
1294*4882a593Smuzhiyun __invalidate_reclaim_iterators(root_mem_cgroup,
1295*4882a593Smuzhiyun dead_memcg);
1296*4882a593Smuzhiyun }
1297*4882a593Smuzhiyun
1298*4882a593Smuzhiyun /**
1299*4882a593Smuzhiyun * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1300*4882a593Smuzhiyun * @memcg: hierarchy root
1301*4882a593Smuzhiyun * @fn: function to call for each task
1302*4882a593Smuzhiyun * @arg: argument passed to @fn
1303*4882a593Smuzhiyun *
1304*4882a593Smuzhiyun * This function iterates over tasks attached to @memcg or to any of its
1305*4882a593Smuzhiyun * descendants and calls @fn for each task. If @fn returns a non-zero
1306*4882a593Smuzhiyun * value, the function breaks the iteration loop and returns the value.
1307*4882a593Smuzhiyun * Otherwise, it will iterate over all tasks and return 0.
1308*4882a593Smuzhiyun *
1309*4882a593Smuzhiyun * This function must not be called for the root memory cgroup.
1310*4882a593Smuzhiyun */
1311*4882a593Smuzhiyun int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1312*4882a593Smuzhiyun int (*fn)(struct task_struct *, void *), void *arg)
1313*4882a593Smuzhiyun {
1314*4882a593Smuzhiyun struct mem_cgroup *iter;
1315*4882a593Smuzhiyun int ret = 0;
1316*4882a593Smuzhiyun
1317*4882a593Smuzhiyun BUG_ON(memcg == root_mem_cgroup);
1318*4882a593Smuzhiyun
1319*4882a593Smuzhiyun for_each_mem_cgroup_tree(iter, memcg) {
1320*4882a593Smuzhiyun struct css_task_iter it;
1321*4882a593Smuzhiyun struct task_struct *task;
1322*4882a593Smuzhiyun
1323*4882a593Smuzhiyun css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1324*4882a593Smuzhiyun while (!ret && (task = css_task_iter_next(&it)))
1325*4882a593Smuzhiyun ret = fn(task, arg);
1326*4882a593Smuzhiyun css_task_iter_end(&it);
1327*4882a593Smuzhiyun if (ret) {
1328*4882a593Smuzhiyun mem_cgroup_iter_break(memcg, iter);
1329*4882a593Smuzhiyun break;
1330*4882a593Smuzhiyun }
1331*4882a593Smuzhiyun }
1332*4882a593Smuzhiyun return ret;
1333*4882a593Smuzhiyun }
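
/*
 * Example (illustrative sketch): a caller-supplied @fn that stops the
 * walk as soon as it returns non-zero. count_task() and struct
 * scan_ctx are hypothetical names used only for the sketch.
 *
 *	static int count_task(struct task_struct *task, void *arg)
 *	{
 *		struct scan_ctx *ctx = arg;
 *
 *		ctx->nr++;
 *		return ctx->nr >= ctx->limit;
 *	}
 *
 *	mem_cgroup_scan_tasks(memcg, count_task, &ctx);
 */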
1334*4882a593Smuzhiyun
1335*4882a593Smuzhiyun /**
1336*4882a593Smuzhiyun * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
1337*4882a593Smuzhiyun * @page: the page
1338*4882a593Smuzhiyun * @pgdat: pgdat of the page
1339*4882a593Smuzhiyun *
1340*4882a593Smuzhiyun * This function relies on page->mem_cgroup being stable - see the
1341*4882a593Smuzhiyun * access rules in commit_charge().
1342*4882a593Smuzhiyun */
1343*4882a593Smuzhiyun struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1344*4882a593Smuzhiyun {
1345*4882a593Smuzhiyun struct mem_cgroup_per_node *mz;
1346*4882a593Smuzhiyun struct mem_cgroup *memcg;
1347*4882a593Smuzhiyun struct lruvec *lruvec;
1348*4882a593Smuzhiyun
1349*4882a593Smuzhiyun if (mem_cgroup_disabled()) {
1350*4882a593Smuzhiyun lruvec = &pgdat->__lruvec;
1351*4882a593Smuzhiyun goto out;
1352*4882a593Smuzhiyun }
1353*4882a593Smuzhiyun
1354*4882a593Smuzhiyun memcg = page->mem_cgroup;
1355*4882a593Smuzhiyun /*
1356*4882a593Smuzhiyun * Swapcache readahead pages are added to the LRU - and
1357*4882a593Smuzhiyun * possibly migrated - before they are charged.
1358*4882a593Smuzhiyun */
1359*4882a593Smuzhiyun if (!memcg)
1360*4882a593Smuzhiyun memcg = root_mem_cgroup;
1361*4882a593Smuzhiyun
1362*4882a593Smuzhiyun mz = mem_cgroup_page_nodeinfo(memcg, page);
1363*4882a593Smuzhiyun lruvec = &mz->lruvec;
1364*4882a593Smuzhiyun out:
1365*4882a593Smuzhiyun /*
1366*4882a593Smuzhiyun * Since a node can be onlined after the mem_cgroup was created,
1367*4882a593Smuzhiyun * we have to be prepared to initialize lruvec->pgdat here;
1368*4882a593Smuzhiyun * and if offlined then reonlined, we need to reinitialize it.
1369*4882a593Smuzhiyun */
1370*4882a593Smuzhiyun if (unlikely(lruvec->pgdat != pgdat))
1371*4882a593Smuzhiyun lruvec->pgdat = pgdat;
1372*4882a593Smuzhiyun return lruvec;
1373*4882a593Smuzhiyun }
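
/*
 * Example (illustrative sketch): looking up the lruvec while moving a
 * page between LRU lists. Holding the node's lru_lock keeps
 * page->mem_cgroup stable for pages on the LRU; the helper names
 * follow mm_inline.h and may differ between trees.
 *
 *	spin_lock_irq(&pgdat->lru_lock);
 *	lruvec = mem_cgroup_page_lruvec(page, pgdat);
 *	del_page_from_lru_list(page, lruvec, page_lru(page));
 *	...
 *	spin_unlock_irq(&pgdat->lru_lock);
 */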
1374*4882a593Smuzhiyun
1375*4882a593Smuzhiyun struct lruvec *page_to_lruvec(struct page *page, pg_data_t *pgdat)
1376*4882a593Smuzhiyun {
1377*4882a593Smuzhiyun struct lruvec *lruvec;
1378*4882a593Smuzhiyun
1379*4882a593Smuzhiyun lruvec = mem_cgroup_page_lruvec(page, pgdat);
1380*4882a593Smuzhiyun
1381*4882a593Smuzhiyun return lruvec;
1382*4882a593Smuzhiyun }
1383*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(page_to_lruvec);
1384*4882a593Smuzhiyun
1385*4882a593Smuzhiyun void do_traversal_all_lruvec(void)
1386*4882a593Smuzhiyun {
1387*4882a593Smuzhiyun pg_data_t *pgdat;
1388*4882a593Smuzhiyun
1389*4882a593Smuzhiyun for_each_online_pgdat(pgdat) {
1390*4882a593Smuzhiyun struct mem_cgroup *memcg = NULL;
1391*4882a593Smuzhiyun
1392*4882a593Smuzhiyun spin_lock_irq(&pgdat->lru_lock);
1393*4882a593Smuzhiyun memcg = mem_cgroup_iter(NULL, NULL, NULL);
1394*4882a593Smuzhiyun do {
1395*4882a593Smuzhiyun struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
1396*4882a593Smuzhiyun
1397*4882a593Smuzhiyun trace_android_vh_do_traversal_lruvec(lruvec);
1398*4882a593Smuzhiyun
1399*4882a593Smuzhiyun memcg = mem_cgroup_iter(NULL, memcg, NULL);
1400*4882a593Smuzhiyun } while (memcg);
1401*4882a593Smuzhiyun
1402*4882a593Smuzhiyun spin_unlock_irq(&pgdat->lru_lock);
1403*4882a593Smuzhiyun }
1404*4882a593Smuzhiyun }
1405*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(do_traversal_all_lruvec);
1406*4882a593Smuzhiyun
1407*4882a593Smuzhiyun /**
1408*4882a593Smuzhiyun * mem_cgroup_update_lru_size - account for adding or removing an lru page
1409*4882a593Smuzhiyun * @lruvec: mem_cgroup per zone lru vector
1410*4882a593Smuzhiyun * @lru: index of lru list the page is sitting on
1411*4882a593Smuzhiyun * @zid: zone id of the accounted pages
1412*4882a593Smuzhiyun * @nr_pages: positive when adding or negative when removing
1413*4882a593Smuzhiyun *
1414*4882a593Smuzhiyun * This function must be called under lru_lock, just before a page is added
1415*4882a593Smuzhiyun * to or just after a page is removed from an lru list (that ordering being
1416*4882a593Smuzhiyun * so as to allow it to check that lru_size 0 is consistent with list_empty).
1417*4882a593Smuzhiyun */
1418*4882a593Smuzhiyun void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1419*4882a593Smuzhiyun int zid, int nr_pages)
1420*4882a593Smuzhiyun {
1421*4882a593Smuzhiyun struct mem_cgroup_per_node *mz;
1422*4882a593Smuzhiyun unsigned long *lru_size;
1423*4882a593Smuzhiyun long size;
1424*4882a593Smuzhiyun
1425*4882a593Smuzhiyun if (mem_cgroup_disabled())
1426*4882a593Smuzhiyun return;
1427*4882a593Smuzhiyun
1428*4882a593Smuzhiyun mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1429*4882a593Smuzhiyun lru_size = &mz->lru_zone_size[zid][lru];
1430*4882a593Smuzhiyun
1431*4882a593Smuzhiyun if (nr_pages < 0)
1432*4882a593Smuzhiyun *lru_size += nr_pages;
1433*4882a593Smuzhiyun
1434*4882a593Smuzhiyun size = *lru_size;
1435*4882a593Smuzhiyun if (WARN_ONCE(size < 0,
1436*4882a593Smuzhiyun "%s(%p, %d, %d): lru_size %ld\n",
1437*4882a593Smuzhiyun __func__, lruvec, lru, nr_pages, size)) {
1438*4882a593Smuzhiyun VM_BUG_ON(1);
1439*4882a593Smuzhiyun *lru_size = 0;
1440*4882a593Smuzhiyun }
1441*4882a593Smuzhiyun
1442*4882a593Smuzhiyun if (nr_pages > 0)
1443*4882a593Smuzhiyun *lru_size += nr_pages;
1444*4882a593Smuzhiyun }
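
/*
 * Example (illustrative sketch): the call ordering requested in the
 * kernel-doc above, so that a zero lru_size is always consistent with
 * an empty list.
 *
 *	adding a page:
 *		mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
 *		list_add(&page->lru, &lruvec->lists[lru]);
 *
 *	removing a page:
 *		list_del(&page->lru);
 *		mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_pages);
 */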
1445*4882a593Smuzhiyun
1446*4882a593Smuzhiyun /**
1447*4882a593Smuzhiyun * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1448*4882a593Smuzhiyun * @memcg: the memory cgroup
1449*4882a593Smuzhiyun *
1450*4882a593Smuzhiyun * Returns the maximum amount of memory @memcg can be charged with, in
1451*4882a593Smuzhiyun * pages.
1452*4882a593Smuzhiyun */
1453*4882a593Smuzhiyun static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1454*4882a593Smuzhiyun {
1455*4882a593Smuzhiyun unsigned long margin = 0;
1456*4882a593Smuzhiyun unsigned long count;
1457*4882a593Smuzhiyun unsigned long limit;
1458*4882a593Smuzhiyun
1459*4882a593Smuzhiyun count = page_counter_read(&memcg->memory);
1460*4882a593Smuzhiyun limit = READ_ONCE(memcg->memory.max);
1461*4882a593Smuzhiyun if (count < limit)
1462*4882a593Smuzhiyun margin = limit - count;
1463*4882a593Smuzhiyun
1464*4882a593Smuzhiyun if (do_memsw_account()) {
1465*4882a593Smuzhiyun count = page_counter_read(&memcg->memsw);
1466*4882a593Smuzhiyun limit = READ_ONCE(memcg->memsw.max);
1467*4882a593Smuzhiyun if (count < limit)
1468*4882a593Smuzhiyun margin = min(margin, limit - count);
1469*4882a593Smuzhiyun else
1470*4882a593Smuzhiyun margin = 0;
1471*4882a593Smuzhiyun }
1472*4882a593Smuzhiyun
1473*4882a593Smuzhiyun return margin;
1474*4882a593Smuzhiyun }
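
/*
 * Worked example (illustrative numbers): with memory.max = 1000 pages
 * and a usage of 900 pages, the margin is 100 pages. If memsw
 * accounting is active with memsw.max = 1200 and memsw usage = 1150,
 * the smaller headroom wins and the returned margin is
 * min(100, 50) = 50 pages.
 */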
1475*4882a593Smuzhiyun
1476*4882a593Smuzhiyun /*
1477*4882a593Smuzhiyun * A routine for checking whether "memcg" is under move_account() or not.
1478*4882a593Smuzhiyun *
1479*4882a593Smuzhiyun * Checks whether a cgroup is mc.from, mc.to, or in the hierarchy of the
1480*4882a593Smuzhiyun * moving cgroups. This is used for waiting at high memory pressure
1481*4882a593Smuzhiyun * caused by "move".
1482*4882a593Smuzhiyun */
1483*4882a593Smuzhiyun static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1484*4882a593Smuzhiyun {
1485*4882a593Smuzhiyun struct mem_cgroup *from;
1486*4882a593Smuzhiyun struct mem_cgroup *to;
1487*4882a593Smuzhiyun bool ret = false;
1488*4882a593Smuzhiyun /*
1489*4882a593Smuzhiyun * Unlike task_move routines, we access mc.to, mc.from not under
1490*4882a593Smuzhiyun * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1491*4882a593Smuzhiyun */
1492*4882a593Smuzhiyun spin_lock(&mc.lock);
1493*4882a593Smuzhiyun from = mc.from;
1494*4882a593Smuzhiyun to = mc.to;
1495*4882a593Smuzhiyun if (!from)
1496*4882a593Smuzhiyun goto unlock;
1497*4882a593Smuzhiyun
1498*4882a593Smuzhiyun ret = mem_cgroup_is_descendant(from, memcg) ||
1499*4882a593Smuzhiyun mem_cgroup_is_descendant(to, memcg);
1500*4882a593Smuzhiyun unlock:
1501*4882a593Smuzhiyun spin_unlock(&mc.lock);
1502*4882a593Smuzhiyun return ret;
1503*4882a593Smuzhiyun }
1504*4882a593Smuzhiyun
1505*4882a593Smuzhiyun static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1506*4882a593Smuzhiyun {
1507*4882a593Smuzhiyun if (mc.moving_task && current != mc.moving_task) {
1508*4882a593Smuzhiyun if (mem_cgroup_under_move(memcg)) {
1509*4882a593Smuzhiyun DEFINE_WAIT(wait);
1510*4882a593Smuzhiyun prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1511*4882a593Smuzhiyun /* moving charge context might have finished. */
1512*4882a593Smuzhiyun if (mc.moving_task)
1513*4882a593Smuzhiyun schedule();
1514*4882a593Smuzhiyun finish_wait(&mc.waitq, &wait);
1515*4882a593Smuzhiyun return true;
1516*4882a593Smuzhiyun }
1517*4882a593Smuzhiyun }
1518*4882a593Smuzhiyun return false;
1519*4882a593Smuzhiyun }
1520*4882a593Smuzhiyun
1521*4882a593Smuzhiyun struct memory_stat {
1522*4882a593Smuzhiyun const char *name;
1523*4882a593Smuzhiyun unsigned int ratio;
1524*4882a593Smuzhiyun unsigned int idx;
1525*4882a593Smuzhiyun };
1526*4882a593Smuzhiyun
1527*4882a593Smuzhiyun static struct memory_stat memory_stats[] = {
1528*4882a593Smuzhiyun { "anon", PAGE_SIZE, NR_ANON_MAPPED },
1529*4882a593Smuzhiyun { "file", PAGE_SIZE, NR_FILE_PAGES },
1530*4882a593Smuzhiyun { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
1531*4882a593Smuzhiyun { "percpu", 1, MEMCG_PERCPU_B },
1532*4882a593Smuzhiyun { "sock", PAGE_SIZE, MEMCG_SOCK },
1533*4882a593Smuzhiyun { "shmem", PAGE_SIZE, NR_SHMEM },
1534*4882a593Smuzhiyun { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
1535*4882a593Smuzhiyun { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
1536*4882a593Smuzhiyun { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
1537*4882a593Smuzhiyun #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1538*4882a593Smuzhiyun /*
1539*4882a593Smuzhiyun * The ratio will be initialized in memory_stats_init(), because
1540*4882a593Smuzhiyun * on some architectures the HPAGE_PMD_SIZE macro is not a
1541*4882a593Smuzhiyun * compile-time constant (e.g. powerpc).
1542*4882a593Smuzhiyun */
1543*4882a593Smuzhiyun { "anon_thp", 0, NR_ANON_THPS },
1544*4882a593Smuzhiyun #endif
1545*4882a593Smuzhiyun { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
1546*4882a593Smuzhiyun { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
1547*4882a593Smuzhiyun { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
1548*4882a593Smuzhiyun { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
1549*4882a593Smuzhiyun { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
1550*4882a593Smuzhiyun
1551*4882a593Smuzhiyun /*
1552*4882a593Smuzhiyun * Note: The slab_reclaimable and slab_unreclaimable must be
1553*4882a593Smuzhiyun * together and slab_reclaimable must be in front.
1554*4882a593Smuzhiyun */
1555*4882a593Smuzhiyun { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
1556*4882a593Smuzhiyun { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
1557*4882a593Smuzhiyun
1558*4882a593Smuzhiyun /* The memory events */
1559*4882a593Smuzhiyun { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
1560*4882a593Smuzhiyun { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
1561*4882a593Smuzhiyun { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
1562*4882a593Smuzhiyun { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
1563*4882a593Smuzhiyun { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
1564*4882a593Smuzhiyun { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
1565*4882a593Smuzhiyun { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
1566*4882a593Smuzhiyun };
1567*4882a593Smuzhiyun
1568*4882a593Smuzhiyun static int __init memory_stats_init(void)
1569*4882a593Smuzhiyun {
1570*4882a593Smuzhiyun int i;
1571*4882a593Smuzhiyun
1572*4882a593Smuzhiyun for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1573*4882a593Smuzhiyun #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1574*4882a593Smuzhiyun if (memory_stats[i].idx == NR_ANON_THPS)
1575*4882a593Smuzhiyun memory_stats[i].ratio = HPAGE_PMD_SIZE;
1576*4882a593Smuzhiyun #endif
1577*4882a593Smuzhiyun VM_BUG_ON(!memory_stats[i].ratio);
1578*4882a593Smuzhiyun VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
1579*4882a593Smuzhiyun }
1580*4882a593Smuzhiyun
1581*4882a593Smuzhiyun return 0;
1582*4882a593Smuzhiyun }
1583*4882a593Smuzhiyun pure_initcall(memory_stats_init);
1584*4882a593Smuzhiyun
1585*4882a593Smuzhiyun static char *memory_stat_format(struct mem_cgroup *memcg)
1586*4882a593Smuzhiyun {
1587*4882a593Smuzhiyun struct seq_buf s;
1588*4882a593Smuzhiyun int i;
1589*4882a593Smuzhiyun
1590*4882a593Smuzhiyun seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1591*4882a593Smuzhiyun if (!s.buffer)
1592*4882a593Smuzhiyun return NULL;
1593*4882a593Smuzhiyun
1594*4882a593Smuzhiyun /*
1595*4882a593Smuzhiyun * Provide statistics on the state of the memory subsystem as
1596*4882a593Smuzhiyun * well as cumulative event counters that show past behavior.
1597*4882a593Smuzhiyun *
1598*4882a593Smuzhiyun * This list is ordered following a combination of these gradients:
1599*4882a593Smuzhiyun * 1) generic big picture -> specifics and details
1600*4882a593Smuzhiyun * 2) reflecting userspace activity -> reflecting kernel heuristics
1601*4882a593Smuzhiyun *
1602*4882a593Smuzhiyun * Current memory state:
1603*4882a593Smuzhiyun */
1604*4882a593Smuzhiyun
1605*4882a593Smuzhiyun for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1606*4882a593Smuzhiyun u64 size;
1607*4882a593Smuzhiyun
1608*4882a593Smuzhiyun size = memcg_page_state(memcg, memory_stats[i].idx);
1609*4882a593Smuzhiyun size *= memory_stats[i].ratio;
1610*4882a593Smuzhiyun seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1611*4882a593Smuzhiyun
1612*4882a593Smuzhiyun if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1613*4882a593Smuzhiyun size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1614*4882a593Smuzhiyun memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
1615*4882a593Smuzhiyun seq_buf_printf(&s, "slab %llu\n", size);
1616*4882a593Smuzhiyun }
1617*4882a593Smuzhiyun }
1618*4882a593Smuzhiyun
1619*4882a593Smuzhiyun /* Accumulated memory events */
1620*4882a593Smuzhiyun
1621*4882a593Smuzhiyun seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1622*4882a593Smuzhiyun memcg_events(memcg, PGFAULT));
1623*4882a593Smuzhiyun seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1624*4882a593Smuzhiyun memcg_events(memcg, PGMAJFAULT));
1625*4882a593Smuzhiyun seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1626*4882a593Smuzhiyun memcg_events(memcg, PGREFILL));
1627*4882a593Smuzhiyun seq_buf_printf(&s, "pgscan %lu\n",
1628*4882a593Smuzhiyun memcg_events(memcg, PGSCAN_KSWAPD) +
1629*4882a593Smuzhiyun memcg_events(memcg, PGSCAN_DIRECT));
1630*4882a593Smuzhiyun seq_buf_printf(&s, "pgsteal %lu\n",
1631*4882a593Smuzhiyun memcg_events(memcg, PGSTEAL_KSWAPD) +
1632*4882a593Smuzhiyun memcg_events(memcg, PGSTEAL_DIRECT));
1633*4882a593Smuzhiyun seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1634*4882a593Smuzhiyun memcg_events(memcg, PGACTIVATE));
1635*4882a593Smuzhiyun seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1636*4882a593Smuzhiyun memcg_events(memcg, PGDEACTIVATE));
1637*4882a593Smuzhiyun seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1638*4882a593Smuzhiyun memcg_events(memcg, PGLAZYFREE));
1639*4882a593Smuzhiyun seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1640*4882a593Smuzhiyun memcg_events(memcg, PGLAZYFREED));
1641*4882a593Smuzhiyun
1642*4882a593Smuzhiyun #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1643*4882a593Smuzhiyun seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1644*4882a593Smuzhiyun memcg_events(memcg, THP_FAULT_ALLOC));
1645*4882a593Smuzhiyun seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1646*4882a593Smuzhiyun memcg_events(memcg, THP_COLLAPSE_ALLOC));
1647*4882a593Smuzhiyun #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1648*4882a593Smuzhiyun
1649*4882a593Smuzhiyun /* The above should easily fit into one page */
1650*4882a593Smuzhiyun WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1651*4882a593Smuzhiyun
1652*4882a593Smuzhiyun return s.buffer;
1653*4882a593Smuzhiyun }
1654*4882a593Smuzhiyun
1655*4882a593Smuzhiyun #define K(x) ((x) << (PAGE_SHIFT-10))
1656*4882a593Smuzhiyun /**
1657*4882a593Smuzhiyun * mem_cgroup_print_oom_context: Print OOM information relevant to
1658*4882a593Smuzhiyun * memory controller.
1659*4882a593Smuzhiyun * @memcg: The memory cgroup that went over limit
1660*4882a593Smuzhiyun * @p: Task that is going to be killed
1661*4882a593Smuzhiyun *
1662*4882a593Smuzhiyun * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1663*4882a593Smuzhiyun * enabled
1664*4882a593Smuzhiyun */
1665*4882a593Smuzhiyun void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1666*4882a593Smuzhiyun {
1667*4882a593Smuzhiyun rcu_read_lock();
1668*4882a593Smuzhiyun
1669*4882a593Smuzhiyun if (memcg) {
1670*4882a593Smuzhiyun pr_cont(",oom_memcg=");
1671*4882a593Smuzhiyun pr_cont_cgroup_path(memcg->css.cgroup);
1672*4882a593Smuzhiyun } else
1673*4882a593Smuzhiyun pr_cont(",global_oom");
1674*4882a593Smuzhiyun if (p) {
1675*4882a593Smuzhiyun pr_cont(",task_memcg=");
1676*4882a593Smuzhiyun pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1677*4882a593Smuzhiyun }
1678*4882a593Smuzhiyun rcu_read_unlock();
1679*4882a593Smuzhiyun }
1680*4882a593Smuzhiyun
1681*4882a593Smuzhiyun /**
1682*4882a593Smuzhiyun * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1683*4882a593Smuzhiyun * memory controller.
1684*4882a593Smuzhiyun * @memcg: The memory cgroup that went over limit
1685*4882a593Smuzhiyun */
1686*4882a593Smuzhiyun void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1687*4882a593Smuzhiyun {
1688*4882a593Smuzhiyun char *buf;
1689*4882a593Smuzhiyun
1690*4882a593Smuzhiyun pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1691*4882a593Smuzhiyun K((u64)page_counter_read(&memcg->memory)),
1692*4882a593Smuzhiyun K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1693*4882a593Smuzhiyun if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1694*4882a593Smuzhiyun pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1695*4882a593Smuzhiyun K((u64)page_counter_read(&memcg->swap)),
1696*4882a593Smuzhiyun K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1697*4882a593Smuzhiyun else {
1698*4882a593Smuzhiyun pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1699*4882a593Smuzhiyun K((u64)page_counter_read(&memcg->memsw)),
1700*4882a593Smuzhiyun K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1701*4882a593Smuzhiyun pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1702*4882a593Smuzhiyun K((u64)page_counter_read(&memcg->kmem)),
1703*4882a593Smuzhiyun K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1704*4882a593Smuzhiyun }
1705*4882a593Smuzhiyun
1706*4882a593Smuzhiyun pr_info("Memory cgroup stats for ");
1707*4882a593Smuzhiyun pr_cont_cgroup_path(memcg->css.cgroup);
1708*4882a593Smuzhiyun pr_cont(":");
1709*4882a593Smuzhiyun buf = memory_stat_format(memcg);
1710*4882a593Smuzhiyun if (!buf)
1711*4882a593Smuzhiyun return;
1712*4882a593Smuzhiyun pr_info("%s", buf);
1713*4882a593Smuzhiyun kfree(buf);
1714*4882a593Smuzhiyun }
1715*4882a593Smuzhiyun
1716*4882a593Smuzhiyun /*
1717*4882a593Smuzhiyun * Return the memory (and swap, if configured) limit for a memcg.
1718*4882a593Smuzhiyun */
1719*4882a593Smuzhiyun unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1720*4882a593Smuzhiyun {
1721*4882a593Smuzhiyun unsigned long max = READ_ONCE(memcg->memory.max);
1722*4882a593Smuzhiyun
1723*4882a593Smuzhiyun if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1724*4882a593Smuzhiyun if (mem_cgroup_swappiness(memcg))
1725*4882a593Smuzhiyun max += min(READ_ONCE(memcg->swap.max),
1726*4882a593Smuzhiyun (unsigned long)total_swap_pages);
1727*4882a593Smuzhiyun } else { /* v1 */
1728*4882a593Smuzhiyun if (mem_cgroup_swappiness(memcg)) {
1729*4882a593Smuzhiyun /* Calculate swap excess capacity from memsw limit */
1730*4882a593Smuzhiyun unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1731*4882a593Smuzhiyun
1732*4882a593Smuzhiyun max += min(swap, (unsigned long)total_swap_pages);
1733*4882a593Smuzhiyun }
1734*4882a593Smuzhiyun }
1735*4882a593Smuzhiyun return max;
1736*4882a593Smuzhiyun }
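
/*
 * Worked example (illustrative numbers): on cgroup2 with
 * memory.max = 262144 pages (1G), swap.max = 131072 pages (512M),
 * ample physical swap and a non-zero swappiness, the OOM killer sees
 * 262144 + 131072 = 393216 pages. On cgroup1 the swap contribution is
 * instead the memsw excess, memsw.max - memory.max, clamped to
 * total_swap_pages.
 */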
1737*4882a593Smuzhiyun
1738*4882a593Smuzhiyun unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1739*4882a593Smuzhiyun {
1740*4882a593Smuzhiyun return page_counter_read(&memcg->memory);
1741*4882a593Smuzhiyun }
1742*4882a593Smuzhiyun
1743*4882a593Smuzhiyun static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1744*4882a593Smuzhiyun int order)
1745*4882a593Smuzhiyun {
1746*4882a593Smuzhiyun struct oom_control oc = {
1747*4882a593Smuzhiyun .zonelist = NULL,
1748*4882a593Smuzhiyun .nodemask = NULL,
1749*4882a593Smuzhiyun .memcg = memcg,
1750*4882a593Smuzhiyun .gfp_mask = gfp_mask,
1751*4882a593Smuzhiyun .order = order,
1752*4882a593Smuzhiyun };
1753*4882a593Smuzhiyun bool ret = true;
1754*4882a593Smuzhiyun
1755*4882a593Smuzhiyun if (mutex_lock_killable(&oom_lock))
1756*4882a593Smuzhiyun return true;
1757*4882a593Smuzhiyun
1758*4882a593Smuzhiyun if (mem_cgroup_margin(memcg) >= (1 << order))
1759*4882a593Smuzhiyun goto unlock;
1760*4882a593Smuzhiyun
1761*4882a593Smuzhiyun /*
1762*4882a593Smuzhiyun * A few threads which were not waiting at mutex_lock_killable() can
1763*4882a593Smuzhiyun * fail to bail out. Therefore, check again after holding oom_lock.
1764*4882a593Smuzhiyun */
1765*4882a593Smuzhiyun ret = task_is_dying() || out_of_memory(&oc);
1766*4882a593Smuzhiyun
1767*4882a593Smuzhiyun unlock:
1768*4882a593Smuzhiyun mutex_unlock(&oom_lock);
1769*4882a593Smuzhiyun return ret;
1770*4882a593Smuzhiyun }
1771*4882a593Smuzhiyun
1772*4882a593Smuzhiyun static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1773*4882a593Smuzhiyun pg_data_t *pgdat,
1774*4882a593Smuzhiyun gfp_t gfp_mask,
1775*4882a593Smuzhiyun unsigned long *total_scanned)
1776*4882a593Smuzhiyun {
1777*4882a593Smuzhiyun struct mem_cgroup *victim = NULL;
1778*4882a593Smuzhiyun int total = 0;
1779*4882a593Smuzhiyun int loop = 0;
1780*4882a593Smuzhiyun unsigned long excess;
1781*4882a593Smuzhiyun unsigned long nr_scanned;
1782*4882a593Smuzhiyun struct mem_cgroup_reclaim_cookie reclaim = {
1783*4882a593Smuzhiyun .pgdat = pgdat,
1784*4882a593Smuzhiyun };
1785*4882a593Smuzhiyun
1786*4882a593Smuzhiyun excess = soft_limit_excess(root_memcg);
1787*4882a593Smuzhiyun
1788*4882a593Smuzhiyun while (1) {
1789*4882a593Smuzhiyun victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1790*4882a593Smuzhiyun if (!victim) {
1791*4882a593Smuzhiyun loop++;
1792*4882a593Smuzhiyun if (loop >= 2) {
1793*4882a593Smuzhiyun /*
1794*4882a593Smuzhiyun * If we have not been able to reclaim
1795*4882a593Smuzhiyun * anything, it might be because there are
1796*4882a593Smuzhiyun * no reclaimable pages under this hierarchy
1797*4882a593Smuzhiyun */
1798*4882a593Smuzhiyun if (!total)
1799*4882a593Smuzhiyun break;
1800*4882a593Smuzhiyun /*
1801*4882a593Smuzhiyun * We want to do more targeted reclaim.
1802*4882a593Smuzhiyun * excess >> 2 is not too excessive, so we don't
1803*4882a593Smuzhiyun * reclaim too much, nor too little, which would keep
1804*4882a593Smuzhiyun * us coming back to reclaim from this cgroup
1805*4882a593Smuzhiyun */
1806*4882a593Smuzhiyun if (total >= (excess >> 2) ||
1807*4882a593Smuzhiyun (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1808*4882a593Smuzhiyun break;
1809*4882a593Smuzhiyun }
1810*4882a593Smuzhiyun continue;
1811*4882a593Smuzhiyun }
1812*4882a593Smuzhiyun total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1813*4882a593Smuzhiyun pgdat, &nr_scanned);
1814*4882a593Smuzhiyun *total_scanned += nr_scanned;
1815*4882a593Smuzhiyun if (!soft_limit_excess(root_memcg))
1816*4882a593Smuzhiyun break;
1817*4882a593Smuzhiyun }
1818*4882a593Smuzhiyun mem_cgroup_iter_break(root_memcg, victim);
1819*4882a593Smuzhiyun return total;
1820*4882a593Smuzhiyun }
1821*4882a593Smuzhiyun
1822*4882a593Smuzhiyun #ifdef CONFIG_LOCKDEP
1823*4882a593Smuzhiyun static struct lockdep_map memcg_oom_lock_dep_map = {
1824*4882a593Smuzhiyun .name = "memcg_oom_lock",
1825*4882a593Smuzhiyun };
1826*4882a593Smuzhiyun #endif
1827*4882a593Smuzhiyun
1828*4882a593Smuzhiyun static DEFINE_SPINLOCK(memcg_oom_lock);
1829*4882a593Smuzhiyun
1830*4882a593Smuzhiyun /*
1831*4882a593Smuzhiyun * Check whether the OOM killer is already running under our hierarchy.
1832*4882a593Smuzhiyun * If someone else is already running it, return false.
1833*4882a593Smuzhiyun */
1834*4882a593Smuzhiyun static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1835*4882a593Smuzhiyun {
1836*4882a593Smuzhiyun struct mem_cgroup *iter, *failed = NULL;
1837*4882a593Smuzhiyun
1838*4882a593Smuzhiyun spin_lock(&memcg_oom_lock);
1839*4882a593Smuzhiyun
1840*4882a593Smuzhiyun for_each_mem_cgroup_tree(iter, memcg) {
1841*4882a593Smuzhiyun if (iter->oom_lock) {
1842*4882a593Smuzhiyun /*
1843*4882a593Smuzhiyun * this subtree of our hierarchy is already locked
1844*4882a593Smuzhiyun * so we cannot give a lock.
1845*4882a593Smuzhiyun */
1846*4882a593Smuzhiyun failed = iter;
1847*4882a593Smuzhiyun mem_cgroup_iter_break(memcg, iter);
1848*4882a593Smuzhiyun break;
1849*4882a593Smuzhiyun } else
1850*4882a593Smuzhiyun iter->oom_lock = true;
1851*4882a593Smuzhiyun }
1852*4882a593Smuzhiyun
1853*4882a593Smuzhiyun if (failed) {
1854*4882a593Smuzhiyun /*
1855*4882a593Smuzhiyun * OK, we failed to lock the whole subtree so we have
1856*4882a593Smuzhiyun * to clean up what we already set up, up to the failing subtree
1857*4882a593Smuzhiyun */
1858*4882a593Smuzhiyun for_each_mem_cgroup_tree(iter, memcg) {
1859*4882a593Smuzhiyun if (iter == failed) {
1860*4882a593Smuzhiyun mem_cgroup_iter_break(memcg, iter);
1861*4882a593Smuzhiyun break;
1862*4882a593Smuzhiyun }
1863*4882a593Smuzhiyun iter->oom_lock = false;
1864*4882a593Smuzhiyun }
1865*4882a593Smuzhiyun } else
1866*4882a593Smuzhiyun mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1867*4882a593Smuzhiyun
1868*4882a593Smuzhiyun spin_unlock(&memcg_oom_lock);
1869*4882a593Smuzhiyun
1870*4882a593Smuzhiyun return !failed;
1871*4882a593Smuzhiyun }
1872*4882a593Smuzhiyun
1873*4882a593Smuzhiyun static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1874*4882a593Smuzhiyun {
1875*4882a593Smuzhiyun struct mem_cgroup *iter;
1876*4882a593Smuzhiyun
1877*4882a593Smuzhiyun spin_lock(&memcg_oom_lock);
1878*4882a593Smuzhiyun mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1879*4882a593Smuzhiyun for_each_mem_cgroup_tree(iter, memcg)
1880*4882a593Smuzhiyun iter->oom_lock = false;
1881*4882a593Smuzhiyun spin_unlock(&memcg_oom_lock);
1882*4882a593Smuzhiyun }
1883*4882a593Smuzhiyun
1884*4882a593Smuzhiyun static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1885*4882a593Smuzhiyun {
1886*4882a593Smuzhiyun struct mem_cgroup *iter;
1887*4882a593Smuzhiyun
1888*4882a593Smuzhiyun spin_lock(&memcg_oom_lock);
1889*4882a593Smuzhiyun for_each_mem_cgroup_tree(iter, memcg)
1890*4882a593Smuzhiyun iter->under_oom++;
1891*4882a593Smuzhiyun spin_unlock(&memcg_oom_lock);
1892*4882a593Smuzhiyun }
1893*4882a593Smuzhiyun
1894*4882a593Smuzhiyun static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1895*4882a593Smuzhiyun {
1896*4882a593Smuzhiyun struct mem_cgroup *iter;
1897*4882a593Smuzhiyun
1898*4882a593Smuzhiyun /*
1899*4882a593Smuzhiyun * Be careful about under_oom underflows because a child memcg
1900*4882a593Smuzhiyun * could have been added after mem_cgroup_mark_under_oom.
1901*4882a593Smuzhiyun */
1902*4882a593Smuzhiyun spin_lock(&memcg_oom_lock);
1903*4882a593Smuzhiyun for_each_mem_cgroup_tree(iter, memcg)
1904*4882a593Smuzhiyun if (iter->under_oom > 0)
1905*4882a593Smuzhiyun iter->under_oom--;
1906*4882a593Smuzhiyun spin_unlock(&memcg_oom_lock);
1907*4882a593Smuzhiyun }
1908*4882a593Smuzhiyun
1909*4882a593Smuzhiyun static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1910*4882a593Smuzhiyun
1911*4882a593Smuzhiyun struct oom_wait_info {
1912*4882a593Smuzhiyun struct mem_cgroup *memcg;
1913*4882a593Smuzhiyun wait_queue_entry_t wait;
1914*4882a593Smuzhiyun };
1915*4882a593Smuzhiyun
1916*4882a593Smuzhiyun static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1917*4882a593Smuzhiyun unsigned mode, int sync, void *arg)
1918*4882a593Smuzhiyun {
1919*4882a593Smuzhiyun struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1920*4882a593Smuzhiyun struct mem_cgroup *oom_wait_memcg;
1921*4882a593Smuzhiyun struct oom_wait_info *oom_wait_info;
1922*4882a593Smuzhiyun
1923*4882a593Smuzhiyun oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1924*4882a593Smuzhiyun oom_wait_memcg = oom_wait_info->memcg;
1925*4882a593Smuzhiyun
1926*4882a593Smuzhiyun if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1927*4882a593Smuzhiyun !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1928*4882a593Smuzhiyun return 0;
1929*4882a593Smuzhiyun return autoremove_wake_function(wait, mode, sync, arg);
1930*4882a593Smuzhiyun }
1931*4882a593Smuzhiyun
1932*4882a593Smuzhiyun static void memcg_oom_recover(struct mem_cgroup *memcg)
1933*4882a593Smuzhiyun {
1934*4882a593Smuzhiyun /*
1935*4882a593Smuzhiyun * For the following lockless ->under_oom test, the only required
1936*4882a593Smuzhiyun * guarantee is that it must see the state asserted by an OOM when
1937*4882a593Smuzhiyun * this function is called as a result of userland actions
1938*4882a593Smuzhiyun * triggered by the notification of the OOM. This is trivially
1939*4882a593Smuzhiyun * achieved by invoking mem_cgroup_mark_under_oom() before
1940*4882a593Smuzhiyun * triggering notification.
1941*4882a593Smuzhiyun */
1942*4882a593Smuzhiyun if (memcg && memcg->under_oom)
1943*4882a593Smuzhiyun __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1944*4882a593Smuzhiyun }
1945*4882a593Smuzhiyun
1946*4882a593Smuzhiyun enum oom_status {
1947*4882a593Smuzhiyun OOM_SUCCESS,
1948*4882a593Smuzhiyun OOM_FAILED,
1949*4882a593Smuzhiyun OOM_ASYNC,
1950*4882a593Smuzhiyun OOM_SKIPPED
1951*4882a593Smuzhiyun };
1952*4882a593Smuzhiyun
1953*4882a593Smuzhiyun static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1954*4882a593Smuzhiyun {
1955*4882a593Smuzhiyun enum oom_status ret;
1956*4882a593Smuzhiyun bool locked;
1957*4882a593Smuzhiyun
1958*4882a593Smuzhiyun if (order > PAGE_ALLOC_COSTLY_ORDER)
1959*4882a593Smuzhiyun return OOM_SKIPPED;
1960*4882a593Smuzhiyun
1961*4882a593Smuzhiyun memcg_memory_event(memcg, MEMCG_OOM);
1962*4882a593Smuzhiyun
1963*4882a593Smuzhiyun /*
1964*4882a593Smuzhiyun * We are in the middle of the charge context here, so we
1965*4882a593Smuzhiyun * don't want to block when potentially sitting on a callstack
1966*4882a593Smuzhiyun * that holds all kinds of filesystem and mm locks.
1967*4882a593Smuzhiyun *
1968*4882a593Smuzhiyun * cgroup1 allows disabling the OOM killer and waiting for outside
1969*4882a593Smuzhiyun * handling until the charge can succeed; remember the context and put
1970*4882a593Smuzhiyun * the task to sleep at the end of the page fault when all locks are
1971*4882a593Smuzhiyun * released.
1972*4882a593Smuzhiyun *
1973*4882a593Smuzhiyun * On the other hand, in-kernel OOM killer allows for an async victim
1974*4882a593Smuzhiyun * memory reclaim (oom_reaper) and that means that we are not solely
1975*4882a593Smuzhiyun * relying on the oom victim to make a forward progress and we can
1976*4882a593Smuzhiyun * invoke the oom killer here.
1977*4882a593Smuzhiyun *
1978*4882a593Smuzhiyun * Please note that mem_cgroup_out_of_memory might fail to find a
1979*4882a593Smuzhiyun * victim and then we have to bail out from the charge path.
1980*4882a593Smuzhiyun */
1981*4882a593Smuzhiyun if (memcg->oom_kill_disable) {
1982*4882a593Smuzhiyun if (!current->in_user_fault)
1983*4882a593Smuzhiyun return OOM_SKIPPED;
1984*4882a593Smuzhiyun css_get(&memcg->css);
1985*4882a593Smuzhiyun current->memcg_in_oom = memcg;
1986*4882a593Smuzhiyun current->memcg_oom_gfp_mask = mask;
1987*4882a593Smuzhiyun current->memcg_oom_order = order;
1988*4882a593Smuzhiyun
1989*4882a593Smuzhiyun return OOM_ASYNC;
1990*4882a593Smuzhiyun }
1991*4882a593Smuzhiyun
1992*4882a593Smuzhiyun mem_cgroup_mark_under_oom(memcg);
1993*4882a593Smuzhiyun
1994*4882a593Smuzhiyun locked = mem_cgroup_oom_trylock(memcg);
1995*4882a593Smuzhiyun
1996*4882a593Smuzhiyun if (locked)
1997*4882a593Smuzhiyun mem_cgroup_oom_notify(memcg);
1998*4882a593Smuzhiyun
1999*4882a593Smuzhiyun mem_cgroup_unmark_under_oom(memcg);
2000*4882a593Smuzhiyun if (mem_cgroup_out_of_memory(memcg, mask, order))
2001*4882a593Smuzhiyun ret = OOM_SUCCESS;
2002*4882a593Smuzhiyun else
2003*4882a593Smuzhiyun ret = OOM_FAILED;
2004*4882a593Smuzhiyun
2005*4882a593Smuzhiyun if (locked)
2006*4882a593Smuzhiyun mem_cgroup_oom_unlock(memcg);
2007*4882a593Smuzhiyun
2008*4882a593Smuzhiyun return ret;
2009*4882a593Smuzhiyun }
2010*4882a593Smuzhiyun
2011*4882a593Smuzhiyun /**
2012*4882a593Smuzhiyun * mem_cgroup_oom_synchronize - complete memcg OOM handling
2013*4882a593Smuzhiyun * @handle: actually kill/wait or just clean up the OOM state
2014*4882a593Smuzhiyun *
2015*4882a593Smuzhiyun * This has to be called at the end of a page fault if the memcg OOM
2016*4882a593Smuzhiyun * handler was enabled.
2017*4882a593Smuzhiyun *
2018*4882a593Smuzhiyun * Memcg supports userspace OOM handling where failed allocations must
2019*4882a593Smuzhiyun * sleep on a waitqueue until the userspace task resolves the
2020*4882a593Smuzhiyun * situation. Sleeping directly in the charge context with all kinds
2021*4882a593Smuzhiyun * of locks held is not a good idea; instead we remember an OOM state
2022*4882a593Smuzhiyun * in the task and mem_cgroup_oom_synchronize() has to be called at
2023*4882a593Smuzhiyun * the end of the page fault to complete the OOM handling.
2024*4882a593Smuzhiyun *
2025*4882a593Smuzhiyun * Returns %true if an ongoing memcg OOM situation was detected and
2026*4882a593Smuzhiyun * completed, %false otherwise.
2027*4882a593Smuzhiyun */
2028*4882a593Smuzhiyun bool mem_cgroup_oom_synchronize(bool handle)
2029*4882a593Smuzhiyun {
2030*4882a593Smuzhiyun struct mem_cgroup *memcg = current->memcg_in_oom;
2031*4882a593Smuzhiyun struct oom_wait_info owait;
2032*4882a593Smuzhiyun bool locked;
2033*4882a593Smuzhiyun
2034*4882a593Smuzhiyun /* OOM is global, do not handle */
2035*4882a593Smuzhiyun if (!memcg)
2036*4882a593Smuzhiyun return false;
2037*4882a593Smuzhiyun
2038*4882a593Smuzhiyun if (!handle)
2039*4882a593Smuzhiyun goto cleanup;
2040*4882a593Smuzhiyun
2041*4882a593Smuzhiyun owait.memcg = memcg;
2042*4882a593Smuzhiyun owait.wait.flags = 0;
2043*4882a593Smuzhiyun owait.wait.func = memcg_oom_wake_function;
2044*4882a593Smuzhiyun owait.wait.private = current;
2045*4882a593Smuzhiyun INIT_LIST_HEAD(&owait.wait.entry);
2046*4882a593Smuzhiyun
2047*4882a593Smuzhiyun prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2048*4882a593Smuzhiyun mem_cgroup_mark_under_oom(memcg);
2049*4882a593Smuzhiyun
2050*4882a593Smuzhiyun locked = mem_cgroup_oom_trylock(memcg);
2051*4882a593Smuzhiyun
2052*4882a593Smuzhiyun if (locked)
2053*4882a593Smuzhiyun mem_cgroup_oom_notify(memcg);
2054*4882a593Smuzhiyun
2055*4882a593Smuzhiyun if (locked && !memcg->oom_kill_disable) {
2056*4882a593Smuzhiyun mem_cgroup_unmark_under_oom(memcg);
2057*4882a593Smuzhiyun finish_wait(&memcg_oom_waitq, &owait.wait);
2058*4882a593Smuzhiyun mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
2059*4882a593Smuzhiyun current->memcg_oom_order);
2060*4882a593Smuzhiyun } else {
2061*4882a593Smuzhiyun schedule();
2062*4882a593Smuzhiyun mem_cgroup_unmark_under_oom(memcg);
2063*4882a593Smuzhiyun finish_wait(&memcg_oom_waitq, &owait.wait);
2064*4882a593Smuzhiyun }
2065*4882a593Smuzhiyun
2066*4882a593Smuzhiyun if (locked) {
2067*4882a593Smuzhiyun mem_cgroup_oom_unlock(memcg);
2068*4882a593Smuzhiyun /*
2069*4882a593Smuzhiyun * There is no guarantee that an OOM-lock contender
2070*4882a593Smuzhiyun * sees the wakeups triggered by the OOM kill
2071*4882a593Smuzhiyun * uncharges. Wake any sleepers explicitly.
2072*4882a593Smuzhiyun */
2073*4882a593Smuzhiyun memcg_oom_recover(memcg);
2074*4882a593Smuzhiyun }
2075*4882a593Smuzhiyun cleanup:
2076*4882a593Smuzhiyun current->memcg_in_oom = NULL;
2077*4882a593Smuzhiyun css_put(&memcg->css);
2078*4882a593Smuzhiyun return true;
2079*4882a593Smuzhiyun }
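
/*
 * Example (illustrative sketch): how a page fault exit path is expected
 * to use this, loosely following pagefault_out_of_memory(). A true
 * return means the memcg OOM situation was handled here and the caller
 * can simply return.
 *
 *	if (fault & VM_FAULT_OOM) {
 *		if (mem_cgroup_oom_synchronize(true))
 *			return;
 *		...fall back to the global OOM path...
 *	}
 */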
2080*4882a593Smuzhiyun
2081*4882a593Smuzhiyun /**
2082*4882a593Smuzhiyun * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
2083*4882a593Smuzhiyun * @victim: task to be killed by the OOM killer
2084*4882a593Smuzhiyun * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
2085*4882a593Smuzhiyun *
2086*4882a593Smuzhiyun * Returns a pointer to a memory cgroup, which has to be cleaned up
2087*4882a593Smuzhiyun * by killing all of its OOM-killable tasks.
2088*4882a593Smuzhiyun *
2089*4882a593Smuzhiyun * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
2090*4882a593Smuzhiyun */
2091*4882a593Smuzhiyun struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
2092*4882a593Smuzhiyun struct mem_cgroup *oom_domain)
2093*4882a593Smuzhiyun {
2094*4882a593Smuzhiyun struct mem_cgroup *oom_group = NULL;
2095*4882a593Smuzhiyun struct mem_cgroup *memcg;
2096*4882a593Smuzhiyun
2097*4882a593Smuzhiyun if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2098*4882a593Smuzhiyun return NULL;
2099*4882a593Smuzhiyun
2100*4882a593Smuzhiyun if (!oom_domain)
2101*4882a593Smuzhiyun oom_domain = root_mem_cgroup;
2102*4882a593Smuzhiyun
2103*4882a593Smuzhiyun rcu_read_lock();
2104*4882a593Smuzhiyun
2105*4882a593Smuzhiyun memcg = mem_cgroup_from_task(victim);
2106*4882a593Smuzhiyun if (memcg == root_mem_cgroup)
2107*4882a593Smuzhiyun goto out;
2108*4882a593Smuzhiyun
2109*4882a593Smuzhiyun /*
2110*4882a593Smuzhiyun * If the victim task has been asynchronously moved to a different
2111*4882a593Smuzhiyun * memory cgroup, we might end up killing tasks outside oom_domain.
2112*4882a593Smuzhiyun * In this case it's better to ignore memory.group.oom.
2113*4882a593Smuzhiyun */
2114*4882a593Smuzhiyun if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2115*4882a593Smuzhiyun goto out;
2116*4882a593Smuzhiyun
2117*4882a593Smuzhiyun /*
2118*4882a593Smuzhiyun * Traverse the memory cgroup hierarchy from the victim task's
2119*4882a593Smuzhiyun * cgroup up to the OOMing cgroup (or root) to find the
2120*4882a593Smuzhiyun * highest-level memory cgroup with oom.group set.
2121*4882a593Smuzhiyun */
2122*4882a593Smuzhiyun for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2123*4882a593Smuzhiyun if (memcg->oom_group)
2124*4882a593Smuzhiyun oom_group = memcg;
2125*4882a593Smuzhiyun
2126*4882a593Smuzhiyun if (memcg == oom_domain)
2127*4882a593Smuzhiyun break;
2128*4882a593Smuzhiyun }
2129*4882a593Smuzhiyun
2130*4882a593Smuzhiyun if (oom_group)
2131*4882a593Smuzhiyun css_get(&oom_group->css);
2132*4882a593Smuzhiyun out:
2133*4882a593Smuzhiyun rcu_read_unlock();
2134*4882a593Smuzhiyun
2135*4882a593Smuzhiyun return oom_group;
2136*4882a593Smuzhiyun }
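
/*
 * Example (illustrative sketch): how an OOM kill path might consume the
 * result, loosely following oom_kill_process(). kill_one_task() is a
 * hypothetical callback for mem_cgroup_scan_tasks().
 *
 *	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
 *	...kill the selected victim...
 *	if (oom_group) {
 *		mem_cgroup_print_oom_group(oom_group);
 *		mem_cgroup_scan_tasks(oom_group, kill_one_task, NULL);
 *		mem_cgroup_put(oom_group);
 *	}
 */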
2137*4882a593Smuzhiyun
2138*4882a593Smuzhiyun void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2139*4882a593Smuzhiyun {
2140*4882a593Smuzhiyun pr_info("Tasks in ");
2141*4882a593Smuzhiyun pr_cont_cgroup_path(memcg->css.cgroup);
2142*4882a593Smuzhiyun pr_cont(" are going to be killed due to memory.oom.group set\n");
2143*4882a593Smuzhiyun }
2144*4882a593Smuzhiyun
2145*4882a593Smuzhiyun /**
2146*4882a593Smuzhiyun * lock_page_memcg - lock a page->mem_cgroup binding
2147*4882a593Smuzhiyun * @page: the page
2148*4882a593Smuzhiyun *
2149*4882a593Smuzhiyun * This function protects unlocked LRU pages from being moved to
2150*4882a593Smuzhiyun * another cgroup.
2151*4882a593Smuzhiyun *
2152*4882a593Smuzhiyun * It ensures lifetime of the returned memcg. Caller is responsible
2153*4882a593Smuzhiyun * for the lifetime of the page; __unlock_page_memcg() is available
2154*4882a593Smuzhiyun * when @page might get freed inside the locked section.
2155*4882a593Smuzhiyun */
2156*4882a593Smuzhiyun struct mem_cgroup *lock_page_memcg(struct page *page)
2157*4882a593Smuzhiyun {
2158*4882a593Smuzhiyun struct page *head = compound_head(page); /* rmap on tail pages */
2159*4882a593Smuzhiyun struct mem_cgroup *memcg;
2160*4882a593Smuzhiyun unsigned long flags;
2161*4882a593Smuzhiyun
2162*4882a593Smuzhiyun /*
2163*4882a593Smuzhiyun * The RCU lock is held throughout the transaction. The fast
2164*4882a593Smuzhiyun * path can get away without acquiring the memcg->move_lock
2165*4882a593Smuzhiyun * because page moving starts with an RCU grace period.
2166*4882a593Smuzhiyun *
2167*4882a593Smuzhiyun * The RCU lock also protects the memcg from being freed when
2168*4882a593Smuzhiyun * the page state that is going to change is the only thing
2169*4882a593Smuzhiyun * preventing the page itself from being freed. E.g. writeback
2170*4882a593Smuzhiyun * doesn't hold a page reference and relies on PG_writeback to
2171*4882a593Smuzhiyun * keep off truncation, migration and so forth.
2172*4882a593Smuzhiyun */
2173*4882a593Smuzhiyun rcu_read_lock();
2174*4882a593Smuzhiyun
2175*4882a593Smuzhiyun if (mem_cgroup_disabled())
2176*4882a593Smuzhiyun return NULL;
2177*4882a593Smuzhiyun again:
2178*4882a593Smuzhiyun memcg = head->mem_cgroup;
2179*4882a593Smuzhiyun if (unlikely(!memcg))
2180*4882a593Smuzhiyun return NULL;
2181*4882a593Smuzhiyun
2182*4882a593Smuzhiyun if (atomic_read(&memcg->moving_account) <= 0)
2183*4882a593Smuzhiyun return memcg;
2184*4882a593Smuzhiyun
2185*4882a593Smuzhiyun spin_lock_irqsave(&memcg->move_lock, flags);
2186*4882a593Smuzhiyun if (memcg != head->mem_cgroup) {
2187*4882a593Smuzhiyun spin_unlock_irqrestore(&memcg->move_lock, flags);
2188*4882a593Smuzhiyun goto again;
2189*4882a593Smuzhiyun }
2190*4882a593Smuzhiyun
2191*4882a593Smuzhiyun /*
2192*4882a593Smuzhiyun * When charge migration first begins, we can have locked and
2193*4882a593Smuzhiyun * unlocked page stat updates happening concurrently. Track
2194*4882a593Smuzhiyun * the task who has the lock for unlock_page_memcg().
2195*4882a593Smuzhiyun */
2196*4882a593Smuzhiyun memcg->move_lock_task = current;
2197*4882a593Smuzhiyun memcg->move_lock_flags = flags;
2198*4882a593Smuzhiyun
2199*4882a593Smuzhiyun return memcg;
2200*4882a593Smuzhiyun }
2201*4882a593Smuzhiyun EXPORT_SYMBOL(lock_page_memcg);
2202*4882a593Smuzhiyun
2203*4882a593Smuzhiyun /**
2204*4882a593Smuzhiyun * __unlock_page_memcg - unlock and unpin a memcg
2205*4882a593Smuzhiyun * @memcg: the memcg
2206*4882a593Smuzhiyun *
2207*4882a593Smuzhiyun * Unlock and unpin a memcg returned by lock_page_memcg().
2208*4882a593Smuzhiyun */
2209*4882a593Smuzhiyun void __unlock_page_memcg(struct mem_cgroup *memcg)
2210*4882a593Smuzhiyun {
2211*4882a593Smuzhiyun if (memcg && memcg->move_lock_task == current) {
2212*4882a593Smuzhiyun unsigned long flags = memcg->move_lock_flags;
2213*4882a593Smuzhiyun
2214*4882a593Smuzhiyun memcg->move_lock_task = NULL;
2215*4882a593Smuzhiyun memcg->move_lock_flags = 0;
2216*4882a593Smuzhiyun
2217*4882a593Smuzhiyun spin_unlock_irqrestore(&memcg->move_lock, flags);
2218*4882a593Smuzhiyun }
2219*4882a593Smuzhiyun
2220*4882a593Smuzhiyun rcu_read_unlock();
2221*4882a593Smuzhiyun }
2222*4882a593Smuzhiyun
2223*4882a593Smuzhiyun /**
2224*4882a593Smuzhiyun * unlock_page_memcg - unlock a page->mem_cgroup binding
2225*4882a593Smuzhiyun * @page: the page
2226*4882a593Smuzhiyun */
2227*4882a593Smuzhiyun void unlock_page_memcg(struct page *page)
2228*4882a593Smuzhiyun {
2229*4882a593Smuzhiyun struct page *head = compound_head(page);
2230*4882a593Smuzhiyun
2231*4882a593Smuzhiyun __unlock_page_memcg(head->mem_cgroup);
2232*4882a593Smuzhiyun }
2233*4882a593Smuzhiyun EXPORT_SYMBOL(unlock_page_memcg);
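
/*
 * Example (illustrative sketch): the usual pairing around a page state
 * update that must not race with cgroup migration, loosely following
 * the dirty accounting path in __set_page_dirty_nobuffers(); details
 * differ between trees.
 *
 *	lock_page_memcg(page);
 *	if (!TestSetPageDirty(page))
 *		account_page_dirtied(page, mapping);
 *	unlock_page_memcg(page);
 */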
2234*4882a593Smuzhiyun
2235*4882a593Smuzhiyun struct memcg_stock_pcp {
2236*4882a593Smuzhiyun struct mem_cgroup *cached; /* this is never the root cgroup */
2237*4882a593Smuzhiyun unsigned int nr_pages;
2238*4882a593Smuzhiyun
2239*4882a593Smuzhiyun #ifdef CONFIG_MEMCG_KMEM
2240*4882a593Smuzhiyun struct obj_cgroup *cached_objcg;
2241*4882a593Smuzhiyun unsigned int nr_bytes;
2242*4882a593Smuzhiyun #endif
2243*4882a593Smuzhiyun
2244*4882a593Smuzhiyun struct work_struct work;
2245*4882a593Smuzhiyun unsigned long flags;
2246*4882a593Smuzhiyun #define FLUSHING_CACHED_CHARGE 0
2247*4882a593Smuzhiyun };
2248*4882a593Smuzhiyun static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2249*4882a593Smuzhiyun static DEFINE_MUTEX(percpu_charge_mutex);
2250*4882a593Smuzhiyun
2251*4882a593Smuzhiyun #ifdef CONFIG_MEMCG_KMEM
2252*4882a593Smuzhiyun static void drain_obj_stock(struct memcg_stock_pcp *stock);
2253*4882a593Smuzhiyun static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2254*4882a593Smuzhiyun struct mem_cgroup *root_memcg);
2255*4882a593Smuzhiyun
2256*4882a593Smuzhiyun #else
2257*4882a593Smuzhiyun static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2258*4882a593Smuzhiyun {
2259*4882a593Smuzhiyun }
2260*4882a593Smuzhiyun static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2261*4882a593Smuzhiyun struct mem_cgroup *root_memcg)
2262*4882a593Smuzhiyun {
2263*4882a593Smuzhiyun return false;
2264*4882a593Smuzhiyun }
2265*4882a593Smuzhiyun #endif
2266*4882a593Smuzhiyun
2267*4882a593Smuzhiyun /**
2268*4882a593Smuzhiyun * consume_stock: Try to consume stocked charge on this cpu.
2269*4882a593Smuzhiyun * @memcg: memcg to consume from.
2270*4882a593Smuzhiyun * @nr_pages: how many pages to charge.
2271*4882a593Smuzhiyun *
2272*4882a593Smuzhiyun * The charges will only happen if @memcg matches the current cpu's memcg
2273*4882a593Smuzhiyun * stock, and at least @nr_pages are available in that stock. Failure to
2274*4882a593Smuzhiyun * service an allocation will refill the stock.
2275*4882a593Smuzhiyun *
2276*4882a593Smuzhiyun * returns true if successful, false otherwise.
2277*4882a593Smuzhiyun */
2278*4882a593Smuzhiyun static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2279*4882a593Smuzhiyun {
2280*4882a593Smuzhiyun struct memcg_stock_pcp *stock;
2281*4882a593Smuzhiyun unsigned long flags;
2282*4882a593Smuzhiyun bool ret = false;
2283*4882a593Smuzhiyun
2284*4882a593Smuzhiyun if (nr_pages > MEMCG_CHARGE_BATCH)
2285*4882a593Smuzhiyun return ret;
2286*4882a593Smuzhiyun
2287*4882a593Smuzhiyun local_irq_save(flags);
2288*4882a593Smuzhiyun
2289*4882a593Smuzhiyun stock = this_cpu_ptr(&memcg_stock);
2290*4882a593Smuzhiyun if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2291*4882a593Smuzhiyun stock->nr_pages -= nr_pages;
2292*4882a593Smuzhiyun ret = true;
2293*4882a593Smuzhiyun }
2294*4882a593Smuzhiyun
2295*4882a593Smuzhiyun local_irq_restore(flags);
2296*4882a593Smuzhiyun
2297*4882a593Smuzhiyun return ret;
2298*4882a593Smuzhiyun }
2299*4882a593Smuzhiyun
2300*4882a593Smuzhiyun /*
2301*4882a593Smuzhiyun * Return the stock cached in the percpu area to the page counters and reset the cached information.
2302*4882a593Smuzhiyun */
2303*4882a593Smuzhiyun static void drain_stock(struct memcg_stock_pcp *stock)
2304*4882a593Smuzhiyun {
2305*4882a593Smuzhiyun struct mem_cgroup *old = stock->cached;
2306*4882a593Smuzhiyun
2307*4882a593Smuzhiyun if (!old)
2308*4882a593Smuzhiyun return;
2309*4882a593Smuzhiyun
2310*4882a593Smuzhiyun if (stock->nr_pages) {
2311*4882a593Smuzhiyun page_counter_uncharge(&old->memory, stock->nr_pages);
2312*4882a593Smuzhiyun if (do_memsw_account())
2313*4882a593Smuzhiyun page_counter_uncharge(&old->memsw, stock->nr_pages);
2314*4882a593Smuzhiyun stock->nr_pages = 0;
2315*4882a593Smuzhiyun }
2316*4882a593Smuzhiyun
2317*4882a593Smuzhiyun css_put(&old->css);
2318*4882a593Smuzhiyun stock->cached = NULL;
2319*4882a593Smuzhiyun }
2320*4882a593Smuzhiyun
2321*4882a593Smuzhiyun static void drain_local_stock(struct work_struct *dummy)
2322*4882a593Smuzhiyun {
2323*4882a593Smuzhiyun struct memcg_stock_pcp *stock;
2324*4882a593Smuzhiyun unsigned long flags;
2325*4882a593Smuzhiyun
2326*4882a593Smuzhiyun /*
2327*4882a593Smuzhiyun * The only protection from memory hotplug vs. drain_stock races is
2328*4882a593Smuzhiyun * that we always operate on local CPU stock here with IRQ disabled
2329*4882a593Smuzhiyun */
2330*4882a593Smuzhiyun local_irq_save(flags);
2331*4882a593Smuzhiyun
2332*4882a593Smuzhiyun stock = this_cpu_ptr(&memcg_stock);
2333*4882a593Smuzhiyun drain_obj_stock(stock);
2334*4882a593Smuzhiyun drain_stock(stock);
2335*4882a593Smuzhiyun clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2336*4882a593Smuzhiyun
2337*4882a593Smuzhiyun local_irq_restore(flags);
2338*4882a593Smuzhiyun }
2339*4882a593Smuzhiyun
2340*4882a593Smuzhiyun /*
2341*4882a593Smuzhiyun * Cache charges (nr_pages) in the local per-CPU area.
2342*4882a593Smuzhiyun * They will be consumed by consume_stock() later.
2343*4882a593Smuzhiyun */
2344*4882a593Smuzhiyun static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2345*4882a593Smuzhiyun {
2346*4882a593Smuzhiyun struct memcg_stock_pcp *stock;
2347*4882a593Smuzhiyun unsigned long flags;
2348*4882a593Smuzhiyun
2349*4882a593Smuzhiyun local_irq_save(flags);
2350*4882a593Smuzhiyun
2351*4882a593Smuzhiyun stock = this_cpu_ptr(&memcg_stock);
2352*4882a593Smuzhiyun if (stock->cached != memcg) { /* reset if necessary */
2353*4882a593Smuzhiyun drain_stock(stock);
2354*4882a593Smuzhiyun css_get(&memcg->css);
2355*4882a593Smuzhiyun stock->cached = memcg;
2356*4882a593Smuzhiyun }
2357*4882a593Smuzhiyun stock->nr_pages += nr_pages;
2358*4882a593Smuzhiyun
2359*4882a593Smuzhiyun if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2360*4882a593Smuzhiyun drain_stock(stock);
2361*4882a593Smuzhiyun
2362*4882a593Smuzhiyun local_irq_restore(flags);
2363*4882a593Smuzhiyun }
2364*4882a593Smuzhiyun
2365*4882a593Smuzhiyun /*
2366*4882a593Smuzhiyun * Drains all per-CPU charge caches for given root_memcg resp. subtree
2367*4882a593Smuzhiyun * of the hierarchy under it.
2368*4882a593Smuzhiyun */
2369*4882a593Smuzhiyun static void drain_all_stock(struct mem_cgroup *root_memcg)
2370*4882a593Smuzhiyun {
2371*4882a593Smuzhiyun int cpu, curcpu;
2372*4882a593Smuzhiyun
2373*4882a593Smuzhiyun /* If someone's already draining, avoid running more workers. */
2374*4882a593Smuzhiyun if (!mutex_trylock(&percpu_charge_mutex))
2375*4882a593Smuzhiyun return;
2376*4882a593Smuzhiyun /*
2377*4882a593Smuzhiyun * Notify other cpus that system-wide "drain" is running
2378*4882a593Smuzhiyun * We do not care about races with the cpu hotplug because cpu down
2379*4882a593Smuzhiyun * as well as workers from this path always operate on the local
2380*4882a593Smuzhiyun * per-cpu data. CPU up doesn't touch memcg_stock at all.
2381*4882a593Smuzhiyun */
2382*4882a593Smuzhiyun curcpu = get_cpu();
2383*4882a593Smuzhiyun for_each_online_cpu(cpu) {
2384*4882a593Smuzhiyun struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2385*4882a593Smuzhiyun struct mem_cgroup *memcg;
2386*4882a593Smuzhiyun bool flush = false;
2387*4882a593Smuzhiyun
2388*4882a593Smuzhiyun rcu_read_lock();
2389*4882a593Smuzhiyun memcg = stock->cached;
2390*4882a593Smuzhiyun if (memcg && stock->nr_pages &&
2391*4882a593Smuzhiyun mem_cgroup_is_descendant(memcg, root_memcg))
2392*4882a593Smuzhiyun flush = true;
2393*4882a593Smuzhiyun if (obj_stock_flush_required(stock, root_memcg))
2394*4882a593Smuzhiyun flush = true;
2395*4882a593Smuzhiyun rcu_read_unlock();
2396*4882a593Smuzhiyun
2397*4882a593Smuzhiyun if (flush &&
2398*4882a593Smuzhiyun !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2399*4882a593Smuzhiyun if (cpu == curcpu)
2400*4882a593Smuzhiyun drain_local_stock(&stock->work);
2401*4882a593Smuzhiyun else
2402*4882a593Smuzhiyun schedule_work_on(cpu, &stock->work);
2403*4882a593Smuzhiyun }
2404*4882a593Smuzhiyun }
2405*4882a593Smuzhiyun put_cpu();
2406*4882a593Smuzhiyun mutex_unlock(&percpu_charge_mutex);
2407*4882a593Smuzhiyun }
2408*4882a593Smuzhiyun
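/*
 * CPU hotplug (dead) callback: return the dead CPU's charge stock to the
 * page counters and fold its per-cpu vmstats, lruvec stats and vmevents
 * into the atomic totals of each memcg and its ancestors.
 */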
2409*4882a593Smuzhiyun static int memcg_hotplug_cpu_dead(unsigned int cpu)
2410*4882a593Smuzhiyun {
2411*4882a593Smuzhiyun struct memcg_stock_pcp *stock;
2412*4882a593Smuzhiyun struct mem_cgroup *memcg, *mi;
2413*4882a593Smuzhiyun
2414*4882a593Smuzhiyun stock = &per_cpu(memcg_stock, cpu);
2415*4882a593Smuzhiyun drain_stock(stock);
2416*4882a593Smuzhiyun
2417*4882a593Smuzhiyun for_each_mem_cgroup(memcg) {
2418*4882a593Smuzhiyun int i;
2419*4882a593Smuzhiyun
2420*4882a593Smuzhiyun for (i = 0; i < MEMCG_NR_STAT; i++) {
2421*4882a593Smuzhiyun int nid;
2422*4882a593Smuzhiyun long x;
2423*4882a593Smuzhiyun
2424*4882a593Smuzhiyun x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2425*4882a593Smuzhiyun if (x)
2426*4882a593Smuzhiyun for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2427*4882a593Smuzhiyun atomic_long_add(x, &memcg->vmstats[i]);
2428*4882a593Smuzhiyun
2429*4882a593Smuzhiyun if (i >= NR_VM_NODE_STAT_ITEMS)
2430*4882a593Smuzhiyun continue;
2431*4882a593Smuzhiyun
2432*4882a593Smuzhiyun for_each_node(nid) {
2433*4882a593Smuzhiyun struct mem_cgroup_per_node *pn;
2434*4882a593Smuzhiyun
2435*4882a593Smuzhiyun pn = mem_cgroup_nodeinfo(memcg, nid);
2436*4882a593Smuzhiyun x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2437*4882a593Smuzhiyun if (x)
2438*4882a593Smuzhiyun do {
2439*4882a593Smuzhiyun atomic_long_add(x, &pn->lruvec_stat[i]);
2440*4882a593Smuzhiyun } while ((pn = parent_nodeinfo(pn, nid)));
2441*4882a593Smuzhiyun }
2442*4882a593Smuzhiyun }
2443*4882a593Smuzhiyun
2444*4882a593Smuzhiyun for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2445*4882a593Smuzhiyun long x;
2446*4882a593Smuzhiyun
2447*4882a593Smuzhiyun x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2448*4882a593Smuzhiyun if (x)
2449*4882a593Smuzhiyun for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2450*4882a593Smuzhiyun atomic_long_add(x, &memcg->vmevents[i]);
2451*4882a593Smuzhiyun }
2452*4882a593Smuzhiyun }
2453*4882a593Smuzhiyun
2454*4882a593Smuzhiyun return 0;
2455*4882a593Smuzhiyun }
2456*4882a593Smuzhiyun
2457*4882a593Smuzhiyun static unsigned long reclaim_high(struct mem_cgroup *memcg,
2458*4882a593Smuzhiyun unsigned int nr_pages,
2459*4882a593Smuzhiyun gfp_t gfp_mask)
2460*4882a593Smuzhiyun {
2461*4882a593Smuzhiyun unsigned long nr_reclaimed = 0;
2462*4882a593Smuzhiyun
2463*4882a593Smuzhiyun do {
2464*4882a593Smuzhiyun unsigned long pflags;
2465*4882a593Smuzhiyun
2466*4882a593Smuzhiyun if (page_counter_read(&memcg->memory) <=
2467*4882a593Smuzhiyun READ_ONCE(memcg->memory.high))
2468*4882a593Smuzhiyun continue;
2469*4882a593Smuzhiyun
2470*4882a593Smuzhiyun memcg_memory_event(memcg, MEMCG_HIGH);
2471*4882a593Smuzhiyun
2472*4882a593Smuzhiyun psi_memstall_enter(&pflags);
2473*4882a593Smuzhiyun nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2474*4882a593Smuzhiyun gfp_mask, true);
2475*4882a593Smuzhiyun psi_memstall_leave(&pflags);
2476*4882a593Smuzhiyun } while ((memcg = parent_mem_cgroup(memcg)) &&
2477*4882a593Smuzhiyun !mem_cgroup_is_root(memcg));
2478*4882a593Smuzhiyun
2479*4882a593Smuzhiyun return nr_reclaimed;
2480*4882a593Smuzhiyun }
2481*4882a593Smuzhiyun
2482*4882a593Smuzhiyun static void high_work_func(struct work_struct *work)
2483*4882a593Smuzhiyun {
2484*4882a593Smuzhiyun struct mem_cgroup *memcg;
2485*4882a593Smuzhiyun
2486*4882a593Smuzhiyun memcg = container_of(work, struct mem_cgroup, high_work);
2487*4882a593Smuzhiyun reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2488*4882a593Smuzhiyun }
2489*4882a593Smuzhiyun
2490*4882a593Smuzhiyun /*
2491*4882a593Smuzhiyun * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2492*4882a593Smuzhiyun * enough to still cause a significant slowdown in most cases, while still
2493*4882a593Smuzhiyun * allowing diagnostics and tracing to proceed without becoming stuck.
2494*4882a593Smuzhiyun */
2495*4882a593Smuzhiyun #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2496*4882a593Smuzhiyun
2497*4882a593Smuzhiyun /*
2498*4882a593Smuzhiyun * When calculating the delay, we use these on either side of the exponentiation to
2499*4882a593Smuzhiyun * maintain precision and scale to a reasonable number of jiffies (see the table
2500*4882a593Smuzhiyun * below).
2501*4882a593Smuzhiyun *
2502*4882a593Smuzhiyun * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2503*4882a593Smuzhiyun * overage ratio to a delay.
2504*4882a593Smuzhiyun * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2505*4882a593Smuzhiyun * proposed penalty in order to reduce to a reasonable number of jiffies, and
2506*4882a593Smuzhiyun * to produce a reasonable delay curve.
2507*4882a593Smuzhiyun *
2508*4882a593Smuzhiyun * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2509*4882a593Smuzhiyun * reasonable delay curve compared to precision-adjusted overage, not
2510*4882a593Smuzhiyun * penalising heavily at first, but still making sure that growth beyond the
2511*4882a593Smuzhiyun * limit penalises misbehaving cgroups by slowing them down exponentially. For
2512*4882a593Smuzhiyun * example, with a high of 100 megabytes:
2513*4882a593Smuzhiyun *
2514*4882a593Smuzhiyun * +-------+------------------------+
2515*4882a593Smuzhiyun * | usage | time to allocate in ms |
2516*4882a593Smuzhiyun * +-------+------------------------+
2517*4882a593Smuzhiyun * | 100M | 0 |
2518*4882a593Smuzhiyun * | 101M | 6 |
2519*4882a593Smuzhiyun * | 102M | 25 |
2520*4882a593Smuzhiyun * | 103M | 57 |
2521*4882a593Smuzhiyun * | 104M | 102 |
2522*4882a593Smuzhiyun * | 105M | 159 |
2523*4882a593Smuzhiyun * | 106M | 230 |
2524*4882a593Smuzhiyun * | 107M | 313 |
2525*4882a593Smuzhiyun * | 108M | 409 |
2526*4882a593Smuzhiyun * | 109M | 518 |
2527*4882a593Smuzhiyun * | 110M | 639 |
2528*4882a593Smuzhiyun * | 111M | 774 |
2529*4882a593Smuzhiyun * | 112M | 921 |
2530*4882a593Smuzhiyun * | 113M | 1081 |
2531*4882a593Smuzhiyun * | 114M | 1254 |
2532*4882a593Smuzhiyun * | 115M | 1439 |
2533*4882a593Smuzhiyun * | 116M | 1638 |
2534*4882a593Smuzhiyun * | 117M | 1849 |
2535*4882a593Smuzhiyun * | 118M | 2000 |
2536*4882a593Smuzhiyun * | 119M | 2000 |
2537*4882a593Smuzhiyun * | 120M | 2000 |
2538*4882a593Smuzhiyun * +-------+------------------------+
2539*4882a593Smuzhiyun */
2540*4882a593Smuzhiyun #define MEMCG_DELAY_PRECISION_SHIFT 20
2541*4882a593Smuzhiyun #define MEMCG_DELAY_SCALING_SHIFT 14
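/*
 * Worked example (assuming HZ == 1000 and a nominal MEMCG_CHARGE_BATCH-sized
 * charge) for usage = 104M against high = 100M:
 *
 *   overage         = ((104M - 100M) << MEMCG_DELAY_PRECISION_SHIFT) / 100M
 *                   ~= 41943
 *   penalty_jiffies = overage * overage * HZ
 *                     >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT)
 *                   ~= 102 jiffies ~= 102 ms
 *
 * which matches the 104M row in the table above.
 */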
2542*4882a593Smuzhiyun
2543*4882a593Smuzhiyun static u64 calculate_overage(unsigned long usage, unsigned long high)
2544*4882a593Smuzhiyun {
2545*4882a593Smuzhiyun u64 overage;
2546*4882a593Smuzhiyun
2547*4882a593Smuzhiyun if (usage <= high)
2548*4882a593Smuzhiyun return 0;
2549*4882a593Smuzhiyun
2550*4882a593Smuzhiyun /*
2551*4882a593Smuzhiyun * Prevent division by 0 in overage calculation by acting as if
2552*4882a593Smuzhiyun * it was a threshold of 1 page
2553*4882a593Smuzhiyun */
2554*4882a593Smuzhiyun high = max(high, 1UL);
2555*4882a593Smuzhiyun
2556*4882a593Smuzhiyun overage = usage - high;
2557*4882a593Smuzhiyun overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2558*4882a593Smuzhiyun return div64_u64(overage, high);
2559*4882a593Smuzhiyun }
2560*4882a593Smuzhiyun
2561*4882a593Smuzhiyun static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2562*4882a593Smuzhiyun {
2563*4882a593Smuzhiyun u64 overage, max_overage = 0;
2564*4882a593Smuzhiyun
2565*4882a593Smuzhiyun do {
2566*4882a593Smuzhiyun overage = calculate_overage(page_counter_read(&memcg->memory),
2567*4882a593Smuzhiyun READ_ONCE(memcg->memory.high));
2568*4882a593Smuzhiyun max_overage = max(overage, max_overage);
2569*4882a593Smuzhiyun } while ((memcg = parent_mem_cgroup(memcg)) &&
2570*4882a593Smuzhiyun !mem_cgroup_is_root(memcg));
2571*4882a593Smuzhiyun
2572*4882a593Smuzhiyun return max_overage;
2573*4882a593Smuzhiyun }
2574*4882a593Smuzhiyun
2575*4882a593Smuzhiyun static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2576*4882a593Smuzhiyun {
2577*4882a593Smuzhiyun u64 overage, max_overage = 0;
2578*4882a593Smuzhiyun
2579*4882a593Smuzhiyun do {
2580*4882a593Smuzhiyun overage = calculate_overage(page_counter_read(&memcg->swap),
2581*4882a593Smuzhiyun READ_ONCE(memcg->swap.high));
2582*4882a593Smuzhiyun if (overage)
2583*4882a593Smuzhiyun memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2584*4882a593Smuzhiyun max_overage = max(overage, max_overage);
2585*4882a593Smuzhiyun } while ((memcg = parent_mem_cgroup(memcg)) &&
2586*4882a593Smuzhiyun !mem_cgroup_is_root(memcg));
2587*4882a593Smuzhiyun
2588*4882a593Smuzhiyun return max_overage;
2589*4882a593Smuzhiyun }
2590*4882a593Smuzhiyun
2591*4882a593Smuzhiyun /*
2592*4882a593Smuzhiyun * Get the number of jiffies that we should penalise a mischievous cgroup which
2593*4882a593Smuzhiyun * is exceeding its memory.high by checking both it and its ancestors.
2594*4882a593Smuzhiyun */
2595*4882a593Smuzhiyun static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2596*4882a593Smuzhiyun unsigned int nr_pages,
2597*4882a593Smuzhiyun u64 max_overage)
2598*4882a593Smuzhiyun {
2599*4882a593Smuzhiyun unsigned long penalty_jiffies;
2600*4882a593Smuzhiyun
2601*4882a593Smuzhiyun if (!max_overage)
2602*4882a593Smuzhiyun return 0;
2603*4882a593Smuzhiyun
2604*4882a593Smuzhiyun /*
2605*4882a593Smuzhiyun * We use overage compared to memory.high to calculate the number of
2606*4882a593Smuzhiyun * jiffies to sleep (penalty_jiffies). Ideally this value should be
2607*4882a593Smuzhiyun * fairly lenient on small overages, and increasingly harsh when the
2608*4882a593Smuzhiyun * memcg in question makes it clear that it has no intention of stopping
2609*4882a593Smuzhiyun * its crazy behaviour, so we exponentially increase the delay based on
2610*4882a593Smuzhiyun * overage amount.
2611*4882a593Smuzhiyun */
2612*4882a593Smuzhiyun penalty_jiffies = max_overage * max_overage * HZ;
2613*4882a593Smuzhiyun penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2614*4882a593Smuzhiyun penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2615*4882a593Smuzhiyun
2616*4882a593Smuzhiyun /*
2617*4882a593Smuzhiyun * Factor in the task's own contribution to the overage, such that four
2618*4882a593Smuzhiyun * N-sized allocations are throttled approximately the same as one
2619*4882a593Smuzhiyun * 4N-sized allocation.
2620*4882a593Smuzhiyun *
2621*4882a593Smuzhiyun * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2622*4882a593Smuzhiyun * larger the current charge batch is than that.
2623*4882a593Smuzhiyun */
2624*4882a593Smuzhiyun return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2625*4882a593Smuzhiyun }
2626*4882a593Smuzhiyun
2627*4882a593Smuzhiyun /*
2628*4882a593Smuzhiyun * Scheduled by try_charge() to be executed from the userland return path
2629*4882a593Smuzhiyun * and reclaims memory over the high limit.
2630*4882a593Smuzhiyun */
2631*4882a593Smuzhiyun void mem_cgroup_handle_over_high(void)
2632*4882a593Smuzhiyun {
2633*4882a593Smuzhiyun unsigned long penalty_jiffies;
2634*4882a593Smuzhiyun unsigned long pflags;
2635*4882a593Smuzhiyun unsigned long nr_reclaimed;
2636*4882a593Smuzhiyun unsigned int nr_pages = current->memcg_nr_pages_over_high;
2637*4882a593Smuzhiyun int nr_retries = MAX_RECLAIM_RETRIES;
2638*4882a593Smuzhiyun struct mem_cgroup *memcg;
2639*4882a593Smuzhiyun bool in_retry = false;
2640*4882a593Smuzhiyun
2641*4882a593Smuzhiyun if (likely(!nr_pages))
2642*4882a593Smuzhiyun return;
2643*4882a593Smuzhiyun
2644*4882a593Smuzhiyun memcg = get_mem_cgroup_from_mm(current->mm);
2645*4882a593Smuzhiyun current->memcg_nr_pages_over_high = 0;
2646*4882a593Smuzhiyun
2647*4882a593Smuzhiyun retry_reclaim:
2648*4882a593Smuzhiyun /*
2649*4882a593Smuzhiyun * The allocating task should reclaim at least the batch size, but for
2650*4882a593Smuzhiyun * subsequent retries we only want to do what's necessary to prevent oom
2651*4882a593Smuzhiyun * or breaching resource isolation.
2652*4882a593Smuzhiyun *
2653*4882a593Smuzhiyun * This is distinct from memory.max or page allocator behaviour because
2654*4882a593Smuzhiyun * memory.high is currently batched, whereas memory.max and the page
2655*4882a593Smuzhiyun * allocator run every time an allocation is made.
2656*4882a593Smuzhiyun */
2657*4882a593Smuzhiyun nr_reclaimed = reclaim_high(memcg,
2658*4882a593Smuzhiyun in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2659*4882a593Smuzhiyun GFP_KERNEL);
2660*4882a593Smuzhiyun
2661*4882a593Smuzhiyun /*
2662*4882a593Smuzhiyun * memory.high is breached and reclaim is unable to keep up. Throttle
2663*4882a593Smuzhiyun * allocators proactively to slow down excessive growth.
2664*4882a593Smuzhiyun */
2665*4882a593Smuzhiyun penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2666*4882a593Smuzhiyun mem_find_max_overage(memcg));
2667*4882a593Smuzhiyun
2668*4882a593Smuzhiyun penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2669*4882a593Smuzhiyun swap_find_max_overage(memcg));
2670*4882a593Smuzhiyun
2671*4882a593Smuzhiyun /*
2672*4882a593Smuzhiyun * Clamp the max delay per usermode return so as to still keep the
2673*4882a593Smuzhiyun * application moving forwards and also permit diagnostics, albeit
2674*4882a593Smuzhiyun * extremely slowly.
2675*4882a593Smuzhiyun */
2676*4882a593Smuzhiyun penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2677*4882a593Smuzhiyun
2678*4882a593Smuzhiyun /*
2679*4882a593Smuzhiyun * Don't sleep if the amount of jiffies this memcg owes us is so low
2680*4882a593Smuzhiyun * that it's not even worth doing, in an attempt to be nice to those who
2681*4882a593Smuzhiyun * go only a small amount over their memory.high value and maybe haven't
2682*4882a593Smuzhiyun * been aggressively reclaimed enough yet.
2683*4882a593Smuzhiyun */
2684*4882a593Smuzhiyun if (penalty_jiffies <= HZ / 100)
2685*4882a593Smuzhiyun goto out;
2686*4882a593Smuzhiyun
2687*4882a593Smuzhiyun /*
2688*4882a593Smuzhiyun * If reclaim is making forward progress but we're still over
2689*4882a593Smuzhiyun * memory.high, we want to encourage that rather than doing allocator
2690*4882a593Smuzhiyun * throttling.
2691*4882a593Smuzhiyun */
2692*4882a593Smuzhiyun if (nr_reclaimed || nr_retries--) {
2693*4882a593Smuzhiyun in_retry = true;
2694*4882a593Smuzhiyun goto retry_reclaim;
2695*4882a593Smuzhiyun }
2696*4882a593Smuzhiyun
2697*4882a593Smuzhiyun /*
2698*4882a593Smuzhiyun * If we exit early, we're guaranteed to die (since
2699*4882a593Smuzhiyun * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2700*4882a593Smuzhiyun * need to account for any ill-begotten jiffies to pay them off later.
2701*4882a593Smuzhiyun */
2702*4882a593Smuzhiyun psi_memstall_enter(&pflags);
2703*4882a593Smuzhiyun schedule_timeout_killable(penalty_jiffies);
2704*4882a593Smuzhiyun psi_memstall_leave(&pflags);
2705*4882a593Smuzhiyun
2706*4882a593Smuzhiyun out:
2707*4882a593Smuzhiyun css_put(&memcg->css);
2708*4882a593Smuzhiyun }
2709*4882a593Smuzhiyun
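/*
 * Charge @nr_pages to @memcg: consume the per-CPU stock when possible,
 * otherwise charge the page counters in MEMCG_CHARGE_BATCH batches.
 * On failure, reclaim from the offending hierarchy, drain the per-CPU
 * stocks, and finally fall back to the memcg OOM killer. __GFP_ATOMIC,
 * PF_MEMALLOC and __GFP_NOFAIL requests are allowed to overrun the limit.
 */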
2710*4882a593Smuzhiyun static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2711*4882a593Smuzhiyun unsigned int nr_pages)
2712*4882a593Smuzhiyun {
2713*4882a593Smuzhiyun unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2714*4882a593Smuzhiyun int nr_retries = MAX_RECLAIM_RETRIES;
2715*4882a593Smuzhiyun struct mem_cgroup *mem_over_limit;
2716*4882a593Smuzhiyun struct page_counter *counter;
2717*4882a593Smuzhiyun enum oom_status oom_status;
2718*4882a593Smuzhiyun unsigned long nr_reclaimed;
2719*4882a593Smuzhiyun bool passed_oom = false;
2720*4882a593Smuzhiyun bool may_swap = true;
2721*4882a593Smuzhiyun bool drained = false;
2722*4882a593Smuzhiyun unsigned long pflags;
2723*4882a593Smuzhiyun
2724*4882a593Smuzhiyun if (mem_cgroup_is_root(memcg))
2725*4882a593Smuzhiyun return 0;
2726*4882a593Smuzhiyun retry:
2727*4882a593Smuzhiyun if (consume_stock(memcg, nr_pages))
2728*4882a593Smuzhiyun return 0;
2729*4882a593Smuzhiyun
2730*4882a593Smuzhiyun if (!do_memsw_account() ||
2731*4882a593Smuzhiyun page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2732*4882a593Smuzhiyun if (page_counter_try_charge(&memcg->memory, batch, &counter))
2733*4882a593Smuzhiyun goto done_restock;
2734*4882a593Smuzhiyun if (do_memsw_account())
2735*4882a593Smuzhiyun page_counter_uncharge(&memcg->memsw, batch);
2736*4882a593Smuzhiyun mem_over_limit = mem_cgroup_from_counter(counter, memory);
2737*4882a593Smuzhiyun } else {
2738*4882a593Smuzhiyun mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2739*4882a593Smuzhiyun may_swap = false;
2740*4882a593Smuzhiyun }
2741*4882a593Smuzhiyun
2742*4882a593Smuzhiyun if (batch > nr_pages) {
2743*4882a593Smuzhiyun batch = nr_pages;
2744*4882a593Smuzhiyun goto retry;
2745*4882a593Smuzhiyun }
2746*4882a593Smuzhiyun
2747*4882a593Smuzhiyun /*
2748*4882a593Smuzhiyun * Memcg doesn't have a dedicated reserve for atomic
2749*4882a593Smuzhiyun * allocations. But like the global atomic pool, we need to
2750*4882a593Smuzhiyun * put the burden of reclaim on regular allocation requests
2751*4882a593Smuzhiyun * and let these go through as privileged allocations.
2752*4882a593Smuzhiyun */
2753*4882a593Smuzhiyun if (gfp_mask & __GFP_ATOMIC)
2754*4882a593Smuzhiyun goto force;
2755*4882a593Smuzhiyun
2756*4882a593Smuzhiyun /*
2757*4882a593Smuzhiyun * Prevent unbounded recursion when reclaim operations need to
2758*4882a593Smuzhiyun * allocate memory. This might exceed the limits temporarily,
2759*4882a593Smuzhiyun * but we prefer facilitating memory reclaim and getting back
2760*4882a593Smuzhiyun * under the limit over triggering OOM kills in these cases.
2761*4882a593Smuzhiyun */
2762*4882a593Smuzhiyun if (unlikely(current->flags & PF_MEMALLOC))
2763*4882a593Smuzhiyun goto force;
2764*4882a593Smuzhiyun
2765*4882a593Smuzhiyun if (unlikely(task_in_memcg_oom(current)))
2766*4882a593Smuzhiyun goto nomem;
2767*4882a593Smuzhiyun
2768*4882a593Smuzhiyun if (!gfpflags_allow_blocking(gfp_mask))
2769*4882a593Smuzhiyun goto nomem;
2770*4882a593Smuzhiyun
2771*4882a593Smuzhiyun memcg_memory_event(mem_over_limit, MEMCG_MAX);
2772*4882a593Smuzhiyun
2773*4882a593Smuzhiyun psi_memstall_enter(&pflags);
2774*4882a593Smuzhiyun nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2775*4882a593Smuzhiyun gfp_mask, may_swap);
2776*4882a593Smuzhiyun psi_memstall_leave(&pflags);
2777*4882a593Smuzhiyun
2778*4882a593Smuzhiyun if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2779*4882a593Smuzhiyun goto retry;
2780*4882a593Smuzhiyun
2781*4882a593Smuzhiyun if (!drained) {
2782*4882a593Smuzhiyun drain_all_stock(mem_over_limit);
2783*4882a593Smuzhiyun drained = true;
2784*4882a593Smuzhiyun goto retry;
2785*4882a593Smuzhiyun }
2786*4882a593Smuzhiyun
2787*4882a593Smuzhiyun if (gfp_mask & __GFP_NORETRY)
2788*4882a593Smuzhiyun goto nomem;
2789*4882a593Smuzhiyun /*
2790*4882a593Smuzhiyun * Even though the limit is exceeded at this point, reclaim
2791*4882a593Smuzhiyun * may have been able to free some pages. Retry the charge
2792*4882a593Smuzhiyun * before killing the task.
2793*4882a593Smuzhiyun *
2794*4882a593Smuzhiyun * Only for regular pages, though: huge pages are rather
2795*4882a593Smuzhiyun * unlikely to succeed so close to the limit, and we fall back
2796*4882a593Smuzhiyun * to regular pages anyway in case of failure.
2797*4882a593Smuzhiyun */
2798*4882a593Smuzhiyun if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2799*4882a593Smuzhiyun goto retry;
2800*4882a593Smuzhiyun /*
2801*4882a593Smuzhiyun * At task move, charge accounts can be doubly counted. So, it's
2802*4882a593Smuzhiyun * better to wait until the end of task_move if something is going on.
2803*4882a593Smuzhiyun */
2804*4882a593Smuzhiyun if (mem_cgroup_wait_acct_move(mem_over_limit))
2805*4882a593Smuzhiyun goto retry;
2806*4882a593Smuzhiyun
2807*4882a593Smuzhiyun if (nr_retries--)
2808*4882a593Smuzhiyun goto retry;
2809*4882a593Smuzhiyun
2810*4882a593Smuzhiyun if (gfp_mask & __GFP_RETRY_MAYFAIL)
2811*4882a593Smuzhiyun goto nomem;
2812*4882a593Smuzhiyun
2813*4882a593Smuzhiyun if (gfp_mask & __GFP_NOFAIL)
2814*4882a593Smuzhiyun goto force;
2815*4882a593Smuzhiyun
2816*4882a593Smuzhiyun /* Avoid endless loop for tasks bypassed by the oom killer */
2817*4882a593Smuzhiyun if (passed_oom && task_is_dying())
2818*4882a593Smuzhiyun goto nomem;
2819*4882a593Smuzhiyun
2820*4882a593Smuzhiyun /*
2821*4882a593Smuzhiyun * keep retrying as long as the memcg oom killer is able to make
2822*4882a593Smuzhiyun * a forward progress or bypass the charge if the oom killer
2823*4882a593Smuzhiyun * couldn't make any progress.
2824*4882a593Smuzhiyun */
2825*4882a593Smuzhiyun oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2826*4882a593Smuzhiyun get_order(nr_pages * PAGE_SIZE));
2827*4882a593Smuzhiyun if (oom_status == OOM_SUCCESS) {
2828*4882a593Smuzhiyun passed_oom = true;
2829*4882a593Smuzhiyun nr_retries = MAX_RECLAIM_RETRIES;
2830*4882a593Smuzhiyun goto retry;
2831*4882a593Smuzhiyun }
2832*4882a593Smuzhiyun nomem:
2833*4882a593Smuzhiyun if (!(gfp_mask & __GFP_NOFAIL))
2834*4882a593Smuzhiyun return -ENOMEM;
2835*4882a593Smuzhiyun force:
2836*4882a593Smuzhiyun /*
2837*4882a593Smuzhiyun * The allocation either can't fail or will lead to more memory
2838*4882a593Smuzhiyun * being freed very soon. Allow memory usage to go over the limit
2839*4882a593Smuzhiyun * temporarily by force charging it.
2840*4882a593Smuzhiyun */
2841*4882a593Smuzhiyun page_counter_charge(&memcg->memory, nr_pages);
2842*4882a593Smuzhiyun if (do_memsw_account())
2843*4882a593Smuzhiyun page_counter_charge(&memcg->memsw, nr_pages);
2844*4882a593Smuzhiyun
2845*4882a593Smuzhiyun return 0;
2846*4882a593Smuzhiyun
2847*4882a593Smuzhiyun done_restock:
2848*4882a593Smuzhiyun if (batch > nr_pages)
2849*4882a593Smuzhiyun refill_stock(memcg, batch - nr_pages);
2850*4882a593Smuzhiyun
2851*4882a593Smuzhiyun /*
2852*4882a593Smuzhiyun * If the hierarchy is above the normal consumption range, schedule
2853*4882a593Smuzhiyun * reclaim on returning to userland. We can perform reclaim here
2854*4882a593Smuzhiyun * if __GFP_RECLAIM but let's always punt for simplicity and so that
2855*4882a593Smuzhiyun * GFP_KERNEL can consistently be used during reclaim. @memcg is
2856*4882a593Smuzhiyun * not recorded as it most likely matches current's and won't
2857*4882a593Smuzhiyun * change in the meantime. As high limit is checked again before
2858*4882a593Smuzhiyun * reclaim, the cost of mismatch is negligible.
2859*4882a593Smuzhiyun */
2860*4882a593Smuzhiyun do {
2861*4882a593Smuzhiyun bool mem_high, swap_high;
2862*4882a593Smuzhiyun
2863*4882a593Smuzhiyun mem_high = page_counter_read(&memcg->memory) >
2864*4882a593Smuzhiyun READ_ONCE(memcg->memory.high);
2865*4882a593Smuzhiyun swap_high = page_counter_read(&memcg->swap) >
2866*4882a593Smuzhiyun READ_ONCE(memcg->swap.high);
2867*4882a593Smuzhiyun
2868*4882a593Smuzhiyun /* Don't bother a random interrupted task */
2869*4882a593Smuzhiyun if (in_interrupt()) {
2870*4882a593Smuzhiyun if (mem_high) {
2871*4882a593Smuzhiyun schedule_work(&memcg->high_work);
2872*4882a593Smuzhiyun break;
2873*4882a593Smuzhiyun }
2874*4882a593Smuzhiyun continue;
2875*4882a593Smuzhiyun }
2876*4882a593Smuzhiyun
2877*4882a593Smuzhiyun if (mem_high || swap_high) {
2878*4882a593Smuzhiyun /*
2879*4882a593Smuzhiyun * The allocating tasks in this cgroup will need to do
2880*4882a593Smuzhiyun * reclaim or be throttled to prevent further growth
2881*4882a593Smuzhiyun * of the memory or swap footprints.
2882*4882a593Smuzhiyun *
2883*4882a593Smuzhiyun * Target some best-effort fairness between the tasks,
2884*4882a593Smuzhiyun * and distribute reclaim work and delay penalties
2885*4882a593Smuzhiyun * based on how much each task is actually allocating.
2886*4882a593Smuzhiyun */
2887*4882a593Smuzhiyun current->memcg_nr_pages_over_high += batch;
2888*4882a593Smuzhiyun set_notify_resume(current);
2889*4882a593Smuzhiyun break;
2890*4882a593Smuzhiyun }
2891*4882a593Smuzhiyun } while ((memcg = parent_mem_cgroup(memcg)));
2892*4882a593Smuzhiyun
2893*4882a593Smuzhiyun return 0;
2894*4882a593Smuzhiyun }
2895*4882a593Smuzhiyun
2896*4882a593Smuzhiyun #if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
2897*4882a593Smuzhiyun static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2898*4882a593Smuzhiyun {
2899*4882a593Smuzhiyun if (mem_cgroup_is_root(memcg))
2900*4882a593Smuzhiyun return;
2901*4882a593Smuzhiyun
2902*4882a593Smuzhiyun page_counter_uncharge(&memcg->memory, nr_pages);
2903*4882a593Smuzhiyun if (do_memsw_account())
2904*4882a593Smuzhiyun page_counter_uncharge(&memcg->memsw, nr_pages);
2905*4882a593Smuzhiyun }
2906*4882a593Smuzhiyun #endif
2907*4882a593Smuzhiyun
2908*4882a593Smuzhiyun static void commit_charge(struct page *page, struct mem_cgroup *memcg)
2909*4882a593Smuzhiyun {
2910*4882a593Smuzhiyun VM_BUG_ON_PAGE(page->mem_cgroup, page);
2911*4882a593Smuzhiyun /*
2912*4882a593Smuzhiyun * Any of the following ensures page->mem_cgroup stability:
2913*4882a593Smuzhiyun *
2914*4882a593Smuzhiyun * - the page lock
2915*4882a593Smuzhiyun * - LRU isolation
2916*4882a593Smuzhiyun * - lock_page_memcg()
2917*4882a593Smuzhiyun * - exclusive reference
2918*4882a593Smuzhiyun */
2919*4882a593Smuzhiyun page->mem_cgroup = memcg;
2920*4882a593Smuzhiyun }
2921*4882a593Smuzhiyun
2922*4882a593Smuzhiyun #ifdef CONFIG_MEMCG_KMEM
2923*4882a593Smuzhiyun /*
2924*4882a593Smuzhiyun * The allocated objcg pointers array is not accounted directly.
2925*4882a593Smuzhiyun * Moreover, it should not come from DMA buffer and is not readily
2926*4882a593Smuzhiyun * reclaimable. So those GFP bits should be masked off.
2927*4882a593Smuzhiyun */
2928*4882a593Smuzhiyun #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2929*4882a593Smuzhiyun
2930*4882a593Smuzhiyun int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2931*4882a593Smuzhiyun gfp_t gfp)
2932*4882a593Smuzhiyun {
2933*4882a593Smuzhiyun unsigned int objects = objs_per_slab_page(s, page);
2934*4882a593Smuzhiyun void *vec;
2935*4882a593Smuzhiyun
2936*4882a593Smuzhiyun gfp &= ~OBJCGS_CLEAR_MASK;
2937*4882a593Smuzhiyun vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2938*4882a593Smuzhiyun page_to_nid(page));
2939*4882a593Smuzhiyun if (!vec)
2940*4882a593Smuzhiyun return -ENOMEM;
2941*4882a593Smuzhiyun
2942*4882a593Smuzhiyun if (cmpxchg(&page->obj_cgroups, NULL,
2943*4882a593Smuzhiyun (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
2944*4882a593Smuzhiyun kfree(vec);
2945*4882a593Smuzhiyun else
2946*4882a593Smuzhiyun kmemleak_not_leak(vec);
2947*4882a593Smuzhiyun
2948*4882a593Smuzhiyun return 0;
2949*4882a593Smuzhiyun }
2950*4882a593Smuzhiyun
2951*4882a593Smuzhiyun /*
2952*4882a593Smuzhiyun * Returns a pointer to the memory cgroup to which the kernel object is charged.
2953*4882a593Smuzhiyun *
2954*4882a593Smuzhiyun * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2955*4882a593Smuzhiyun * cgroup_mutex, etc.
2956*4882a593Smuzhiyun */
2957*4882a593Smuzhiyun struct mem_cgroup *mem_cgroup_from_obj(void *p)
2958*4882a593Smuzhiyun {
2959*4882a593Smuzhiyun struct page *page;
2960*4882a593Smuzhiyun
2961*4882a593Smuzhiyun if (mem_cgroup_disabled())
2962*4882a593Smuzhiyun return NULL;
2963*4882a593Smuzhiyun
2964*4882a593Smuzhiyun page = virt_to_head_page(p);
2965*4882a593Smuzhiyun
2966*4882a593Smuzhiyun /*
2967*4882a593Smuzhiyun * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
2968*4882a593Smuzhiyun * or a pointer to obj_cgroup vector. In the latter case the lowest
2969*4882a593Smuzhiyun * bit of the pointer is set.
2970*4882a593Smuzhiyun * The page->mem_cgroup pointer can be asynchronously changed
2971*4882a593Smuzhiyun * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
2972*4882a593Smuzhiyun * from a valid memcg pointer to objcg vector or back.
2973*4882a593Smuzhiyun */
2974*4882a593Smuzhiyun if (!page->mem_cgroup)
2975*4882a593Smuzhiyun return NULL;
2976*4882a593Smuzhiyun
2977*4882a593Smuzhiyun /*
2978*4882a593Smuzhiyun * Slab objects are accounted individually, not per-page.
2979*4882a593Smuzhiyun * Memcg membership data for each individual object is saved in
2980*4882a593Smuzhiyun * the page->obj_cgroups.
2981*4882a593Smuzhiyun */
2982*4882a593Smuzhiyun if (page_has_obj_cgroups(page)) {
2983*4882a593Smuzhiyun struct obj_cgroup *objcg;
2984*4882a593Smuzhiyun unsigned int off;
2985*4882a593Smuzhiyun
2986*4882a593Smuzhiyun off = obj_to_index(page->slab_cache, page, p);
2987*4882a593Smuzhiyun objcg = page_obj_cgroups(page)[off];
2988*4882a593Smuzhiyun if (objcg)
2989*4882a593Smuzhiyun return obj_cgroup_memcg(objcg);
2990*4882a593Smuzhiyun
2991*4882a593Smuzhiyun return NULL;
2992*4882a593Smuzhiyun }
2993*4882a593Smuzhiyun
2994*4882a593Smuzhiyun /* All other pages use page->mem_cgroup */
2995*4882a593Smuzhiyun return page->mem_cgroup;
2996*4882a593Smuzhiyun }
2997*4882a593Smuzhiyun
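/*
 * Return a reference to the obj_cgroup of the current context: the active
 * memcg if one is set, otherwise the current task's memcg, walking up the
 * hierarchy until a live objcg is found. Returns NULL in kmem-bypass
 * contexts.
 */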
2998*4882a593Smuzhiyun __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
2999*4882a593Smuzhiyun {
3000*4882a593Smuzhiyun struct obj_cgroup *objcg = NULL;
3001*4882a593Smuzhiyun struct mem_cgroup *memcg;
3002*4882a593Smuzhiyun
3003*4882a593Smuzhiyun if (memcg_kmem_bypass())
3004*4882a593Smuzhiyun return NULL;
3005*4882a593Smuzhiyun
3006*4882a593Smuzhiyun rcu_read_lock();
3007*4882a593Smuzhiyun if (unlikely(active_memcg()))
3008*4882a593Smuzhiyun memcg = active_memcg();
3009*4882a593Smuzhiyun else
3010*4882a593Smuzhiyun memcg = mem_cgroup_from_task(current);
3011*4882a593Smuzhiyun
3012*4882a593Smuzhiyun for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
3013*4882a593Smuzhiyun objcg = rcu_dereference(memcg->objcg);
3014*4882a593Smuzhiyun if (objcg && obj_cgroup_tryget(objcg))
3015*4882a593Smuzhiyun break;
3016*4882a593Smuzhiyun objcg = NULL;
3017*4882a593Smuzhiyun }
3018*4882a593Smuzhiyun rcu_read_unlock();
3019*4882a593Smuzhiyun
3020*4882a593Smuzhiyun return objcg;
3021*4882a593Smuzhiyun }
3022*4882a593Smuzhiyun
3023*4882a593Smuzhiyun static int memcg_alloc_cache_id(void)
3024*4882a593Smuzhiyun {
3025*4882a593Smuzhiyun int id, size;
3026*4882a593Smuzhiyun int err;
3027*4882a593Smuzhiyun
3028*4882a593Smuzhiyun id = ida_simple_get(&memcg_cache_ida,
3029*4882a593Smuzhiyun 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
3030*4882a593Smuzhiyun if (id < 0)
3031*4882a593Smuzhiyun return id;
3032*4882a593Smuzhiyun
3033*4882a593Smuzhiyun if (id < memcg_nr_cache_ids)
3034*4882a593Smuzhiyun return id;
3035*4882a593Smuzhiyun
3036*4882a593Smuzhiyun /*
3037*4882a593Smuzhiyun * There's no space for the new id in memcg_caches arrays,
3038*4882a593Smuzhiyun * so we have to grow them.
3039*4882a593Smuzhiyun */
3040*4882a593Smuzhiyun down_write(&memcg_cache_ids_sem);
3041*4882a593Smuzhiyun
3042*4882a593Smuzhiyun size = 2 * (id + 1);
3043*4882a593Smuzhiyun if (size < MEMCG_CACHES_MIN_SIZE)
3044*4882a593Smuzhiyun size = MEMCG_CACHES_MIN_SIZE;
3045*4882a593Smuzhiyun else if (size > MEMCG_CACHES_MAX_SIZE)
3046*4882a593Smuzhiyun size = MEMCG_CACHES_MAX_SIZE;
3047*4882a593Smuzhiyun
3048*4882a593Smuzhiyun err = memcg_update_all_list_lrus(size);
3049*4882a593Smuzhiyun if (!err)
3050*4882a593Smuzhiyun memcg_nr_cache_ids = size;
3051*4882a593Smuzhiyun
3052*4882a593Smuzhiyun up_write(&memcg_cache_ids_sem);
3053*4882a593Smuzhiyun
3054*4882a593Smuzhiyun if (err) {
3055*4882a593Smuzhiyun ida_simple_remove(&memcg_cache_ida, id);
3056*4882a593Smuzhiyun return err;
3057*4882a593Smuzhiyun }
3058*4882a593Smuzhiyun return id;
3059*4882a593Smuzhiyun }
3060*4882a593Smuzhiyun
3061*4882a593Smuzhiyun static void memcg_free_cache_id(int id)
3062*4882a593Smuzhiyun {
3063*4882a593Smuzhiyun ida_simple_remove(&memcg_cache_ida, id);
3064*4882a593Smuzhiyun }
3065*4882a593Smuzhiyun
3066*4882a593Smuzhiyun /**
3067*4882a593Smuzhiyun * __memcg_kmem_charge: charge a number of kernel pages to a memcg
3068*4882a593Smuzhiyun * @memcg: memory cgroup to charge
3069*4882a593Smuzhiyun * @gfp: reclaim mode
3070*4882a593Smuzhiyun * @nr_pages: number of pages to charge
3071*4882a593Smuzhiyun *
3072*4882a593Smuzhiyun * Returns 0 on success, an error code on failure.
3073*4882a593Smuzhiyun */
3074*4882a593Smuzhiyun int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
3075*4882a593Smuzhiyun unsigned int nr_pages)
3076*4882a593Smuzhiyun {
3077*4882a593Smuzhiyun struct page_counter *counter;
3078*4882a593Smuzhiyun int ret;
3079*4882a593Smuzhiyun
3080*4882a593Smuzhiyun ret = try_charge(memcg, gfp, nr_pages);
3081*4882a593Smuzhiyun if (ret)
3082*4882a593Smuzhiyun return ret;
3083*4882a593Smuzhiyun
3084*4882a593Smuzhiyun if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
3085*4882a593Smuzhiyun !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
3086*4882a593Smuzhiyun
3087*4882a593Smuzhiyun /*
3088*4882a593Smuzhiyun * Enforce __GFP_NOFAIL allocation because callers are not
3089*4882a593Smuzhiyun * prepared to see failures and likely do not have any failure
3090*4882a593Smuzhiyun * handling code.
3091*4882a593Smuzhiyun */
3092*4882a593Smuzhiyun if (gfp & __GFP_NOFAIL) {
3093*4882a593Smuzhiyun page_counter_charge(&memcg->kmem, nr_pages);
3094*4882a593Smuzhiyun return 0;
3095*4882a593Smuzhiyun }
3096*4882a593Smuzhiyun cancel_charge(memcg, nr_pages);
3097*4882a593Smuzhiyun return -ENOMEM;
3098*4882a593Smuzhiyun }
3099*4882a593Smuzhiyun return 0;
3100*4882a593Smuzhiyun }
3101*4882a593Smuzhiyun
3102*4882a593Smuzhiyun /**
3103*4882a593Smuzhiyun * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
3104*4882a593Smuzhiyun * @memcg: memcg to uncharge
3105*4882a593Smuzhiyun * @nr_pages: number of pages to uncharge
3106*4882a593Smuzhiyun */
3107*4882a593Smuzhiyun void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
3108*4882a593Smuzhiyun {
3109*4882a593Smuzhiyun if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3110*4882a593Smuzhiyun page_counter_uncharge(&memcg->kmem, nr_pages);
3111*4882a593Smuzhiyun
3112*4882a593Smuzhiyun refill_stock(memcg, nr_pages);
3113*4882a593Smuzhiyun }
3114*4882a593Smuzhiyun
3115*4882a593Smuzhiyun /**
3116*4882a593Smuzhiyun * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
3117*4882a593Smuzhiyun * @page: page to charge
3118*4882a593Smuzhiyun * @gfp: reclaim mode
3119*4882a593Smuzhiyun * @order: allocation order
3120*4882a593Smuzhiyun *
3121*4882a593Smuzhiyun * Returns 0 on success, an error code on failure.
3122*4882a593Smuzhiyun */
3123*4882a593Smuzhiyun int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
3124*4882a593Smuzhiyun {
3125*4882a593Smuzhiyun struct mem_cgroup *memcg;
3126*4882a593Smuzhiyun int ret = 0;
3127*4882a593Smuzhiyun
3128*4882a593Smuzhiyun memcg = get_mem_cgroup_from_current();
3129*4882a593Smuzhiyun if (memcg && !mem_cgroup_is_root(memcg)) {
3130*4882a593Smuzhiyun ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
3131*4882a593Smuzhiyun if (!ret) {
3132*4882a593Smuzhiyun page->mem_cgroup = memcg;
3133*4882a593Smuzhiyun __SetPageKmemcg(page);
3134*4882a593Smuzhiyun return 0;
3135*4882a593Smuzhiyun }
3136*4882a593Smuzhiyun css_put(&memcg->css);
3137*4882a593Smuzhiyun }
3138*4882a593Smuzhiyun return ret;
3139*4882a593Smuzhiyun }
3140*4882a593Smuzhiyun
3141*4882a593Smuzhiyun /**
3142*4882a593Smuzhiyun * __memcg_kmem_uncharge_page: uncharge a kmem page
3143*4882a593Smuzhiyun * @page: page to uncharge
3144*4882a593Smuzhiyun * @order: allocation order
3145*4882a593Smuzhiyun */
3146*4882a593Smuzhiyun void __memcg_kmem_uncharge_page(struct page *page, int order)
3147*4882a593Smuzhiyun {
3148*4882a593Smuzhiyun struct mem_cgroup *memcg = page->mem_cgroup;
3149*4882a593Smuzhiyun unsigned int nr_pages = 1 << order;
3150*4882a593Smuzhiyun
3151*4882a593Smuzhiyun if (!memcg)
3152*4882a593Smuzhiyun return;
3153*4882a593Smuzhiyun
3154*4882a593Smuzhiyun VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3155*4882a593Smuzhiyun __memcg_kmem_uncharge(memcg, nr_pages);
3156*4882a593Smuzhiyun page->mem_cgroup = NULL;
3157*4882a593Smuzhiyun css_put(&memcg->css);
3158*4882a593Smuzhiyun
3159*4882a593Smuzhiyun /* slab pages do not have PageKmemcg flag set */
3160*4882a593Smuzhiyun if (PageKmemcg(page))
3161*4882a593Smuzhiyun __ClearPageKmemcg(page);
3162*4882a593Smuzhiyun }
3163*4882a593Smuzhiyun
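/* Try to satisfy a sub-page (byte-sized) charge from the per-CPU object stock. */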
3164*4882a593Smuzhiyun static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3165*4882a593Smuzhiyun {
3166*4882a593Smuzhiyun struct memcg_stock_pcp *stock;
3167*4882a593Smuzhiyun unsigned long flags;
3168*4882a593Smuzhiyun bool ret = false;
3169*4882a593Smuzhiyun
3170*4882a593Smuzhiyun local_irq_save(flags);
3171*4882a593Smuzhiyun
3172*4882a593Smuzhiyun stock = this_cpu_ptr(&memcg_stock);
3173*4882a593Smuzhiyun if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3174*4882a593Smuzhiyun stock->nr_bytes -= nr_bytes;
3175*4882a593Smuzhiyun ret = true;
3176*4882a593Smuzhiyun }
3177*4882a593Smuzhiyun
3178*4882a593Smuzhiyun local_irq_restore(flags);
3179*4882a593Smuzhiyun
3180*4882a593Smuzhiyun return ret;
3181*4882a593Smuzhiyun }
3182*4882a593Smuzhiyun
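/*
 * Flush the cached objcg bytes: uncharge whole pages from the objcg's memcg
 * and move any sub-page remainder to objcg->nr_charged_bytes.
 */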
3183*4882a593Smuzhiyun static void drain_obj_stock(struct memcg_stock_pcp *stock)
3184*4882a593Smuzhiyun {
3185*4882a593Smuzhiyun struct obj_cgroup *old = stock->cached_objcg;
3186*4882a593Smuzhiyun
3187*4882a593Smuzhiyun if (!old)
3188*4882a593Smuzhiyun return;
3189*4882a593Smuzhiyun
3190*4882a593Smuzhiyun if (stock->nr_bytes) {
3191*4882a593Smuzhiyun unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3192*4882a593Smuzhiyun unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
3193*4882a593Smuzhiyun
3194*4882a593Smuzhiyun if (nr_pages) {
3195*4882a593Smuzhiyun struct mem_cgroup *memcg;
3196*4882a593Smuzhiyun
3197*4882a593Smuzhiyun rcu_read_lock();
3198*4882a593Smuzhiyun retry:
3199*4882a593Smuzhiyun memcg = obj_cgroup_memcg(old);
3200*4882a593Smuzhiyun if (unlikely(!css_tryget(&memcg->css)))
3201*4882a593Smuzhiyun goto retry;
3202*4882a593Smuzhiyun rcu_read_unlock();
3203*4882a593Smuzhiyun
3204*4882a593Smuzhiyun __memcg_kmem_uncharge(memcg, nr_pages);
3205*4882a593Smuzhiyun css_put(&memcg->css);
3206*4882a593Smuzhiyun }
3207*4882a593Smuzhiyun
3208*4882a593Smuzhiyun /*
3209*4882a593Smuzhiyun * The leftover is flushed to the centralized per-memcg value.
3210*4882a593Smuzhiyun * On the next attempt to refill obj stock it will be moved
3211*4882a593Smuzhiyun * to a per-cpu stock (probably on another CPU), see
3212*4882a593Smuzhiyun * refill_obj_stock().
3213*4882a593Smuzhiyun *
3214*4882a593Smuzhiyun * How often it's flushed is a trade-off between the memory
3215*4882a593Smuzhiyun * limit enforcement accuracy and potential CPU contention,
3216*4882a593Smuzhiyun * so it might be changed in the future.
3217*4882a593Smuzhiyun */
3218*4882a593Smuzhiyun atomic_add(nr_bytes, &old->nr_charged_bytes);
3219*4882a593Smuzhiyun stock->nr_bytes = 0;
3220*4882a593Smuzhiyun }
3221*4882a593Smuzhiyun
3222*4882a593Smuzhiyun obj_cgroup_put(old);
3223*4882a593Smuzhiyun stock->cached_objcg = NULL;
3224*4882a593Smuzhiyun }
3225*4882a593Smuzhiyun
3226*4882a593Smuzhiyun static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3227*4882a593Smuzhiyun struct mem_cgroup *root_memcg)
3228*4882a593Smuzhiyun {
3229*4882a593Smuzhiyun struct mem_cgroup *memcg;
3230*4882a593Smuzhiyun
3231*4882a593Smuzhiyun if (stock->cached_objcg) {
3232*4882a593Smuzhiyun memcg = obj_cgroup_memcg(stock->cached_objcg);
3233*4882a593Smuzhiyun if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3234*4882a593Smuzhiyun return true;
3235*4882a593Smuzhiyun }
3236*4882a593Smuzhiyun
3237*4882a593Smuzhiyun return false;
3238*4882a593Smuzhiyun }
3239*4882a593Smuzhiyun
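/*
 * Return @nr_bytes to the per-CPU object stock, switching the cached objcg
 * if needed; drain the stock once it accumulates more than a page.
 */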
3240*4882a593Smuzhiyun static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3241*4882a593Smuzhiyun {
3242*4882a593Smuzhiyun struct memcg_stock_pcp *stock;
3243*4882a593Smuzhiyun unsigned long flags;
3244*4882a593Smuzhiyun
3245*4882a593Smuzhiyun local_irq_save(flags);
3246*4882a593Smuzhiyun
3247*4882a593Smuzhiyun stock = this_cpu_ptr(&memcg_stock);
3248*4882a593Smuzhiyun if (stock->cached_objcg != objcg) { /* reset if necessary */
3249*4882a593Smuzhiyun drain_obj_stock(stock);
3250*4882a593Smuzhiyun obj_cgroup_get(objcg);
3251*4882a593Smuzhiyun stock->cached_objcg = objcg;
3252*4882a593Smuzhiyun stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3253*4882a593Smuzhiyun }
3254*4882a593Smuzhiyun stock->nr_bytes += nr_bytes;
3255*4882a593Smuzhiyun
3256*4882a593Smuzhiyun if (stock->nr_bytes > PAGE_SIZE)
3257*4882a593Smuzhiyun drain_obj_stock(stock);
3258*4882a593Smuzhiyun
3259*4882a593Smuzhiyun local_irq_restore(flags);
3260*4882a593Smuzhiyun }
3261*4882a593Smuzhiyun
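/*
 * Charge @size bytes to @objcg: use the per-CPU byte stock when it suffices,
 * otherwise charge whole pages to the objcg's memcg and stash the unused
 * remainder of the last page back into the stock.
 */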
3262*4882a593Smuzhiyun int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3263*4882a593Smuzhiyun {
3264*4882a593Smuzhiyun struct mem_cgroup *memcg;
3265*4882a593Smuzhiyun unsigned int nr_pages, nr_bytes;
3266*4882a593Smuzhiyun int ret;
3267*4882a593Smuzhiyun
3268*4882a593Smuzhiyun if (consume_obj_stock(objcg, size))
3269*4882a593Smuzhiyun return 0;
3270*4882a593Smuzhiyun
3271*4882a593Smuzhiyun /*
3272*4882a593Smuzhiyun * In theory, memcg->nr_charged_bytes can have enough
3273*4882a593Smuzhiyun * pre-charged bytes to satisfy the allocation. However,
3274*4882a593Smuzhiyun * flushing memcg->nr_charged_bytes requires two atomic
3275*4882a593Smuzhiyun * operations, and memcg->nr_charged_bytes can't be big,
3276*4882a593Smuzhiyun * so it's better to ignore it and try to grab some new pages.
3277*4882a593Smuzhiyun * memcg->nr_charged_bytes will be flushed in
3278*4882a593Smuzhiyun * refill_obj_stock(), called from this function or
3279*4882a593Smuzhiyun * independently later.
3280*4882a593Smuzhiyun */
3281*4882a593Smuzhiyun rcu_read_lock();
3282*4882a593Smuzhiyun retry:
3283*4882a593Smuzhiyun memcg = obj_cgroup_memcg(objcg);
3284*4882a593Smuzhiyun if (unlikely(!css_tryget(&memcg->css)))
3285*4882a593Smuzhiyun goto retry;
3286*4882a593Smuzhiyun rcu_read_unlock();
3287*4882a593Smuzhiyun
3288*4882a593Smuzhiyun nr_pages = size >> PAGE_SHIFT;
3289*4882a593Smuzhiyun nr_bytes = size & (PAGE_SIZE - 1);
3290*4882a593Smuzhiyun
3291*4882a593Smuzhiyun if (nr_bytes)
3292*4882a593Smuzhiyun nr_pages += 1;
3293*4882a593Smuzhiyun
3294*4882a593Smuzhiyun ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
3295*4882a593Smuzhiyun if (!ret && nr_bytes)
3296*4882a593Smuzhiyun refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3297*4882a593Smuzhiyun
3298*4882a593Smuzhiyun css_put(&memcg->css);
3299*4882a593Smuzhiyun return ret;
3300*4882a593Smuzhiyun }
3301*4882a593Smuzhiyun
3302*4882a593Smuzhiyun void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3303*4882a593Smuzhiyun {
3304*4882a593Smuzhiyun refill_obj_stock(objcg, size);
3305*4882a593Smuzhiyun }
3306*4882a593Smuzhiyun
3307*4882a593Smuzhiyun #endif /* CONFIG_MEMCG_KMEM */
3308*4882a593Smuzhiyun
3309*4882a593Smuzhiyun /*
3310*4882a593Smuzhiyun * Because head->mem_cgroup is not set on tails, set it now.
3311*4882a593Smuzhiyun */
3312*4882a593Smuzhiyun void split_page_memcg(struct page *head, unsigned int nr)
3313*4882a593Smuzhiyun {
3314*4882a593Smuzhiyun struct mem_cgroup *memcg = head->mem_cgroup;
3315*4882a593Smuzhiyun int kmemcg = PageKmemcg(head);
3316*4882a593Smuzhiyun int i;
3317*4882a593Smuzhiyun
3318*4882a593Smuzhiyun if (mem_cgroup_disabled() || !memcg)
3319*4882a593Smuzhiyun return;
3320*4882a593Smuzhiyun
3321*4882a593Smuzhiyun for (i = 1; i < nr; i++) {
3322*4882a593Smuzhiyun head[i].mem_cgroup = memcg;
3323*4882a593Smuzhiyun if (kmemcg)
3324*4882a593Smuzhiyun __SetPageKmemcg(head + i);
3325*4882a593Smuzhiyun }
3326*4882a593Smuzhiyun css_get_many(&memcg->css, nr - 1);
3327*4882a593Smuzhiyun }
3328*4882a593Smuzhiyun
3329*4882a593Smuzhiyun #ifdef CONFIG_MEMCG_SWAP
3330*4882a593Smuzhiyun /**
3331*4882a593Smuzhiyun * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3332*4882a593Smuzhiyun * @entry: swap entry to be moved
3333*4882a593Smuzhiyun * @from: mem_cgroup which the entry is moved from
3334*4882a593Smuzhiyun * @to: mem_cgroup which the entry is moved to
3335*4882a593Smuzhiyun *
3336*4882a593Smuzhiyun * It succeeds only when the swap_cgroup's record for this entry is the same
3337*4882a593Smuzhiyun * as the mem_cgroup's id of @from.
3338*4882a593Smuzhiyun *
3339*4882a593Smuzhiyun * Returns 0 on success, -EINVAL on failure.
3340*4882a593Smuzhiyun *
3341*4882a593Smuzhiyun * The caller must have charged to @to, IOW, called page_counter_charge() for
3342*4882a593Smuzhiyun * both res and memsw, and called css_get().
3343*4882a593Smuzhiyun */
3344*4882a593Smuzhiyun static int mem_cgroup_move_swap_account(swp_entry_t entry,
3345*4882a593Smuzhiyun struct mem_cgroup *from, struct mem_cgroup *to)
3346*4882a593Smuzhiyun {
3347*4882a593Smuzhiyun unsigned short old_id, new_id;
3348*4882a593Smuzhiyun
3349*4882a593Smuzhiyun old_id = mem_cgroup_id(from);
3350*4882a593Smuzhiyun new_id = mem_cgroup_id(to);
3351*4882a593Smuzhiyun
3352*4882a593Smuzhiyun if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3353*4882a593Smuzhiyun mod_memcg_state(from, MEMCG_SWAP, -1);
3354*4882a593Smuzhiyun mod_memcg_state(to, MEMCG_SWAP, 1);
3355*4882a593Smuzhiyun return 0;
3356*4882a593Smuzhiyun }
3357*4882a593Smuzhiyun return -EINVAL;
3358*4882a593Smuzhiyun }
3359*4882a593Smuzhiyun #else
3360*4882a593Smuzhiyun static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3361*4882a593Smuzhiyun struct mem_cgroup *from, struct mem_cgroup *to)
3362*4882a593Smuzhiyun {
3363*4882a593Smuzhiyun return -EINVAL;
3364*4882a593Smuzhiyun }
3365*4882a593Smuzhiyun #endif
3366*4882a593Smuzhiyun
3367*4882a593Smuzhiyun static DEFINE_MUTEX(memcg_max_mutex);
3368*4882a593Smuzhiyun
3369*4882a593Smuzhiyun static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3370*4882a593Smuzhiyun unsigned long max, bool memsw)
3371*4882a593Smuzhiyun {
3372*4882a593Smuzhiyun bool enlarge = false;
3373*4882a593Smuzhiyun bool drained = false;
3374*4882a593Smuzhiyun int ret;
3375*4882a593Smuzhiyun bool limits_invariant;
3376*4882a593Smuzhiyun struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3377*4882a593Smuzhiyun
3378*4882a593Smuzhiyun do {
3379*4882a593Smuzhiyun if (signal_pending(current)) {
3380*4882a593Smuzhiyun ret = -EINTR;
3381*4882a593Smuzhiyun break;
3382*4882a593Smuzhiyun }
3383*4882a593Smuzhiyun
3384*4882a593Smuzhiyun mutex_lock(&memcg_max_mutex);
3385*4882a593Smuzhiyun /*
3386*4882a593Smuzhiyun * Make sure that the new limit (memsw or memory limit) doesn't
3387*4882a593Smuzhiyun * break our basic invariant rule memory.max <= memsw.max.
3388*4882a593Smuzhiyun */
3389*4882a593Smuzhiyun limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
3390*4882a593Smuzhiyun max <= memcg->memsw.max;
3391*4882a593Smuzhiyun if (!limits_invariant) {
3392*4882a593Smuzhiyun mutex_unlock(&memcg_max_mutex);
3393*4882a593Smuzhiyun ret = -EINVAL;
3394*4882a593Smuzhiyun break;
3395*4882a593Smuzhiyun }
3396*4882a593Smuzhiyun if (max > counter->max)
3397*4882a593Smuzhiyun enlarge = true;
3398*4882a593Smuzhiyun ret = page_counter_set_max(counter, max);
3399*4882a593Smuzhiyun mutex_unlock(&memcg_max_mutex);
3400*4882a593Smuzhiyun
3401*4882a593Smuzhiyun if (!ret)
3402*4882a593Smuzhiyun break;
3403*4882a593Smuzhiyun
3404*4882a593Smuzhiyun if (!drained) {
3405*4882a593Smuzhiyun drain_all_stock(memcg);
3406*4882a593Smuzhiyun drained = true;
3407*4882a593Smuzhiyun continue;
3408*4882a593Smuzhiyun }
3409*4882a593Smuzhiyun
3410*4882a593Smuzhiyun if (!try_to_free_mem_cgroup_pages(memcg, 1,
3411*4882a593Smuzhiyun GFP_KERNEL, !memsw)) {
3412*4882a593Smuzhiyun ret = -EBUSY;
3413*4882a593Smuzhiyun break;
3414*4882a593Smuzhiyun }
3415*4882a593Smuzhiyun } while (true);
3416*4882a593Smuzhiyun
3417*4882a593Smuzhiyun if (!ret && enlarge)
3418*4882a593Smuzhiyun memcg_oom_recover(memcg);
3419*4882a593Smuzhiyun
3420*4882a593Smuzhiyun return ret;
3421*4882a593Smuzhiyun }
3422*4882a593Smuzhiyun
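/*
 * Soft-limit reclaim: repeatedly pick the memcg in @pgdat's soft-limit tree
 * with the largest excess, reclaim from it, and re-insert it with its
 * updated excess. Only used for order-0 reclaim; best effort by design.
 */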
3423*4882a593Smuzhiyun unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3424*4882a593Smuzhiyun gfp_t gfp_mask,
3425*4882a593Smuzhiyun unsigned long *total_scanned)
3426*4882a593Smuzhiyun {
3427*4882a593Smuzhiyun unsigned long nr_reclaimed = 0;
3428*4882a593Smuzhiyun struct mem_cgroup_per_node *mz, *next_mz = NULL;
3429*4882a593Smuzhiyun unsigned long reclaimed;
3430*4882a593Smuzhiyun int loop = 0;
3431*4882a593Smuzhiyun struct mem_cgroup_tree_per_node *mctz;
3432*4882a593Smuzhiyun unsigned long excess;
3433*4882a593Smuzhiyun unsigned long nr_scanned;
3434*4882a593Smuzhiyun
3435*4882a593Smuzhiyun if (order > 0)
3436*4882a593Smuzhiyun return 0;
3437*4882a593Smuzhiyun
3438*4882a593Smuzhiyun mctz = soft_limit_tree_node(pgdat->node_id);
3439*4882a593Smuzhiyun
3440*4882a593Smuzhiyun /*
3441*4882a593Smuzhiyun * Do not even bother to check the largest node if the root
3442*4882a593Smuzhiyun * is empty. Do it lockless to prevent lock bouncing. Races
3443*4882a593Smuzhiyun * are acceptable as soft limit is best effort anyway.
3444*4882a593Smuzhiyun */
3445*4882a593Smuzhiyun if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3446*4882a593Smuzhiyun return 0;
3447*4882a593Smuzhiyun
3448*4882a593Smuzhiyun /*
3449*4882a593Smuzhiyun * This loop can run for a while, especially if mem_cgroups continuously
3450*4882a593Smuzhiyun * keep exceeding their soft limit, putting the system under
3451*4882a593Smuzhiyun * pressure.
3452*4882a593Smuzhiyun */
3453*4882a593Smuzhiyun do {
3454*4882a593Smuzhiyun if (next_mz)
3455*4882a593Smuzhiyun mz = next_mz;
3456*4882a593Smuzhiyun else
3457*4882a593Smuzhiyun mz = mem_cgroup_largest_soft_limit_node(mctz);
3458*4882a593Smuzhiyun if (!mz)
3459*4882a593Smuzhiyun break;
3460*4882a593Smuzhiyun
3461*4882a593Smuzhiyun nr_scanned = 0;
3462*4882a593Smuzhiyun reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3463*4882a593Smuzhiyun gfp_mask, &nr_scanned);
3464*4882a593Smuzhiyun nr_reclaimed += reclaimed;
3465*4882a593Smuzhiyun *total_scanned += nr_scanned;
3466*4882a593Smuzhiyun spin_lock_irq(&mctz->lock);
3467*4882a593Smuzhiyun __mem_cgroup_remove_exceeded(mz, mctz);
3468*4882a593Smuzhiyun
3469*4882a593Smuzhiyun /*
3470*4882a593Smuzhiyun * If we failed to reclaim anything from this memory cgroup
3471*4882a593Smuzhiyun * it is time to move on to the next cgroup
3472*4882a593Smuzhiyun */
3473*4882a593Smuzhiyun next_mz = NULL;
3474*4882a593Smuzhiyun if (!reclaimed)
3475*4882a593Smuzhiyun next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3476*4882a593Smuzhiyun
3477*4882a593Smuzhiyun excess = soft_limit_excess(mz->memcg);
3478*4882a593Smuzhiyun /*
3479*4882a593Smuzhiyun * One school of thought says that we should not add
3480*4882a593Smuzhiyun * back the node to the tree if reclaim returns 0.
3481*4882a593Smuzhiyun * But our reclaim could return 0 simply because, due
3482*4882a593Smuzhiyun * to priority, we are exposing a smaller subset of
3483*4882a593Smuzhiyun * memory to reclaim from. Consider this a longer-term
3484*4882a593Smuzhiyun * TODO.
3485*4882a593Smuzhiyun */
3486*4882a593Smuzhiyun /* If excess == 0, no tree ops */
3487*4882a593Smuzhiyun __mem_cgroup_insert_exceeded(mz, mctz, excess);
3488*4882a593Smuzhiyun spin_unlock_irq(&mctz->lock);
3489*4882a593Smuzhiyun css_put(&mz->memcg->css);
3490*4882a593Smuzhiyun loop++;
3491*4882a593Smuzhiyun /*
3492*4882a593Smuzhiyun * We could not reclaim anything and there are no more
3493*4882a593Smuzhiyun * mem cgroups to try, or we seem to be looping without
3494*4882a593Smuzhiyun * reclaiming anything.
3495*4882a593Smuzhiyun */
3496*4882a593Smuzhiyun if (!nr_reclaimed &&
3497*4882a593Smuzhiyun (next_mz == NULL ||
3498*4882a593Smuzhiyun loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3499*4882a593Smuzhiyun break;
3500*4882a593Smuzhiyun } while (!nr_reclaimed);
3501*4882a593Smuzhiyun if (next_mz)
3502*4882a593Smuzhiyun css_put(&next_mz->memcg->css);
3503*4882a593Smuzhiyun return nr_reclaimed;
3504*4882a593Smuzhiyun }
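
/*
 * Editor's note (illustrative): soft limits are configured per cgroup
 * through the v1 interface, e.g.
 *
 *	# echo 256M > /sys/fs/cgroup/memory/foo/memory.soft_limit_in_bytes
 *
 * Cgroups exceeding their soft limit sit in a per-node rbtree keyed by
 * excess; the function above, which bails out for order > 0 and is
 * driven from global reclaim, walks that tree largest-excess-first and
 * reclaims from the worst offenders on a best-effort basis.
 */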
3505*4882a593Smuzhiyun
3506*4882a593Smuzhiyun /*
3507*4882a593Smuzhiyun * Test whether @memcg has children, dead or alive. Note that this
3508*4882a593Smuzhiyun * function doesn't care whether @memcg has use_hierarchy enabled and
3509*4882a593Smuzhiyun * returns %true if there are child csses according to the cgroup
3510*4882a593Smuzhiyun * hierarchy. Testing use_hierarchy is the caller's responsibility.
3511*4882a593Smuzhiyun */
3512*4882a593Smuzhiyun static inline bool memcg_has_children(struct mem_cgroup *memcg)
3513*4882a593Smuzhiyun {
3514*4882a593Smuzhiyun bool ret;
3515*4882a593Smuzhiyun
3516*4882a593Smuzhiyun rcu_read_lock();
3517*4882a593Smuzhiyun ret = css_next_child(NULL, &memcg->css);
3518*4882a593Smuzhiyun rcu_read_unlock();
3519*4882a593Smuzhiyun return ret;
3520*4882a593Smuzhiyun }
3521*4882a593Smuzhiyun
3522*4882a593Smuzhiyun /*
3523*4882a593Smuzhiyun * Reclaims as many pages from the given memcg as possible.
3524*4882a593Smuzhiyun *
3525*4882a593Smuzhiyun * Caller is responsible for holding css reference for memcg.
3526*4882a593Smuzhiyun */
3527*4882a593Smuzhiyun static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3528*4882a593Smuzhiyun {
3529*4882a593Smuzhiyun int nr_retries = MAX_RECLAIM_RETRIES;
3530*4882a593Smuzhiyun
3531*4882a593Smuzhiyun /* we call try-to-free pages to make this cgroup empty */
3532*4882a593Smuzhiyun lru_add_drain_all();
3533*4882a593Smuzhiyun
3534*4882a593Smuzhiyun drain_all_stock(memcg);
3535*4882a593Smuzhiyun
3536*4882a593Smuzhiyun /* try to free all pages in this cgroup */
3537*4882a593Smuzhiyun while (nr_retries && page_counter_read(&memcg->memory)) {
3538*4882a593Smuzhiyun int progress;
3539*4882a593Smuzhiyun
3540*4882a593Smuzhiyun if (signal_pending(current))
3541*4882a593Smuzhiyun return -EINTR;
3542*4882a593Smuzhiyun
3543*4882a593Smuzhiyun progress = try_to_free_mem_cgroup_pages(memcg, 1,
3544*4882a593Smuzhiyun GFP_KERNEL, true);
3545*4882a593Smuzhiyun if (!progress) {
3546*4882a593Smuzhiyun nr_retries--;
3547*4882a593Smuzhiyun /* maybe some writeback is necessary */
3548*4882a593Smuzhiyun congestion_wait(BLK_RW_ASYNC, HZ/10);
3549*4882a593Smuzhiyun }
3550*4882a593Smuzhiyun
3551*4882a593Smuzhiyun }
3552*4882a593Smuzhiyun
3553*4882a593Smuzhiyun return 0;
3554*4882a593Smuzhiyun }
3555*4882a593Smuzhiyun
3556*4882a593Smuzhiyun static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3557*4882a593Smuzhiyun char *buf, size_t nbytes,
3558*4882a593Smuzhiyun loff_t off)
3559*4882a593Smuzhiyun {
3560*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3561*4882a593Smuzhiyun
3562*4882a593Smuzhiyun if (mem_cgroup_is_root(memcg))
3563*4882a593Smuzhiyun return -EINVAL;
3564*4882a593Smuzhiyun return mem_cgroup_force_empty(memcg) ?: nbytes;
3565*4882a593Smuzhiyun }
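
/*
 * Editor's note (illustrative): memory.force_empty is triggered by any
 * write, conventionally
 *
 *	# echo 0 > /sys/fs/cgroup/memory/foo/memory.force_empty
 *
 * which reclaims repeatedly until the cgroup's usage drops to zero, the
 * retry budget runs out, or the writer catches a signal (-EINTR).  The
 * root cgroup rejects the write with -EINVAL.
 */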
3566*4882a593Smuzhiyun
3567*4882a593Smuzhiyun static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3568*4882a593Smuzhiyun struct cftype *cft)
3569*4882a593Smuzhiyun {
3570*4882a593Smuzhiyun return mem_cgroup_from_css(css)->use_hierarchy;
3571*4882a593Smuzhiyun }
3572*4882a593Smuzhiyun
3573*4882a593Smuzhiyun static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3574*4882a593Smuzhiyun struct cftype *cft, u64 val)
3575*4882a593Smuzhiyun {
3576*4882a593Smuzhiyun int retval = 0;
3577*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3578*4882a593Smuzhiyun struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3579*4882a593Smuzhiyun
3580*4882a593Smuzhiyun if (memcg->use_hierarchy == val)
3581*4882a593Smuzhiyun return 0;
3582*4882a593Smuzhiyun
3583*4882a593Smuzhiyun /*
3584*4882a593Smuzhiyun * If parent's use_hierarchy is set, we can't make any modifications
3585*4882a593Smuzhiyun * in the child subtrees. If it is unset, then the change can
3586*4882a593Smuzhiyun * occur, provided the current cgroup has no children.
3587*4882a593Smuzhiyun *
3588*4882a593Smuzhiyun * For the root cgroup, parent_memcg is NULL; we allow the value to
3589*4882a593Smuzhiyun * be set if there are no children.
3590*4882a593Smuzhiyun */
3591*4882a593Smuzhiyun if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3592*4882a593Smuzhiyun (val == 1 || val == 0)) {
3593*4882a593Smuzhiyun if (!memcg_has_children(memcg))
3594*4882a593Smuzhiyun memcg->use_hierarchy = val;
3595*4882a593Smuzhiyun else
3596*4882a593Smuzhiyun retval = -EBUSY;
3597*4882a593Smuzhiyun } else
3598*4882a593Smuzhiyun retval = -EINVAL;
3599*4882a593Smuzhiyun
3600*4882a593Smuzhiyun return retval;
3601*4882a593Smuzhiyun }
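
/*
 * Editor's note (illustrative): memory.use_hierarchy accepts only 0 or
 * 1, e.g.
 *
 *	# echo 1 > /sys/fs/cgroup/memory/foo/memory.use_hierarchy
 *
 * The write is refused with -EBUSY once the cgroup already has
 * children, and with -EINVAL if the parent has use_hierarchy set or
 * the value is out of range.
 */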
3602*4882a593Smuzhiyun
3603*4882a593Smuzhiyun static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3604*4882a593Smuzhiyun {
3605*4882a593Smuzhiyun unsigned long val;
3606*4882a593Smuzhiyun
3607*4882a593Smuzhiyun if (mem_cgroup_is_root(memcg)) {
3608*4882a593Smuzhiyun val = memcg_page_state(memcg, NR_FILE_PAGES) +
3609*4882a593Smuzhiyun memcg_page_state(memcg, NR_ANON_MAPPED);
3610*4882a593Smuzhiyun if (swap)
3611*4882a593Smuzhiyun val += memcg_page_state(memcg, MEMCG_SWAP);
3612*4882a593Smuzhiyun } else {
3613*4882a593Smuzhiyun if (!swap)
3614*4882a593Smuzhiyun val = page_counter_read(&memcg->memory);
3615*4882a593Smuzhiyun else
3616*4882a593Smuzhiyun val = page_counter_read(&memcg->memsw);
3617*4882a593Smuzhiyun }
3618*4882a593Smuzhiyun return val;
3619*4882a593Smuzhiyun }
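
/*
 * Editor's note: the usage returned above is a page count; callers
 * such as mem_cgroup_read_u64() below scale it by PAGE_SIZE where the
 * *_in_bytes control files report bytes, as their names promise.
 */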
3620*4882a593Smuzhiyun
3621*4882a593Smuzhiyun enum {
3622*4882a593Smuzhiyun RES_USAGE,
3623*4882a593Smuzhiyun RES_LIMIT,
3624*4882a593Smuzhiyun RES_MAX_USAGE,
3625*4882a593Smuzhiyun RES_FAILCNT,
3626*4882a593Smuzhiyun RES_SOFT_LIMIT,
3627*4882a593Smuzhiyun };
3628*4882a593Smuzhiyun
3629*4882a593Smuzhiyun static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3630*4882a593Smuzhiyun struct cftype *cft)
3631*4882a593Smuzhiyun {
3632*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3633*4882a593Smuzhiyun struct page_counter *counter;
3634*4882a593Smuzhiyun
3635*4882a593Smuzhiyun switch (MEMFILE_TYPE(cft->private)) {
3636*4882a593Smuzhiyun case _MEM:
3637*4882a593Smuzhiyun counter = &memcg->memory;
3638*4882a593Smuzhiyun break;
3639*4882a593Smuzhiyun case _MEMSWAP:
3640*4882a593Smuzhiyun counter = &memcg->memsw;
3641*4882a593Smuzhiyun break;
3642*4882a593Smuzhiyun case _KMEM:
3643*4882a593Smuzhiyun counter = &memcg->kmem;
3644*4882a593Smuzhiyun break;
3645*4882a593Smuzhiyun case _TCP:
3646*4882a593Smuzhiyun counter = &memcg->tcpmem;
3647*4882a593Smuzhiyun break;
3648*4882a593Smuzhiyun default:
3649*4882a593Smuzhiyun BUG();
3650*4882a593Smuzhiyun }
3651*4882a593Smuzhiyun
3652*4882a593Smuzhiyun switch (MEMFILE_ATTR(cft->private)) {
3653*4882a593Smuzhiyun case RES_USAGE:
3654*4882a593Smuzhiyun if (counter == &memcg->memory)
3655*4882a593Smuzhiyun return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3656*4882a593Smuzhiyun if (counter == &memcg->memsw)
3657*4882a593Smuzhiyun return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3658*4882a593Smuzhiyun return (u64)page_counter_read(counter) * PAGE_SIZE;
3659*4882a593Smuzhiyun case RES_LIMIT:
3660*4882a593Smuzhiyun return (u64)counter->max * PAGE_SIZE;
3661*4882a593Smuzhiyun case RES_MAX_USAGE:
3662*4882a593Smuzhiyun return (u64)counter->watermark * PAGE_SIZE;
3663*4882a593Smuzhiyun case RES_FAILCNT:
3664*4882a593Smuzhiyun return counter->failcnt;
3665*4882a593Smuzhiyun case RES_SOFT_LIMIT:
3666*4882a593Smuzhiyun return (u64)memcg->soft_limit * PAGE_SIZE;
3667*4882a593Smuzhiyun default:
3668*4882a593Smuzhiyun BUG();
3669*4882a593Smuzhiyun }
3670*4882a593Smuzhiyun }
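
/*
 * Editor's note (illustrative): the MEMFILE_TYPE/MEMFILE_ATTR pairs
 * decoded above map to the cgroup v1 control files, for example
 *
 *	memory.usage_in_bytes           -> _MEM,     RES_USAGE
 *	memory.memsw.limit_in_bytes     -> _MEMSWAP, RES_LIMIT
 *	memory.kmem.max_usage_in_bytes  -> _KMEM,    RES_MAX_USAGE
 *	memory.kmem.tcp.failcnt         -> _TCP,     RES_FAILCNT
 *
 * All byte-valued fields are page counts scaled by PAGE_SIZE.
 */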
3671*4882a593Smuzhiyun
3672*4882a593Smuzhiyun static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3673*4882a593Smuzhiyun {
3674*4882a593Smuzhiyun unsigned long stat[MEMCG_NR_STAT] = {0};
3675*4882a593Smuzhiyun struct mem_cgroup *mi;
3676*4882a593Smuzhiyun int node, cpu, i;
3677*4882a593Smuzhiyun
3678*4882a593Smuzhiyun for_each_online_cpu(cpu)
3679*4882a593Smuzhiyun for (i = 0; i < MEMCG_NR_STAT; i++)
3680*4882a593Smuzhiyun stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3681*4882a593Smuzhiyun
3682*4882a593Smuzhiyun for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3683*4882a593Smuzhiyun for (i = 0; i < MEMCG_NR_STAT; i++)
3684*4882a593Smuzhiyun atomic_long_add(stat[i], &mi->vmstats[i]);
3685*4882a593Smuzhiyun
3686*4882a593Smuzhiyun for_each_node(node) {
3687*4882a593Smuzhiyun struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3688*4882a593Smuzhiyun struct mem_cgroup_per_node *pi;
3689*4882a593Smuzhiyun
3690*4882a593Smuzhiyun for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3691*4882a593Smuzhiyun stat[i] = 0;
3692*4882a593Smuzhiyun
3693*4882a593Smuzhiyun for_each_online_cpu(cpu)
3694*4882a593Smuzhiyun for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3695*4882a593Smuzhiyun stat[i] += per_cpu(
3696*4882a593Smuzhiyun pn->lruvec_stat_cpu->count[i], cpu);
3697*4882a593Smuzhiyun
3698*4882a593Smuzhiyun for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3699*4882a593Smuzhiyun for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3700*4882a593Smuzhiyun atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3701*4882a593Smuzhiyun }
3702*4882a593Smuzhiyun }
3703*4882a593Smuzhiyun
3704*4882a593Smuzhiyun static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3705*4882a593Smuzhiyun {
3706*4882a593Smuzhiyun unsigned long events[NR_VM_EVENT_ITEMS];
3707*4882a593Smuzhiyun struct mem_cgroup *mi;
3708*4882a593Smuzhiyun int cpu, i;
3709*4882a593Smuzhiyun
3710*4882a593Smuzhiyun for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3711*4882a593Smuzhiyun events[i] = 0;
3712*4882a593Smuzhiyun
3713*4882a593Smuzhiyun for_each_online_cpu(cpu)
3714*4882a593Smuzhiyun for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3715*4882a593Smuzhiyun events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3716*4882a593Smuzhiyun cpu);
3717*4882a593Smuzhiyun
3718*4882a593Smuzhiyun for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3719*4882a593Smuzhiyun for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3720*4882a593Smuzhiyun atomic_long_add(events[i], &mi->vmevents[i]);
3721*4882a593Smuzhiyun }
3722*4882a593Smuzhiyun
3723*4882a593Smuzhiyun #ifdef CONFIG_MEMCG_KMEM
3724*4882a593Smuzhiyun static int memcg_online_kmem(struct mem_cgroup *memcg)
3725*4882a593Smuzhiyun {
3726*4882a593Smuzhiyun struct obj_cgroup *objcg;
3727*4882a593Smuzhiyun int memcg_id;
3728*4882a593Smuzhiyun
3729*4882a593Smuzhiyun if (cgroup_memory_nokmem)
3730*4882a593Smuzhiyun return 0;
3731*4882a593Smuzhiyun
3732*4882a593Smuzhiyun BUG_ON(memcg->kmemcg_id >= 0);
3733*4882a593Smuzhiyun BUG_ON(memcg->kmem_state);
3734*4882a593Smuzhiyun
3735*4882a593Smuzhiyun memcg_id = memcg_alloc_cache_id();
3736*4882a593Smuzhiyun if (memcg_id < 0)
3737*4882a593Smuzhiyun return memcg_id;
3738*4882a593Smuzhiyun
3739*4882a593Smuzhiyun objcg = obj_cgroup_alloc();
3740*4882a593Smuzhiyun if (!objcg) {
3741*4882a593Smuzhiyun memcg_free_cache_id(memcg_id);
3742*4882a593Smuzhiyun return -ENOMEM;
3743*4882a593Smuzhiyun }
3744*4882a593Smuzhiyun objcg->memcg = memcg;
3745*4882a593Smuzhiyun rcu_assign_pointer(memcg->objcg, objcg);
3746*4882a593Smuzhiyun
3747*4882a593Smuzhiyun static_branch_enable(&memcg_kmem_enabled_key);
3748*4882a593Smuzhiyun
3749*4882a593Smuzhiyun /*
3750*4882a593Smuzhiyun * A memory cgroup is considered kmem-online as soon as it gets
3751*4882a593Smuzhiyun * kmemcg_id. Setting the id after enabling static branching will
3752*4882a593Smuzhiyun * guarantee no one starts accounting before all call sites are
3753*4882a593Smuzhiyun * patched.
3754*4882a593Smuzhiyun */
3755*4882a593Smuzhiyun memcg->kmemcg_id = memcg_id;
3756*4882a593Smuzhiyun memcg->kmem_state = KMEM_ONLINE;
3757*4882a593Smuzhiyun
3758*4882a593Smuzhiyun return 0;
3759*4882a593Smuzhiyun }
3760*4882a593Smuzhiyun
3761*4882a593Smuzhiyun static void memcg_offline_kmem(struct mem_cgroup *memcg)
3762*4882a593Smuzhiyun {
3763*4882a593Smuzhiyun struct cgroup_subsys_state *css;
3764*4882a593Smuzhiyun struct mem_cgroup *parent, *child;
3765*4882a593Smuzhiyun int kmemcg_id;
3766*4882a593Smuzhiyun
3767*4882a593Smuzhiyun if (memcg->kmem_state != KMEM_ONLINE)
3768*4882a593Smuzhiyun return;
3769*4882a593Smuzhiyun
3770*4882a593Smuzhiyun memcg->kmem_state = KMEM_ALLOCATED;
3771*4882a593Smuzhiyun
3772*4882a593Smuzhiyun parent = parent_mem_cgroup(memcg);
3773*4882a593Smuzhiyun if (!parent)
3774*4882a593Smuzhiyun parent = root_mem_cgroup;
3775*4882a593Smuzhiyun
3776*4882a593Smuzhiyun memcg_reparent_objcgs(memcg, parent);
3777*4882a593Smuzhiyun
3778*4882a593Smuzhiyun kmemcg_id = memcg->kmemcg_id;
3779*4882a593Smuzhiyun BUG_ON(kmemcg_id < 0);
3780*4882a593Smuzhiyun
3781*4882a593Smuzhiyun /*
3782*4882a593Smuzhiyun * Change kmemcg_id of this cgroup and all its descendants to the
3783*4882a593Smuzhiyun * parent's id, and then move all entries from this cgroup's list_lrus
3784*4882a593Smuzhiyun * to ones of the parent. After we have finished, all list_lrus
3785*4882a593Smuzhiyun * corresponding to this cgroup are guaranteed to remain empty. The
3786*4882a593Smuzhiyun * ordering is imposed by list_lru_node->lock taken by
3787*4882a593Smuzhiyun * memcg_drain_all_list_lrus().
3788*4882a593Smuzhiyun */
3789*4882a593Smuzhiyun rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
3790*4882a593Smuzhiyun css_for_each_descendant_pre(css, &memcg->css) {
3791*4882a593Smuzhiyun child = mem_cgroup_from_css(css);
3792*4882a593Smuzhiyun BUG_ON(child->kmemcg_id != kmemcg_id);
3793*4882a593Smuzhiyun child->kmemcg_id = parent->kmemcg_id;
3794*4882a593Smuzhiyun if (!memcg->use_hierarchy)
3795*4882a593Smuzhiyun break;
3796*4882a593Smuzhiyun }
3797*4882a593Smuzhiyun rcu_read_unlock();
3798*4882a593Smuzhiyun
3799*4882a593Smuzhiyun memcg_drain_all_list_lrus(kmemcg_id, parent);
3800*4882a593Smuzhiyun
3801*4882a593Smuzhiyun memcg_free_cache_id(kmemcg_id);
3802*4882a593Smuzhiyun }
3803*4882a593Smuzhiyun
3804*4882a593Smuzhiyun static void memcg_free_kmem(struct mem_cgroup *memcg)
3805*4882a593Smuzhiyun {
3806*4882a593Smuzhiyun /* css_alloc() failed, offlining didn't happen */
3807*4882a593Smuzhiyun if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3808*4882a593Smuzhiyun memcg_offline_kmem(memcg);
3809*4882a593Smuzhiyun }
3810*4882a593Smuzhiyun #else
3811*4882a593Smuzhiyun static int memcg_online_kmem(struct mem_cgroup *memcg)
3812*4882a593Smuzhiyun {
3813*4882a593Smuzhiyun return 0;
3814*4882a593Smuzhiyun }
3815*4882a593Smuzhiyun static void memcg_offline_kmem(struct mem_cgroup *memcg)
3816*4882a593Smuzhiyun {
3817*4882a593Smuzhiyun }
3818*4882a593Smuzhiyun static void memcg_free_kmem(struct mem_cgroup *memcg)
3819*4882a593Smuzhiyun {
3820*4882a593Smuzhiyun }
3821*4882a593Smuzhiyun #endif /* CONFIG_MEMCG_KMEM */
3822*4882a593Smuzhiyun
3823*4882a593Smuzhiyun static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3824*4882a593Smuzhiyun unsigned long max)
3825*4882a593Smuzhiyun {
3826*4882a593Smuzhiyun int ret;
3827*4882a593Smuzhiyun
3828*4882a593Smuzhiyun mutex_lock(&memcg_max_mutex);
3829*4882a593Smuzhiyun ret = page_counter_set_max(&memcg->kmem, max);
3830*4882a593Smuzhiyun mutex_unlock(&memcg_max_mutex);
3831*4882a593Smuzhiyun return ret;
3832*4882a593Smuzhiyun }
3833*4882a593Smuzhiyun
3834*4882a593Smuzhiyun static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3835*4882a593Smuzhiyun {
3836*4882a593Smuzhiyun int ret;
3837*4882a593Smuzhiyun
3838*4882a593Smuzhiyun mutex_lock(&memcg_max_mutex);
3839*4882a593Smuzhiyun
3840*4882a593Smuzhiyun ret = page_counter_set_max(&memcg->tcpmem, max);
3841*4882a593Smuzhiyun if (ret)
3842*4882a593Smuzhiyun goto out;
3843*4882a593Smuzhiyun
3844*4882a593Smuzhiyun if (!memcg->tcpmem_active) {
3845*4882a593Smuzhiyun /*
3846*4882a593Smuzhiyun * The active flag needs to be written after the static_key
3847*4882a593Smuzhiyun * update. This is what guarantees that the socket activation
3848*4882a593Smuzhiyun * function is the last one to run. See mem_cgroup_sk_alloc()
3849*4882a593Smuzhiyun * for details, and note that we don't mark any socket as
3850*4882a593Smuzhiyun * belonging to this memcg until that flag is up.
3851*4882a593Smuzhiyun *
3852*4882a593Smuzhiyun * We need to do this, because static_keys will span multiple
3853*4882a593Smuzhiyun * sites, but we can't control their order. If we mark a socket
3854*4882a593Smuzhiyun * as accounted, but the accounting functions are not patched in
3855*4882a593Smuzhiyun * yet, we'll lose accounting.
3856*4882a593Smuzhiyun *
3857*4882a593Smuzhiyun * We never race with the readers in mem_cgroup_sk_alloc(),
3858*4882a593Smuzhiyun * because when this value changes, the code to process it is not
3859*4882a593Smuzhiyun * patched in yet.
3860*4882a593Smuzhiyun */
3861*4882a593Smuzhiyun static_branch_inc(&memcg_sockets_enabled_key);
3862*4882a593Smuzhiyun memcg->tcpmem_active = true;
3863*4882a593Smuzhiyun }
3864*4882a593Smuzhiyun out:
3865*4882a593Smuzhiyun mutex_unlock(&memcg_max_mutex);
3866*4882a593Smuzhiyun return ret;
3867*4882a593Smuzhiyun }
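
/*
 * Editor's note (illustrative): the TCP memory limit is exposed as
 * memory.kmem.tcp.limit_in_bytes, e.g.
 *
 *	# echo 64M > /sys/fs/cgroup/memory/foo/memory.kmem.tcp.limit_in_bytes
 *
 * The first successful write also bumps the static key and sets
 * tcpmem_active, so only sockets created afterwards in this cgroup are
 * accounted.
 */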
3868*4882a593Smuzhiyun
3869*4882a593Smuzhiyun /*
3870*4882a593Smuzhiyun  * The users of this function are the RES_LIMIT and RES_SOFT_LIMIT
3871*4882a593Smuzhiyun  * control files.
3872*4882a593Smuzhiyun  */
3873*4882a593Smuzhiyun static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3874*4882a593Smuzhiyun char *buf, size_t nbytes, loff_t off)
3875*4882a593Smuzhiyun {
3876*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3877*4882a593Smuzhiyun unsigned long nr_pages;
3878*4882a593Smuzhiyun int ret;
3879*4882a593Smuzhiyun
3880*4882a593Smuzhiyun buf = strstrip(buf);
3881*4882a593Smuzhiyun ret = page_counter_memparse(buf, "-1", &nr_pages);
3882*4882a593Smuzhiyun if (ret)
3883*4882a593Smuzhiyun return ret;
3884*4882a593Smuzhiyun
3885*4882a593Smuzhiyun switch (MEMFILE_ATTR(of_cft(of)->private)) {
3886*4882a593Smuzhiyun case RES_LIMIT:
3887*4882a593Smuzhiyun if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3888*4882a593Smuzhiyun ret = -EINVAL;
3889*4882a593Smuzhiyun break;
3890*4882a593Smuzhiyun }
3891*4882a593Smuzhiyun switch (MEMFILE_TYPE(of_cft(of)->private)) {
3892*4882a593Smuzhiyun case _MEM:
3893*4882a593Smuzhiyun ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3894*4882a593Smuzhiyun break;
3895*4882a593Smuzhiyun case _MEMSWAP:
3896*4882a593Smuzhiyun ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3897*4882a593Smuzhiyun break;
3898*4882a593Smuzhiyun case _KMEM:
3899*4882a593Smuzhiyun pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3900*4882a593Smuzhiyun "Please report your usecase to linux-mm@kvack.org if you "
3901*4882a593Smuzhiyun "depend on this functionality.\n");
3902*4882a593Smuzhiyun ret = memcg_update_kmem_max(memcg, nr_pages);
3903*4882a593Smuzhiyun break;
3904*4882a593Smuzhiyun case _TCP:
3905*4882a593Smuzhiyun ret = memcg_update_tcp_max(memcg, nr_pages);
3906*4882a593Smuzhiyun break;
3907*4882a593Smuzhiyun }
3908*4882a593Smuzhiyun break;
3909*4882a593Smuzhiyun case RES_SOFT_LIMIT:
3910*4882a593Smuzhiyun memcg->soft_limit = nr_pages;
3911*4882a593Smuzhiyun ret = 0;
3912*4882a593Smuzhiyun break;
3913*4882a593Smuzhiyun }
3914*4882a593Smuzhiyun return ret ?: nbytes;
3915*4882a593Smuzhiyun }
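
/*
 * Editor's note (illustrative): page_counter_memparse() accepts plain
 * byte values, K/M/G suffixes and "-1" for "unlimited", so each of
 *
 *	# echo 1073741824 > memory.limit_in_bytes
 *	# echo 1G > memory.limit_in_bytes
 *	# echo -1 > memory.memsw.limit_in_bytes
 *
 * is valid; the parsed value is rounded down to whole pages before it
 * is applied.
 */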
3916*4882a593Smuzhiyun
3917*4882a593Smuzhiyun static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3918*4882a593Smuzhiyun size_t nbytes, loff_t off)
3919*4882a593Smuzhiyun {
3920*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3921*4882a593Smuzhiyun struct page_counter *counter;
3922*4882a593Smuzhiyun
3923*4882a593Smuzhiyun switch (MEMFILE_TYPE(of_cft(of)->private)) {
3924*4882a593Smuzhiyun case _MEM:
3925*4882a593Smuzhiyun counter = &memcg->memory;
3926*4882a593Smuzhiyun break;
3927*4882a593Smuzhiyun case _MEMSWAP:
3928*4882a593Smuzhiyun counter = &memcg->memsw;
3929*4882a593Smuzhiyun break;
3930*4882a593Smuzhiyun case _KMEM:
3931*4882a593Smuzhiyun counter = &memcg->kmem;
3932*4882a593Smuzhiyun break;
3933*4882a593Smuzhiyun case _TCP:
3934*4882a593Smuzhiyun counter = &memcg->tcpmem;
3935*4882a593Smuzhiyun break;
3936*4882a593Smuzhiyun default:
3937*4882a593Smuzhiyun BUG();
3938*4882a593Smuzhiyun }
3939*4882a593Smuzhiyun
3940*4882a593Smuzhiyun switch (MEMFILE_ATTR(of_cft(of)->private)) {
3941*4882a593Smuzhiyun case RES_MAX_USAGE:
3942*4882a593Smuzhiyun page_counter_reset_watermark(counter);
3943*4882a593Smuzhiyun break;
3944*4882a593Smuzhiyun case RES_FAILCNT:
3945*4882a593Smuzhiyun counter->failcnt = 0;
3946*4882a593Smuzhiyun break;
3947*4882a593Smuzhiyun default:
3948*4882a593Smuzhiyun BUG();
3949*4882a593Smuzhiyun }
3950*4882a593Smuzhiyun
3951*4882a593Smuzhiyun return nbytes;
3952*4882a593Smuzhiyun }
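
/*
 * Editor's note (illustrative): the watermark and failure counters are
 * cleared by writing anything (conventionally 0) to the matching file:
 *
 *	# echo 0 > memory.max_usage_in_bytes	(reset the high-water mark)
 *	# echo 0 > memory.failcnt		(reset the failure counter)
 */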
3953*4882a593Smuzhiyun
3954*4882a593Smuzhiyun static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3955*4882a593Smuzhiyun struct cftype *cft)
3956*4882a593Smuzhiyun {
3957*4882a593Smuzhiyun return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3958*4882a593Smuzhiyun }
3959*4882a593Smuzhiyun
3960*4882a593Smuzhiyun #ifdef CONFIG_MMU
3961*4882a593Smuzhiyun static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3962*4882a593Smuzhiyun struct cftype *cft, u64 val)
3963*4882a593Smuzhiyun {
3964*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3965*4882a593Smuzhiyun
3966*4882a593Smuzhiyun if (val & ~MOVE_MASK)
3967*4882a593Smuzhiyun return -EINVAL;
3968*4882a593Smuzhiyun
3969*4882a593Smuzhiyun /*
3970*4882a593Smuzhiyun * No kind of locking is needed in here, because ->can_attach() will
3971*4882a593Smuzhiyun * check this value once in the beginning of the process, and then carry
3972*4882a593Smuzhiyun * on with stale data. This means that changes to this value will only
3973*4882a593Smuzhiyun * affect task migrations starting after the change.
3974*4882a593Smuzhiyun */
3975*4882a593Smuzhiyun memcg->move_charge_at_immigrate = val;
3976*4882a593Smuzhiyun return 0;
3977*4882a593Smuzhiyun }
3978*4882a593Smuzhiyun #else
3979*4882a593Smuzhiyun static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3980*4882a593Smuzhiyun struct cftype *cft, u64 val)
3981*4882a593Smuzhiyun {
3982*4882a593Smuzhiyun return -ENOSYS;
3983*4882a593Smuzhiyun }
3984*4882a593Smuzhiyun #endif
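
/*
 * Editor's note (illustrative): move_charge_at_immigrate is a bitmask;
 * per the cgroup v1 documentation bit 0 moves charges of anonymous
 * pages and bit 1 moves charges of file pages when a task migrates in,
 * so
 *
 *	# echo 3 > /sys/fs/cgroup/memory/foo/memory.move_charge_at_immigrate
 *
 * enables both.  Bits outside MOVE_MASK are rejected with -EINVAL, and
 * the feature is -ENOSYS without CONFIG_MMU.
 */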
3985*4882a593Smuzhiyun
3986*4882a593Smuzhiyun #ifdef CONFIG_NUMA
3987*4882a593Smuzhiyun
3988*4882a593Smuzhiyun #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3989*4882a593Smuzhiyun #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3990*4882a593Smuzhiyun #define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3991*4882a593Smuzhiyun
3992*4882a593Smuzhiyun static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3993*4882a593Smuzhiyun int nid, unsigned int lru_mask, bool tree)
3994*4882a593Smuzhiyun {
3995*4882a593Smuzhiyun struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3996*4882a593Smuzhiyun unsigned long nr = 0;
3997*4882a593Smuzhiyun enum lru_list lru;
3998*4882a593Smuzhiyun
3999*4882a593Smuzhiyun VM_BUG_ON((unsigned)nid >= nr_node_ids);
4000*4882a593Smuzhiyun
4001*4882a593Smuzhiyun for_each_lru(lru) {
4002*4882a593Smuzhiyun if (!(BIT(lru) & lru_mask))
4003*4882a593Smuzhiyun continue;
4004*4882a593Smuzhiyun if (tree)
4005*4882a593Smuzhiyun nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
4006*4882a593Smuzhiyun else
4007*4882a593Smuzhiyun nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
4008*4882a593Smuzhiyun }
4009*4882a593Smuzhiyun return nr;
4010*4882a593Smuzhiyun }
4011*4882a593Smuzhiyun
4012*4882a593Smuzhiyun static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
4013*4882a593Smuzhiyun unsigned int lru_mask,
4014*4882a593Smuzhiyun bool tree)
4015*4882a593Smuzhiyun {
4016*4882a593Smuzhiyun unsigned long nr = 0;
4017*4882a593Smuzhiyun enum lru_list lru;
4018*4882a593Smuzhiyun
4019*4882a593Smuzhiyun for_each_lru(lru) {
4020*4882a593Smuzhiyun if (!(BIT(lru) & lru_mask))
4021*4882a593Smuzhiyun continue;
4022*4882a593Smuzhiyun if (tree)
4023*4882a593Smuzhiyun nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
4024*4882a593Smuzhiyun else
4025*4882a593Smuzhiyun nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
4026*4882a593Smuzhiyun }
4027*4882a593Smuzhiyun return nr;
4028*4882a593Smuzhiyun }
4029*4882a593Smuzhiyun
4030*4882a593Smuzhiyun static int memcg_numa_stat_show(struct seq_file *m, void *v)
4031*4882a593Smuzhiyun {
4032*4882a593Smuzhiyun struct numa_stat {
4033*4882a593Smuzhiyun const char *name;
4034*4882a593Smuzhiyun unsigned int lru_mask;
4035*4882a593Smuzhiyun };
4036*4882a593Smuzhiyun
4037*4882a593Smuzhiyun static const struct numa_stat stats[] = {
4038*4882a593Smuzhiyun { "total", LRU_ALL },
4039*4882a593Smuzhiyun { "file", LRU_ALL_FILE },
4040*4882a593Smuzhiyun { "anon", LRU_ALL_ANON },
4041*4882a593Smuzhiyun { "unevictable", BIT(LRU_UNEVICTABLE) },
4042*4882a593Smuzhiyun };
4043*4882a593Smuzhiyun const struct numa_stat *stat;
4044*4882a593Smuzhiyun int nid;
4045*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4046*4882a593Smuzhiyun
4047*4882a593Smuzhiyun for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4048*4882a593Smuzhiyun seq_printf(m, "%s=%lu", stat->name,
4049*4882a593Smuzhiyun mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4050*4882a593Smuzhiyun false));
4051*4882a593Smuzhiyun for_each_node_state(nid, N_MEMORY)
4052*4882a593Smuzhiyun seq_printf(m, " N%d=%lu", nid,
4053*4882a593Smuzhiyun mem_cgroup_node_nr_lru_pages(memcg, nid,
4054*4882a593Smuzhiyun stat->lru_mask, false));
4055*4882a593Smuzhiyun seq_putc(m, '\n');
4056*4882a593Smuzhiyun }
4057*4882a593Smuzhiyun
4058*4882a593Smuzhiyun for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4059*4882a593Smuzhiyun
4060*4882a593Smuzhiyun seq_printf(m, "hierarchical_%s=%lu", stat->name,
4061*4882a593Smuzhiyun mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4062*4882a593Smuzhiyun true));
4063*4882a593Smuzhiyun for_each_node_state(nid, N_MEMORY)
4064*4882a593Smuzhiyun seq_printf(m, " N%d=%lu", nid,
4065*4882a593Smuzhiyun mem_cgroup_node_nr_lru_pages(memcg, nid,
4066*4882a593Smuzhiyun stat->lru_mask, true));
4067*4882a593Smuzhiyun seq_putc(m, '\n');
4068*4882a593Smuzhiyun }
4069*4882a593Smuzhiyun
4070*4882a593Smuzhiyun return 0;
4071*4882a593Smuzhiyun }
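
/*
 * Editor's note (illustrative): the memory.numa_stat output produced
 * above looks roughly like
 *
 *	total=1234 N0=1000 N1=234
 *	file=1000 N0=900 N1=100
 *	anon=200 N0=80 N1=120
 *	unevictable=34 N0=20 N1=14
 *	hierarchical_total=... N0=... N1=...
 *
 * with all values in pages, first for this cgroup alone and then
 * (hierarchical_*) aggregated over the subtree.
 */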
4072*4882a593Smuzhiyun #endif /* CONFIG_NUMA */
4073*4882a593Smuzhiyun
4074*4882a593Smuzhiyun static const unsigned int memcg1_stats[] = {
4075*4882a593Smuzhiyun NR_FILE_PAGES,
4076*4882a593Smuzhiyun NR_ANON_MAPPED,
4077*4882a593Smuzhiyun #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4078*4882a593Smuzhiyun NR_ANON_THPS,
4079*4882a593Smuzhiyun #endif
4080*4882a593Smuzhiyun NR_SHMEM,
4081*4882a593Smuzhiyun NR_FILE_MAPPED,
4082*4882a593Smuzhiyun NR_FILE_DIRTY,
4083*4882a593Smuzhiyun NR_WRITEBACK,
4084*4882a593Smuzhiyun MEMCG_SWAP,
4085*4882a593Smuzhiyun };
4086*4882a593Smuzhiyun
4087*4882a593Smuzhiyun static const char *const memcg1_stat_names[] = {
4088*4882a593Smuzhiyun "cache",
4089*4882a593Smuzhiyun "rss",
4090*4882a593Smuzhiyun #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4091*4882a593Smuzhiyun "rss_huge",
4092*4882a593Smuzhiyun #endif
4093*4882a593Smuzhiyun "shmem",
4094*4882a593Smuzhiyun "mapped_file",
4095*4882a593Smuzhiyun "dirty",
4096*4882a593Smuzhiyun "writeback",
4097*4882a593Smuzhiyun "swap",
4098*4882a593Smuzhiyun };
4099*4882a593Smuzhiyun
4100*4882a593Smuzhiyun /* Universal VM events cgroup1 shows, original sort order */
4101*4882a593Smuzhiyun static const unsigned int memcg1_events[] = {
4102*4882a593Smuzhiyun PGPGIN,
4103*4882a593Smuzhiyun PGPGOUT,
4104*4882a593Smuzhiyun PGFAULT,
4105*4882a593Smuzhiyun PGMAJFAULT,
4106*4882a593Smuzhiyun };
4107*4882a593Smuzhiyun
4108*4882a593Smuzhiyun static int memcg_stat_show(struct seq_file *m, void *v)
4109*4882a593Smuzhiyun {
4110*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4111*4882a593Smuzhiyun unsigned long memory, memsw;
4112*4882a593Smuzhiyun struct mem_cgroup *mi;
4113*4882a593Smuzhiyun unsigned int i;
4114*4882a593Smuzhiyun
4115*4882a593Smuzhiyun BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
4116*4882a593Smuzhiyun
4117*4882a593Smuzhiyun for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4118*4882a593Smuzhiyun unsigned long nr;
4119*4882a593Smuzhiyun
4120*4882a593Smuzhiyun if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4121*4882a593Smuzhiyun continue;
4122*4882a593Smuzhiyun nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4123*4882a593Smuzhiyun #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4124*4882a593Smuzhiyun if (memcg1_stats[i] == NR_ANON_THPS)
4125*4882a593Smuzhiyun nr *= HPAGE_PMD_NR;
4126*4882a593Smuzhiyun #endif
4127*4882a593Smuzhiyun seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
4128*4882a593Smuzhiyun }
4129*4882a593Smuzhiyun
4130*4882a593Smuzhiyun for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4131*4882a593Smuzhiyun seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4132*4882a593Smuzhiyun memcg_events_local(memcg, memcg1_events[i]));
4133*4882a593Smuzhiyun
4134*4882a593Smuzhiyun for (i = 0; i < NR_LRU_LISTS; i++)
4135*4882a593Smuzhiyun seq_printf(m, "%s %lu\n", lru_list_name(i),
4136*4882a593Smuzhiyun memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4137*4882a593Smuzhiyun PAGE_SIZE);
4138*4882a593Smuzhiyun
4139*4882a593Smuzhiyun /* Hierarchical information */
4140*4882a593Smuzhiyun memory = memsw = PAGE_COUNTER_MAX;
4141*4882a593Smuzhiyun for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4142*4882a593Smuzhiyun memory = min(memory, READ_ONCE(mi->memory.max));
4143*4882a593Smuzhiyun memsw = min(memsw, READ_ONCE(mi->memsw.max));
4144*4882a593Smuzhiyun }
4145*4882a593Smuzhiyun seq_printf(m, "hierarchical_memory_limit %llu\n",
4146*4882a593Smuzhiyun (u64)memory * PAGE_SIZE);
4147*4882a593Smuzhiyun if (do_memsw_account())
4148*4882a593Smuzhiyun seq_printf(m, "hierarchical_memsw_limit %llu\n",
4149*4882a593Smuzhiyun (u64)memsw * PAGE_SIZE);
4150*4882a593Smuzhiyun
4151*4882a593Smuzhiyun for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4152*4882a593Smuzhiyun unsigned long nr;
4153*4882a593Smuzhiyun
4154*4882a593Smuzhiyun if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4155*4882a593Smuzhiyun continue;
4156*4882a593Smuzhiyun nr = memcg_page_state(memcg, memcg1_stats[i]);
4157*4882a593Smuzhiyun #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4158*4882a593Smuzhiyun if (memcg1_stats[i] == NR_ANON_THPS)
4159*4882a593Smuzhiyun nr *= HPAGE_PMD_NR;
4160*4882a593Smuzhiyun #endif
4161*4882a593Smuzhiyun seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
4162*4882a593Smuzhiyun (u64)nr * PAGE_SIZE);
4163*4882a593Smuzhiyun }
4164*4882a593Smuzhiyun
4165*4882a593Smuzhiyun for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4166*4882a593Smuzhiyun seq_printf(m, "total_%s %llu\n",
4167*4882a593Smuzhiyun vm_event_name(memcg1_events[i]),
4168*4882a593Smuzhiyun (u64)memcg_events(memcg, memcg1_events[i]));
4169*4882a593Smuzhiyun
4170*4882a593Smuzhiyun for (i = 0; i < NR_LRU_LISTS; i++)
4171*4882a593Smuzhiyun seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4172*4882a593Smuzhiyun (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4173*4882a593Smuzhiyun PAGE_SIZE);
4174*4882a593Smuzhiyun
4175*4882a593Smuzhiyun #ifdef CONFIG_DEBUG_VM
4176*4882a593Smuzhiyun {
4177*4882a593Smuzhiyun pg_data_t *pgdat;
4178*4882a593Smuzhiyun struct mem_cgroup_per_node *mz;
4179*4882a593Smuzhiyun unsigned long anon_cost = 0;
4180*4882a593Smuzhiyun unsigned long file_cost = 0;
4181*4882a593Smuzhiyun
4182*4882a593Smuzhiyun for_each_online_pgdat(pgdat) {
4183*4882a593Smuzhiyun mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
4184*4882a593Smuzhiyun
4185*4882a593Smuzhiyun anon_cost += mz->lruvec.anon_cost;
4186*4882a593Smuzhiyun file_cost += mz->lruvec.file_cost;
4187*4882a593Smuzhiyun }
4188*4882a593Smuzhiyun seq_printf(m, "anon_cost %lu\n", anon_cost);
4189*4882a593Smuzhiyun seq_printf(m, "file_cost %lu\n", file_cost);
4190*4882a593Smuzhiyun }
4191*4882a593Smuzhiyun #endif
4192*4882a593Smuzhiyun
4193*4882a593Smuzhiyun return 0;
4194*4882a593Smuzhiyun }
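
/*
 * Editor's note (illustrative): memory.stat as rendered above starts
 * with local statistics in bytes (cache, rss, rss_huge, shmem,
 * mapped_file, dirty, writeback, swap), then event counts (pgpgin,
 * pgpgout, pgfault, pgmajfault), per-LRU sizes, the hierarchical
 * limits, and finally the same statistics prefixed with "total_",
 * aggregated over the whole subtree.
 */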
4195*4882a593Smuzhiyun
4196*4882a593Smuzhiyun static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4197*4882a593Smuzhiyun struct cftype *cft)
4198*4882a593Smuzhiyun {
4199*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4200*4882a593Smuzhiyun
4201*4882a593Smuzhiyun return mem_cgroup_swappiness(memcg);
4202*4882a593Smuzhiyun }
4203*4882a593Smuzhiyun
4204*4882a593Smuzhiyun static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4205*4882a593Smuzhiyun struct cftype *cft, u64 val)
4206*4882a593Smuzhiyun {
4207*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4208*4882a593Smuzhiyun
4209*4882a593Smuzhiyun if (val > 100)
4210*4882a593Smuzhiyun return -EINVAL;
4211*4882a593Smuzhiyun
4212*4882a593Smuzhiyun if (css->parent)
4213*4882a593Smuzhiyun memcg->swappiness = val;
4214*4882a593Smuzhiyun else
4215*4882a593Smuzhiyun vm_swappiness = val;
4216*4882a593Smuzhiyun
4217*4882a593Smuzhiyun return 0;
4218*4882a593Smuzhiyun }
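
/*
 * Editor's note (illustrative): memory.swappiness takes a value in the
 * range 0-100, e.g.
 *
 *	# echo 10 > /sys/fs/cgroup/memory/foo/memory.swappiness
 *
 * For the root cgroup the write updates the global vm_swappiness
 * instead of a per-memcg value.
 */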
4219*4882a593Smuzhiyun
4220*4882a593Smuzhiyun static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4221*4882a593Smuzhiyun {
4222*4882a593Smuzhiyun struct mem_cgroup_threshold_ary *t;
4223*4882a593Smuzhiyun unsigned long usage;
4224*4882a593Smuzhiyun int i;
4225*4882a593Smuzhiyun
4226*4882a593Smuzhiyun rcu_read_lock();
4227*4882a593Smuzhiyun if (!swap)
4228*4882a593Smuzhiyun t = rcu_dereference(memcg->thresholds.primary);
4229*4882a593Smuzhiyun else
4230*4882a593Smuzhiyun t = rcu_dereference(memcg->memsw_thresholds.primary);
4231*4882a593Smuzhiyun
4232*4882a593Smuzhiyun if (!t)
4233*4882a593Smuzhiyun goto unlock;
4234*4882a593Smuzhiyun
4235*4882a593Smuzhiyun usage = mem_cgroup_usage(memcg, swap);
4236*4882a593Smuzhiyun
4237*4882a593Smuzhiyun /*
4238*4882a593Smuzhiyun * current_threshold points to the threshold just below or equal to
4239*4882a593Smuzhiyun * usage. If that is not the case, a threshold was crossed after the
4240*4882a593Smuzhiyun * last call of __mem_cgroup_threshold().
4241*4882a593Smuzhiyun */
4242*4882a593Smuzhiyun i = t->current_threshold;
4243*4882a593Smuzhiyun
4244*4882a593Smuzhiyun /*
4245*4882a593Smuzhiyun * Iterate backward over array of thresholds starting from
4246*4882a593Smuzhiyun * current_threshold and check if a threshold is crossed.
4247*4882a593Smuzhiyun * If none of thresholds below usage is crossed, we read
4248*4882a593Smuzhiyun * only one element of the array here.
4249*4882a593Smuzhiyun */
4250*4882a593Smuzhiyun for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4251*4882a593Smuzhiyun eventfd_signal(t->entries[i].eventfd, 1);
4252*4882a593Smuzhiyun
4253*4882a593Smuzhiyun /* i = current_threshold + 1 */
4254*4882a593Smuzhiyun i++;
4255*4882a593Smuzhiyun
4256*4882a593Smuzhiyun /*
4257*4882a593Smuzhiyun * Iterate forward over array of thresholds starting from
4258*4882a593Smuzhiyun * current_threshold+1 and check if a threshold is crossed.
4259*4882a593Smuzhiyun * If none of thresholds above usage is crossed, we read
4260*4882a593Smuzhiyun * only one element of the array here.
4261*4882a593Smuzhiyun */
4262*4882a593Smuzhiyun for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4263*4882a593Smuzhiyun eventfd_signal(t->entries[i].eventfd, 1);
4264*4882a593Smuzhiyun
4265*4882a593Smuzhiyun /* Update current_threshold */
4266*4882a593Smuzhiyun t->current_threshold = i - 1;
4267*4882a593Smuzhiyun unlock:
4268*4882a593Smuzhiyun rcu_read_unlock();
4269*4882a593Smuzhiyun }
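
/*
 * Editor's note (worked example, illustrative): with thresholds
 * {4M, 8M, 16M}, current_threshold at 8M and usage at 10M, neither scan
 * above signals anything.  If usage then grows to 16M, the forward scan
 * signals the 16M eventfd and current_threshold moves up to it; if
 * usage later drops to 6M, the backward scan signals 16M and 8M and
 * current_threshold settles back on 4M.  Crossing a threshold in either
 * direction therefore wakes its listeners.
 */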
4270*4882a593Smuzhiyun
4271*4882a593Smuzhiyun static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4272*4882a593Smuzhiyun {
4273*4882a593Smuzhiyun while (memcg) {
4274*4882a593Smuzhiyun __mem_cgroup_threshold(memcg, false);
4275*4882a593Smuzhiyun if (do_memsw_account())
4276*4882a593Smuzhiyun __mem_cgroup_threshold(memcg, true);
4277*4882a593Smuzhiyun
4278*4882a593Smuzhiyun memcg = parent_mem_cgroup(memcg);
4279*4882a593Smuzhiyun }
4280*4882a593Smuzhiyun }
4281*4882a593Smuzhiyun
4282*4882a593Smuzhiyun static int compare_thresholds(const void *a, const void *b)
4283*4882a593Smuzhiyun {
4284*4882a593Smuzhiyun const struct mem_cgroup_threshold *_a = a;
4285*4882a593Smuzhiyun const struct mem_cgroup_threshold *_b = b;
4286*4882a593Smuzhiyun
4287*4882a593Smuzhiyun if (_a->threshold > _b->threshold)
4288*4882a593Smuzhiyun return 1;
4289*4882a593Smuzhiyun
4290*4882a593Smuzhiyun if (_a->threshold < _b->threshold)
4291*4882a593Smuzhiyun return -1;
4292*4882a593Smuzhiyun
4293*4882a593Smuzhiyun return 0;
4294*4882a593Smuzhiyun }
4295*4882a593Smuzhiyun
4296*4882a593Smuzhiyun static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4297*4882a593Smuzhiyun {
4298*4882a593Smuzhiyun struct mem_cgroup_eventfd_list *ev;
4299*4882a593Smuzhiyun
4300*4882a593Smuzhiyun spin_lock(&memcg_oom_lock);
4301*4882a593Smuzhiyun
4302*4882a593Smuzhiyun list_for_each_entry(ev, &memcg->oom_notify, list)
4303*4882a593Smuzhiyun eventfd_signal(ev->eventfd, 1);
4304*4882a593Smuzhiyun
4305*4882a593Smuzhiyun spin_unlock(&memcg_oom_lock);
4306*4882a593Smuzhiyun return 0;
4307*4882a593Smuzhiyun }
4308*4882a593Smuzhiyun
4309*4882a593Smuzhiyun static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4310*4882a593Smuzhiyun {
4311*4882a593Smuzhiyun struct mem_cgroup *iter;
4312*4882a593Smuzhiyun
4313*4882a593Smuzhiyun for_each_mem_cgroup_tree(iter, memcg)
4314*4882a593Smuzhiyun mem_cgroup_oom_notify_cb(iter);
4315*4882a593Smuzhiyun }
4316*4882a593Smuzhiyun
4317*4882a593Smuzhiyun static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4318*4882a593Smuzhiyun struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4319*4882a593Smuzhiyun {
4320*4882a593Smuzhiyun struct mem_cgroup_thresholds *thresholds;
4321*4882a593Smuzhiyun struct mem_cgroup_threshold_ary *new;
4322*4882a593Smuzhiyun unsigned long threshold;
4323*4882a593Smuzhiyun unsigned long usage;
4324*4882a593Smuzhiyun int i, size, ret;
4325*4882a593Smuzhiyun
4326*4882a593Smuzhiyun ret = page_counter_memparse(args, "-1", &threshold);
4327*4882a593Smuzhiyun if (ret)
4328*4882a593Smuzhiyun return ret;
4329*4882a593Smuzhiyun
4330*4882a593Smuzhiyun mutex_lock(&memcg->thresholds_lock);
4331*4882a593Smuzhiyun
4332*4882a593Smuzhiyun if (type == _MEM) {
4333*4882a593Smuzhiyun thresholds = &memcg->thresholds;
4334*4882a593Smuzhiyun usage = mem_cgroup_usage(memcg, false);
4335*4882a593Smuzhiyun } else if (type == _MEMSWAP) {
4336*4882a593Smuzhiyun thresholds = &memcg->memsw_thresholds;
4337*4882a593Smuzhiyun usage = mem_cgroup_usage(memcg, true);
4338*4882a593Smuzhiyun } else
4339*4882a593Smuzhiyun BUG();
4340*4882a593Smuzhiyun
4341*4882a593Smuzhiyun /* Check if a threshold crossed before adding a new one */
4342*4882a593Smuzhiyun if (thresholds->primary)
4343*4882a593Smuzhiyun __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4344*4882a593Smuzhiyun
4345*4882a593Smuzhiyun size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4346*4882a593Smuzhiyun
4347*4882a593Smuzhiyun /* Allocate memory for new array of thresholds */
4348*4882a593Smuzhiyun new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4349*4882a593Smuzhiyun if (!new) {
4350*4882a593Smuzhiyun ret = -ENOMEM;
4351*4882a593Smuzhiyun goto unlock;
4352*4882a593Smuzhiyun }
4353*4882a593Smuzhiyun new->size = size;
4354*4882a593Smuzhiyun
4355*4882a593Smuzhiyun /* Copy thresholds (if any) to new array */
4356*4882a593Smuzhiyun if (thresholds->primary)
4357*4882a593Smuzhiyun memcpy(new->entries, thresholds->primary->entries,
4358*4882a593Smuzhiyun flex_array_size(new, entries, size - 1));
4359*4882a593Smuzhiyun
4360*4882a593Smuzhiyun /* Add new threshold */
4361*4882a593Smuzhiyun new->entries[size - 1].eventfd = eventfd;
4362*4882a593Smuzhiyun new->entries[size - 1].threshold = threshold;
4363*4882a593Smuzhiyun
4364*4882a593Smuzhiyun /* Sort thresholds. Registering of new threshold isn't time-critical */
4365*4882a593Smuzhiyun sort(new->entries, size, sizeof(*new->entries),
4366*4882a593Smuzhiyun compare_thresholds, NULL);
4367*4882a593Smuzhiyun
4368*4882a593Smuzhiyun /* Find current threshold */
4369*4882a593Smuzhiyun new->current_threshold = -1;
4370*4882a593Smuzhiyun for (i = 0; i < size; i++) {
4371*4882a593Smuzhiyun if (new->entries[i].threshold <= usage) {
4372*4882a593Smuzhiyun /*
4373*4882a593Smuzhiyun * new->current_threshold will not be used until
4374*4882a593Smuzhiyun * rcu_assign_pointer(), so it's safe to increment
4375*4882a593Smuzhiyun * it here.
4376*4882a593Smuzhiyun */
4377*4882a593Smuzhiyun ++new->current_threshold;
4378*4882a593Smuzhiyun } else
4379*4882a593Smuzhiyun break;
4380*4882a593Smuzhiyun }
4381*4882a593Smuzhiyun
4382*4882a593Smuzhiyun /* Free old spare buffer and save old primary buffer as spare */
4383*4882a593Smuzhiyun kfree(thresholds->spare);
4384*4882a593Smuzhiyun thresholds->spare = thresholds->primary;
4385*4882a593Smuzhiyun
4386*4882a593Smuzhiyun rcu_assign_pointer(thresholds->primary, new);
4387*4882a593Smuzhiyun
4388*4882a593Smuzhiyun /* To be sure that nobody uses thresholds */
4389*4882a593Smuzhiyun synchronize_rcu();
4390*4882a593Smuzhiyun
4391*4882a593Smuzhiyun unlock:
4392*4882a593Smuzhiyun mutex_unlock(&memcg->thresholds_lock);
4393*4882a593Smuzhiyun
4394*4882a593Smuzhiyun return ret;
4395*4882a593Smuzhiyun }
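
/*
 * Editor's note (illustrative): thresholds are registered through the
 * cgroup v1 event interface, roughly
 *
 *	efd = eventfd(0, 0);
 *	ufd = open("memory.usage_in_bytes", O_RDONLY);
 *	dprintf(ecfd, "%d %d %llu", efd, ufd, 16ULL << 20);
 *
 * where ecfd is an open fd of cgroup.event_control in the same cgroup.
 * A read() on efd then completes whenever usage crosses 16M in either
 * direction.
 */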
4396*4882a593Smuzhiyun
4397*4882a593Smuzhiyun static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4398*4882a593Smuzhiyun struct eventfd_ctx *eventfd, const char *args)
4399*4882a593Smuzhiyun {
4400*4882a593Smuzhiyun return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4401*4882a593Smuzhiyun }
4402*4882a593Smuzhiyun
4403*4882a593Smuzhiyun static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4404*4882a593Smuzhiyun struct eventfd_ctx *eventfd, const char *args)
4405*4882a593Smuzhiyun {
4406*4882a593Smuzhiyun return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4407*4882a593Smuzhiyun }
4408*4882a593Smuzhiyun
4409*4882a593Smuzhiyun static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4410*4882a593Smuzhiyun struct eventfd_ctx *eventfd, enum res_type type)
4411*4882a593Smuzhiyun {
4412*4882a593Smuzhiyun struct mem_cgroup_thresholds *thresholds;
4413*4882a593Smuzhiyun struct mem_cgroup_threshold_ary *new;
4414*4882a593Smuzhiyun unsigned long usage;
4415*4882a593Smuzhiyun int i, j, size, entries;
4416*4882a593Smuzhiyun
4417*4882a593Smuzhiyun mutex_lock(&memcg->thresholds_lock);
4418*4882a593Smuzhiyun
4419*4882a593Smuzhiyun if (type == _MEM) {
4420*4882a593Smuzhiyun thresholds = &memcg->thresholds;
4421*4882a593Smuzhiyun usage = mem_cgroup_usage(memcg, false);
4422*4882a593Smuzhiyun } else if (type == _MEMSWAP) {
4423*4882a593Smuzhiyun thresholds = &memcg->memsw_thresholds;
4424*4882a593Smuzhiyun usage = mem_cgroup_usage(memcg, true);
4425*4882a593Smuzhiyun } else
4426*4882a593Smuzhiyun BUG();
4427*4882a593Smuzhiyun
4428*4882a593Smuzhiyun if (!thresholds->primary)
4429*4882a593Smuzhiyun goto unlock;
4430*4882a593Smuzhiyun
4431*4882a593Smuzhiyun /* Check if a threshold crossed before removing */
4432*4882a593Smuzhiyun __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4433*4882a593Smuzhiyun
4434*4882a593Smuzhiyun /* Calculate the new number of thresholds */
4435*4882a593Smuzhiyun size = entries = 0;
4436*4882a593Smuzhiyun for (i = 0; i < thresholds->primary->size; i++) {
4437*4882a593Smuzhiyun if (thresholds->primary->entries[i].eventfd != eventfd)
4438*4882a593Smuzhiyun size++;
4439*4882a593Smuzhiyun else
4440*4882a593Smuzhiyun entries++;
4441*4882a593Smuzhiyun }
4442*4882a593Smuzhiyun
4443*4882a593Smuzhiyun new = thresholds->spare;
4444*4882a593Smuzhiyun
4445*4882a593Smuzhiyun /* If no items related to eventfd have been cleared, nothing to do */
4446*4882a593Smuzhiyun if (!entries)
4447*4882a593Smuzhiyun goto unlock;
4448*4882a593Smuzhiyun
4449*4882a593Smuzhiyun /* Set thresholds array to NULL if we don't have thresholds */
4450*4882a593Smuzhiyun if (!size) {
4451*4882a593Smuzhiyun kfree(new);
4452*4882a593Smuzhiyun new = NULL;
4453*4882a593Smuzhiyun goto swap_buffers;
4454*4882a593Smuzhiyun }
4455*4882a593Smuzhiyun
4456*4882a593Smuzhiyun new->size = size;
4457*4882a593Smuzhiyun
4458*4882a593Smuzhiyun /* Copy thresholds and find current threshold */
4459*4882a593Smuzhiyun new->current_threshold = -1;
4460*4882a593Smuzhiyun for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4461*4882a593Smuzhiyun if (thresholds->primary->entries[i].eventfd == eventfd)
4462*4882a593Smuzhiyun continue;
4463*4882a593Smuzhiyun
4464*4882a593Smuzhiyun new->entries[j] = thresholds->primary->entries[i];
4465*4882a593Smuzhiyun if (new->entries[j].threshold <= usage) {
4466*4882a593Smuzhiyun /*
4467*4882a593Smuzhiyun * new->current_threshold will not be used
4468*4882a593Smuzhiyun * until rcu_assign_pointer(), so it's safe to increment
4469*4882a593Smuzhiyun * it here.
4470*4882a593Smuzhiyun */
4471*4882a593Smuzhiyun ++new->current_threshold;
4472*4882a593Smuzhiyun }
4473*4882a593Smuzhiyun j++;
4474*4882a593Smuzhiyun }
4475*4882a593Smuzhiyun
4476*4882a593Smuzhiyun swap_buffers:
4477*4882a593Smuzhiyun /* Swap primary and spare array */
4478*4882a593Smuzhiyun thresholds->spare = thresholds->primary;
4479*4882a593Smuzhiyun
4480*4882a593Smuzhiyun rcu_assign_pointer(thresholds->primary, new);
4481*4882a593Smuzhiyun
4482*4882a593Smuzhiyun /* To be sure that nobody uses thresholds */
4483*4882a593Smuzhiyun synchronize_rcu();
4484*4882a593Smuzhiyun
4485*4882a593Smuzhiyun /* If all events are unregistered, free the spare array */
4486*4882a593Smuzhiyun if (!new) {
4487*4882a593Smuzhiyun kfree(thresholds->spare);
4488*4882a593Smuzhiyun thresholds->spare = NULL;
4489*4882a593Smuzhiyun }
4490*4882a593Smuzhiyun unlock:
4491*4882a593Smuzhiyun mutex_unlock(&memcg->thresholds_lock);
4492*4882a593Smuzhiyun }
4493*4882a593Smuzhiyun
4494*4882a593Smuzhiyun static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4495*4882a593Smuzhiyun struct eventfd_ctx *eventfd)
4496*4882a593Smuzhiyun {
4497*4882a593Smuzhiyun return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4498*4882a593Smuzhiyun }
4499*4882a593Smuzhiyun
4500*4882a593Smuzhiyun static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4501*4882a593Smuzhiyun struct eventfd_ctx *eventfd)
4502*4882a593Smuzhiyun {
4503*4882a593Smuzhiyun return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4504*4882a593Smuzhiyun }
4505*4882a593Smuzhiyun
4506*4882a593Smuzhiyun static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4507*4882a593Smuzhiyun struct eventfd_ctx *eventfd, const char *args)
4508*4882a593Smuzhiyun {
4509*4882a593Smuzhiyun struct mem_cgroup_eventfd_list *event;
4510*4882a593Smuzhiyun
4511*4882a593Smuzhiyun event = kmalloc(sizeof(*event), GFP_KERNEL);
4512*4882a593Smuzhiyun if (!event)
4513*4882a593Smuzhiyun return -ENOMEM;
4514*4882a593Smuzhiyun
4515*4882a593Smuzhiyun spin_lock(&memcg_oom_lock);
4516*4882a593Smuzhiyun
4517*4882a593Smuzhiyun event->eventfd = eventfd;
4518*4882a593Smuzhiyun list_add(&event->list, &memcg->oom_notify);
4519*4882a593Smuzhiyun
4520*4882a593Smuzhiyun /* already in OOM ? */
4521*4882a593Smuzhiyun if (memcg->under_oom)
4522*4882a593Smuzhiyun eventfd_signal(eventfd, 1);
4523*4882a593Smuzhiyun spin_unlock(&memcg_oom_lock);
4524*4882a593Smuzhiyun
4525*4882a593Smuzhiyun return 0;
4526*4882a593Smuzhiyun }
4527*4882a593Smuzhiyun
4528*4882a593Smuzhiyun static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4529*4882a593Smuzhiyun struct eventfd_ctx *eventfd)
4530*4882a593Smuzhiyun {
4531*4882a593Smuzhiyun struct mem_cgroup_eventfd_list *ev, *tmp;
4532*4882a593Smuzhiyun
4533*4882a593Smuzhiyun spin_lock(&memcg_oom_lock);
4534*4882a593Smuzhiyun
4535*4882a593Smuzhiyun list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4536*4882a593Smuzhiyun if (ev->eventfd == eventfd) {
4537*4882a593Smuzhiyun list_del(&ev->list);
4538*4882a593Smuzhiyun kfree(ev);
4539*4882a593Smuzhiyun }
4540*4882a593Smuzhiyun }
4541*4882a593Smuzhiyun
4542*4882a593Smuzhiyun spin_unlock(&memcg_oom_lock);
4543*4882a593Smuzhiyun }
4544*4882a593Smuzhiyun
4545*4882a593Smuzhiyun static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4546*4882a593Smuzhiyun {
4547*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4548*4882a593Smuzhiyun
4549*4882a593Smuzhiyun seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4550*4882a593Smuzhiyun seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4551*4882a593Smuzhiyun seq_printf(sf, "oom_kill %lu\n",
4552*4882a593Smuzhiyun atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4553*4882a593Smuzhiyun return 0;
4554*4882a593Smuzhiyun }
4555*4882a593Smuzhiyun
4556*4882a593Smuzhiyun static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4557*4882a593Smuzhiyun struct cftype *cft, u64 val)
4558*4882a593Smuzhiyun {
4559*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4560*4882a593Smuzhiyun
4561*4882a593Smuzhiyun /* cannot set to root cgroup and only 0 and 1 are allowed */
4562*4882a593Smuzhiyun if (!css->parent || !((val == 0) || (val == 1)))
4563*4882a593Smuzhiyun return -EINVAL;
4564*4882a593Smuzhiyun
4565*4882a593Smuzhiyun memcg->oom_kill_disable = val;
4566*4882a593Smuzhiyun if (!val)
4567*4882a593Smuzhiyun memcg_oom_recover(memcg);
4568*4882a593Smuzhiyun
4569*4882a593Smuzhiyun return 0;
4570*4882a593Smuzhiyun }
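
/*
 * Editor's note (illustrative): OOM behaviour is controlled and
 * observed through memory.oom_control:
 *
 *	# echo 1 > memory.oom_control		(disable the OOM killer)
 *	# cat memory.oom_control
 *	oom_kill_disable 1
 *	under_oom 0
 *	oom_kill 0
 *
 * An eventfd registered against memory.oom_control via
 * cgroup.event_control (see the usage-event helpers above) is signalled
 * whenever the cgroup enters OOM.
 */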
4571*4882a593Smuzhiyun
4572*4882a593Smuzhiyun #ifdef CONFIG_CGROUP_WRITEBACK
4573*4882a593Smuzhiyun
4574*4882a593Smuzhiyun #include <trace/events/writeback.h>
4575*4882a593Smuzhiyun
4576*4882a593Smuzhiyun static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4577*4882a593Smuzhiyun {
4578*4882a593Smuzhiyun return wb_domain_init(&memcg->cgwb_domain, gfp);
4579*4882a593Smuzhiyun }
4580*4882a593Smuzhiyun
4581*4882a593Smuzhiyun static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4582*4882a593Smuzhiyun {
4583*4882a593Smuzhiyun wb_domain_exit(&memcg->cgwb_domain);
4584*4882a593Smuzhiyun }
4585*4882a593Smuzhiyun
4586*4882a593Smuzhiyun static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4587*4882a593Smuzhiyun {
4588*4882a593Smuzhiyun wb_domain_size_changed(&memcg->cgwb_domain);
4589*4882a593Smuzhiyun }
4590*4882a593Smuzhiyun
4591*4882a593Smuzhiyun struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4592*4882a593Smuzhiyun {
4593*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4594*4882a593Smuzhiyun
4595*4882a593Smuzhiyun if (!memcg->css.parent)
4596*4882a593Smuzhiyun return NULL;
4597*4882a593Smuzhiyun
4598*4882a593Smuzhiyun return &memcg->cgwb_domain;
4599*4882a593Smuzhiyun }
4600*4882a593Smuzhiyun
4601*4882a593Smuzhiyun /*
4602*4882a593Smuzhiyun * idx can be of type enum memcg_stat_item or node_stat_item.
4603*4882a593Smuzhiyun * Keep in sync with memcg_exact_page().
4604*4882a593Smuzhiyun */
4605*4882a593Smuzhiyun static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
4606*4882a593Smuzhiyun {
4607*4882a593Smuzhiyun long x = atomic_long_read(&memcg->vmstats[idx]);
4608*4882a593Smuzhiyun int cpu;
4609*4882a593Smuzhiyun
4610*4882a593Smuzhiyun for_each_online_cpu(cpu)
4611*4882a593Smuzhiyun x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
4612*4882a593Smuzhiyun if (x < 0)
4613*4882a593Smuzhiyun x = 0;
4614*4882a593Smuzhiyun return x;
4615*4882a593Smuzhiyun }
4616*4882a593Smuzhiyun
4617*4882a593Smuzhiyun /**
4618*4882a593Smuzhiyun * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
4619*4882a593Smuzhiyun * @wb: bdi_writeback in question
4620*4882a593Smuzhiyun * @pfilepages: out parameter for number of file pages
4621*4882a593Smuzhiyun * @pheadroom: out parameter for number of allocatable pages according to memcg
4622*4882a593Smuzhiyun * @pdirty: out parameter for number of dirty pages
4623*4882a593Smuzhiyun * @pwriteback: out parameter for number of pages under writeback
4624*4882a593Smuzhiyun *
4625*4882a593Smuzhiyun * Determine the numbers of file, headroom, dirty, and writeback pages in
4626*4882a593Smuzhiyun * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
4627*4882a593Smuzhiyun * is a bit more involved.
4628*4882a593Smuzhiyun *
4629*4882a593Smuzhiyun * A memcg's headroom is "min(max, high) - used". In the hierarchy, the
4630*4882a593Smuzhiyun * headroom is calculated as the lowest headroom of itself and the
4631*4882a593Smuzhiyun * ancestors. Note that this doesn't consider the actual amount of
4632*4882a593Smuzhiyun * available memory in the system. The caller should further cap
4633*4882a593Smuzhiyun * *@pheadroom accordingly.
4634*4882a593Smuzhiyun */
4635*4882a593Smuzhiyun void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4636*4882a593Smuzhiyun unsigned long *pheadroom, unsigned long *pdirty,
4637*4882a593Smuzhiyun unsigned long *pwriteback)
4638*4882a593Smuzhiyun {
4639*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4640*4882a593Smuzhiyun struct mem_cgroup *parent;
4641*4882a593Smuzhiyun
4642*4882a593Smuzhiyun *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
4643*4882a593Smuzhiyun
4644*4882a593Smuzhiyun *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
4645*4882a593Smuzhiyun *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4646*4882a593Smuzhiyun memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
4647*4882a593Smuzhiyun *pheadroom = PAGE_COUNTER_MAX;
4648*4882a593Smuzhiyun
4649*4882a593Smuzhiyun while ((parent = parent_mem_cgroup(memcg))) {
4650*4882a593Smuzhiyun unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4651*4882a593Smuzhiyun READ_ONCE(memcg->memory.high));
4652*4882a593Smuzhiyun unsigned long used = page_counter_read(&memcg->memory);
4653*4882a593Smuzhiyun
4654*4882a593Smuzhiyun *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4655*4882a593Smuzhiyun memcg = parent;
4656*4882a593Smuzhiyun }
4657*4882a593Smuzhiyun }
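/*
 * Worked headroom example (made-up numbers): memcg A (max = 1G,
 * usage = 600M) with child B (high = 512M, usage = 300M) and @wb
 * belonging to B.  The loop above yields
 *
 *   B: ceiling = min(max, high) = 512M, headroom = 512M - 300M = 212M
 *   A: ceiling = 1G,                    headroom = 1G  - 600M = 424M
 *
 * so *@pheadroom = min(212M, 424M) = 212M, which the caller still caps
 * against globally available memory as noted above.
 */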
4658*4882a593Smuzhiyun
4659*4882a593Smuzhiyun /*
4660*4882a593Smuzhiyun * Foreign dirty flushing
4661*4882a593Smuzhiyun *
4662*4882a593Smuzhiyun * There's an inherent mismatch between memcg and writeback. The former
4663*4882a593Smuzhiyun * tracks ownership per-page while the latter per-inode. This was a
4664*4882a593Smuzhiyun * deliberate design decision because honoring per-page ownership in the
4665*4882a593Smuzhiyun * writeback path is complicated, may lead to higher CPU and IO overheads
4666*4882a593Smuzhiyun * and deemed unnecessary given that write-sharing an inode across
4667*4882a593Smuzhiyun * different cgroups isn't a common use-case.
4668*4882a593Smuzhiyun *
4669*4882a593Smuzhiyun * Combined with inode majority-writer ownership switching, this works well
4670*4882a593Smuzhiyun * enough in most cases but there are some pathological cases. For
4671*4882a593Smuzhiyun * example, let's say there are two cgroups A and B which keep writing to
4672*4882a593Smuzhiyun * different but confined parts of the same inode. B owns the inode and
4673*4882a593Smuzhiyun * A's memory is limited far below B's. A's dirty ratio can rise enough to
4674*4882a593Smuzhiyun * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4675*4882a593Smuzhiyun * triggering background writeback. A will be slowed down without a way to
4676*4882a593Smuzhiyun * make writeback of the dirty pages happen.
4677*4882a593Smuzhiyun *
4678*4882a593Smuzhiyun * Conditions like the above can lead to a cgroup getting repeatedly and
4679*4882a593Smuzhiyun * severely throttled after making some progress after each
4680*4882a593Smuzhiyun * dirty_expire_interval while the underlying IO device is almost
4681*4882a593Smuzhiyun * completely idle.
4682*4882a593Smuzhiyun *
4683*4882a593Smuzhiyun * Solving this problem completely requires matching the ownership tracking
4684*4882a593Smuzhiyun * granularities between memcg and writeback in either direction. However,
4685*4882a593Smuzhiyun * the more egregious behaviors can be avoided by simply remembering the
4686*4882a593Smuzhiyun * most recent foreign dirtying events and initiating remote flushes on
4687*4882a593Smuzhiyun * them when local writeback isn't enough to keep the memory clean enough.
4688*4882a593Smuzhiyun *
4689*4882a593Smuzhiyun * The following two functions implement such mechanism. When a foreign
4690*4882a593Smuzhiyun * page - a page whose memcg and writeback ownerships don't match - is
4691*4882a593Smuzhiyun * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4692*4882a593Smuzhiyun * bdi_writeback on the page owning memcg. When balance_dirty_pages()
4693*4882a593Smuzhiyun * decides that the memcg needs to sleep due to high dirty ratio, it calls
4694*4882a593Smuzhiyun * mem_cgroup_flush_foreign() which queues writeback on the recorded
4695*4882a593Smuzhiyun * foreign bdi_writebacks which haven't expired. Both the numbers of
4696*4882a593Smuzhiyun * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4697*4882a593Smuzhiyun * limited to MEMCG_CGWB_FRN_CNT.
4698*4882a593Smuzhiyun *
4699*4882a593Smuzhiyun * The mechanism only remembers IDs and doesn't hold any object references.
4700*4882a593Smuzhiyun * As being wrong occasionally doesn't matter, updates and accesses to the
4701*4882a593Smuzhiyun * records are lockless and racy.
4702*4882a593Smuzhiyun */
4703*4882a593Smuzhiyun void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4704*4882a593Smuzhiyun struct bdi_writeback *wb)
4705*4882a593Smuzhiyun {
4706*4882a593Smuzhiyun struct mem_cgroup *memcg = page->mem_cgroup;
4707*4882a593Smuzhiyun struct memcg_cgwb_frn *frn;
4708*4882a593Smuzhiyun u64 now = get_jiffies_64();
4709*4882a593Smuzhiyun u64 oldest_at = now;
4710*4882a593Smuzhiyun int oldest = -1;
4711*4882a593Smuzhiyun int i;
4712*4882a593Smuzhiyun
4713*4882a593Smuzhiyun trace_track_foreign_dirty(page, wb);
4714*4882a593Smuzhiyun
4715*4882a593Smuzhiyun /*
4716*4882a593Smuzhiyun * Pick the slot to use. If there is already a slot for @wb, keep
4717*4882a593Smuzhiyun * using it. If not, replace the oldest one which isn't being
4718*4882a593Smuzhiyun * written out.
4719*4882a593Smuzhiyun */
4720*4882a593Smuzhiyun for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4721*4882a593Smuzhiyun frn = &memcg->cgwb_frn[i];
4722*4882a593Smuzhiyun if (frn->bdi_id == wb->bdi->id &&
4723*4882a593Smuzhiyun frn->memcg_id == wb->memcg_css->id)
4724*4882a593Smuzhiyun break;
4725*4882a593Smuzhiyun if (time_before64(frn->at, oldest_at) &&
4726*4882a593Smuzhiyun atomic_read(&frn->done.cnt) == 1) {
4727*4882a593Smuzhiyun oldest = i;
4728*4882a593Smuzhiyun oldest_at = frn->at;
4729*4882a593Smuzhiyun }
4730*4882a593Smuzhiyun }
4731*4882a593Smuzhiyun
4732*4882a593Smuzhiyun if (i < MEMCG_CGWB_FRN_CNT) {
4733*4882a593Smuzhiyun /*
4734*4882a593Smuzhiyun * Re-using an existing one. Update timestamp lazily to
4735*4882a593Smuzhiyun * avoid making the cacheline hot. We want them to be
4736*4882a593Smuzhiyun * reasonably up-to-date and significantly shorter than
4737*4882a593Smuzhiyun * dirty_expire_interval as that's what expires the record.
4738*4882a593Smuzhiyun * Use the shorter of 1s and dirty_expire_interval / 8.
4739*4882a593Smuzhiyun */
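/* dirty_expire_interval is in centiseconds; "* 10" below converts to msecs */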
4740*4882a593Smuzhiyun unsigned long update_intv =
4741*4882a593Smuzhiyun min_t(unsigned long, HZ,
4742*4882a593Smuzhiyun msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4743*4882a593Smuzhiyun
4744*4882a593Smuzhiyun if (time_before64(frn->at, now - update_intv))
4745*4882a593Smuzhiyun frn->at = now;
4746*4882a593Smuzhiyun } else if (oldest >= 0) {
4747*4882a593Smuzhiyun /* replace the oldest free one */
4748*4882a593Smuzhiyun frn = &memcg->cgwb_frn[oldest];
4749*4882a593Smuzhiyun frn->bdi_id = wb->bdi->id;
4750*4882a593Smuzhiyun frn->memcg_id = wb->memcg_css->id;
4751*4882a593Smuzhiyun frn->at = now;
4752*4882a593Smuzhiyun }
4753*4882a593Smuzhiyun }
4754*4882a593Smuzhiyun
4755*4882a593Smuzhiyun /* issue foreign writeback flushes for recorded foreign dirtying events */
4756*4882a593Smuzhiyun void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4757*4882a593Smuzhiyun {
4758*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4759*4882a593Smuzhiyun unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4760*4882a593Smuzhiyun u64 now = jiffies_64;
4761*4882a593Smuzhiyun int i;
4762*4882a593Smuzhiyun
4763*4882a593Smuzhiyun for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4764*4882a593Smuzhiyun struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4765*4882a593Smuzhiyun
4766*4882a593Smuzhiyun /*
4767*4882a593Smuzhiyun * If the record is older than dirty_expire_interval,
4768*4882a593Smuzhiyun * writeback on it has already started. No need to kick it
4769*4882a593Smuzhiyun * off again. Also, don't start a new one if there's
4770*4882a593Smuzhiyun * already one in flight.
4771*4882a593Smuzhiyun */
4772*4882a593Smuzhiyun if (time_after64(frn->at, now - intv) &&
4773*4882a593Smuzhiyun atomic_read(&frn->done.cnt) == 1) {
4774*4882a593Smuzhiyun frn->at = 0;
4775*4882a593Smuzhiyun trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4776*4882a593Smuzhiyun cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4777*4882a593Smuzhiyun WB_REASON_FOREIGN_FLUSH,
4778*4882a593Smuzhiyun &frn->done);
4779*4882a593Smuzhiyun }
4780*4882a593Smuzhiyun }
4781*4882a593Smuzhiyun }
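/*
 * Rough sequence of events for the mechanism above (cgroups A and B as in
 * the example): a task in memcg A dirties a page of an inode whose wb is
 * owned by B, so account_page_dirtied() ends up calling
 * mem_cgroup_track_foreign_dirty_slowpath(page, wb_B), which records
 * (wb_B->bdi->id, B's css id) in A's cgwb_frn[].  When A's writer is later
 * throttled, balance_dirty_pages() calls mem_cgroup_flush_foreign(wb_A),
 * which issues cgroup_writeback_by_id() on the recorded IDs so that B's wb
 * writes A's dirty pages back.
 */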
4782*4882a593Smuzhiyun
4783*4882a593Smuzhiyun #else /* CONFIG_CGROUP_WRITEBACK */
4784*4882a593Smuzhiyun
4785*4882a593Smuzhiyun static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4786*4882a593Smuzhiyun {
4787*4882a593Smuzhiyun return 0;
4788*4882a593Smuzhiyun }
4789*4882a593Smuzhiyun
4790*4882a593Smuzhiyun static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4791*4882a593Smuzhiyun {
4792*4882a593Smuzhiyun }
4793*4882a593Smuzhiyun
4794*4882a593Smuzhiyun static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4795*4882a593Smuzhiyun {
4796*4882a593Smuzhiyun }
4797*4882a593Smuzhiyun
4798*4882a593Smuzhiyun #endif /* CONFIG_CGROUP_WRITEBACK */
4799*4882a593Smuzhiyun
4800*4882a593Smuzhiyun /*
4801*4882a593Smuzhiyun * DO NOT USE IN NEW FILES.
4802*4882a593Smuzhiyun *
4803*4882a593Smuzhiyun * "cgroup.event_control" implementation.
4804*4882a593Smuzhiyun *
4805*4882a593Smuzhiyun * This is way over-engineered. It tries to support fully configurable
4806*4882a593Smuzhiyun * events for each user. Such level of flexibility is completely
4807*4882a593Smuzhiyun * unnecessary especially in the light of the planned unified hierarchy.
4808*4882a593Smuzhiyun *
4809*4882a593Smuzhiyun * Please deprecate this and replace with something simpler if at all
4810*4882a593Smuzhiyun * possible.
4811*4882a593Smuzhiyun */
4812*4882a593Smuzhiyun
4813*4882a593Smuzhiyun /*
4814*4882a593Smuzhiyun * Unregister event and free resources.
4815*4882a593Smuzhiyun *
4816*4882a593Smuzhiyun * Gets called from workqueue.
4817*4882a593Smuzhiyun */
4818*4882a593Smuzhiyun static void memcg_event_remove(struct work_struct *work)
4819*4882a593Smuzhiyun {
4820*4882a593Smuzhiyun struct mem_cgroup_event *event =
4821*4882a593Smuzhiyun container_of(work, struct mem_cgroup_event, remove);
4822*4882a593Smuzhiyun struct mem_cgroup *memcg = event->memcg;
4823*4882a593Smuzhiyun
4824*4882a593Smuzhiyun remove_wait_queue(event->wqh, &event->wait);
4825*4882a593Smuzhiyun
4826*4882a593Smuzhiyun event->unregister_event(memcg, event->eventfd);
4827*4882a593Smuzhiyun
4828*4882a593Smuzhiyun /* Notify userspace the event is going away. */
4829*4882a593Smuzhiyun eventfd_signal(event->eventfd, 1);
4830*4882a593Smuzhiyun
4831*4882a593Smuzhiyun eventfd_ctx_put(event->eventfd);
4832*4882a593Smuzhiyun kfree(event);
4833*4882a593Smuzhiyun css_put(&memcg->css);
4834*4882a593Smuzhiyun }
4835*4882a593Smuzhiyun
4836*4882a593Smuzhiyun /*
4837*4882a593Smuzhiyun * Gets called on EPOLLHUP on eventfd when user closes it.
4838*4882a593Smuzhiyun *
4839*4882a593Smuzhiyun * Called with wqh->lock held and interrupts disabled.
4840*4882a593Smuzhiyun */
4841*4882a593Smuzhiyun static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4842*4882a593Smuzhiyun int sync, void *key)
4843*4882a593Smuzhiyun {
4844*4882a593Smuzhiyun struct mem_cgroup_event *event =
4845*4882a593Smuzhiyun container_of(wait, struct mem_cgroup_event, wait);
4846*4882a593Smuzhiyun struct mem_cgroup *memcg = event->memcg;
4847*4882a593Smuzhiyun __poll_t flags = key_to_poll(key);
4848*4882a593Smuzhiyun
4849*4882a593Smuzhiyun if (flags & EPOLLHUP) {
4850*4882a593Smuzhiyun /*
4851*4882a593Smuzhiyun * If the event has been detached at cgroup removal, we
4852*4882a593Smuzhiyun * can simply return knowing the other side will cleanup
4853*4882a593Smuzhiyun * for us.
4854*4882a593Smuzhiyun *
4855*4882a593Smuzhiyun * We can't race against event freeing since the other
4856*4882a593Smuzhiyun * side will require wqh->lock via remove_wait_queue(),
4857*4882a593Smuzhiyun * which we hold.
4858*4882a593Smuzhiyun */
4859*4882a593Smuzhiyun spin_lock(&memcg->event_list_lock);
4860*4882a593Smuzhiyun if (!list_empty(&event->list)) {
4861*4882a593Smuzhiyun list_del_init(&event->list);
4862*4882a593Smuzhiyun /*
4863*4882a593Smuzhiyun * We are in atomic context, but cgroup_event_remove()
4864*4882a593Smuzhiyun * may sleep, so we have to call it in workqueue.
4865*4882a593Smuzhiyun */
4866*4882a593Smuzhiyun schedule_work(&event->remove);
4867*4882a593Smuzhiyun }
4868*4882a593Smuzhiyun spin_unlock(&memcg->event_list_lock);
4869*4882a593Smuzhiyun }
4870*4882a593Smuzhiyun
4871*4882a593Smuzhiyun return 0;
4872*4882a593Smuzhiyun }
4873*4882a593Smuzhiyun
4874*4882a593Smuzhiyun static void memcg_event_ptable_queue_proc(struct file *file,
4875*4882a593Smuzhiyun wait_queue_head_t *wqh, poll_table *pt)
4876*4882a593Smuzhiyun {
4877*4882a593Smuzhiyun struct mem_cgroup_event *event =
4878*4882a593Smuzhiyun container_of(pt, struct mem_cgroup_event, pt);
4879*4882a593Smuzhiyun
4880*4882a593Smuzhiyun event->wqh = wqh;
4881*4882a593Smuzhiyun add_wait_queue(wqh, &event->wait);
4882*4882a593Smuzhiyun }
4883*4882a593Smuzhiyun
4884*4882a593Smuzhiyun /*
4885*4882a593Smuzhiyun * DO NOT USE IN NEW FILES.
4886*4882a593Smuzhiyun *
4887*4882a593Smuzhiyun * Parse input and register new cgroup event handler.
4888*4882a593Smuzhiyun *
4889*4882a593Smuzhiyun * Input must be in format '<event_fd> <control_fd> <args>'.
4890*4882a593Smuzhiyun * Interpretation of args is defined by control file implementation.
4891*4882a593Smuzhiyun */
4892*4882a593Smuzhiyun static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4893*4882a593Smuzhiyun char *buf, size_t nbytes, loff_t off)
4894*4882a593Smuzhiyun {
4895*4882a593Smuzhiyun struct cgroup_subsys_state *css = of_css(of);
4896*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4897*4882a593Smuzhiyun struct mem_cgroup_event *event;
4898*4882a593Smuzhiyun struct cgroup_subsys_state *cfile_css;
4899*4882a593Smuzhiyun unsigned int efd, cfd;
4900*4882a593Smuzhiyun struct fd efile;
4901*4882a593Smuzhiyun struct fd cfile;
4902*4882a593Smuzhiyun struct dentry *cdentry;
4903*4882a593Smuzhiyun const char *name;
4904*4882a593Smuzhiyun char *endp;
4905*4882a593Smuzhiyun int ret;
4906*4882a593Smuzhiyun
4907*4882a593Smuzhiyun buf = strstrip(buf);
4908*4882a593Smuzhiyun
4909*4882a593Smuzhiyun efd = simple_strtoul(buf, &endp, 10);
4910*4882a593Smuzhiyun if (*endp != ' ')
4911*4882a593Smuzhiyun return -EINVAL;
4912*4882a593Smuzhiyun buf = endp + 1;
4913*4882a593Smuzhiyun
4914*4882a593Smuzhiyun cfd = simple_strtoul(buf, &endp, 10);
4915*4882a593Smuzhiyun if ((*endp != ' ') && (*endp != '\0'))
4916*4882a593Smuzhiyun return -EINVAL;
4917*4882a593Smuzhiyun buf = endp + 1;
4918*4882a593Smuzhiyun
4919*4882a593Smuzhiyun event = kzalloc(sizeof(*event), GFP_KERNEL);
4920*4882a593Smuzhiyun if (!event)
4921*4882a593Smuzhiyun return -ENOMEM;
4922*4882a593Smuzhiyun
4923*4882a593Smuzhiyun event->memcg = memcg;
4924*4882a593Smuzhiyun INIT_LIST_HEAD(&event->list);
4925*4882a593Smuzhiyun init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4926*4882a593Smuzhiyun init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4927*4882a593Smuzhiyun INIT_WORK(&event->remove, memcg_event_remove);
4928*4882a593Smuzhiyun
4929*4882a593Smuzhiyun efile = fdget(efd);
4930*4882a593Smuzhiyun if (!efile.file) {
4931*4882a593Smuzhiyun ret = -EBADF;
4932*4882a593Smuzhiyun goto out_kfree;
4933*4882a593Smuzhiyun }
4934*4882a593Smuzhiyun
4935*4882a593Smuzhiyun event->eventfd = eventfd_ctx_fileget(efile.file);
4936*4882a593Smuzhiyun if (IS_ERR(event->eventfd)) {
4937*4882a593Smuzhiyun ret = PTR_ERR(event->eventfd);
4938*4882a593Smuzhiyun goto out_put_efile;
4939*4882a593Smuzhiyun }
4940*4882a593Smuzhiyun
4941*4882a593Smuzhiyun cfile = fdget(cfd);
4942*4882a593Smuzhiyun if (!cfile.file) {
4943*4882a593Smuzhiyun ret = -EBADF;
4944*4882a593Smuzhiyun goto out_put_eventfd;
4945*4882a593Smuzhiyun }
4946*4882a593Smuzhiyun
4947*4882a593Smuzhiyun /* the process needs read permission on the control file */
4948*4882a593Smuzhiyun /* AV: shouldn't we check that it's been opened for read instead? */
4949*4882a593Smuzhiyun ret = inode_permission(file_inode(cfile.file), MAY_READ);
4950*4882a593Smuzhiyun if (ret < 0)
4951*4882a593Smuzhiyun goto out_put_cfile;
4952*4882a593Smuzhiyun
4953*4882a593Smuzhiyun /*
4954*4882a593Smuzhiyun * The control file must be a regular cgroup1 file. As a regular cgroup
4955*4882a593Smuzhiyun * file can't be renamed, it's safe to access its name afterwards.
4956*4882a593Smuzhiyun */
4957*4882a593Smuzhiyun cdentry = cfile.file->f_path.dentry;
4958*4882a593Smuzhiyun if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
4959*4882a593Smuzhiyun ret = -EINVAL;
4960*4882a593Smuzhiyun goto out_put_cfile;
4961*4882a593Smuzhiyun }
4962*4882a593Smuzhiyun
4963*4882a593Smuzhiyun /*
4964*4882a593Smuzhiyun * Determine the event callbacks and set them in @event. This used
4965*4882a593Smuzhiyun * to be done via struct cftype but cgroup core no longer knows
4966*4882a593Smuzhiyun * about these events. The following is crude but the whole thing
4967*4882a593Smuzhiyun * is for compatibility anyway.
4968*4882a593Smuzhiyun *
4969*4882a593Smuzhiyun * DO NOT ADD NEW FILES.
4970*4882a593Smuzhiyun */
4971*4882a593Smuzhiyun name = cdentry->d_name.name;
4972*4882a593Smuzhiyun
4973*4882a593Smuzhiyun if (!strcmp(name, "memory.usage_in_bytes")) {
4974*4882a593Smuzhiyun event->register_event = mem_cgroup_usage_register_event;
4975*4882a593Smuzhiyun event->unregister_event = mem_cgroup_usage_unregister_event;
4976*4882a593Smuzhiyun } else if (!strcmp(name, "memory.oom_control")) {
4977*4882a593Smuzhiyun event->register_event = mem_cgroup_oom_register_event;
4978*4882a593Smuzhiyun event->unregister_event = mem_cgroup_oom_unregister_event;
4979*4882a593Smuzhiyun } else if (!strcmp(name, "memory.pressure_level")) {
4980*4882a593Smuzhiyun event->register_event = vmpressure_register_event;
4981*4882a593Smuzhiyun event->unregister_event = vmpressure_unregister_event;
4982*4882a593Smuzhiyun } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4983*4882a593Smuzhiyun event->register_event = memsw_cgroup_usage_register_event;
4984*4882a593Smuzhiyun event->unregister_event = memsw_cgroup_usage_unregister_event;
4985*4882a593Smuzhiyun } else {
4986*4882a593Smuzhiyun ret = -EINVAL;
4987*4882a593Smuzhiyun goto out_put_cfile;
4988*4882a593Smuzhiyun }
4989*4882a593Smuzhiyun
4990*4882a593Smuzhiyun /*
4991*4882a593Smuzhiyun * Verify @cfile should belong to @css. Also, remaining events are
4992*4882a593Smuzhiyun * automatically removed on cgroup destruction but the removal is
4993*4882a593Smuzhiyun * asynchronous, so take an extra ref on @css.
4994*4882a593Smuzhiyun */
4995*4882a593Smuzhiyun cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
4996*4882a593Smuzhiyun &memory_cgrp_subsys);
4997*4882a593Smuzhiyun ret = -EINVAL;
4998*4882a593Smuzhiyun if (IS_ERR(cfile_css))
4999*4882a593Smuzhiyun goto out_put_cfile;
5000*4882a593Smuzhiyun if (cfile_css != css) {
5001*4882a593Smuzhiyun css_put(cfile_css);
5002*4882a593Smuzhiyun goto out_put_cfile;
5003*4882a593Smuzhiyun }
5004*4882a593Smuzhiyun
5005*4882a593Smuzhiyun ret = event->register_event(memcg, event->eventfd, buf);
5006*4882a593Smuzhiyun if (ret)
5007*4882a593Smuzhiyun goto out_put_css;
5008*4882a593Smuzhiyun
5009*4882a593Smuzhiyun vfs_poll(efile.file, &event->pt);
5010*4882a593Smuzhiyun
5011*4882a593Smuzhiyun spin_lock(&memcg->event_list_lock);
5012*4882a593Smuzhiyun list_add(&event->list, &memcg->event_list);
5013*4882a593Smuzhiyun spin_unlock(&memcg->event_list_lock);
5014*4882a593Smuzhiyun
5015*4882a593Smuzhiyun fdput(cfile);
5016*4882a593Smuzhiyun fdput(efile);
5017*4882a593Smuzhiyun
5018*4882a593Smuzhiyun return nbytes;
5019*4882a593Smuzhiyun
5020*4882a593Smuzhiyun out_put_css:
5021*4882a593Smuzhiyun css_put(css);
5022*4882a593Smuzhiyun out_put_cfile:
5023*4882a593Smuzhiyun fdput(cfile);
5024*4882a593Smuzhiyun out_put_eventfd:
5025*4882a593Smuzhiyun eventfd_ctx_put(event->eventfd);
5026*4882a593Smuzhiyun out_put_efile:
5027*4882a593Smuzhiyun fdput(efile);
5028*4882a593Smuzhiyun out_kfree:
5029*4882a593Smuzhiyun kfree(event);
5030*4882a593Smuzhiyun
5031*4882a593Smuzhiyun return ret;
5032*4882a593Smuzhiyun }
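/*
 * Illustrative use of the "<event_fd> <control_fd> <args>" interface from
 * userspace (hypothetical cgroup path), registering a 100M usage threshold
 * and waiting for it:
 *
 *   int efd = eventfd(0, 0);
 *   int cfd = open("/sys/fs/cgroup/memory/mygrp/memory.usage_in_bytes",
 *                  O_RDONLY);
 *   int ctl = open("/sys/fs/cgroup/memory/mygrp/cgroup.event_control",
 *                  O_WRONLY);
 *   char buf[64];
 *   int len = snprintf(buf, sizeof(buf), "%d %d %llu",
 *                      efd, cfd, 100ULL << 20);
 *   write(ctl, buf, len);          // parsed by memcg_write_event_control()
 *
 *   uint64_t cnt;
 *   read(efd, &cnt, sizeof(cnt));  // returns once usage crosses 100M
 *
 * For memory.oom_control and memory.pressure_level the <args> part is
 * interpreted by their respective register_event() callbacks instead.
 */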
5033*4882a593Smuzhiyun
5034*4882a593Smuzhiyun static struct cftype mem_cgroup_legacy_files[] = {
5035*4882a593Smuzhiyun {
5036*4882a593Smuzhiyun .name = "usage_in_bytes",
5037*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5038*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5039*4882a593Smuzhiyun },
5040*4882a593Smuzhiyun {
5041*4882a593Smuzhiyun .name = "max_usage_in_bytes",
5042*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5043*4882a593Smuzhiyun .write = mem_cgroup_reset,
5044*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5045*4882a593Smuzhiyun },
5046*4882a593Smuzhiyun {
5047*4882a593Smuzhiyun .name = "limit_in_bytes",
5048*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5049*4882a593Smuzhiyun .write = mem_cgroup_write,
5050*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5051*4882a593Smuzhiyun },
5052*4882a593Smuzhiyun {
5053*4882a593Smuzhiyun .name = "soft_limit_in_bytes",
5054*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5055*4882a593Smuzhiyun .write = mem_cgroup_write,
5056*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5057*4882a593Smuzhiyun },
5058*4882a593Smuzhiyun {
5059*4882a593Smuzhiyun .name = "failcnt",
5060*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5061*4882a593Smuzhiyun .write = mem_cgroup_reset,
5062*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5063*4882a593Smuzhiyun },
5064*4882a593Smuzhiyun {
5065*4882a593Smuzhiyun .name = "stat",
5066*4882a593Smuzhiyun .seq_show = memcg_stat_show,
5067*4882a593Smuzhiyun },
5068*4882a593Smuzhiyun {
5069*4882a593Smuzhiyun .name = "force_empty",
5070*4882a593Smuzhiyun .write = mem_cgroup_force_empty_write,
5071*4882a593Smuzhiyun },
5072*4882a593Smuzhiyun {
5073*4882a593Smuzhiyun .name = "use_hierarchy",
5074*4882a593Smuzhiyun .write_u64 = mem_cgroup_hierarchy_write,
5075*4882a593Smuzhiyun .read_u64 = mem_cgroup_hierarchy_read,
5076*4882a593Smuzhiyun },
5077*4882a593Smuzhiyun {
5078*4882a593Smuzhiyun .name = "cgroup.event_control", /* XXX: for compat */
5079*4882a593Smuzhiyun .write = memcg_write_event_control,
5080*4882a593Smuzhiyun .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
5081*4882a593Smuzhiyun },
5082*4882a593Smuzhiyun {
5083*4882a593Smuzhiyun .name = "swappiness",
5084*4882a593Smuzhiyun .read_u64 = mem_cgroup_swappiness_read,
5085*4882a593Smuzhiyun .write_u64 = mem_cgroup_swappiness_write,
5086*4882a593Smuzhiyun },
5087*4882a593Smuzhiyun {
5088*4882a593Smuzhiyun .name = "move_charge_at_immigrate",
5089*4882a593Smuzhiyun .read_u64 = mem_cgroup_move_charge_read,
5090*4882a593Smuzhiyun .write_u64 = mem_cgroup_move_charge_write,
5091*4882a593Smuzhiyun },
5092*4882a593Smuzhiyun {
5093*4882a593Smuzhiyun .name = "oom_control",
5094*4882a593Smuzhiyun .seq_show = mem_cgroup_oom_control_read,
5095*4882a593Smuzhiyun .write_u64 = mem_cgroup_oom_control_write,
5096*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5097*4882a593Smuzhiyun },
5098*4882a593Smuzhiyun {
5099*4882a593Smuzhiyun .name = "pressure_level",
5100*4882a593Smuzhiyun },
5101*4882a593Smuzhiyun #ifdef CONFIG_NUMA
5102*4882a593Smuzhiyun {
5103*4882a593Smuzhiyun .name = "numa_stat",
5104*4882a593Smuzhiyun .seq_show = memcg_numa_stat_show,
5105*4882a593Smuzhiyun },
5106*4882a593Smuzhiyun #endif
5107*4882a593Smuzhiyun {
5108*4882a593Smuzhiyun .name = "kmem.limit_in_bytes",
5109*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5110*4882a593Smuzhiyun .write = mem_cgroup_write,
5111*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5112*4882a593Smuzhiyun },
5113*4882a593Smuzhiyun {
5114*4882a593Smuzhiyun .name = "kmem.usage_in_bytes",
5115*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5116*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5117*4882a593Smuzhiyun },
5118*4882a593Smuzhiyun {
5119*4882a593Smuzhiyun .name = "kmem.failcnt",
5120*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5121*4882a593Smuzhiyun .write = mem_cgroup_reset,
5122*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5123*4882a593Smuzhiyun },
5124*4882a593Smuzhiyun {
5125*4882a593Smuzhiyun .name = "kmem.max_usage_in_bytes",
5126*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5127*4882a593Smuzhiyun .write = mem_cgroup_reset,
5128*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5129*4882a593Smuzhiyun },
5130*4882a593Smuzhiyun #if defined(CONFIG_MEMCG_KMEM) && \
5131*4882a593Smuzhiyun (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
5132*4882a593Smuzhiyun {
5133*4882a593Smuzhiyun .name = "kmem.slabinfo",
5134*4882a593Smuzhiyun .seq_show = memcg_slab_show,
5135*4882a593Smuzhiyun },
5136*4882a593Smuzhiyun #endif
5137*4882a593Smuzhiyun {
5138*4882a593Smuzhiyun .name = "kmem.tcp.limit_in_bytes",
5139*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
5140*4882a593Smuzhiyun .write = mem_cgroup_write,
5141*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5142*4882a593Smuzhiyun },
5143*4882a593Smuzhiyun {
5144*4882a593Smuzhiyun .name = "kmem.tcp.usage_in_bytes",
5145*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
5146*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5147*4882a593Smuzhiyun },
5148*4882a593Smuzhiyun {
5149*4882a593Smuzhiyun .name = "kmem.tcp.failcnt",
5150*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
5151*4882a593Smuzhiyun .write = mem_cgroup_reset,
5152*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5153*4882a593Smuzhiyun },
5154*4882a593Smuzhiyun {
5155*4882a593Smuzhiyun .name = "kmem.tcp.max_usage_in_bytes",
5156*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
5157*4882a593Smuzhiyun .write = mem_cgroup_reset,
5158*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
5159*4882a593Smuzhiyun },
5160*4882a593Smuzhiyun { }, /* terminate */
5161*4882a593Smuzhiyun };
5162*4882a593Smuzhiyun
5163*4882a593Smuzhiyun /*
5164*4882a593Smuzhiyun * Private memory cgroup IDR
5165*4882a593Smuzhiyun *
5166*4882a593Smuzhiyun * Swap-out records and page cache shadow entries need to store memcg
5167*4882a593Smuzhiyun * references in constrained space, so we maintain an ID space that is
5168*4882a593Smuzhiyun * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
5169*4882a593Smuzhiyun * memory-controlled cgroups to 64k.
5170*4882a593Smuzhiyun *
5171*4882a593Smuzhiyun * However, there usually are many references to the offline CSS after
5172*4882a593Smuzhiyun * the cgroup has been destroyed, such as page cache or reclaimable
5173*4882a593Smuzhiyun * slab objects, that don't need to hang on to the ID. We want to keep
5174*4882a593Smuzhiyun * those dead CSS from occupying IDs, or we might quickly exhaust the
5175*4882a593Smuzhiyun * relatively small ID space and prevent the creation of new cgroups
5176*4882a593Smuzhiyun * even when there are much fewer than 64k cgroups - possibly none.
5177*4882a593Smuzhiyun *
5178*4882a593Smuzhiyun * Maintain a private 16-bit ID space for memcg, and allow the ID to
5179*4882a593Smuzhiyun * be freed and recycled when it's no longer needed, which is usually
5180*4882a593Smuzhiyun * when the CSS is offlined.
5181*4882a593Smuzhiyun *
5182*4882a593Smuzhiyun * The only exception to that are records of swapped out tmpfs/shmem
5183*4882a593Smuzhiyun * pages that need to be attributed to live ancestors on swapin. But
5184*4882a593Smuzhiyun * those references are manageable from userspace.
5185*4882a593Smuzhiyun */
5186*4882a593Smuzhiyun
5187*4882a593Smuzhiyun static DEFINE_IDR(mem_cgroup_idr);
5188*4882a593Smuzhiyun
5189*4882a593Smuzhiyun static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
5190*4882a593Smuzhiyun {
5191*4882a593Smuzhiyun if (memcg->id.id > 0) {
5192*4882a593Smuzhiyun trace_android_vh_mem_cgroup_id_remove(memcg);
5193*4882a593Smuzhiyun idr_remove(&mem_cgroup_idr, memcg->id.id);
5194*4882a593Smuzhiyun memcg->id.id = 0;
5195*4882a593Smuzhiyun }
5196*4882a593Smuzhiyun }
5197*4882a593Smuzhiyun
5198*4882a593Smuzhiyun static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5199*4882a593Smuzhiyun unsigned int n)
5200*4882a593Smuzhiyun {
5201*4882a593Smuzhiyun refcount_add(n, &memcg->id.ref);
5202*4882a593Smuzhiyun }
5203*4882a593Smuzhiyun
5204*4882a593Smuzhiyun static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
5205*4882a593Smuzhiyun {
5206*4882a593Smuzhiyun if (refcount_sub_and_test(n, &memcg->id.ref)) {
5207*4882a593Smuzhiyun mem_cgroup_id_remove(memcg);
5208*4882a593Smuzhiyun
5209*4882a593Smuzhiyun /* Memcg ID pins CSS */
5210*4882a593Smuzhiyun css_put(&memcg->css);
5211*4882a593Smuzhiyun }
5212*4882a593Smuzhiyun }
5213*4882a593Smuzhiyun
5214*4882a593Smuzhiyun static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
5215*4882a593Smuzhiyun {
5216*4882a593Smuzhiyun mem_cgroup_id_put_many(memcg, 1);
5217*4882a593Smuzhiyun }
5218*4882a593Smuzhiyun
5219*4882a593Smuzhiyun /**
5220*4882a593Smuzhiyun * mem_cgroup_from_id - look up a memcg from a memcg id
5221*4882a593Smuzhiyun * @id: the memcg id to look up
5222*4882a593Smuzhiyun *
5223*4882a593Smuzhiyun * Caller must hold rcu_read_lock().
5224*4882a593Smuzhiyun */
5225*4882a593Smuzhiyun struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
5226*4882a593Smuzhiyun {
5227*4882a593Smuzhiyun WARN_ON_ONCE(!rcu_read_lock_held());
5228*4882a593Smuzhiyun return idr_find(&mem_cgroup_idr, id);
5229*4882a593Smuzhiyun }
5230*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(mem_cgroup_from_id);
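/*
 * Typical lookup pattern (sketch): callers that want to use the memcg
 * beyond the RCU section must also pin the css, e.g.
 *
 *   rcu_read_lock();
 *   memcg = mem_cgroup_from_id(id);
 *   if (memcg && !css_tryget_online(&memcg->css))
 *           memcg = NULL;
 *   rcu_read_unlock();
 */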
5231*4882a593Smuzhiyun
5232*4882a593Smuzhiyun static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5233*4882a593Smuzhiyun {
5234*4882a593Smuzhiyun struct mem_cgroup_per_node *pn;
5235*4882a593Smuzhiyun int tmp = node;
5236*4882a593Smuzhiyun /*
5237*4882a593Smuzhiyun * This routine is called against possible nodes.
5238*4882a593Smuzhiyun * But it's a BUG to call kmalloc() against an offline node.
5239*4882a593Smuzhiyun *
5240*4882a593Smuzhiyun * TODO: this routine can waste much memory for nodes which will
5241*4882a593Smuzhiyun * never be onlined. It's better to use memory hotplug callback
5242*4882a593Smuzhiyun * function.
5243*4882a593Smuzhiyun */
5244*4882a593Smuzhiyun if (!node_state(node, N_NORMAL_MEMORY))
5245*4882a593Smuzhiyun tmp = -1;
5246*4882a593Smuzhiyun pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
5247*4882a593Smuzhiyun if (!pn)
5248*4882a593Smuzhiyun return 1;
5249*4882a593Smuzhiyun
5250*4882a593Smuzhiyun pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5251*4882a593Smuzhiyun GFP_KERNEL_ACCOUNT);
5252*4882a593Smuzhiyun if (!pn->lruvec_stat_local) {
5253*4882a593Smuzhiyun kfree(pn);
5254*4882a593Smuzhiyun return 1;
5255*4882a593Smuzhiyun }
5256*4882a593Smuzhiyun
5257*4882a593Smuzhiyun pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
5258*4882a593Smuzhiyun GFP_KERNEL_ACCOUNT);
5259*4882a593Smuzhiyun if (!pn->lruvec_stat_cpu) {
5260*4882a593Smuzhiyun free_percpu(pn->lruvec_stat_local);
5261*4882a593Smuzhiyun kfree(pn);
5262*4882a593Smuzhiyun return 1;
5263*4882a593Smuzhiyun }
5264*4882a593Smuzhiyun
5265*4882a593Smuzhiyun lruvec_init(&pn->lruvec);
5266*4882a593Smuzhiyun pn->usage_in_excess = 0;
5267*4882a593Smuzhiyun pn->on_tree = false;
5268*4882a593Smuzhiyun pn->memcg = memcg;
5269*4882a593Smuzhiyun
5270*4882a593Smuzhiyun memcg->nodeinfo[node] = pn;
5271*4882a593Smuzhiyun return 0;
5272*4882a593Smuzhiyun }
5273*4882a593Smuzhiyun
5274*4882a593Smuzhiyun static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5275*4882a593Smuzhiyun {
5276*4882a593Smuzhiyun struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
5277*4882a593Smuzhiyun
5278*4882a593Smuzhiyun if (!pn)
5279*4882a593Smuzhiyun return;
5280*4882a593Smuzhiyun
5281*4882a593Smuzhiyun free_percpu(pn->lruvec_stat_cpu);
5282*4882a593Smuzhiyun free_percpu(pn->lruvec_stat_local);
5283*4882a593Smuzhiyun kfree(pn);
5284*4882a593Smuzhiyun }
5285*4882a593Smuzhiyun
5286*4882a593Smuzhiyun static void __mem_cgroup_free(struct mem_cgroup *memcg)
5287*4882a593Smuzhiyun {
5288*4882a593Smuzhiyun int node;
5289*4882a593Smuzhiyun
5290*4882a593Smuzhiyun trace_android_vh_mem_cgroup_free(memcg);
5291*4882a593Smuzhiyun for_each_node(node)
5292*4882a593Smuzhiyun free_mem_cgroup_per_node_info(memcg, node);
5293*4882a593Smuzhiyun free_percpu(memcg->vmstats_percpu);
5294*4882a593Smuzhiyun free_percpu(memcg->vmstats_local);
5295*4882a593Smuzhiyun kfree(memcg);
5296*4882a593Smuzhiyun }
5297*4882a593Smuzhiyun
5298*4882a593Smuzhiyun static void mem_cgroup_free(struct mem_cgroup *memcg)
5299*4882a593Smuzhiyun {
5300*4882a593Smuzhiyun memcg_wb_domain_exit(memcg);
5301*4882a593Smuzhiyun /*
5302*4882a593Smuzhiyun * Flush percpu vmstats and vmevents to guarantee the value correctness
5303*4882a593Smuzhiyun * on parent's and all ancestor levels.
5304*4882a593Smuzhiyun */
5305*4882a593Smuzhiyun memcg_flush_percpu_vmstats(memcg);
5306*4882a593Smuzhiyun memcg_flush_percpu_vmevents(memcg);
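	/*
	 * Without the flushes above, deltas still sitting in the percpu
	 * counters would be lost when the percpu areas are freed below,
	 * leaving the ancestors' aggregated counters skewed.
	 */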
5307*4882a593Smuzhiyun __mem_cgroup_free(memcg);
5308*4882a593Smuzhiyun }
5309*4882a593Smuzhiyun
5310*4882a593Smuzhiyun static struct mem_cgroup *mem_cgroup_alloc(void)
5311*4882a593Smuzhiyun {
5312*4882a593Smuzhiyun struct mem_cgroup *memcg;
5313*4882a593Smuzhiyun unsigned int size;
5314*4882a593Smuzhiyun int node;
5315*4882a593Smuzhiyun int __maybe_unused i;
5316*4882a593Smuzhiyun long error = -ENOMEM;
5317*4882a593Smuzhiyun
5318*4882a593Smuzhiyun size = sizeof(struct mem_cgroup);
5319*4882a593Smuzhiyun size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
5320*4882a593Smuzhiyun
5321*4882a593Smuzhiyun memcg = kzalloc(size, GFP_KERNEL);
5322*4882a593Smuzhiyun if (!memcg)
5323*4882a593Smuzhiyun return ERR_PTR(error);
5324*4882a593Smuzhiyun
5325*4882a593Smuzhiyun memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
5326*4882a593Smuzhiyun 1, MEM_CGROUP_ID_MAX,
5327*4882a593Smuzhiyun GFP_KERNEL);
5328*4882a593Smuzhiyun if (memcg->id.id < 0) {
5329*4882a593Smuzhiyun error = memcg->id.id;
5330*4882a593Smuzhiyun goto fail;
5331*4882a593Smuzhiyun }
5332*4882a593Smuzhiyun
5333*4882a593Smuzhiyun memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5334*4882a593Smuzhiyun GFP_KERNEL_ACCOUNT);
5335*4882a593Smuzhiyun if (!memcg->vmstats_local)
5336*4882a593Smuzhiyun goto fail;
5337*4882a593Smuzhiyun
5338*4882a593Smuzhiyun memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5339*4882a593Smuzhiyun GFP_KERNEL_ACCOUNT);
5340*4882a593Smuzhiyun if (!memcg->vmstats_percpu)
5341*4882a593Smuzhiyun goto fail;
5342*4882a593Smuzhiyun
5343*4882a593Smuzhiyun for_each_node(node)
5344*4882a593Smuzhiyun if (alloc_mem_cgroup_per_node_info(memcg, node))
5345*4882a593Smuzhiyun goto fail;
5346*4882a593Smuzhiyun
5347*4882a593Smuzhiyun if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5348*4882a593Smuzhiyun goto fail;
5349*4882a593Smuzhiyun
5350*4882a593Smuzhiyun INIT_WORK(&memcg->high_work, high_work_func);
5351*4882a593Smuzhiyun INIT_LIST_HEAD(&memcg->oom_notify);
5352*4882a593Smuzhiyun mutex_init(&memcg->thresholds_lock);
5353*4882a593Smuzhiyun spin_lock_init(&memcg->move_lock);
5354*4882a593Smuzhiyun vmpressure_init(&memcg->vmpressure);
5355*4882a593Smuzhiyun INIT_LIST_HEAD(&memcg->event_list);
5356*4882a593Smuzhiyun spin_lock_init(&memcg->event_list_lock);
5357*4882a593Smuzhiyun memcg->socket_pressure = jiffies;
5358*4882a593Smuzhiyun #ifdef CONFIG_MEMCG_KMEM
5359*4882a593Smuzhiyun memcg->kmemcg_id = -1;
5360*4882a593Smuzhiyun INIT_LIST_HEAD(&memcg->objcg_list);
5361*4882a593Smuzhiyun #endif
5362*4882a593Smuzhiyun #ifdef CONFIG_CGROUP_WRITEBACK
5363*4882a593Smuzhiyun INIT_LIST_HEAD(&memcg->cgwb_list);
5364*4882a593Smuzhiyun for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5365*4882a593Smuzhiyun memcg->cgwb_frn[i].done =
5366*4882a593Smuzhiyun __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5367*4882a593Smuzhiyun #endif
5368*4882a593Smuzhiyun #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5369*4882a593Smuzhiyun spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5370*4882a593Smuzhiyun INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5371*4882a593Smuzhiyun memcg->deferred_split_queue.split_queue_len = 0;
5372*4882a593Smuzhiyun #endif
5373*4882a593Smuzhiyun idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5374*4882a593Smuzhiyun trace_android_vh_mem_cgroup_alloc(memcg);
5375*4882a593Smuzhiyun return memcg;
5376*4882a593Smuzhiyun fail:
5377*4882a593Smuzhiyun mem_cgroup_id_remove(memcg);
5378*4882a593Smuzhiyun __mem_cgroup_free(memcg);
5379*4882a593Smuzhiyun return ERR_PTR(error);
5380*4882a593Smuzhiyun }
5381*4882a593Smuzhiyun
5382*4882a593Smuzhiyun static struct cgroup_subsys_state * __ref
5383*4882a593Smuzhiyun mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5384*4882a593Smuzhiyun {
5385*4882a593Smuzhiyun struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5386*4882a593Smuzhiyun struct mem_cgroup *memcg, *old_memcg;
5387*4882a593Smuzhiyun long error = -ENOMEM;
5388*4882a593Smuzhiyun
5389*4882a593Smuzhiyun old_memcg = set_active_memcg(parent);
5390*4882a593Smuzhiyun memcg = mem_cgroup_alloc();
5391*4882a593Smuzhiyun set_active_memcg(old_memcg);
5392*4882a593Smuzhiyun if (IS_ERR(memcg))
5393*4882a593Smuzhiyun return ERR_CAST(memcg);
5394*4882a593Smuzhiyun
5395*4882a593Smuzhiyun page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5396*4882a593Smuzhiyun memcg->soft_limit = PAGE_COUNTER_MAX;
5397*4882a593Smuzhiyun page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5398*4882a593Smuzhiyun if (parent) {
5399*4882a593Smuzhiyun memcg->swappiness = mem_cgroup_swappiness(parent);
5400*4882a593Smuzhiyun memcg->oom_kill_disable = parent->oom_kill_disable;
5401*4882a593Smuzhiyun }
5402*4882a593Smuzhiyun if (!parent) {
5403*4882a593Smuzhiyun page_counter_init(&memcg->memory, NULL);
5404*4882a593Smuzhiyun page_counter_init(&memcg->swap, NULL);
5405*4882a593Smuzhiyun page_counter_init(&memcg->kmem, NULL);
5406*4882a593Smuzhiyun page_counter_init(&memcg->tcpmem, NULL);
5407*4882a593Smuzhiyun } else if (parent->use_hierarchy) {
5408*4882a593Smuzhiyun memcg->use_hierarchy = true;
5409*4882a593Smuzhiyun page_counter_init(&memcg->memory, &parent->memory);
5410*4882a593Smuzhiyun page_counter_init(&memcg->swap, &parent->swap);
5411*4882a593Smuzhiyun page_counter_init(&memcg->kmem, &parent->kmem);
5412*4882a593Smuzhiyun page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5413*4882a593Smuzhiyun } else {
5414*4882a593Smuzhiyun page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
5415*4882a593Smuzhiyun page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
5416*4882a593Smuzhiyun page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
5417*4882a593Smuzhiyun page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
5418*4882a593Smuzhiyun /*
5419*4882a593Smuzhiyun * Deeper hierarchy with use_hierarchy == false doesn't make
5420*4882a593Smuzhiyun * much sense so let the cgroup subsystem know about this
5421*4882a593Smuzhiyun * unfortunate state in our controller.
5422*4882a593Smuzhiyun */
5423*4882a593Smuzhiyun if (parent != root_mem_cgroup)
5424*4882a593Smuzhiyun memory_cgrp_subsys.broken_hierarchy = true;
5425*4882a593Smuzhiyun }
5426*4882a593Smuzhiyun
5427*4882a593Smuzhiyun /* The following stuff does not apply to the root */
5428*4882a593Smuzhiyun if (!parent) {
5429*4882a593Smuzhiyun root_mem_cgroup = memcg;
5430*4882a593Smuzhiyun return &memcg->css;
5431*4882a593Smuzhiyun }
5432*4882a593Smuzhiyun
5433*4882a593Smuzhiyun error = memcg_online_kmem(memcg);
5434*4882a593Smuzhiyun if (error)
5435*4882a593Smuzhiyun goto fail;
5436*4882a593Smuzhiyun
5437*4882a593Smuzhiyun if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5438*4882a593Smuzhiyun static_branch_inc(&memcg_sockets_enabled_key);
5439*4882a593Smuzhiyun
5440*4882a593Smuzhiyun return &memcg->css;
5441*4882a593Smuzhiyun fail:
5442*4882a593Smuzhiyun mem_cgroup_id_remove(memcg);
5443*4882a593Smuzhiyun mem_cgroup_free(memcg);
5444*4882a593Smuzhiyun return ERR_PTR(error);
5445*4882a593Smuzhiyun }
5446*4882a593Smuzhiyun
5447*4882a593Smuzhiyun static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5448*4882a593Smuzhiyun {
5449*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5450*4882a593Smuzhiyun
5451*4882a593Smuzhiyun /*
5452*4882a593Smuzhiyun * A memcg must be visible for memcg_expand_shrinker_maps()
5453*4882a593Smuzhiyun * by the time the maps are allocated. So, we allocate maps
5454*4882a593Smuzhiyun * here, when for_each_mem_cgroup() can't skip it.
5455*4882a593Smuzhiyun */
5456*4882a593Smuzhiyun if (memcg_alloc_shrinker_maps(memcg)) {
5457*4882a593Smuzhiyun mem_cgroup_id_remove(memcg);
5458*4882a593Smuzhiyun return -ENOMEM;
5459*4882a593Smuzhiyun }
5460*4882a593Smuzhiyun
5461*4882a593Smuzhiyun /* Online state pins memcg ID, memcg ID pins CSS */
5462*4882a593Smuzhiyun refcount_set(&memcg->id.ref, 1);
5463*4882a593Smuzhiyun css_get(css);
5464*4882a593Smuzhiyun trace_android_vh_mem_cgroup_css_online(css, memcg);
5465*4882a593Smuzhiyun return 0;
5466*4882a593Smuzhiyun }
5467*4882a593Smuzhiyun
5468*4882a593Smuzhiyun static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5469*4882a593Smuzhiyun {
5470*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5471*4882a593Smuzhiyun struct mem_cgroup_event *event, *tmp;
5472*4882a593Smuzhiyun
5473*4882a593Smuzhiyun trace_android_vh_mem_cgroup_css_offline(css, memcg);
5474*4882a593Smuzhiyun /*
5475*4882a593Smuzhiyun * Unregister events and notify userspace.
5476*4882a593Smuzhiyun * Notify userspace about cgroup removing only after rmdir of cgroup
5477*4882a593Smuzhiyun * directory to avoid race between userspace and kernelspace.
5478*4882a593Smuzhiyun */
5479*4882a593Smuzhiyun spin_lock(&memcg->event_list_lock);
5480*4882a593Smuzhiyun list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5481*4882a593Smuzhiyun list_del_init(&event->list);
5482*4882a593Smuzhiyun schedule_work(&event->remove);
5483*4882a593Smuzhiyun }
5484*4882a593Smuzhiyun spin_unlock(&memcg->event_list_lock);
5485*4882a593Smuzhiyun
5486*4882a593Smuzhiyun page_counter_set_min(&memcg->memory, 0);
5487*4882a593Smuzhiyun page_counter_set_low(&memcg->memory, 0);
5488*4882a593Smuzhiyun
5489*4882a593Smuzhiyun memcg_offline_kmem(memcg);
5490*4882a593Smuzhiyun wb_memcg_offline(memcg);
5491*4882a593Smuzhiyun
5492*4882a593Smuzhiyun drain_all_stock(memcg);
5493*4882a593Smuzhiyun
5494*4882a593Smuzhiyun mem_cgroup_id_put(memcg);
5495*4882a593Smuzhiyun }
5496*4882a593Smuzhiyun
5497*4882a593Smuzhiyun static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5498*4882a593Smuzhiyun {
5499*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5500*4882a593Smuzhiyun
5501*4882a593Smuzhiyun invalidate_reclaim_iterators(memcg);
5502*4882a593Smuzhiyun }
5503*4882a593Smuzhiyun
5504*4882a593Smuzhiyun static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5505*4882a593Smuzhiyun {
5506*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5507*4882a593Smuzhiyun int __maybe_unused i;
5508*4882a593Smuzhiyun
5509*4882a593Smuzhiyun #ifdef CONFIG_CGROUP_WRITEBACK
5510*4882a593Smuzhiyun for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5511*4882a593Smuzhiyun wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5512*4882a593Smuzhiyun #endif
5513*4882a593Smuzhiyun if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5514*4882a593Smuzhiyun static_branch_dec(&memcg_sockets_enabled_key);
5515*4882a593Smuzhiyun
5516*4882a593Smuzhiyun if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5517*4882a593Smuzhiyun static_branch_dec(&memcg_sockets_enabled_key);
5518*4882a593Smuzhiyun
5519*4882a593Smuzhiyun vmpressure_cleanup(&memcg->vmpressure);
5520*4882a593Smuzhiyun cancel_work_sync(&memcg->high_work);
5521*4882a593Smuzhiyun mem_cgroup_remove_from_trees(memcg);
5522*4882a593Smuzhiyun memcg_free_shrinker_maps(memcg);
5523*4882a593Smuzhiyun memcg_free_kmem(memcg);
5524*4882a593Smuzhiyun mem_cgroup_free(memcg);
5525*4882a593Smuzhiyun }
5526*4882a593Smuzhiyun
5527*4882a593Smuzhiyun /**
5528*4882a593Smuzhiyun * mem_cgroup_css_reset - reset the states of a mem_cgroup
5529*4882a593Smuzhiyun * @css: the target css
5530*4882a593Smuzhiyun *
5531*4882a593Smuzhiyun * Reset the states of the mem_cgroup associated with @css. This is
5532*4882a593Smuzhiyun * invoked when the userland requests disabling on the default hierarchy
5533*4882a593Smuzhiyun * but the memcg is pinned through dependency. The memcg should stop
5534*4882a593Smuzhiyun * applying policies and should revert to the vanilla state as it may be
5535*4882a593Smuzhiyun * made visible again.
5536*4882a593Smuzhiyun *
5537*4882a593Smuzhiyun * The current implementation only resets the essential configurations.
5538*4882a593Smuzhiyun * This needs to be expanded to cover all the visible parts.
5539*4882a593Smuzhiyun */
5540*4882a593Smuzhiyun static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5541*4882a593Smuzhiyun {
5542*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5543*4882a593Smuzhiyun
5544*4882a593Smuzhiyun page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5545*4882a593Smuzhiyun page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5546*4882a593Smuzhiyun page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5547*4882a593Smuzhiyun page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5548*4882a593Smuzhiyun page_counter_set_min(&memcg->memory, 0);
5549*4882a593Smuzhiyun page_counter_set_low(&memcg->memory, 0);
5550*4882a593Smuzhiyun page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5551*4882a593Smuzhiyun memcg->soft_limit = PAGE_COUNTER_MAX;
5552*4882a593Smuzhiyun page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5553*4882a593Smuzhiyun memcg_wb_domain_size_changed(memcg);
5554*4882a593Smuzhiyun }
5555*4882a593Smuzhiyun
5556*4882a593Smuzhiyun #ifdef CONFIG_MMU
5557*4882a593Smuzhiyun /* Handlers for move charge at task migration. */
5558*4882a593Smuzhiyun static int mem_cgroup_do_precharge(unsigned long count)
5559*4882a593Smuzhiyun {
5560*4882a593Smuzhiyun int ret;
5561*4882a593Smuzhiyun
5562*4882a593Smuzhiyun /* Try a single bulk charge without reclaim first, kswapd may wake */
5563*4882a593Smuzhiyun ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5564*4882a593Smuzhiyun if (!ret) {
5565*4882a593Smuzhiyun mc.precharge += count;
5566*4882a593Smuzhiyun return ret;
5567*4882a593Smuzhiyun }
5568*4882a593Smuzhiyun
5569*4882a593Smuzhiyun /* Try charges one by one with reclaim, but do not retry */
5570*4882a593Smuzhiyun while (count--) {
5571*4882a593Smuzhiyun ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5572*4882a593Smuzhiyun if (ret)
5573*4882a593Smuzhiyun return ret;
5574*4882a593Smuzhiyun mc.precharge++;
5575*4882a593Smuzhiyun cond_resched();
5576*4882a593Smuzhiyun }
5577*4882a593Smuzhiyun return 0;
5578*4882a593Smuzhiyun }
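/*
 * The charges accumulated in mc.precharge here are consumed one page at a
 * time as pages are moved via mem_cgroup_move_account() during the
 * page-table walk; any precharge left over is returned through the cancel
 * path in __mem_cgroup_clear_mc().
 */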
5579*4882a593Smuzhiyun
5580*4882a593Smuzhiyun union mc_target {
5581*4882a593Smuzhiyun struct page *page;
5582*4882a593Smuzhiyun swp_entry_t ent;
5583*4882a593Smuzhiyun };
5584*4882a593Smuzhiyun
5585*4882a593Smuzhiyun enum mc_target_type {
5586*4882a593Smuzhiyun MC_TARGET_NONE = 0,
5587*4882a593Smuzhiyun MC_TARGET_PAGE,
5588*4882a593Smuzhiyun MC_TARGET_SWAP,
5589*4882a593Smuzhiyun MC_TARGET_DEVICE,
5590*4882a593Smuzhiyun };
5591*4882a593Smuzhiyun
5592*4882a593Smuzhiyun static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5593*4882a593Smuzhiyun unsigned long addr, pte_t ptent)
5594*4882a593Smuzhiyun {
5595*4882a593Smuzhiyun struct page *page = vm_normal_page(vma, addr, ptent);
5596*4882a593Smuzhiyun
5597*4882a593Smuzhiyun if (!page || !page_mapped(page))
5598*4882a593Smuzhiyun return NULL;
5599*4882a593Smuzhiyun if (PageAnon(page)) {
5600*4882a593Smuzhiyun if (!(mc.flags & MOVE_ANON))
5601*4882a593Smuzhiyun return NULL;
5602*4882a593Smuzhiyun } else {
5603*4882a593Smuzhiyun if (!(mc.flags & MOVE_FILE))
5604*4882a593Smuzhiyun return NULL;
5605*4882a593Smuzhiyun }
5606*4882a593Smuzhiyun if (!get_page_unless_zero(page))
5607*4882a593Smuzhiyun return NULL;
5608*4882a593Smuzhiyun
5609*4882a593Smuzhiyun return page;
5610*4882a593Smuzhiyun }
5611*4882a593Smuzhiyun
5612*4882a593Smuzhiyun #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5613*4882a593Smuzhiyun static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5614*4882a593Smuzhiyun pte_t ptent, swp_entry_t *entry)
5615*4882a593Smuzhiyun {
5616*4882a593Smuzhiyun struct page *page = NULL;
5617*4882a593Smuzhiyun swp_entry_t ent = pte_to_swp_entry(ptent);
5618*4882a593Smuzhiyun
5619*4882a593Smuzhiyun if (!(mc.flags & MOVE_ANON))
5620*4882a593Smuzhiyun return NULL;
5621*4882a593Smuzhiyun
5622*4882a593Smuzhiyun /*
5623*4882a593Smuzhiyun * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE pages belonging to
5624*4882a593Smuzhiyun * a device and, because they are not accessible by the CPU, are stored
5625*4882a593Smuzhiyun * as special swap entries in the CPU page table.
5626*4882a593Smuzhiyun */
5627*4882a593Smuzhiyun if (is_device_private_entry(ent)) {
5628*4882a593Smuzhiyun page = device_private_entry_to_page(ent);
5629*4882a593Smuzhiyun /*
5630*4882a593Smuzhiyun * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
5631*4882a593Smuzhiyun * a refcount of 1 when free (unlike a normal page)
5632*4882a593Smuzhiyun */
5633*4882a593Smuzhiyun if (!page_ref_add_unless(page, 1, 1))
5634*4882a593Smuzhiyun return NULL;
5635*4882a593Smuzhiyun return page;
5636*4882a593Smuzhiyun }
5637*4882a593Smuzhiyun
5638*4882a593Smuzhiyun if (non_swap_entry(ent))
5639*4882a593Smuzhiyun return NULL;
5640*4882a593Smuzhiyun
5641*4882a593Smuzhiyun /*
5642*4882a593Smuzhiyun * Because lookup_swap_cache() updates some statistics counter,
5643*4882a593Smuzhiyun * we call find_get_page() with swapper_space directly.
5644*4882a593Smuzhiyun */
5645*4882a593Smuzhiyun page = find_get_page(swap_address_space(ent), swp_offset(ent));
5646*4882a593Smuzhiyun entry->val = ent.val;
5647*4882a593Smuzhiyun
5648*4882a593Smuzhiyun return page;
5649*4882a593Smuzhiyun }
5650*4882a593Smuzhiyun #else
5651*4882a593Smuzhiyun static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5652*4882a593Smuzhiyun pte_t ptent, swp_entry_t *entry)
5653*4882a593Smuzhiyun {
5654*4882a593Smuzhiyun return NULL;
5655*4882a593Smuzhiyun }
5656*4882a593Smuzhiyun #endif
5657*4882a593Smuzhiyun
5658*4882a593Smuzhiyun static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5659*4882a593Smuzhiyun unsigned long addr, pte_t ptent, swp_entry_t *entry)
5660*4882a593Smuzhiyun {
5661*4882a593Smuzhiyun if (!vma->vm_file) /* anonymous vma */
5662*4882a593Smuzhiyun return NULL;
5663*4882a593Smuzhiyun if (!(mc.flags & MOVE_FILE))
5664*4882a593Smuzhiyun return NULL;
5665*4882a593Smuzhiyun
5666*4882a593Smuzhiyun /* page is moved even if it's not RSS of this task (page-faulted). */
5667*4882a593Smuzhiyun /* shmem/tmpfs may report page out on swap: account for that too. */
5668*4882a593Smuzhiyun return find_get_incore_page(vma->vm_file->f_mapping,
5669*4882a593Smuzhiyun linear_page_index(vma, addr));
5670*4882a593Smuzhiyun }
5671*4882a593Smuzhiyun
5672*4882a593Smuzhiyun /**
5673*4882a593Smuzhiyun * mem_cgroup_move_account - move account of the page
5674*4882a593Smuzhiyun * @page: the page
5675*4882a593Smuzhiyun * @compound: charge the page as compound or small page
5676*4882a593Smuzhiyun * @from: mem_cgroup which the page is moved from.
5677*4882a593Smuzhiyun * @to: mem_cgroup which the page is moved to. @from != @to.
5678*4882a593Smuzhiyun *
5679*4882a593Smuzhiyun * The caller must make sure the page is not on LRU (isolate_page() is useful.)
5680*4882a593Smuzhiyun *
5681*4882a593Smuzhiyun * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
5682*4882a593Smuzhiyun * from old cgroup.
5683*4882a593Smuzhiyun */
5684*4882a593Smuzhiyun static int mem_cgroup_move_account(struct page *page,
5685*4882a593Smuzhiyun bool compound,
5686*4882a593Smuzhiyun struct mem_cgroup *from,
5687*4882a593Smuzhiyun struct mem_cgroup *to)
5688*4882a593Smuzhiyun {
5689*4882a593Smuzhiyun struct lruvec *from_vec, *to_vec;
5690*4882a593Smuzhiyun struct pglist_data *pgdat;
5691*4882a593Smuzhiyun unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
5692*4882a593Smuzhiyun int ret;
5693*4882a593Smuzhiyun
5694*4882a593Smuzhiyun VM_BUG_ON(from == to);
5695*4882a593Smuzhiyun VM_BUG_ON_PAGE(PageLRU(page), page);
5696*4882a593Smuzhiyun VM_BUG_ON(compound && !PageTransHuge(page));
5697*4882a593Smuzhiyun
5698*4882a593Smuzhiyun /*
5699*4882a593Smuzhiyun * Prevent mem_cgroup_migrate() from looking at
5700*4882a593Smuzhiyun * page->mem_cgroup of its source page while we change it.
5701*4882a593Smuzhiyun */
5702*4882a593Smuzhiyun ret = -EBUSY;
5703*4882a593Smuzhiyun if (!trylock_page(page))
5704*4882a593Smuzhiyun goto out;
5705*4882a593Smuzhiyun
5706*4882a593Smuzhiyun ret = -EINVAL;
5707*4882a593Smuzhiyun if (page->mem_cgroup != from)
5708*4882a593Smuzhiyun goto out_unlock;
5709*4882a593Smuzhiyun
5710*4882a593Smuzhiyun pgdat = page_pgdat(page);
5711*4882a593Smuzhiyun from_vec = mem_cgroup_lruvec(from, pgdat);
5712*4882a593Smuzhiyun to_vec = mem_cgroup_lruvec(to, pgdat);
5713*4882a593Smuzhiyun
5714*4882a593Smuzhiyun lock_page_memcg(page);
5715*4882a593Smuzhiyun
5716*4882a593Smuzhiyun if (PageAnon(page)) {
5717*4882a593Smuzhiyun if (page_mapped(page)) {
5718*4882a593Smuzhiyun __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5719*4882a593Smuzhiyun __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5720*4882a593Smuzhiyun if (PageTransHuge(page)) {
5721*4882a593Smuzhiyun __dec_lruvec_state(from_vec, NR_ANON_THPS);
5722*4882a593Smuzhiyun __inc_lruvec_state(to_vec, NR_ANON_THPS);
5723*4882a593Smuzhiyun }
5724*4882a593Smuzhiyun
5725*4882a593Smuzhiyun }
5726*4882a593Smuzhiyun } else {
5727*4882a593Smuzhiyun __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5728*4882a593Smuzhiyun __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
5729*4882a593Smuzhiyun
5730*4882a593Smuzhiyun if (PageSwapBacked(page)) {
5731*4882a593Smuzhiyun __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5732*4882a593Smuzhiyun __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5733*4882a593Smuzhiyun }
5734*4882a593Smuzhiyun
5735*4882a593Smuzhiyun if (page_mapped(page)) {
5736*4882a593Smuzhiyun __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5737*4882a593Smuzhiyun __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5738*4882a593Smuzhiyun }
5739*4882a593Smuzhiyun
5740*4882a593Smuzhiyun if (PageDirty(page)) {
5741*4882a593Smuzhiyun struct address_space *mapping = page_mapping(page);
5742*4882a593Smuzhiyun
5743*4882a593Smuzhiyun if (mapping_can_writeback(mapping)) {
5744*4882a593Smuzhiyun __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5745*4882a593Smuzhiyun -nr_pages);
5746*4882a593Smuzhiyun __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5747*4882a593Smuzhiyun nr_pages);
5748*4882a593Smuzhiyun }
5749*4882a593Smuzhiyun }
5750*4882a593Smuzhiyun }
5751*4882a593Smuzhiyun
5752*4882a593Smuzhiyun if (PageWriteback(page)) {
5753*4882a593Smuzhiyun __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5754*4882a593Smuzhiyun __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5755*4882a593Smuzhiyun }
5756*4882a593Smuzhiyun
5757*4882a593Smuzhiyun /*
5758*4882a593Smuzhiyun * All state has been migrated, let's switch to the new memcg.
5759*4882a593Smuzhiyun *
5760*4882a593Smuzhiyun * It is safe to change page->mem_cgroup here because the page
5761*4882a593Smuzhiyun * is referenced, charged, isolated, and locked: we can't race
5762*4882a593Smuzhiyun * with (un)charging, migration, LRU putback, or anything else
5763*4882a593Smuzhiyun * that would rely on a stable page->mem_cgroup.
5764*4882a593Smuzhiyun *
5765*4882a593Smuzhiyun * Note that lock_page_memcg is a memcg lock, not a page lock,
5766*4882a593Smuzhiyun * to save space. As soon as we switch page->mem_cgroup to a
5767*4882a593Smuzhiyun * new memcg that isn't locked, the above state can change
5768*4882a593Smuzhiyun * concurrently again. Make sure we're truly done with it.
5769*4882a593Smuzhiyun */
5770*4882a593Smuzhiyun smp_mb();
5771*4882a593Smuzhiyun
5772*4882a593Smuzhiyun css_get(&to->css);
5773*4882a593Smuzhiyun css_put(&from->css);
5774*4882a593Smuzhiyun
5775*4882a593Smuzhiyun page->mem_cgroup = to;
5776*4882a593Smuzhiyun
5777*4882a593Smuzhiyun __unlock_page_memcg(from);
5778*4882a593Smuzhiyun
5779*4882a593Smuzhiyun ret = 0;
5780*4882a593Smuzhiyun
5781*4882a593Smuzhiyun local_irq_disable();
5782*4882a593Smuzhiyun mem_cgroup_charge_statistics(to, page, nr_pages);
5783*4882a593Smuzhiyun memcg_check_events(to, page);
5784*4882a593Smuzhiyun mem_cgroup_charge_statistics(from, page, -nr_pages);
5785*4882a593Smuzhiyun memcg_check_events(from, page);
5786*4882a593Smuzhiyun local_irq_enable();
5787*4882a593Smuzhiyun out_unlock:
5788*4882a593Smuzhiyun unlock_page(page);
5789*4882a593Smuzhiyun out:
5790*4882a593Smuzhiyun return ret;
5791*4882a593Smuzhiyun }
5792*4882a593Smuzhiyun
5793*4882a593Smuzhiyun /**
5794*4882a593Smuzhiyun * get_mctgt_type - get target type of moving charge
5795*4882a593Smuzhiyun * @vma: the vma the pte to be checked belongs
5796*4882a593Smuzhiyun * @addr: the address corresponding to the pte to be checked
5797*4882a593Smuzhiyun * @ptent: the pte to be checked
5798*4882a593Smuzhiyun * @target: the pointer where the target page or swap entry will be stored (can be NULL)
5799*4882a593Smuzhiyun *
5800*4882a593Smuzhiyun * Returns
5801*4882a593Smuzhiyun * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
5802*4882a593Smuzhiyun * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
5803*4882a593Smuzhiyun * move charge. If @target is not NULL, the page is stored in target->page
5804*4882a593Smuzhiyun * with an extra refcount taken (callers should handle it).
5805*4882a593Smuzhiyun * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
5806*4882a593Smuzhiyun * target for charge migration. If @target is not NULL, the entry is stored
5807*4882a593Smuzhiyun * in target->ent.
5808*4882a593Smuzhiyun * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is MEMORY_DEVICE_PRIVATE
5809*4882a593Smuzhiyun * (so a ZONE_DEVICE page and thus not on the LRU).
5810*4882a593Smuzhiyun * For now such a page is charged like a regular page would be, as for all
5811*4882a593Smuzhiyun * intents and purposes it is just special memory taking the place of a
5812*4882a593Smuzhiyun * regular page.
5813*4882a593Smuzhiyun *
5814*4882a593Smuzhiyun * See Documentation/vm/hmm.rst and include/linux/hmm.h
5815*4882a593Smuzhiyun *
5816*4882a593Smuzhiyun * Called with pte lock held.
5817*4882a593Smuzhiyun */
5818*4882a593Smuzhiyun
5819*4882a593Smuzhiyun static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5820*4882a593Smuzhiyun unsigned long addr, pte_t ptent, union mc_target *target)
5821*4882a593Smuzhiyun {
5822*4882a593Smuzhiyun struct page *page = NULL;
5823*4882a593Smuzhiyun enum mc_target_type ret = MC_TARGET_NONE;
5824*4882a593Smuzhiyun swp_entry_t ent = { .val = 0 };
5825*4882a593Smuzhiyun
5826*4882a593Smuzhiyun if (pte_present(ptent))
5827*4882a593Smuzhiyun page = mc_handle_present_pte(vma, addr, ptent);
5828*4882a593Smuzhiyun else if (is_swap_pte(ptent))
5829*4882a593Smuzhiyun page = mc_handle_swap_pte(vma, ptent, &ent);
5830*4882a593Smuzhiyun else if (pte_none(ptent))
5831*4882a593Smuzhiyun page = mc_handle_file_pte(vma, addr, ptent, &ent);
5832*4882a593Smuzhiyun
5833*4882a593Smuzhiyun if (!page && !ent.val)
5834*4882a593Smuzhiyun return ret;
5835*4882a593Smuzhiyun if (page) {
5836*4882a593Smuzhiyun /*
5837*4882a593Smuzhiyun * Only do a loose check, without serialization.
5838*4882a593Smuzhiyun * mem_cgroup_move_account() checks whether the page is
5839*4882a593Smuzhiyun * valid or not under LRU exclusion.
5840*4882a593Smuzhiyun */
5841*4882a593Smuzhiyun if (page->mem_cgroup == mc.from) {
5842*4882a593Smuzhiyun ret = MC_TARGET_PAGE;
5843*4882a593Smuzhiyun if (is_device_private_page(page))
5844*4882a593Smuzhiyun ret = MC_TARGET_DEVICE;
5845*4882a593Smuzhiyun if (target)
5846*4882a593Smuzhiyun target->page = page;
5847*4882a593Smuzhiyun }
5848*4882a593Smuzhiyun if (!ret || !target)
5849*4882a593Smuzhiyun put_page(page);
5850*4882a593Smuzhiyun }
5851*4882a593Smuzhiyun /*
5852*4882a593Smuzhiyun * There is a swap entry and a page doesn't exist or isn't charged.
5853*4882a593Smuzhiyun * But we cannot move a tail-page in a THP.
5854*4882a593Smuzhiyun */
5855*4882a593Smuzhiyun if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5856*4882a593Smuzhiyun mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5857*4882a593Smuzhiyun ret = MC_TARGET_SWAP;
5858*4882a593Smuzhiyun if (target)
5859*4882a593Smuzhiyun target->ent = ent;
5860*4882a593Smuzhiyun }
5861*4882a593Smuzhiyun return ret;
5862*4882a593Smuzhiyun }
5863*4882a593Smuzhiyun
5864*4882a593Smuzhiyun #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5865*4882a593Smuzhiyun /*
5866*4882a593Smuzhiyun * We don't consider PMD-mapped swapping or file-mapped pages because THP does
5867*4882a593Smuzhiyun * not support them for now.
5868*4882a593Smuzhiyun * The caller should make sure that pmd_trans_huge(pmd) is true.
5869*4882a593Smuzhiyun */
5870*4882a593Smuzhiyun static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5871*4882a593Smuzhiyun unsigned long addr, pmd_t pmd, union mc_target *target)
5872*4882a593Smuzhiyun {
5873*4882a593Smuzhiyun struct page *page = NULL;
5874*4882a593Smuzhiyun enum mc_target_type ret = MC_TARGET_NONE;
5875*4882a593Smuzhiyun
5876*4882a593Smuzhiyun if (unlikely(is_swap_pmd(pmd))) {
5877*4882a593Smuzhiyun VM_BUG_ON(thp_migration_supported() &&
5878*4882a593Smuzhiyun !is_pmd_migration_entry(pmd));
5879*4882a593Smuzhiyun return ret;
5880*4882a593Smuzhiyun }
5881*4882a593Smuzhiyun page = pmd_page(pmd);
5882*4882a593Smuzhiyun VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5883*4882a593Smuzhiyun if (!(mc.flags & MOVE_ANON))
5884*4882a593Smuzhiyun return ret;
5885*4882a593Smuzhiyun if (page->mem_cgroup == mc.from) {
5886*4882a593Smuzhiyun ret = MC_TARGET_PAGE;
5887*4882a593Smuzhiyun if (target) {
5888*4882a593Smuzhiyun get_page(page);
5889*4882a593Smuzhiyun target->page = page;
5890*4882a593Smuzhiyun }
5891*4882a593Smuzhiyun }
5892*4882a593Smuzhiyun return ret;
5893*4882a593Smuzhiyun }
5894*4882a593Smuzhiyun #else
5895*4882a593Smuzhiyun static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5896*4882a593Smuzhiyun unsigned long addr, pmd_t pmd, union mc_target *target)
5897*4882a593Smuzhiyun {
5898*4882a593Smuzhiyun return MC_TARGET_NONE;
5899*4882a593Smuzhiyun }
5900*4882a593Smuzhiyun #endif
5901*4882a593Smuzhiyun
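/*
 * Count, into mc.precharge, how many pages (and swap entries) in this
 * range would be transferred by the pending charge move, so that the
 * destination memcg can be charged up front.
 */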
5902*4882a593Smuzhiyun static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5903*4882a593Smuzhiyun unsigned long addr, unsigned long end,
5904*4882a593Smuzhiyun struct mm_walk *walk)
5905*4882a593Smuzhiyun {
5906*4882a593Smuzhiyun struct vm_area_struct *vma = walk->vma;
5907*4882a593Smuzhiyun pte_t *pte;
5908*4882a593Smuzhiyun spinlock_t *ptl;
5909*4882a593Smuzhiyun
5910*4882a593Smuzhiyun ptl = pmd_trans_huge_lock(pmd, vma);
5911*4882a593Smuzhiyun if (ptl) {
5912*4882a593Smuzhiyun /*
5913*4882a593Smuzhiyun * Note there cannot be MC_TARGET_DEVICE for now as we do not
5914*4882a593Smuzhiyun * support transparent huge pages with MEMORY_DEVICE_PRIVATE, but
5915*4882a593Smuzhiyun * this might change.
5916*4882a593Smuzhiyun */
5917*4882a593Smuzhiyun if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5918*4882a593Smuzhiyun mc.precharge += HPAGE_PMD_NR;
5919*4882a593Smuzhiyun spin_unlock(ptl);
5920*4882a593Smuzhiyun return 0;
5921*4882a593Smuzhiyun }
5922*4882a593Smuzhiyun
5923*4882a593Smuzhiyun if (pmd_trans_unstable(pmd))
5924*4882a593Smuzhiyun return 0;
5925*4882a593Smuzhiyun pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5926*4882a593Smuzhiyun for (; addr != end; pte++, addr += PAGE_SIZE)
5927*4882a593Smuzhiyun if (get_mctgt_type(vma, addr, *pte, NULL))
5928*4882a593Smuzhiyun mc.precharge++; /* increment precharge temporarily */
5929*4882a593Smuzhiyun pte_unmap_unlock(pte - 1, ptl);
5930*4882a593Smuzhiyun cond_resched();
5931*4882a593Smuzhiyun
5932*4882a593Smuzhiyun return 0;
5933*4882a593Smuzhiyun }
5934*4882a593Smuzhiyun
5935*4882a593Smuzhiyun static const struct mm_walk_ops precharge_walk_ops = {
5936*4882a593Smuzhiyun .pmd_entry = mem_cgroup_count_precharge_pte_range,
5937*4882a593Smuzhiyun };
5938*4882a593Smuzhiyun
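/*
 * Walk the whole address space of @mm and return the number of pages
 * (and swap entries) that a charge move would have to transfer.
 */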
5939*4882a593Smuzhiyun static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5940*4882a593Smuzhiyun {
5941*4882a593Smuzhiyun unsigned long precharge;
5942*4882a593Smuzhiyun
5943*4882a593Smuzhiyun mmap_read_lock(mm);
5944*4882a593Smuzhiyun walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5945*4882a593Smuzhiyun mmap_read_unlock(mm);
5946*4882a593Smuzhiyun
5947*4882a593Smuzhiyun precharge = mc.precharge;
5948*4882a593Smuzhiyun mc.precharge = 0;
5949*4882a593Smuzhiyun
5950*4882a593Smuzhiyun return precharge;
5951*4882a593Smuzhiyun }
5952*4882a593Smuzhiyun
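/*
 * Charge mc.to up front for everything the walk counted; mc.moving_task
 * records which task is performing the move.
 */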
5953*4882a593Smuzhiyun static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5954*4882a593Smuzhiyun {
5955*4882a593Smuzhiyun unsigned long precharge = mem_cgroup_count_precharge(mm);
5956*4882a593Smuzhiyun
5957*4882a593Smuzhiyun VM_BUG_ON(mc.moving_task);
5958*4882a593Smuzhiyun mc.moving_task = current;
5959*4882a593Smuzhiyun return mem_cgroup_do_precharge(precharge);
5960*4882a593Smuzhiyun }
5961*4882a593Smuzhiyun
5962*4882a593Smuzhiyun /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5963*4882a593Smuzhiyun static void __mem_cgroup_clear_mc(void)
5964*4882a593Smuzhiyun {
5965*4882a593Smuzhiyun struct mem_cgroup *from = mc.from;
5966*4882a593Smuzhiyun struct mem_cgroup *to = mc.to;
5967*4882a593Smuzhiyun
5968*4882a593Smuzhiyun /* we must uncharge all the leftover precharges from mc.to */
5969*4882a593Smuzhiyun if (mc.precharge) {
5970*4882a593Smuzhiyun cancel_charge(mc.to, mc.precharge);
5971*4882a593Smuzhiyun mc.precharge = 0;
5972*4882a593Smuzhiyun }
5973*4882a593Smuzhiyun /*
5974*4882a593Smuzhiyun * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5975*4882a593Smuzhiyun * we must uncharge here.
5976*4882a593Smuzhiyun */
5977*4882a593Smuzhiyun if (mc.moved_charge) {
5978*4882a593Smuzhiyun cancel_charge(mc.from, mc.moved_charge);
5979*4882a593Smuzhiyun mc.moved_charge = 0;
5980*4882a593Smuzhiyun }
5981*4882a593Smuzhiyun /* we must fixup refcnts and charges */
5982*4882a593Smuzhiyun if (mc.moved_swap) {
5983*4882a593Smuzhiyun /* uncharge swap account from the old cgroup */
5984*4882a593Smuzhiyun if (!mem_cgroup_is_root(mc.from))
5985*4882a593Smuzhiyun page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5986*4882a593Smuzhiyun
5987*4882a593Smuzhiyun mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5988*4882a593Smuzhiyun
5989*4882a593Smuzhiyun /*
5990*4882a593Smuzhiyun * we charged both to->memory and to->memsw, so we
5991*4882a593Smuzhiyun * should uncharge to->memory.
5992*4882a593Smuzhiyun */
5993*4882a593Smuzhiyun if (!mem_cgroup_is_root(mc.to))
5994*4882a593Smuzhiyun page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5995*4882a593Smuzhiyun
5996*4882a593Smuzhiyun mc.moved_swap = 0;
5997*4882a593Smuzhiyun }
5998*4882a593Smuzhiyun memcg_oom_recover(from);
5999*4882a593Smuzhiyun memcg_oom_recover(to);
6000*4882a593Smuzhiyun wake_up_all(&mc.waitq);
6001*4882a593Smuzhiyun }
6002*4882a593Smuzhiyun
6003*4882a593Smuzhiyun static void mem_cgroup_clear_mc(void)
6004*4882a593Smuzhiyun {
6005*4882a593Smuzhiyun struct mm_struct *mm = mc.mm;
6006*4882a593Smuzhiyun
6007*4882a593Smuzhiyun /*
6008*4882a593Smuzhiyun * we must clear moving_task before waking up waiters at the end of
6009*4882a593Smuzhiyun * task migration.
6010*4882a593Smuzhiyun */
6011*4882a593Smuzhiyun mc.moving_task = NULL;
6012*4882a593Smuzhiyun __mem_cgroup_clear_mc();
6013*4882a593Smuzhiyun spin_lock(&mc.lock);
6014*4882a593Smuzhiyun mc.from = NULL;
6015*4882a593Smuzhiyun mc.to = NULL;
6016*4882a593Smuzhiyun mc.mm = NULL;
6017*4882a593Smuzhiyun spin_unlock(&mc.lock);
6018*4882a593Smuzhiyun
6019*4882a593Smuzhiyun mmput(mm);
6020*4882a593Smuzhiyun }
6021*4882a593Smuzhiyun
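/*
 * cgroup ->can_attach() callback: on the legacy hierarchy, if the
 * destination memcg has move_charge_at_immigrate set and the migrating
 * task owns its mm, set up the move-charge state (mc) and precharge the
 * destination memcg.
 */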
6022*4882a593Smuzhiyun static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
6023*4882a593Smuzhiyun {
6024*4882a593Smuzhiyun struct cgroup_subsys_state *css;
6025*4882a593Smuzhiyun struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
6026*4882a593Smuzhiyun struct mem_cgroup *from;
6027*4882a593Smuzhiyun struct task_struct *leader, *p;
6028*4882a593Smuzhiyun struct mm_struct *mm;
6029*4882a593Smuzhiyun unsigned long move_flags;
6030*4882a593Smuzhiyun int ret = 0;
6031*4882a593Smuzhiyun
6032*4882a593Smuzhiyun /* charge immigration isn't supported on the default hierarchy */
6033*4882a593Smuzhiyun if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
6034*4882a593Smuzhiyun return 0;
6035*4882a593Smuzhiyun
6036*4882a593Smuzhiyun /*
6037*4882a593Smuzhiyun * Multi-process migrations only happen on the default hierarchy
6038*4882a593Smuzhiyun * where charge immigration is not used. Perform charge
6039*4882a593Smuzhiyun * immigration if @tset contains a leader and whine if there are
6040*4882a593Smuzhiyun * multiple.
6041*4882a593Smuzhiyun */
6042*4882a593Smuzhiyun p = NULL;
6043*4882a593Smuzhiyun cgroup_taskset_for_each_leader(leader, css, tset) {
6044*4882a593Smuzhiyun WARN_ON_ONCE(p);
6045*4882a593Smuzhiyun p = leader;
6046*4882a593Smuzhiyun memcg = mem_cgroup_from_css(css);
6047*4882a593Smuzhiyun }
6048*4882a593Smuzhiyun if (!p)
6049*4882a593Smuzhiyun return 0;
6050*4882a593Smuzhiyun
6051*4882a593Smuzhiyun /*
6052*4882a593Smuzhiyun * We are now committed to this value, whatever it is. Changes to this
6053*4882a593Smuzhiyun * tunable will only affect upcoming migrations, not the current one.
6054*4882a593Smuzhiyun * So we need to save it, and keep using it.
6055*4882a593Smuzhiyun */
6056*4882a593Smuzhiyun move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
6057*4882a593Smuzhiyun if (!move_flags)
6058*4882a593Smuzhiyun return 0;
6059*4882a593Smuzhiyun
6060*4882a593Smuzhiyun from = mem_cgroup_from_task(p);
6061*4882a593Smuzhiyun
6062*4882a593Smuzhiyun VM_BUG_ON(from == memcg);
6063*4882a593Smuzhiyun
6064*4882a593Smuzhiyun mm = get_task_mm(p);
6065*4882a593Smuzhiyun if (!mm)
6066*4882a593Smuzhiyun return 0;
6067*4882a593Smuzhiyun /* We move charges only when we move an owner of the mm */
6068*4882a593Smuzhiyun if (mm->owner == p) {
6069*4882a593Smuzhiyun VM_BUG_ON(mc.from);
6070*4882a593Smuzhiyun VM_BUG_ON(mc.to);
6071*4882a593Smuzhiyun VM_BUG_ON(mc.precharge);
6072*4882a593Smuzhiyun VM_BUG_ON(mc.moved_charge);
6073*4882a593Smuzhiyun VM_BUG_ON(mc.moved_swap);
6074*4882a593Smuzhiyun
6075*4882a593Smuzhiyun spin_lock(&mc.lock);
6076*4882a593Smuzhiyun mc.mm = mm;
6077*4882a593Smuzhiyun mc.from = from;
6078*4882a593Smuzhiyun mc.to = memcg;
6079*4882a593Smuzhiyun mc.flags = move_flags;
6080*4882a593Smuzhiyun spin_unlock(&mc.lock);
6081*4882a593Smuzhiyun /* We set mc.moving_task later */
6082*4882a593Smuzhiyun
6083*4882a593Smuzhiyun ret = mem_cgroup_precharge_mc(mm);
6084*4882a593Smuzhiyun if (ret)
6085*4882a593Smuzhiyun mem_cgroup_clear_mc();
6086*4882a593Smuzhiyun } else {
6087*4882a593Smuzhiyun mmput(mm);
6088*4882a593Smuzhiyun }
6089*4882a593Smuzhiyun return ret;
6090*4882a593Smuzhiyun }
6091*4882a593Smuzhiyun
6092*4882a593Smuzhiyun static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6093*4882a593Smuzhiyun {
6094*4882a593Smuzhiyun if (mc.to)
6095*4882a593Smuzhiyun mem_cgroup_clear_mc();
6096*4882a593Smuzhiyun }
6097*4882a593Smuzhiyun
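/*
 * Page-walk callback that performs the actual charge moving: THP PMDs,
 * device private pages, regular pages and swap entries are moved from
 * mc.from to mc.to, consuming the precharges taken in can_attach().
 */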
6098*4882a593Smuzhiyun static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6099*4882a593Smuzhiyun unsigned long addr, unsigned long end,
6100*4882a593Smuzhiyun struct mm_walk *walk)
6101*4882a593Smuzhiyun {
6102*4882a593Smuzhiyun int ret = 0;
6103*4882a593Smuzhiyun struct vm_area_struct *vma = walk->vma;
6104*4882a593Smuzhiyun pte_t *pte;
6105*4882a593Smuzhiyun spinlock_t *ptl;
6106*4882a593Smuzhiyun enum mc_target_type target_type;
6107*4882a593Smuzhiyun union mc_target target;
6108*4882a593Smuzhiyun struct page *page;
6109*4882a593Smuzhiyun
6110*4882a593Smuzhiyun ptl = pmd_trans_huge_lock(pmd, vma);
6111*4882a593Smuzhiyun if (ptl) {
6112*4882a593Smuzhiyun if (mc.precharge < HPAGE_PMD_NR) {
6113*4882a593Smuzhiyun spin_unlock(ptl);
6114*4882a593Smuzhiyun return 0;
6115*4882a593Smuzhiyun }
6116*4882a593Smuzhiyun target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6117*4882a593Smuzhiyun if (target_type == MC_TARGET_PAGE) {
6118*4882a593Smuzhiyun page = target.page;
6119*4882a593Smuzhiyun if (!isolate_lru_page(page)) {
6120*4882a593Smuzhiyun if (!mem_cgroup_move_account(page, true,
6121*4882a593Smuzhiyun mc.from, mc.to)) {
6122*4882a593Smuzhiyun mc.precharge -= HPAGE_PMD_NR;
6123*4882a593Smuzhiyun mc.moved_charge += HPAGE_PMD_NR;
6124*4882a593Smuzhiyun }
6125*4882a593Smuzhiyun putback_lru_page(page);
6126*4882a593Smuzhiyun }
6127*4882a593Smuzhiyun put_page(page);
6128*4882a593Smuzhiyun } else if (target_type == MC_TARGET_DEVICE) {
6129*4882a593Smuzhiyun page = target.page;
6130*4882a593Smuzhiyun if (!mem_cgroup_move_account(page, true,
6131*4882a593Smuzhiyun mc.from, mc.to)) {
6132*4882a593Smuzhiyun mc.precharge -= HPAGE_PMD_NR;
6133*4882a593Smuzhiyun mc.moved_charge += HPAGE_PMD_NR;
6134*4882a593Smuzhiyun }
6135*4882a593Smuzhiyun put_page(page);
6136*4882a593Smuzhiyun }
6137*4882a593Smuzhiyun spin_unlock(ptl);
6138*4882a593Smuzhiyun return 0;
6139*4882a593Smuzhiyun }
6140*4882a593Smuzhiyun
6141*4882a593Smuzhiyun if (pmd_trans_unstable(pmd))
6142*4882a593Smuzhiyun return 0;
6143*4882a593Smuzhiyun retry:
6144*4882a593Smuzhiyun pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6145*4882a593Smuzhiyun for (; addr != end; addr += PAGE_SIZE) {
6146*4882a593Smuzhiyun pte_t ptent = *(pte++);
6147*4882a593Smuzhiyun bool device = false;
6148*4882a593Smuzhiyun swp_entry_t ent;
6149*4882a593Smuzhiyun
6150*4882a593Smuzhiyun if (!mc.precharge)
6151*4882a593Smuzhiyun break;
6152*4882a593Smuzhiyun
6153*4882a593Smuzhiyun switch (get_mctgt_type(vma, addr, ptent, &target)) {
6154*4882a593Smuzhiyun case MC_TARGET_DEVICE:
6155*4882a593Smuzhiyun device = true;
6156*4882a593Smuzhiyun fallthrough;
6157*4882a593Smuzhiyun case MC_TARGET_PAGE:
6158*4882a593Smuzhiyun page = target.page;
6159*4882a593Smuzhiyun /*
6160*4882a593Smuzhiyun * We can have part of a split pmd here. Moving it can
6161*4882a593Smuzhiyun * be done, but it would be too convoluted, so simply
6162*4882a593Smuzhiyun * ignore such a partial THP and keep it in the original
6163*4882a593Smuzhiyun * memcg. There should be somebody mapping the head.
6164*4882a593Smuzhiyun */
6165*4882a593Smuzhiyun if (PageTransCompound(page))
6166*4882a593Smuzhiyun goto put;
6167*4882a593Smuzhiyun if (!device && isolate_lru_page(page))
6168*4882a593Smuzhiyun goto put;
6169*4882a593Smuzhiyun if (!mem_cgroup_move_account(page, false,
6170*4882a593Smuzhiyun mc.from, mc.to)) {
6171*4882a593Smuzhiyun mc.precharge--;
6172*4882a593Smuzhiyun /* we uncharge from mc.from later. */
6173*4882a593Smuzhiyun mc.moved_charge++;
6174*4882a593Smuzhiyun }
6175*4882a593Smuzhiyun if (!device)
6176*4882a593Smuzhiyun putback_lru_page(page);
6177*4882a593Smuzhiyun put: /* get_mctgt_type() gets the page */
6178*4882a593Smuzhiyun put_page(page);
6179*4882a593Smuzhiyun break;
6180*4882a593Smuzhiyun case MC_TARGET_SWAP:
6181*4882a593Smuzhiyun ent = target.ent;
6182*4882a593Smuzhiyun if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6183*4882a593Smuzhiyun mc.precharge--;
6184*4882a593Smuzhiyun mem_cgroup_id_get_many(mc.to, 1);
6185*4882a593Smuzhiyun /* we fixup other refcnts and charges later. */
6186*4882a593Smuzhiyun mc.moved_swap++;
6187*4882a593Smuzhiyun }
6188*4882a593Smuzhiyun break;
6189*4882a593Smuzhiyun default:
6190*4882a593Smuzhiyun break;
6191*4882a593Smuzhiyun }
6192*4882a593Smuzhiyun }
6193*4882a593Smuzhiyun pte_unmap_unlock(pte - 1, ptl);
6194*4882a593Smuzhiyun cond_resched();
6195*4882a593Smuzhiyun
6196*4882a593Smuzhiyun if (addr != end) {
6197*4882a593Smuzhiyun /*
6198*4882a593Smuzhiyun * We have consumed all the precharges we got in can_attach().
6199*4882a593Smuzhiyun * We try to charge one by one, but don't do any additional
6200*4882a593Smuzhiyun * charges to mc.to if we have already failed to charge once in
6201*4882a593Smuzhiyun * the attach() phase.
6202*4882a593Smuzhiyun */
6203*4882a593Smuzhiyun ret = mem_cgroup_do_precharge(1);
6204*4882a593Smuzhiyun if (!ret)
6205*4882a593Smuzhiyun goto retry;
6206*4882a593Smuzhiyun }
6207*4882a593Smuzhiyun
6208*4882a593Smuzhiyun return ret;
6209*4882a593Smuzhiyun }
6210*4882a593Smuzhiyun
6211*4882a593Smuzhiyun static const struct mm_walk_ops charge_walk_ops = {
6212*4882a593Smuzhiyun .pmd_entry = mem_cgroup_move_charge_pte_range,
6213*4882a593Smuzhiyun };
6214*4882a593Smuzhiyun
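/*
 * Move the charges counted during can_attach() from mc.from to mc.to by
 * walking mc.mm's entire address space. If the mmap_lock cannot be
 * taken, all extra charges are cancelled and the walk is retried.
 */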
6215*4882a593Smuzhiyun static void mem_cgroup_move_charge(void)
6216*4882a593Smuzhiyun {
6217*4882a593Smuzhiyun lru_add_drain_all();
6218*4882a593Smuzhiyun /*
6219*4882a593Smuzhiyun * Signal lock_page_memcg() to take the memcg's move_lock
6220*4882a593Smuzhiyun * while we're moving its pages to another memcg. Then wait
6221*4882a593Smuzhiyun * for already started RCU-only updates to finish.
6222*4882a593Smuzhiyun */
6223*4882a593Smuzhiyun atomic_inc(&mc.from->moving_account);
6224*4882a593Smuzhiyun synchronize_rcu();
6225*4882a593Smuzhiyun retry:
6226*4882a593Smuzhiyun if (unlikely(!mmap_read_trylock(mc.mm))) {
6227*4882a593Smuzhiyun /*
6228*4882a593Smuzhiyun * Someone who is holding the mmap_lock might be waiting on
6229*4882a593Smuzhiyun * the waitq. So we cancel all extra charges, wake up all waiters,
6230*4882a593Smuzhiyun * and retry. Because we cancel precharges, we might not be able
6231*4882a593Smuzhiyun * to move enough charges, but moving charges is a best-effort
6232*4882a593Smuzhiyun * feature anyway, so it wouldn't be a big problem.
6233*4882a593Smuzhiyun */
6234*4882a593Smuzhiyun __mem_cgroup_clear_mc();
6235*4882a593Smuzhiyun cond_resched();
6236*4882a593Smuzhiyun goto retry;
6237*4882a593Smuzhiyun }
6238*4882a593Smuzhiyun /*
6239*4882a593Smuzhiyun * When we have consumed all precharges and failed in doing
6240*4882a593Smuzhiyun * additional charge, the page walk just aborts.
6241*4882a593Smuzhiyun */
6242*4882a593Smuzhiyun walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6243*4882a593Smuzhiyun NULL);
6244*4882a593Smuzhiyun
6245*4882a593Smuzhiyun mmap_read_unlock(mc.mm);
6246*4882a593Smuzhiyun atomic_dec(&mc.from->moving_account);
6247*4882a593Smuzhiyun }
6248*4882a593Smuzhiyun
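/* cgroup ->post_attach() callback: perform the deferred charge move. */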
6249*4882a593Smuzhiyun static void mem_cgroup_move_task(void)
6250*4882a593Smuzhiyun {
6251*4882a593Smuzhiyun if (mc.to) {
6252*4882a593Smuzhiyun mem_cgroup_move_charge();
6253*4882a593Smuzhiyun mem_cgroup_clear_mc();
6254*4882a593Smuzhiyun }
6255*4882a593Smuzhiyun }
6256*4882a593Smuzhiyun #else /* !CONFIG_MMU */
6257*4882a593Smuzhiyun static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
6258*4882a593Smuzhiyun {
6259*4882a593Smuzhiyun return 0;
6260*4882a593Smuzhiyun }
6261*4882a593Smuzhiyun static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6262*4882a593Smuzhiyun {
6263*4882a593Smuzhiyun }
6264*4882a593Smuzhiyun static void mem_cgroup_move_task(void)
6265*4882a593Smuzhiyun {
6266*4882a593Smuzhiyun }
6267*4882a593Smuzhiyun #endif
6268*4882a593Smuzhiyun
6269*4882a593Smuzhiyun /*
6270*4882a593Smuzhiyun * Cgroup retains root cgroups across [un]mount cycles making it necessary
6271*4882a593Smuzhiyun * to verify whether we're attached to the default hierarchy on each mount
6272*4882a593Smuzhiyun * attempt.
6273*4882a593Smuzhiyun */
6274*4882a593Smuzhiyun static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
6275*4882a593Smuzhiyun {
6276*4882a593Smuzhiyun /*
6277*4882a593Smuzhiyun * use_hierarchy is forced on the default hierarchy. cgroup core
6278*4882a593Smuzhiyun * guarantees that @root doesn't have any children, so turning it
6279*4882a593Smuzhiyun * on for the root memcg is enough.
6280*4882a593Smuzhiyun */
6281*4882a593Smuzhiyun if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
6282*4882a593Smuzhiyun root_mem_cgroup->use_hierarchy = true;
6283*4882a593Smuzhiyun else
6284*4882a593Smuzhiyun root_mem_cgroup->use_hierarchy = false;
6285*4882a593Smuzhiyun }
6286*4882a593Smuzhiyun
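/* Print a memcg tunable in bytes, or "max" for PAGE_COUNTER_MAX. */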
6287*4882a593Smuzhiyun static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6288*4882a593Smuzhiyun {
6289*4882a593Smuzhiyun if (value == PAGE_COUNTER_MAX)
6290*4882a593Smuzhiyun seq_puts(m, "max\n");
6291*4882a593Smuzhiyun else
6292*4882a593Smuzhiyun seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6293*4882a593Smuzhiyun
6294*4882a593Smuzhiyun return 0;
6295*4882a593Smuzhiyun }
6296*4882a593Smuzhiyun
6297*4882a593Smuzhiyun static u64 memory_current_read(struct cgroup_subsys_state *css,
6298*4882a593Smuzhiyun struct cftype *cft)
6299*4882a593Smuzhiyun {
6300*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6301*4882a593Smuzhiyun
6302*4882a593Smuzhiyun return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
6303*4882a593Smuzhiyun }
6304*4882a593Smuzhiyun
6305*4882a593Smuzhiyun static int memory_min_show(struct seq_file *m, void *v)
6306*4882a593Smuzhiyun {
6307*4882a593Smuzhiyun return seq_puts_memcg_tunable(m,
6308*4882a593Smuzhiyun READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
6309*4882a593Smuzhiyun }
6310*4882a593Smuzhiyun
6311*4882a593Smuzhiyun static ssize_t memory_min_write(struct kernfs_open_file *of,
6312*4882a593Smuzhiyun char *buf, size_t nbytes, loff_t off)
6313*4882a593Smuzhiyun {
6314*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6315*4882a593Smuzhiyun unsigned long min;
6316*4882a593Smuzhiyun int err;
6317*4882a593Smuzhiyun
6318*4882a593Smuzhiyun buf = strstrip(buf);
6319*4882a593Smuzhiyun err = page_counter_memparse(buf, "max", &min);
6320*4882a593Smuzhiyun if (err)
6321*4882a593Smuzhiyun return err;
6322*4882a593Smuzhiyun
6323*4882a593Smuzhiyun page_counter_set_min(&memcg->memory, min);
6324*4882a593Smuzhiyun
6325*4882a593Smuzhiyun return nbytes;
6326*4882a593Smuzhiyun }
6327*4882a593Smuzhiyun
6328*4882a593Smuzhiyun static int memory_low_show(struct seq_file *m, void *v)
6329*4882a593Smuzhiyun {
6330*4882a593Smuzhiyun return seq_puts_memcg_tunable(m,
6331*4882a593Smuzhiyun READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6332*4882a593Smuzhiyun }
6333*4882a593Smuzhiyun
6334*4882a593Smuzhiyun static ssize_t memory_low_write(struct kernfs_open_file *of,
6335*4882a593Smuzhiyun char *buf, size_t nbytes, loff_t off)
6336*4882a593Smuzhiyun {
6337*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6338*4882a593Smuzhiyun unsigned long low;
6339*4882a593Smuzhiyun int err;
6340*4882a593Smuzhiyun
6341*4882a593Smuzhiyun buf = strstrip(buf);
6342*4882a593Smuzhiyun err = page_counter_memparse(buf, "max", &low);
6343*4882a593Smuzhiyun if (err)
6344*4882a593Smuzhiyun return err;
6345*4882a593Smuzhiyun
6346*4882a593Smuzhiyun page_counter_set_low(&memcg->memory, low);
6347*4882a593Smuzhiyun
6348*4882a593Smuzhiyun return nbytes;
6349*4882a593Smuzhiyun }
6350*4882a593Smuzhiyun
6351*4882a593Smuzhiyun static int memory_high_show(struct seq_file *m, void *v)
6352*4882a593Smuzhiyun {
6353*4882a593Smuzhiyun return seq_puts_memcg_tunable(m,
6354*4882a593Smuzhiyun READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
6355*4882a593Smuzhiyun }
6356*4882a593Smuzhiyun
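/*
 * Writing to memory.high sets the throttling limit and then reclaims the
 * cgroup down towards it, e.g.:
 *
 *	echo 512M > memory.high
 *
 * Reclaim gives up after MAX_RECLAIM_RETRIES failed attempts or when a
 * signal is pending; the new limit stays in place either way.
 */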
6357*4882a593Smuzhiyun static ssize_t memory_high_write(struct kernfs_open_file *of,
6358*4882a593Smuzhiyun char *buf, size_t nbytes, loff_t off)
6359*4882a593Smuzhiyun {
6360*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6361*4882a593Smuzhiyun unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6362*4882a593Smuzhiyun bool drained = false;
6363*4882a593Smuzhiyun unsigned long high;
6364*4882a593Smuzhiyun int err;
6365*4882a593Smuzhiyun
6366*4882a593Smuzhiyun buf = strstrip(buf);
6367*4882a593Smuzhiyun err = page_counter_memparse(buf, "max", &high);
6368*4882a593Smuzhiyun if (err)
6369*4882a593Smuzhiyun return err;
6370*4882a593Smuzhiyun
6371*4882a593Smuzhiyun page_counter_set_high(&memcg->memory, high);
6372*4882a593Smuzhiyun
6373*4882a593Smuzhiyun for (;;) {
6374*4882a593Smuzhiyun unsigned long nr_pages = page_counter_read(&memcg->memory);
6375*4882a593Smuzhiyun unsigned long reclaimed;
6376*4882a593Smuzhiyun
6377*4882a593Smuzhiyun if (nr_pages <= high)
6378*4882a593Smuzhiyun break;
6379*4882a593Smuzhiyun
6380*4882a593Smuzhiyun if (signal_pending(current))
6381*4882a593Smuzhiyun break;
6382*4882a593Smuzhiyun
6383*4882a593Smuzhiyun if (!drained) {
6384*4882a593Smuzhiyun drain_all_stock(memcg);
6385*4882a593Smuzhiyun drained = true;
6386*4882a593Smuzhiyun continue;
6387*4882a593Smuzhiyun }
6388*4882a593Smuzhiyun
6389*4882a593Smuzhiyun reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6390*4882a593Smuzhiyun GFP_KERNEL, true);
6391*4882a593Smuzhiyun
6392*4882a593Smuzhiyun if (!reclaimed && !nr_retries--)
6393*4882a593Smuzhiyun break;
6394*4882a593Smuzhiyun }
6395*4882a593Smuzhiyun
6396*4882a593Smuzhiyun memcg_wb_domain_size_changed(memcg);
6397*4882a593Smuzhiyun return nbytes;
6398*4882a593Smuzhiyun }
6399*4882a593Smuzhiyun
6400*4882a593Smuzhiyun static int memory_max_show(struct seq_file *m, void *v)
6401*4882a593Smuzhiyun {
6402*4882a593Smuzhiyun return seq_puts_memcg_tunable(m,
6403*4882a593Smuzhiyun READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6404*4882a593Smuzhiyun }
6405*4882a593Smuzhiyun
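/*
 * Writing to memory.max sets the hard limit. Usage above the new limit
 * is reclaimed; once the reclaim retries are exhausted, the memcg OOM
 * killer is invoked until usage fits or the writer is signalled.
 */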
6406*4882a593Smuzhiyun static ssize_t memory_max_write(struct kernfs_open_file *of,
6407*4882a593Smuzhiyun char *buf, size_t nbytes, loff_t off)
6408*4882a593Smuzhiyun {
6409*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6410*4882a593Smuzhiyun unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
6411*4882a593Smuzhiyun bool drained = false;
6412*4882a593Smuzhiyun unsigned long max;
6413*4882a593Smuzhiyun int err;
6414*4882a593Smuzhiyun
6415*4882a593Smuzhiyun buf = strstrip(buf);
6416*4882a593Smuzhiyun err = page_counter_memparse(buf, "max", &max);
6417*4882a593Smuzhiyun if (err)
6418*4882a593Smuzhiyun return err;
6419*4882a593Smuzhiyun
6420*4882a593Smuzhiyun xchg(&memcg->memory.max, max);
6421*4882a593Smuzhiyun
6422*4882a593Smuzhiyun for (;;) {
6423*4882a593Smuzhiyun unsigned long nr_pages = page_counter_read(&memcg->memory);
6424*4882a593Smuzhiyun
6425*4882a593Smuzhiyun if (nr_pages <= max)
6426*4882a593Smuzhiyun break;
6427*4882a593Smuzhiyun
6428*4882a593Smuzhiyun if (signal_pending(current))
6429*4882a593Smuzhiyun break;
6430*4882a593Smuzhiyun
6431*4882a593Smuzhiyun if (!drained) {
6432*4882a593Smuzhiyun drain_all_stock(memcg);
6433*4882a593Smuzhiyun drained = true;
6434*4882a593Smuzhiyun continue;
6435*4882a593Smuzhiyun }
6436*4882a593Smuzhiyun
6437*4882a593Smuzhiyun if (nr_reclaims) {
6438*4882a593Smuzhiyun if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6439*4882a593Smuzhiyun GFP_KERNEL, true))
6440*4882a593Smuzhiyun nr_reclaims--;
6441*4882a593Smuzhiyun continue;
6442*4882a593Smuzhiyun }
6443*4882a593Smuzhiyun
6444*4882a593Smuzhiyun memcg_memory_event(memcg, MEMCG_OOM);
6445*4882a593Smuzhiyun if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6446*4882a593Smuzhiyun break;
6447*4882a593Smuzhiyun }
6448*4882a593Smuzhiyun
6449*4882a593Smuzhiyun memcg_wb_domain_size_changed(memcg);
6450*4882a593Smuzhiyun return nbytes;
6451*4882a593Smuzhiyun }
6452*4882a593Smuzhiyun
6453*4882a593Smuzhiyun static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6454*4882a593Smuzhiyun {
6455*4882a593Smuzhiyun seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6456*4882a593Smuzhiyun seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6457*4882a593Smuzhiyun seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6458*4882a593Smuzhiyun seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6459*4882a593Smuzhiyun seq_printf(m, "oom_kill %lu\n",
6460*4882a593Smuzhiyun atomic_long_read(&events[MEMCG_OOM_KILL]));
6461*4882a593Smuzhiyun }
6462*4882a593Smuzhiyun
6463*4882a593Smuzhiyun static int memory_events_show(struct seq_file *m, void *v)
6464*4882a593Smuzhiyun {
6465*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6466*4882a593Smuzhiyun
6467*4882a593Smuzhiyun __memory_events_show(m, memcg->memory_events);
6468*4882a593Smuzhiyun return 0;
6469*4882a593Smuzhiyun }
6470*4882a593Smuzhiyun
6471*4882a593Smuzhiyun static int memory_events_local_show(struct seq_file *m, void *v)
6472*4882a593Smuzhiyun {
6473*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6474*4882a593Smuzhiyun
6475*4882a593Smuzhiyun __memory_events_show(m, memcg->memory_events_local);
6476*4882a593Smuzhiyun return 0;
6477*4882a593Smuzhiyun }
6478*4882a593Smuzhiyun
6479*4882a593Smuzhiyun static int memory_stat_show(struct seq_file *m, void *v)
6480*4882a593Smuzhiyun {
6481*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6482*4882a593Smuzhiyun char *buf;
6483*4882a593Smuzhiyun
6484*4882a593Smuzhiyun buf = memory_stat_format(memcg);
6485*4882a593Smuzhiyun if (!buf)
6486*4882a593Smuzhiyun return -ENOMEM;
6487*4882a593Smuzhiyun seq_puts(m, buf);
6488*4882a593Smuzhiyun kfree(buf);
6489*4882a593Smuzhiyun return 0;
6490*4882a593Smuzhiyun }
6491*4882a593Smuzhiyun
6492*4882a593Smuzhiyun #ifdef CONFIG_NUMA
6493*4882a593Smuzhiyun static int memory_numa_stat_show(struct seq_file *m, void *v)
6494*4882a593Smuzhiyun {
6495*4882a593Smuzhiyun int i;
6496*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6497*4882a593Smuzhiyun
6498*4882a593Smuzhiyun for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6499*4882a593Smuzhiyun int nid;
6500*4882a593Smuzhiyun
6501*4882a593Smuzhiyun if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6502*4882a593Smuzhiyun continue;
6503*4882a593Smuzhiyun
6504*4882a593Smuzhiyun seq_printf(m, "%s", memory_stats[i].name);
6505*4882a593Smuzhiyun for_each_node_state(nid, N_MEMORY) {
6506*4882a593Smuzhiyun u64 size;
6507*4882a593Smuzhiyun struct lruvec *lruvec;
6508*4882a593Smuzhiyun
6509*4882a593Smuzhiyun lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6510*4882a593Smuzhiyun size = lruvec_page_state(lruvec, memory_stats[i].idx);
6511*4882a593Smuzhiyun size *= memory_stats[i].ratio;
6512*4882a593Smuzhiyun seq_printf(m, " N%d=%llu", nid, size);
6513*4882a593Smuzhiyun }
6514*4882a593Smuzhiyun seq_putc(m, '\n');
6515*4882a593Smuzhiyun }
6516*4882a593Smuzhiyun
6517*4882a593Smuzhiyun return 0;
6518*4882a593Smuzhiyun }
6519*4882a593Smuzhiyun #endif
6520*4882a593Smuzhiyun
6521*4882a593Smuzhiyun static int memory_oom_group_show(struct seq_file *m, void *v)
6522*4882a593Smuzhiyun {
6523*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6524*4882a593Smuzhiyun
6525*4882a593Smuzhiyun seq_printf(m, "%d\n", memcg->oom_group);
6526*4882a593Smuzhiyun
6527*4882a593Smuzhiyun return 0;
6528*4882a593Smuzhiyun }
6529*4882a593Smuzhiyun
6530*4882a593Smuzhiyun static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6531*4882a593Smuzhiyun char *buf, size_t nbytes, loff_t off)
6532*4882a593Smuzhiyun {
6533*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6534*4882a593Smuzhiyun int ret, oom_group;
6535*4882a593Smuzhiyun
6536*4882a593Smuzhiyun buf = strstrip(buf);
6537*4882a593Smuzhiyun if (!buf)
6538*4882a593Smuzhiyun return -EINVAL;
6539*4882a593Smuzhiyun
6540*4882a593Smuzhiyun ret = kstrtoint(buf, 0, &oom_group);
6541*4882a593Smuzhiyun if (ret)
6542*4882a593Smuzhiyun return ret;
6543*4882a593Smuzhiyun
6544*4882a593Smuzhiyun if (oom_group != 0 && oom_group != 1)
6545*4882a593Smuzhiyun return -EINVAL;
6546*4882a593Smuzhiyun
6547*4882a593Smuzhiyun memcg->oom_group = oom_group;
6548*4882a593Smuzhiyun
6549*4882a593Smuzhiyun return nbytes;
6550*4882a593Smuzhiyun }
6551*4882a593Smuzhiyun
6552*4882a593Smuzhiyun static struct cftype memory_files[] = {
6553*4882a593Smuzhiyun {
6554*4882a593Smuzhiyun .name = "current",
6555*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT,
6556*4882a593Smuzhiyun .read_u64 = memory_current_read,
6557*4882a593Smuzhiyun },
6558*4882a593Smuzhiyun {
6559*4882a593Smuzhiyun .name = "min",
6560*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT,
6561*4882a593Smuzhiyun .seq_show = memory_min_show,
6562*4882a593Smuzhiyun .write = memory_min_write,
6563*4882a593Smuzhiyun },
6564*4882a593Smuzhiyun {
6565*4882a593Smuzhiyun .name = "low",
6566*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT,
6567*4882a593Smuzhiyun .seq_show = memory_low_show,
6568*4882a593Smuzhiyun .write = memory_low_write,
6569*4882a593Smuzhiyun },
6570*4882a593Smuzhiyun {
6571*4882a593Smuzhiyun .name = "high",
6572*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT,
6573*4882a593Smuzhiyun .seq_show = memory_high_show,
6574*4882a593Smuzhiyun .write = memory_high_write,
6575*4882a593Smuzhiyun },
6576*4882a593Smuzhiyun {
6577*4882a593Smuzhiyun .name = "max",
6578*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT,
6579*4882a593Smuzhiyun .seq_show = memory_max_show,
6580*4882a593Smuzhiyun .write = memory_max_write,
6581*4882a593Smuzhiyun },
6582*4882a593Smuzhiyun {
6583*4882a593Smuzhiyun .name = "events",
6584*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT,
6585*4882a593Smuzhiyun .file_offset = offsetof(struct mem_cgroup, events_file),
6586*4882a593Smuzhiyun .seq_show = memory_events_show,
6587*4882a593Smuzhiyun },
6588*4882a593Smuzhiyun {
6589*4882a593Smuzhiyun .name = "events.local",
6590*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT,
6591*4882a593Smuzhiyun .file_offset = offsetof(struct mem_cgroup, events_local_file),
6592*4882a593Smuzhiyun .seq_show = memory_events_local_show,
6593*4882a593Smuzhiyun },
6594*4882a593Smuzhiyun {
6595*4882a593Smuzhiyun .name = "stat",
6596*4882a593Smuzhiyun .seq_show = memory_stat_show,
6597*4882a593Smuzhiyun },
6598*4882a593Smuzhiyun #ifdef CONFIG_NUMA
6599*4882a593Smuzhiyun {
6600*4882a593Smuzhiyun .name = "numa_stat",
6601*4882a593Smuzhiyun .seq_show = memory_numa_stat_show,
6602*4882a593Smuzhiyun },
6603*4882a593Smuzhiyun #endif
6604*4882a593Smuzhiyun {
6605*4882a593Smuzhiyun .name = "oom.group",
6606*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6607*4882a593Smuzhiyun .seq_show = memory_oom_group_show,
6608*4882a593Smuzhiyun .write = memory_oom_group_write,
6609*4882a593Smuzhiyun },
6610*4882a593Smuzhiyun { } /* terminate */
6611*4882a593Smuzhiyun };
6612*4882a593Smuzhiyun
6613*4882a593Smuzhiyun struct cgroup_subsys memory_cgrp_subsys = {
6614*4882a593Smuzhiyun .css_alloc = mem_cgroup_css_alloc,
6615*4882a593Smuzhiyun .css_online = mem_cgroup_css_online,
6616*4882a593Smuzhiyun .css_offline = mem_cgroup_css_offline,
6617*4882a593Smuzhiyun .css_released = mem_cgroup_css_released,
6618*4882a593Smuzhiyun .css_free = mem_cgroup_css_free,
6619*4882a593Smuzhiyun .css_reset = mem_cgroup_css_reset,
6620*4882a593Smuzhiyun .can_attach = mem_cgroup_can_attach,
6621*4882a593Smuzhiyun .cancel_attach = mem_cgroup_cancel_attach,
6622*4882a593Smuzhiyun .post_attach = mem_cgroup_move_task,
6623*4882a593Smuzhiyun .bind = mem_cgroup_bind,
6624*4882a593Smuzhiyun .dfl_cftypes = memory_files,
6625*4882a593Smuzhiyun .legacy_cftypes = mem_cgroup_legacy_files,
6626*4882a593Smuzhiyun .early_init = 0,
6627*4882a593Smuzhiyun };
6628*4882a593Smuzhiyun
6629*4882a593Smuzhiyun /*
6630*4882a593Smuzhiyun * This function calculates an individual cgroup's effective
6631*4882a593Smuzhiyun * protection which is derived from its own memory.min/low, its
6632*4882a593Smuzhiyun * parent's and siblings' settings, as well as the actual memory
6633*4882a593Smuzhiyun * distribution in the tree.
6634*4882a593Smuzhiyun *
6635*4882a593Smuzhiyun * The following rules apply to the effective protection values:
6636*4882a593Smuzhiyun *
6637*4882a593Smuzhiyun * 1. At the first level of reclaim, effective protection is equal to
6638*4882a593Smuzhiyun * the declared protection in memory.min and memory.low.
6639*4882a593Smuzhiyun *
6640*4882a593Smuzhiyun * 2. To enable safe delegation of the protection configuration, at
6641*4882a593Smuzhiyun * subsequent levels the effective protection is capped to the
6642*4882a593Smuzhiyun * parent's effective protection.
6643*4882a593Smuzhiyun *
6644*4882a593Smuzhiyun * 3. To make complex and dynamic subtrees easier to configure, the
6645*4882a593Smuzhiyun * user is allowed to overcommit the declared protection at a given
6646*4882a593Smuzhiyun * level. If that is the case, the parent's effective protection is
6647*4882a593Smuzhiyun * distributed to the children in proportion to how much protection
6648*4882a593Smuzhiyun * they have declared and how much of it they are utilizing.
6649*4882a593Smuzhiyun *
6650*4882a593Smuzhiyun * This makes distribution proportional, but also work-conserving:
6651*4882a593Smuzhiyun * if one cgroup claims much more protection than it uses memory,
6652*4882a593Smuzhiyun * the unused remainder is available to its siblings.
6653*4882a593Smuzhiyun *
6654*4882a593Smuzhiyun * 4. Conversely, when the declared protection is undercommitted at a
6655*4882a593Smuzhiyun * given level, the distribution of the larger parental protection
6656*4882a593Smuzhiyun * budget is NOT proportional. A cgroup's protection from a sibling
6657*4882a593Smuzhiyun * is capped to its own memory.min/low setting.
6658*4882a593Smuzhiyun *
6659*4882a593Smuzhiyun * 5. However, to allow protecting recursive subtrees from each other
6660*4882a593Smuzhiyun * without having to declare each individual cgroup's fixed share
6661*4882a593Smuzhiyun * of the ancestor's claim to protection, any unutilized -
6662*4882a593Smuzhiyun * "floating" - protection from up the tree is distributed in
6663*4882a593Smuzhiyun * proportion to each cgroup's *usage*. This makes the protection
6664*4882a593Smuzhiyun * neutral wrt sibling cgroups and lets them compete freely over
6665*4882a593Smuzhiyun * the shared parental protection budget, but it protects the
6666*4882a593Smuzhiyun * subtree as a whole from neighboring subtrees.
6667*4882a593Smuzhiyun *
6668*4882a593Smuzhiyun * Note that 4. and 5. are not in conflict: 4. is about protecting
6669*4882a593Smuzhiyun * against immediate siblings whereas 5. is about protecting against
6670*4882a593Smuzhiyun * neighboring subtrees.
6671*4882a593Smuzhiyun */
6672*4882a593Smuzhiyun static unsigned long effective_protection(unsigned long usage,
6673*4882a593Smuzhiyun unsigned long parent_usage,
6674*4882a593Smuzhiyun unsigned long setting,
6675*4882a593Smuzhiyun unsigned long parent_effective,
6676*4882a593Smuzhiyun unsigned long siblings_protected)
6677*4882a593Smuzhiyun {
6678*4882a593Smuzhiyun unsigned long protected;
6679*4882a593Smuzhiyun unsigned long ep;
6680*4882a593Smuzhiyun
6681*4882a593Smuzhiyun protected = min(usage, setting);
6682*4882a593Smuzhiyun /*
6683*4882a593Smuzhiyun * If all cgroups at this level combined claim and use more
6684*4882a593Smuzhiyun * protection than what the parent affords them, distribute
6685*4882a593Smuzhiyun * shares in proportion to utilization.
6686*4882a593Smuzhiyun *
6687*4882a593Smuzhiyun * We are using actual utilization rather than the statically
6688*4882a593Smuzhiyun * claimed protection in order to be work-conserving: claimed
6689*4882a593Smuzhiyun * but unused protection is available to siblings that would
6690*4882a593Smuzhiyun * otherwise get a smaller chunk than what they claimed.
6691*4882a593Smuzhiyun */
6692*4882a593Smuzhiyun if (siblings_protected > parent_effective)
6693*4882a593Smuzhiyun return protected * parent_effective / siblings_protected;
6694*4882a593Smuzhiyun
6695*4882a593Smuzhiyun /*
6696*4882a593Smuzhiyun * Ok, utilized protection of all children is within what the
6697*4882a593Smuzhiyun * parent affords them, so we know whatever this child claims
6698*4882a593Smuzhiyun * and utilizes is effectively protected.
6699*4882a593Smuzhiyun *
6700*4882a593Smuzhiyun * If there is unprotected usage beyond this value, reclaim
6701*4882a593Smuzhiyun * will apply pressure in proportion to that amount.
6702*4882a593Smuzhiyun *
6703*4882a593Smuzhiyun * If there is unutilized protection, the cgroup will be fully
6704*4882a593Smuzhiyun * shielded from reclaim, but we do return a smaller value for
6705*4882a593Smuzhiyun * protection than what the group could enjoy in theory. This
6706*4882a593Smuzhiyun * is okay. With the overcommit distribution above, effective
6707*4882a593Smuzhiyun * protection is always dependent on how memory is actually
6708*4882a593Smuzhiyun * consumed among the siblings anyway.
6709*4882a593Smuzhiyun */
6710*4882a593Smuzhiyun ep = protected;
6711*4882a593Smuzhiyun
6712*4882a593Smuzhiyun /*
6713*4882a593Smuzhiyun * If the children aren't claiming (all of) the protection
6714*4882a593Smuzhiyun * afforded to them by the parent, distribute the remainder in
6715*4882a593Smuzhiyun * proportion to the (unprotected) memory of each cgroup. That
6716*4882a593Smuzhiyun * way, cgroups that aren't explicitly prioritized wrt each
6717*4882a593Smuzhiyun * other compete freely over the allowance, but they are
6718*4882a593Smuzhiyun * collectively protected from neighboring trees.
6719*4882a593Smuzhiyun *
6720*4882a593Smuzhiyun * We're using unprotected memory for the weight so that if
6721*4882a593Smuzhiyun * some cgroups DO claim explicit protection, we don't protect
6722*4882a593Smuzhiyun * the same bytes twice.
6723*4882a593Smuzhiyun *
6724*4882a593Smuzhiyun * Check both usage and parent_usage against the respective
6725*4882a593Smuzhiyun * protected values. One should imply the other, but they
6726*4882a593Smuzhiyun * aren't read atomically - make sure the division is sane.
6727*4882a593Smuzhiyun */
6728*4882a593Smuzhiyun if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6729*4882a593Smuzhiyun return ep;
6730*4882a593Smuzhiyun if (parent_effective > siblings_protected &&
6731*4882a593Smuzhiyun parent_usage > siblings_protected &&
6732*4882a593Smuzhiyun usage > protected) {
6733*4882a593Smuzhiyun unsigned long unclaimed;
6734*4882a593Smuzhiyun
6735*4882a593Smuzhiyun unclaimed = parent_effective - siblings_protected;
6736*4882a593Smuzhiyun unclaimed *= usage - protected;
6737*4882a593Smuzhiyun unclaimed /= parent_usage - siblings_protected;
6738*4882a593Smuzhiyun
6739*4882a593Smuzhiyun ep += unclaimed;
6740*4882a593Smuzhiyun }
6741*4882a593Smuzhiyun
6742*4882a593Smuzhiyun return ep;
6743*4882a593Smuzhiyun }
6744*4882a593Smuzhiyun
6745*4882a593Smuzhiyun /**
6746*4882a593Smuzhiyun * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
6747*4882a593Smuzhiyun * @root: the top ancestor of the sub-tree being checked
6748*4882a593Smuzhiyun * @memcg: the memory cgroup to check
6749*4882a593Smuzhiyun *
6750*4882a593Smuzhiyun * WARNING: This function is not stateless! It can only be used as part
6751*4882a593Smuzhiyun * of a top-down tree iteration, not for isolated queries.
6752*4882a593Smuzhiyun */
6753*4882a593Smuzhiyun void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6754*4882a593Smuzhiyun struct mem_cgroup *memcg)
6755*4882a593Smuzhiyun {
6756*4882a593Smuzhiyun unsigned long usage, parent_usage;
6757*4882a593Smuzhiyun struct mem_cgroup *parent;
6758*4882a593Smuzhiyun
6759*4882a593Smuzhiyun if (mem_cgroup_disabled())
6760*4882a593Smuzhiyun return;
6761*4882a593Smuzhiyun
6762*4882a593Smuzhiyun if (!root)
6763*4882a593Smuzhiyun root = root_mem_cgroup;
6764*4882a593Smuzhiyun
6765*4882a593Smuzhiyun /*
6766*4882a593Smuzhiyun * Effective values of the reclaim targets are ignored so they
6767*4882a593Smuzhiyun * can be stale. Have a look at mem_cgroup_protection for more
6768*4882a593Smuzhiyun * details.
6769*4882a593Smuzhiyun * TODO: calculation should be more robust so that we do not need
6770*4882a593Smuzhiyun * that special casing.
6771*4882a593Smuzhiyun */
6772*4882a593Smuzhiyun if (memcg == root)
6773*4882a593Smuzhiyun return;
6774*4882a593Smuzhiyun
6775*4882a593Smuzhiyun usage = page_counter_read(&memcg->memory);
6776*4882a593Smuzhiyun if (!usage)
6777*4882a593Smuzhiyun return;
6778*4882a593Smuzhiyun
6779*4882a593Smuzhiyun parent = parent_mem_cgroup(memcg);
6780*4882a593Smuzhiyun /* No parent means a non-hierarchical mode on v1 memcg */
6781*4882a593Smuzhiyun if (!parent)
6782*4882a593Smuzhiyun return;
6783*4882a593Smuzhiyun
6784*4882a593Smuzhiyun if (parent == root) {
6785*4882a593Smuzhiyun memcg->memory.emin = READ_ONCE(memcg->memory.min);
6786*4882a593Smuzhiyun memcg->memory.elow = READ_ONCE(memcg->memory.low);
6787*4882a593Smuzhiyun return;
6788*4882a593Smuzhiyun }
6789*4882a593Smuzhiyun
6790*4882a593Smuzhiyun parent_usage = page_counter_read(&parent->memory);
6791*4882a593Smuzhiyun
6792*4882a593Smuzhiyun WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6793*4882a593Smuzhiyun READ_ONCE(memcg->memory.min),
6794*4882a593Smuzhiyun READ_ONCE(parent->memory.emin),
6795*4882a593Smuzhiyun atomic_long_read(&parent->memory.children_min_usage)));
6796*4882a593Smuzhiyun
6797*4882a593Smuzhiyun WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6798*4882a593Smuzhiyun READ_ONCE(memcg->memory.low),
6799*4882a593Smuzhiyun READ_ONCE(parent->memory.elow),
6800*4882a593Smuzhiyun atomic_long_read(&parent->memory.children_low_usage)));
6801*4882a593Smuzhiyun }
6802*4882a593Smuzhiyun
6803*4882a593Smuzhiyun /**
6804*4882a593Smuzhiyun * __mem_cgroup_charge - charge a newly allocated page to a cgroup
6805*4882a593Smuzhiyun * @page: page to charge
6806*4882a593Smuzhiyun * @mm: mm context of the victim
6807*4882a593Smuzhiyun * @gfp_mask: reclaim mode
6808*4882a593Smuzhiyun *
6809*4882a593Smuzhiyun * Try to charge @page to the memcg that @mm belongs to, reclaiming
6810*4882a593Smuzhiyun * pages according to @gfp_mask if necessary.
6811*4882a593Smuzhiyun *
6812*4882a593Smuzhiyun * Returns 0 on success. Otherwise, an error code is returned.
6813*4882a593Smuzhiyun */
6814*4882a593Smuzhiyun int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
6815*4882a593Smuzhiyun gfp_t gfp_mask)
6816*4882a593Smuzhiyun {
6817*4882a593Smuzhiyun unsigned int nr_pages = thp_nr_pages(page);
6818*4882a593Smuzhiyun struct mem_cgroup *memcg = NULL;
6819*4882a593Smuzhiyun int ret = 0;
6820*4882a593Smuzhiyun
6821*4882a593Smuzhiyun if (PageSwapCache(page)) {
6822*4882a593Smuzhiyun swp_entry_t ent = { .val = page_private(page), };
6823*4882a593Smuzhiyun unsigned short id;
6824*4882a593Smuzhiyun
6825*4882a593Smuzhiyun /*
6826*4882a593Smuzhiyun * Every swap fault against a single page tries to charge the
6827*4882a593Smuzhiyun * page, bail as early as possible. shmem_unuse() encounters
6828*4882a593Smuzhiyun * already charged pages, too. page->mem_cgroup is protected
6829*4882a593Smuzhiyun * by the page lock, which serializes swap cache removal, which
6830*4882a593Smuzhiyun * in turn serializes uncharging.
6831*4882a593Smuzhiyun */
6832*4882a593Smuzhiyun VM_BUG_ON_PAGE(!PageLocked(page), page);
6833*4882a593Smuzhiyun if (compound_head(page)->mem_cgroup)
6834*4882a593Smuzhiyun goto out;
6835*4882a593Smuzhiyun
6836*4882a593Smuzhiyun id = lookup_swap_cgroup_id(ent);
6837*4882a593Smuzhiyun rcu_read_lock();
6838*4882a593Smuzhiyun memcg = mem_cgroup_from_id(id);
6839*4882a593Smuzhiyun if (memcg && !css_tryget_online(&memcg->css))
6840*4882a593Smuzhiyun memcg = NULL;
6841*4882a593Smuzhiyun rcu_read_unlock();
6842*4882a593Smuzhiyun }
6843*4882a593Smuzhiyun
6844*4882a593Smuzhiyun if (!memcg)
6845*4882a593Smuzhiyun memcg = get_mem_cgroup_from_mm(mm);
6846*4882a593Smuzhiyun
6847*4882a593Smuzhiyun ret = try_charge(memcg, gfp_mask, nr_pages);
6848*4882a593Smuzhiyun if (ret)
6849*4882a593Smuzhiyun goto out_put;
6850*4882a593Smuzhiyun
6851*4882a593Smuzhiyun css_get(&memcg->css);
6852*4882a593Smuzhiyun commit_charge(page, memcg);
6853*4882a593Smuzhiyun
6854*4882a593Smuzhiyun local_irq_disable();
6855*4882a593Smuzhiyun mem_cgroup_charge_statistics(memcg, page, nr_pages);
6856*4882a593Smuzhiyun memcg_check_events(memcg, page);
6857*4882a593Smuzhiyun local_irq_enable();
6858*4882a593Smuzhiyun
6859*4882a593Smuzhiyun /*
6860*4882a593Smuzhiyun * Cgroup1's unified memory+swap counter has been charged with the
6861*4882a593Smuzhiyun * new swapcache page, finish the transfer by uncharging the swap
6862*4882a593Smuzhiyun * slot. The swap slot would also get uncharged when it dies, but
6863*4882a593Smuzhiyun * it can stick around indefinitely and we'd count the page twice
6864*4882a593Smuzhiyun * the entire time.
6865*4882a593Smuzhiyun *
6866*4882a593Smuzhiyun * Cgroup2 has separate resource counters for memory and swap,
6867*4882a593Smuzhiyun * so this is a non-issue here. Memory and swap charge lifetimes
6868*4882a593Smuzhiyun * correspond 1:1 to page and swap slot lifetimes: we charge the
6869*4882a593Smuzhiyun * page to memory here, and uncharge swap when the slot is freed.
6870*4882a593Smuzhiyun */
6871*4882a593Smuzhiyun if (do_memsw_account() && PageSwapCache(page)) {
6872*4882a593Smuzhiyun swp_entry_t entry = { .val = page_private(page) };
6873*4882a593Smuzhiyun /*
6874*4882a593Smuzhiyun * The swap entry might not get freed for a long time,
6875*4882a593Smuzhiyun * so let's not wait for it. The page already received a
6876*4882a593Smuzhiyun * memory+swap charge, drop the swap entry duplicate.
6877*4882a593Smuzhiyun */
6878*4882a593Smuzhiyun mem_cgroup_uncharge_swap(entry, nr_pages);
6879*4882a593Smuzhiyun }
6880*4882a593Smuzhiyun
6881*4882a593Smuzhiyun out_put:
6882*4882a593Smuzhiyun css_put(&memcg->css);
6883*4882a593Smuzhiyun out:
6884*4882a593Smuzhiyun return ret;
6885*4882a593Smuzhiyun }
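/*
 * Illustrative sketch of a caller: charge a freshly allocated page against
 * the faulting mm and back out if the charge fails. The helper below is
 * hypothetical; only __mem_cgroup_charge() comes from this file, and real
 * callers may go through a wrapper that first checks mem_cgroup_disabled().
 *
 *	static struct page *example_alloc_charged(struct mm_struct *mm, gfp_t gfp)
 *	{
 *		struct page *page = alloc_page(gfp);
 *
 *		if (!page)
 *			return NULL;
 *		if (__mem_cgroup_charge(page, mm, gfp)) {
 *			put_page(page);
 *			return NULL;
 *		}
 *		return page;
 *	}
 */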
6886*4882a593Smuzhiyun
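/*
 * uncharge_gather batches uncharges of consecutive pages that belong to the
 * same memcg: uncharge_page() accumulates page, pgpgout and kmem counts here,
 * and uncharge_batch() flushes them with one round of page_counter and
 * vmstat/event updates.
 */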
6887*4882a593Smuzhiyun struct uncharge_gather {
6888*4882a593Smuzhiyun struct mem_cgroup *memcg;
6889*4882a593Smuzhiyun unsigned long nr_pages;
6890*4882a593Smuzhiyun unsigned long pgpgout;
6891*4882a593Smuzhiyun unsigned long nr_kmem;
6892*4882a593Smuzhiyun struct page *dummy_page;
6893*4882a593Smuzhiyun };
6894*4882a593Smuzhiyun
6895*4882a593Smuzhiyun static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6896*4882a593Smuzhiyun {
6897*4882a593Smuzhiyun memset(ug, 0, sizeof(*ug));
6898*4882a593Smuzhiyun }
6899*4882a593Smuzhiyun
6900*4882a593Smuzhiyun static void uncharge_batch(const struct uncharge_gather *ug)
6901*4882a593Smuzhiyun {
6902*4882a593Smuzhiyun unsigned long flags;
6903*4882a593Smuzhiyun
6904*4882a593Smuzhiyun if (!mem_cgroup_is_root(ug->memcg)) {
6905*4882a593Smuzhiyun page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
6906*4882a593Smuzhiyun if (do_memsw_account())
6907*4882a593Smuzhiyun page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
6908*4882a593Smuzhiyun if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6909*4882a593Smuzhiyun page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6910*4882a593Smuzhiyun memcg_oom_recover(ug->memcg);
6911*4882a593Smuzhiyun }
6912*4882a593Smuzhiyun
6913*4882a593Smuzhiyun local_irq_save(flags);
6914*4882a593Smuzhiyun __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6915*4882a593Smuzhiyun __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
6916*4882a593Smuzhiyun memcg_check_events(ug->memcg, ug->dummy_page);
6917*4882a593Smuzhiyun local_irq_restore(flags);
6918*4882a593Smuzhiyun
6919*4882a593Smuzhiyun /* drop reference from uncharge_page */
6920*4882a593Smuzhiyun css_put(&ug->memcg->css);
6921*4882a593Smuzhiyun }
6922*4882a593Smuzhiyun
6923*4882a593Smuzhiyun static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6924*4882a593Smuzhiyun {
6925*4882a593Smuzhiyun unsigned long nr_pages;
6926*4882a593Smuzhiyun
6927*4882a593Smuzhiyun VM_BUG_ON_PAGE(PageLRU(page), page);
6928*4882a593Smuzhiyun
6929*4882a593Smuzhiyun if (!page->mem_cgroup)
6930*4882a593Smuzhiyun return;
6931*4882a593Smuzhiyun
6932*4882a593Smuzhiyun /*
6933*4882a593Smuzhiyun * Nobody should be changing or seriously looking at
6934*4882a593Smuzhiyun * page->mem_cgroup at this point; we have fully
6935*4882a593Smuzhiyun * exclusive access to the page.
6936*4882a593Smuzhiyun */
6937*4882a593Smuzhiyun
6938*4882a593Smuzhiyun if (ug->memcg != page->mem_cgroup) {
6939*4882a593Smuzhiyun if (ug->memcg) {
6940*4882a593Smuzhiyun uncharge_batch(ug);
6941*4882a593Smuzhiyun uncharge_gather_clear(ug);
6942*4882a593Smuzhiyun }
6943*4882a593Smuzhiyun ug->memcg = page->mem_cgroup;
6944*4882a593Smuzhiyun
6945*4882a593Smuzhiyun /* pairs with css_put in uncharge_batch */
6946*4882a593Smuzhiyun css_get(&ug->memcg->css);
6947*4882a593Smuzhiyun }
6948*4882a593Smuzhiyun
6949*4882a593Smuzhiyun nr_pages = compound_nr(page);
6950*4882a593Smuzhiyun ug->nr_pages += nr_pages;
6951*4882a593Smuzhiyun
6952*4882a593Smuzhiyun if (!PageKmemcg(page)) {
6953*4882a593Smuzhiyun ug->pgpgout++;
6954*4882a593Smuzhiyun } else {
6955*4882a593Smuzhiyun ug->nr_kmem += nr_pages;
6956*4882a593Smuzhiyun __ClearPageKmemcg(page);
6957*4882a593Smuzhiyun }
6958*4882a593Smuzhiyun
6959*4882a593Smuzhiyun ug->dummy_page = page;
6960*4882a593Smuzhiyun page->mem_cgroup = NULL;
6961*4882a593Smuzhiyun css_put(&ug->memcg->css);
6962*4882a593Smuzhiyun }
6963*4882a593Smuzhiyun
6964*4882a593Smuzhiyun static void uncharge_list(struct list_head *page_list)
6965*4882a593Smuzhiyun {
6966*4882a593Smuzhiyun struct uncharge_gather ug;
6967*4882a593Smuzhiyun struct list_head *next;
6968*4882a593Smuzhiyun
6969*4882a593Smuzhiyun uncharge_gather_clear(&ug);
6970*4882a593Smuzhiyun
6971*4882a593Smuzhiyun /*
6972*4882a593Smuzhiyun * Note that the list can be a single page->lru; hence the
6973*4882a593Smuzhiyun * do-while loop instead of a simple list_for_each_entry().
6974*4882a593Smuzhiyun */
6975*4882a593Smuzhiyun next = page_list->next;
6976*4882a593Smuzhiyun do {
6977*4882a593Smuzhiyun struct page *page;
6978*4882a593Smuzhiyun
6979*4882a593Smuzhiyun page = list_entry(next, struct page, lru);
6980*4882a593Smuzhiyun next = page->lru.next;
6981*4882a593Smuzhiyun
6982*4882a593Smuzhiyun uncharge_page(page, &ug);
6983*4882a593Smuzhiyun } while (next != page_list);
6984*4882a593Smuzhiyun
6985*4882a593Smuzhiyun if (ug.memcg)
6986*4882a593Smuzhiyun uncharge_batch(&ug);
6987*4882a593Smuzhiyun }
6988*4882a593Smuzhiyun
6989*4882a593Smuzhiyun /**
6990*4882a593Smuzhiyun * __mem_cgroup_uncharge - uncharge a page
6991*4882a593Smuzhiyun * @page: page to uncharge
6992*4882a593Smuzhiyun *
6993*4882a593Smuzhiyun * Uncharge a page previously charged with __mem_cgroup_charge().
6994*4882a593Smuzhiyun */
6995*4882a593Smuzhiyun void __mem_cgroup_uncharge(struct page *page)
6996*4882a593Smuzhiyun {
6997*4882a593Smuzhiyun struct uncharge_gather ug;
6998*4882a593Smuzhiyun
6999*4882a593Smuzhiyun /* Don't touch page->lru of any random page, pre-check: */
7000*4882a593Smuzhiyun if (!page->mem_cgroup)
7001*4882a593Smuzhiyun return;
7002*4882a593Smuzhiyun
7003*4882a593Smuzhiyun uncharge_gather_clear(&ug);
7004*4882a593Smuzhiyun uncharge_page(page, &ug);
7005*4882a593Smuzhiyun uncharge_batch(&ug);
7006*4882a593Smuzhiyun }
7007*4882a593Smuzhiyun
7008*4882a593Smuzhiyun /**
7009*4882a593Smuzhiyun * __mem_cgroup_uncharge_list - uncharge a list of pages
7010*4882a593Smuzhiyun * @page_list: list of pages to uncharge
7011*4882a593Smuzhiyun *
7012*4882a593Smuzhiyun * Uncharge a list of pages previously charged with
7013*4882a593Smuzhiyun * __mem_cgroup_charge().
7014*4882a593Smuzhiyun */
7015*4882a593Smuzhiyun void __mem_cgroup_uncharge_list(struct list_head *page_list)
7016*4882a593Smuzhiyun {
7017*4882a593Smuzhiyun if (!list_empty(page_list))
7018*4882a593Smuzhiyun uncharge_list(page_list);
7019*4882a593Smuzhiyun }
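/*
 * Illustrative sketch: batch-uncharging pages collected on a private list.
 * The list construction shown is hypothetical; only the uncharge call comes
 * from this file.
 *
 *	LIST_HEAD(pages);
 *
 *	...link pages via list_add(&page->lru, &pages)...
 *	__mem_cgroup_uncharge_list(&pages);
 */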
7020*4882a593Smuzhiyun
7021*4882a593Smuzhiyun /**
7022*4882a593Smuzhiyun * mem_cgroup_migrate - charge a page's replacement
7023*4882a593Smuzhiyun * @oldpage: currently circulating page
7024*4882a593Smuzhiyun * @newpage: replacement page
7025*4882a593Smuzhiyun *
7026*4882a593Smuzhiyun * Charge @newpage as a replacement page for @oldpage. @oldpage will
7027*4882a593Smuzhiyun * be uncharged upon free.
7028*4882a593Smuzhiyun *
7029*4882a593Smuzhiyun * Both pages must be locked, @newpage->mapping must be set up.
7030*4882a593Smuzhiyun */
7031*4882a593Smuzhiyun void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
7032*4882a593Smuzhiyun {
7033*4882a593Smuzhiyun struct mem_cgroup *memcg;
7034*4882a593Smuzhiyun unsigned int nr_pages;
7035*4882a593Smuzhiyun unsigned long flags;
7036*4882a593Smuzhiyun
7037*4882a593Smuzhiyun VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
7038*4882a593Smuzhiyun VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
7039*4882a593Smuzhiyun VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
7040*4882a593Smuzhiyun VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
7041*4882a593Smuzhiyun newpage);
7042*4882a593Smuzhiyun
7043*4882a593Smuzhiyun if (mem_cgroup_disabled())
7044*4882a593Smuzhiyun return;
7045*4882a593Smuzhiyun
7046*4882a593Smuzhiyun /* Page cache replacement: new page already charged? */
7047*4882a593Smuzhiyun if (newpage->mem_cgroup)
7048*4882a593Smuzhiyun return;
7049*4882a593Smuzhiyun
7050*4882a593Smuzhiyun /* Swapcache readahead pages can get replaced before being charged */
7051*4882a593Smuzhiyun memcg = oldpage->mem_cgroup;
7052*4882a593Smuzhiyun if (!memcg)
7053*4882a593Smuzhiyun return;
7054*4882a593Smuzhiyun
7055*4882a593Smuzhiyun /* Force-charge the new page. The old one will be freed soon */
7056*4882a593Smuzhiyun nr_pages = thp_nr_pages(newpage);
7057*4882a593Smuzhiyun
7058*4882a593Smuzhiyun page_counter_charge(&memcg->memory, nr_pages);
7059*4882a593Smuzhiyun if (do_memsw_account())
7060*4882a593Smuzhiyun page_counter_charge(&memcg->memsw, nr_pages);
7061*4882a593Smuzhiyun
7062*4882a593Smuzhiyun css_get(&memcg->css);
7063*4882a593Smuzhiyun commit_charge(newpage, memcg);
7064*4882a593Smuzhiyun
7065*4882a593Smuzhiyun local_irq_save(flags);
7066*4882a593Smuzhiyun mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
7067*4882a593Smuzhiyun memcg_check_events(memcg, newpage);
7068*4882a593Smuzhiyun local_irq_restore(flags);
7069*4882a593Smuzhiyun }
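/*
 * Illustrative sketch: a migration path charges the replacement page while
 * both pages are locked and newpage->mapping is already set up. The
 * surrounding steps are assumptions; the real callers live in the migration
 * and page cache replacement code.
 *
 *	lock_page(oldpage);
 *	lock_page(newpage);
 *	...copy mapping and index to newpage...
 *	mem_cgroup_migrate(oldpage, newpage);
 */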
7070*4882a593Smuzhiyun
7071*4882a593Smuzhiyun DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
7072*4882a593Smuzhiyun EXPORT_SYMBOL(memcg_sockets_enabled_key);
7073*4882a593Smuzhiyun
7074*4882a593Smuzhiyun void mem_cgroup_sk_alloc(struct sock *sk)
7075*4882a593Smuzhiyun {
7076*4882a593Smuzhiyun struct mem_cgroup *memcg;
7077*4882a593Smuzhiyun
7078*4882a593Smuzhiyun if (!mem_cgroup_sockets_enabled)
7079*4882a593Smuzhiyun return;
7080*4882a593Smuzhiyun
7081*4882a593Smuzhiyun /* Do not associate the sock with an unrelated interrupted task's memcg. */
7082*4882a593Smuzhiyun if (in_interrupt())
7083*4882a593Smuzhiyun return;
7084*4882a593Smuzhiyun
7085*4882a593Smuzhiyun rcu_read_lock();
7086*4882a593Smuzhiyun memcg = mem_cgroup_from_task(current);
7087*4882a593Smuzhiyun if (memcg == root_mem_cgroup)
7088*4882a593Smuzhiyun goto out;
7089*4882a593Smuzhiyun if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
7090*4882a593Smuzhiyun goto out;
7091*4882a593Smuzhiyun if (css_tryget(&memcg->css))
7092*4882a593Smuzhiyun sk->sk_memcg = memcg;
7093*4882a593Smuzhiyun out:
7094*4882a593Smuzhiyun rcu_read_unlock();
7095*4882a593Smuzhiyun }
7096*4882a593Smuzhiyun
7097*4882a593Smuzhiyun void mem_cgroup_sk_free(struct sock *sk)
7098*4882a593Smuzhiyun {
7099*4882a593Smuzhiyun if (sk->sk_memcg)
7100*4882a593Smuzhiyun css_put(&sk->sk_memcg->css);
7101*4882a593Smuzhiyun }
7102*4882a593Smuzhiyun
7103*4882a593Smuzhiyun /**
7104*4882a593Smuzhiyun * mem_cgroup_charge_skmem - charge socket memory
7105*4882a593Smuzhiyun * @memcg: memcg to charge
7106*4882a593Smuzhiyun * @nr_pages: number of pages to charge
7107*4882a593Smuzhiyun *
7108*4882a593Smuzhiyun * Charges @nr_pages to @memcg. Returns %true if the charge fit within
7109*4882a593Smuzhiyun * @memcg's configured limit, %false if the charge had to be forced.
7110*4882a593Smuzhiyun */
7111*4882a593Smuzhiyun bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
7112*4882a593Smuzhiyun {
7113*4882a593Smuzhiyun gfp_t gfp_mask = GFP_KERNEL;
7114*4882a593Smuzhiyun
7115*4882a593Smuzhiyun if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7116*4882a593Smuzhiyun struct page_counter *fail;
7117*4882a593Smuzhiyun
7118*4882a593Smuzhiyun if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
7119*4882a593Smuzhiyun memcg->tcpmem_pressure = 0;
7120*4882a593Smuzhiyun return true;
7121*4882a593Smuzhiyun }
7122*4882a593Smuzhiyun page_counter_charge(&memcg->tcpmem, nr_pages);
7123*4882a593Smuzhiyun memcg->tcpmem_pressure = 1;
7124*4882a593Smuzhiyun return false;
7125*4882a593Smuzhiyun }
7126*4882a593Smuzhiyun
7127*4882a593Smuzhiyun /* Don't block in the packet receive path */
7128*4882a593Smuzhiyun if (in_softirq())
7129*4882a593Smuzhiyun gfp_mask = GFP_NOWAIT;
7130*4882a593Smuzhiyun
7131*4882a593Smuzhiyun mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
7132*4882a593Smuzhiyun
7133*4882a593Smuzhiyun if (try_charge(memcg, gfp_mask, nr_pages) == 0)
7134*4882a593Smuzhiyun return true;
7135*4882a593Smuzhiyun
7136*4882a593Smuzhiyun try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
7137*4882a593Smuzhiyun return false;
7138*4882a593Smuzhiyun }
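/*
 * Illustrative sketch of the socket memory contract, assuming sk->sk_memcg
 * was set up by mem_cgroup_sk_alloc(); the size-to-pages conversion and the
 * surrounding flow are assumptions for illustration.
 *
 *	unsigned int nr = DIV_ROUND_UP(size, PAGE_SIZE);
 *
 *	if (sk->sk_memcg && !mem_cgroup_charge_skmem(sk->sk_memcg, nr))
 *		...charge was forced, treat the socket as under memory pressure...
 *	...
 *	if (sk->sk_memcg)
 *		mem_cgroup_uncharge_skmem(sk->sk_memcg, nr);
 */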
7139*4882a593Smuzhiyun
7140*4882a593Smuzhiyun /**
7141*4882a593Smuzhiyun * mem_cgroup_uncharge_skmem - uncharge socket memory
7142*4882a593Smuzhiyun * @memcg: memcg to uncharge
7143*4882a593Smuzhiyun * @nr_pages: number of pages to uncharge
7144*4882a593Smuzhiyun */
7145*4882a593Smuzhiyun void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
7146*4882a593Smuzhiyun {
7147*4882a593Smuzhiyun if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7148*4882a593Smuzhiyun page_counter_uncharge(&memcg->tcpmem, nr_pages);
7149*4882a593Smuzhiyun return;
7150*4882a593Smuzhiyun }
7151*4882a593Smuzhiyun
7152*4882a593Smuzhiyun mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
7153*4882a593Smuzhiyun
7154*4882a593Smuzhiyun refill_stock(memcg, nr_pages);
7155*4882a593Smuzhiyun }
7156*4882a593Smuzhiyun
7157*4882a593Smuzhiyun static int __init cgroup_memory(char *s)
7158*4882a593Smuzhiyun {
7159*4882a593Smuzhiyun char *token;
7160*4882a593Smuzhiyun
7161*4882a593Smuzhiyun while ((token = strsep(&s, ",")) != NULL) {
7162*4882a593Smuzhiyun if (!*token)
7163*4882a593Smuzhiyun continue;
7164*4882a593Smuzhiyun if (!strcmp(token, "nosocket"))
7165*4882a593Smuzhiyun cgroup_memory_nosocket = true;
7166*4882a593Smuzhiyun if (!strcmp(token, "nokmem"))
7167*4882a593Smuzhiyun cgroup_memory_nokmem = true;
7168*4882a593Smuzhiyun }
7169*4882a593Smuzhiyun return 1;
7170*4882a593Smuzhiyun }
7171*4882a593Smuzhiyun __setup("cgroup.memory=", cgroup_memory);
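/*
 * Example: booting with "cgroup.memory=nosocket,nokmem" on the kernel
 * command line sets both flags parsed above, disabling socket memory
 * accounting and kernel memory accounting respectively.
 */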
7172*4882a593Smuzhiyun
7173*4882a593Smuzhiyun /*
7174*4882a593Smuzhiyun * subsys_initcall() for memory controller.
7175*4882a593Smuzhiyun *
7176*4882a593Smuzhiyun * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
7177*4882a593Smuzhiyun * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
7178*4882a593Smuzhiyun * context because of lock dependencies (cgroup_lock -> cpu hotplug), but
7179*4882a593Smuzhiyun * should be initialized from here.
7180*4882a593Smuzhiyun */
7181*4882a593Smuzhiyun static int __init mem_cgroup_init(void)
7182*4882a593Smuzhiyun {
7183*4882a593Smuzhiyun int cpu, node;
7184*4882a593Smuzhiyun
7185*4882a593Smuzhiyun cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
7186*4882a593Smuzhiyun memcg_hotplug_cpu_dead);
7187*4882a593Smuzhiyun
7188*4882a593Smuzhiyun for_each_possible_cpu(cpu)
7189*4882a593Smuzhiyun INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
7190*4882a593Smuzhiyun drain_local_stock);
7191*4882a593Smuzhiyun
7192*4882a593Smuzhiyun for_each_node(node) {
7193*4882a593Smuzhiyun struct mem_cgroup_tree_per_node *rtpn;
7194*4882a593Smuzhiyun
7195*4882a593Smuzhiyun rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
7196*4882a593Smuzhiyun node_online(node) ? node : NUMA_NO_NODE);
7197*4882a593Smuzhiyun
7198*4882a593Smuzhiyun rtpn->rb_root = RB_ROOT;
7199*4882a593Smuzhiyun rtpn->rb_rightmost = NULL;
7200*4882a593Smuzhiyun spin_lock_init(&rtpn->lock);
7201*4882a593Smuzhiyun soft_limit_tree.rb_tree_per_node[node] = rtpn;
7202*4882a593Smuzhiyun }
7203*4882a593Smuzhiyun
7204*4882a593Smuzhiyun return 0;
7205*4882a593Smuzhiyun }
7206*4882a593Smuzhiyun subsys_initcall(mem_cgroup_init);
7207*4882a593Smuzhiyun
7208*4882a593Smuzhiyun #ifdef CONFIG_MEMCG_SWAP
7209*4882a593Smuzhiyun static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
7210*4882a593Smuzhiyun {
7211*4882a593Smuzhiyun while (!refcount_inc_not_zero(&memcg->id.ref)) {
7212*4882a593Smuzhiyun /*
7213*4882a593Smuzhiyun * The root cgroup cannot be destroyed, so its refcount must
7214*4882a593Smuzhiyun * always be >= 1.
7215*4882a593Smuzhiyun */
7216*4882a593Smuzhiyun if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
7217*4882a593Smuzhiyun VM_BUG_ON(1);
7218*4882a593Smuzhiyun break;
7219*4882a593Smuzhiyun }
7220*4882a593Smuzhiyun memcg = parent_mem_cgroup(memcg);
7221*4882a593Smuzhiyun if (!memcg)
7222*4882a593Smuzhiyun memcg = root_mem_cgroup;
7223*4882a593Smuzhiyun }
7224*4882a593Smuzhiyun return memcg;
7225*4882a593Smuzhiyun }
7226*4882a593Smuzhiyun
7227*4882a593Smuzhiyun /**
7228*4882a593Smuzhiyun * mem_cgroup_swapout - transfer a memsw charge to swap
7229*4882a593Smuzhiyun * @page: page whose memsw charge to transfer
7230*4882a593Smuzhiyun * @entry: swap entry to move the charge to
7231*4882a593Smuzhiyun *
7232*4882a593Smuzhiyun * Transfer the memsw charge of @page to @entry.
7233*4882a593Smuzhiyun */
7234*4882a593Smuzhiyun void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
7235*4882a593Smuzhiyun {
7236*4882a593Smuzhiyun struct mem_cgroup *memcg, *swap_memcg;
7237*4882a593Smuzhiyun unsigned int nr_entries;
7238*4882a593Smuzhiyun unsigned short oldid;
7239*4882a593Smuzhiyun
7240*4882a593Smuzhiyun VM_BUG_ON_PAGE(PageLRU(page), page);
7241*4882a593Smuzhiyun VM_BUG_ON_PAGE(page_count(page), page);
7242*4882a593Smuzhiyun
7243*4882a593Smuzhiyun if (mem_cgroup_disabled())
7244*4882a593Smuzhiyun return;
7245*4882a593Smuzhiyun
7246*4882a593Smuzhiyun if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7247*4882a593Smuzhiyun return;
7248*4882a593Smuzhiyun
7249*4882a593Smuzhiyun memcg = page->mem_cgroup;
7250*4882a593Smuzhiyun
7251*4882a593Smuzhiyun /* Readahead page, never charged */
7252*4882a593Smuzhiyun if (!memcg)
7253*4882a593Smuzhiyun return;
7254*4882a593Smuzhiyun
7255*4882a593Smuzhiyun /*
7256*4882a593Smuzhiyun * In case the memcg owning these pages has been offlined and doesn't
7257*4882a593Smuzhiyun * have an ID allocated to it anymore, charge the closest online
7258*4882a593Smuzhiyun * ancestor for the swap instead and transfer the memory+swap charge.
7259*4882a593Smuzhiyun */
7260*4882a593Smuzhiyun swap_memcg = mem_cgroup_id_get_online(memcg);
7261*4882a593Smuzhiyun nr_entries = thp_nr_pages(page);
7262*4882a593Smuzhiyun /* Get references for the tail pages, too */
7263*4882a593Smuzhiyun if (nr_entries > 1)
7264*4882a593Smuzhiyun mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
7265*4882a593Smuzhiyun oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
7266*4882a593Smuzhiyun nr_entries);
7267*4882a593Smuzhiyun VM_BUG_ON_PAGE(oldid, page);
7268*4882a593Smuzhiyun mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
7269*4882a593Smuzhiyun
7270*4882a593Smuzhiyun page->mem_cgroup = NULL;
7271*4882a593Smuzhiyun
7272*4882a593Smuzhiyun if (!mem_cgroup_is_root(memcg))
7273*4882a593Smuzhiyun page_counter_uncharge(&memcg->memory, nr_entries);
7274*4882a593Smuzhiyun
7275*4882a593Smuzhiyun if (!cgroup_memory_noswap && memcg != swap_memcg) {
7276*4882a593Smuzhiyun if (!mem_cgroup_is_root(swap_memcg))
7277*4882a593Smuzhiyun page_counter_charge(&swap_memcg->memsw, nr_entries);
7278*4882a593Smuzhiyun page_counter_uncharge(&memcg->memsw, nr_entries);
7279*4882a593Smuzhiyun }
7280*4882a593Smuzhiyun
7281*4882a593Smuzhiyun /*
7282*4882a593Smuzhiyun * Interrupts should be disabled here because the caller holds the
7283*4882a593Smuzhiyun * i_pages lock, which is taken with interrupts off. Keeping
7284*4882a593Smuzhiyun * interrupts disabled also matters because it is the only
7285*4882a593Smuzhiyun * synchronisation we have for updating the per-CPU variables.
7286*4882a593Smuzhiyun */
7287*4882a593Smuzhiyun VM_BUG_ON(!irqs_disabled());
7288*4882a593Smuzhiyun mem_cgroup_charge_statistics(memcg, page, -nr_entries);
7289*4882a593Smuzhiyun memcg_check_events(memcg, page);
7290*4882a593Smuzhiyun
7291*4882a593Smuzhiyun css_put(&memcg->css);
7292*4882a593Smuzhiyun }
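/*
 * Illustrative sketch: reclaim transfers the charge right before an
 * anonymous page leaves the swap cache, with the i_pages lock held so
 * interrupts are already disabled. The exact call site is an assumption;
 * see the reclaim code in mm/vmscan.c.
 *
 *	swp_entry_t swap = { .val = page_private(page) };
 *
 *	xa_lock_irq(&swap_address_space(swap)->i_pages);
 *	mem_cgroup_swapout(page, swap);
 *	...remove the page from the swap cache, then unlock...
 */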
7293*4882a593Smuzhiyun
7294*4882a593Smuzhiyun /**
7295*4882a593Smuzhiyun * __mem_cgroup_try_charge_swap - try charging swap space for a page
7296*4882a593Smuzhiyun * @page: page being added to swap
7297*4882a593Smuzhiyun * @entry: swap entry to charge
7298*4882a593Smuzhiyun *
7299*4882a593Smuzhiyun * Try to charge @page's memcg for the swap space at @entry.
7300*4882a593Smuzhiyun *
7301*4882a593Smuzhiyun * Returns 0 on success, -ENOMEM on failure.
7302*4882a593Smuzhiyun */
7303*4882a593Smuzhiyun int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
7304*4882a593Smuzhiyun {
7305*4882a593Smuzhiyun unsigned int nr_pages = thp_nr_pages(page);
7306*4882a593Smuzhiyun struct page_counter *counter;
7307*4882a593Smuzhiyun struct mem_cgroup *memcg;
7308*4882a593Smuzhiyun unsigned short oldid;
7309*4882a593Smuzhiyun
7310*4882a593Smuzhiyun if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7311*4882a593Smuzhiyun return 0;
7312*4882a593Smuzhiyun
7313*4882a593Smuzhiyun memcg = page->mem_cgroup;
7314*4882a593Smuzhiyun
7315*4882a593Smuzhiyun /* Readahead page, never charged */
7316*4882a593Smuzhiyun if (!memcg)
7317*4882a593Smuzhiyun return 0;
7318*4882a593Smuzhiyun
7319*4882a593Smuzhiyun if (!entry.val) {
7320*4882a593Smuzhiyun memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7321*4882a593Smuzhiyun return 0;
7322*4882a593Smuzhiyun }
7323*4882a593Smuzhiyun
7324*4882a593Smuzhiyun memcg = mem_cgroup_id_get_online(memcg);
7325*4882a593Smuzhiyun
7326*4882a593Smuzhiyun if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
7327*4882a593Smuzhiyun !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
7328*4882a593Smuzhiyun memcg_memory_event(memcg, MEMCG_SWAP_MAX);
7329*4882a593Smuzhiyun memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7330*4882a593Smuzhiyun mem_cgroup_id_put(memcg);
7331*4882a593Smuzhiyun return -ENOMEM;
7332*4882a593Smuzhiyun }
7333*4882a593Smuzhiyun
7334*4882a593Smuzhiyun /* Get references for the tail pages, too */
7335*4882a593Smuzhiyun if (nr_pages > 1)
7336*4882a593Smuzhiyun mem_cgroup_id_get_many(memcg, nr_pages - 1);
7337*4882a593Smuzhiyun oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
7338*4882a593Smuzhiyun VM_BUG_ON_PAGE(oldid, page);
7339*4882a593Smuzhiyun mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
7340*4882a593Smuzhiyun
7341*4882a593Smuzhiyun return 0;
7342*4882a593Smuzhiyun }
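/*
 * Illustrative sketch of the contract: charge swap space before the slot is
 * used and release the slot again if the charge fails. The slot allocation
 * and error handling shown are assumptions; the real call site sits in the
 * swap allocation path.
 *
 *	swp_entry_t entry = ...allocate a swap slot for page...;
 *
 *	if (__mem_cgroup_try_charge_swap(page, entry)) {
 *		...release the slot and fail the swapout...
 *	}
 */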
7343*4882a593Smuzhiyun
7344*4882a593Smuzhiyun /**
7345*4882a593Smuzhiyun * __mem_cgroup_uncharge_swap - uncharge swap space
7346*4882a593Smuzhiyun * @entry: swap entry to uncharge
7347*4882a593Smuzhiyun * @nr_pages: the amount of swap space to uncharge
7348*4882a593Smuzhiyun */
7349*4882a593Smuzhiyun void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7350*4882a593Smuzhiyun {
7351*4882a593Smuzhiyun struct mem_cgroup *memcg;
7352*4882a593Smuzhiyun unsigned short id;
7353*4882a593Smuzhiyun
7354*4882a593Smuzhiyun id = swap_cgroup_record(entry, 0, nr_pages);
7355*4882a593Smuzhiyun rcu_read_lock();
7356*4882a593Smuzhiyun memcg = mem_cgroup_from_id(id);
7357*4882a593Smuzhiyun if (memcg) {
7358*4882a593Smuzhiyun if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
7359*4882a593Smuzhiyun if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7360*4882a593Smuzhiyun page_counter_uncharge(&memcg->swap, nr_pages);
7361*4882a593Smuzhiyun else
7362*4882a593Smuzhiyun page_counter_uncharge(&memcg->memsw, nr_pages);
7363*4882a593Smuzhiyun }
7364*4882a593Smuzhiyun mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7365*4882a593Smuzhiyun mem_cgroup_id_put_many(memcg, nr_pages);
7366*4882a593Smuzhiyun }
7367*4882a593Smuzhiyun rcu_read_unlock();
7368*4882a593Smuzhiyun }
7369*4882a593Smuzhiyun
7370*4882a593Smuzhiyun long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7371*4882a593Smuzhiyun {
7372*4882a593Smuzhiyun long nr_swap_pages = get_nr_swap_pages();
7373*4882a593Smuzhiyun
7374*4882a593Smuzhiyun if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7375*4882a593Smuzhiyun return nr_swap_pages;
7376*4882a593Smuzhiyun for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7377*4882a593Smuzhiyun nr_swap_pages = min_t(long, nr_swap_pages,
7378*4882a593Smuzhiyun READ_ONCE(memcg->swap.max) -
7379*4882a593Smuzhiyun page_counter_read(&memcg->swap));
7380*4882a593Smuzhiyun return nr_swap_pages;
7381*4882a593Smuzhiyun }
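/*
 * Worked example: with 4G of free swap globally, a memcg whose swap.max is
 * 1G with 256M already swapped out, under a parent with only 512M of swap
 * headroom left, gets min(4G, 768M, 512M) = 512M from this function.
 */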
7382*4882a593Smuzhiyun
7383*4882a593Smuzhiyun bool mem_cgroup_swap_full(struct page *page)
7384*4882a593Smuzhiyun {
7385*4882a593Smuzhiyun struct mem_cgroup *memcg;
7386*4882a593Smuzhiyun
7387*4882a593Smuzhiyun VM_BUG_ON_PAGE(!PageLocked(page), page);
7388*4882a593Smuzhiyun
7389*4882a593Smuzhiyun if (vm_swap_full())
7390*4882a593Smuzhiyun return true;
7391*4882a593Smuzhiyun if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7392*4882a593Smuzhiyun return false;
7393*4882a593Smuzhiyun
7394*4882a593Smuzhiyun memcg = page->mem_cgroup;
7395*4882a593Smuzhiyun if (!memcg)
7396*4882a593Smuzhiyun return false;
7397*4882a593Smuzhiyun
7398*4882a593Smuzhiyun for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7399*4882a593Smuzhiyun unsigned long usage = page_counter_read(&memcg->swap);
7400*4882a593Smuzhiyun
7401*4882a593Smuzhiyun if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7402*4882a593Smuzhiyun usage * 2 >= READ_ONCE(memcg->swap.max))
7403*4882a593Smuzhiyun return true;
7404*4882a593Smuzhiyun }
7405*4882a593Smuzhiyun
7406*4882a593Smuzhiyun return false;
7407*4882a593Smuzhiyun }
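/*
 * Worked example: usage * 2 >= limit means swap counts as "full" once a
 * cgroup (or any ancestor) has consumed at least half of its swap.high or
 * swap.max, e.g. 600M used against a 1G swap.max makes this return true.
 */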
7408*4882a593Smuzhiyun
7409*4882a593Smuzhiyun static int __init setup_swap_account(char *s)
7410*4882a593Smuzhiyun {
7411*4882a593Smuzhiyun if (!strcmp(s, "1"))
7412*4882a593Smuzhiyun cgroup_memory_noswap = 0;
7413*4882a593Smuzhiyun else if (!strcmp(s, "0"))
7414*4882a593Smuzhiyun cgroup_memory_noswap = 1;
7415*4882a593Smuzhiyun return 1;
7416*4882a593Smuzhiyun }
7417*4882a593Smuzhiyun __setup("swapaccount=", setup_swap_account);
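/*
 * Example: booting with "swapaccount=0" sets cgroup_memory_noswap and
 * disables swap accounting; "swapaccount=1" keeps it enabled, matching the
 * parsing above.
 */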
7418*4882a593Smuzhiyun
7419*4882a593Smuzhiyun static u64 swap_current_read(struct cgroup_subsys_state *css,
7420*4882a593Smuzhiyun struct cftype *cft)
7421*4882a593Smuzhiyun {
7422*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7423*4882a593Smuzhiyun
7424*4882a593Smuzhiyun return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7425*4882a593Smuzhiyun }
7426*4882a593Smuzhiyun
7427*4882a593Smuzhiyun static int swap_high_show(struct seq_file *m, void *v)
7428*4882a593Smuzhiyun {
7429*4882a593Smuzhiyun return seq_puts_memcg_tunable(m,
7430*4882a593Smuzhiyun READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7431*4882a593Smuzhiyun }
7432*4882a593Smuzhiyun
7433*4882a593Smuzhiyun static ssize_t swap_high_write(struct kernfs_open_file *of,
7434*4882a593Smuzhiyun char *buf, size_t nbytes, loff_t off)
7435*4882a593Smuzhiyun {
7436*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7437*4882a593Smuzhiyun unsigned long high;
7438*4882a593Smuzhiyun int err;
7439*4882a593Smuzhiyun
7440*4882a593Smuzhiyun buf = strstrip(buf);
7441*4882a593Smuzhiyun err = page_counter_memparse(buf, "max", &high);
7442*4882a593Smuzhiyun if (err)
7443*4882a593Smuzhiyun return err;
7444*4882a593Smuzhiyun
7445*4882a593Smuzhiyun page_counter_set_high(&memcg->swap, high);
7446*4882a593Smuzhiyun
7447*4882a593Smuzhiyun return nbytes;
7448*4882a593Smuzhiyun }
7449*4882a593Smuzhiyun
7450*4882a593Smuzhiyun static int swap_max_show(struct seq_file *m, void *v)
7451*4882a593Smuzhiyun {
7452*4882a593Smuzhiyun return seq_puts_memcg_tunable(m,
7453*4882a593Smuzhiyun READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7454*4882a593Smuzhiyun }
7455*4882a593Smuzhiyun
7456*4882a593Smuzhiyun static ssize_t swap_max_write(struct kernfs_open_file *of,
7457*4882a593Smuzhiyun char *buf, size_t nbytes, loff_t off)
7458*4882a593Smuzhiyun {
7459*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7460*4882a593Smuzhiyun unsigned long max;
7461*4882a593Smuzhiyun int err;
7462*4882a593Smuzhiyun
7463*4882a593Smuzhiyun buf = strstrip(buf);
7464*4882a593Smuzhiyun err = page_counter_memparse(buf, "max", &max);
7465*4882a593Smuzhiyun if (err)
7466*4882a593Smuzhiyun return err;
7467*4882a593Smuzhiyun
7468*4882a593Smuzhiyun xchg(&memcg->swap.max, max);
7469*4882a593Smuzhiyun
7470*4882a593Smuzhiyun return nbytes;
7471*4882a593Smuzhiyun }
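/*
 * Example: on cgroup2 these handlers back the per-cgroup memory.swap.high
 * and memory.swap.max files, so from userspace (assuming the conventional
 * /sys/fs/cgroup mount point):
 *
 *	echo 512M > /sys/fs/cgroup/<group>/memory.swap.high
 *	echo max  > /sys/fs/cgroup/<group>/memory.swap.max
 *
 * "max" is accepted by page_counter_memparse() as "no limit".
 */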
7472*4882a593Smuzhiyun
7473*4882a593Smuzhiyun static int swap_events_show(struct seq_file *m, void *v)
7474*4882a593Smuzhiyun {
7475*4882a593Smuzhiyun struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7476*4882a593Smuzhiyun
7477*4882a593Smuzhiyun seq_printf(m, "high %lu\n",
7478*4882a593Smuzhiyun atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
7479*4882a593Smuzhiyun seq_printf(m, "max %lu\n",
7480*4882a593Smuzhiyun atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7481*4882a593Smuzhiyun seq_printf(m, "fail %lu\n",
7482*4882a593Smuzhiyun atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7483*4882a593Smuzhiyun
7484*4882a593Smuzhiyun return 0;
7485*4882a593Smuzhiyun }
7486*4882a593Smuzhiyun
7487*4882a593Smuzhiyun static struct cftype swap_files[] = {
7488*4882a593Smuzhiyun {
7489*4882a593Smuzhiyun .name = "swap.current",
7490*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT,
7491*4882a593Smuzhiyun .read_u64 = swap_current_read,
7492*4882a593Smuzhiyun },
7493*4882a593Smuzhiyun {
7494*4882a593Smuzhiyun .name = "swap.high",
7495*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT,
7496*4882a593Smuzhiyun .seq_show = swap_high_show,
7497*4882a593Smuzhiyun .write = swap_high_write,
7498*4882a593Smuzhiyun },
7499*4882a593Smuzhiyun {
7500*4882a593Smuzhiyun .name = "swap.max",
7501*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT,
7502*4882a593Smuzhiyun .seq_show = swap_max_show,
7503*4882a593Smuzhiyun .write = swap_max_write,
7504*4882a593Smuzhiyun },
7505*4882a593Smuzhiyun {
7506*4882a593Smuzhiyun .name = "swap.events",
7507*4882a593Smuzhiyun .flags = CFTYPE_NOT_ON_ROOT,
7508*4882a593Smuzhiyun .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7509*4882a593Smuzhiyun .seq_show = swap_events_show,
7510*4882a593Smuzhiyun },
7511*4882a593Smuzhiyun { } /* terminate */
7512*4882a593Smuzhiyun };
7513*4882a593Smuzhiyun
7514*4882a593Smuzhiyun static struct cftype memsw_files[] = {
7515*4882a593Smuzhiyun {
7516*4882a593Smuzhiyun .name = "memsw.usage_in_bytes",
7517*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7518*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
7519*4882a593Smuzhiyun },
7520*4882a593Smuzhiyun {
7521*4882a593Smuzhiyun .name = "memsw.max_usage_in_bytes",
7522*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7523*4882a593Smuzhiyun .write = mem_cgroup_reset,
7524*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
7525*4882a593Smuzhiyun },
7526*4882a593Smuzhiyun {
7527*4882a593Smuzhiyun .name = "memsw.limit_in_bytes",
7528*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7529*4882a593Smuzhiyun .write = mem_cgroup_write,
7530*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
7531*4882a593Smuzhiyun },
7532*4882a593Smuzhiyun {
7533*4882a593Smuzhiyun .name = "memsw.failcnt",
7534*4882a593Smuzhiyun .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7535*4882a593Smuzhiyun .write = mem_cgroup_reset,
7536*4882a593Smuzhiyun .read_u64 = mem_cgroup_read_u64,
7537*4882a593Smuzhiyun },
7538*4882a593Smuzhiyun { }, /* terminate */
7539*4882a593Smuzhiyun };
7540*4882a593Smuzhiyun
7541*4882a593Smuzhiyun /*
7542*4882a593Smuzhiyun * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
7543*4882a593Smuzhiyun * instead of a core_initcall(), cgroup_memory_noswap could still be false
7544*4882a593Smuzhiyun * even when memcg is disabled via the "cgroup_disable=memory" boot
7545*4882a593Smuzhiyun * parameter. This may result in a premature oops inside
7546*4882a593Smuzhiyun * mem_cgroup_get_nr_swap_pages() in corner cases.
7547*4882a593Smuzhiyun */
7548*4882a593Smuzhiyun static int __init mem_cgroup_swap_init(void)
7549*4882a593Smuzhiyun {
7550*4882a593Smuzhiyun /* No memory control -> no swap control */
7551*4882a593Smuzhiyun if (mem_cgroup_disabled())
7552*4882a593Smuzhiyun cgroup_memory_noswap = true;
7553*4882a593Smuzhiyun
7554*4882a593Smuzhiyun if (cgroup_memory_noswap)
7555*4882a593Smuzhiyun return 0;
7556*4882a593Smuzhiyun
7557*4882a593Smuzhiyun WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7558*4882a593Smuzhiyun WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7559*4882a593Smuzhiyun
7560*4882a593Smuzhiyun return 0;
7561*4882a593Smuzhiyun }
7562*4882a593Smuzhiyun core_initcall(mem_cgroup_swap_init);
7563*4882a593Smuzhiyun
7564*4882a593Smuzhiyun #endif /* CONFIG_MEMCG_SWAP */
7565