.. SPDX-License-Identifier: GPL-2.0

=================================================
Using RCU hlist_nulls to protect list and objects
=================================================

This section describes how to use hlist_nulls to
protect read-mostly linked lists and
objects using SLAB_TYPESAFE_BY_RCU allocations.

Please read the basics in Documentation/RCU/listRCU.rst.

Using 'nulls'
=============

Using special markers (called 'nulls') is a convenient way
to solve the following problem.

A typical RCU linked list managing objects which are
allocated with a SLAB_TYPESAFE_BY_RCU kmem_cache can
use the following algorithms.

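For concreteness, the examples below assume an object laid out roughly as
sketched here; this struct is only illustrative, and the hlist_nulls variants
in the second half of this document would embed a struct hlist_nulls_node
instead of a struct hlist_node.

::

  struct object {
    struct hlist_node obj_node; /* struct hlist_nulls_node for the nulls variants */
    atomic_t          refcnt;
    unsigned int      key;
  };
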
1) Lookup algo
--------------

::

  rcu_read_lock();
  begin:
  obj = lockless_lookup(key);
  if (obj) {
    if (!try_get_ref(obj)) // might fail for free objects
      goto begin;
    /*
     * Because a writer could delete the object, and another writer
     * could reuse it before the RCU grace period expires, we must
     * recheck the key after getting the reference on the object.
     */
    if (obj->key != key) { // not the object we expected
      put_ref(obj);
      goto begin;
    }
  }
  rcu_read_unlock();

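The try_get_ref() and put_ref() helpers are left abstract in this document.
One possible sketch, assuming the refcnt field of the struct object sketched
above, builds try_get_ref() on atomic_inc_not_zero(), which fails once the
refcount has dropped to zero because a SLAB_TYPESAFE_BY_RCU object might
already have been freed and reused.

::

  static inline bool try_get_ref(struct object *obj)
  {
    /* Fails (returns false) once refcnt has reached zero. */
    return atomic_inc_not_zero(&obj->refcnt);
  }

put_ref() would be the matching atomic_dec_and_test() based release, falling
through to the remove algorithm below once the last reference is dropped.
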
Beware that lockless_lookup(key) cannot use the traditional
hlist_for_each_entry_rcu(), but needs a version with an additional
memory barrier (smp_rmb()).

::

  lockless_lookup(key)
  {
    struct hlist_node *pos, *next;

    for (pos = rcu_dereference((head)->first);
         pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
         ({ obj = hlist_entry(pos, typeof(*obj), obj_node); 1; });
         pos = rcu_dereference(next))
      if (obj->key == key)
        return obj;
    return NULL;
  }

And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb()::

  struct hlist_node *pos;

  for (pos = rcu_dereference((head)->first);
       pos && ({ prefetch(pos->next); 1; }) &&
       ({ obj = hlist_entry(pos, typeof(*obj), obj_node); 1; });
       pos = rcu_dereference(pos->next))
    if (obj->key == key)
      return obj;
  return NULL;

Quoting Corey Minyard::

  "If the object is moved from one list to another list in-between the
  time the hash is calculated and the next field is accessed, and the
  object has moved to the end of a new list, the traversal will not
  complete properly on the list it should have, since the object will
  be on the end of the new list and there's not a way to tell it's on a
  new list and restart the list traversal. I think that this can be
  solved by pre-fetching the "next" field (with proper barriers) before
  checking the key."

2) Insert algo
--------------

We need to make sure a reader cannot read the new 'obj->obj_node.next' value
together with the previous value of 'obj->key'. Otherwise, an item could be
deleted from a chain and inserted into another chain. If the new chain was
empty before the move, the 'next' pointer is NULL, and a lockless reader
cannot detect that it missed the items that followed it in the original chain.

::

  /*
   * Please note that new inserts are done at the head of list,
   * not in the middle or end.
   */
  obj = kmem_cache_alloc(...);
  lock_chain(); // typically a spin_lock()
  obj->key = key;
  /*
   * We need to make sure obj->key is updated before obj->obj_node.next
   * or obj->refcnt.
   */
  smp_wmb();
  atomic_set(&obj->refcnt, 1);
  hlist_add_head_rcu(&obj->obj_node, list);
  unlock_chain(); // typically a spin_unlock()

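For this scheme to be valid, the cache behind kmem_cache_alloc() must have
been created with the SLAB_TYPESAFE_BY_RCU flag. A minimal sketch follows;
the cache and variable names are only illustrative.

::

  static struct kmem_cache *obj_cachep;

  obj_cachep = kmem_cache_create("my_obj_cache", sizeof(struct object),
                                 0, SLAB_TYPESAFE_BY_RCU, NULL);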

3) Remove algo
--------------

Nothing special here, we can use a standard RCU hlist deletion.
But thanks to SLAB_TYPESAFE_BY_RCU, beware that a deleted object can be
reused very quickly (before the end of the RCU grace period).

::

  if (put_last_reference_on(obj)) {
    lock_chain(); // typically a spin_lock()
    hlist_del_init_rcu(&obj->obj_node);
    unlock_chain(); // typically a spin_unlock()
    kmem_cache_free(cachep, obj);
  }


--------------------------------------------------------------------------

Avoiding extra smp_rmb()
========================

With hlist_nulls we can avoid the extra smp_rmb() in lockless_lookup().

For example, if we choose to store the slot number as the 'nulls'
end-of-list marker for each slot of the hash table, we can detect
a race (some writer did a delete and/or a move of an object
to another chain) by checking the final 'nulls' value if
the lookup met the end of chain. If the final 'nulls' value
is not the slot number, then we must restart the lookup at
the beginning. If the object was moved to the same chain,
then the reader doesn't care: it might occasionally
scan the list again without harm.

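For instance, each chain of such a hash table can be initialized so that its
'nulls' end marker encodes the slot number. A sketch, where TABLE_SIZE and
table are illustrative names::

  struct hlist_nulls_head table[TABLE_SIZE];
  unsigned int slot;

  for (slot = 0; slot < TABLE_SIZE; slot++)
    INIT_HLIST_NULLS_HEAD(&table[slot], slot);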

1) lookup algo
--------------

::

  head = &table[slot];
  rcu_read_lock();
  begin:
  hlist_nulls_for_each_entry_rcu(obj, node, head, obj_node) {
    if (obj->key == key) {
      if (!try_get_ref(obj)) // might fail for free objects
        goto begin;
      if (obj->key != key) { // not the object we expected
        put_ref(obj);
        goto begin;
      }
      goto out;
    }
  }
  /*
   * If the nulls value we got at the end of this lookup is
   * not the expected one, we must restart the lookup.
   * We probably met an item that was moved to another chain.
   */
  if (get_nulls_value(node) != slot)
    goto begin;
  obj = NULL;

  out:
  rcu_read_unlock();

2) Insert function
------------------

::

  /*
   * Please note that new inserts are done at the head of list,
   * not in the middle or end.
   */
  obj = kmem_cache_alloc(cachep);
  lock_chain(); // typically a spin_lock()
  obj->key = key;
  /*
   * Changes to obj->key must be visible before the refcnt update.
   */
  smp_wmb();
  atomic_set(&obj->refcnt, 1);
  /*
   * Insert obj the RCU way (readers might be traversing the chain).
   */
  hlist_nulls_add_head_rcu(&obj->obj_node, list);
  unlock_chain(); // typically a spin_unlock()
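
Removal is not spelled out here; it follows the remove algorithm shown
earlier, except that the hlist_nulls deletion helper is used. A sketch
under the same assumptions::

  if (put_last_reference_on(obj)) {
    lock_chain(); // typically a spin_lock()
    hlist_nulls_del_init_rcu(&obj->obj_node);
    unlock_chain(); // typically a spin_unlock()
    kmem_cache_free(cachep, obj);
  }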