/*
 * Copyright(c) 2015 - 2020 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/numa.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};

/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"NETDEVCTXT",
	"GENERAL",
	"OTHER",
};

/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;
static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}

/* Increment generation of CPU set if needed */
static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
{
	if (cpumask_equal(&set->mask, &set->used)) {
		/*
		 * We've used up all the CPUs, bump up the generation
		 * and reset the 'used' map
		 */
		set->gen++;
		cpumask_clear(&set->used);
	}
}

static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
{
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
}

/* Get the first CPU from the list of unused CPUs in a CPU set data structure */
static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
{
	int cpu;

	if (!diff || !set)
		return -EINVAL;

	_cpu_mask_set_gen_inc(set);

	/* Find out CPUs left in CPU mask */
	cpumask_andnot(diff, &set->mask, &set->used);

	cpu = cpumask_first(diff);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -EINVAL;
	else
		cpumask_set_cpu(cpu, &set->used);

	return cpu;
}

static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
{
	if (!set)
		return;

	cpumask_clear_cpu(cpu, &set->used);
	_cpu_mask_set_gen_dec(set);
}
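
/*
 * Usage sketch for the cpu_mask_set helpers above (illustrative only):
 *
 *	cpumask_var_t diff;
 *
 *	if (zalloc_cpumask_var(&diff, GFP_KERNEL)) {
 *		int cpu = cpu_mask_set_get_first(set, diff);
 *
 *		if (cpu >= 0) {
 *			// ... bind work to 'cpu' ...
 *			cpu_mask_set_put(set, cpu);
 *		}
 *		free_cpumask_var(diff);
 *	}
 *
 * Each pass over 'mask' hands every CPU out once via 'used'; once all CPUs
 * are used, the generation counter is bumped and 'used' is cleared so CPUs
 * can be oversubscribed on the next pass.
 */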

/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask. Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1. Skip over the first N HT siblings and use them as the
	 * "real" cores. Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2. Remove the remaining HT siblings. Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}
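
/*
 * Illustration (hypothetical topology): with 8 online CPUs, 2 HW threads
 * per core, and siblings enumerated as (0,4), (1,5), (2,6), (3,7), the
 * loops above keep possible / ht = 8 / 2 = 4 "real" cores, i.e. CPUs 0-3,
 * and clear the sibling CPUs 4-7 from real_cpu_mask.
 */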

int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
		cpumask_weight(topology_sibling_cpumask(
			cpumask_first(&node_affinity.proc.mask)
			));
	node_affinity.num_possible_nodes = num_possible_nodes();
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				goto out;

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;

out:
	/*
	 * Invalid PCI NUMA node information found, note it, and populate
	 * our database 1:1.
	 */
	pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
	pr_err("HFI: System BIOS may need to be upgraded\n");
	for (node = 0; node < node_affinity.num_possible_nodes; node++)
		hfi1_per_node_cntr[node] = 1;

	return 0;
}
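
/*
 * Example (hypothetical system): with one HFI adapter behind each of two
 * sockets, the PCI walk above yields hfi1_per_node_cntr[0] == 1 and
 * hfi1_per_node_cntr[1] == 1; these counts are later used to split a
 * node's completion vector CPUs among its local devices.
 */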

static void node_affinity_destroy(struct hfi1_affinity_node *entry)
{
	free_percpu(entry->comp_vect_affinity);
	kfree(entry);
}

void node_affinity_destroy_all(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		node_affinity_destroy(entry);
	}
	mutex_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}

static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	entry->comp_vect_affinity = alloc_percpu(u16);
	INIT_LIST_HEAD(&entry->list);

	return entry;
}

/*
 * It appends an entry to the list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct list_head *pos;
	struct hfi1_affinity_node *entry;

	list_for_each(pos, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node, list);
		if (entry->node == node)
			return entry;
	}

	return NULL;
}

static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	u16 cntr;
	u16 prev_cntr;
	int ret_cpu;

	if (!possible_cpumask) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	if (!comp_vect_affinity) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	ret_cpu = cpumask_first(possible_cpumask);
	if (ret_cpu >= nr_cpu_ids) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr < prev_cntr) {
			ret_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;

fail:
	return ret_cpu;
}

static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				    u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	int max_cpu;
	u16 cntr;
	u16 prev_cntr;

	if (!possible_cpumask)
		return -EINVAL;

	if (!comp_vect_affinity)
		return -EINVAL;

	max_cpu = cpumask_first(possible_cpumask);
	if (max_cpu >= nr_cpu_ids)
		return -EINVAL;

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr > prev_cntr) {
			max_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;

	return max_cpu;
}
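
/*
 * Illustration of the two helpers above (hypothetical counter values): with
 * per-CPU reference counts {CPU0: 2, CPU1: 1, CPU2: 3},
 * per_cpu_affinity_get() returns CPU1 (the least loaded) and bumps its
 * count to 2, while per_cpu_affinity_put_max() returns CPU2 (the most
 * loaded) and drops its count to 2.
 */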

/*
 * Non-interrupt CPUs are used first, then interrupt CPUs.
 * Two already allocated cpu masks must be passed.
 */
static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
				  struct hfi1_affinity_node *entry,
				  cpumask_var_t non_intr_cpus,
				  cpumask_var_t available_cpus)
	__must_hold(&node_affinity.lock)
{
	int cpu;
	struct cpu_mask_set *set = dd->comp_vect;

	lockdep_assert_held(&node_affinity.lock);
	if (!non_intr_cpus) {
		cpu = -1;
		goto fail;
	}

	if (!available_cpus) {
		cpu = -1;
		goto fail;
	}

	/* Available CPUs for pinning completion vectors */
	_cpu_mask_set_gen_inc(set);
	cpumask_andnot(available_cpus, &set->mask, &set->used);

	/* Available CPUs without SDMA engine interrupts */
	cpumask_andnot(non_intr_cpus, available_cpus,
		       &entry->def_intr.used);

	/* If there are non-interrupt CPUs available, use them first */
	if (!cpumask_empty(non_intr_cpus))
		cpu = cpumask_first(non_intr_cpus);
	else /* Otherwise, use interrupt CPUs */
		cpu = cpumask_first(available_cpus);

	if (cpu >= nr_cpu_ids) { /* empty */
		cpu = -1;
		goto fail;
	}
	cpumask_set_cpu(cpu, &set->used);

fail:
	return cpu;
}

static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
{
	struct cpu_mask_set *set = dd->comp_vect;

	if (cpu < 0)
		return;

	cpu_mask_set_put(set, cpu);
}

/* _dev_comp_vect_mappings_destroy() is reentrant */
static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
{
	int i, cpu;

	if (!dd->comp_vect_mappings)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = dd->comp_vect_mappings[i];
		_dev_comp_vect_cpu_put(dd, cpu);
		dd->comp_vect_mappings[i] = -1;
		hfi1_cdbg(AFFINITY,
			  "[%s] Release CPU %d from completion vector %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
	}

	kfree(dd->comp_vect_mappings);
	dd->comp_vect_mappings = NULL;
}

/*
 * This function creates the table for looking up CPUs for completion vectors.
 * num_comp_vectors needs to have been initialized before calling this function.
 */
static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
					  struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu, ret;
	cpumask_var_t non_intr_cpus;
	cpumask_var_t available_cpus;

	lockdep_assert_held(&node_affinity.lock);

	if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
		free_cpumask_var(non_intr_cpus);
		return -ENOMEM;
	}

	dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
					 sizeof(*dd->comp_vect_mappings),
					 GFP_KERNEL);
	if (!dd->comp_vect_mappings) {
		ret = -ENOMEM;
		goto fail;
	}
	for (i = 0; i < dd->comp_vect_possible_cpus; i++)
		dd->comp_vect_mappings[i] = -1;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
					     available_cpus);
		if (cpu < 0) {
			ret = -EINVAL;
			goto fail;
		}

		dd->comp_vect_mappings[i] = cpu;
		hfi1_cdbg(AFFINITY,
			  "[%s] Completion Vector %d -> CPU %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
	}

	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	return 0;

fail:
	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	_dev_comp_vect_mappings_destroy(dd);

	return ret;
}

int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
{
	int ret;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry) {
		ret = -EINVAL;
		goto unlock;
	}
	ret = _dev_comp_vect_mappings_create(dd, entry);
unlock:
	mutex_unlock(&node_affinity.lock);

	return ret;
}

void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
{
	_dev_comp_vect_mappings_destroy(dd);
}

int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
{
	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);

	if (!dd->comp_vect_mappings)
		return -EINVAL;
	if (comp_vect >= dd->comp_vect_possible_cpus)
		return -EINVAL;

	return dd->comp_vect_mappings[comp_vect];
}
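
/*
 * Note (sketch, not verified here): rdmavt is expected to reach
 * hfi1_comp_vect_mappings_lookup() through its comp_vect_cpu_lookup driver
 * callback, conceptually
 *
 *	cpu = rdi->driver_f.comp_vect_cpu_lookup(rdi, comp_vect);
 *
 * so that completion work for a given completion vector runs on the CPU
 * reserved for it above.
 */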

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
					struct hfi1_affinity_node *entry,
					bool first_dev_init)
	__must_hold(&node_affinity.lock)
{
	int i, j, curr_cpu;
	int possible_cpus_comp_vect = 0;
	struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;

	lockdep_assert_held(&node_affinity.lock);
	/*
	 * If there's only one CPU available for completion vectors, then
	 * there will only be one completion vector available. Otherwise,
	 * the number of completion vectors available will be the number of
	 * available CPUs divided by the number of devices in the
	 * local NUMA node.
	 */
	if (cpumask_weight(&entry->comp_vect_mask) == 1) {
		possible_cpus_comp_vect = 1;
		dd_dev_warn(dd,
			    "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
	} else {
		possible_cpus_comp_vect +=
			cpumask_weight(&entry->comp_vect_mask) /
			hfi1_per_node_cntr[dd->node];

		/*
		 * If the available completion vector CPUs don't divide
		 * evenly among devices, then the first device to be
		 * initialized gets an extra CPU.
		 */
		if (first_dev_init &&
		    cpumask_weight(&entry->comp_vect_mask) %
		    hfi1_per_node_cntr[dd->node] != 0)
			possible_cpus_comp_vect++;
	}

	dd->comp_vect_possible_cpus = possible_cpus_comp_vect;

	/* Reserving CPUs for device completion vector */
	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
						entry->comp_vect_affinity);
		if (curr_cpu < 0)
			goto fail;

		cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
	}

	hfi1_cdbg(AFFINITY,
		  "[%s] Completion vector affinity CPU set(s) %*pbl",
		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
		  cpumask_pr_args(dev_comp_vect_mask));

	return 0;

fail:
	for (j = 0; j < i; j++)
		per_cpu_affinity_put_max(&entry->comp_vect_mask,
					 entry->comp_vect_affinity);

	return curr_cpu;
}
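
/*
 * Illustration (hypothetical node): with 13 completion vector CPUs on a
 * node hosting 2 HFI devices, each device gets 13 / 2 = 6 CPUs, and the
 * first device initialized picks up the remainder for 7 in total; each
 * reserved CPU is then tracked in the per-CPU comp_vect_affinity counters.
 */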

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
					     struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu;

	lockdep_assert_held(&node_affinity.lock);
	if (!dd->comp_vect_possible_cpus)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
					       entry->comp_vect_affinity);
		/* Clearing CPU in device completion vector cpu mask */
		if (cpu >= 0)
			cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
	}

	dd->comp_vect_possible_cpus = 0;
}

/*
 * Interrupt affinity.
 *
 * non-rcv avail gets a default mask that
 * starts as possible cpus with threads reset
 * and each rcv avail reset.
 *
 * rcv avail gets node relative 1 wrapping back
 * to the node relative 1 as necessary.
 *
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i, ret;
	bool new_entry = false;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(dd->node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			ret = -ENOMEM;
			goto fail;
		}
		new_entry = true;

		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->comp_vect_mask);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_weight(&entry->def_intr.mask) == 0)
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		/* Determine completion vector CPUs for the entire node */
		cpumask_and(&entry->comp_vect_mask,
			    &node_affinity.real_cpu_mask, local_mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->rcv_intr.mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->general_intr_mask);

		/*
		 * If there ends up being 0 CPU cores leftover for completion
		 * vectors, use the same CPU core as the general/control
		 * context.
		 */
		if (cpumask_weight(&entry->comp_vect_mask) == 0)
			cpumask_copy(&entry->comp_vect_mask,
				     &entry->general_intr_mask);
	}

	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
	if (ret < 0)
		goto fail;

	if (new_entry)
		node_affinity_add_tail(entry);

	dd->affinity_entry = entry;
	mutex_unlock(&node_affinity.lock);

	return 0;

fail:
	if (new_entry)
		node_affinity_destroy(entry);
	mutex_unlock(&node_affinity.lock);
	return ret;
}
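
/*
 * Illustration (hypothetical node): with "real" CPUs 0-13 local to the
 * device, one device on the node, and 9 kernel receive queues, CPU 0
 * becomes the general/control interrupt CPU, CPUs 1-8 move to the receive
 * interrupt mask, CPUs 9-13 remain in the default (SDMA) mask, and the
 * completion vector mask is whatever is left of the real local CPUs after
 * removing the receive and general interrupt CPUs, i.e. CPUs 9-13.
 */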

void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	if (!dd->affinity_entry)
		goto unlock;
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	/*
	 * Free device completion vector CPUs to be used by future
	 * completion vectors
	 */
	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
	dd->affinity_entry = NULL;
	mutex_unlock(&node_affinity.lock);
}

/*
 * Function updates the irq affinity hint for msix after it has been changed
 * by the user using the /proc/irq interface. This function only accepts
 * one cpu in the mask.
 */
static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
{
	struct sdma_engine *sde = msix->arg;
	struct hfi1_devdata *dd = sde->dd;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set;
	int i, old_cpu;

	if (cpu > num_online_cpus() || cpu == sde->cpu)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	old_cpu = sde->cpu;
	sde->cpu = cpu;
	cpumask_clear(&msix->mask);
	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
		   msix->irq, irq_type_names[msix->type],
		   sde->this_idx, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	/*
	 * Set the new cpu in the hfi1_affinity_node and clean
	 * the old cpu if it is not used by any other IRQ
	 */
	set = &entry->def_intr;
	cpumask_set_cpu(cpu, &set->mask);
	cpumask_set_cpu(cpu, &set->used);
	for (i = 0; i < dd->msix_info.max_requested; i++) {
		struct hfi1_msix_entry *other_msix;

		other_msix = &dd->msix_info.msix_entries[i];
		if (other_msix->type != IRQ_SDMA || other_msix == msix)
			continue;

		if (cpumask_test_cpu(old_cpu, &other_msix->mask))
			goto unlock;
	}
	cpumask_clear_cpu(old_cpu, &set->mask);
	cpumask_clear_cpu(old_cpu, &set->used);
unlock:
	mutex_unlock(&node_affinity.lock);
}
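
/*
 * Example trigger (hypothetical IRQ number): writing a single CPU to the
 * procfs affinity file of an SDMA interrupt, e.g.
 *
 *	echo 4 > /proc/irq/123/smp_affinity_list
 *
 * causes the affinity notifier below to fire and the function above to
 * migrate the SDMA engine's bookkeeping to CPU 4.
 */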

static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				     const cpumask_t *mask)
{
	int cpu = cpumask_first(mask);
	struct hfi1_msix_entry *msix = container_of(notify,
						    struct hfi1_msix_entry,
						    notify);

	/* Only one CPU configuration supported currently */
	hfi1_update_sdma_affinity(msix, cpu);
}

static void hfi1_irq_notifier_release(struct kref *ref)
{
	/*
	 * This is required by affinity notifier. We don't have anything to
	 * free here.
	 */
}

static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	notify->irq = msix->irq;
	notify->notify = hfi1_irq_notifier_notify;
	notify->release = hfi1_irq_notifier_release;

	if (irq_set_affinity_notifier(notify->irq, notify))
		pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	if (irq_set_affinity_notifier(notify->irq, NULL))
		pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

/*
 * Function sets the irq affinity for msix.
 * It *must* be called with node_affinity.lock held.
 */
static int get_irq_affinity(struct hfi1_devdata *dd,
			    struct hfi1_msix_entry *msix)
{
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above. Skip accounting for it. Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
			return -ENOMEM;

		cpu = cpu_mask_set_get_first(set, diff);
		if (cpu < 0) {
			free_cpumask_var(diff);
			dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
			return cpu;
		}

		free_cpumask_var(diff);
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
		    msix->irq, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	if (msix->type == IRQ_SDMA) {
		sde->cpu = cpu;
		hfi1_setup_sdma_notifier(msix);
	}

	return 0;
}

int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;

	mutex_lock(&node_affinity.lock);
	ret = get_irq_affinity(dd, msix);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_ctxtdata *rcd;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		hfi1_cleanup_sdma_notifier(msix);
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		break;
	default:
		mutex_unlock(&node_affinity.lock);
		return;
	}

	if (set) {
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		_cpu_mask_set_gen_dec(set);
	}

	irq_set_affinity_hint(msix->irq, NULL);
	cpumask_clear(&msix->mask);
	mutex_unlock(&node_affinity.lock);
}

/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int possible, curr_cpu, i;
	uint num_cores_per_socket = node_affinity.num_online_cpus /
					affinity->num_core_siblings /
						node_affinity.num_online_nodes;

	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
	if (affinity->num_core_siblings > 0) {
		/* Removing other siblings not needed for now */
		possible = cpumask_weight(hw_thread_mask);
		curr_cpu = cpumask_first(hw_thread_mask);
		for (i = 0;
		     i < num_cores_per_socket * node_affinity.num_online_nodes;
		     i++)
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

		for (; i < possible; i++) {
			cpumask_clear_cpu(curr_cpu, hw_thread_mask);
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
		}

		/* Identifying correct HW threads within physical cores */
		cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				   num_cores_per_socket *
				   node_affinity.num_online_nodes *
				   hw_thread_no);
	}
}
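
/*
 * Illustration (hypothetical topology): with 2 sockets, 8 cores per socket,
 * and 2 HW threads per core (32 online CPUs enumerated 0-15 for the first
 * threads and 16-31 for their siblings), num_cores_per_socket is
 * 32 / 2 / 2 = 8.  hw_thread_no == 0 keeps CPUs 0-15, while
 * hw_thread_no == 1 shifts the mask left by 16 to select CPUs 16-31,
 * i.e. the second HW thread of every physical core.
 */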

int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = current->cpus_ptr;
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (current->nr_cpus_allowed == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (First set of HT
	 *     cores on all physical cores, then second set of HT core,
	 *     and, so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	mutex_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	_cpu_mask_set_gen_inc(set);

	/*
	 * If NUMA node has CPUs used by interrupt handlers, include them in the
	 * interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will be non-empty at least once in this loop
			 * because the used mask gets reset when
			 * (set->mask == set->used) before this loop.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs available that are
	 *    not running interrupt handlers, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);

	mutex_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}

void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;

	mutex_lock(&affinity->lock);
	cpu_mask_set_put(set, cpu);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	mutex_unlock(&affinity->lock);
}
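
/*
 * Usage sketch (illustrative, not the exact call sites): when a user
 * process opens a context, the driver can recommend and reserve a CPU with
 *
 *	cpu = hfi1_get_proc_affinity(dd->node);
 *
 * and release it again on context teardown with
 *
 *	hfi1_put_proc_affinity(cpu);
 *
 * A negative return from hfi1_get_proc_affinity() simply means no
 * recommendation was made and nothing needs to be put back.
 */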