xref: /OK3568_Linux_fs/kernel/drivers/infiniband/hw/hfi1/affinity.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/*
 * Copyright(c) 2015 - 2020 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/numa.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};

/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"NETDEVCTXT",
	"GENERAL",
	"OTHER",
};

/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;

static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}

/* Increment generation of CPU set if needed */
static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
{
	if (cpumask_equal(&set->mask, &set->used)) {
		/*
		 * We've used up all the CPUs, bump up the generation
		 * and reset the 'used' map
		 */
		set->gen++;
		cpumask_clear(&set->used);
	}
}

static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
{
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
}

/* Get the first CPU from the list of unused CPUs in a CPU set data structure */
static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
{
	int cpu;

	if (!diff || !set)
		return -EINVAL;

	_cpu_mask_set_gen_inc(set);

	/* Find out CPUs left in CPU mask */
	cpumask_andnot(diff, &set->mask, &set->used);

	cpu = cpumask_first(diff);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -EINVAL;
	else
		cpumask_set_cpu(cpu, &set->used);

	return cpu;
}

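/*
 * Release a CPU back to a CPU set: clear it from the 'used' mask and
 * drop the generation if the set has fully drained.
 */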
static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
{
	if (!set)
		return;

	cpumask_clear_cpu(cpu, &set->used);
	_cpu_mask_set_gen_dec(set);
}

/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask.  Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1.  Skip over the first N HT siblings and use them as the
	 * "real" cores.  Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}

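/*
 * Initialize the global node_affinity state: the process CPU set, core
 * sibling/node/CPU counts, and the "real" (non-HT) CPU mask.  Also walk
 * hfi1_pci_tbl to count HFI1 devices per NUMA node; if the PCI NUMA
 * information is invalid, fall back to assuming one device per node.
 */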
int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
				cpumask_weight(topology_sibling_cpumask(
					cpumask_first(&node_affinity.proc.mask)
					));
	node_affinity.num_possible_nodes = num_possible_nodes();
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				goto out;

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;

out:
	/*
	 * Invalid PCI NUMA node information found, note it, and populate
	 * our database 1:1.
	 */
	pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
	pr_err("HFI: System BIOS may need to be upgraded\n");
	for (node = 0; node < node_affinity.num_possible_nodes; node++)
		hfi1_per_node_cntr[node] = 1;

	return 0;
}

static void node_affinity_destroy(struct hfi1_affinity_node *entry)
{
	free_percpu(entry->comp_vect_affinity);
	kfree(entry);
}

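/* Free every per-node affinity entry and the per-node device counters */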
void node_affinity_destroy_all(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		node_affinity_destroy(entry);
	}
	mutex_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}

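/* Allocate and initialize an affinity entry for the given NUMA node */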
static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	entry->comp_vect_affinity = alloc_percpu(u16);
	INIT_LIST_HEAD(&entry->list);

	return entry;
}

/*
 * It appends an entry to the list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct list_head *pos;
	struct hfi1_affinity_node *entry;

	list_for_each(pos, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node, list);
		if (entry->node == node)
			return entry;
	}

	return NULL;
}

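/*
 * Pick the CPU in possible_cpumask with the lowest per-CPU reference
 * count in comp_vect_affinity, increment that count, and return the CPU.
 * Returns -EINVAL on bad arguments or an empty mask.
 */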
static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	u16 cntr;
	u16 prev_cntr;
	int ret_cpu;

	if (!possible_cpumask) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	if (!comp_vect_affinity) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	ret_cpu = cpumask_first(possible_cpumask);
	if (ret_cpu >= nr_cpu_ids) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr < prev_cntr) {
			ret_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;

fail:
	return ret_cpu;
}

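/*
 * Find the CPU in possible_cpumask with the highest per-CPU reference
 * count in comp_vect_affinity, decrement that count, and return the CPU.
 * Returns -EINVAL on bad arguments or an empty mask.
 */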
static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				    u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	int max_cpu;
	u16 cntr;
	u16 prev_cntr;

	if (!possible_cpumask)
		return -EINVAL;

	if (!comp_vect_affinity)
		return -EINVAL;

	max_cpu = cpumask_first(possible_cpumask);
	if (max_cpu >= nr_cpu_ids)
		return -EINVAL;

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr > prev_cntr) {
			max_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;

	return max_cpu;
}

/*
 * Non-interrupt CPUs are used first, then interrupt CPUs.
 * Two already allocated cpu masks must be passed.
 */
static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
				  struct hfi1_affinity_node *entry,
				  cpumask_var_t non_intr_cpus,
				  cpumask_var_t available_cpus)
	__must_hold(&node_affinity.lock)
{
	int cpu;
	struct cpu_mask_set *set = dd->comp_vect;

	lockdep_assert_held(&node_affinity.lock);
	if (!non_intr_cpus) {
		cpu = -1;
		goto fail;
	}

	if (!available_cpus) {
		cpu = -1;
		goto fail;
	}

	/* Available CPUs for pinning completion vectors */
	_cpu_mask_set_gen_inc(set);
	cpumask_andnot(available_cpus, &set->mask, &set->used);

	/* Available CPUs without SDMA engine interrupts */
	cpumask_andnot(non_intr_cpus, available_cpus,
		       &entry->def_intr.used);

	/* If there are non-interrupt CPUs available, use them first */
	if (!cpumask_empty(non_intr_cpus))
		cpu = cpumask_first(non_intr_cpus);
	else /* Otherwise, use interrupt CPUs */
		cpu = cpumask_first(available_cpus);

	if (cpu >= nr_cpu_ids) { /* empty */
		cpu = -1;
		goto fail;
	}
	cpumask_set_cpu(cpu, &set->used);

fail:
	return cpu;
}

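/* Release a CPU that was reserved for a device completion vector */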
static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
{
	struct cpu_mask_set *set = dd->comp_vect;

	if (cpu < 0)
		return;

	cpu_mask_set_put(set, cpu);
}

/* _dev_comp_vect_mappings_destroy() is reentrant */
static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
{
	int i, cpu;

	if (!dd->comp_vect_mappings)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = dd->comp_vect_mappings[i];
		_dev_comp_vect_cpu_put(dd, cpu);
		dd->comp_vect_mappings[i] = -1;
		hfi1_cdbg(AFFINITY,
			  "[%s] Release CPU %d from completion vector %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
	}

	kfree(dd->comp_vect_mappings);
	dd->comp_vect_mappings = NULL;
}

/*
 * This function creates the table for looking up CPUs for completion vectors.
 * num_comp_vectors needs to have been initialized before calling this function.
 */
static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
					  struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu, ret;
	cpumask_var_t non_intr_cpus;
	cpumask_var_t available_cpus;

	lockdep_assert_held(&node_affinity.lock);

	if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
		free_cpumask_var(non_intr_cpus);
		return -ENOMEM;
	}

	dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
					 sizeof(*dd->comp_vect_mappings),
					 GFP_KERNEL);
	if (!dd->comp_vect_mappings) {
		ret = -ENOMEM;
		goto fail;
	}
	for (i = 0; i < dd->comp_vect_possible_cpus; i++)
		dd->comp_vect_mappings[i] = -1;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
					     available_cpus);
		if (cpu < 0) {
			ret = -EINVAL;
			goto fail;
		}

		dd->comp_vect_mappings[i] = cpu;
		hfi1_cdbg(AFFINITY,
			  "[%s] Completion Vector %d -> CPU %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
	}

	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	return 0;

fail:
	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	_dev_comp_vect_mappings_destroy(dd);

	return ret;
}

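/*
 * Build the completion vector to CPU mappings for a device under the
 * node_affinity lock.  Fails if the device's NUMA node has no affinity
 * entry yet.
 */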
int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
{
	int ret;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry) {
		ret = -EINVAL;
		goto unlock;
	}
	ret = _dev_comp_vect_mappings_create(dd, entry);
unlock:
	mutex_unlock(&node_affinity.lock);

	return ret;
}

void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
{
	_dev_comp_vect_mappings_destroy(dd);
}

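/* Return the CPU mapped to a completion vector, or -EINVAL if unmapped */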
int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
{
	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);

	if (!dd->comp_vect_mappings)
		return -EINVAL;
	if (comp_vect >= dd->comp_vect_possible_cpus)
		return -EINVAL;

	return dd->comp_vect_mappings[comp_vect];
}

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
					struct hfi1_affinity_node *entry,
					bool first_dev_init)
	__must_hold(&node_affinity.lock)
{
	int i, j, curr_cpu;
	int possible_cpus_comp_vect = 0;
	struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;

	lockdep_assert_held(&node_affinity.lock);
	/*
	 * If there's only one CPU available for completion vectors, then
	 * there will only be one completion vector available. Otherwise,
	 * the number of completion vectors available will be the number of
	 * available CPUs divided by the number of devices in the
	 * local NUMA node.
	 */
	if (cpumask_weight(&entry->comp_vect_mask) == 1) {
		possible_cpus_comp_vect = 1;
		dd_dev_warn(dd,
			    "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
	} else {
		possible_cpus_comp_vect +=
			cpumask_weight(&entry->comp_vect_mask) /
				       hfi1_per_node_cntr[dd->node];

		/*
		 * If the available completion vector CPUs don't divide
		 * evenly among devices, then the first device to be
		 * initialized gets an extra CPU.
		 */
		if (first_dev_init &&
		    cpumask_weight(&entry->comp_vect_mask) %
		    hfi1_per_node_cntr[dd->node] != 0)
			possible_cpus_comp_vect++;
	}

	dd->comp_vect_possible_cpus = possible_cpus_comp_vect;

	/* Reserving CPUs for device completion vector */
	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
						entry->comp_vect_affinity);
		if (curr_cpu < 0)
			goto fail;

		cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
	}

	hfi1_cdbg(AFFINITY,
		  "[%s] Completion vector affinity CPU set(s) %*pbl",
		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
		  cpumask_pr_args(dev_comp_vect_mask));

	return 0;

fail:
	for (j = 0; j < i; j++)
		per_cpu_affinity_put_max(&entry->comp_vect_mask,
					 entry->comp_vect_affinity);

	return curr_cpu;
}

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
					     struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu;

	lockdep_assert_held(&node_affinity.lock);
	if (!dd->comp_vect_possible_cpus)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
					       entry->comp_vect_affinity);
		/* Clearing CPU in device completion vector cpu mask */
		if (cpu >= 0)
			cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
	}

	dd->comp_vect_possible_cpus = 0;
}

/*
 * Interrupt affinity.
 *
 * non-rcv avail gets a default mask that
 * starts as possible cpus with threads reset
 * and each rcv avail reset.
 *
 * rcv avail gets node relative 1 wrapping back
 * to the node relative 1 as necessary.
 *
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i, ret;
	bool new_entry = false;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(dd->node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			ret = -ENOMEM;
			goto fail;
		}
		new_entry = true;

		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->comp_vect_mask);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_weight(&entry->def_intr.mask) == 0)
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		/* Determine completion vector CPUs for the entire node */
		cpumask_and(&entry->comp_vect_mask,
			    &node_affinity.real_cpu_mask, local_mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->rcv_intr.mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->general_intr_mask);

		/*
		 * If there ends up being 0 CPU cores leftover for completion
		 * vectors, use the same CPU core as the general/control
		 * context.
		 */
		if (cpumask_weight(&entry->comp_vect_mask) == 0)
			cpumask_copy(&entry->comp_vect_mask,
				     &entry->general_intr_mask);
	}

	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
	if (ret < 0)
		goto fail;

	if (new_entry)
		node_affinity_add_tail(entry);

	dd->affinity_entry = entry;
	mutex_unlock(&node_affinity.lock);

	return 0;

fail:
	if (new_entry)
		node_affinity_destroy(entry);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

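/*
 * Release the completion vector CPUs reserved for this device and clear
 * the device's pointer to its NUMA node affinity entry.
 */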
void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	if (!dd->affinity_entry)
		goto unlock;
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	/*
	 * Free device completion vector CPUs to be used by future
	 * completion vectors
	 */
	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
	dd->affinity_entry = NULL;
	mutex_unlock(&node_affinity.lock);
}

/*
 * Function updates the irq affinity hint for msix after it has been changed
 * by the user using the /proc/irq interface. This function only accepts
 * one cpu in the mask.
 */
static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
{
	struct sdma_engine *sde = msix->arg;
	struct hfi1_devdata *dd = sde->dd;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set;
	int i, old_cpu;

	if (cpu > num_online_cpus() || cpu == sde->cpu)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	old_cpu = sde->cpu;
	sde->cpu = cpu;
	cpumask_clear(&msix->mask);
	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
		   msix->irq, irq_type_names[msix->type],
		   sde->this_idx, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	/*
	 * Set the new cpu in the hfi1_affinity_node and clean
	 * the old cpu if it is not used by any other IRQ
	 */
	set = &entry->def_intr;
	cpumask_set_cpu(cpu, &set->mask);
	cpumask_set_cpu(cpu, &set->used);
	for (i = 0; i < dd->msix_info.max_requested; i++) {
		struct hfi1_msix_entry *other_msix;

		other_msix = &dd->msix_info.msix_entries[i];
		if (other_msix->type != IRQ_SDMA || other_msix == msix)
			continue;

		if (cpumask_test_cpu(old_cpu, &other_msix->mask))
			goto unlock;
	}
	cpumask_clear_cpu(old_cpu, &set->mask);
	cpumask_clear_cpu(old_cpu, &set->used);
unlock:
	mutex_unlock(&node_affinity.lock);
}

static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				     const cpumask_t *mask)
{
	int cpu = cpumask_first(mask);
	struct hfi1_msix_entry *msix = container_of(notify,
						    struct hfi1_msix_entry,
						    notify);

	/* Only one CPU configuration supported currently */
	hfi1_update_sdma_affinity(msix, cpu);
}

static void hfi1_irq_notifier_release(struct kref *ref)
{
	/*
	 * This is required by affinity notifier. We don't have anything to
	 * free here.
	 */
}

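/*
 * Register an affinity notifier so that user changes made through
 * /proc/irq are reflected in the SDMA engine's recorded CPU.
 */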
static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	notify->irq = msix->irq;
	notify->notify = hfi1_irq_notifier_notify;
	notify->release = hfi1_irq_notifier_release;

	if (irq_set_affinity_notifier(notify->irq, notify))
		pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	if (irq_set_affinity_notifier(notify->irq, NULL))
		pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

/*
 * Function sets the irq affinity for msix.
 * It *must* be called with node_affinity.lock held.
 */
static int get_irq_affinity(struct hfi1_devdata *dd,
			    struct hfi1_msix_entry *msix)
{
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above. Skip accounting for it. Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
			return -ENOMEM;

		cpu = cpu_mask_set_get_first(set, diff);
		if (cpu < 0) {
			free_cpumask_var(diff);
			dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
			return cpu;
		}

		free_cpumask_var(diff);
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
		    msix->irq, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	if (msix->type == IRQ_SDMA) {
		sde->cpu = cpu;
		hfi1_setup_sdma_notifier(msix);
	}

	return 0;
}

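/* Locked wrapper around get_irq_affinity() */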
int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;

	mutex_lock(&node_affinity.lock);
	ret = get_irq_affinity(dd, msix);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

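/*
 * Undo get_irq_affinity(): release the CPU accounted to this MSI-X
 * vector, clear its affinity hint and, for SDMA vectors, remove the
 * affinity notifier.
 */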
void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_ctxtdata *rcd;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		hfi1_cleanup_sdma_notifier(msix);
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		break;
	default:
		mutex_unlock(&node_affinity.lock);
		return;
	}

	if (set) {
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		_cpu_mask_set_gen_dec(set);
	}

	irq_set_affinity_hint(msix->irq, NULL);
	cpumask_clear(&msix->mask);
	mutex_unlock(&node_affinity.lock);
}

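/*
 * Build the CPU mask for the hw_thread_no'th hardware thread on each
 * physical core: keep the first (cores per socket * online nodes) CPUs
 * of the process mask and shift the result to the requested sibling set.
 */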
/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int possible, curr_cpu, i;
	uint num_cores_per_socket = node_affinity.num_online_cpus /
					affinity->num_core_siblings /
						node_affinity.num_online_nodes;

	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
	if (affinity->num_core_siblings > 0) {
		/* Removing other siblings not needed for now */
		possible = cpumask_weight(hw_thread_mask);
		curr_cpu = cpumask_first(hw_thread_mask);
		for (i = 0;
		     i < num_cores_per_socket * node_affinity.num_online_nodes;
		     i++)
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

		for (; i < possible; i++) {
			cpumask_clear_cpu(curr_cpu, hw_thread_mask);
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
		}

		/* Identifying correct HW threads within physical cores */
		cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				   num_cores_per_socket *
				   node_affinity.num_online_nodes *
				   hw_thread_no);
	}
}

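/*
 * Recommend a CPU for a user process opening a context on a device whose
 * NUMA node is 'node', following the algorithm described below.
 * Returns the selected CPU, or -1 if no recommendation is made.
 */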
int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = current->cpus_ptr;
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (current->nr_cpus_allowed == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (First set of HT
	 *     cores on all physical cores, then second set of HT core,
	 *     and, so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	mutex_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	_cpu_mask_set_gen_inc(set);

	/*
	 * If NUMA node has CPUs used by interrupt handlers, include them in the
	 * interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will always be non-empty at least once in this
			 * loop as the used mask gets reset when
			 * (set->mask == set->used) before this loop.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);

	mutex_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}

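/* Return a CPU previously handed out by hfi1_get_proc_affinity() */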
void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;

	mutex_lock(&affinity->lock);
	cpu_mask_set_put(set, cpu);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	mutex_unlock(&affinity->lock);
}