xref: /OK3568_Linux_fs/kernel/drivers/infiniband/hw/hfi1/user_exp_rcv.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2015-2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
			   struct hfi1_ctxtdata *uctxt)
{
	int ret = 0;

	fd->entry_to_rb = kcalloc(uctxt->expected_count,
				  sizeof(struct rb_node *),
				  GFP_KERNEL);
	if (!fd->entry_to_rb)
		return -ENOMEM;

	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kcalloc(uctxt->expected_count,
					   sizeof(*fd->invalid_tids),
					   GFP_KERNEL);
		if (!fd->invalid_tids) {
			kfree(fd->entry_to_rb);
			fd->entry_to_rb = NULL;
			return -ENOMEM;
		}
		fd->use_mn = true;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
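	/*
	 * Illustrative example (assumed numbers): with expected_count = 512
	 * and subctxt_cnt = 3, the quotient is 170 and the remainder is 2,
	 * so subctxts 0 and 1 each get a tid_limit of 171 and subctxt 2
	 * gets 170 (171 + 171 + 170 = 512).
	 */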
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && fd->use_mn) {
		u16 remainder;

		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);

	return ret;
}

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	mutex_lock(&uctxt->exp_mutex);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
	mutex_unlock(&uctxt->exp_mutex);

	kfree(fd->invalid_tids);
	fd->invalid_tids = NULL;

	kfree(fd->entry_to_rb);
	fd->entry_to_rb = NULL;
}

/**
 * unpin_rcv_pages() - Release pinned receive buffer pages.
 * @mapped: true if the pages have been DMA mapped, false otherwise.
 * @idx: index of the first page to unpin.
 * @npages: number of pages to unpin.
 *
 * If the pages have been DMA mapped (indicated by the mapped parameter), their
 * info will be passed via a struct tid_rb_node. If they haven't been mapped,
 * their info will be passed via a struct tid_user_buf.
 */
static void unpin_rcv_pages(struct hfi1_filedata *fd,
			    struct tid_user_buf *tidbuf,
			    struct tid_rb_node *node,
			    unsigned int idx,
			    unsigned int npages,
			    bool mapped)
{
	struct page **pages;
	struct hfi1_devdata *dd = fd->uctxt->dd;
	struct mm_struct *mm;

	if (mapped) {
		pci_unmap_single(dd->pcidev, node->dma_addr,
				 node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
		pages = &node->pages[idx];
		mm = mm_from_tid_node(node);
	} else {
		pages = &tidbuf->pages[idx];
		mm = current->mm;
	}
	hfi1_release_user_pages(mm, pages, npages, mapped);
	fd->tid_n_pinned -= npages;
}

/**
 * pin_rcv_pages() - Pin receive buffer pages.
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
	int pinned;
	unsigned int npages;
	unsigned long vaddr = tidbuf->vaddr;
	struct page **pages = NULL;
	struct hfi1_devdata *dd = fd->uctxt->dd;

	/* Get the number of pages the user buffer spans */
	npages = num_user_pages(vaddr, tidbuf->length);
	if (!npages)
		return -EINVAL;

	if (npages > fd->uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
		kfree(pages);
		return -ENOMEM;
	}

	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
	if (pinned <= 0) {
		kfree(pages);
		return pinned;
	}
	tidbuf->pages = pages;
	tidbuf->npages = npages;
	fd->tid_n_pinned += pinned;
	return pinned;
}

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *                    of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as much as
 *          can fit into the group. If the group becomes fully
 *          used, move it to tid_full_list.
 */
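
/*
 * Illustrative walk-through (assumed numbers, group size of 8): a buffer
 * that breaks down into 19 pagesets yields 19 / 8 = 2 full groups, which
 * are taken from tid_group_list, programmed, and moved to tid_full_list.
 * The remaining 3 pagesets go into a group on tid_used_list (pulled from
 * tid_group_list if tid_used_list is empty); that group stays on
 * tid_used_list since only 3 of its 8 entries are used.
 */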
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned int ngroups, pageidx = 0, pageset_count,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;

	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		kfree(tidbuf);
		return -ENOMEM;
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		kfree(tidbuf->psets);
		kfree(tidbuf);
		return pinned;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

	/*
	 * We don't need to access this under a lock since tid_used is per
	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
	 * and hfi1_user_exp_rcv_setup() at the same time.
	 */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count)
		goto bail;

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto nomem;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_mutex);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fd, tidbuf, grp,
				       pageidx, dd->rcv_entries.group_size,
				       tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		pageidx += ret;
		mapped_pages += mapped;
	}

	while (pageidx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put it in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - pageidx,
					     grp->size - grp->used);

			ret = program_rcvarray(fd, tidbuf, grp,
					       pageidx, use, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				pageidx += ret;
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (pageidx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
nomem:
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);
	if (tididx) {
		spin_lock(&fd->tid_lock);
		fd->tid_used += tididx;
		spin_unlock(&fd->tid_lock);
		tinfo->tidcnt = tididx;
		tinfo->length = mapped_pages * PAGE_SIZE;

		if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
				 tidlist, sizeof(tidlist[0]) * tididx)) {
			/*
			 * On failure to copy to the user level, we need to undo
			 * everything done so far so we don't leak resources.
			 */
			tinfo->tidlist = (unsigned long)&tidlist;
			hfi1_user_exp_rcv_clear(fd, tinfo);
			tinfo->tidlist = 0;
			ret = -EFAULT;
			goto bail;
		}
	}

	/*
	 * If not everything was mapped (due to insufficient RcvArray entries,
	 * for example), unpin all unmapped pages so we can pin them next time.
	 */
	if (mapped_pages != pinned)
		unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
				(pinned - mapped_pages), false);
bail:
	kfree(tidbuf->psets);
	kfree(tidlist);
	kfree(tidbuf->pages);
	kfree(tidbuf);
	return ret > 0 ? 0 : ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	if (unlikely(tinfo->tidcnt > fd->tid_used))
		return -EINVAL;

	tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
			      sizeof(tidinfo[0]) * tinfo->tidcnt);
	if (IS_ERR(tidinfo))
		return PTR_ERR(tidinfo);

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_mutex);

	kfree(tidinfo);
	return ret;
}

int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
			      struct hfi1_tid_info *tinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(uctxt_offset(uctxt) + fd->subctxt);
	u32 *array;
	int ret = 0;

	/*
	 * copy_to_user() can sleep, which will leave the invalid_lock
	 * locked and cause the MMU notifier to be blocked on the lock
	 * for a long time.
	 * Copy the data to a local buffer so we can release the lock.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return -EFAULT;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;
	struct page **pages = tidbuf->pages;
	struct tid_pageset *list = tidbuf->psets;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	pfn = page_to_pfn(pages[0]);
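	/*
	 * Note: the loop below runs one index past the last page (using a
	 * PFN of 0 as a sentinel) so that the final run of contiguous pages
	 * is also flushed into pagesets.
	 */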
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the PFNs are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down into
			 * sizes supported by the HW.
			 * There are two main constraints:
			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *        If the total set size is bigger than that,
			 *        program only a MAX_EXPECTED_BUFFER chunk.
			 *     2. The buffer size has to be a power of two. If
			 *        it is not, round down to the closest power of
			 *        2 and program that size.
			 */
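			/*
			 * Illustrative example (assuming 4 KiB pages and a run
			 * smaller than MAX_EXPECTED_BUFFER): a run of 7
			 * contiguous pages is 28 KiB, which is not a power of
			 * two, so successive iterations of the loop below emit
			 * pagesets of 4, 2 and 1 pages.
			 */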
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *	  virtual address, buffer length, page pointers, pagesets (array of
 *	  struct tid_pageset holding information on physically contiguous
 *	  chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *           programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *           entries.
 *
 * This function will program up to 'count' number of RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = tbuf->psets[setidx].count;
		pageidx = tbuf->psets[setidx].idx;

		ret = set_rcvarray_entry(fd, tbuf,
					 rcventry, grp, pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
			EXP_TID_SET(LEN, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}

static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages)
{
	int ret;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	dma_addr_t phys;
	struct page **pages = tbuf->pages + pageidx;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
		       GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	phys = pci_map_single(dd->pcidev,
			      __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->fdata = fd;
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, sizeof(struct page *) * npages);

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&node->notifier, current->mm,
			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
		/*
		 * FIXME: This is in the wrong order, the notifier should be
		 * established before the pages are pinned by pin_rcv_pages.
		 */
		mmu_interval_read_begin(&node->notifier);
	}
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
			       node->notifier.interval_tree.start, node->phys,
			       phys);
	return 0;

out_unmap:
	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
		  node->rcventry, node->notifier.interval_tree.start,
		  node->phys, ret);
	pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
			 PCI_DMA_FROMDEVICE);
	kfree(node);
	return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct tid_rb_node *node;
	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

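	/*
	 * Descriptive note: IDX addresses a pair of consecutive RcvArray
	 * entries (hence the shift by one above) and tidctrl selects the
	 * entry within that pair (1 = first, 2 = second), giving
	 * rcventry = tididx + (tidctrl - 1); a tidctrl of 3 is rejected
	 * below.
	 */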
	if (tididx >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   tididx, uctxt->ctxt);
		return -EINVAL;
	}

	if (tidctrl == 0x3)
		return -EINVAL;

	rcventry = tididx + (tidctrl - 1);

	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (grp)
		*grp = node->grp;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);

	return 0;
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/*
	 * Make sure device has seen the write before we unpin the
	 * pages.
	 */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

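	/*
	 * Descriptive note: if the group was full before this entry was
	 * released (used is now size - 1), move it from tid_full_list back
	 * to tid_used_list; if the group is now completely unused, move it
	 * from tid_used_list back to tid_group_list.
	 */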
	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}

/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd)
{
	struct tid_group *grp, *ptr;
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct tid_rb_node *node;

				node = fd->entry_to_rb[rcventry -
							  uctxt->expected_base];
				if (!node || node->rcventry != rcventry)
					continue;

				if (fd->use_mn)
					mmu_interval_notifier_remove(
						&node->notifier);
				cacheless_tid_rb_remove(fd, node);
			}
		}
	}
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct tid_rb_node *node =
		container_of(mni, struct tid_rb_node, notifier);
	struct hfi1_filedata *fdata = node->fdata;
	struct hfi1_ctxtdata *uctxt = fdata->uctxt;

	if (node->freed)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);
	node->freed = true;

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
		fdata->invalid_tids[fdata->invalid_tid_idx] =
			rcventry2tidinfo(node->rcventry - uctxt->expected_base);
		fdata->invalid_tids[fdata->invalid_tid_idx] |=
			EXP_TID_SET(LEN, node->npages);
		if (!fdata->invalid_tid_idx) {
			unsigned long *ev;

			/*
			 * hfi1_set_uevent_bits() sets a user event flag
			 * for all processes. Because calling into the
			 * driver to process TID cache invalidations is
			 * expensive and TID cache invalidations are
			 * handled on a per-process basis, we can
			 * optimize this to set the flag only for the
			 * process in question.
			 */
			ev = uctxt->dd->events +
				(uctxt_offset(uctxt) + fdata->subctxt);
			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
		}
		fdata->invalid_tid_idx++;
	}
	spin_unlock(&fdata->invalid_lock);
	return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
	clear_tid_node(fdata, tnode);
}