/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2015-2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 * - Neither the name of Intel Corporation nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
			   struct hfi1_ctxtdata *uctxt)
{
	int ret = 0;

	fd->entry_to_rb = kcalloc(uctxt->expected_count,
				  sizeof(struct rb_node *),
				  GFP_KERNEL);
	if (!fd->entry_to_rb)
		return -ENOMEM;

	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kcalloc(uctxt->expected_count,
					   sizeof(*fd->invalid_tids),
					   GFP_KERNEL);
		if (!fd->invalid_tids) {
			kfree(fd->entry_to_rb);
			fd->entry_to_rb = NULL;
			return -ENOMEM;
		}
		fd->use_mn = true;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
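	/*
	 * Worked example with hypothetical numbers: expected_count = 2048
	 * and subctxt_cnt = 3 give a base share of 682 entries with a
	 * remainder of 2, so subctxts 0 and 1 get 683 entries each and
	 * subctxt 2 gets 682.
	 */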
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && fd->use_mn) {
		u16 remainder;

		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);

	return ret;
}

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	mutex_lock(&uctxt->exp_mutex);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
	mutex_unlock(&uctxt->exp_mutex);

	kfree(fd->invalid_tids);
	fd->invalid_tids = NULL;

	kfree(fd->entry_to_rb);
	fd->entry_to_rb = NULL;
}

/**
 * unpin_rcv_pages() - Release pinned receive buffer pages.
 * @fd: per-file private data
 * @tidbuf: user buffer descriptor (used when the pages are not DMA mapped)
 * @node: TID RB node (used when the pages have been DMA mapped)
 * @idx: index of the first page to unpin
 * @npages: number of pages to unpin
 * @mapped: true if the pages have been DMA mapped, false otherwise
 *
 * If the pages have been DMA mapped (indicated by the mapped parameter), their
 * info will be passed via a struct tid_rb_node. If they haven't been mapped,
 * their info will be passed via a struct tid_user_buf.
 */
static void unpin_rcv_pages(struct hfi1_filedata *fd,
			    struct tid_user_buf *tidbuf,
			    struct tid_rb_node *node,
			    unsigned int idx,
			    unsigned int npages,
			    bool mapped)
{
	struct page **pages;
	struct hfi1_devdata *dd = fd->uctxt->dd;
	struct mm_struct *mm;

	if (mapped) {
		pci_unmap_single(dd->pcidev, node->dma_addr,
				 node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
		pages = &node->pages[idx];
		mm = mm_from_tid_node(node);
	} else {
		pages = &tidbuf->pages[idx];
		mm = current->mm;
	}
	hfi1_release_user_pages(mm, pages, npages, mapped);
	fd->tid_n_pinned -= npages;
}

/**
 * pin_rcv_pages() - Pin receive buffer pages.
 * @fd: per-file private data
 * @tidbuf: user buffer descriptor holding the virtual address and length
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
	int pinned;
	unsigned int npages;
	unsigned long vaddr = tidbuf->vaddr;
	struct page **pages = NULL;
	struct hfi1_devdata *dd = fd->uctxt->dd;

	/* Get the number of pages the user buffer spans */
	npages = num_user_pages(vaddr, tidbuf->length);
	if (!npages)
		return -EINVAL;

	if (npages > fd->uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
		kfree(pages);
		return -ENOMEM;
	}

	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
	if (pinned <= 0) {
		kfree(pages);
		return pinned;
	}
	tidbuf->pages = pages;
	tidbuf->npages = npages;
	fd->tid_n_pinned += pinned;
	return pinned;
}

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *             of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as much as
 *          can fit into the group. If the group becomes fully
 *          used, move it to tid_full_list.
 */
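/*
 * Worked example with hypothetical numbers: with a group size of 8 and
 * 21 page sets, two full groups (16 page sets) are taken straight from
 * tid_group_list, and the remaining 5 page sets are programmed into
 * partially used groups from tid_used_list (or into a fresh group moved
 * onto that list).
 */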
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned int ngroups, pageidx = 0, pageset_count,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;

	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		kfree(tidbuf);
		return -ENOMEM;
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		kfree(tidbuf->psets);
		kfree(tidbuf);
		return pinned;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

	/*
	 * We don't need to access this under a lock since tid_used is per
	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
	 * and hfi1_user_exp_rcv_setup() at the same time.
	 */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count)
		goto bail;

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto nomem;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_mutex);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fd, tidbuf, grp,
				       pageidx, dd->rcv_entries.group_size,
				       tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		pageidx += ret;
		mapped_pages += mapped;
	}

	while (pageidx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - pageidx,
					     grp->size - grp->used);

			ret = program_rcvarray(fd, tidbuf, grp,
					       pageidx, use, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				pageidx += ret;
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (pageidx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
nomem:
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);
	if (tididx) {
		spin_lock(&fd->tid_lock);
		fd->tid_used += tididx;
		spin_unlock(&fd->tid_lock);
		tinfo->tidcnt = tididx;
		tinfo->length = mapped_pages * PAGE_SIZE;

		if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
				 tidlist, sizeof(tidlist[0]) * tididx)) {
			/*
			 * On failure to copy to the user level, we need to undo
			 * everything done so far so we don't leak resources.
			 */
			tinfo->tidlist = (unsigned long)&tidlist;
			hfi1_user_exp_rcv_clear(fd, tinfo);
			tinfo->tidlist = 0;
			ret = -EFAULT;
			goto bail;
		}
	}

	/*
	 * If not everything was mapped (due to insufficient RcvArray entries,
	 * for example), unpin all unmapped pages so we can pin them next time.
	 */
	if (mapped_pages != pinned)
		unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
				(pinned - mapped_pages), false);
bail:
	kfree(tidbuf->psets);
	kfree(tidlist);
	kfree(tidbuf->pages);
	kfree(tidbuf);
	return ret > 0 ? 0 : ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	if (unlikely(tinfo->tidcnt > fd->tid_used))
		return -EINVAL;

	tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
			      sizeof(tidinfo[0]) * tinfo->tidcnt);
	if (IS_ERR(tidinfo))
		return PTR_ERR(tidinfo);

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_mutex);

	kfree(tidinfo);
	return ret;
}

int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
			      struct hfi1_tid_info *tinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(uctxt_offset(uctxt) + fd->subctxt);
	u32 *array;
	int ret = 0;

	/*
	 * copy_to_user() can sleep, which will leave the invalid_lock
	 * locked and cause the MMU notifier to be blocked on the lock
	 * for a long time.
	 * Copy the data to a local buffer so we can release the lock.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return -EFAULT;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;
	struct page **pages = tidbuf->pages;
	struct tid_pageset *list = tidbuf->psets;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	pfn = page_to_pfn(pages[0]);
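	/*
	 * The loop below deliberately runs one iteration past the last page
	 * (with this_pfn forced to 0) so that the final run of contiguous
	 * pages is also flushed into the pageset list.
	 */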
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the pfn's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down into
			 * sizes supported by the HW.
			 * There are two main constraints:
			 * 1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *    If the total set size is bigger than that
			 *    program only a MAX_EXPECTED_BUFFER chunk.
			 * 2. The buffer size has to be a power of two. If
			 *    it is not, round down to the closest power of
			 *    2 and program that size.
			 */
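			/*
			 * For instance (hypothetical sizes), with 4 KiB pages
			 * a 7-page contiguous run is not a power of two, so it
			 * is programmed as a 4-page, a 2-page, and a 1-page
			 * chunk on successive passes of the loop below.
			 */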
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *	  virtual address, buffer length, page pointers, pagesets (array of
 *	  struct tid_pageset holding information on physically contiguous
 *	  chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *	     programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *	     entries.
 *
 * This function will program up to 'count' number of RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = tbuf->psets[setidx].count;
		pageidx = tbuf->psets[setidx].idx;

		ret = set_rcvarray_entry(fd, tbuf,
					 rcventry, grp, pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
			EXP_TID_SET(LEN, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}

static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages)
{
	int ret;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	dma_addr_t phys;
	struct page **pages = tbuf->pages + pageidx;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
		       GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	phys = pci_map_single(dd->pcidev,
			      __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->fdata = fd;
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, sizeof(struct page *) * npages);

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&node->notifier, current->mm,
			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
		/*
		 * FIXME: This is in the wrong order, the notifier should be
		 * established before the pages are pinned by pin_rcv_pages.
		 */
		mmu_interval_read_begin(&node->notifier);
	}
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
			       node->notifier.interval_tree.start, node->phys,
			       phys);
	return 0;

out_unmap:
	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
		  node->rcventry, node->notifier.interval_tree.start,
		  node->phys, ret);
	pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
			 PCI_DMA_FROMDEVICE);
	kfree(node);
	return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct tid_rb_node *node;
	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

	if (tididx >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   tididx, uctxt->ctxt);
		return -EINVAL;
	}

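	/*
	 * The CTRL field selects one of the two RcvArray entries in a pair
	 * (1 or 2); a value of 3 would address both halves and cannot refer
	 * to a single TID node.
	 */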
	if (tidctrl == 0x3)
		return -EINVAL;

	rcventry = tididx + (tidctrl - 1);

	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (grp)
		*grp = node->grp;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);

	return 0;
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/*
	 * Make sure device has seen the write before we unpin the
	 * pages.
	 */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

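	/*
	 * If the group was full before this entry was released, it is now
	 * only partially used; if it has no used entries left at all, it
	 * goes back on the empty group list.
	 */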
	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}

/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd)
{
	struct tid_group *grp, *ptr;
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct tid_rb_node *node;

				node = fd->entry_to_rb[rcventry -
						       uctxt->expected_base];
				if (!node || node->rcventry != rcventry)
					continue;

				if (fd->use_mn)
					mmu_interval_notifier_remove(
						&node->notifier);
				cacheless_tid_rb_remove(fd, node);
			}
		}
	}
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct tid_rb_node *node =
		container_of(mni, struct tid_rb_node, notifier);
	struct hfi1_filedata *fdata = node->fdata;
	struct hfi1_ctxtdata *uctxt = fdata->uctxt;

	if (node->freed)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);
	node->freed = true;

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
		fdata->invalid_tids[fdata->invalid_tid_idx] =
			rcventry2tidinfo(node->rcventry - uctxt->expected_base);
		fdata->invalid_tids[fdata->invalid_tid_idx] |=
			EXP_TID_SET(LEN, node->npages);
		if (!fdata->invalid_tid_idx) {
			unsigned long *ev;

			/*
			 * hfi1_set_uevent_bits() sets a user event flag
			 * for all processes. Because calling into the
			 * driver to process TID cache invalidations is
			 * expensive and TID cache invalidations are
			 * handled on a per-process basis, we can
			 * optimize this to set the flag only for the
			 * process in question.
			 */
			ev = uctxt->dd->events +
				(uctxt_offset(uctxt) + fdata->subctxt);
			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
		}
		fdata->invalid_tid_idx++;
	}
	spin_unlock(&fdata->invalid_lock);
	return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
	clear_tid_node(fdata, tnode);
}