/*
 * Copyright (c) 2006, 2020 Oracle and/or its affiliates.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/errqueue.h>

#include "rds.h"

static unsigned int	rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_NONE]	= 0,
[RDS_EXTHDR_VERSION]	= sizeof(struct rds_ext_header_version),
[RDS_EXTHDR_RDMA]	= sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST]	= sizeof(struct rds_ext_header_rdma_dest),
[RDS_EXTHDR_NPATHS]	= sizeof(u16),
[RDS_EXTHDR_GEN_NUM]	= sizeof(u32),
};

void rds_message_addref(struct rds_message *rm)
{
	rdsdebug("addref rm %p ref %d\n", rm, refcount_read(&rm->m_refcount));
	refcount_inc(&rm->m_refcount);
}
EXPORT_SYMBOL_GPL(rds_message_addref);

static inline bool rds_zcookie_add(struct rds_msg_zcopy_info *info, u32 cookie)
{
	struct rds_zcopy_cookies *ck = &info->zcookies;
	int ncookies = ck->num;

	if (ncookies == RDS_MAX_ZCOOKIES)
		return false;
	ck->cookies[ncookies] = cookie;
	ck->num = ++ncookies;
	return true;
}

static struct rds_msg_zcopy_info *rds_info_from_znotifier(struct rds_znotifier *znotif)
{
	return container_of(znotif, struct rds_msg_zcopy_info, znotif);
}

void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *q)
{
	unsigned long flags;
	LIST_HEAD(copy);
	struct rds_msg_zcopy_info *info, *tmp;

	spin_lock_irqsave(&q->lock, flags);
	list_splice(&q->zcookie_head, &copy);
	INIT_LIST_HEAD(&q->zcookie_head);
	spin_unlock_irqrestore(&q->lock, flags);

	list_for_each_entry_safe(info, tmp, &copy, rs_zcookie_next) {
		list_del(&info->rs_zcookie_next);
		kfree(info);
	}
}

static void rds_rm_zerocopy_callback(struct rds_sock *rs,
				     struct rds_znotifier *znotif)
{
	struct rds_msg_zcopy_info *info;
	struct rds_msg_zcopy_queue *q;
	u32 cookie = znotif->z_cookie;
	struct rds_zcopy_cookies *ck;
	struct list_head *head;
	unsigned long flags;

	mm_unaccount_pinned_pages(&znotif->z_mmp);
	q = &rs->rs_zcookie_queue;
	spin_lock_irqsave(&q->lock, flags);
	head = &q->zcookie_head;
	if (!list_empty(head)) {
		info = list_entry(head, struct rds_msg_zcopy_info,
				  rs_zcookie_next);
		if (info && rds_zcookie_add(info, cookie)) {
			spin_unlock_irqrestore(&q->lock, flags);
			kfree(rds_info_from_znotifier(znotif));
			/* caller invokes rds_wake_sk_sleep() */
			return;
		}
	}

	info = rds_info_from_znotifier(znotif);
	ck = &info->zcookies;
	memset(ck, 0, sizeof(*ck));
	WARN_ON(!rds_zcookie_add(info, cookie));
	list_add_tail(&q->zcookie_head, &info->rs_zcookie_next);

	spin_unlock_irqrestore(&q->lock, flags);
	/* caller invokes rds_wake_sk_sleep() */
}

/*
 * This relies on dma_map_sg() not touching sg[].page during merging.
 */
static void rds_message_purge(struct rds_message *rm)
{
	unsigned long i, flags;
	bool zcopy = false;

	if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
		return;

	spin_lock_irqsave(&rm->m_rs_lock, flags);
	if (rm->m_rs) {
		struct rds_sock *rs = rm->m_rs;

		if (rm->data.op_mmp_znotifier) {
			zcopy = true;
			rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier);
			rds_wake_sk_sleep(rs);
			rm->data.op_mmp_znotifier = NULL;
		}
		sock_put(rds_rs_to_sk(rs));
		rm->m_rs = NULL;
	}
	spin_unlock_irqrestore(&rm->m_rs_lock, flags);

	for (i = 0; i < rm->data.op_nents; i++) {
		/* XXX will have to put_page for page refs */
		if (!zcopy)
			__free_page(sg_page(&rm->data.op_sg[i]));
		else
			put_page(sg_page(&rm->data.op_sg[i]));
	}
	rm->data.op_nents = 0;

	if (rm->rdma.op_active)
		rds_rdma_free_op(&rm->rdma);
	if (rm->rdma.op_rdma_mr)
		kref_put(&rm->rdma.op_rdma_mr->r_kref, __rds_put_mr_final);

	if (rm->atomic.op_active)
		rds_atomic_free_op(&rm->atomic);
	if (rm->atomic.op_rdma_mr)
		kref_put(&rm->atomic.op_rdma_mr->r_kref, __rds_put_mr_final);
}

void rds_message_put(struct rds_message *rm)
{
	rdsdebug("put rm %p ref %d\n", rm, refcount_read(&rm->m_refcount));
	WARN(!refcount_read(&rm->m_refcount), "danger refcount zero on %p\n", rm);
	if (refcount_dec_and_test(&rm->m_refcount)) {
		BUG_ON(!list_empty(&rm->m_sock_item));
		BUG_ON(!list_empty(&rm->m_conn_item));
		rds_message_purge(rm);

		kfree(rm);
	}
}
EXPORT_SYMBOL_GPL(rds_message_put);

void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
				 __be16 dport, u64 seq)
{
	hdr->h_flags = 0;
	hdr->h_sport = sport;
	hdr->h_dport = dport;
	hdr->h_sequence = cpu_to_be64(seq);
	hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
}
EXPORT_SYMBOL_GPL(rds_message_populate_header);

int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
			      const void *data, unsigned int len)
{
	unsigned int ext_len = sizeof(u8) + len;
	unsigned char *dst;

	/* For now, refuse to add more than one extension header */
	if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
		return 0;

	if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
		return 0;

	if (ext_len >= RDS_HEADER_EXT_SPACE)
		return 0;
	dst = hdr->h_exthdr;

	*dst++ = type;
	memcpy(dst, data, len);

	dst[len] = RDS_EXTHDR_NONE;
	return 1;
}
EXPORT_SYMBOL_GPL(rds_message_add_extension);

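/*
 * Illustrative sketch of a caller (hypothetical, not taken from this file):
 * a transport advertising its path count while composing a message header
 * might do something like the following, where RDS_MPATH_WORKERS stands in
 * for whatever path count the caller actually uses.
 *
 *	u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
 *
 *	if (!rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_NPATHS,
 *				       &npaths, sizeof(npaths)))
 *		pr_debug("extension already present or would not fit\n");
 *
 * The return value is 1 on success, 0 if the header already carries an
 * extension, the type is unknown, or the data would not fit.
 */
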
/*
 * If a message has extension headers, retrieve them here.
 * Call like this:
 *
 * unsigned int pos = 0;
 *
 * while (1) {
 *	buflen = sizeof(buffer);
 *	type = rds_message_next_extension(hdr, &pos, buffer, &buflen);
 *	if (type == RDS_EXTHDR_NONE)
 *		break;
 *	...
 * }
 */
int rds_message_next_extension(struct rds_header *hdr,
			       unsigned int *pos, void *buf, unsigned int *buflen)
{
	unsigned int offset, ext_type, ext_len;
	u8 *src = hdr->h_exthdr;

	offset = *pos;
	if (offset >= RDS_HEADER_EXT_SPACE)
		goto none;

	/* Get the extension type and length. For now, the
	 * length is implied by the extension type. */
	ext_type = src[offset++];

	if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX)
		goto none;
	ext_len = rds_exthdr_size[ext_type];
	if (offset + ext_len > RDS_HEADER_EXT_SPACE)
		goto none;

	*pos = offset + ext_len;
	if (ext_len < *buflen)
		*buflen = ext_len;
	memcpy(buf, src + offset, *buflen);
	return ext_type;

none:
	*pos = RDS_HEADER_EXT_SPACE;
	*buflen = 0;
	return RDS_EXTHDR_NONE;
}

int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
{
	struct rds_ext_header_rdma_dest ext_hdr;

	ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
	ext_hdr.h_rdma_offset = cpu_to_be32(offset);
	return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
}
EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);

/*
 * Each rds_message is allocated with extra space for the scatterlist entries
 * rds ops will need. This minimizes the number of memory allocations. Each
 * rds op can then grab SGs when initializing its part of the rds_message;
 * see the example sketch after rds_message_alloc_sgs() below.
 */
struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
{
	struct rds_message *rm;

	if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message))
		return NULL;

	rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
	if (!rm)
		goto out;

	rm->m_used_sgs = 0;
	rm->m_total_sgs = extra_len / sizeof(struct scatterlist);

	refcount_set(&rm->m_refcount, 1);
	INIT_LIST_HEAD(&rm->m_sock_item);
	INIT_LIST_HEAD(&rm->m_conn_item);
	spin_lock_init(&rm->m_rs_lock);
	init_waitqueue_head(&rm->m_flush_wait);

out:
	return rm;
}

/*
 * RDS ops use this to grab SG entries from the rm's sg pool.
 */
struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
{
	struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
	struct scatterlist *sg_ret;

	if (nents <= 0) {
		pr_warn("rds: alloc sgs failed! nents <= 0\n");
		return ERR_PTR(-EINVAL);
	}

	if (rm->m_used_sgs + nents > rm->m_total_sgs) {
		pr_warn("rds: alloc sgs failed! total %d used %d nents %d\n",
			rm->m_total_sgs, rm->m_used_sgs, nents);
		return ERR_PTR(-ENOMEM);
	}

	sg_ret = &sg_first[rm->m_used_sgs];
	sg_init_table(sg_ret, nents);
	rm->m_used_sgs += nents;

	return sg_ret;
}

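/*
 * Example of the allocate-then-grab pattern described above
 * rds_message_alloc() (a minimal sketch under an assumed caller context;
 * rds_message_map_pages() below does essentially this):
 *
 *	int num_sgs = DIV_ROUND_UP(total_len, PAGE_SIZE);
 *	struct rds_message *rm;
 *
 *	rm = rds_message_alloc(num_sgs * sizeof(struct scatterlist), gfp);
 *	if (!rm)
 *		return ERR_PTR(-ENOMEM);
 *	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
 *	if (IS_ERR(rm->data.op_sg)) {
 *		void *err = ERR_CAST(rm->data.op_sg);
 *
 *		rds_message_put(rm);
 *		return err;
 *	}
 */
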
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
{
	struct rds_message *rm;
	unsigned int i;
	int num_sgs = DIV_ROUND_UP(total_len, PAGE_SIZE);
	int extra_bytes = num_sgs * sizeof(struct scatterlist);

	rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
	if (!rm)
		return ERR_PTR(-ENOMEM);

	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
	rm->data.op_nents = DIV_ROUND_UP(total_len, PAGE_SIZE);
	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
	if (IS_ERR(rm->data.op_sg)) {
		void *err = ERR_CAST(rm->data.op_sg);

		rds_message_put(rm);
		return err;
	}

	for (i = 0; i < rm->data.op_nents; ++i) {
		sg_set_page(&rm->data.op_sg[i],
			    virt_to_page(page_addrs[i]),
			    PAGE_SIZE, 0);
	}

	return rm;
}

static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from)
{
	struct scatterlist *sg;
	int ret = 0;
	int length = iov_iter_count(from);
	int total_copied = 0;
	struct rds_msg_zcopy_info *info;

	rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));

	/*
	 * now allocate and copy in the data payload.
	 */
	sg = rm->data.op_sg;

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;
	INIT_LIST_HEAD(&info->rs_zcookie_next);
	rm->data.op_mmp_znotifier = &info->znotif;
	if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
				    length)) {
		ret = -ENOMEM;
		goto err;
	}
	while (iov_iter_count(from)) {
		struct page *pages;
		size_t start;
		ssize_t copied;

		copied = iov_iter_get_pages(from, &pages, PAGE_SIZE,
					    1, &start);
		if (copied < 0) {
			struct mmpin *mmp;
			int i;

			for (i = 0; i < rm->data.op_nents; i++)
				put_page(sg_page(&rm->data.op_sg[i]));
			mmp = &rm->data.op_mmp_znotifier->z_mmp;
			mm_unaccount_pinned_pages(mmp);
			ret = -EFAULT;
			goto err;
		}
		total_copied += copied;
		iov_iter_advance(from, copied);
		length -= copied;
		sg_set_page(sg, pages, copied, start);
		rm->data.op_nents++;
		sg++;
	}
	WARN_ON_ONCE(length != 0);
	return ret;
err:
	kfree(info);
	rm->data.op_mmp_znotifier = NULL;
	return ret;
}

int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
			       bool zcopy)
{
	unsigned long to_copy, nbytes;
	unsigned long sg_off;
	struct scatterlist *sg;
	int ret = 0;

	rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));

	/* now allocate and copy in the data payload. */
	sg = rm->data.op_sg;
	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */

	if (zcopy)
		return rds_message_zcopy_from_user(rm, from);

	while (iov_iter_count(from)) {
		if (!sg_page(sg)) {
			ret = rds_page_remainder_alloc(sg, iov_iter_count(from),
						       GFP_HIGHUSER);
			if (ret)
				return ret;
			rm->data.op_nents++;
			sg_off = 0;
		}

		to_copy = min_t(unsigned long, iov_iter_count(from),
				sg->length - sg_off);

		rds_stats_add(s_copy_from_user, to_copy);
		nbytes = copy_page_from_iter(sg_page(sg), sg->offset + sg_off,
					     to_copy, from);
		if (nbytes != to_copy)
			return -EFAULT;

		sg_off += to_copy;

		if (sg_off == sg->length)
			sg++;
	}

	return ret;
}

int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
{
	struct rds_message *rm;
	struct scatterlist *sg;
	unsigned long to_copy;
	unsigned long vec_off;
	int copied;
	int ret;
	u32 len;

	rm = container_of(inc, struct rds_message, m_inc);
	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);

	sg = rm->data.op_sg;
	vec_off = 0;
	copied = 0;

	while (iov_iter_count(to) && copied < len) {
		to_copy = min_t(unsigned long, iov_iter_count(to),
				sg->length - vec_off);
		to_copy = min_t(unsigned long, to_copy, len - copied);

		rds_stats_add(s_copy_to_user, to_copy);
		ret = copy_page_to_iter(sg_page(sg), sg->offset + vec_off,
					to_copy, to);
		if (ret != to_copy)
			return -EFAULT;

		vec_off += to_copy;
		copied += to_copy;

		if (vec_off == sg->length) {
			vec_off = 0;
			sg++;
		}
	}

	return copied;
}

/*
 * If the message is still on the send queue, wait until the transport
 * is done with it. This is particularly important for RDMA operations.
 */
void rds_message_wait(struct rds_message *rm)
{
	wait_event_interruptible(rm->m_flush_wait,
				 !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
}

void rds_message_unmapped(struct rds_message *rm)
{
	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
	wake_up_interruptible(&rm->m_flush_wait);
}
EXPORT_SYMBOL_GPL(rds_message_unmapped);