1*4882a593Smuzhiyun /*
2*4882a593Smuzhiyun * Copyright (c) 2006 Oracle. All rights reserved.
3*4882a593Smuzhiyun *
4*4882a593Smuzhiyun * This software is available to you under a choice of one of two
5*4882a593Smuzhiyun * licenses. You may choose to be licensed under the terms of the GNU
6*4882a593Smuzhiyun * General Public License (GPL) Version 2, available from the file
7*4882a593Smuzhiyun * COPYING in the main directory of this source tree, or the
8*4882a593Smuzhiyun * OpenIB.org BSD license below:
9*4882a593Smuzhiyun *
10*4882a593Smuzhiyun * Redistribution and use in source and binary forms, with or
11*4882a593Smuzhiyun * without modification, are permitted provided that the following
12*4882a593Smuzhiyun * conditions are met:
13*4882a593Smuzhiyun *
14*4882a593Smuzhiyun * - Redistributions of source code must retain the above
15*4882a593Smuzhiyun * copyright notice, this list of conditions and the following
16*4882a593Smuzhiyun * disclaimer.
17*4882a593Smuzhiyun *
18*4882a593Smuzhiyun * - Redistributions in binary form must reproduce the above
19*4882a593Smuzhiyun * copyright notice, this list of conditions and the following
20*4882a593Smuzhiyun * disclaimer in the documentation and/or other materials
21*4882a593Smuzhiyun * provided with the distribution.
22*4882a593Smuzhiyun *
23*4882a593Smuzhiyun * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24*4882a593Smuzhiyun * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25*4882a593Smuzhiyun * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26*4882a593Smuzhiyun * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27*4882a593Smuzhiyun * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28*4882a593Smuzhiyun * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29*4882a593Smuzhiyun * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30*4882a593Smuzhiyun * SOFTWARE.
31*4882a593Smuzhiyun *
32*4882a593Smuzhiyun */
33*4882a593Smuzhiyun #include <linux/percpu.h>
34*4882a593Smuzhiyun #include <linux/seq_file.h>
35*4882a593Smuzhiyun #include <linux/slab.h>
36*4882a593Smuzhiyun #include <linux/proc_fs.h>
37*4882a593Smuzhiyun #include <linux/export.h>
38*4882a593Smuzhiyun
39*4882a593Smuzhiyun #include "rds.h"
40*4882a593Smuzhiyun
41*4882a593Smuzhiyun /*
42*4882a593Smuzhiyun * This file implements a getsockopt() call which copies a set of fixed
43*4882a593Smuzhiyun * sized structs into a user-specified buffer as a means of providing
44*4882a593Smuzhiyun * read-only information about RDS.
45*4882a593Smuzhiyun *
46*4882a593Smuzhiyun * For a given information source there are a given number of fixed sized
47*4882a593Smuzhiyun * structs at a given time. The structs are only copied if the user-specified
48*4882a593Smuzhiyun * buffer is big enough. The destination pages that make up the buffer
49*4882a593Smuzhiyun * are pinned for the duration of the copy.
50*4882a593Smuzhiyun *
51*4882a593Smuzhiyun * This gives us the following benefits:
52*4882a593Smuzhiyun *
53*4882a593Smuzhiyun * - simple implementation, no copy "position" across multiple calls
54*4882a593Smuzhiyun * - consistent snapshot of an info source
55*4882a593Smuzhiyun * - atomic copy works well with whatever locking info source has
56*4882a593Smuzhiyun * - one portable tool to get rds info across implementations
57*4882a593Smuzhiyun * - long-lived tool can get info without allocating
58*4882a593Smuzhiyun *
59*4882a593Smuzhiyun * at the following costs:
60*4882a593Smuzhiyun *
61*4882a593Smuzhiyun * - info source copy must be pinned, may be "large"
62*4882a593Smuzhiyun */
63*4882a593Smuzhiyun
/*
 * Write cursor into the destination pages that rds_info_copy() fills.
 */
struct rds_info_iterator {
	/* Page currently being written; advanced by one each time a page fills. */
	struct page **pages;
	/* kmap_atomic() mapping of *pages, or NULL while nothing is mapped. */
	void *addr;
	/* Byte offset within the current page where the next copy lands. */
	unsigned long offset;
};
69*4882a593Smuzhiyun
70*4882a593Smuzhiyun static DEFINE_SPINLOCK(rds_info_lock);
71*4882a593Smuzhiyun static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];
72*4882a593Smuzhiyun
/*
 * Install @func as the handler for info option @optname.
 *
 * @optname must lie within [RDS_INFO_FIRST, RDS_INFO_LAST] and its slot
 * must still be empty; either violation is treated as a kernel bug.
 */
void rds_info_register_func(int optname, rds_info_func func)
{
	rds_info_func *slot;

	BUG_ON(!(optname >= RDS_INFO_FIRST && optname <= RDS_INFO_LAST));
	slot = &rds_info_funcs[optname - RDS_INFO_FIRST];

	spin_lock(&rds_info_lock);
	/* Registering over an existing handler is never legitimate. */
	BUG_ON(*slot != NULL);
	*slot = func;
	spin_unlock(&rds_info_lock);
}
EXPORT_SYMBOL_GPL(rds_info_register_func);
85*4882a593Smuzhiyun
/*
 * Remove the handler for info option @optname.
 *
 * @optname must lie within [RDS_INFO_FIRST, RDS_INFO_LAST] and @func must
 * be the handler currently installed in that slot; either violation is
 * treated as a kernel bug.
 */
void rds_info_deregister_func(int optname, rds_info_func func)
{
	rds_info_func *slot;

	BUG_ON(!(optname >= RDS_INFO_FIRST && optname <= RDS_INFO_LAST));
	slot = &rds_info_funcs[optname - RDS_INFO_FIRST];

	spin_lock(&rds_info_lock);
	/* Only the owner of the slot may clear it. */
	BUG_ON(*slot != func);
	*slot = NULL;
	spin_unlock(&rds_info_lock);
}
EXPORT_SYMBOL_GPL(rds_info_deregister_func);
98*4882a593Smuzhiyun
99*4882a593Smuzhiyun /*
100*4882a593Smuzhiyun * Typically we hold an atomic kmap across multiple rds_info_copy() calls
101*4882a593Smuzhiyun * because the kmap is so expensive. This must be called before using blocking
102*4882a593Smuzhiyun * operations while holding the mapping and as the iterator is torn down.
103*4882a593Smuzhiyun */
rds_info_iter_unmap(struct rds_info_iterator * iter)104*4882a593Smuzhiyun void rds_info_iter_unmap(struct rds_info_iterator *iter)
105*4882a593Smuzhiyun {
106*4882a593Smuzhiyun if (iter->addr) {
107*4882a593Smuzhiyun kunmap_atomic(iter->addr);
108*4882a593Smuzhiyun iter->addr = NULL;
109*4882a593Smuzhiyun }
110*4882a593Smuzhiyun }
111*4882a593Smuzhiyun
112*4882a593Smuzhiyun /*
113*4882a593Smuzhiyun * get_user_pages() called flush_dcache_page() on the pages for us.
114*4882a593Smuzhiyun */
rds_info_copy(struct rds_info_iterator * iter,void * data,unsigned long bytes)115*4882a593Smuzhiyun void rds_info_copy(struct rds_info_iterator *iter, void *data,
116*4882a593Smuzhiyun unsigned long bytes)
117*4882a593Smuzhiyun {
118*4882a593Smuzhiyun unsigned long this;
119*4882a593Smuzhiyun
120*4882a593Smuzhiyun while (bytes) {
121*4882a593Smuzhiyun if (!iter->addr)
122*4882a593Smuzhiyun iter->addr = kmap_atomic(*iter->pages);
123*4882a593Smuzhiyun
124*4882a593Smuzhiyun this = min(bytes, PAGE_SIZE - iter->offset);
125*4882a593Smuzhiyun
126*4882a593Smuzhiyun rdsdebug("page %p addr %p offset %lu this %lu data %p "
127*4882a593Smuzhiyun "bytes %lu\n", *iter->pages, iter->addr,
128*4882a593Smuzhiyun iter->offset, this, data, bytes);
129*4882a593Smuzhiyun
130*4882a593Smuzhiyun memcpy(iter->addr + iter->offset, data, this);
131*4882a593Smuzhiyun
132*4882a593Smuzhiyun data += this;
133*4882a593Smuzhiyun bytes -= this;
134*4882a593Smuzhiyun iter->offset += this;
135*4882a593Smuzhiyun
136*4882a593Smuzhiyun if (iter->offset == PAGE_SIZE) {
137*4882a593Smuzhiyun kunmap_atomic(iter->addr);
138*4882a593Smuzhiyun iter->addr = NULL;
139*4882a593Smuzhiyun iter->offset = 0;
140*4882a593Smuzhiyun iter->pages++;
141*4882a593Smuzhiyun }
142*4882a593Smuzhiyun }
143*4882a593Smuzhiyun }
144*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(rds_info_copy);
145*4882a593Smuzhiyun
146*4882a593Smuzhiyun /*
147*4882a593Smuzhiyun * @optval points to the userspace buffer that the information snapshot
148*4882a593Smuzhiyun * will be copied into.
149*4882a593Smuzhiyun *
150*4882a593Smuzhiyun * @optlen on input is the size of the buffer in userspace. @optlen
151*4882a593Smuzhiyun * on output is the size of the requested snapshot in bytes.
152*4882a593Smuzhiyun *
153*4882a593Smuzhiyun * This function returns -errno if there is a failure, particularly -ENOSPC
154*4882a593Smuzhiyun * if the given userspace buffer was not large enough to fit the snapshot.
155*4882a593Smuzhiyun * On success it returns the positive number of bytes of each array element
156*4882a593Smuzhiyun * in the snapshot.
157*4882a593Smuzhiyun */
rds_info_getsockopt(struct socket * sock,int optname,char __user * optval,int __user * optlen)158*4882a593Smuzhiyun int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
159*4882a593Smuzhiyun int __user *optlen)
160*4882a593Smuzhiyun {
161*4882a593Smuzhiyun struct rds_info_iterator iter;
162*4882a593Smuzhiyun struct rds_info_lengths lens;
163*4882a593Smuzhiyun unsigned long nr_pages = 0;
164*4882a593Smuzhiyun unsigned long start;
165*4882a593Smuzhiyun rds_info_func func;
166*4882a593Smuzhiyun struct page **pages = NULL;
167*4882a593Smuzhiyun int ret;
168*4882a593Smuzhiyun int len;
169*4882a593Smuzhiyun int total;
170*4882a593Smuzhiyun
171*4882a593Smuzhiyun if (get_user(len, optlen)) {
172*4882a593Smuzhiyun ret = -EFAULT;
173*4882a593Smuzhiyun goto out;
174*4882a593Smuzhiyun }
175*4882a593Smuzhiyun
176*4882a593Smuzhiyun /* check for all kinds of wrapping and the like */
177*4882a593Smuzhiyun start = (unsigned long)optval;
178*4882a593Smuzhiyun if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) {
179*4882a593Smuzhiyun ret = -EINVAL;
180*4882a593Smuzhiyun goto out;
181*4882a593Smuzhiyun }
182*4882a593Smuzhiyun
183*4882a593Smuzhiyun /* a 0 len call is just trying to probe its length */
184*4882a593Smuzhiyun if (len == 0)
185*4882a593Smuzhiyun goto call_func;
186*4882a593Smuzhiyun
187*4882a593Smuzhiyun nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
188*4882a593Smuzhiyun >> PAGE_SHIFT;
189*4882a593Smuzhiyun
190*4882a593Smuzhiyun pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
191*4882a593Smuzhiyun if (!pages) {
192*4882a593Smuzhiyun ret = -ENOMEM;
193*4882a593Smuzhiyun goto out;
194*4882a593Smuzhiyun }
195*4882a593Smuzhiyun ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
196*4882a593Smuzhiyun if (ret != nr_pages) {
197*4882a593Smuzhiyun if (ret > 0)
198*4882a593Smuzhiyun nr_pages = ret;
199*4882a593Smuzhiyun else
200*4882a593Smuzhiyun nr_pages = 0;
201*4882a593Smuzhiyun ret = -EAGAIN; /* XXX ? */
202*4882a593Smuzhiyun goto out;
203*4882a593Smuzhiyun }
204*4882a593Smuzhiyun
205*4882a593Smuzhiyun rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
206*4882a593Smuzhiyun
207*4882a593Smuzhiyun call_func:
208*4882a593Smuzhiyun func = rds_info_funcs[optname - RDS_INFO_FIRST];
209*4882a593Smuzhiyun if (!func) {
210*4882a593Smuzhiyun ret = -ENOPROTOOPT;
211*4882a593Smuzhiyun goto out;
212*4882a593Smuzhiyun }
213*4882a593Smuzhiyun
214*4882a593Smuzhiyun iter.pages = pages;
215*4882a593Smuzhiyun iter.addr = NULL;
216*4882a593Smuzhiyun iter.offset = start & (PAGE_SIZE - 1);
217*4882a593Smuzhiyun
218*4882a593Smuzhiyun func(sock, len, &iter, &lens);
219*4882a593Smuzhiyun BUG_ON(lens.each == 0);
220*4882a593Smuzhiyun
221*4882a593Smuzhiyun total = lens.nr * lens.each;
222*4882a593Smuzhiyun
223*4882a593Smuzhiyun rds_info_iter_unmap(&iter);
224*4882a593Smuzhiyun
225*4882a593Smuzhiyun if (total > len) {
226*4882a593Smuzhiyun len = total;
227*4882a593Smuzhiyun ret = -ENOSPC;
228*4882a593Smuzhiyun } else {
229*4882a593Smuzhiyun len = total;
230*4882a593Smuzhiyun ret = lens.each;
231*4882a593Smuzhiyun }
232*4882a593Smuzhiyun
233*4882a593Smuzhiyun if (put_user(len, optlen))
234*4882a593Smuzhiyun ret = -EFAULT;
235*4882a593Smuzhiyun
236*4882a593Smuzhiyun out:
237*4882a593Smuzhiyun if (pages)
238*4882a593Smuzhiyun unpin_user_pages(pages, nr_pages);
239*4882a593Smuzhiyun kfree(pages);
240*4882a593Smuzhiyun
241*4882a593Smuzhiyun return ret;
242*4882a593Smuzhiyun }
243