xref: /OK3568_Linux_fs/kernel/net/ceph/messenger.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun #include <linux/ceph/ceph_debug.h>
3*4882a593Smuzhiyun 
4*4882a593Smuzhiyun #include <linux/crc32c.h>
5*4882a593Smuzhiyun #include <linux/ctype.h>
6*4882a593Smuzhiyun #include <linux/highmem.h>
7*4882a593Smuzhiyun #include <linux/inet.h>
8*4882a593Smuzhiyun #include <linux/kthread.h>
9*4882a593Smuzhiyun #include <linux/net.h>
10*4882a593Smuzhiyun #include <linux/nsproxy.h>
11*4882a593Smuzhiyun #include <linux/sched/mm.h>
12*4882a593Smuzhiyun #include <linux/slab.h>
13*4882a593Smuzhiyun #include <linux/socket.h>
14*4882a593Smuzhiyun #include <linux/string.h>
15*4882a593Smuzhiyun #ifdef	CONFIG_BLOCK
16*4882a593Smuzhiyun #include <linux/bio.h>
17*4882a593Smuzhiyun #endif	/* CONFIG_BLOCK */
18*4882a593Smuzhiyun #include <linux/dns_resolver.h>
19*4882a593Smuzhiyun #include <net/tcp.h>
20*4882a593Smuzhiyun 
21*4882a593Smuzhiyun #include <linux/ceph/ceph_features.h>
22*4882a593Smuzhiyun #include <linux/ceph/libceph.h>
23*4882a593Smuzhiyun #include <linux/ceph/messenger.h>
24*4882a593Smuzhiyun #include <linux/ceph/decode.h>
25*4882a593Smuzhiyun #include <linux/ceph/pagelist.h>
26*4882a593Smuzhiyun #include <linux/export.h>
27*4882a593Smuzhiyun 
28*4882a593Smuzhiyun /*
29*4882a593Smuzhiyun  * Ceph uses the messenger to exchange ceph_msg messages with other
30*4882a593Smuzhiyun  * hosts in the system.  The messenger provides ordered and reliable
31*4882a593Smuzhiyun  * delivery.  We tolerate TCP disconnects by reconnecting (with
32*4882a593Smuzhiyun  * exponential backoff) in the case of a fault (disconnection, bad
33*4882a593Smuzhiyun  * crc, protocol error).  Acks allow sent messages to be discarded by
34*4882a593Smuzhiyun  * the sender.
35*4882a593Smuzhiyun  */
36*4882a593Smuzhiyun 
37*4882a593Smuzhiyun /*
38*4882a593Smuzhiyun  * We track the state of the socket on a given connection using
39*4882a593Smuzhiyun  * values defined below.  The transition to a new socket state is
40*4882a593Smuzhiyun  * handled by a function which verifies we aren't coming from an
41*4882a593Smuzhiyun  * unexpected state.
42*4882a593Smuzhiyun  *
43*4882a593Smuzhiyun  *      --------
44*4882a593Smuzhiyun  *      | NEW* |  transient initial state
45*4882a593Smuzhiyun  *      --------
46*4882a593Smuzhiyun  *          | con_sock_state_init()
47*4882a593Smuzhiyun  *          v
48*4882a593Smuzhiyun  *      ----------
49*4882a593Smuzhiyun  *      | CLOSED |  initialized, but no socket (and no
50*4882a593Smuzhiyun  *      ----------  TCP connection)
51*4882a593Smuzhiyun  *       ^      \
52*4882a593Smuzhiyun  *       |       \ con_sock_state_connecting()
53*4882a593Smuzhiyun  *       |        ----------------------
54*4882a593Smuzhiyun  *       |                              \
55*4882a593Smuzhiyun  *       + con_sock_state_closed()       \
56*4882a593Smuzhiyun  *       |+---------------------------    \
57*4882a593Smuzhiyun  *       | \                          \    \
58*4882a593Smuzhiyun  *       |  -----------                \    \
59*4882a593Smuzhiyun  *       |  | CLOSING |  socket event;  \    \
60*4882a593Smuzhiyun  *       |  -----------  await close     \    \
61*4882a593Smuzhiyun  *       |       ^                        \   |
62*4882a593Smuzhiyun  *       |       |                         \  |
63*4882a593Smuzhiyun  *       |       + con_sock_state_closing() \ |
64*4882a593Smuzhiyun  *       |      / \                         | |
65*4882a593Smuzhiyun  *       |     /   ---------------          | |
66*4882a593Smuzhiyun  *       |    /                   \         v v
67*4882a593Smuzhiyun  *       |   /                    --------------
68*4882a593Smuzhiyun  *       |  /    -----------------| CONNECTING |  socket created, TCP
69*4882a593Smuzhiyun  *       |  |   /                 --------------  connect initiated
70*4882a593Smuzhiyun  *       |  |   | con_sock_state_connected()
71*4882a593Smuzhiyun  *       |  |   v
72*4882a593Smuzhiyun  *      -------------
73*4882a593Smuzhiyun  *      | CONNECTED |  TCP connection established
74*4882a593Smuzhiyun  *      -------------
75*4882a593Smuzhiyun  *
76*4882a593Smuzhiyun  * State values for ceph_connection->sock_state; NEW is assumed to be 0.
77*4882a593Smuzhiyun  */
78*4882a593Smuzhiyun 
79*4882a593Smuzhiyun #define CON_SOCK_STATE_NEW		0	/* -> CLOSED */
80*4882a593Smuzhiyun #define CON_SOCK_STATE_CLOSED		1	/* -> CONNECTING */
81*4882a593Smuzhiyun #define CON_SOCK_STATE_CONNECTING	2	/* -> CONNECTED or -> CLOSING */
82*4882a593Smuzhiyun #define CON_SOCK_STATE_CONNECTED	3	/* -> CLOSING or -> CLOSED */
83*4882a593Smuzhiyun #define CON_SOCK_STATE_CLOSING		4	/* -> CLOSED */
84*4882a593Smuzhiyun 
85*4882a593Smuzhiyun /*
86*4882a593Smuzhiyun  * connection states
87*4882a593Smuzhiyun  */
88*4882a593Smuzhiyun #define CON_STATE_CLOSED        1  /* -> PREOPEN */
89*4882a593Smuzhiyun #define CON_STATE_PREOPEN       2  /* -> CONNECTING, CLOSED */
90*4882a593Smuzhiyun #define CON_STATE_CONNECTING    3  /* -> NEGOTIATING, CLOSED */
91*4882a593Smuzhiyun #define CON_STATE_NEGOTIATING   4  /* -> OPEN, CLOSED */
92*4882a593Smuzhiyun #define CON_STATE_OPEN          5  /* -> STANDBY, CLOSED */
93*4882a593Smuzhiyun #define CON_STATE_STANDBY       6  /* -> PREOPEN, CLOSED */
94*4882a593Smuzhiyun 
95*4882a593Smuzhiyun /*
96*4882a593Smuzhiyun  * ceph_connection flag bits
97*4882a593Smuzhiyun  */
98*4882a593Smuzhiyun #define CON_FLAG_LOSSYTX           0  /* we can close channel or drop
99*4882a593Smuzhiyun 				       * messages on errors */
100*4882a593Smuzhiyun #define CON_FLAG_KEEPALIVE_PENDING 1  /* we need to send a keepalive */
101*4882a593Smuzhiyun #define CON_FLAG_WRITE_PENDING	   2  /* we have data ready to send */
102*4882a593Smuzhiyun #define CON_FLAG_SOCK_CLOSED	   3  /* socket state changed to closed */
103*4882a593Smuzhiyun #define CON_FLAG_BACKOFF           4  /* need to retry queuing delayed work */
104*4882a593Smuzhiyun 
con_flag_valid(unsigned long con_flag)105*4882a593Smuzhiyun static bool con_flag_valid(unsigned long con_flag)
106*4882a593Smuzhiyun {
107*4882a593Smuzhiyun 	switch (con_flag) {
108*4882a593Smuzhiyun 	case CON_FLAG_LOSSYTX:
109*4882a593Smuzhiyun 	case CON_FLAG_KEEPALIVE_PENDING:
110*4882a593Smuzhiyun 	case CON_FLAG_WRITE_PENDING:
111*4882a593Smuzhiyun 	case CON_FLAG_SOCK_CLOSED:
112*4882a593Smuzhiyun 	case CON_FLAG_BACKOFF:
113*4882a593Smuzhiyun 		return true;
114*4882a593Smuzhiyun 	default:
115*4882a593Smuzhiyun 		return false;
116*4882a593Smuzhiyun 	}
117*4882a593Smuzhiyun }
118*4882a593Smuzhiyun 
/* Atomically clear @con_flag in con->flags; @con_flag must be valid. */
static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
{
	BUG_ON(!con_flag_valid(con_flag));

	clear_bit(con_flag, &con->flags);
}
125*4882a593Smuzhiyun 
/* Atomically set @con_flag in con->flags; @con_flag must be valid. */
static void con_flag_set(struct ceph_connection *con, unsigned long con_flag)
{
	BUG_ON(!con_flag_valid(con_flag));

	set_bit(con_flag, &con->flags);
}
132*4882a593Smuzhiyun 
/* Return whether @con_flag is currently set in con->flags. */
static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag)
{
	BUG_ON(!con_flag_valid(con_flag));

	return test_bit(con_flag, &con->flags);
}
139*4882a593Smuzhiyun 
/*
 * Atomically clear @con_flag in con->flags and return its previous
 * value.
 */
static bool con_flag_test_and_clear(struct ceph_connection *con,
					unsigned long con_flag)
{
	BUG_ON(!con_flag_valid(con_flag));

	return test_and_clear_bit(con_flag, &con->flags);
}
147*4882a593Smuzhiyun 
/*
 * Atomically set @con_flag in con->flags and return its previous
 * value.
 */
static bool con_flag_test_and_set(struct ceph_connection *con,
					unsigned long con_flag)
{
	BUG_ON(!con_flag_valid(con_flag));

	return test_and_set_bit(con_flag, &con->flags);
}
155*4882a593Smuzhiyun 
156*4882a593Smuzhiyun /* Slab caches for frequently-allocated structures */
157*4882a593Smuzhiyun 
158*4882a593Smuzhiyun static struct kmem_cache	*ceph_msg_cache;
159*4882a593Smuzhiyun 
160*4882a593Smuzhiyun /* static tag bytes (protocol control messages) */
161*4882a593Smuzhiyun static char tag_msg = CEPH_MSGR_TAG_MSG;
162*4882a593Smuzhiyun static char tag_ack = CEPH_MSGR_TAG_ACK;
163*4882a593Smuzhiyun static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
164*4882a593Smuzhiyun static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
165*4882a593Smuzhiyun 
166*4882a593Smuzhiyun #ifdef CONFIG_LOCKDEP
167*4882a593Smuzhiyun static struct lock_class_key socket_class;
168*4882a593Smuzhiyun #endif
169*4882a593Smuzhiyun 
170*4882a593Smuzhiyun static void queue_con(struct ceph_connection *con);
171*4882a593Smuzhiyun static void cancel_con(struct ceph_connection *con);
172*4882a593Smuzhiyun static void ceph_con_workfn(struct work_struct *);
173*4882a593Smuzhiyun static void con_fault(struct ceph_connection *con);
174*4882a593Smuzhiyun 
175*4882a593Smuzhiyun /*
176*4882a593Smuzhiyun  * Nicely render a sockaddr as a string.  An array of formatted
177*4882a593Smuzhiyun  * strings is used, to approximate reentrancy.
178*4882a593Smuzhiyun  */
179*4882a593Smuzhiyun #define ADDR_STR_COUNT_LOG	5	/* log2(# address strings in array) */
180*4882a593Smuzhiyun #define ADDR_STR_COUNT		(1 << ADDR_STR_COUNT_LOG)
181*4882a593Smuzhiyun #define ADDR_STR_COUNT_MASK	(ADDR_STR_COUNT - 1)
182*4882a593Smuzhiyun #define MAX_ADDR_STR_LEN	64	/* 54 is enough */
183*4882a593Smuzhiyun 
184*4882a593Smuzhiyun static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
185*4882a593Smuzhiyun static atomic_t addr_str_seq = ATOMIC_INIT(0);
186*4882a593Smuzhiyun 
187*4882a593Smuzhiyun static struct page *zero_page;		/* used in certain error cases */
188*4882a593Smuzhiyun 
ceph_pr_addr(const struct ceph_entity_addr * addr)189*4882a593Smuzhiyun const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
190*4882a593Smuzhiyun {
191*4882a593Smuzhiyun 	int i;
192*4882a593Smuzhiyun 	char *s;
193*4882a593Smuzhiyun 	struct sockaddr_storage ss = addr->in_addr; /* align */
194*4882a593Smuzhiyun 	struct sockaddr_in *in4 = (struct sockaddr_in *)&ss;
195*4882a593Smuzhiyun 	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss;
196*4882a593Smuzhiyun 
197*4882a593Smuzhiyun 	i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
198*4882a593Smuzhiyun 	s = addr_str[i];
199*4882a593Smuzhiyun 
200*4882a593Smuzhiyun 	switch (ss.ss_family) {
201*4882a593Smuzhiyun 	case AF_INET:
202*4882a593Smuzhiyun 		snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu",
203*4882a593Smuzhiyun 			 le32_to_cpu(addr->type), &in4->sin_addr,
204*4882a593Smuzhiyun 			 ntohs(in4->sin_port));
205*4882a593Smuzhiyun 		break;
206*4882a593Smuzhiyun 
207*4882a593Smuzhiyun 	case AF_INET6:
208*4882a593Smuzhiyun 		snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu",
209*4882a593Smuzhiyun 			 le32_to_cpu(addr->type), &in6->sin6_addr,
210*4882a593Smuzhiyun 			 ntohs(in6->sin6_port));
211*4882a593Smuzhiyun 		break;
212*4882a593Smuzhiyun 
213*4882a593Smuzhiyun 	default:
214*4882a593Smuzhiyun 		snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)",
215*4882a593Smuzhiyun 			 ss.ss_family);
216*4882a593Smuzhiyun 	}
217*4882a593Smuzhiyun 
218*4882a593Smuzhiyun 	return s;
219*4882a593Smuzhiyun }
220*4882a593Smuzhiyun EXPORT_SYMBOL(ceph_pr_addr);
221*4882a593Smuzhiyun 
/*
 * Copy our own address into my_enc_addr and convert it to the on-wire
 * banner encoding.
 */
static void encode_my_addr(struct ceph_messenger *msgr)
{
	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
	ceph_encode_banner_addr(&msgr->my_enc_addr);
}
227*4882a593Smuzhiyun 
228*4882a593Smuzhiyun /*
229*4882a593Smuzhiyun  * work queue for all reading and writing to/from the socket.
230*4882a593Smuzhiyun  */
231*4882a593Smuzhiyun static struct workqueue_struct *ceph_msgr_wq;
232*4882a593Smuzhiyun 
ceph_msgr_slab_init(void)233*4882a593Smuzhiyun static int ceph_msgr_slab_init(void)
234*4882a593Smuzhiyun {
235*4882a593Smuzhiyun 	BUG_ON(ceph_msg_cache);
236*4882a593Smuzhiyun 	ceph_msg_cache = KMEM_CACHE(ceph_msg, 0);
237*4882a593Smuzhiyun 	if (!ceph_msg_cache)
238*4882a593Smuzhiyun 		return -ENOMEM;
239*4882a593Smuzhiyun 
240*4882a593Smuzhiyun 	return 0;
241*4882a593Smuzhiyun }
242*4882a593Smuzhiyun 
/* Destroy the ceph_msg slab cache created by ceph_msgr_slab_init(). */
static void ceph_msgr_slab_exit(void)
{
	BUG_ON(!ceph_msg_cache);
	kmem_cache_destroy(ceph_msg_cache);
	ceph_msg_cache = NULL;
}
249*4882a593Smuzhiyun 
/*
 * Common teardown used by both ceph_msgr_exit() and the ceph_msgr_init()
 * failure path: destroy the workqueue (if created), drop our reference
 * on the zero page, and destroy the slab cache.
 */
static void _ceph_msgr_exit(void)
{
	if (ceph_msgr_wq) {
		destroy_workqueue(ceph_msgr_wq);
		ceph_msgr_wq = NULL;
	}

	/* zero_page was pinned in ceph_msgr_init(); release that ref */
	BUG_ON(zero_page == NULL);
	put_page(zero_page);
	zero_page = NULL;

	ceph_msgr_slab_exit();
}
263*4882a593Smuzhiyun 
ceph_msgr_init(void)264*4882a593Smuzhiyun int __init ceph_msgr_init(void)
265*4882a593Smuzhiyun {
266*4882a593Smuzhiyun 	if (ceph_msgr_slab_init())
267*4882a593Smuzhiyun 		return -ENOMEM;
268*4882a593Smuzhiyun 
269*4882a593Smuzhiyun 	BUG_ON(zero_page != NULL);
270*4882a593Smuzhiyun 	zero_page = ZERO_PAGE(0);
271*4882a593Smuzhiyun 	get_page(zero_page);
272*4882a593Smuzhiyun 
273*4882a593Smuzhiyun 	/*
274*4882a593Smuzhiyun 	 * The number of active work items is limited by the number of
275*4882a593Smuzhiyun 	 * connections, so leave @max_active at default.
276*4882a593Smuzhiyun 	 */
277*4882a593Smuzhiyun 	ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0);
278*4882a593Smuzhiyun 	if (ceph_msgr_wq)
279*4882a593Smuzhiyun 		return 0;
280*4882a593Smuzhiyun 
281*4882a593Smuzhiyun 	pr_err("msgr_init failed to create workqueue\n");
282*4882a593Smuzhiyun 	_ceph_msgr_exit();
283*4882a593Smuzhiyun 
284*4882a593Smuzhiyun 	return -ENOMEM;
285*4882a593Smuzhiyun }
286*4882a593Smuzhiyun 
/* Messenger module teardown; must only run after a successful init. */
void ceph_msgr_exit(void)
{
	BUG_ON(ceph_msgr_wq == NULL);

	_ceph_msgr_exit();
}
293*4882a593Smuzhiyun 
/* Wait for all queued connection work items to finish. */
void ceph_msgr_flush(void)
{
	flush_workqueue(ceph_msgr_wq);
}
EXPORT_SYMBOL(ceph_msgr_flush);
299*4882a593Smuzhiyun 
300*4882a593Smuzhiyun /* Connection socket state transition functions */
301*4882a593Smuzhiyun 
/* Transition sock_state NEW -> CLOSED, warning on any other old state. */
static void con_sock_state_init(struct ceph_connection *con)
{
	int old_state;

	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
	if (WARN_ON(old_state != CON_SOCK_STATE_NEW))
		printk("%s: unexpected old state %d\n", __func__, old_state);
	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
	     CON_SOCK_STATE_CLOSED);
}
312*4882a593Smuzhiyun 
/* Transition sock_state CLOSED -> CONNECTING, warning on any other old state. */
static void con_sock_state_connecting(struct ceph_connection *con)
{
	int old_state;

	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING);
	if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED))
		printk("%s: unexpected old state %d\n", __func__, old_state);
	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
	     CON_SOCK_STATE_CONNECTING);
}
323*4882a593Smuzhiyun 
/* Transition sock_state CONNECTING -> CONNECTED, warning on any other old state. */
static void con_sock_state_connected(struct ceph_connection *con)
{
	int old_state;

	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED);
	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING))
		printk("%s: unexpected old state %d\n", __func__, old_state);
	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
	     CON_SOCK_STATE_CONNECTED);
}
334*4882a593Smuzhiyun 
/*
 * Transition sock_state to CLOSING.  Valid from CONNECTING, CONNECTED
 * or CLOSING (socket events can race); anything else is warned about.
 */
static void con_sock_state_closing(struct ceph_connection *con)
{
	int old_state;

	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING);
	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING &&
			old_state != CON_SOCK_STATE_CONNECTED &&
			old_state != CON_SOCK_STATE_CLOSING))
		printk("%s: unexpected old state %d\n", __func__, old_state);
	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
	     CON_SOCK_STATE_CLOSING);
}
347*4882a593Smuzhiyun 
/*
 * Transition sock_state to CLOSED.  Valid from CONNECTED, CLOSING,
 * CONNECTING or CLOSED (close can race with a failed connect); anything
 * else is warned about.
 */
static void con_sock_state_closed(struct ceph_connection *con)
{
	int old_state;

	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED &&
		    old_state != CON_SOCK_STATE_CLOSING &&
		    old_state != CON_SOCK_STATE_CONNECTING &&
		    old_state != CON_SOCK_STATE_CLOSED))
		printk("%s: unexpected old state %d\n", __func__, old_state);
	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
	     CON_SOCK_STATE_CLOSED);
}
361*4882a593Smuzhiyun 
362*4882a593Smuzhiyun /*
363*4882a593Smuzhiyun  * socket callback functions
364*4882a593Smuzhiyun  */
365*4882a593Smuzhiyun 
366*4882a593Smuzhiyun /* data available on socket, or listen socket received a connect */
ceph_sock_data_ready(struct sock * sk)367*4882a593Smuzhiyun static void ceph_sock_data_ready(struct sock *sk)
368*4882a593Smuzhiyun {
369*4882a593Smuzhiyun 	struct ceph_connection *con = sk->sk_user_data;
370*4882a593Smuzhiyun 	if (atomic_read(&con->msgr->stopping)) {
371*4882a593Smuzhiyun 		return;
372*4882a593Smuzhiyun 	}
373*4882a593Smuzhiyun 
374*4882a593Smuzhiyun 	if (sk->sk_state != TCP_CLOSE_WAIT) {
375*4882a593Smuzhiyun 		dout("%s on %p state = %lu, queueing work\n", __func__,
376*4882a593Smuzhiyun 		     con, con->state);
377*4882a593Smuzhiyun 		queue_con(con);
378*4882a593Smuzhiyun 	}
379*4882a593Smuzhiyun }
380*4882a593Smuzhiyun 
381*4882a593Smuzhiyun /* socket has buffer space for writing */
ceph_sock_write_space(struct sock * sk)382*4882a593Smuzhiyun static void ceph_sock_write_space(struct sock *sk)
383*4882a593Smuzhiyun {
384*4882a593Smuzhiyun 	struct ceph_connection *con = sk->sk_user_data;
385*4882a593Smuzhiyun 
386*4882a593Smuzhiyun 	/* only queue to workqueue if there is data we want to write,
387*4882a593Smuzhiyun 	 * and there is sufficient space in the socket buffer to accept
388*4882a593Smuzhiyun 	 * more data.  clear SOCK_NOSPACE so that ceph_sock_write_space()
389*4882a593Smuzhiyun 	 * doesn't get called again until try_write() fills the socket
390*4882a593Smuzhiyun 	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
391*4882a593Smuzhiyun 	 * and net/core/stream.c:sk_stream_write_space().
392*4882a593Smuzhiyun 	 */
393*4882a593Smuzhiyun 	if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) {
394*4882a593Smuzhiyun 		if (sk_stream_is_writeable(sk)) {
395*4882a593Smuzhiyun 			dout("%s %p queueing write work\n", __func__, con);
396*4882a593Smuzhiyun 			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
397*4882a593Smuzhiyun 			queue_con(con);
398*4882a593Smuzhiyun 		}
399*4882a593Smuzhiyun 	} else {
400*4882a593Smuzhiyun 		dout("%s %p nothing to write\n", __func__, con);
401*4882a593Smuzhiyun 	}
402*4882a593Smuzhiyun }
403*4882a593Smuzhiyun 
404*4882a593Smuzhiyun /* socket's state has changed */
static void ceph_sock_state_change(struct sock *sk)
{
	struct ceph_connection *con = sk->sk_user_data;

	dout("%s %p state = %lu sk_state = %u\n", __func__,
	     con, con->state, sk->sk_state);

	switch (sk->sk_state) {
	case TCP_CLOSE:
		dout("%s TCP_CLOSE\n", __func__);
		fallthrough;
	case TCP_CLOSE_WAIT:
		/* peer closed or connection dropped: flag it and let the
		 * worker tear the socket down */
		dout("%s TCP_CLOSE_WAIT\n", __func__);
		con_sock_state_closing(con);
		con_flag_set(con, CON_FLAG_SOCK_CLOSED);
		queue_con(con);
		break;
	case TCP_ESTABLISHED:
		/* nonblocking connect completed: kick the worker */
		dout("%s TCP_ESTABLISHED\n", __func__);
		con_sock_state_connected(con);
		queue_con(con);
		break;
	default:	/* Everything else is uninteresting */
		break;
	}
}
431*4882a593Smuzhiyun 
432*4882a593Smuzhiyun /*
433*4882a593Smuzhiyun  * set up socket callbacks
434*4882a593Smuzhiyun  */
/*
 * Attach @con to @sock: stash it in sk_user_data and install our
 * data-ready, write-space and state-change callbacks.
 */
static void set_sock_callbacks(struct socket *sock,
			       struct ceph_connection *con)
{
	struct sock *sk = sock->sk;
	sk->sk_user_data = con;
	sk->sk_data_ready = ceph_sock_data_ready;
	sk->sk_write_space = ceph_sock_write_space;
	sk->sk_state_change = ceph_sock_state_change;
}
444*4882a593Smuzhiyun 
445*4882a593Smuzhiyun 
446*4882a593Smuzhiyun /*
447*4882a593Smuzhiyun  * socket helpers
448*4882a593Smuzhiyun  */
449*4882a593Smuzhiyun 
450*4882a593Smuzhiyun /*
451*4882a593Smuzhiyun  * initiate connection to a remote socket.
452*4882a593Smuzhiyun  */
ceph_tcp_connect(struct ceph_connection * con)453*4882a593Smuzhiyun static int ceph_tcp_connect(struct ceph_connection *con)
454*4882a593Smuzhiyun {
455*4882a593Smuzhiyun 	struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */
456*4882a593Smuzhiyun 	struct socket *sock;
457*4882a593Smuzhiyun 	unsigned int noio_flag;
458*4882a593Smuzhiyun 	int ret;
459*4882a593Smuzhiyun 
460*4882a593Smuzhiyun 	BUG_ON(con->sock);
461*4882a593Smuzhiyun 
462*4882a593Smuzhiyun 	/* sock_create_kern() allocates with GFP_KERNEL */
463*4882a593Smuzhiyun 	noio_flag = memalloc_noio_save();
464*4882a593Smuzhiyun 	ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family,
465*4882a593Smuzhiyun 			       SOCK_STREAM, IPPROTO_TCP, &sock);
466*4882a593Smuzhiyun 	memalloc_noio_restore(noio_flag);
467*4882a593Smuzhiyun 	if (ret)
468*4882a593Smuzhiyun 		return ret;
469*4882a593Smuzhiyun 	sock->sk->sk_allocation = GFP_NOFS;
470*4882a593Smuzhiyun 
471*4882a593Smuzhiyun #ifdef CONFIG_LOCKDEP
472*4882a593Smuzhiyun 	lockdep_set_class(&sock->sk->sk_lock, &socket_class);
473*4882a593Smuzhiyun #endif
474*4882a593Smuzhiyun 
475*4882a593Smuzhiyun 	set_sock_callbacks(sock, con);
476*4882a593Smuzhiyun 
477*4882a593Smuzhiyun 	dout("connect %s\n", ceph_pr_addr(&con->peer_addr));
478*4882a593Smuzhiyun 
479*4882a593Smuzhiyun 	con_sock_state_connecting(con);
480*4882a593Smuzhiyun 	ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss),
481*4882a593Smuzhiyun 				 O_NONBLOCK);
482*4882a593Smuzhiyun 	if (ret == -EINPROGRESS) {
483*4882a593Smuzhiyun 		dout("connect %s EINPROGRESS sk_state = %u\n",
484*4882a593Smuzhiyun 		     ceph_pr_addr(&con->peer_addr),
485*4882a593Smuzhiyun 		     sock->sk->sk_state);
486*4882a593Smuzhiyun 	} else if (ret < 0) {
487*4882a593Smuzhiyun 		pr_err("connect %s error %d\n",
488*4882a593Smuzhiyun 		       ceph_pr_addr(&con->peer_addr), ret);
489*4882a593Smuzhiyun 		sock_release(sock);
490*4882a593Smuzhiyun 		return ret;
491*4882a593Smuzhiyun 	}
492*4882a593Smuzhiyun 
493*4882a593Smuzhiyun 	if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY))
494*4882a593Smuzhiyun 		tcp_sock_set_nodelay(sock->sk);
495*4882a593Smuzhiyun 
496*4882a593Smuzhiyun 	con->sock = sock;
497*4882a593Smuzhiyun 	return 0;
498*4882a593Smuzhiyun }
499*4882a593Smuzhiyun 
500*4882a593Smuzhiyun /*
501*4882a593Smuzhiyun  * If @buf is NULL, discard up to @len bytes.
502*4882a593Smuzhiyun  */
/*
 * Nonblocking receive of up to @len bytes into @buf.  Returns the
 * number of bytes received, 0 if the socket would block (-EAGAIN is
 * translated to 0), or a negative errno.
 */
static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
{
	struct kvec iov = {buf, len};
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
	int r;

	/* NULL @buf means discard: MSG_TRUNC reads without copying out */
	if (!buf)
		msg.msg_flags |= MSG_TRUNC;

	iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len);
	r = sock_recvmsg(sock, &msg, msg.msg_flags);
	if (r == -EAGAIN)
		r = 0;
	return r;
}
518*4882a593Smuzhiyun 
/*
 * Nonblocking receive of up to @length bytes directly into @page at
 * @page_offset (must not cross the page boundary).  Returns bytes
 * received, 0 if the socket would block, or a negative errno.
 */
static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
		     int page_offset, size_t length)
{
	struct bio_vec bvec = {
		.bv_page = page,
		.bv_offset = page_offset,
		.bv_len = length
	};
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
	int r;

	BUG_ON(page_offset + length > PAGE_SIZE);
	iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length);
	r = sock_recvmsg(sock, &msg, msg.msg_flags);
	if (r == -EAGAIN)
		r = 0;
	return r;
}
537*4882a593Smuzhiyun 
538*4882a593Smuzhiyun /*
539*4882a593Smuzhiyun  * write something.  @more is true if caller will be sending more data
540*4882a593Smuzhiyun  * shortly.
541*4882a593Smuzhiyun  */
/*
 * write something.  @more is true if caller will be sending more data
 * shortly.  Returns bytes sent, 0 if the socket would block, or a
 * negative errno.
 */
static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
			    size_t kvlen, size_t len, bool more)
{
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
	int r;

	if (more)
		msg.msg_flags |= MSG_MORE;
	else
		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */

	r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
	if (r == -EAGAIN)
		r = 0;
	return r;
}
558*4882a593Smuzhiyun 
559*4882a593Smuzhiyun /*
560*4882a593Smuzhiyun  * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
561*4882a593Smuzhiyun  */
/*
 * Zero-copy send of part of @page.  @more: either or both of MSG_MORE
 * and MSG_SENDPAGE_NOTLAST.  Returns bytes sent, 0 if the socket would
 * block, or a negative errno.
 */
static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
			     int offset, size_t size, int more)
{
	ssize_t (*sendpage)(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags);
	int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
	int ret;

	/*
	 * sendpage cannot properly handle pages with page_count == 0,
	 * we need to fall back to sendmsg if that's the case.
	 *
	 * Same goes for slab pages: skb_can_coalesce() allows
	 * coalescing neighboring slab objects into a single frag which
	 * triggers one of hardened usercopy checks.
	 */
	if (sendpage_ok(page))
		sendpage = sock->ops->sendpage;
	else
		sendpage = sock_no_sendpage;	/* copying fallback */

	ret = sendpage(sock, page, offset, size, flags);
	if (ret == -EAGAIN)
		ret = 0;

	return ret;
}
589*4882a593Smuzhiyun 
590*4882a593Smuzhiyun /*
591*4882a593Smuzhiyun  * Shutdown/close the socket for the given connection.
592*4882a593Smuzhiyun  */
/*
 * Shut down and release @con's socket, if any, and record the CLOSED
 * sock state.  Returns the ->shutdown() result (0 if no socket).
 */
static int con_close_socket(struct ceph_connection *con)
{
	int rc = 0;

	dout("con_close_socket on %p sock %p\n", con, con->sock);
	if (con->sock) {
		rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
		sock_release(con->sock);
		con->sock = NULL;
	}

	/*
	 * Forcibly clear the SOCK_CLOSED flag.  It gets set
	 * independent of the connection mutex, and we could have
	 * received a socket close event before we had the chance to
	 * shut the socket down.
	 */
	con_flag_clear(con, CON_FLAG_SOCK_CLOSED);

	con_sock_state_closed(con);
	return rc;
}
615*4882a593Smuzhiyun 
616*4882a593Smuzhiyun /*
617*4882a593Smuzhiyun  * Reset a connection.  Discard all incoming and outgoing messages
618*4882a593Smuzhiyun  * and clear *_seq state.
619*4882a593Smuzhiyun  */
/* Unlink @msg from whatever list it is on and drop one reference. */
static void ceph_msg_remove(struct ceph_msg *msg)
{
	list_del_init(&msg->list_head);

	ceph_msg_put(msg);
}
ceph_msg_remove_list(struct list_head * head)626*4882a593Smuzhiyun static void ceph_msg_remove_list(struct list_head *head)
627*4882a593Smuzhiyun {
628*4882a593Smuzhiyun 	while (!list_empty(head)) {
629*4882a593Smuzhiyun 		struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
630*4882a593Smuzhiyun 							list_head);
631*4882a593Smuzhiyun 		ceph_msg_remove(msg);
632*4882a593Smuzhiyun 	}
633*4882a593Smuzhiyun }
634*4882a593Smuzhiyun 
/*
 * Reset a connection: discard all queued and in-flight messages and
 * zero the sequence counters.  Runs under con->mutex (see
 * ceph_con_close()).
 */
static void reset_connection(struct ceph_connection *con)
{
	/* reset connection, out_queue, msg_ and connect_seq */
	/* discard existing out_queue and msg_seq */
	dout("reset_connection %p\n", con);
	ceph_msg_remove_list(&con->out_queue);
	ceph_msg_remove_list(&con->out_sent);

	/* Drop the partially received incoming message, if any. */
	if (con->in_msg) {
		BUG_ON(con->in_msg->con != con);
		ceph_msg_put(con->in_msg);
		con->in_msg = NULL;
	}

	con->connect_seq = 0;
	con->out_seq = 0;
	/* Drop the message currently being written out, if any. */
	if (con->out_msg) {
		BUG_ON(con->out_msg->con != con);
		ceph_msg_put(con->out_msg);
		con->out_msg = NULL;
	}
	con->in_seq = 0;
	con->in_seq_acked = 0;

	con->out_skip = 0;
}
661*4882a593Smuzhiyun 
/*
 * Mark a peer down: drop any open connection to it.
 *
 * Discards all queued/in-flight messages, clears the pending-work
 * flags, cancels any queued work and closes the socket, leaving the
 * connection in CLOSED state.
 */
void ceph_con_close(struct ceph_connection *con)
{
	mutex_lock(&con->mutex);
	dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr));
	con->state = CON_STATE_CLOSED;

	con_flag_clear(con, CON_FLAG_LOSSYTX);	/* so we retry next connect */
	con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING);
	con_flag_clear(con, CON_FLAG_WRITE_PENDING);
	con_flag_clear(con, CON_FLAG_BACKOFF);

	reset_connection(con);
	con->peer_global_seq = 0;
	cancel_con(con);
	con_close_socket(con);
	mutex_unlock(&con->mutex);
}
EXPORT_SYMBOL(ceph_con_close);
683*4882a593Smuzhiyun 
/*
 * Reopen a closed connection, with a new peer address.  The actual
 * connect is performed asynchronously by the work queued via
 * queue_con().
 */
void ceph_con_open(struct ceph_connection *con,
		   __u8 entity_type, __u64 entity_num,
		   struct ceph_entity_addr *addr)
{
	mutex_lock(&con->mutex);
	dout("con_open %p %s\n", con, ceph_pr_addr(addr));

	/* Only legal on a closed connection. */
	WARN_ON(con->state != CON_STATE_CLOSED);
	con->state = CON_STATE_PREOPEN;

	con->peer_name.type = (__u8) entity_type;
	con->peer_name.num = cpu_to_le64(entity_num);

	memcpy(&con->peer_addr, addr, sizeof(*addr));
	con->delay = 0;      /* reset backoff memory */
	mutex_unlock(&con->mutex);
	queue_con(con);
}
EXPORT_SYMBOL(ceph_con_open);
706*4882a593Smuzhiyun 
707*4882a593Smuzhiyun /*
708*4882a593Smuzhiyun  * return true if this connection ever successfully opened
709*4882a593Smuzhiyun  */
ceph_con_opened(struct ceph_connection * con)710*4882a593Smuzhiyun bool ceph_con_opened(struct ceph_connection *con)
711*4882a593Smuzhiyun {
712*4882a593Smuzhiyun 	return con->connect_seq > 0;
713*4882a593Smuzhiyun }
714*4882a593Smuzhiyun 
/*
 * Initialize a new connection.
 *
 * @con: connection to initialize (zeroed first)
 * @private: opaque pointer stored for the owner's use
 * @ops: callback table supplied by the owner
 * @msgr: messenger this connection belongs to
 *
 * The connection starts out CLOSED; use ceph_con_open() to connect
 * it to a peer.
 */
void ceph_con_init(struct ceph_connection *con, void *private,
	const struct ceph_connection_operations *ops,
	struct ceph_messenger *msgr)
{
	dout("con_init %p\n", con);
	memset(con, 0, sizeof(*con));
	con->private = private;
	con->ops = ops;
	con->msgr = msgr;

	con_sock_state_init(con);

	mutex_init(&con->mutex);
	INIT_LIST_HEAD(&con->out_queue);
	INIT_LIST_HEAD(&con->out_sent);
	INIT_DELAYED_WORK(&con->work, ceph_con_workfn);

	con->state = CON_STATE_CLOSED;
}
EXPORT_SYMBOL(ceph_con_init);
738*4882a593Smuzhiyun 
739*4882a593Smuzhiyun 
740*4882a593Smuzhiyun /*
741*4882a593Smuzhiyun  * We maintain a global counter to order connection attempts.  Get
742*4882a593Smuzhiyun  * a unique seq greater than @gt.
743*4882a593Smuzhiyun  */
get_global_seq(struct ceph_messenger * msgr,u32 gt)744*4882a593Smuzhiyun static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
745*4882a593Smuzhiyun {
746*4882a593Smuzhiyun 	u32 ret;
747*4882a593Smuzhiyun 
748*4882a593Smuzhiyun 	spin_lock(&msgr->global_seq_lock);
749*4882a593Smuzhiyun 	if (msgr->global_seq < gt)
750*4882a593Smuzhiyun 		msgr->global_seq = gt;
751*4882a593Smuzhiyun 	ret = ++msgr->global_seq;
752*4882a593Smuzhiyun 	spin_unlock(&msgr->global_seq_lock);
753*4882a593Smuzhiyun 	return ret;
754*4882a593Smuzhiyun }
755*4882a593Smuzhiyun 
/*
 * Empty the outgoing kvec array.  Must not be called while a
 * partially-sent kvec remains to be skipped (out_skip != 0).
 */
static void con_out_kvec_reset(struct ceph_connection *con)
{
	BUG_ON(con->out_skip);

	con->out_kvec_left = 0;
	con->out_kvec_bytes = 0;
	con->out_kvec_cur = &con->out_kvec[0];
}
764*4882a593Smuzhiyun 
/*
 * Append an entry for @size bytes at @data to the connection's
 * outgoing kvec array and account for its bytes.
 */
static void con_out_kvec_add(struct ceph_connection *con,
				size_t size, void *data)
{
	struct kvec *kv;

	BUG_ON(con->out_skip);
	BUG_ON(con->out_kvec_left >= ARRAY_SIZE(con->out_kvec));

	kv = &con->out_kvec[con->out_kvec_left++];
	kv->iov_len = size;
	kv->iov_base = data;
	con->out_kvec_bytes += size;
}
778*4882a593Smuzhiyun 
779*4882a593Smuzhiyun /*
780*4882a593Smuzhiyun  * Chop off a kvec from the end.  Return residual number of bytes for
781*4882a593Smuzhiyun  * that kvec, i.e. how many bytes would have been written if the kvec
782*4882a593Smuzhiyun  * hadn't been nuked.
783*4882a593Smuzhiyun  */
con_out_kvec_skip(struct ceph_connection * con)784*4882a593Smuzhiyun static int con_out_kvec_skip(struct ceph_connection *con)
785*4882a593Smuzhiyun {
786*4882a593Smuzhiyun 	int off = con->out_kvec_cur - con->out_kvec;
787*4882a593Smuzhiyun 	int skip = 0;
788*4882a593Smuzhiyun 
789*4882a593Smuzhiyun 	if (con->out_kvec_bytes > 0) {
790*4882a593Smuzhiyun 		skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
791*4882a593Smuzhiyun 		BUG_ON(con->out_kvec_bytes < skip);
792*4882a593Smuzhiyun 		BUG_ON(!con->out_kvec_left);
793*4882a593Smuzhiyun 		con->out_kvec_bytes -= skip;
794*4882a593Smuzhiyun 		con->out_kvec_left--;
795*4882a593Smuzhiyun 	}
796*4882a593Smuzhiyun 
797*4882a593Smuzhiyun 	return skip;
798*4882a593Smuzhiyun }
799*4882a593Smuzhiyun 
800*4882a593Smuzhiyun #ifdef CONFIG_BLOCK
801*4882a593Smuzhiyun 
/*
 * For a bio data item, a piece is whatever remains of the next
 * entry in the current bio iovec, or the first entry in the next
 * bio in the list.
 */
static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
					size_t length)
{
	struct ceph_msg_data *data = cursor->data;
	struct ceph_bio_iter *it = &cursor->bio_iter;

	/* Process at most @length bytes of the bio chain. */
	cursor->resid = min_t(size_t, length, data->bio_length);
	*it = data->bio_pos;
	/* Clamp the iterator so it never covers more than we consume. */
	if (cursor->resid < it->iter.bi_size)
		it->iter.bi_size = cursor->resid;

	BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
	cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
}
821*4882a593Smuzhiyun 
/*
 * Return the page holding the next piece of a bio data item, along
 * with the piece's offset and length within that page.  The cursor
 * is not advanced.
 */
static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
						size_t *page_offset,
						size_t *length)
{
	struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio,
					   cursor->bio_iter.iter);

	*page_offset = bv.bv_offset;
	*length = bv.bv_len;
	return bv.bv_page;
}
833*4882a593Smuzhiyun 
/*
 * Consume @bytes from the current piece of a bio data item, stepping
 * to the next bio in the chain when the current one is exhausted.
 * Returns true if the cursor now points at a new piece.
 */
static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
					size_t bytes)
{
	struct ceph_bio_iter *it = &cursor->bio_iter;
	struct page *page = bio_iter_page(it->bio, it->iter);

	BUG_ON(bytes > cursor->resid);
	BUG_ON(bytes > bio_iter_len(it->bio, it->iter));
	cursor->resid -= bytes;
	bio_advance_iter(it->bio, &it->iter, bytes);

	if (!cursor->resid) {
		BUG_ON(!cursor->last_piece);
		return false;   /* no more data */
	}

	if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done &&
		       page == bio_iter_page(it->bio, it->iter)))
		return false;	/* more bytes to process in this segment */

	/* Current bio fully consumed - move to the next one in the chain. */
	if (!it->iter.bi_size) {
		it->bio = it->bio->bi_next;
		it->iter = it->bio->bi_iter;
		/* Re-clamp so the iterator never covers more than we consume. */
		if (cursor->resid < it->iter.bi_size)
			it->iter.bi_size = cursor->resid;
	}

	BUG_ON(cursor->last_piece);
	BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
	cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
	return true;
}
866*4882a593Smuzhiyun #endif /* CONFIG_BLOCK */
867*4882a593Smuzhiyun 
/*
 * Initialize the cursor for a bio_vec array data item: process at
 * most @length bytes, clamping the bvec iterator accordingly, and
 * note whether the first piece is also the last.
 */
static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor,
					size_t length)
{
	struct ceph_msg_data *data = cursor->data;
	struct bio_vec *bvecs = data->bvec_pos.bvecs;

	cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size);
	cursor->bvec_iter = data->bvec_pos.iter;
	cursor->bvec_iter.bi_size = cursor->resid;

	BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
	cursor->last_piece =
	    cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter);
}
882*4882a593Smuzhiyun 
/*
 * Return the page holding the next piece of a bio_vec array data
 * item, along with the piece's offset and length within that page.
 * The cursor is not advanced.
 */
static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor,
						size_t *page_offset,
						size_t *length)
{
	struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs,
					   cursor->bvec_iter);

	*page_offset = bv.bv_offset;
	*length = bv.bv_len;
	return bv.bv_page;
}
894*4882a593Smuzhiyun 
/*
 * Consume @bytes from the current piece of a bio_vec array data item
 * and move the cursor along.  Returns true if the cursor now points
 * at a new piece.
 */
static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor,
					size_t bytes)
{
	struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs;
	struct page *page = bvec_iter_page(bvecs, cursor->bvec_iter);

	BUG_ON(bytes > cursor->resid);
	BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter));
	cursor->resid -= bytes;
	bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes);

	if (!cursor->resid) {
		BUG_ON(!cursor->last_piece);
		return false;   /* no more data */
	}

	if (!bytes || (cursor->bvec_iter.bi_bvec_done &&
		       page == bvec_iter_page(bvecs, cursor->bvec_iter)))
		return false;	/* more bytes to process in this segment */

	BUG_ON(cursor->last_piece);
	BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
	cursor->last_piece =
	    cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter);
	return true;
}
921*4882a593Smuzhiyun 
/*
 * For a page array, a piece comes from the first page in the array
 * that has not already been fully consumed.
 */
static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
					size_t length)
{
	struct ceph_msg_data *data = cursor->data;
	int page_count;

	BUG_ON(data->type != CEPH_MSG_DATA_PAGES);

	BUG_ON(!data->pages);
	BUG_ON(!data->length);

	/* Process at most @length bytes of the array's data. */
	cursor->resid = min(length, data->length);
	page_count = calc_pages_for(data->alignment, (u64)data->length);
	/* In-page offset of the start of the data within the first page. */
	cursor->page_offset = data->alignment & ~PAGE_MASK;
	cursor->page_index = 0;
	BUG_ON(page_count > (int)USHRT_MAX);
	cursor->page_count = (unsigned short)page_count;
	BUG_ON(length > SIZE_MAX - cursor->page_offset);
	cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE;
}
946*4882a593Smuzhiyun 
947*4882a593Smuzhiyun static struct page *
ceph_msg_data_pages_next(struct ceph_msg_data_cursor * cursor,size_t * page_offset,size_t * length)948*4882a593Smuzhiyun ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor,
949*4882a593Smuzhiyun 					size_t *page_offset, size_t *length)
950*4882a593Smuzhiyun {
951*4882a593Smuzhiyun 	struct ceph_msg_data *data = cursor->data;
952*4882a593Smuzhiyun 
953*4882a593Smuzhiyun 	BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
954*4882a593Smuzhiyun 
955*4882a593Smuzhiyun 	BUG_ON(cursor->page_index >= cursor->page_count);
956*4882a593Smuzhiyun 	BUG_ON(cursor->page_offset >= PAGE_SIZE);
957*4882a593Smuzhiyun 
958*4882a593Smuzhiyun 	*page_offset = cursor->page_offset;
959*4882a593Smuzhiyun 	if (cursor->last_piece)
960*4882a593Smuzhiyun 		*length = cursor->resid;
961*4882a593Smuzhiyun 	else
962*4882a593Smuzhiyun 		*length = PAGE_SIZE - *page_offset;
963*4882a593Smuzhiyun 
964*4882a593Smuzhiyun 	return data->pages[cursor->page_index];
965*4882a593Smuzhiyun }
966*4882a593Smuzhiyun 
/*
 * Consume @bytes from the current page of a page array data item.
 * Returns true if the cursor moved on to a new page.
 */
static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
						size_t bytes)
{
	BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES);

	BUG_ON(cursor->page_offset + bytes > PAGE_SIZE);

	/* Advance the cursor page offset */

	cursor->resid -= bytes;
	cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK;
	if (!bytes || cursor->page_offset)
		return false;	/* more bytes to process in the current page */

	if (!cursor->resid)
		return false;   /* no more data */

	/* Move on to the next page; offset is already at 0 */

	BUG_ON(cursor->page_index >= cursor->page_count)
	cursor->page_index++;
	cursor->last_piece = cursor->resid <= PAGE_SIZE;

	return true;
}
992*4882a593Smuzhiyun 
/*
 * For a pagelist, a piece is whatever remains to be consumed in the
 * first page in the list, or the front of the next page.
 */
static void
ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor,
					size_t length)
{
	struct ceph_msg_data *data = cursor->data;
	struct ceph_pagelist *pagelist;
	struct page *page;

	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);

	pagelist = data->pagelist;
	BUG_ON(!pagelist);

	if (!length)
		return;		/* pagelist can be assigned but empty */

	BUG_ON(list_empty(&pagelist->head));
	page = list_first_entry(&pagelist->head, struct page, lru);

	/* Process at most @length bytes of the pagelist. */
	cursor->resid = min(length, pagelist->length);
	cursor->page = page;
	cursor->offset = 0;
	cursor->last_piece = cursor->resid <= PAGE_SIZE;
}
1021*4882a593Smuzhiyun 
1022*4882a593Smuzhiyun static struct page *
ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor * cursor,size_t * page_offset,size_t * length)1023*4882a593Smuzhiyun ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor,
1024*4882a593Smuzhiyun 				size_t *page_offset, size_t *length)
1025*4882a593Smuzhiyun {
1026*4882a593Smuzhiyun 	struct ceph_msg_data *data = cursor->data;
1027*4882a593Smuzhiyun 	struct ceph_pagelist *pagelist;
1028*4882a593Smuzhiyun 
1029*4882a593Smuzhiyun 	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
1030*4882a593Smuzhiyun 
1031*4882a593Smuzhiyun 	pagelist = data->pagelist;
1032*4882a593Smuzhiyun 	BUG_ON(!pagelist);
1033*4882a593Smuzhiyun 
1034*4882a593Smuzhiyun 	BUG_ON(!cursor->page);
1035*4882a593Smuzhiyun 	BUG_ON(cursor->offset + cursor->resid != pagelist->length);
1036*4882a593Smuzhiyun 
1037*4882a593Smuzhiyun 	/* offset of first page in pagelist is always 0 */
1038*4882a593Smuzhiyun 	*page_offset = cursor->offset & ~PAGE_MASK;
1039*4882a593Smuzhiyun 	if (cursor->last_piece)
1040*4882a593Smuzhiyun 		*length = cursor->resid;
1041*4882a593Smuzhiyun 	else
1042*4882a593Smuzhiyun 		*length = PAGE_SIZE - *page_offset;
1043*4882a593Smuzhiyun 
1044*4882a593Smuzhiyun 	return cursor->page;
1045*4882a593Smuzhiyun }
1046*4882a593Smuzhiyun 
/*
 * Consume @bytes from the current page of a pagelist data item.
 * Returns true if the cursor moved on to the next page in the list.
 */
static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
						size_t bytes)
{
	struct ceph_msg_data *data = cursor->data;
	struct ceph_pagelist *pagelist;

	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);

	pagelist = data->pagelist;
	BUG_ON(!pagelist);

	BUG_ON(cursor->offset + cursor->resid != pagelist->length);
	BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE);

	/* Advance the cursor offset */

	cursor->resid -= bytes;
	cursor->offset += bytes;
	/* offset of first page in pagelist is always 0 */
	if (!bytes || cursor->offset & ~PAGE_MASK)
		return false;	/* more bytes to process in the current page */

	if (!cursor->resid)
		return false;   /* no more data */

	/* Move on to the next page */

	BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
	cursor->page = list_next_entry(cursor->page, lru);
	cursor->last_piece = cursor->resid <= PAGE_SIZE;

	return true;
}
1080*4882a593Smuzhiyun 
/*
 * Message data is handled (sent or received) in pieces, where each
 * piece resides on a single page.  The network layer might not
 * consume an entire piece at once.  A data item's cursor keeps
 * track of which piece is next to process and how much remains to
 * be processed in that piece.  It also tracks whether the current
 * piece is the last one in the data item.
 */
static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
{
	size_t length = cursor->total_resid;

	/* Dispatch to the type-specific cursor initializer. */
	switch (cursor->data->type) {
	case CEPH_MSG_DATA_PAGELIST:
		ceph_msg_data_pagelist_cursor_init(cursor, length);
		break;
	case CEPH_MSG_DATA_PAGES:
		ceph_msg_data_pages_cursor_init(cursor, length);
		break;
#ifdef CONFIG_BLOCK
	case CEPH_MSG_DATA_BIO:
		ceph_msg_data_bio_cursor_init(cursor, length);
		break;
#endif /* CONFIG_BLOCK */
	case CEPH_MSG_DATA_BVECS:
		ceph_msg_data_bvecs_cursor_init(cursor, length);
		break;
	case CEPH_MSG_DATA_NONE:
	default:
		/*
		 * NOTE(review): NONE/unknown types are silently ignored
		 * here (the BUG() below is commented out), whereas
		 * ceph_msg_data_advance() BUG()s for the same types.
		 */
		/* BUG(); */
		break;
	}
	cursor->need_crc = true;
}
1115*4882a593Smuzhiyun 
/*
 * Point the message's cursor at its first data item and prepare to
 * process @length bytes.  @length must be nonzero and no larger than
 * the message's total data length.
 */
static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
{
	struct ceph_msg_data_cursor *cursor = &msg->cursor;

	BUG_ON(!length);
	BUG_ON(length > msg->data_length);
	BUG_ON(!msg->num_data_items);

	cursor->total_resid = length;
	cursor->data = msg->data;

	__ceph_msg_data_cursor_init(cursor);
}
1129*4882a593Smuzhiyun 
/*
 * Return the page containing the next piece to process for a given
 * data item, and supply the page offset and length of that piece.
 * Indicate whether this is the last piece in this data item.
 * The cursor is not advanced.
 */
static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
					size_t *page_offset, size_t *length,
					bool *last_piece)
{
	struct page *page;

	switch (cursor->data->type) {
	case CEPH_MSG_DATA_PAGELIST:
		page = ceph_msg_data_pagelist_next(cursor, page_offset, length);
		break;
	case CEPH_MSG_DATA_PAGES:
		page = ceph_msg_data_pages_next(cursor, page_offset, length);
		break;
#ifdef CONFIG_BLOCK
	case CEPH_MSG_DATA_BIO:
		page = ceph_msg_data_bio_next(cursor, page_offset, length);
		break;
#endif /* CONFIG_BLOCK */
	case CEPH_MSG_DATA_BVECS:
		page = ceph_msg_data_bvecs_next(cursor, page_offset, length);
		break;
	case CEPH_MSG_DATA_NONE:
	default:
		page = NULL;
		break;
	}

	/* Sanity-check what the type-specific helper handed back. */
	BUG_ON(!page);
	BUG_ON(*page_offset + *length > PAGE_SIZE);
	BUG_ON(!*length);
	BUG_ON(*length > cursor->resid);
	if (last_piece)
		*last_piece = cursor->last_piece;

	return page;
}
1171*4882a593Smuzhiyun 
/*
 * Advance the cursor by @bytes, moving on to the next piece of the
 * data item (and on to the message's next data item when the current
 * one is exhausted).  Sets cursor->need_crc when the cursor entered
 * a new piece.
 */
static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
				  size_t bytes)
{
	bool new_piece;

	BUG_ON(bytes > cursor->resid);
	switch (cursor->data->type) {
	case CEPH_MSG_DATA_PAGELIST:
		new_piece = ceph_msg_data_pagelist_advance(cursor, bytes);
		break;
	case CEPH_MSG_DATA_PAGES:
		new_piece = ceph_msg_data_pages_advance(cursor, bytes);
		break;
#ifdef CONFIG_BLOCK
	case CEPH_MSG_DATA_BIO:
		new_piece = ceph_msg_data_bio_advance(cursor, bytes);
		break;
#endif /* CONFIG_BLOCK */
	case CEPH_MSG_DATA_BVECS:
		new_piece = ceph_msg_data_bvecs_advance(cursor, bytes);
		break;
	case CEPH_MSG_DATA_NONE:
	default:
		BUG();
		break;
	}
	cursor->total_resid -= bytes;

	/* Current data item consumed - step to the next one, if any. */
	if (!cursor->resid && cursor->total_resid) {
		WARN_ON(!cursor->last_piece);
		cursor->data++;
		__ceph_msg_data_cursor_init(cursor);
		new_piece = true;
	}
	cursor->need_crc = new_piece;
}
1212*4882a593Smuzhiyun 
sizeof_footer(struct ceph_connection * con)1213*4882a593Smuzhiyun static size_t sizeof_footer(struct ceph_connection *con)
1214*4882a593Smuzhiyun {
1215*4882a593Smuzhiyun 	return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
1216*4882a593Smuzhiyun 	    sizeof(struct ceph_msg_footer) :
1217*4882a593Smuzhiyun 	    sizeof(struct ceph_msg_footer_old);
1218*4882a593Smuzhiyun }
1219*4882a593Smuzhiyun 
/*
 * Prepare a message's data for transfer by pointing its cursor at
 * the first @data_len bytes of its data items.
 */
static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
{
	/* Initialize data cursor */

	ceph_msg_data_cursor_init(msg, (size_t)data_len);
}
1226*4882a593Smuzhiyun 
/*
 * Prepare footer for currently outgoing message, and finish things
 * off.  Assumes out_kvec* are already valid.. we just add on to the end.
 */
static void prepare_write_message_footer(struct ceph_connection *con)
{
	struct ceph_msg *m = con->out_msg;

	m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;

	dout("prepare_write_message_footer %p\n", con);
	con_out_kvec_add(con, sizeof_footer(con), &m->footer);
	if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
		/* Sign the message if the owner supplied a signer. */
		if (con->ops->sign_message)
			con->ops->sign_message(m);
		else
			m->footer.sig = 0;
	} else {
		/* Old-format footer: only the flags are carried over. */
		m->old_footer.flags = m->footer.flags;
	}
	con->out_more = m->more_to_follow;
	con->out_msg_done = true;
}
1250*4882a593Smuzhiyun 
/*
 * Prepare headers for the next outgoing message: queue tag + header +
 * front + middle into out_kvec, compute their CRCs, and either queue
 * the footer too (no data payload) or set up the data cursor.
 */
static void prepare_write_message(struct ceph_connection *con)
{
	struct ceph_msg *m;
	u32 crc;

	con_out_kvec_reset(con);
	con->out_msg_done = false;

	/* Sneak an ack in there first?  If we can get it into the same
	 * TCP packet that's a good thing. */
	if (con->in_seq > con->in_seq_acked) {
		con->in_seq_acked = con->in_seq;
		con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
		con_out_kvec_add(con, sizeof (con->out_temp_ack),
			&con->out_temp_ack);
	}

	BUG_ON(list_empty(&con->out_queue));
	m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
	con->out_msg = m;
	BUG_ON(m->con != con);

	/* put message on sent list */
	ceph_msg_get(m);
	list_move_tail(&m->list_head, &con->out_sent);

	/*
	 * only assign outgoing seq # if we haven't sent this message
	 * yet.  if it is requeued, resend with its original seq.
	 */
	if (m->needs_out_seq) {
		m->hdr.seq = cpu_to_le64(++con->out_seq);
		m->needs_out_seq = false;

		/* let the higher layer re-encode now that seq is final */
		if (con->ops->reencode_message)
			con->ops->reencode_message(m);
	}

	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
	     m, con->out_seq, le16_to_cpu(m->hdr.type),
	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
	     m->data_length);
	WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
	WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));

	/* tag + hdr + front + middle */
	con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
	con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
	con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);

	if (m->middle)
		con_out_kvec_add(con, m->middle->vec.iov_len,
			m->middle->vec.iov_base);

	/* fill in hdr crc and finalize hdr */
	crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
	con->out_msg->hdr.crc = cpu_to_le32(crc);
	/* out_hdr was queued above; fill it in now that hdr.crc is final */
	memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));

	/* fill in front and middle crc, footer */
	crc = crc32c(0, m->front.iov_base, m->front.iov_len);
	con->out_msg->footer.front_crc = cpu_to_le32(crc);
	if (m->middle) {
		crc = crc32c(0, m->middle->vec.iov_base,
				m->middle->vec.iov_len);
		con->out_msg->footer.middle_crc = cpu_to_le32(crc);
	} else
		con->out_msg->footer.middle_crc = 0;
	dout("%s front_crc %u middle_crc %u\n", __func__,
	     le32_to_cpu(con->out_msg->footer.front_crc),
	     le32_to_cpu(con->out_msg->footer.middle_crc));
	con->out_msg->footer.flags = 0;

	/* is there a data payload? */
	con->out_msg->footer.data_crc = 0;
	if (m->data_length) {
		prepare_message_data(con->out_msg, m->data_length);
		con->out_more = 1;  /* data + footer will follow */
	} else {
		/* no, queue up footer too and be done */
		prepare_write_message_footer(con);
	}

	con_flag_set(con, CON_FLAG_WRITE_PENDING);
}
1340*4882a593Smuzhiyun 
/*
 * Prepare an ack: queue the ack tag plus the (le64) acked seq into
 * out_kvec and mark the connection write-pending.
 */
static void prepare_write_ack(struct ceph_connection *con)
{
	dout("prepare_write_ack %p %llu -> %llu\n", con,
	     con->in_seq_acked, con->in_seq);
	con->in_seq_acked = con->in_seq;

	con_out_kvec_reset(con);

	con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);

	/* out_temp_ack must outlive the send; it is part of *con */
	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
	con_out_kvec_add(con, sizeof (con->out_temp_ack),
				&con->out_temp_ack);

	con->out_more = 1;  /* more will follow.. eventually.. */
	con_flag_set(con, CON_FLAG_WRITE_PENDING);
}
1361*4882a593Smuzhiyun 
/*
 * Prepare to share the seq during handshake: like an ack, but without
 * the leading tag byte (the peer is expecting just the seq here).
 */
static void prepare_write_seq(struct ceph_connection *con)
{
	dout("prepare_write_seq %p %llu -> %llu\n", con,
	     con->in_seq_acked, con->in_seq);
	con->in_seq_acked = con->in_seq;

	con_out_kvec_reset(con);

	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
	con_out_kvec_add(con, sizeof (con->out_temp_ack),
			 &con->out_temp_ack);

	con_flag_set(con, CON_FLAG_WRITE_PENDING);
}
1379*4882a593Smuzhiyun 
/*
 * Prepare to write keepalive byte.  With KEEPALIVE2 the tag is
 * followed by the current wall-clock time, which the peer echoes back
 * in its keepalive ack.
 */
static void prepare_write_keepalive(struct ceph_connection *con)
{
	dout("prepare_write_keepalive %p\n", con);
	con_out_kvec_reset(con);
	if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
		struct timespec64 now;

		ktime_get_real_ts64(&now);
		con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
		/* out_temp_keepalive2 must outlive the send */
		ceph_encode_timespec64(&con->out_temp_keepalive2, &now);
		con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
				 &con->out_temp_keepalive2);
	} else {
		/* legacy peer: bare keepalive tag, no timestamp */
		con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
	}
	con_flag_set(con, CON_FLAG_WRITE_PENDING);
}
1400*4882a593Smuzhiyun 
1401*4882a593Smuzhiyun /*
1402*4882a593Smuzhiyun  * Connection negotiation.
1403*4882a593Smuzhiyun  */
1404*4882a593Smuzhiyun 
/*
 * Fetch an authorizer from the upper layer (if it provides one) and
 * record the protocol and buffer length in out_connect.  Returns 0 on
 * success or a negative error from ->get_authorizer().
 */
static int get_connect_authorizer(struct ceph_connection *con)
{
	struct ceph_auth_handshake *auth;
	int auth_proto;

	if (!con->ops->get_authorizer) {
		/* no auth for this connection type */
		con->auth = NULL;
		con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
		con->out_connect.authorizer_len = 0;
		return 0;
	}

	auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry);
	if (IS_ERR(auth))
		return PTR_ERR(auth);

	con->auth = auth;
	con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
	con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len);
	return 0;
}
1426*4882a593Smuzhiyun 
/*
 * We connected to a peer and are saying hello: queue the fixed banner
 * string followed by our own (encoded) entity address.
 */
static void prepare_write_banner(struct ceph_connection *con)
{
	con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
	con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
					&con->msgr->my_enc_addr);

	con->out_more = 0;
	con_flag_set(con, CON_FLAG_WRITE_PENDING);
}
1439*4882a593Smuzhiyun 
/*
 * Queue the already-filled-in connect struct, plus the authorizer
 * buffer when one was obtained by get_connect_authorizer().
 */
static void __prepare_write_connect(struct ceph_connection *con)
{
	con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect);
	if (con->auth)
		con_out_kvec_add(con, con->auth->authorizer_buf_len,
				 con->auth->authorizer_buf);

	con->out_more = 0;
	con_flag_set(con, CON_FLAG_WRITE_PENDING);
}
1450*4882a593Smuzhiyun 
/*
 * Fill in out_connect (features, protocol, seqs, authorizer) for the
 * peer we're dialing and queue it for sending.  Returns 0 or a
 * negative error from the authorizer fetch.
 */
static int prepare_write_connect(struct ceph_connection *con)
{
	unsigned int global_seq = get_global_seq(con->msgr, 0);
	int proto;
	int ret;

	/* pick the wire protocol matching the peer's entity type */
	switch (con->peer_name.type) {
	case CEPH_ENTITY_TYPE_MON:
		proto = CEPH_MONC_PROTOCOL;
		break;
	case CEPH_ENTITY_TYPE_OSD:
		proto = CEPH_OSDC_PROTOCOL;
		break;
	case CEPH_ENTITY_TYPE_MDS:
		proto = CEPH_MDSC_PROTOCOL;
		break;
	default:
		BUG();
	}

	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
	     con->connect_seq, global_seq, proto);

	con->out_connect.features =
	    cpu_to_le64(from_msgr(con->msgr)->supported_features);
	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
	con->out_connect.global_seq = cpu_to_le32(global_seq);
	con->out_connect.protocol_version = cpu_to_le32(proto);
	con->out_connect.flags = 0;

	/* may set con->auth and the authorizer fields of out_connect */
	ret = get_connect_authorizer(con);
	if (ret)
		return ret;

	__prepare_write_connect(con);
	return 0;
}
1489*4882a593Smuzhiyun 
/*
 * write as much of pending kvecs to the socket as we can.
 *  1 -> done
 *  0 -> socket full, but more to do
 * <0 -> error
 */
static int write_partial_kvec(struct ceph_connection *con)
{
	int ret;

	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
	while (con->out_kvec_bytes > 0) {
		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
				       con->out_kvec_left, con->out_kvec_bytes,
				       con->out_more);
		if (ret <= 0)
			goto out;
		con->out_kvec_bytes -= ret;
		if (con->out_kvec_bytes == 0)
			break;            /* done */

		/* short write: advance cur/left past what went out */
		/* account for full iov entries consumed */
		while (ret >= con->out_kvec_cur->iov_len) {
			BUG_ON(!con->out_kvec_left);
			ret -= con->out_kvec_cur->iov_len;
			con->out_kvec_cur++;
			con->out_kvec_left--;
		}
		/* and for a partially-consumed entry */
		if (ret) {
			con->out_kvec_cur->iov_len -= ret;
			con->out_kvec_cur->iov_base += ret;
		}
	}
	/* everything went out; leave the array ready for the next reset */
	con->out_kvec_left = 0;
	ret = 1;
out:
	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
	     con->out_kvec_bytes, con->out_kvec_left, ret);
	return ret;  /* done! */
}
1531*4882a593Smuzhiyun 
/*
 * Fold "length" bytes of "page", starting at page_offset, into the
 * running crc32c value and return the updated crc.  The page is
 * temporarily mapped with kmap() for the computation.
 */
static u32 ceph_crc32c_page(u32 crc, struct page *page,
				unsigned int page_offset,
				unsigned int length)
{
	char *kaddr = kmap(page);

	BUG_ON(!kaddr);
	crc = crc32c(crc, kaddr + page_offset, length);
	kunmap(page);
	return crc;
}
/*
 * Write as much message data payload as we can.  If we finish, queue
 * up the footer.
 *  1 -> done, footer is now queued in out_kvec[].
 *  0 -> socket full, but more to do
 * <0 -> error
 */
static int write_partial_message_data(struct ceph_connection *con)
{
	struct ceph_msg *msg = con->out_msg;
	struct ceph_msg_data_cursor *cursor = &msg->cursor;
	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
	u32 crc;

	dout("%s %p msg %p\n", __func__, con, msg);

	if (!msg->num_data_items)
		return -EINVAL;

	/*
	 * Iterate through each page that contains data to be
	 * written, and send as much as possible for each.
	 *
	 * If we are calculating the data crc (the default), we will
	 * need to map the page.  If we have no pages, they have
	 * been revoked, so use the zero page.
	 */
	/* resume from the crc accumulated by a previous partial write */
	crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
	while (cursor->total_resid) {
		struct page *page;
		size_t page_offset;
		size_t length;
		int ret;

		if (!cursor->resid) {
			/* current data item exhausted; step to the next */
			ceph_msg_data_advance(cursor, 0);
			continue;
		}

		page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
		if (length == cursor->total_resid)
			more = MSG_MORE;	/* last chunk of the payload */
		ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
					more);
		if (ret <= 0) {
			/* stash the crc so we can resume where we left off */
			if (do_datacrc)
				msg->footer.data_crc = cpu_to_le32(crc);

			return ret;
		}
		/* need_crc avoids re-crc'ing a partially-sent piece */
		if (do_datacrc && cursor->need_crc)
			crc = ceph_crc32c_page(crc, page, page_offset, length);
		ceph_msg_data_advance(cursor, (size_t)ret);
	}

	dout("%s %p msg %p done\n", __func__, con, msg);

	/* prepare and queue up footer, too */
	if (do_datacrc)
		msg->footer.data_crc = cpu_to_le32(crc);
	else
		msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
	con_out_kvec_reset(con);
	prepare_write_message_footer(con);

	return 1;	/* must return > 0 to indicate success */
}
1613*4882a593Smuzhiyun 
/*
 * write some zeros: send out_skip bytes from the shared zero page,
 * one page at a time.  Used to pad out revoked/aborted data.
 *  1 -> done, 0 -> socket full, <0 -> error.
 */
static int write_partial_skip(struct ceph_connection *con)
{
	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
	int ret;

	dout("%s %p %d left\n", __func__, con, con->out_skip);
	while (con->out_skip > 0) {
		size_t size = min(con->out_skip, (int) PAGE_SIZE);

		if (size == con->out_skip)
			more = MSG_MORE;	/* final chunk of the skip */
		ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more);
		if (ret <= 0)
			goto out;
		con->out_skip -= ret;
	}
	ret = 1;
out:
	return ret;
}
1637*4882a593Smuzhiyun 
/*
 * Prepare to read connection handshake, or an ack.
 */
/* Rewind the read position so the peer's banner can be read next. */
static void prepare_read_banner(struct ceph_connection *con)
{
	dout("prepare_read_banner %p\n", con);
	con->in_base_pos = 0;
}
1646*4882a593Smuzhiyun 
/* Rewind the read position before reading the connect reply. */
static void prepare_read_connect(struct ceph_connection *con)
{
	dout("prepare_read_connect %p\n", con);
	con->in_base_pos = 0;
}
1652*4882a593Smuzhiyun 
/* Rewind the read position before reading an incoming ack. */
static void prepare_read_ack(struct ceph_connection *con)
{
	dout("prepare_read_ack %p\n", con);
	con->in_base_pos = 0;
}
1658*4882a593Smuzhiyun 
/* Rewind the read position and expect a SEQ tag (handshake seq exchange). */
static void prepare_read_seq(struct ceph_connection *con)
{
	dout("prepare_read_seq %p\n", con);
	con->in_base_pos = 0;
	con->in_tag = CEPH_MSGR_TAG_SEQ;
}
1665*4882a593Smuzhiyun 
/* Rewind the read position and wait for the next tag byte. */
static void prepare_read_tag(struct ceph_connection *con)
{
	dout("prepare_read_tag %p\n", con);
	con->in_base_pos = 0;
	con->in_tag = CEPH_MSGR_TAG_READY;
}
1672*4882a593Smuzhiyun 
/* Rewind the read position before reading a keepalive2 ack. */
static void prepare_read_keepalive_ack(struct ceph_connection *con)
{
	dout("prepare_read_keepalive_ack %p\n", con);
	con->in_base_pos = 0;
}
1678*4882a593Smuzhiyun 
/*
 * Prepare to read a message: reset the read position and the running
 * front/middle/data CRC accumulators.  Always returns 0.
 */
static int prepare_read_message(struct ceph_connection *con)
{
	dout("prepare_read_message %p\n", con);
	BUG_ON(con->in_msg != NULL);
	con->in_base_pos = 0;
	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
	return 0;
}
1690*4882a593Smuzhiyun 
1691*4882a593Smuzhiyun 
read_partial(struct ceph_connection * con,int end,int size,void * object)1692*4882a593Smuzhiyun static int read_partial(struct ceph_connection *con,
1693*4882a593Smuzhiyun 			int end, int size, void *object)
1694*4882a593Smuzhiyun {
1695*4882a593Smuzhiyun 	while (con->in_base_pos < end) {
1696*4882a593Smuzhiyun 		int left = end - con->in_base_pos;
1697*4882a593Smuzhiyun 		int have = size - left;
1698*4882a593Smuzhiyun 		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
1699*4882a593Smuzhiyun 		if (ret <= 0)
1700*4882a593Smuzhiyun 			return ret;
1701*4882a593Smuzhiyun 		con->in_base_pos += ret;
1702*4882a593Smuzhiyun 	}
1703*4882a593Smuzhiyun 	return 1;
1704*4882a593Smuzhiyun }
1705*4882a593Smuzhiyun 
1706*4882a593Smuzhiyun 
/*
 * Read all or part of the connect-side handshake on a new connection:
 * the peer's banner, the peer's own address, and the address the peer
 * sees us as.  Returns read_partial() semantics (1 done, 0 again, <0
 * error).
 */
static int read_partial_banner(struct ceph_connection *con)
{
	int size;
	int end;
	int ret;

	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);

	/* peer's banner */
	size = strlen(CEPH_BANNER);
	end = size;
	ret = read_partial(con, end, size, con->in_banner);
	if (ret <= 0)
		goto out;

	/* the address the peer advertises for itself */
	size = sizeof (con->actual_peer_addr);
	end += size;
	ret = read_partial(con, end, size, &con->actual_peer_addr);
	if (ret <= 0)
		goto out;
	ceph_decode_banner_addr(&con->actual_peer_addr);

	/* the address the peer observed us connecting from */
	size = sizeof (con->peer_addr_for_me);
	end += size;
	ret = read_partial(con, end, size, &con->peer_addr_for_me);
	if (ret <= 0)
		goto out;
	ceph_decode_banner_addr(&con->peer_addr_for_me);

out:
	return ret;
}
1742*4882a593Smuzhiyun 
/*
 * Read the peer's connect reply, followed by its authorizer reply
 * when we sent an authorizer.  Returns read_partial() semantics, or
 * -EINVAL if the advertised authorizer reply exceeds our buffer.
 */
static int read_partial_connect(struct ceph_connection *con)
{
	int size;
	int end;
	int ret;

	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);

	size = sizeof (con->in_reply);
	end = size;
	ret = read_partial(con, end, size, &con->in_reply);
	if (ret <= 0)
		goto out;

	if (con->auth) {
		/* authorizer_len comes from the wire; bound it first */
		size = le32_to_cpu(con->in_reply.authorizer_len);
		if (size > con->auth->authorizer_reply_buf_len) {
			pr_err("authorizer reply too big: %d > %zu\n", size,
			       con->auth->authorizer_reply_buf_len);
			ret = -EINVAL;
			goto out;
		}

		end += size;
		ret = read_partial(con, end, size,
				   con->auth->authorizer_reply_buf);
		if (ret <= 0)
			goto out;
	}

	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
	     con, (int)con->in_reply.tag,
	     le32_to_cpu(con->in_reply.connect_seq),
	     le32_to_cpu(con->in_reply.global_seq));
out:
	return ret;
}
1780*4882a593Smuzhiyun 
/*
 * Verify the hello banner looks okay.  Returns 0 if the peer sent the
 * expected CEPH_BANNER prefix, -1 (and sets con->error_msg) otherwise.
 */
static int verify_hello(struct ceph_connection *con)
{
	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
		pr_err("connect to %s got bad banner\n",
		       ceph_pr_addr(&con->peer_addr));
		con->error_msg = "protocol error, bad banner";
		return -1;
	}
	return 0;
}
1794*4882a593Smuzhiyun 
addr_is_blank(struct ceph_entity_addr * addr)1795*4882a593Smuzhiyun static bool addr_is_blank(struct ceph_entity_addr *addr)
1796*4882a593Smuzhiyun {
1797*4882a593Smuzhiyun 	struct sockaddr_storage ss = addr->in_addr; /* align */
1798*4882a593Smuzhiyun 	struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr;
1799*4882a593Smuzhiyun 	struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr;
1800*4882a593Smuzhiyun 
1801*4882a593Smuzhiyun 	switch (ss.ss_family) {
1802*4882a593Smuzhiyun 	case AF_INET:
1803*4882a593Smuzhiyun 		return addr4->s_addr == htonl(INADDR_ANY);
1804*4882a593Smuzhiyun 	case AF_INET6:
1805*4882a593Smuzhiyun 		return ipv6_addr_any(addr6);
1806*4882a593Smuzhiyun 	default:
1807*4882a593Smuzhiyun 		return true;
1808*4882a593Smuzhiyun 	}
1809*4882a593Smuzhiyun }
1810*4882a593Smuzhiyun 
addr_port(struct ceph_entity_addr * addr)1811*4882a593Smuzhiyun static int addr_port(struct ceph_entity_addr *addr)
1812*4882a593Smuzhiyun {
1813*4882a593Smuzhiyun 	switch (get_unaligned(&addr->in_addr.ss_family)) {
1814*4882a593Smuzhiyun 	case AF_INET:
1815*4882a593Smuzhiyun 		return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port));
1816*4882a593Smuzhiyun 	case AF_INET6:
1817*4882a593Smuzhiyun 		return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port));
1818*4882a593Smuzhiyun 	}
1819*4882a593Smuzhiyun 	return 0;
1820*4882a593Smuzhiyun }
1821*4882a593Smuzhiyun 
addr_set_port(struct ceph_entity_addr * addr,int p)1822*4882a593Smuzhiyun static void addr_set_port(struct ceph_entity_addr *addr, int p)
1823*4882a593Smuzhiyun {
1824*4882a593Smuzhiyun 	switch (get_unaligned(&addr->in_addr.ss_family)) {
1825*4882a593Smuzhiyun 	case AF_INET:
1826*4882a593Smuzhiyun 		put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port);
1827*4882a593Smuzhiyun 		break;
1828*4882a593Smuzhiyun 	case AF_INET6:
1829*4882a593Smuzhiyun 		put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port);
1830*4882a593Smuzhiyun 		break;
1831*4882a593Smuzhiyun 	}
1832*4882a593Smuzhiyun }
1833*4882a593Smuzhiyun 
1834*4882a593Smuzhiyun /*
1835*4882a593Smuzhiyun  * Unlike other *_pton function semantics, zero indicates success.
1836*4882a593Smuzhiyun  */
ceph_pton(const char * str,size_t len,struct ceph_entity_addr * addr,char delim,const char ** ipend)1837*4882a593Smuzhiyun static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr,
1838*4882a593Smuzhiyun 		char delim, const char **ipend)
1839*4882a593Smuzhiyun {
1840*4882a593Smuzhiyun 	memset(&addr->in_addr, 0, sizeof(addr->in_addr));
1841*4882a593Smuzhiyun 
1842*4882a593Smuzhiyun 	if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) {
1843*4882a593Smuzhiyun 		put_unaligned(AF_INET, &addr->in_addr.ss_family);
1844*4882a593Smuzhiyun 		return 0;
1845*4882a593Smuzhiyun 	}
1846*4882a593Smuzhiyun 
1847*4882a593Smuzhiyun 	if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) {
1848*4882a593Smuzhiyun 		put_unaligned(AF_INET6, &addr->in_addr.ss_family);
1849*4882a593Smuzhiyun 		return 0;
1850*4882a593Smuzhiyun 	}
1851*4882a593Smuzhiyun 
1852*4882a593Smuzhiyun 	return -EINVAL;
1853*4882a593Smuzhiyun }
1854*4882a593Smuzhiyun 
1855*4882a593Smuzhiyun /*
1856*4882a593Smuzhiyun  * Extract hostname string and resolve using kernel DNS facility.
1857*4882a593Smuzhiyun  */
1858*4882a593Smuzhiyun #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER
/*
 * Extract the hostname portion of "name" (terminated by "delim" or a
 * ':' port marker, whichever comes first) and resolve it via the
 * kernel DNS resolver.  On success *ipend points just past the
 * hostname and *addr holds the resolved address; returns 0, or a
 * negative error (-EINVAL for an empty name, -ESRCH when the upcall
 * yields nothing).
 */
static int ceph_dns_resolve_name(const char *name, size_t namelen,
		struct ceph_entity_addr *addr, char delim, const char **ipend)
{
	const char *end, *delim_p;
	char *colon_p, *ip_addr = NULL;
	int ip_len, ret;

	/*
	 * The end of the hostname occurs immediately preceding the delimiter or
	 * the port marker (':') where the delimiter takes precedence.
	 */
	delim_p = memchr(name, delim, namelen);
	colon_p = memchr(name, ':', namelen);

	if (delim_p && colon_p)
		end = delim_p < colon_p ? delim_p : colon_p;
	else if (!delim_p && colon_p)
		end = colon_p;
	else {
		end = delim_p;
		if (!end) /* case: hostname:/ */
			end = name + namelen;
	}

	if (end <= name)
		return -EINVAL;

	/* do dns_resolve upcall */
	ip_len = dns_query(current->nsproxy->net_ns,
			   NULL, name, end - name, NULL, &ip_addr, NULL, false);
	if (ip_len > 0)
		/* parse the textual address the resolver handed back */
		ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL);
	else
		ret = -ESRCH;

	kfree(ip_addr);

	*ipend = end;

	pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name,
			ret, ret ? "failed" : ceph_pr_addr(addr));

	return ret;
}
1903*4882a593Smuzhiyun #else
/* DNS resolver not compiled in: hostnames can never be resolved */
static inline int ceph_dns_resolve_name(const char *name, size_t namelen,
		struct ceph_entity_addr *addr, char delim, const char **ipend)
{
	return -EINVAL;
}
1909*4882a593Smuzhiyun #endif
1910*4882a593Smuzhiyun 
1911*4882a593Smuzhiyun /*
1912*4882a593Smuzhiyun  * Parse a server name (IP or hostname). If a valid IP address is not found
1913*4882a593Smuzhiyun  * then try to extract a hostname to resolve using userspace DNS upcall.
1914*4882a593Smuzhiyun  */
static int ceph_parse_server_name(const char *name, size_t namelen,
		struct ceph_entity_addr *addr, char delim, const char **ipend)
{
	int ret = ceph_pton(name, namelen, addr, delim, ipend);

	if (!ret)
		return 0;	/* it was a literal IP */

	/* not a literal IP address: try the DNS upcall (if available) */
	return ceph_dns_resolve_name(name, namelen, addr, delim, ipend);
}
1926*4882a593Smuzhiyun 
1927*4882a593Smuzhiyun /*
1928*4882a593Smuzhiyun  * Parse an ip[:port] list into an addr array.  Use the default
1929*4882a593Smuzhiyun  * monitor port if a port isn't specified.
1930*4882a593Smuzhiyun  */
ceph_parse_ips(const char * c,const char * end,struct ceph_entity_addr * addr,int max_count,int * count)1931*4882a593Smuzhiyun int ceph_parse_ips(const char *c, const char *end,
1932*4882a593Smuzhiyun 		   struct ceph_entity_addr *addr,
1933*4882a593Smuzhiyun 		   int max_count, int *count)
1934*4882a593Smuzhiyun {
1935*4882a593Smuzhiyun 	int i, ret = -EINVAL;
1936*4882a593Smuzhiyun 	const char *p = c;
1937*4882a593Smuzhiyun 
1938*4882a593Smuzhiyun 	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
1939*4882a593Smuzhiyun 	for (i = 0; i < max_count; i++) {
1940*4882a593Smuzhiyun 		const char *ipend;
1941*4882a593Smuzhiyun 		int port;
1942*4882a593Smuzhiyun 		char delim = ',';
1943*4882a593Smuzhiyun 
1944*4882a593Smuzhiyun 		if (*p == '[') {
1945*4882a593Smuzhiyun 			delim = ']';
1946*4882a593Smuzhiyun 			p++;
1947*4882a593Smuzhiyun 		}
1948*4882a593Smuzhiyun 
1949*4882a593Smuzhiyun 		ret = ceph_parse_server_name(p, end - p, &addr[i], delim, &ipend);
1950*4882a593Smuzhiyun 		if (ret)
1951*4882a593Smuzhiyun 			goto bad;
1952*4882a593Smuzhiyun 		ret = -EINVAL;
1953*4882a593Smuzhiyun 
1954*4882a593Smuzhiyun 		p = ipend;
1955*4882a593Smuzhiyun 
1956*4882a593Smuzhiyun 		if (delim == ']') {
1957*4882a593Smuzhiyun 			if (*p != ']') {
1958*4882a593Smuzhiyun 				dout("missing matching ']'\n");
1959*4882a593Smuzhiyun 				goto bad;
1960*4882a593Smuzhiyun 			}
1961*4882a593Smuzhiyun 			p++;
1962*4882a593Smuzhiyun 		}
1963*4882a593Smuzhiyun 
1964*4882a593Smuzhiyun 		/* port? */
1965*4882a593Smuzhiyun 		if (p < end && *p == ':') {
1966*4882a593Smuzhiyun 			port = 0;
1967*4882a593Smuzhiyun 			p++;
1968*4882a593Smuzhiyun 			while (p < end && *p >= '0' && *p <= '9') {
1969*4882a593Smuzhiyun 				port = (port * 10) + (*p - '0');
1970*4882a593Smuzhiyun 				p++;
1971*4882a593Smuzhiyun 			}
1972*4882a593Smuzhiyun 			if (port == 0)
1973*4882a593Smuzhiyun 				port = CEPH_MON_PORT;
1974*4882a593Smuzhiyun 			else if (port > 65535)
1975*4882a593Smuzhiyun 				goto bad;
1976*4882a593Smuzhiyun 		} else {
1977*4882a593Smuzhiyun 			port = CEPH_MON_PORT;
1978*4882a593Smuzhiyun 		}
1979*4882a593Smuzhiyun 
1980*4882a593Smuzhiyun 		addr_set_port(&addr[i], port);
1981*4882a593Smuzhiyun 		addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
1982*4882a593Smuzhiyun 
1983*4882a593Smuzhiyun 		dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
1984*4882a593Smuzhiyun 
1985*4882a593Smuzhiyun 		if (p == end)
1986*4882a593Smuzhiyun 			break;
1987*4882a593Smuzhiyun 		if (*p != ',')
1988*4882a593Smuzhiyun 			goto bad;
1989*4882a593Smuzhiyun 		p++;
1990*4882a593Smuzhiyun 	}
1991*4882a593Smuzhiyun 
1992*4882a593Smuzhiyun 	if (p != end)
1993*4882a593Smuzhiyun 		goto bad;
1994*4882a593Smuzhiyun 
1995*4882a593Smuzhiyun 	if (count)
1996*4882a593Smuzhiyun 		*count = i + 1;
1997*4882a593Smuzhiyun 	return 0;
1998*4882a593Smuzhiyun 
1999*4882a593Smuzhiyun bad:
2000*4882a593Smuzhiyun 	return ret;
2001*4882a593Smuzhiyun }
2002*4882a593Smuzhiyun 
/*
 * Validate the banner exchange we just completed: verify the hello
 * string, check that the peer at this address is the one we expected,
 * and learn our own externally-visible address if we didn't know it.
 * Returns 0 on success, -1 on protocol error (con->error_msg may be
 * set for the caller to report).
 */
static int process_banner(struct ceph_connection *con)
{
	dout("process_banner on %p\n", con);

	if (verify_hello(con) < 0)
		return -1;

	/*
	 * Make sure the other end is who we wanted.  note that the other
	 * end may not yet know their ip address, so if it's 0.0.0.0, give
	 * them the benefit of the doubt.
	 */
	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
		   sizeof(con->peer_addr)) != 0 &&
	    !(addr_is_blank(&con->actual_peer_addr) &&
	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
		pr_warn("wrong peer, want %s/%u, got %s/%u\n",
			ceph_pr_addr(&con->peer_addr),
			le32_to_cpu(con->peer_addr.nonce),
			ceph_pr_addr(&con->actual_peer_addr),
			le32_to_cpu(con->actual_peer_addr.nonce));
		con->error_msg = "wrong peer at address";
		return -1;
	}

	/*
	 * did we learn our address?
	 */
	if (addr_is_blank(&con->msgr->inst.addr)) {
		/* keep our own port, but take the IP the peer saw us at */
		int port = addr_port(&con->msgr->inst.addr);

		memcpy(&con->msgr->inst.addr.in_addr,
		       &con->peer_addr_for_me.in_addr,
		       sizeof(con->peer_addr_for_me.in_addr));
		addr_set_port(&con->msgr->inst.addr, port);
		encode_my_addr(con->msgr);
		dout("process_banner learned my addr is %s\n",
		     ceph_pr_addr(&con->msgr->inst.addr));
	}

	return 0;
}
2045*4882a593Smuzhiyun 
/*
 * Handle the server's reply to our connect message.  Depending on the
 * reply tag this either completes the handshake (READY/SEQ), retries it
 * with updated parameters (BADAUTHORIZER, RESETSESSION, RETRY_*), or
 * fails it (FEATURES, BADPROTOVER, WAIT, garbage).
 *
 * Called with con->mutex held (dropped temporarily for the peer_reset
 * callback).  Returns 0 to continue, -EAGAIN to restart processing,
 * a negative errno, or -1 on protocol error with con->error_msg set.
 */
static int process_connect(struct ceph_connection *con)
{
	u64 sup_feat = from_msgr(con->msgr)->supported_features;
	u64 req_feat = from_msgr(con->msgr)->required_features;
	u64 server_feat = le64_to_cpu(con->in_reply.features);
	int ret;

	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);

	if (con->auth) {
		int len = le32_to_cpu(con->in_reply.authorizer_len);

		/*
		 * Any connection that defines ->get_authorizer()
		 * should also define ->add_authorizer_challenge() and
		 * ->verify_authorizer_reply().
		 *
		 * See get_connect_authorizer().
		 */
		if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
			/* server sent a challenge: answer it and reconnect */
			ret = con->ops->add_authorizer_challenge(
				    con, con->auth->authorizer_reply_buf, len);
			if (ret < 0)
				return ret;

			con_out_kvec_reset(con);
			__prepare_write_connect(con);
			prepare_read_connect(con);
			return 0;
		}

		if (len) {
			ret = con->ops->verify_authorizer_reply(con);
			if (ret < 0) {
				con->error_msg = "bad authorize reply";
				return ret;
			}
		}
	}

	switch (con->in_reply.tag) {
	case CEPH_MSGR_TAG_FEATURES:
		/* server requires features we don't support: fatal */
		pr_err("%s%lld %s feature set mismatch,"
		       " my %llx < server's %llx, missing %llx\n",
		       ENTITY_NAME(con->peer_name),
		       ceph_pr_addr(&con->peer_addr),
		       sup_feat, server_feat, server_feat & ~sup_feat);
		con->error_msg = "missing required protocol features";
		reset_connection(con);
		return -1;

	case CEPH_MSGR_TAG_BADPROTOVER:
		pr_err("%s%lld %s protocol version mismatch,"
		       " my %d != server's %d\n",
		       ENTITY_NAME(con->peer_name),
		       ceph_pr_addr(&con->peer_addr),
		       le32_to_cpu(con->out_connect.protocol_version),
		       le32_to_cpu(con->in_reply.protocol_version));
		con->error_msg = "protocol version mismatch";
		reset_connection(con);
		return -1;

	case CEPH_MSGR_TAG_BADAUTHORIZER:
		/* retry the handshake once with a fresh authorizer */
		con->auth_retry++;
		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
		     con->auth_retry);
		if (con->auth_retry == 2) {
			con->error_msg = "connect authorization failure";
			return -1;
		}
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);
		break;

	case CEPH_MSGR_TAG_RESETSESSION:
		/*
		 * If we connected with a large connect_seq but the peer
		 * has no record of a session with us (no connection, or
		 * connect_seq == 0), they will send RESETSESION to indicate
		 * that they must have reset their session, and may have
		 * dropped messages.
		 */
		dout("process_connect got RESET peer seq %u\n",
		     le32_to_cpu(con->in_reply.connect_seq));
		pr_err("%s%lld %s connection reset\n",
		       ENTITY_NAME(con->peer_name),
		       ceph_pr_addr(&con->peer_addr));
		reset_connection(con);
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);

		/* Tell ceph about it. */
		mutex_unlock(&con->mutex);
		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
		if (con->ops->peer_reset)
			con->ops->peer_reset(con);
		mutex_lock(&con->mutex);
		/* state may have changed while the mutex was dropped */
		if (con->state != CON_STATE_NEGOTIATING)
			return -EAGAIN;
		break;

	case CEPH_MSGR_TAG_RETRY_SESSION:
		/*
		 * If we sent a smaller connect_seq than the peer has, try
		 * again with a larger value.
		 */
		dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
		     le32_to_cpu(con->out_connect.connect_seq),
		     le32_to_cpu(con->in_reply.connect_seq));
		con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);
		break;

	case CEPH_MSGR_TAG_RETRY_GLOBAL:
		/*
		 * If we sent a smaller global_seq than the peer has, try
		 * again with a larger value.
		 */
		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
		     con->peer_global_seq,
		     le32_to_cpu(con->in_reply.global_seq));
		get_global_seq(con->msgr,
			       le32_to_cpu(con->in_reply.global_seq));
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);
		break;

	case CEPH_MSGR_TAG_SEQ:
	case CEPH_MSGR_TAG_READY:
		/* handshake accepted; final feature sanity check */
		if (req_feat & ~server_feat) {
			pr_err("%s%lld %s protocol feature mismatch,"
			       " my required %llx > server's %llx, need %llx\n",
			       ENTITY_NAME(con->peer_name),
			       ceph_pr_addr(&con->peer_addr),
			       req_feat, server_feat, req_feat & ~server_feat);
			con->error_msg = "missing required protocol features";
			reset_connection(con);
			return -1;
		}

		WARN_ON(con->state != CON_STATE_NEGOTIATING);
		con->state = CON_STATE_OPEN;
		con->auth_retry = 0;    /* we authenticated; clear flag */
		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
		con->connect_seq++;
		con->peer_features = server_feat;
		dout("process_connect got READY gseq %d cseq %d (%d)\n",
		     con->peer_global_seq,
		     le32_to_cpu(con->in_reply.connect_seq),
		     con->connect_seq);
		WARN_ON(con->connect_seq !=
			le32_to_cpu(con->in_reply.connect_seq));

		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
			con_flag_set(con, CON_FLAG_LOSSYTX);

		con->delay = 0;      /* reset backoff memory */

		/* SEQ means the server also wants an ack-seq exchange first */
		if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) {
			prepare_write_seq(con);
			prepare_read_seq(con);
		} else {
			prepare_read_tag(con);
		}
		break;

	case CEPH_MSGR_TAG_WAIT:
		/*
		 * If there is a connection race (we are opening
		 * connections to each other), one of us may just have
		 * to WAIT.  This shouldn't happen if we are the
		 * client.
		 */
		con->error_msg = "protocol error, got WAIT as client";
		return -1;

	default:
		con->error_msg = "protocol error, garbage tag during connect";
		return -1;
	}
	return 0;
}
2241*4882a593Smuzhiyun 
2242*4882a593Smuzhiyun 
2243*4882a593Smuzhiyun /*
2244*4882a593Smuzhiyun  * read (part of) an ack
2245*4882a593Smuzhiyun  */
read_partial_ack(struct ceph_connection * con)2246*4882a593Smuzhiyun static int read_partial_ack(struct ceph_connection *con)
2247*4882a593Smuzhiyun {
2248*4882a593Smuzhiyun 	int size = sizeof (con->in_temp_ack);
2249*4882a593Smuzhiyun 	int end = size;
2250*4882a593Smuzhiyun 
2251*4882a593Smuzhiyun 	return read_partial(con, end, size, &con->in_temp_ack);
2252*4882a593Smuzhiyun }
2253*4882a593Smuzhiyun 
2254*4882a593Smuzhiyun /*
2255*4882a593Smuzhiyun  * We can finally discard anything that's been acked.
2256*4882a593Smuzhiyun  */
process_ack(struct ceph_connection * con)2257*4882a593Smuzhiyun static void process_ack(struct ceph_connection *con)
2258*4882a593Smuzhiyun {
2259*4882a593Smuzhiyun 	struct ceph_msg *m;
2260*4882a593Smuzhiyun 	u64 ack = le64_to_cpu(con->in_temp_ack);
2261*4882a593Smuzhiyun 	u64 seq;
2262*4882a593Smuzhiyun 	bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ);
2263*4882a593Smuzhiyun 	struct list_head *list = reconnect ? &con->out_queue : &con->out_sent;
2264*4882a593Smuzhiyun 
2265*4882a593Smuzhiyun 	/*
2266*4882a593Smuzhiyun 	 * In the reconnect case, con_fault() has requeued messages
2267*4882a593Smuzhiyun 	 * in out_sent. We should cleanup old messages according to
2268*4882a593Smuzhiyun 	 * the reconnect seq.
2269*4882a593Smuzhiyun 	 */
2270*4882a593Smuzhiyun 	while (!list_empty(list)) {
2271*4882a593Smuzhiyun 		m = list_first_entry(list, struct ceph_msg, list_head);
2272*4882a593Smuzhiyun 		if (reconnect && m->needs_out_seq)
2273*4882a593Smuzhiyun 			break;
2274*4882a593Smuzhiyun 		seq = le64_to_cpu(m->hdr.seq);
2275*4882a593Smuzhiyun 		if (seq > ack)
2276*4882a593Smuzhiyun 			break;
2277*4882a593Smuzhiyun 		dout("got ack for seq %llu type %d at %p\n", seq,
2278*4882a593Smuzhiyun 		     le16_to_cpu(m->hdr.type), m);
2279*4882a593Smuzhiyun 		m->ack_stamp = jiffies;
2280*4882a593Smuzhiyun 		ceph_msg_remove(m);
2281*4882a593Smuzhiyun 	}
2282*4882a593Smuzhiyun 
2283*4882a593Smuzhiyun 	prepare_read_tag(con);
2284*4882a593Smuzhiyun }
2285*4882a593Smuzhiyun 
2286*4882a593Smuzhiyun 
/*
 * Read (the remainder of) one fixed-size message section into the given
 * kvec, resuming from section->iov_len.  Once the section is complete,
 * compute its crc32c into *crc.  Returns 1 when done, or the 0/-errno
 * result of the socket read otherwise.
 */
static int read_partial_message_section(struct ceph_connection *con,
					struct kvec *section,
					unsigned int sec_len, u32 *crc)
{
	BUG_ON(!section);

	while (section->iov_len < sec_len) {
		int want = sec_len - section->iov_len;
		int got;

		BUG_ON(section->iov_base == NULL);
		got = ceph_tcp_recvmsg(con->sock,
				       (char *)section->iov_base + section->iov_len,
				       want);
		if (got <= 0)
			return got;
		section->iov_len += got;
	}

	if (section->iov_len == sec_len)
		*crc = crc32c(0, section->iov_base, section->iov_len);

	return 1;
}
2309*4882a593Smuzhiyun 
/*
 * Receive (part of) the incoming message's data payload, advancing the
 * message's data cursor.  Unless NOCRC is set, the running data crc is
 * kept in con->in_data_crc so a partial read can resume later.
 * Returns 1 when the payload is complete, or 0/-errno from the socket.
 */
static int read_partial_msg_data(struct ceph_connection *con)
{
	struct ceph_msg *msg = con->in_msg;
	struct ceph_msg_data_cursor *cursor = &msg->cursor;
	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
	struct page *page;
	size_t page_offset;
	size_t length;
	u32 crc = 0;
	int ret;

	if (!msg->num_data_items)
		return -EIO;

	if (do_datacrc)
		crc = con->in_data_crc;	/* resume crc from previous call */
	while (cursor->total_resid) {
		if (!cursor->resid) {
			/* current data item exhausted; step to the next one */
			ceph_msg_data_advance(cursor, 0);
			continue;
		}

		page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
		ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
		if (ret <= 0) {
			/* save crc progress before bailing out */
			if (do_datacrc)
				con->in_data_crc = crc;

			return ret;
		}

		if (do_datacrc)
			crc = ceph_crc32c_page(crc, page, page_offset, ret);
		ceph_msg_data_advance(cursor, (size_t)ret);
	}
	if (do_datacrc)
		con->in_data_crc = crc;

	return 1;	/* must return > 0 to indicate success */
}
2350*4882a593Smuzhiyun 
2351*4882a593Smuzhiyun /*
2352*4882a593Smuzhiyun  * read (part of) a message.
2353*4882a593Smuzhiyun  */
2354*4882a593Smuzhiyun static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
2355*4882a593Smuzhiyun 
/*
 * Read (part of) an incoming message: header, front, middle, data and
 * footer, in that order.  Each stage resumes where the previous call
 * left off (read_partial() tracks progress in con->in_base_pos), so
 * this function is re-entered until the whole message has arrived.
 * Returns 1 when the message is complete and its crcs/signature check
 * out, 0 to come back later, or a negative errno on error.
 */
static int read_partial_message(struct ceph_connection *con)
{
	struct ceph_msg *m = con->in_msg;
	int size;
	int end;
	int ret;
	unsigned int front_len, middle_len, data_len;
	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
	bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
	u64 seq;
	u32 crc;

	dout("read_partial_message con %p msg %p\n", con, m);

	/* header */
	size = sizeof (con->in_hdr);
	end = size;
	ret = read_partial(con, end, size, &con->in_hdr);
	if (ret <= 0)
		return ret;

	/* verify the header crc before trusting any of its fields */
	crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
	if (cpu_to_le32(crc) != con->in_hdr.crc) {
		pr_err("read_partial_message bad hdr crc %u != expected %u\n",
		       crc, con->in_hdr.crc);
		return -EBADMSG;
	}

	/* bound each section length before allocating/reading */
	front_len = le32_to_cpu(con->in_hdr.front_len);
	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
		return -EIO;
	middle_len = le32_to_cpu(con->in_hdr.middle_len);
	if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
		return -EIO;
	data_len = le32_to_cpu(con->in_hdr.data_len);
	if (data_len > CEPH_MSG_MAX_DATA_LEN)
		return -EIO;

	/* verify seq# */
	seq = le64_to_cpu(con->in_hdr.seq);
	if ((s64)seq - (s64)con->in_seq < 1) {
		/* duplicate/old message: skip its payload entirely */
		pr_info("skipping %s%lld %s seq %lld expected %lld\n",
			ENTITY_NAME(con->peer_name),
			ceph_pr_addr(&con->peer_addr),
			seq, con->in_seq + 1);
		/* negative in_base_pos means "discard this many bytes" */
		con->in_base_pos = -front_len - middle_len - data_len -
			sizeof_footer(con);
		con->in_tag = CEPH_MSGR_TAG_READY;
		return 1;
	} else if ((s64)seq - (s64)con->in_seq > 1) {
		pr_err("read_partial_message bad seq %lld expected %lld\n",
		       seq, con->in_seq + 1);
		con->error_msg = "bad message sequence # for incoming message";
		return -EBADE;
	}

	/* allocate message? */
	if (!con->in_msg) {
		int skip = 0;

		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
		     front_len, data_len);
		ret = ceph_con_in_msg_alloc(con, &skip);
		if (ret < 0)
			return ret;

		/* exactly one of (msg allocated, skip requested) must hold */
		BUG_ON(!con->in_msg ^ skip);
		if (skip) {
			/* skip this message */
			dout("alloc_msg said skip message\n");
			con->in_base_pos = -front_len - middle_len - data_len -
				sizeof_footer(con);
			con->in_tag = CEPH_MSGR_TAG_READY;
			con->in_seq++;
			return 1;
		}

		BUG_ON(!con->in_msg);
		BUG_ON(con->in_msg->con != con);
		m = con->in_msg;
		m->front.iov_len = 0;    /* haven't read it yet */
		if (m->middle)
			m->middle->vec.iov_len = 0;

		/* prepare for data payload, if any */

		if (data_len)
			prepare_message_data(con->in_msg, data_len);
	}

	/* front */
	ret = read_partial_message_section(con, &m->front, front_len,
					   &con->in_front_crc);
	if (ret <= 0)
		return ret;

	/* middle */
	if (m->middle) {
		ret = read_partial_message_section(con, &m->middle->vec,
						   middle_len,
						   &con->in_middle_crc);
		if (ret <= 0)
			return ret;
	}

	/* (page) data */
	if (data_len) {
		ret = read_partial_msg_data(con);
		if (ret <= 0)
			return ret;
	}

	/* footer */
	size = sizeof_footer(con);
	end += size;
	ret = read_partial(con, end, size, &m->footer);
	if (ret <= 0)
		return ret;

	if (!need_sign) {
		/* old-format footer: take flags from it, no signature */
		m->footer.flags = m->old_footer.flags;
		m->footer.sig = 0;
	}

	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
	     m, front_len, m->footer.front_crc, middle_len,
	     m->footer.middle_crc, data_len, m->footer.data_crc);

	/* crc ok? */
	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
		pr_err("read_partial_message %p front crc %u != exp. %u\n",
		       m, con->in_front_crc, m->footer.front_crc);
		return -EBADMSG;
	}
	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
		pr_err("read_partial_message %p middle crc %u != exp %u\n",
		       m, con->in_middle_crc, m->footer.middle_crc);
		return -EBADMSG;
	}
	if (do_datacrc &&
	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
		return -EBADMSG;
	}

	if (need_sign && con->ops->check_message_signature &&
	    con->ops->check_message_signature(m)) {
		pr_err("read_partial_message %p signature check failed\n", m);
		return -EBADMSG;
	}

	return 1; /* done! */
}
2511*4882a593Smuzhiyun 
/*
 * Process message.  This happens in the worker thread.  The callback should
 * be careful not to do anything that waits on other incoming messages or it
 * may deadlock.
 *
 * Called with con->mutex held; the mutex is dropped around the dispatch
 * callback and re-acquired before returning.  The reference held in
 * con->in_msg is handed off to the dispatch callback (it is not put here).
 */
static void process_message(struct ceph_connection *con)
{
	struct ceph_msg *msg = con->in_msg;

	BUG_ON(con->in_msg->con != con);
	con->in_msg = NULL;	/* ref now travels with local msg */

	/* if first message, set peer_name */
	if (con->peer_name.type == 0)
		con->peer_name = msg->hdr.src;

	/* bump in_seq while still holding the mutex */
	con->in_seq++;
	mutex_unlock(&con->mutex);

	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
	     msg, le64_to_cpu(msg->hdr.seq),
	     ENTITY_NAME(msg->hdr.src),
	     le16_to_cpu(msg->hdr.type),
	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
	     le32_to_cpu(msg->hdr.front_len),
	     le32_to_cpu(msg->hdr.data_len),
	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
	con->ops->dispatch(con, msg);	/* may block; mutex not held */

	mutex_lock(&con->mutex);
}
2543*4882a593Smuzhiyun 
read_keepalive_ack(struct ceph_connection * con)2544*4882a593Smuzhiyun static int read_keepalive_ack(struct ceph_connection *con)
2545*4882a593Smuzhiyun {
2546*4882a593Smuzhiyun 	struct ceph_timespec ceph_ts;
2547*4882a593Smuzhiyun 	size_t size = sizeof(ceph_ts);
2548*4882a593Smuzhiyun 	int ret = read_partial(con, size, size, &ceph_ts);
2549*4882a593Smuzhiyun 	if (ret <= 0)
2550*4882a593Smuzhiyun 		return ret;
2551*4882a593Smuzhiyun 	ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
2552*4882a593Smuzhiyun 	prepare_read_tag(con);
2553*4882a593Smuzhiyun 	return 1;
2554*4882a593Smuzhiyun }
2555*4882a593Smuzhiyun 
/*
 * Write something to the socket.  Called in a worker thread when the
 * socket appears to be writeable and we have something ready to send.
 *
 * Called with con->mutex held.  Returns 0 when there is nothing (more)
 * to write, and a negative value on error.
 */
static int try_write(struct ceph_connection *con)
{
	int ret = 1;

	dout("try_write start %p state %lu\n", con, con->state);
	if (con->state != CON_STATE_PREOPEN &&
	    con->state != CON_STATE_CONNECTING &&
	    con->state != CON_STATE_NEGOTIATING &&
	    con->state != CON_STATE_OPEN)
		return 0;

	/* open the socket first? */
	if (con->state == CON_STATE_PREOPEN) {
		BUG_ON(con->sock);
		con->state = CON_STATE_CONNECTING;

		con_out_kvec_reset(con);
		prepare_write_banner(con);
		prepare_read_banner(con);

		BUG_ON(con->in_msg);
		con->in_tag = CEPH_MSGR_TAG_READY;
		dout("try_write initiating connect on %p new state %lu\n",
		     con, con->state);
		ret = ceph_tcp_connect(con);
		if (ret < 0) {
			con->error_msg = "connect error";
			goto out;
		}
	}

more:
	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
	BUG_ON(!con->sock);

	/* kvec data queued? */
	if (con->out_kvec_left) {
		ret = write_partial_kvec(con);
		if (ret <= 0)
			goto out;
	}
	/* out_skip bytes come from a revoked out_msg (see ceph_msg_revoke()) */
	if (con->out_skip) {
		ret = write_partial_skip(con);
		if (ret <= 0)
			goto out;
	}

	/* msg pages? */
	if (con->out_msg) {
		if (con->out_msg_done) {
			ceph_msg_put(con->out_msg);
			con->out_msg = NULL;   /* we're done with this one */
			goto do_next;
		}

		ret = write_partial_message_data(con);
		if (ret == 1)
			goto more;  /* we need to send the footer, too! */
		if (ret == 0)
			goto out;
		if (ret < 0) {
			dout("try_write write_partial_message_data err %d\n",
			     ret);
			goto out;
		}
	}

do_next:
	/* Only an OPEN connection may start a new keepalive/message/ack. */
	if (con->state == CON_STATE_OPEN) {
		if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
			prepare_write_keepalive(con);
			goto more;
		}
		/* is anything else pending? */
		if (!list_empty(&con->out_queue)) {
			prepare_write_message(con);
			goto more;
		}
		/* ack received messages that have not been acked yet */
		if (con->in_seq > con->in_seq_acked) {
			prepare_write_ack(con);
			goto more;
		}
	}

	/* Nothing to do! */
	con_flag_clear(con, CON_FLAG_WRITE_PENDING);
	dout("try_write nothing else to write.\n");
	ret = 0;
out:
	dout("try_write done on %p ret %d\n", con, ret);
	return ret;
}
2652*4882a593Smuzhiyun 
/*
 * Read what we can from the socket.
 *
 * Called with con->mutex held.  Returns 0 when no further progress can
 * be made (wrong state or no more data), a negative value on socket or
 * protocol error.  Loops via "more" as long as complete protocol units
 * keep arriving.
 */
static int try_read(struct ceph_connection *con)
{
	int ret = -1;

more:
	dout("try_read start on %p state %lu\n", con, con->state);
	if (con->state != CON_STATE_CONNECTING &&
	    con->state != CON_STATE_NEGOTIATING &&
	    con->state != CON_STATE_OPEN)
		return 0;

	BUG_ON(!con->sock);

	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
	     con->in_base_pos);

	if (con->state == CON_STATE_CONNECTING) {
		dout("try_read connecting\n");
		ret = read_partial_banner(con);
		if (ret <= 0)
			goto out;
		ret = process_banner(con);
		if (ret < 0)
			goto out;

		con->state = CON_STATE_NEGOTIATING;

		/*
		 * Received banner is good, exchange connection info.
		 * Do not reset out_kvec, as sending our banner raced
		 * with receiving peer banner after connect completed.
		 */
		ret = prepare_write_connect(con);
		if (ret < 0)
			goto out;
		prepare_read_connect(con);

		/* Send connection info before awaiting response */
		goto out;
	}

	if (con->state == CON_STATE_NEGOTIATING) {
		dout("try_read negotiating\n");
		ret = read_partial_connect(con);
		if (ret <= 0)
			goto out;
		ret = process_connect(con);
		if (ret < 0)
			goto out;
		goto more;
	}

	WARN_ON(con->state != CON_STATE_OPEN);

	if (con->in_base_pos < 0) {
		/*
		 * skipping + discarding content.
		 *
		 * A negative in_base_pos is set by ceph_msg_revoke_incoming()
		 * and means that many bytes of the current incoming message
		 * must be read and thrown away.
		 */
		ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos);
		if (ret <= 0)
			goto out;
		dout("skipped %d / %d bytes\n", ret, -con->in_base_pos);
		con->in_base_pos += ret;
		if (con->in_base_pos)
			goto more;
	}
	if (con->in_tag == CEPH_MSGR_TAG_READY) {
		/*
		 * what's next?
		 */
		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
		if (ret <= 0)
			goto out;
		dout("try_read got tag %d\n", (int)con->in_tag);
		switch (con->in_tag) {
		case CEPH_MSGR_TAG_MSG:
			prepare_read_message(con);
			break;
		case CEPH_MSGR_TAG_ACK:
			prepare_read_ack(con);
			break;
		case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
			prepare_read_keepalive_ack(con);
			break;
		case CEPH_MSGR_TAG_CLOSE:
			con_close_socket(con);
			con->state = CON_STATE_CLOSED;
			goto out;
		default:
			goto bad_tag;
		}
	}
	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
		ret = read_partial_message(con);
		if (ret <= 0) {
			switch (ret) {
			case -EBADMSG:
				con->error_msg = "bad crc/signature";
				fallthrough;
			case -EBADE:
				ret = -EIO;
				break;
			case -EIO:
				con->error_msg = "io error";
				break;
			}
			goto out;
		}
		/* tag reset to READY means the message was skipped */
		if (con->in_tag == CEPH_MSGR_TAG_READY)
			goto more;
		process_message(con);
		/* state may have changed while con->mutex was dropped */
		if (con->state == CON_STATE_OPEN)
			prepare_read_tag(con);
		goto more;
	}
	if (con->in_tag == CEPH_MSGR_TAG_ACK ||
	    con->in_tag == CEPH_MSGR_TAG_SEQ) {
		/*
		 * the final handshake seq exchange is semantically
		 * equivalent to an ACK
		 */
		ret = read_partial_ack(con);
		if (ret <= 0)
			goto out;
		process_ack(con);
		goto more;
	}
	if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
		ret = read_keepalive_ack(con);
		if (ret <= 0)
			goto out;
		goto more;
	}

out:
	dout("try_read done on %p ret %d\n", con, ret);
	return ret;

bad_tag:
	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
	con->error_msg = "protocol error, garbage tag";
	ret = -1;
	goto out;
}
2800*4882a593Smuzhiyun 
2801*4882a593Smuzhiyun 
2802*4882a593Smuzhiyun /*
2803*4882a593Smuzhiyun  * Atomically queue work on a connection after the specified delay.
2804*4882a593Smuzhiyun  * Bump @con reference to avoid races with connection teardown.
2805*4882a593Smuzhiyun  * Returns 0 if work was queued, or an error code otherwise.
2806*4882a593Smuzhiyun  */
queue_con_delay(struct ceph_connection * con,unsigned long delay)2807*4882a593Smuzhiyun static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
2808*4882a593Smuzhiyun {
2809*4882a593Smuzhiyun 	if (!con->ops->get(con)) {
2810*4882a593Smuzhiyun 		dout("%s %p ref count 0\n", __func__, con);
2811*4882a593Smuzhiyun 		return -ENOENT;
2812*4882a593Smuzhiyun 	}
2813*4882a593Smuzhiyun 
2814*4882a593Smuzhiyun 	dout("%s %p %lu\n", __func__, con, delay);
2815*4882a593Smuzhiyun 	if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
2816*4882a593Smuzhiyun 		dout("%s %p - already queued\n", __func__, con);
2817*4882a593Smuzhiyun 		con->ops->put(con);
2818*4882a593Smuzhiyun 		return -EBUSY;
2819*4882a593Smuzhiyun 	}
2820*4882a593Smuzhiyun 
2821*4882a593Smuzhiyun 	return 0;
2822*4882a593Smuzhiyun }
2823*4882a593Smuzhiyun 
/*
 * Queue work on a connection for immediate execution.  The return value
 * is deliberately ignored: -EBUSY (already queued) and -ENOENT (being
 * torn down) from queue_con_delay() both mean there is nothing to do.
 */
static void queue_con(struct ceph_connection *con)
{
	(void) queue_con_delay(con, 0);
}
2828*4882a593Smuzhiyun 
cancel_con(struct ceph_connection * con)2829*4882a593Smuzhiyun static void cancel_con(struct ceph_connection *con)
2830*4882a593Smuzhiyun {
2831*4882a593Smuzhiyun 	if (cancel_delayed_work(&con->work)) {
2832*4882a593Smuzhiyun 		dout("%s %p\n", __func__, con);
2833*4882a593Smuzhiyun 		con->ops->put(con);
2834*4882a593Smuzhiyun 	}
2835*4882a593Smuzhiyun }
2836*4882a593Smuzhiyun 
con_sock_closed(struct ceph_connection * con)2837*4882a593Smuzhiyun static bool con_sock_closed(struct ceph_connection *con)
2838*4882a593Smuzhiyun {
2839*4882a593Smuzhiyun 	if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
2840*4882a593Smuzhiyun 		return false;
2841*4882a593Smuzhiyun 
2842*4882a593Smuzhiyun #define CASE(x)								\
2843*4882a593Smuzhiyun 	case CON_STATE_ ## x:						\
2844*4882a593Smuzhiyun 		con->error_msg = "socket closed (con state " #x ")";	\
2845*4882a593Smuzhiyun 		break;
2846*4882a593Smuzhiyun 
2847*4882a593Smuzhiyun 	switch (con->state) {
2848*4882a593Smuzhiyun 	CASE(CLOSED);
2849*4882a593Smuzhiyun 	CASE(PREOPEN);
2850*4882a593Smuzhiyun 	CASE(CONNECTING);
2851*4882a593Smuzhiyun 	CASE(NEGOTIATING);
2852*4882a593Smuzhiyun 	CASE(OPEN);
2853*4882a593Smuzhiyun 	CASE(STANDBY);
2854*4882a593Smuzhiyun 	default:
2855*4882a593Smuzhiyun 		pr_warn("%s con %p unrecognized state %lu\n",
2856*4882a593Smuzhiyun 			__func__, con, con->state);
2857*4882a593Smuzhiyun 		con->error_msg = "unrecognized con state";
2858*4882a593Smuzhiyun 		BUG();
2859*4882a593Smuzhiyun 		break;
2860*4882a593Smuzhiyun 	}
2861*4882a593Smuzhiyun #undef CASE
2862*4882a593Smuzhiyun 
2863*4882a593Smuzhiyun 	return true;
2864*4882a593Smuzhiyun }
2865*4882a593Smuzhiyun 
con_backoff(struct ceph_connection * con)2866*4882a593Smuzhiyun static bool con_backoff(struct ceph_connection *con)
2867*4882a593Smuzhiyun {
2868*4882a593Smuzhiyun 	int ret;
2869*4882a593Smuzhiyun 
2870*4882a593Smuzhiyun 	if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
2871*4882a593Smuzhiyun 		return false;
2872*4882a593Smuzhiyun 
2873*4882a593Smuzhiyun 	ret = queue_con_delay(con, round_jiffies_relative(con->delay));
2874*4882a593Smuzhiyun 	if (ret) {
2875*4882a593Smuzhiyun 		dout("%s: con %p FAILED to back off %lu\n", __func__,
2876*4882a593Smuzhiyun 			con, con->delay);
2877*4882a593Smuzhiyun 		BUG_ON(ret == -ENOENT);
2878*4882a593Smuzhiyun 		con_flag_set(con, CON_FLAG_BACKOFF);
2879*4882a593Smuzhiyun 	}
2880*4882a593Smuzhiyun 
2881*4882a593Smuzhiyun 	return true;
2882*4882a593Smuzhiyun }
2883*4882a593Smuzhiyun 
/* Finish fault handling; con->mutex must *not* be held here */

static void con_fault_finish(struct ceph_connection *con)
{
	dout("%s %p\n", __func__, con);

	/*
	 * in case we faulted due to authentication, invalidate our
	 * current tickets so that we can get new ones.
	 */
	if (con->auth_retry) {
		dout("auth_retry %d, invalidating\n", con->auth_retry);
		if (con->ops->invalidate_authorizer)
			con->ops->invalidate_authorizer(con);
		con->auth_retry = 0;
	}

	/* let the upper layer (mon/osd/mds client) react to the fault */
	if (con->ops->fault)
		con->ops->fault(con);
}
2904*4882a593Smuzhiyun 
/*
 * Do some work on a connection.  Drop a connection ref when we're done.
 *
 * This is the delayed-work handler queued by queue_con_delay(); it owns
 * the reference taken there and puts it before returning.
 */
static void ceph_con_workfn(struct work_struct *work)
{
	struct ceph_connection *con = container_of(work, struct ceph_connection,
						   work.work);
	bool fault;

	mutex_lock(&con->mutex);
	while (true) {
		int ret;

		/*
		 * fault is assigned on every iteration here, so it is
		 * initialized on all break paths below.
		 */
		if ((fault = con_sock_closed(con))) {
			dout("%s: con %p SOCK_CLOSED\n", __func__, con);
			break;
		}
		if (con_backoff(con)) {
			dout("%s: con %p BACKOFF\n", __func__, con);
			break;
		}
		if (con->state == CON_STATE_STANDBY) {
			dout("%s: con %p STANDBY\n", __func__, con);
			break;
		}
		if (con->state == CON_STATE_CLOSED) {
			dout("%s: con %p CLOSED\n", __func__, con);
			BUG_ON(con->sock);
			break;
		}
		if (con->state == CON_STATE_PREOPEN) {
			dout("%s: con %p PREOPEN\n", __func__, con);
			BUG_ON(con->sock);
		}

		/* -EAGAIN from either helper restarts the loop */
		ret = try_read(con);
		if (ret < 0) {
			if (ret == -EAGAIN)
				continue;
			if (!con->error_msg)
				con->error_msg = "socket error on read";
			fault = true;
			break;
		}

		ret = try_write(con);
		if (ret < 0) {
			if (ret == -EAGAIN)
				continue;
			if (!con->error_msg)
				con->error_msg = "socket error on write";
			fault = true;
		}

		break;	/* If we make it to here, we're done */
	}
	if (fault)
		con_fault(con);
	mutex_unlock(&con->mutex);

	/* con_fault_finish() must run without con->mutex held */
	if (fault)
		con_fault_finish(con);

	con->ops->put(con);
}
2970*4882a593Smuzhiyun 
/*
 * Generic error/fault handler.  A retry mechanism is used with
 * exponential backoff
 *
 * Called with con->mutex held.  Closes the socket, drops any in-flight
 * messages, then either marks the connection CLOSED (lossy channel),
 * parks it in STANDBY (nothing queued), or schedules a reconnect with
 * exponential backoff.
 */
static void con_fault(struct ceph_connection *con)
{
	dout("fault %p state %lu to peer %s\n",
	     con, con->state, ceph_pr_addr(&con->peer_addr));

	pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
		ceph_pr_addr(&con->peer_addr), con->error_msg);
	con->error_msg = NULL;

	WARN_ON(con->state != CON_STATE_CONNECTING &&
	       con->state != CON_STATE_NEGOTIATING &&
	       con->state != CON_STATE_OPEN);

	con_close_socket(con);

	/* lossy channels are never retried; the peer forgets us on close */
	if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
		dout("fault on LOSSYTX channel, marking CLOSED\n");
		con->state = CON_STATE_CLOSED;
		return;
	}

	/* drop refs to partially received/sent messages */
	if (con->in_msg) {
		BUG_ON(con->in_msg->con != con);
		ceph_msg_put(con->in_msg);
		con->in_msg = NULL;
	}
	if (con->out_msg) {
		BUG_ON(con->out_msg->con != con);
		ceph_msg_put(con->out_msg);
		con->out_msg = NULL;
	}

	/* Requeue anything that hasn't been acked */
	list_splice_init(&con->out_sent, &con->out_queue);

	/* If there are no messages queued or keepalive pending, place
	 * the connection in a STANDBY state */
	if (list_empty(&con->out_queue) &&
	    !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
		con_flag_clear(con, CON_FLAG_WRITE_PENDING);
		con->state = CON_STATE_STANDBY;
	} else {
		/* retry after a delay. */
		con->state = CON_STATE_PREOPEN;
		if (con->delay == 0)
			con->delay = BASE_DELAY_INTERVAL;
		else if (con->delay < MAX_DELAY_INTERVAL)
			con->delay *= 2;	/* exponential backoff, capped */
		con_flag_set(con, CON_FLAG_BACKOFF);
		queue_con(con);
	}
}
3028*4882a593Smuzhiyun 
3029*4882a593Smuzhiyun 
ceph_messenger_reset_nonce(struct ceph_messenger * msgr)3030*4882a593Smuzhiyun void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
3031*4882a593Smuzhiyun {
3032*4882a593Smuzhiyun 	u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
3033*4882a593Smuzhiyun 	msgr->inst.addr.nonce = cpu_to_le32(nonce);
3034*4882a593Smuzhiyun 	encode_my_addr(msgr);
3035*4882a593Smuzhiyun }
3036*4882a593Smuzhiyun 
/*
 * initialize a new messenger instance
 *
 * @msgr: messenger to initialize
 * @myaddr: optional local address to advertise; may be NULL
 *
 * Takes a reference on the caller's network namespace; released by
 * ceph_messenger_fini().
 */
void ceph_messenger_init(struct ceph_messenger *msgr,
			 struct ceph_entity_addr *myaddr)
{
	spin_lock_init(&msgr->global_seq_lock);

	/* copy the caller's address first; the nonce below overrides it */
	if (myaddr)
		msgr->inst.addr = *myaddr;

	/* select a random nonce */
	msgr->inst.addr.type = 0;
	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
	encode_my_addr(msgr);

	atomic_set(&msgr->stopping, 0);
	write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));

	dout("%s %p\n", __func__, msgr);
}
EXPORT_SYMBOL(ceph_messenger_init);
3059*4882a593Smuzhiyun 
/*
 * Tear down a messenger: drop the netns reference taken by
 * ceph_messenger_init().
 */
void ceph_messenger_fini(struct ceph_messenger *msgr)
{
	put_net(read_pnet(&msgr->net));
}
EXPORT_SYMBOL(ceph_messenger_fini);
3065*4882a593Smuzhiyun 
msg_con_set(struct ceph_msg * msg,struct ceph_connection * con)3066*4882a593Smuzhiyun static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
3067*4882a593Smuzhiyun {
3068*4882a593Smuzhiyun 	if (msg->con)
3069*4882a593Smuzhiyun 		msg->con->ops->put(msg->con);
3070*4882a593Smuzhiyun 
3071*4882a593Smuzhiyun 	msg->con = con ? con->ops->get(con) : NULL;
3072*4882a593Smuzhiyun 	BUG_ON(msg->con != con);
3073*4882a593Smuzhiyun }
3074*4882a593Smuzhiyun 
clear_standby(struct ceph_connection * con)3075*4882a593Smuzhiyun static void clear_standby(struct ceph_connection *con)
3076*4882a593Smuzhiyun {
3077*4882a593Smuzhiyun 	/* come back from STANDBY? */
3078*4882a593Smuzhiyun 	if (con->state == CON_STATE_STANDBY) {
3079*4882a593Smuzhiyun 		dout("clear_standby %p and ++connect_seq\n", con);
3080*4882a593Smuzhiyun 		con->state = CON_STATE_PREOPEN;
3081*4882a593Smuzhiyun 		con->connect_seq++;
3082*4882a593Smuzhiyun 		WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
3083*4882a593Smuzhiyun 		WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
3084*4882a593Smuzhiyun 	}
3085*4882a593Smuzhiyun }
3086*4882a593Smuzhiyun 
/*
 * Queue up an outgoing message on the given connection.
 *
 * Consumes the caller's message reference: the message is either added
 * to con->out_queue or put immediately if the connection is CLOSED.
 */
void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
{
	/* set src+dst */
	msg->hdr.src = con->msgr->inst.name;
	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
	msg->needs_out_seq = true;

	mutex_lock(&con->mutex);

	if (con->state == CON_STATE_CLOSED) {
		dout("con_send %p closed, dropping %p\n", con, msg);
		ceph_msg_put(msg);
		mutex_unlock(&con->mutex);
		return;
	}

	/* associate msg with this connection (takes a con ref) */
	msg_con_set(msg, con);

	BUG_ON(!list_empty(&msg->list_head));
	list_add_tail(&msg->list_head, &con->out_queue);
	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
	     le32_to_cpu(msg->hdr.front_len),
	     le32_to_cpu(msg->hdr.middle_len),
	     le32_to_cpu(msg->hdr.data_len));

	clear_standby(con);
	mutex_unlock(&con->mutex);

	/* if there wasn't anything waiting to send before, queue
	 * new work */
	if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
		queue_con(con);
}
EXPORT_SYMBOL(ceph_con_send);
3126*4882a593Smuzhiyun 
/*
 * Revoke a message that was previously queued for send
 *
 * If the message is still on out_queue it is simply unlinked and put.
 * If it is the message currently being written (con->out_msg), the
 * remaining queued bytes are converted into con->out_skip so that
 * try_write() pads the wire with zeros instead of sending them.
 */
void ceph_msg_revoke(struct ceph_msg *msg)
{
	struct ceph_connection *con = msg->con;

	if (!con) {
		dout("%s msg %p null con\n", __func__, msg);
		return;		/* Message not in our possession */
	}

	mutex_lock(&con->mutex);
	if (!list_empty(&msg->list_head)) {
		dout("%s %p msg %p - was on queue\n", __func__, con, msg);
		list_del_init(&msg->list_head);
		msg->hdr.seq = 0;

		ceph_msg_put(msg);
	}
	if (con->out_msg == msg) {
		BUG_ON(con->out_skip);
		/* footer */
		if (con->out_msg_done) {
			/* footer is still queued in the out kvecs */
			con->out_skip += con_out_kvec_skip(con);
		} else {
			/* mid-data: footer was never queued, skip its size */
			BUG_ON(!msg->data_length);
			con->out_skip += sizeof_footer(con);
		}
		/* data, middle, front */
		if (msg->data_length)
			con->out_skip += msg->cursor.total_resid;
		if (msg->middle)
			con->out_skip += con_out_kvec_skip(con);
		con->out_skip += con_out_kvec_skip(con);

		dout("%s %p msg %p - was sending, will write %d skip %d\n",
		     __func__, con, msg, con->out_kvec_bytes, con->out_skip);
		msg->hdr.seq = 0;
		con->out_msg = NULL;
		ceph_msg_put(msg);
	}

	mutex_unlock(&con->mutex);
}
3172*4882a593Smuzhiyun 
/*
 * Revoke a message that we may be reading data into
 *
 * Drops con->in_msg and makes con->in_base_pos negative by the total
 * on-wire size of the message; try_read() interprets a negative
 * in_base_pos as "discard this many bytes from the socket".
 */
void ceph_msg_revoke_incoming(struct ceph_msg *msg)
{
	struct ceph_connection *con = msg->con;

	if (!con) {
		dout("%s msg %p null con\n", __func__, msg);
		return;		/* Message not in our possession */
	}

	mutex_lock(&con->mutex);
	if (con->in_msg == msg) {
		unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
		unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
		unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);

		/* skip rest of message */
		dout("%s %p msg %p revoked\n", __func__, con, msg);
		con->in_base_pos = con->in_base_pos -
				sizeof(struct ceph_msg_header) -
				front_len -
				middle_len -
				data_len -
				sizeof(struct ceph_msg_footer);
		ceph_msg_put(con->in_msg);
		con->in_msg = NULL;
		con->in_tag = CEPH_MSGR_TAG_READY;
		con->in_seq++;	/* count it so the skipped msg still gets acked */
	} else {
		dout("%s %p in_msg %p msg %p no-op\n",
		     __func__, con, con->in_msg, msg);
	}
	mutex_unlock(&con->mutex);
}
3209*4882a593Smuzhiyun 
/*
 * Queue a keepalive byte to ensure the tcp connection is alive.
 *
 * Sets CON_FLAG_KEEPALIVE_PENDING under the connection mutex (also
 * kicking the connection out of STANDBY if needed), then schedules the
 * workqueue item - but only if WRITE_PENDING was not already set, to
 * avoid queueing the con work redundantly.
 */
void ceph_con_keepalive(struct ceph_connection *con)
{
	dout("con_keepalive %p\n", con);
	mutex_lock(&con->mutex);
	clear_standby(con);
	con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING);
	mutex_unlock(&con->mutex);

	/* test-and-set returns the old value; 0 means we are first */
	if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
		queue_con(con);
}
EXPORT_SYMBOL(ceph_con_keepalive);
3225*4882a593Smuzhiyun 
ceph_con_keepalive_expired(struct ceph_connection * con,unsigned long interval)3226*4882a593Smuzhiyun bool ceph_con_keepalive_expired(struct ceph_connection *con,
3227*4882a593Smuzhiyun 			       unsigned long interval)
3228*4882a593Smuzhiyun {
3229*4882a593Smuzhiyun 	if (interval > 0 &&
3230*4882a593Smuzhiyun 	    (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
3231*4882a593Smuzhiyun 		struct timespec64 now;
3232*4882a593Smuzhiyun 		struct timespec64 ts;
3233*4882a593Smuzhiyun 		ktime_get_real_ts64(&now);
3234*4882a593Smuzhiyun 		jiffies_to_timespec64(interval, &ts);
3235*4882a593Smuzhiyun 		ts = timespec64_add(con->last_keepalive_ack, ts);
3236*4882a593Smuzhiyun 		return timespec64_compare(&now, &ts) >= 0;
3237*4882a593Smuzhiyun 	}
3238*4882a593Smuzhiyun 	return false;
3239*4882a593Smuzhiyun }
3240*4882a593Smuzhiyun 
ceph_msg_data_add(struct ceph_msg * msg)3241*4882a593Smuzhiyun static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg)
3242*4882a593Smuzhiyun {
3243*4882a593Smuzhiyun 	BUG_ON(msg->num_data_items >= msg->max_data_items);
3244*4882a593Smuzhiyun 	return &msg->data[msg->num_data_items++];
3245*4882a593Smuzhiyun }
3246*4882a593Smuzhiyun 
ceph_msg_data_destroy(struct ceph_msg_data * data)3247*4882a593Smuzhiyun static void ceph_msg_data_destroy(struct ceph_msg_data *data)
3248*4882a593Smuzhiyun {
3249*4882a593Smuzhiyun 	if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) {
3250*4882a593Smuzhiyun 		int num_pages = calc_pages_for(data->alignment, data->length);
3251*4882a593Smuzhiyun 		ceph_release_page_vector(data->pages, num_pages);
3252*4882a593Smuzhiyun 	} else if (data->type == CEPH_MSG_DATA_PAGELIST) {
3253*4882a593Smuzhiyun 		ceph_pagelist_release(data->pagelist);
3254*4882a593Smuzhiyun 	}
3255*4882a593Smuzhiyun }
3256*4882a593Smuzhiyun 
/*
 * Attach a page vector as a data item of @msg.  Only the sub-page part
 * of @alignment is kept (offset of the data within the first page).
 * With @own_pages set, the pages are freed when the message is.
 */
void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
			     size_t length, size_t alignment, bool own_pages)
{
	struct ceph_msg_data *item;

	BUG_ON(!pages);
	BUG_ON(!length);

	item = ceph_msg_data_add(msg);
	item->type = CEPH_MSG_DATA_PAGES;
	item->length = length;
	item->alignment = alignment & ~PAGE_MASK;
	item->pages = pages;
	item->own_pages = own_pages;

	msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_pages);
3275*4882a593Smuzhiyun 
ceph_msg_data_add_pagelist(struct ceph_msg * msg,struct ceph_pagelist * pagelist)3276*4882a593Smuzhiyun void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
3277*4882a593Smuzhiyun 				struct ceph_pagelist *pagelist)
3278*4882a593Smuzhiyun {
3279*4882a593Smuzhiyun 	struct ceph_msg_data *data;
3280*4882a593Smuzhiyun 
3281*4882a593Smuzhiyun 	BUG_ON(!pagelist);
3282*4882a593Smuzhiyun 	BUG_ON(!pagelist->length);
3283*4882a593Smuzhiyun 
3284*4882a593Smuzhiyun 	data = ceph_msg_data_add(msg);
3285*4882a593Smuzhiyun 	data->type = CEPH_MSG_DATA_PAGELIST;
3286*4882a593Smuzhiyun 	refcount_inc(&pagelist->refcnt);
3287*4882a593Smuzhiyun 	data->pagelist = pagelist;
3288*4882a593Smuzhiyun 
3289*4882a593Smuzhiyun 	msg->data_length += pagelist->length;
3290*4882a593Smuzhiyun }
3291*4882a593Smuzhiyun EXPORT_SYMBOL(ceph_msg_data_add_pagelist);
3292*4882a593Smuzhiyun 
#ifdef	CONFIG_BLOCK
/* Attach @length bytes starting at bio position @bio_pos to @msg. */
void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
			   u32 length)
{
	struct ceph_msg_data *item = ceph_msg_data_add(msg);

	item->type = CEPH_MSG_DATA_BIO;
	item->bio_pos = *bio_pos;	/* copy of the iterator, not a ref */
	item->bio_length = length;

	msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_bio);
#endif	/* CONFIG_BLOCK */
3308*4882a593Smuzhiyun 
ceph_msg_data_add_bvecs(struct ceph_msg * msg,struct ceph_bvec_iter * bvec_pos)3309*4882a593Smuzhiyun void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
3310*4882a593Smuzhiyun 			     struct ceph_bvec_iter *bvec_pos)
3311*4882a593Smuzhiyun {
3312*4882a593Smuzhiyun 	struct ceph_msg_data *data;
3313*4882a593Smuzhiyun 
3314*4882a593Smuzhiyun 	data = ceph_msg_data_add(msg);
3315*4882a593Smuzhiyun 	data->type = CEPH_MSG_DATA_BVECS;
3316*4882a593Smuzhiyun 	data->bvec_pos = *bvec_pos;
3317*4882a593Smuzhiyun 
3318*4882a593Smuzhiyun 	msg->data_length += bvec_pos->iter.bi_size;
3319*4882a593Smuzhiyun }
3320*4882a593Smuzhiyun EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
3321*4882a593Smuzhiyun 
/*
 * construct a new message with given type, size
 * the new msg has a ref count of 1.
 *
 * @front_len bytes are allocated for the front section; @max_data_items
 * sizes the data item array (0 for none).  With @can_fail, allocation
 * failure is quiet (dout only); otherwise it is reported loudly with a
 * WARN.  Returns NULL on failure either way.
 */
struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
			       gfp_t flags, bool can_fail)
{
	struct ceph_msg *m;

	/* zalloc: all fields (middle, pool, con, ...) start out NULL/0 */
	m = kmem_cache_zalloc(ceph_msg_cache, flags);
	if (m == NULL)
		goto out;

	m->hdr.type = cpu_to_le16(type);
	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
	m->hdr.front_len = cpu_to_le32(front_len);

	/*
	 * list_head and kref must be initialized before any failure path
	 * below: the out2 path tears down via ceph_msg_put(), which
	 * drops this initial reference and expects a valid list_head.
	 */
	INIT_LIST_HEAD(&m->list_head);
	kref_init(&m->kref);

	/* front */
	if (front_len) {
		/* kvmalloc: may fall back to vmalloc for large fronts */
		m->front.iov_base = ceph_kvmalloc(front_len, flags);
		if (m->front.iov_base == NULL) {
			dout("ceph_msg_new can't allocate %d bytes\n",
			     front_len);
			goto out2;
		}
	} else {
		m->front.iov_base = NULL;
	}
	m->front_alloc_len = m->front.iov_len = front_len;

	if (max_data_items) {
		m->data = kmalloc_array(max_data_items, sizeof(*m->data),
					flags);
		if (!m->data)
			goto out2;

		m->max_data_items = max_data_items;
	}

	dout("ceph_msg_new %p front %d\n", m, front_len);
	return m;

out2:
	ceph_msg_put(m);	/* frees front and m itself via release */
out:
	if (!can_fail) {
		pr_err("msg_new can't create type %d front %d\n", type,
		       front_len);
		WARN_ON(1);
	} else {
		dout("msg_new can't create type %d front %d\n", type,
		     front_len);
	}
	return NULL;
}
EXPORT_SYMBOL(ceph_msg_new2);
3381*4882a593Smuzhiyun 
/*
 * Convenience wrapper around ceph_msg_new2() for messages that carry
 * no data items (max_data_items == 0).
 */
struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
			      bool can_fail)
{
	return ceph_msg_new2(type, front_len, 0, flags, can_fail);
}
EXPORT_SYMBOL(ceph_msg_new);
3388*4882a593Smuzhiyun 
3389*4882a593Smuzhiyun /*
3390*4882a593Smuzhiyun  * Allocate "middle" portion of a message, if it is needed and wasn't
3391*4882a593Smuzhiyun  * allocated by alloc_msg.  This allows us to read a small fixed-size
3392*4882a593Smuzhiyun  * per-type header in the front and then gracefully fail (i.e.,
3393*4882a593Smuzhiyun  * propagate the error to the caller based on info in the front) when
3394*4882a593Smuzhiyun  * the middle is too large.
3395*4882a593Smuzhiyun  */
ceph_alloc_middle(struct ceph_connection * con,struct ceph_msg * msg)3396*4882a593Smuzhiyun static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
3397*4882a593Smuzhiyun {
3398*4882a593Smuzhiyun 	int type = le16_to_cpu(msg->hdr.type);
3399*4882a593Smuzhiyun 	int middle_len = le32_to_cpu(msg->hdr.middle_len);
3400*4882a593Smuzhiyun 
3401*4882a593Smuzhiyun 	dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
3402*4882a593Smuzhiyun 	     ceph_msg_type_name(type), middle_len);
3403*4882a593Smuzhiyun 	BUG_ON(!middle_len);
3404*4882a593Smuzhiyun 	BUG_ON(msg->middle);
3405*4882a593Smuzhiyun 
3406*4882a593Smuzhiyun 	msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
3407*4882a593Smuzhiyun 	if (!msg->middle)
3408*4882a593Smuzhiyun 		return -ENOMEM;
3409*4882a593Smuzhiyun 	return 0;
3410*4882a593Smuzhiyun }
3411*4882a593Smuzhiyun 
/*
 * Allocate a message for receiving an incoming message on a
 * connection, and save the result in con->in_msg.  Uses the
 * connection's private alloc_msg op if available.
 *
 * Returns 0 on success, or a negative error code.
 *
 * On success, if we set *skip = 1:
 *  - the next message should be skipped and ignored.
 *  - con->in_msg == NULL
 * or if we set *skip = 0:
 *  - con->in_msg is non-null.
 * On error (ENOMEM, EAGAIN, ...),
 *  - con->in_msg == NULL
 */
static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
{
	struct ceph_msg_header *hdr = &con->in_hdr;
	int middle_len = le32_to_cpu(hdr->middle_len);
	struct ceph_msg *msg;
	int ret = 0;

	BUG_ON(con->in_msg != NULL);
	BUG_ON(!con->ops->alloc_msg);

	/*
	 * Drop the mutex around the callback - alloc_msg may sleep or
	 * take other locks (NOTE(review): exact reason not visible here;
	 * presumably lock-ordering with the upper layer - confirm).
	 */
	mutex_unlock(&con->mutex);
	msg = con->ops->alloc_msg(con, hdr, skip);
	mutex_lock(&con->mutex);

	/* connection state may have changed while the mutex was dropped */
	if (con->state != CON_STATE_OPEN) {
		if (msg)
			ceph_msg_put(msg);
		return -EAGAIN;
	}
	if (msg) {
		BUG_ON(*skip);	/* can't both supply a msg and ask to skip */
		msg_con_set(msg, con);
		con->in_msg = msg;
	} else {
		/*
		 * Null message pointer means either we should skip
		 * this message or we couldn't allocate memory.  The
		 * former is not an error.
		 */
		if (*skip)
			return 0;

		con->error_msg = "error allocating memory for incoming message";
		return -ENOMEM;
	}
	/* stamp the received header into the message */
	memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));

	/* alloc_msg may have provided a middle already; if not, do it here */
	if (middle_len && !con->in_msg->middle) {
		ret = ceph_alloc_middle(con, con->in_msg);
		if (ret < 0) {
			ceph_msg_put(con->in_msg);
			con->in_msg = NULL;
		}
	}

	return ret;
}
3473*4882a593Smuzhiyun 
3474*4882a593Smuzhiyun 
3475*4882a593Smuzhiyun /*
3476*4882a593Smuzhiyun  * Free a generically kmalloc'd message.
3477*4882a593Smuzhiyun  */
ceph_msg_free(struct ceph_msg * m)3478*4882a593Smuzhiyun static void ceph_msg_free(struct ceph_msg *m)
3479*4882a593Smuzhiyun {
3480*4882a593Smuzhiyun 	dout("%s %p\n", __func__, m);
3481*4882a593Smuzhiyun 	kvfree(m->front.iov_base);
3482*4882a593Smuzhiyun 	kfree(m->data);
3483*4882a593Smuzhiyun 	kmem_cache_free(ceph_msg_cache, m);
3484*4882a593Smuzhiyun }
3485*4882a593Smuzhiyun 
ceph_msg_release(struct kref * kref)3486*4882a593Smuzhiyun static void ceph_msg_release(struct kref *kref)
3487*4882a593Smuzhiyun {
3488*4882a593Smuzhiyun 	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
3489*4882a593Smuzhiyun 	int i;
3490*4882a593Smuzhiyun 
3491*4882a593Smuzhiyun 	dout("%s %p\n", __func__, m);
3492*4882a593Smuzhiyun 	WARN_ON(!list_empty(&m->list_head));
3493*4882a593Smuzhiyun 
3494*4882a593Smuzhiyun 	msg_con_set(m, NULL);
3495*4882a593Smuzhiyun 
3496*4882a593Smuzhiyun 	/* drop middle, data, if any */
3497*4882a593Smuzhiyun 	if (m->middle) {
3498*4882a593Smuzhiyun 		ceph_buffer_put(m->middle);
3499*4882a593Smuzhiyun 		m->middle = NULL;
3500*4882a593Smuzhiyun 	}
3501*4882a593Smuzhiyun 
3502*4882a593Smuzhiyun 	for (i = 0; i < m->num_data_items; i++)
3503*4882a593Smuzhiyun 		ceph_msg_data_destroy(&m->data[i]);
3504*4882a593Smuzhiyun 
3505*4882a593Smuzhiyun 	if (m->pool)
3506*4882a593Smuzhiyun 		ceph_msgpool_put(m->pool, m);
3507*4882a593Smuzhiyun 	else
3508*4882a593Smuzhiyun 		ceph_msg_free(m);
3509*4882a593Smuzhiyun }
3510*4882a593Smuzhiyun 
ceph_msg_get(struct ceph_msg * msg)3511*4882a593Smuzhiyun struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
3512*4882a593Smuzhiyun {
3513*4882a593Smuzhiyun 	dout("%s %p (was %d)\n", __func__, msg,
3514*4882a593Smuzhiyun 	     kref_read(&msg->kref));
3515*4882a593Smuzhiyun 	kref_get(&msg->kref);
3516*4882a593Smuzhiyun 	return msg;
3517*4882a593Smuzhiyun }
3518*4882a593Smuzhiyun EXPORT_SYMBOL(ceph_msg_get);
3519*4882a593Smuzhiyun 
ceph_msg_put(struct ceph_msg * msg)3520*4882a593Smuzhiyun void ceph_msg_put(struct ceph_msg *msg)
3521*4882a593Smuzhiyun {
3522*4882a593Smuzhiyun 	dout("%s %p (was %d)\n", __func__, msg,
3523*4882a593Smuzhiyun 	     kref_read(&msg->kref));
3524*4882a593Smuzhiyun 	kref_put(&msg->kref, ceph_msg_release);
3525*4882a593Smuzhiyun }
3526*4882a593Smuzhiyun EXPORT_SYMBOL(ceph_msg_put);
3527*4882a593Smuzhiyun 
/*
 * Hex-dump a message to the kernel log for debugging: header, front,
 * middle (if present), and footer, in on-wire order.
 */
void ceph_msg_dump(struct ceph_msg *msg)
{
	pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
		 msg->front_alloc_len, msg->data_length);
	print_hex_dump(KERN_DEBUG, "header: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       &msg->hdr, sizeof(msg->hdr), true);
	print_hex_dump(KERN_DEBUG, " front: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       msg->front.iov_base, msg->front.iov_len, true);
	if (msg->middle)
		print_hex_dump(KERN_DEBUG, "middle: ",
			       DUMP_PREFIX_OFFSET, 16, 1,
			       msg->middle->vec.iov_base,
			       msg->middle->vec.iov_len, true);
	print_hex_dump(KERN_DEBUG, "footer: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       &msg->footer, sizeof(msg->footer), true);
}
EXPORT_SYMBOL(ceph_msg_dump);
3548