1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun /* Multipath TCP
3*4882a593Smuzhiyun *
4*4882a593Smuzhiyun * Copyright (c) 2017 - 2019, Intel Corporation.
5*4882a593Smuzhiyun */
6*4882a593Smuzhiyun
7*4882a593Smuzhiyun #define pr_fmt(fmt) "MPTCP: " fmt
8*4882a593Smuzhiyun
9*4882a593Smuzhiyun #include <linux/kernel.h>
10*4882a593Smuzhiyun #include <linux/module.h>
11*4882a593Smuzhiyun #include <linux/netdevice.h>
12*4882a593Smuzhiyun #include <linux/sched/signal.h>
13*4882a593Smuzhiyun #include <linux/atomic.h>
14*4882a593Smuzhiyun #include <net/sock.h>
15*4882a593Smuzhiyun #include <net/inet_common.h>
16*4882a593Smuzhiyun #include <net/inet_hashtables.h>
17*4882a593Smuzhiyun #include <net/protocol.h>
18*4882a593Smuzhiyun #include <net/tcp.h>
19*4882a593Smuzhiyun #include <net/tcp_states.h>
20*4882a593Smuzhiyun #if IS_ENABLED(CONFIG_MPTCP_IPV6)
21*4882a593Smuzhiyun #include <net/transp_v6.h>
22*4882a593Smuzhiyun #endif
23*4882a593Smuzhiyun #include <net/mptcp.h>
24*4882a593Smuzhiyun #include "protocol.h"
25*4882a593Smuzhiyun #include "mib.h"
26*4882a593Smuzhiyun
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* IPv6 flavour of the MPTCP socket. @msk must stay the first member so
 * that a struct sock pointer can be cast to struct mptcp_sock (see the
 * (struct sock *)msk casts throughout this file); @np holds the
 * IPv6-specific state.
 */
struct mptcp6_sock {
	struct mptcp_sock msk;
	struct ipv6_pinfo np;
};
#endif
33*4882a593Smuzhiyun
/* MPTCP-level per-skb metadata, stored in the skb control buffer once the
 * skb has been moved from a subflow receive queue to the msk-level
 * receive or out-of-order queues.
 */
struct mptcp_skb_cb {
	u64 map_seq;	/* MPTCP sequence number of the first payload byte */
	u64 end_seq;	/* map_seq plus the payload length of this skb */
	u32 offset;	/* payload bytes already consumed from this skb */
};

#define MPTCP_SKB_CB(__skb)	((struct mptcp_skb_cb *)&((__skb)->cb[0]))
41*4882a593Smuzhiyun
42*4882a593Smuzhiyun static struct percpu_counter mptcp_sockets_allocated;
43*4882a593Smuzhiyun
44*4882a593Smuzhiyun /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
45*4882a593Smuzhiyun * completed yet or has failed, return the subflow socket.
46*4882a593Smuzhiyun * Otherwise return NULL.
47*4882a593Smuzhiyun */
__mptcp_nmpc_socket(const struct mptcp_sock * msk)48*4882a593Smuzhiyun static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
49*4882a593Smuzhiyun {
50*4882a593Smuzhiyun if (!msk->subflow || READ_ONCE(msk->can_ack))
51*4882a593Smuzhiyun return NULL;
52*4882a593Smuzhiyun
53*4882a593Smuzhiyun return msk->subflow;
54*4882a593Smuzhiyun }
55*4882a593Smuzhiyun
/* Return true - and switch the socket over to the plain TCP ops - when
 * @sk is really a plain TCP socket, e.g. produced by mptcp_accept() for
 * a peer that did not negotiate MP_CAPABLE.
 */
static bool mptcp_is_tcpsk(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (unlikely(sk->sk_prot == &tcp_prot)) {
		/* we are being invoked after mptcp_accept() has
		 * accepted a non-mp-capable flow: sk is a tcp_sk,
		 * not an mptcp one.
		 *
		 * Hand the socket over to tcp so all further socket ops
		 * bypass mptcp.
		 */
		sock->ops = &inet_stream_ops;
		return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
		/* same as above, for an IPv6 flow */
		sock->ops = &inet6_stream_ops;
		return true;
#endif
	}

	return false;
}
79*4882a593Smuzhiyun
__mptcp_tcp_fallback(struct mptcp_sock * msk)80*4882a593Smuzhiyun static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
81*4882a593Smuzhiyun {
82*4882a593Smuzhiyun sock_owned_by_me((const struct sock *)msk);
83*4882a593Smuzhiyun
84*4882a593Smuzhiyun if (likely(!__mptcp_check_fallback(msk)))
85*4882a593Smuzhiyun return NULL;
86*4882a593Smuzhiyun
87*4882a593Smuzhiyun return msk->first;
88*4882a593Smuzhiyun }
89*4882a593Smuzhiyun
/* Create the initial subflow for @msk and request MP_CAPABLE on it.
 * Returns 0 on success or the mptcp_subflow_create_socket() error code.
 */
static int __mptcp_socket_create(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int err;

	err = mptcp_subflow_create_socket(sk, &ssock);
	if (err)
		return err;

	/* track the new socket both as the handshake subflow and as the
	 * first entry of the connection list
	 */
	msk->first = ssock->sk;
	msk->subflow = ssock;
	subflow = mptcp_subflow_ctx(ssock->sk);
	list_add(&subflow->node, &msk->conn_list);
	subflow->request_mptcp = 1;

	/* accept() will wait on first subflow sk_wq, and we always wake up
	 * via msk->sk_socket
	 */
	RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);

	return 0;
}
114*4882a593Smuzhiyun
/* Drop @skb: account it in @sk's drop counter, then free it */
static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
	sk_drops_add(sk, skb);
	__kfree_skb(skb);
}
120*4882a593Smuzhiyun
mptcp_try_coalesce(struct sock * sk,struct sk_buff * to,struct sk_buff * from)121*4882a593Smuzhiyun static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
122*4882a593Smuzhiyun struct sk_buff *from)
123*4882a593Smuzhiyun {
124*4882a593Smuzhiyun bool fragstolen;
125*4882a593Smuzhiyun int delta;
126*4882a593Smuzhiyun
127*4882a593Smuzhiyun if (MPTCP_SKB_CB(from)->offset ||
128*4882a593Smuzhiyun !skb_try_coalesce(to, from, &fragstolen, &delta))
129*4882a593Smuzhiyun return false;
130*4882a593Smuzhiyun
131*4882a593Smuzhiyun pr_debug("colesced seq %llx into %llx new len %d new end seq %llx",
132*4882a593Smuzhiyun MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
133*4882a593Smuzhiyun to->len, MPTCP_SKB_CB(from)->end_seq);
134*4882a593Smuzhiyun MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
135*4882a593Smuzhiyun kfree_skb_partial(from, fragstolen);
136*4882a593Smuzhiyun atomic_add(delta, &sk->sk_rmem_alloc);
137*4882a593Smuzhiyun sk_mem_charge(sk, delta);
138*4882a593Smuzhiyun return true;
139*4882a593Smuzhiyun }
140*4882a593Smuzhiyun
mptcp_ooo_try_coalesce(struct mptcp_sock * msk,struct sk_buff * to,struct sk_buff * from)141*4882a593Smuzhiyun static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
142*4882a593Smuzhiyun struct sk_buff *from)
143*4882a593Smuzhiyun {
144*4882a593Smuzhiyun if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
145*4882a593Smuzhiyun return false;
146*4882a593Smuzhiyun
147*4882a593Smuzhiyun return mptcp_try_coalesce((struct sock *)msk, to, from);
148*4882a593Smuzhiyun }
149*4882a593Smuzhiyun
/* "inspired" by tcp_data_queue_ofo(), main differences:
 * - use mptcp seqs
 * - don't cope with sacks
 *
 * Queue @skb, whose mapping starts beyond msk->ack_seq, on the msk
 * out-of-order rbtree, coalescing/deduplicating against existing
 * entries. On out-of-window or fully-duplicate data the skb is dropped.
 */
static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
	struct sock *sk = (struct sock *)msk;
	struct rb_node **p, *parent;
	u64 seq, end_seq, max_seq;
	struct sk_buff *skb1;
	int space;

	seq = MPTCP_SKB_CB(skb)->map_seq;
	end_seq = MPTCP_SKB_CB(skb)->end_seq;
	/* accept only data starting within the current receive space
	 * past ack_seq; tcp_space() can be negative when over limit
	 */
	space = tcp_space(sk);
	max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq;

	pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
		 RB_EMPTY_ROOT(&msk->out_of_order_queue));
	if (after64(seq, max_seq)) {
		/* out of window */
		mptcp_drop(sk, skb);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
		return;
	}

	p = &msk->out_of_order_queue.rb_node;
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
	if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
		/* first ooo skb: becomes both the tree root and the
		 * cached tail
		 */
		rb_link_node(&skb->rbnode, NULL, p);
		rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
		msk->ooo_last_skb = skb;
		goto end;
	}

	/* with 2 subflows, adding at end of ooo queue is quite likely
	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
	 */
	if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
		return;
	}

	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
	if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
		parent = &msk->ooo_last_skb->rbnode;
		p = &parent->rb_right;
		goto insert;
	}

	/* Find place to insert this segment. Handle overlaps on the way. */
	parent = NULL;
	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
			p = &parent->rb_left;
			continue;
		}
		if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
			if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
				/* All the bits are present. Drop. */
				mptcp_drop(sk, skb);
				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
				return;
			}
			if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
				/* partial overlap:
				 *     |      skb      |
				 *  |     skb1     |
				 * continue traversing
				 */
			} else {
				/* skb's seq == skb1's seq and skb covers skb1.
				 * Replace skb1 with skb.
				 */
				rb_replace_node(&skb1->rbnode, &skb->rbnode,
						&msk->out_of_order_queue);
				mptcp_drop(sk, skb1);
				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
				goto merge_right;
			}
		} else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
			return;
		}
		p = &parent->rb_right;
	}

insert:
	/* Insert segment into RB tree. */
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);

merge_right:
	/* Remove other segments covered by skb. */
	while ((skb1 = skb_rb_next(skb)) != NULL) {
		if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
			break;
		rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
		mptcp_drop(sk, skb1);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
	}
	/* If there is no skb after us, we are the last_skb ! */
	if (!skb1)
		msk->ooo_last_skb = skb;

end:
	skb_condense(skb);
	skb_set_owner_r(skb, sk);
}
263*4882a593Smuzhiyun
/* Move @skb from @ssk's receive queue to the msk level: either append it
 * (possibly coalesced) to the msk receive queue when in-sequence, queue
 * it out-of-order, or drop it when stale or when no memory can be
 * accounted. Returns true only when in-sequence data was made available
 * to the msk reader.
 */
static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
			     struct sk_buff *skb, unsigned int offset,
			     size_t copy_len)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *tail;

	__skb_unlink(skb, &ssk->sk_receive_queue);

	skb_ext_reset(skb);
	skb_orphan(skb);

	/* try to fetch required memory from subflow */
	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT;

		if (ssk->sk_forward_alloc < amount)
			goto drop;

		/* transfer the forward allocation from the subflow to
		 * the msk socket
		 */
		ssk->sk_forward_alloc -= amount;
		sk->sk_forward_alloc += amount;
	}

	/* the skb map_seq accounts for the skb offset:
	 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
	 * value
	 */
	MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
	MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
	MPTCP_SKB_CB(skb)->offset = offset;

	if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
		/* in sequence */
		WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail && mptcp_try_coalesce(sk, tail, skb))
			return true;

		skb_set_owner_r(skb, sk);
		__skb_queue_tail(&sk->sk_receive_queue, skb);
		return true;
	} else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
		mptcp_data_queue_ofo(msk, skb);
		return false;
	}

	/* old data, keep it simple and drop the whole pkt, sender
	 * will retransmit as needed, if needed.
	 */
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
drop:
	mptcp_drop(sk, skb);
	return false;
}
319*4882a593Smuzhiyun
mptcp_stop_timer(struct sock * sk)320*4882a593Smuzhiyun static void mptcp_stop_timer(struct sock *sk)
321*4882a593Smuzhiyun {
322*4882a593Smuzhiyun struct inet_connection_sock *icsk = inet_csk(sk);
323*4882a593Smuzhiyun
324*4882a593Smuzhiyun sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
325*4882a593Smuzhiyun mptcp_sk(sk)->timer_ival = 0;
326*4882a593Smuzhiyun }
327*4882a593Smuzhiyun
/* Progress the msk state machine once the peer has acked all sent data,
 * including our DATA_FIN: FIN_WAIT1 -> FIN_WAIT2, CLOSING/LAST_ACK ->
 * CLOSE. No-op on fallback sockets.
 */
static void mptcp_check_data_fin_ack(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (__mptcp_check_fallback(msk))
		return;

	/* Look for an acknowledged DATA_FIN */
	if (((1 << sk->sk_state) &
	     (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
	    msk->write_seq == atomic64_read(&msk->snd_una)) {
		/* everything we sent has been acked: no retransmissions
		 * pending any more
		 */
		mptcp_stop_timer(sk);

		WRITE_ONCE(msk->snd_data_fin_enable, 0);

		switch (sk->sk_state) {
		case TCP_FIN_WAIT1:
			inet_sk_state_store(sk, TCP_FIN_WAIT2);
			sk->sk_state_change(sk);
			break;
		case TCP_CLOSING:
		case TCP_LAST_ACK:
			inet_sk_state_store(sk, TCP_CLOSE);
			sk->sk_state_change(sk);
			break;
		}

		/* wake sleepers with HUP only on full shutdown/close */
		if (sk->sk_shutdown == SHUTDOWN_MASK ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
		else
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	}
}
362*4882a593Smuzhiyun
/* Return true when a received DATA_FIN is pending and the msk has caught
 * up to its sequence number; in that case store the sequence in @seq if
 * non-NULL.
 */
static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (READ_ONCE(msk->rcv_data_fin) &&
	    ((1 << sk->sk_state) &
	     (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
		u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);

		/* only actionable once all in-sequence data up to the
		 * DATA_FIN has been received
		 */
		if (msk->ack_seq == rcv_data_fin_seq) {
			if (seq)
				*seq = rcv_data_fin_seq;

			return true;
		}
	}

	return false;
}
382*4882a593Smuzhiyun
/* Update the msk retransmit interval: mirror @ssk's pending timeout when
 * available, otherwise keep the current value, falling back to
 * TCP_RTO_MIN when none is set.
 */
static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
{
	long tout = ssk && inet_csk(ssk)->icsk_pending ?
				      inet_csk(ssk)->icsk_timeout - jiffies : 0;

	if (tout <= 0)
		tout = mptcp_sk(sk)->timer_ival;
	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
392*4882a593Smuzhiyun
/* Act on a pending, caught-up DATA_FIN: ack it on every subflow and move
 * the msk state machine forward. Runs under the msk socket lock.
 */
static void mptcp_check_data_fin(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	u64 rcv_data_fin_seq;

	if (__mptcp_check_fallback(msk) || !msk->first)
		return;

	/* Need to ack a DATA_FIN received from a peer while this side
	 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
	 * msk->rcv_data_fin was set when parsing the incoming options
	 * at the subflow level and the msk lock was not held, so this
	 * is the first opportunity to act on the DATA_FIN and change
	 * the msk state.
	 *
	 * If we are caught up to the sequence number of the incoming
	 * DATA_FIN, send the DATA_ACK now and do state transition. If
	 * not caught up, do nothing and let the recv code send DATA_ACK
	 * when catching up.
	 */

	if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
		struct mptcp_subflow_context *subflow;

		/* the DATA_FIN consumes one sequence number */
		WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
		WRITE_ONCE(msk->rcv_data_fin, 0);

		sk->sk_shutdown |= RCV_SHUTDOWN;
		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
		set_bit(MPTCP_DATA_READY, &msk->flags);

		switch (sk->sk_state) {
		case TCP_ESTABLISHED:
			inet_sk_state_store(sk, TCP_CLOSE_WAIT);
			break;
		case TCP_FIN_WAIT1:
			inet_sk_state_store(sk, TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			inet_sk_state_store(sk, TCP_CLOSE);
			// @@ Close subflows now?
			break;
		default:
			/* Other states not expected */
			WARN_ON_ONCE(1);
			break;
		}

		/* send the DATA_ACK carrying the updated ack_seq on
		 * every subflow
		 */
		mptcp_set_timeout(sk, NULL);
		mptcp_for_each_subflow(msk, subflow) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

			lock_sock(ssk);
			tcp_send_ack(ssk);
			release_sock(ssk);
		}

		sk->sk_state_change(sk);

		/* HUP only on full shutdown/close, otherwise plain
		 * data-ready wakeup
		 */
		if (sk->sk_shutdown == SHUTDOWN_MASK ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
		else
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	}
}
459*4882a593Smuzhiyun
/* Drain @ssk's receive queue into the msk level, updating tp->copied_seq
 * as data is consumed. @bytes is incremented by the amount of data made
 * available in-sequence at the msk level. Returns true when the subflow
 * offers no more immediately-processable data (FIN seen, urgent data,
 * empty queue, or msk over its receive buffer limit).
 */
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
					   struct sock *ssk,
					   unsigned int *bytes)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;
	bool more_data_avail;
	struct tcp_sock *tp;
	u32 old_copied_seq;
	bool done = false;

	pr_debug("msk=%p ssk=%p", msk, ssk);
	tp = tcp_sk(ssk);
	old_copied_seq = tp->copied_seq;
	do {
		u32 map_remaining, offset;
		u32 seq = tp->copied_seq;
		struct sk_buff *skb;
		bool fin;

		/* try to move as much data as available */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);

		skb = skb_peek(&ssk->sk_receive_queue);
		if (!skb) {
			/* if no data is found, a racing workqueue/recvmsg
			 * already processed the new data, stop here or we
			 * can enter an infinite loop
			 */
			if (!moved)
				done = true;
			break;
		}

		if (__mptcp_check_fallback(msk)) {
			/* if we are running under the workqueue, TCP could have
			 * collapsed skbs between dummy map creation and now
			 * be sure to adjust the size
			 */
			map_remaining = skb->len;
			subflow->map_data_len = skb->len;
		}

		offset = seq - TCP_SKB_CB(skb)->seq;
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (fin) {
			/* the FIN consumes one subflow sequence number */
			done = true;
			seq++;
		}

		if (offset < skb->len) {
			size_t len = skb->len - offset;

			if (tp->urg_data)
				done = true;

			/* __mptcp_move_skb() always consumes the skb;
			 * it returns true only for in-sequence data
			 */
			if (__mptcp_move_skb(msk, ssk, skb, offset, len))
				moved += len;
			seq += len;

			if (WARN_ON_ONCE(map_remaining < len))
				break;
		} else {
			/* fully consumed skb: only a bare FIN is expected */
			WARN_ON_ONCE(!fin);
			sk_eat_skb(ssk, skb);
			done = true;
		}

		WRITE_ONCE(tp->copied_seq, seq);
		more_data_avail = mptcp_subflow_data_available(ssk);

		/* stop draining once the msk receive buffer is full */
		if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) {
			done = true;
			break;
		}
	} while (more_data_avail);

	*bytes += moved;
	if (tp->copied_seq != old_copied_seq)
		tcp_cleanup_rbuf(ssk, 1);

	return done;
}
545*4882a593Smuzhiyun
/* Move now-in-sequence skbs from the out-of-order rbtree to the msk
 * receive queue, advancing msk->ack_seq. Fully-duplicate entries are
 * dropped, partially-overlapping ones get their offset adjusted.
 * Returns true when at least one skb was made available to the reader.
 */
static bool mptcp_ofo_queue(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb, *tail;
	bool moved = false;
	struct rb_node *p;
	u64 end_seq;

	p = rb_first(&msk->out_of_order_queue);
	pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
	while (p) {
		skb = rb_to_skb(p);
		/* the tree is ordered by map_seq: stop at the first hole */
		if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
			break;

		p = rb_next(p);
		rb_erase(&skb->rbnode, &msk->out_of_order_queue);

		if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
				      msk->ack_seq))) {
			/* entirely already-received data */
			mptcp_drop(sk, skb);
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
			continue;
		}

		end_seq = MPTCP_SKB_CB(skb)->end_seq;
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
			int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;

			/* skip overlapping data, if any */
			pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
				 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
				 delta);
			MPTCP_SKB_CB(skb)->offset += delta;
			__skb_queue_tail(&sk->sk_receive_queue, skb);
		}
		msk->ack_seq = end_seq;
		moved = true;
	}
	return moved;
}
588*4882a593Smuzhiyun
/* In most cases we will be able to lock the mptcp socket.  If its already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 *
 * Returns true when some data was actually moved to the msk level.
 */
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;

	/* cheap, lockless pre-check: bail out early when a process owns
	 * the msk socket
	 */
	if (READ_ONCE(sk->sk_lock.owned))
		return false;

	if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
		return false;

	/* must re-check after taking the lock */
	if (!READ_ONCE(sk->sk_lock.owned)) {
		__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
		mptcp_ofo_queue(msk);

		/* If the moves have caught up with the DATA_FIN sequence number
		 * it's time to ack the DATA_FIN and change socket state, but
		 * this is not a good place to change state. Let the workqueue
		 * do it.
		 */
		if (mptcp_pending_data_fin(sk, NULL) &&
		    schedule_work(&msk->work))
			sock_hold(sk);
	}

	spin_unlock_bh(&sk->sk_lock.slock);

	return moved > 0;
}
622*4882a593Smuzhiyun
/* Subflow data-ready hook: try to move new data from @ssk to the msk
 * level immediately; when the msk socket is owned, defer the move via
 * the release callback. Wakes the msk reader as needed.
 */
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_sock *msk = mptcp_sk(sk);
	bool wake;

	/* move_skbs_to_msk below can legitly clear the data_avail flag,
	 * but we will need later to properly woke the reader, cache its
	 * value
	 */
	wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
	if (wake)
		set_bit(MPTCP_DATA_READY, &msk->flags);

	if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
	    move_skbs_to_msk(msk, ssk))
		goto wake;

	/* don't schedule if mptcp sk is (still) over limit */
	if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
		goto wake;

	/* mptcp socket is owned, release_cb should retry */
	if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
			      &sk->sk_tsq_flags)) {
		/* paired with the sock_put() done by the deferred
		 * release callback handling
		 */
		sock_hold(sk);

		/* need to try again, its possible release_cb() has already
		 * been called after the test_and_set_bit() above.
		 */
		move_skbs_to_msk(msk, ssk);
	}
wake:
	if (wake)
		sk->sk_data_ready(sk);
}
659*4882a593Smuzhiyun
/* Move all recently-joined subflows from the join list to the main
 * connection list. The lockless emptiness pre-check keeps the common
 * (no new joins) path cheap.
 */
static void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
	if (likely(list_empty(&msk->join_list)))
		return;

	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
}
669*4882a593Smuzhiyun
mptcp_timer_pending(struct sock * sk)670*4882a593Smuzhiyun static bool mptcp_timer_pending(struct sock *sk)
671*4882a593Smuzhiyun {
672*4882a593Smuzhiyun return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
673*4882a593Smuzhiyun }
674*4882a593Smuzhiyun
mptcp_reset_timer(struct sock * sk)675*4882a593Smuzhiyun static void mptcp_reset_timer(struct sock *sk)
676*4882a593Smuzhiyun {
677*4882a593Smuzhiyun struct inet_connection_sock *icsk = inet_csk(sk);
678*4882a593Smuzhiyun unsigned long tout;
679*4882a593Smuzhiyun
680*4882a593Smuzhiyun /* should never be called with mptcp level timer cleared */
681*4882a593Smuzhiyun tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
682*4882a593Smuzhiyun if (WARN_ON_ONCE(!tout))
683*4882a593Smuzhiyun tout = TCP_RTO_MIN;
684*4882a593Smuzhiyun sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
685*4882a593Smuzhiyun }
686*4882a593Smuzhiyun
mptcp_data_acked(struct sock * sk)687*4882a593Smuzhiyun void mptcp_data_acked(struct sock *sk)
688*4882a593Smuzhiyun {
689*4882a593Smuzhiyun mptcp_reset_timer(sk);
690*4882a593Smuzhiyun
691*4882a593Smuzhiyun if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
692*4882a593Smuzhiyun (inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
693*4882a593Smuzhiyun schedule_work(&mptcp_sk(sk)->work))
694*4882a593Smuzhiyun sock_hold(sk);
695*4882a593Smuzhiyun }
696*4882a593Smuzhiyun
mptcp_subflow_eof(struct sock * sk)697*4882a593Smuzhiyun void mptcp_subflow_eof(struct sock *sk)
698*4882a593Smuzhiyun {
699*4882a593Smuzhiyun struct mptcp_sock *msk = mptcp_sk(sk);
700*4882a593Smuzhiyun
701*4882a593Smuzhiyun if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) &&
702*4882a593Smuzhiyun schedule_work(&msk->work))
703*4882a593Smuzhiyun sock_hold(sk);
704*4882a593Smuzhiyun }
705*4882a593Smuzhiyun
mptcp_check_for_eof(struct mptcp_sock * msk)706*4882a593Smuzhiyun static void mptcp_check_for_eof(struct mptcp_sock *msk)
707*4882a593Smuzhiyun {
708*4882a593Smuzhiyun struct mptcp_subflow_context *subflow;
709*4882a593Smuzhiyun struct sock *sk = (struct sock *)msk;
710*4882a593Smuzhiyun int receivers = 0;
711*4882a593Smuzhiyun
712*4882a593Smuzhiyun mptcp_for_each_subflow(msk, subflow)
713*4882a593Smuzhiyun receivers += !subflow->rx_eof;
714*4882a593Smuzhiyun
715*4882a593Smuzhiyun if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
716*4882a593Smuzhiyun /* hopefully temporary hack: propagate shutdown status
717*4882a593Smuzhiyun * to msk, when all subflows agree on it
718*4882a593Smuzhiyun */
719*4882a593Smuzhiyun sk->sk_shutdown |= RCV_SHUTDOWN;
720*4882a593Smuzhiyun
721*4882a593Smuzhiyun smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
722*4882a593Smuzhiyun set_bit(MPTCP_DATA_READY, &msk->flags);
723*4882a593Smuzhiyun sk->sk_data_ready(sk);
724*4882a593Smuzhiyun }
725*4882a593Smuzhiyun }
726*4882a593Smuzhiyun
mptcp_ext_cache_refill(struct mptcp_sock * msk)727*4882a593Smuzhiyun static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
728*4882a593Smuzhiyun {
729*4882a593Smuzhiyun const struct sock *sk = (const struct sock *)msk;
730*4882a593Smuzhiyun
731*4882a593Smuzhiyun if (!msk->cached_ext)
732*4882a593Smuzhiyun msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);
733*4882a593Smuzhiyun
734*4882a593Smuzhiyun return !!msk->cached_ext;
735*4882a593Smuzhiyun }
736*4882a593Smuzhiyun
mptcp_subflow_recv_lookup(const struct mptcp_sock * msk)737*4882a593Smuzhiyun static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
738*4882a593Smuzhiyun {
739*4882a593Smuzhiyun struct mptcp_subflow_context *subflow;
740*4882a593Smuzhiyun struct sock *sk = (struct sock *)msk;
741*4882a593Smuzhiyun
742*4882a593Smuzhiyun sock_owned_by_me(sk);
743*4882a593Smuzhiyun
744*4882a593Smuzhiyun mptcp_for_each_subflow(msk, subflow) {
745*4882a593Smuzhiyun if (subflow->data_avail)
746*4882a593Smuzhiyun return mptcp_subflow_tcp_sock(subflow);
747*4882a593Smuzhiyun }
748*4882a593Smuzhiyun
749*4882a593Smuzhiyun return NULL;
750*4882a593Smuzhiyun }
751*4882a593Smuzhiyun
mptcp_skb_can_collapse_to(u64 write_seq,const struct sk_buff * skb,const struct mptcp_ext * mpext)752*4882a593Smuzhiyun static bool mptcp_skb_can_collapse_to(u64 write_seq,
753*4882a593Smuzhiyun const struct sk_buff *skb,
754*4882a593Smuzhiyun const struct mptcp_ext *mpext)
755*4882a593Smuzhiyun {
756*4882a593Smuzhiyun if (!tcp_skb_can_collapse_to(skb))
757*4882a593Smuzhiyun return false;
758*4882a593Smuzhiyun
759*4882a593Smuzhiyun /* can collapse only if MPTCP level sequence is in order */
760*4882a593Smuzhiyun return mpext && mpext->data_seq + mpext->data_len == write_seq;
761*4882a593Smuzhiyun }
762*4882a593Smuzhiyun
763*4882a593Smuzhiyun /* we can append data to the given data frag if:
764*4882a593Smuzhiyun * - there is space available in the backing page_frag
765*4882a593Smuzhiyun * - the data frag tail matches the current page_frag free offset
766*4882a593Smuzhiyun * - the data frag end sequence number matches the current write seq
767*4882a593Smuzhiyun */
mptcp_frag_can_collapse_to(const struct mptcp_sock * msk,const struct page_frag * pfrag,const struct mptcp_data_frag * df)768*4882a593Smuzhiyun static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
769*4882a593Smuzhiyun const struct page_frag *pfrag,
770*4882a593Smuzhiyun const struct mptcp_data_frag *df)
771*4882a593Smuzhiyun {
772*4882a593Smuzhiyun return df && pfrag->page == df->page &&
773*4882a593Smuzhiyun pfrag->offset == (df->offset + df->data_len) &&
774*4882a593Smuzhiyun df->data_seq + df->data_len == msk->write_seq;
775*4882a593Smuzhiyun }
776*4882a593Smuzhiyun
/* Return @len bytes of write-queue memory accounting to @sk. */
static void dfrag_uncharge(struct sock *sk, int len)
{
	sk_wmem_queued_add(sk, -len);
	sk_mem_uncharge(sk, len);
}
782*4882a593Smuzhiyun
dfrag_clear(struct sock * sk,struct mptcp_data_frag * dfrag)783*4882a593Smuzhiyun static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
784*4882a593Smuzhiyun {
785*4882a593Smuzhiyun int len = dfrag->data_len + dfrag->overhead;
786*4882a593Smuzhiyun
787*4882a593Smuzhiyun list_del(&dfrag->list);
788*4882a593Smuzhiyun dfrag_uncharge(sk, len);
789*4882a593Smuzhiyun put_page(dfrag->page);
790*4882a593Smuzhiyun }
791*4882a593Smuzhiyun
mptcp_is_writeable(struct mptcp_sock * msk)792*4882a593Smuzhiyun static bool mptcp_is_writeable(struct mptcp_sock *msk)
793*4882a593Smuzhiyun {
794*4882a593Smuzhiyun struct mptcp_subflow_context *subflow;
795*4882a593Smuzhiyun
796*4882a593Smuzhiyun if (!sk_stream_is_writeable((struct sock *)msk))
797*4882a593Smuzhiyun return false;
798*4882a593Smuzhiyun
799*4882a593Smuzhiyun mptcp_for_each_subflow(msk, subflow) {
800*4882a593Smuzhiyun if (sk_stream_is_writeable(subflow->tcp_sock))
801*4882a593Smuzhiyun return true;
802*4882a593Smuzhiyun }
803*4882a593Smuzhiyun return false;
804*4882a593Smuzhiyun }
805*4882a593Smuzhiyun
/* Release rtx queue data that has been acked at the MPTCP level and,
 * when memory was freed, possibly wake up blocked writers.
 * Must be called with the msk socket lock held.
 */
static void mptcp_clean_una(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;
	bool cleaned = false;
	u64 snd_una;

	/* on fallback we just need to ignore snd_una, as this is really
	 * plain TCP
	 */
	if (__mptcp_check_fallback(msk))
		atomic64_set(&msk->snd_una, msk->write_seq);
	snd_una = atomic64_read(&msk->snd_una);

	/* drop every dfrag fully covered by snd_una; the queue is kept in
	 * sequence order, so stop at the first still-unacked one
	 */
	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
			break;

		dfrag_clear(sk, dfrag);
		cleaned = true;
	}

	/* the queue head may be only partially acked: trim the acked
	 * prefix and uncharge just that delta
	 */
	dfrag = mptcp_rtx_head(sk);
	if (dfrag && after64(snd_una, dfrag->data_seq)) {
		u64 delta = snd_una - dfrag->data_seq;

		if (WARN_ON_ONCE(delta > dfrag->data_len))
			goto out;

		dfrag->data_seq += delta;
		dfrag->offset += delta;
		dfrag->data_len -= delta;

		dfrag_uncharge(sk, delta);
		cleaned = true;
	}

out:
	if (cleaned) {
		sk_mem_reclaim_partial(sk);

		/* Only wake up writers if a subflow is ready */
		if (mptcp_is_writeable(msk)) {
			set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags);
			smp_mb__after_atomic();

			/* set SEND_SPACE before sk_stream_write_space clears
			 * NOSPACE
			 */
			sk_stream_write_space(sk);
		}
	}
}
859*4882a593Smuzhiyun
860*4882a593Smuzhiyun /* ensure we get enough memory for the frag hdr, beyond some minimal amount of
861*4882a593Smuzhiyun * data
862*4882a593Smuzhiyun */
mptcp_page_frag_refill(struct sock * sk,struct page_frag * pfrag)863*4882a593Smuzhiyun static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
864*4882a593Smuzhiyun {
865*4882a593Smuzhiyun if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
866*4882a593Smuzhiyun pfrag, sk->sk_allocation)))
867*4882a593Smuzhiyun return true;
868*4882a593Smuzhiyun
869*4882a593Smuzhiyun sk->sk_prot->enter_memory_pressure(sk);
870*4882a593Smuzhiyun sk_stream_moderate_sndbuf(sk);
871*4882a593Smuzhiyun return false;
872*4882a593Smuzhiyun }
873*4882a593Smuzhiyun
874*4882a593Smuzhiyun static struct mptcp_data_frag *
mptcp_carve_data_frag(const struct mptcp_sock * msk,struct page_frag * pfrag,int orig_offset)875*4882a593Smuzhiyun mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
876*4882a593Smuzhiyun int orig_offset)
877*4882a593Smuzhiyun {
878*4882a593Smuzhiyun int offset = ALIGN(orig_offset, sizeof(long));
879*4882a593Smuzhiyun struct mptcp_data_frag *dfrag;
880*4882a593Smuzhiyun
881*4882a593Smuzhiyun dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
882*4882a593Smuzhiyun dfrag->data_len = 0;
883*4882a593Smuzhiyun dfrag->data_seq = msk->write_seq;
884*4882a593Smuzhiyun dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
885*4882a593Smuzhiyun dfrag->offset = offset + sizeof(struct mptcp_data_frag);
886*4882a593Smuzhiyun dfrag->page = pfrag->page;
887*4882a593Smuzhiyun
888*4882a593Smuzhiyun return dfrag;
889*4882a593Smuzhiyun }
890*4882a593Smuzhiyun
/* Push data from the MPTCP socket @sk onto the subflow @ssk.
 *
 * When @dfrag is NULL, new data is pulled from @msg, copied into the
 * per-socket page frag and tracked in a (possibly freshly carved) data
 * frag on the rtx queue for later MPTCP-level retransmission; otherwise
 * the existing @dfrag is (re)transmitted as-is.
 *
 * Returns the number of bytes handed to the TCP stack, or a negative
 * errno. On success it advances the MPTCP write sequence and the
 * subflow relative sequence, and attaches/extends the DSS mapping
 * extension on the tail skb. *@pmss_now and *@ps_goal report the
 * subflow's current mss and size goal back to the caller.
 */
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
			      struct msghdr *msg, struct mptcp_data_frag *dfrag,
			      long *timeo, int *pmss_now,
			      int *ps_goal)
{
	int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
	bool dfrag_collapsed, can_collapse = false;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_ext *mpext = NULL;
	bool retransmission = !!dfrag;
	struct sk_buff *skb, *tail;
	struct page_frag *pfrag;
	struct page *page;
	u64 *write_seq;
	size_t psize;

	/* use the mptcp page cache so that we can easily move the data
	 * from one substream to another, but do per subflow memory accounting
	 * Note: pfrag is used only !retransmission, but the compiler if
	 * fooled into a warning if we don't init here
	 */
	pfrag = sk_page_frag(sk);
	if (!retransmission) {
		write_seq = &msk->write_seq;
		page = pfrag->page;
	} else {
		write_seq = &dfrag->data_seq;
		page = dfrag->page;
	}

	/* compute copy limit */
	mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
	*pmss_now = mss_now;
	*ps_goal = size_goal;
	avail_size = size_goal;
	skb = tcp_write_queue_tail(ssk);
	if (skb) {
		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);

		/* Limit the write to the size available in the
		 * current skb, if any, so that we create at most a new skb.
		 * Explicitly tells TCP internals to avoid collapsing on later
		 * queue management operation, to avoid breaking the ext <->
		 * SSN association set here
		 */
		can_collapse = (size_goal - skb->len > 0) &&
			      mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
		if (!can_collapse)
			TCP_SKB_CB(skb)->eor = 1;
		else
			avail_size = size_goal - skb->len;
	}

	if (!retransmission) {
		/* reuse tail pfrag, if possible, or carve a new one from the
		 * page allocator
		 */
		dfrag = mptcp_rtx_tail(sk);
		offset = pfrag->offset;
		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
		if (!dfrag_collapsed) {
			dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
			offset = dfrag->offset;
			frag_truesize = dfrag->overhead;
		}
		psize = min_t(size_t, pfrag->size - offset, avail_size);

		/* Copy to page */
		pr_debug("left=%zu", msg_data_left(msg));
		psize = copy_page_from_iter(pfrag->page, offset,
					    min_t(size_t, msg_data_left(msg),
						  psize),
					    &msg->msg_iter);
		pr_debug("left=%zu", msg_data_left(msg));
		if (!psize)
			return -EINVAL;

		/* reserve msk-level fwd memory for both the data and the
		 * dfrag overhead; on failure give the bytes back to the
		 * caller's iterator
		 */
		if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) {
			iov_iter_revert(&msg->msg_iter, psize);
			return -ENOMEM;
		}
	} else {
		offset = dfrag->offset;
		psize = min_t(size_t, dfrag->data_len, avail_size);
	}

	/* tell the TCP stack to delay the push so that we can safely
	 * access the skb after the sendpages call
	 */
	ret = do_tcp_sendpages(ssk, page, offset, psize,
			       msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
	if (ret <= 0) {
		if (!retransmission)
			iov_iter_revert(&msg->msg_iter, psize);
		return ret;
	}

	frag_truesize += ret;
	if (!retransmission) {
		/* partial write: return the uncopied tail to the caller */
		if (unlikely(ret < psize))
			iov_iter_revert(&msg->msg_iter, psize - ret);

		/* send successful, keep track of sent data for mptcp-level
		 * retransmission
		 */
		dfrag->data_len += ret;
		if (!dfrag_collapsed) {
			get_page(dfrag->page);
			list_add_tail(&dfrag->list, &msk->rtx_queue);
			sk_wmem_queued_add(sk, frag_truesize);
		} else {
			sk_wmem_queued_add(sk, ret);
		}

		/* charge data on mptcp rtx queue to the master socket
		 * Note: we charge such data both to sk and ssk
		 */
		sk->sk_forward_alloc -= frag_truesize;
	}

	/* if the tail skb extension is still the cached one, collapsing
	 * really happened. Note: we can't check for 'same skb' as the sk_buff
	 * hdr on tail can be transmitted, freed and re-allocated by the
	 * do_tcp_sendpages() call
	 */
	tail = tcp_write_queue_tail(ssk);
	if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
		WARN_ON_ONCE(!can_collapse);
		mpext->data_len += ret;
		goto out;
	}

	/* a new mapping is needed: attach the cached extension to the tail
	 * skb and fill in the current MPTCP-level sequence data
	 */
	skb = tcp_write_queue_tail(ssk);
	mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
	msk->cached_ext = NULL;

	memset(mpext, 0, sizeof(*mpext));
	mpext->data_seq = *write_seq;
	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
	mpext->data_len = ret;
	mpext->use_map = 1;
	mpext->dsn64 = 1;

	pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
		 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
		 mpext->dsn64);

out:
	if (!retransmission)
		pfrag->offset += frag_truesize;
	WRITE_ONCE(*write_seq, *write_seq + ret);
	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;

	return ret;
}
1046*4882a593Smuzhiyun
/* Mark the msk as out of send space and enable write_space callbacks on
 * every subflow, so the first one freeing memory can wake us up again.
 */
static void mptcp_nospace(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;

	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct socket *sock = READ_ONCE(ssk->sk_socket);

		/* enables ssk->write_space() callbacks */
		if (sock)
			set_bit(SOCK_NOSPACE, &sock->flags);
	}
}
1063*4882a593Smuzhiyun
mptcp_subflow_active(struct mptcp_subflow_context * subflow)1064*4882a593Smuzhiyun static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
1065*4882a593Smuzhiyun {
1066*4882a593Smuzhiyun struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1067*4882a593Smuzhiyun
1068*4882a593Smuzhiyun /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
1069*4882a593Smuzhiyun if (subflow->request_join && !subflow->fully_established)
1070*4882a593Smuzhiyun return false;
1071*4882a593Smuzhiyun
1072*4882a593Smuzhiyun /* only send if our side has not closed yet */
1073*4882a593Smuzhiyun return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
1074*4882a593Smuzhiyun }
1075*4882a593Smuzhiyun
/* Max amount of data scheduled on a single subflow in one burst: stay
 * below 64 KiB, leaving room for the TCP header, the maximum TCP option
 * space and, worst case, the IPv6 + fragment headers.
 */
#define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
					 sizeof(struct tcphdr) - \
					 MAX_TCP_OPTION_SPACE - \
					 sizeof(struct ipv6hdr) - \
					 sizeof(struct frag_hdr))

/* per-candidate state used by the packet scheduler */
struct subflow_send_info {
	struct sock *ssk;	/* candidate subflow socket */
	u64 ratio;		/* queued-bytes / pacing-rate, lower is better */
};
1086*4882a593Smuzhiyun
/* Packet scheduler: pick the subflow to transmit on.
 *
 * Returns the chosen subflow socket and stores the largest subflow
 * send window in *@sndbuf (used by the caller for autotuning), or
 * NULL when no subflow can accept data. Must be called with the msk
 * socket lock held.
 */
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
					   u32 *sndbuf)
{
	/* [0] tracks the best non-backup candidate, [1] the best backup */
	struct subflow_send_info send_info[2];
	struct mptcp_subflow_context *subflow;
	int i, nr_active = 0;
	struct sock *ssk;
	u64 ratio;
	u32 pace;

	sock_owned_by_me((struct sock *)msk);

	*sndbuf = 0;
	/* make sure an skb extension is available before transmitting */
	if (!mptcp_ext_cache_refill(msk))
		return NULL;

	/* after fallback only the first subflow exists and carries data */
	if (__mptcp_check_fallback(msk)) {
		if (!msk->first)
			return NULL;
		*sndbuf = msk->first->sk_sndbuf;
		return sk_stream_memory_free(msk->first) ? msk->first : NULL;
	}

	/* re-use last subflow, if the burst allow that */
	if (msk->last_snd && msk->snd_burst > 0 &&
	    sk_stream_memory_free(msk->last_snd) &&
	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
		mptcp_for_each_subflow(msk, subflow) {
			ssk = mptcp_subflow_tcp_sock(subflow);
			*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
		}
		return msk->last_snd;
	}

	/* pick the subflow with the lower wmem/wspace ratio */
	for (i = 0; i < 2; ++i) {
		send_info[i].ssk = NULL;
		send_info[i].ratio = -1;
	}
	mptcp_for_each_subflow(msk, subflow) {
		ssk = mptcp_subflow_tcp_sock(subflow);
		if (!mptcp_subflow_active(subflow))
			continue;

		nr_active += !subflow->backup;
		*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
		if (!sk_stream_memory_free(subflow->tcp_sock))
			continue;

		pace = READ_ONCE(ssk->sk_pacing_rate);
		if (!pace)
			continue;

		/* queued bytes scaled by 2^32 over the pacing rate: a
		 * lower value means the subflow drains sooner
		 */
		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
				pace);
		if (ratio < send_info[subflow->backup].ratio) {
			send_info[subflow->backup].ssk = ssk;
			send_info[subflow->backup].ratio = ratio;
		}
	}

	pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
		 msk, nr_active, send_info[0].ssk, send_info[0].ratio,
		 send_info[1].ssk, send_info[1].ratio);

	/* pick the best backup if no other subflow is active */
	if (!nr_active)
		send_info[0].ssk = send_info[1].ssk;

	if (send_info[0].ssk) {
		msk->last_snd = send_info[0].ssk;
		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
				       sk_stream_wspace(msk->last_snd));
		return msk->last_snd;
	}
	return NULL;
}
1164*4882a593Smuzhiyun
/* If the msk just became unwriteable, flag the no-space condition. */
static void ssk_check_wmem(struct mptcp_sock *msk)
{
	if (mptcp_is_writeable(msk))
		return;

	mptcp_nospace(msk);
}
1170*4882a593Smuzhiyun
/* sendmsg() entry point for MPTCP sockets.
 *
 * Waits for the connection to be established if needed, then repeatedly
 * picks a subflow via the packet scheduler and pushes user data through
 * mptcp_sendmsg_frag(). Both the msk lock and the chosen subflow lock
 * are held while transmitting; any condition that requires re-running
 * the scheduler (memory shortage, -EAGAIN, cache refill failure) drops
 * the subflow lock and jumps back to the 'restart' label.
 *
 * Returns the number of bytes copied, or a negative errno when nothing
 * was sent.
 */
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int mss_now = 0, size_goal = 0, ret = 0;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct page_frag *pfrag;
	size_t copied = 0;
	struct sock *ssk;
	u32 sndbuf;
	bool tx_ok;
	long timeo;

	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
		return -EOPNOTSUPP;

	lock_sock(sk);

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	/* wait for the 3-way handshake (plus MP_CAPABLE) to complete */
	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
		ret = sk_stream_wait_connect(sk, &timeo);
		if (ret)
			goto out;
	}

	pfrag = sk_page_frag(sk);
restart:
	mptcp_clean_una(sk);

	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
		ret = -EPIPE;
		goto out;
	}

	__mptcp_flush_join_list(msk);
	ssk = mptcp_subflow_get_send(msk, &sndbuf);
	/* block until both the msk and some subflow have memory available */
	while (!sk_stream_memory_free(sk) ||
	       !ssk ||
	       !mptcp_page_frag_refill(ssk, pfrag)) {
		if (ssk) {
			/* make sure retransmit timer is
			 * running before we wait for memory.
			 *
			 * The retransmit timer might be needed
			 * to make the peer send an up-to-date
			 * MPTCP Ack.
			 */
			mptcp_set_timeout(sk, ssk);
			if (!mptcp_timer_pending(sk))
				mptcp_reset_timer(sk);
		}

		mptcp_nospace(msk);
		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
			goto out;

		mptcp_clean_una(sk);

		ssk = mptcp_subflow_get_send(msk, &sndbuf);
		if (list_empty(&msk->conn_list)) {
			ret = -ENOTCONN;
			goto out;
		}
	}

	/* do auto tuning */
	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
	    sndbuf > READ_ONCE(sk->sk_sndbuf))
		WRITE_ONCE(sk->sk_sndbuf, sndbuf);

	pr_debug("conn_list->subflow=%p", ssk);

	lock_sock(ssk);
	tx_ok = msg_data_left(msg);
	while (tx_ok) {
		ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
					 &size_goal);
		if (ret < 0) {
			if (ret == -EAGAIN && timeo > 0) {
				mptcp_set_timeout(sk, ssk);
				release_sock(ssk);
				goto restart;
			}
			break;
		}

		/* burst can be negative, we will try move to the next subflow
		 * at selection time, if possible.
		 */
		msk->snd_burst -= ret;
		copied += ret;

		tx_ok = msg_data_left(msg);
		if (!tx_ok)
			break;

		/* subflow out of memory or caches empty: flush what we have
		 * and go back to subflow selection
		 */
		if (!sk_stream_memory_free(ssk) ||
		    !mptcp_page_frag_refill(ssk, pfrag) ||
		    !mptcp_ext_cache_refill(msk)) {
			tcp_push(ssk, msg->msg_flags, mss_now,
				 tcp_sk(ssk)->nonagle, size_goal);
			mptcp_set_timeout(sk, ssk);
			release_sock(ssk);
			goto restart;
		}

		/* memory is charged to mptcp level socket as well, i.e.
		 * if msg is very large, mptcp socket may run out of buffer
		 * space. mptcp_clean_una() will release data that has
		 * been acked at mptcp level in the mean time, so there is
		 * a good chance we can continue sending data right away.
		 *
		 * Normally, when the tcp subflow can accept more data, then
		 * so can the MPTCP socket. However, we need to cope with
		 * peers that might lag behind in their MPTCP-level
		 * acknowledgements, i.e. data might have been acked at
		 * tcp level only. So, we must also check the MPTCP socket
		 * limits before we send more data.
		 */
		if (unlikely(!sk_stream_memory_free(sk))) {
			tcp_push(ssk, msg->msg_flags, mss_now,
				 tcp_sk(ssk)->nonagle, size_goal);
			mptcp_clean_una(sk);
			if (!sk_stream_memory_free(sk)) {
				/* can't send more for now, need to wait for
				 * MPTCP-level ACKs from peer.
				 *
				 * Wakeup will happen via mptcp_clean_una().
				 */
				mptcp_set_timeout(sk, ssk);
				release_sock(ssk);
				goto restart;
			}
		}
	}

	mptcp_set_timeout(sk, ssk);
	if (copied) {
		tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
			 size_goal);

		/* start the timer, if it's not pending */
		if (!mptcp_timer_pending(sk))
			mptcp_reset_timer(sk);
	}

	release_sock(ssk);
out:
	ssk_check_wmem(msk);
	release_sock(sk);
	return copied ? : ret;
}
1323*4882a593Smuzhiyun
/* Sleep until MPTCP_DATA_READY is set, a signal arrives or *@timeo
 * expires; *@timeo is updated with the remaining time. sk_wait_event()
 * releases the msk socket lock while sleeping and re-acquires it
 * before returning.
 */
static void mptcp_wait_data(struct sock *sk, long *timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct mptcp_sock *msk = mptcp_sk(sk);

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

	sk_wait_event(sk, timeo,
		      test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);

	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
}
1338*4882a593Smuzhiyun
/* Copy up to @len bytes from the msk receive queue into @msg.
 *
 * Walks sk_receive_queue in order; a partially consumed skb stays on the
 * queue with MPTCP_SKB_CB(skb)->offset advanced, a fully consumed one is
 * unlinked and freed.
 *
 * Returns the number of bytes copied, or a negative error only when
 * nothing at all could be copied.
 */
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
				struct msghdr *msg,
				size_t len)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb;
	int copied = 0;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		u32 offset = MPTCP_SKB_CB(skb)->offset;
		u32 data_len = skb->len - offset;
		u32 count = min_t(size_t, len - copied, data_len);
		int err;

		err = skb_copy_datagram_msg(skb, offset, msg, count);
		if (unlikely(err < 0)) {
			/* report the error only if no progress was made */
			if (!copied)
				return err;
			break;
		}

		copied += count;

		if (count < data_len) {
			/* skb only partially consumed: remember where to
			 * resume on the next read
			 */
			MPTCP_SKB_CB(skb)->offset += count;
			break;
		}

		__skb_unlink(skb, &sk->sk_receive_queue);
		__kfree_skb(skb);

		if (copied >= len)
			break;
	}

	return copied;
}
1376*4882a593Smuzhiyun
1377*4882a593Smuzhiyun /* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
1378*4882a593Smuzhiyun *
1379*4882a593Smuzhiyun * Only difference: Use highest rtt estimate of the subflows in use.
1380*4882a593Smuzhiyun */
mptcp_rcv_space_adjust(struct mptcp_sock * msk,int copied)1381*4882a593Smuzhiyun static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
1382*4882a593Smuzhiyun {
1383*4882a593Smuzhiyun struct mptcp_subflow_context *subflow;
1384*4882a593Smuzhiyun struct sock *sk = (struct sock *)msk;
1385*4882a593Smuzhiyun u32 time, advmss = 1;
1386*4882a593Smuzhiyun u64 rtt_us, mstamp;
1387*4882a593Smuzhiyun
1388*4882a593Smuzhiyun sock_owned_by_me(sk);
1389*4882a593Smuzhiyun
1390*4882a593Smuzhiyun if (copied <= 0)
1391*4882a593Smuzhiyun return;
1392*4882a593Smuzhiyun
1393*4882a593Smuzhiyun msk->rcvq_space.copied += copied;
1394*4882a593Smuzhiyun
1395*4882a593Smuzhiyun mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
1396*4882a593Smuzhiyun time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);
1397*4882a593Smuzhiyun
1398*4882a593Smuzhiyun rtt_us = msk->rcvq_space.rtt_us;
1399*4882a593Smuzhiyun if (rtt_us && time < (rtt_us >> 3))
1400*4882a593Smuzhiyun return;
1401*4882a593Smuzhiyun
1402*4882a593Smuzhiyun rtt_us = 0;
1403*4882a593Smuzhiyun mptcp_for_each_subflow(msk, subflow) {
1404*4882a593Smuzhiyun const struct tcp_sock *tp;
1405*4882a593Smuzhiyun u64 sf_rtt_us;
1406*4882a593Smuzhiyun u32 sf_advmss;
1407*4882a593Smuzhiyun
1408*4882a593Smuzhiyun tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));
1409*4882a593Smuzhiyun
1410*4882a593Smuzhiyun sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
1411*4882a593Smuzhiyun sf_advmss = READ_ONCE(tp->advmss);
1412*4882a593Smuzhiyun
1413*4882a593Smuzhiyun rtt_us = max(sf_rtt_us, rtt_us);
1414*4882a593Smuzhiyun advmss = max(sf_advmss, advmss);
1415*4882a593Smuzhiyun }
1416*4882a593Smuzhiyun
1417*4882a593Smuzhiyun msk->rcvq_space.rtt_us = rtt_us;
1418*4882a593Smuzhiyun if (time < (rtt_us >> 3) || rtt_us == 0)
1419*4882a593Smuzhiyun return;
1420*4882a593Smuzhiyun
1421*4882a593Smuzhiyun if (msk->rcvq_space.copied <= msk->rcvq_space.space)
1422*4882a593Smuzhiyun goto new_measure;
1423*4882a593Smuzhiyun
1424*4882a593Smuzhiyun if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
1425*4882a593Smuzhiyun !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
1426*4882a593Smuzhiyun int rcvmem, rcvbuf;
1427*4882a593Smuzhiyun u64 rcvwin, grow;
1428*4882a593Smuzhiyun
1429*4882a593Smuzhiyun rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
1430*4882a593Smuzhiyun
1431*4882a593Smuzhiyun grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
1432*4882a593Smuzhiyun
1433*4882a593Smuzhiyun do_div(grow, msk->rcvq_space.space);
1434*4882a593Smuzhiyun rcvwin += (grow << 1);
1435*4882a593Smuzhiyun
1436*4882a593Smuzhiyun rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
1437*4882a593Smuzhiyun while (tcp_win_from_space(sk, rcvmem) < advmss)
1438*4882a593Smuzhiyun rcvmem += 128;
1439*4882a593Smuzhiyun
1440*4882a593Smuzhiyun do_div(rcvwin, advmss);
1441*4882a593Smuzhiyun rcvbuf = min_t(u64, rcvwin * rcvmem,
1442*4882a593Smuzhiyun READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
1443*4882a593Smuzhiyun
1444*4882a593Smuzhiyun if (rcvbuf > sk->sk_rcvbuf) {
1445*4882a593Smuzhiyun u32 window_clamp;
1446*4882a593Smuzhiyun
1447*4882a593Smuzhiyun window_clamp = tcp_win_from_space(sk, rcvbuf);
1448*4882a593Smuzhiyun WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
1449*4882a593Smuzhiyun
1450*4882a593Smuzhiyun /* Make subflows follow along. If we do not do this, we
1451*4882a593Smuzhiyun * get drops at subflow level if skbs can't be moved to
1452*4882a593Smuzhiyun * the mptcp rx queue fast enough (announced rcv_win can
1453*4882a593Smuzhiyun * exceed ssk->sk_rcvbuf).
1454*4882a593Smuzhiyun */
1455*4882a593Smuzhiyun mptcp_for_each_subflow(msk, subflow) {
1456*4882a593Smuzhiyun struct sock *ssk;
1457*4882a593Smuzhiyun bool slow;
1458*4882a593Smuzhiyun
1459*4882a593Smuzhiyun ssk = mptcp_subflow_tcp_sock(subflow);
1460*4882a593Smuzhiyun slow = lock_sock_fast(ssk);
1461*4882a593Smuzhiyun WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
1462*4882a593Smuzhiyun tcp_sk(ssk)->window_clamp = window_clamp;
1463*4882a593Smuzhiyun tcp_cleanup_rbuf(ssk, 1);
1464*4882a593Smuzhiyun unlock_sock_fast(ssk, slow);
1465*4882a593Smuzhiyun }
1466*4882a593Smuzhiyun }
1467*4882a593Smuzhiyun }
1468*4882a593Smuzhiyun
1469*4882a593Smuzhiyun msk->rcvq_space.space = msk->rcvq_space.copied;
1470*4882a593Smuzhiyun new_measure:
1471*4882a593Smuzhiyun msk->rcvq_space.copied = 0;
1472*4882a593Smuzhiyun msk->rcvq_space.time = mstamp;
1473*4882a593Smuzhiyun }
1474*4882a593Smuzhiyun
/* Drain in-sequence data from all subflows into the msk receive queue.
 *
 * Loops over subflows with pending data until none reports more work,
 * then flushes the out-of-order queue. Returns true if any data was
 * moved (in which case a DATA_FIN check is also performed).
 * Caller holds the msk socket lock.
 */
static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
	unsigned int moved = 0;
	bool done;

	/* avoid looping forever below on racing close */
	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
		return false;

	__mptcp_flush_join_list(msk);
	do {
		struct sock *ssk = mptcp_subflow_recv_lookup(msk);

		if (!ssk)
			break;

		lock_sock(ssk);
		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
		release_sock(ssk);
	} while (!done);

	if (mptcp_ofo_queue(msk) || moved > 0) {
		mptcp_check_data_fin((struct sock *)msk);
		return true;
	}
	return false;
}
1502*4882a593Smuzhiyun
/* MPTCP-level recvmsg().
 *
 * Copies data from the msk receive queue, pulling more from the subflows
 * via __mptcp_move_skbs() when the queue runs dry, and blocking in
 * mptcp_wait_data() as allowed by @nonblock/@timeo. Exit conditions
 * deliberately mirror tcp_recvmsg(). Returns bytes copied or a negative
 * error.
 */
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			 int nonblock, int flags, int *addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	int copied = 0;
	int target;
	long timeo;

	/* only plain and MSG_WAITALL reads are supported */
	if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
		return -EOPNOTSUPP;

	lock_sock(sk);
	timeo = sock_rcvtimeo(sk, nonblock);

	len = min_t(size_t, len, INT_MAX);
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
	__mptcp_flush_join_list(msk);

	while (len > (size_t)copied) {
		int bytes_read;

		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
		if (unlikely(bytes_read < 0)) {
			if (!copied)
				copied = bytes_read;
			goto out_err;
		}

		copied += bytes_read;

		/* queue drained: try to refill it from the subflows before
		 * deciding whether to block
		 */
		if (skb_queue_empty(&sk->sk_receive_queue) &&
		    __mptcp_move_skbs(msk))
			continue;

		/* only the master socket status is relevant here. The exit
		 * conditions mirror closely tcp_recvmsg()
		 */
		if (copied >= target)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
				mptcp_check_for_eof(msk);

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		pr_debug("block timeout %ld", timeo);
		mptcp_wait_data(sk, &timeo);
	}

	if (skb_queue_empty(&sk->sk_receive_queue)) {
		/* entire backlog drained, clear DATA_READY. */
		clear_bit(MPTCP_DATA_READY, &msk->flags);

		/* .. race-breaker: ssk might have gotten new data
		 * after last __mptcp_move_skbs() returned false.
		 */
		if (unlikely(__mptcp_move_skbs(msk)))
			set_bit(MPTCP_DATA_READY, &msk->flags);
	} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
		/* data to read but mptcp_wait_data() cleared DATA_READY */
		set_bit(MPTCP_DATA_READY, &msk->flags);
	}
out_err:
	pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
		 msk, test_bit(MPTCP_DATA_READY, &msk->flags),
		 skb_queue_empty(&sk->sk_receive_queue), copied);
	mptcp_rcv_space_adjust(msk, copied);

	release_sock(sk);
	return copied;
}
1604*4882a593Smuzhiyun
/* MPTCP retransmit timer expiry, socket-lock-owner context.
 *
 * If everything sent has been acked at the MPTCP level, just stop the
 * timer; otherwise flag retransmission work and kick the worker (which
 * takes a socket reference, released by the worker itself).
 */
static void mptcp_retransmit_handler(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq)) {
		mptcp_stop_timer(sk);
	} else {
		set_bit(MPTCP_WORK_RTX, &msk->flags);
		/* hold the socket only if the work was actually queued */
		if (schedule_work(&msk->work))
			sock_hold(sk);
	}
}
1617*4882a593Smuzhiyun
/* Timer callback for the (re-used) icsk retransmit timer.
 *
 * Runs in BH context: handle the expiry directly when the socket is not
 * owned by user context, otherwise defer to tcp_release_cb() via the
 * TCP_WRITE_TIMER_DEFERRED flag (taking an extra reference that the
 * deferred handler releases). The final sock_put() pairs with the
 * reference held by the armed timer.
 */
static void mptcp_retransmit_timer(struct timer_list *t)
{
	struct inet_connection_sock *icsk = from_timer(icsk, t,
						       icsk_retransmit_timer);
	struct sock *sk = &icsk->icsk_inet.sk;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		mptcp_retransmit_handler(sk);
	} else {
		/* delegate our work to tcp_release_cb() */
		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
				      &sk->sk_tsq_flags))
			sock_hold(sk);
	}
	bh_unlock_sock(sk);
	sock_put(sk);
}
1636*4882a593Smuzhiyun
1637*4882a593Smuzhiyun /* Find an idle subflow. Return NULL if there is unacked data at tcp
1638*4882a593Smuzhiyun * level.
1639*4882a593Smuzhiyun *
1640*4882a593Smuzhiyun * A backup subflow is returned only if that is the only kind available.
1641*4882a593Smuzhiyun */
mptcp_subflow_get_retrans(const struct mptcp_sock * msk)1642*4882a593Smuzhiyun static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
1643*4882a593Smuzhiyun {
1644*4882a593Smuzhiyun struct mptcp_subflow_context *subflow;
1645*4882a593Smuzhiyun struct sock *backup = NULL;
1646*4882a593Smuzhiyun
1647*4882a593Smuzhiyun sock_owned_by_me((const struct sock *)msk);
1648*4882a593Smuzhiyun
1649*4882a593Smuzhiyun if (__mptcp_check_fallback(msk))
1650*4882a593Smuzhiyun return msk->first;
1651*4882a593Smuzhiyun
1652*4882a593Smuzhiyun mptcp_for_each_subflow(msk, subflow) {
1653*4882a593Smuzhiyun struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1654*4882a593Smuzhiyun
1655*4882a593Smuzhiyun if (!mptcp_subflow_active(subflow))
1656*4882a593Smuzhiyun continue;
1657*4882a593Smuzhiyun
1658*4882a593Smuzhiyun /* still data outstanding at TCP level? Don't retransmit. */
1659*4882a593Smuzhiyun if (!tcp_write_queue_empty(ssk)) {
1660*4882a593Smuzhiyun if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss)
1661*4882a593Smuzhiyun continue;
1662*4882a593Smuzhiyun return NULL;
1663*4882a593Smuzhiyun }
1664*4882a593Smuzhiyun
1665*4882a593Smuzhiyun if (subflow->backup) {
1666*4882a593Smuzhiyun if (!backup)
1667*4882a593Smuzhiyun backup = ssk;
1668*4882a593Smuzhiyun continue;
1669*4882a593Smuzhiyun }
1670*4882a593Smuzhiyun
1671*4882a593Smuzhiyun return ssk;
1672*4882a593Smuzhiyun }
1673*4882a593Smuzhiyun
1674*4882a593Smuzhiyun return backup;
1675*4882a593Smuzhiyun }
1676*4882a593Smuzhiyun
1677*4882a593Smuzhiyun /* subflow sockets can be either outgoing (connect) or incoming
1678*4882a593Smuzhiyun * (accept).
1679*4882a593Smuzhiyun *
1680*4882a593Smuzhiyun * Outgoing subflows use in-kernel sockets.
1681*4882a593Smuzhiyun * Incoming subflows do not have their own 'struct socket' allocated,
1682*4882a593Smuzhiyun * so we need to use tcp_close() after detaching them from the mptcp
1683*4882a593Smuzhiyun * parent socket.
1684*4882a593Smuzhiyun */
__mptcp_close_ssk(struct sock * sk,struct sock * ssk,struct mptcp_subflow_context * subflow,long timeout)1685*4882a593Smuzhiyun void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
1686*4882a593Smuzhiyun struct mptcp_subflow_context *subflow,
1687*4882a593Smuzhiyun long timeout)
1688*4882a593Smuzhiyun {
1689*4882a593Smuzhiyun struct socket *sock = READ_ONCE(ssk->sk_socket);
1690*4882a593Smuzhiyun
1691*4882a593Smuzhiyun list_del(&subflow->node);
1692*4882a593Smuzhiyun
1693*4882a593Smuzhiyun if (sock && sock != sk->sk_socket) {
1694*4882a593Smuzhiyun /* outgoing subflow */
1695*4882a593Smuzhiyun sock_release(sock);
1696*4882a593Smuzhiyun } else {
1697*4882a593Smuzhiyun /* incoming subflow */
1698*4882a593Smuzhiyun tcp_close(ssk, timeout);
1699*4882a593Smuzhiyun }
1700*4882a593Smuzhiyun }
1701*4882a593Smuzhiyun
/* icsk_sync_mss callback stub: the msk keeps no MSS state of its own,
 * so there is nothing to update here; always reports 0.
 */
static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
{
	return 0;
}
1706*4882a593Smuzhiyun
/* Run the deferred path-manager work flagged in msk->pm.status.
 *
 * Each pending event bit is cleared before invoking its netlink PM
 * handler; the whole sequence runs under the pm spinlock, which the
 * handlers may temporarily drop (NOTE(review): inferred from the
 * lock being taken here around calls into mptcp_pm_nl_* — confirm
 * against their definitions).
 */
static void pm_work(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	spin_lock_bh(&msk->pm.lock);

	pr_debug("msk=%p status=%x", msk, pm->status);
	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
		mptcp_pm_nl_add_addr_received(msk);
	}
	if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
		mptcp_pm_nl_rm_addr_received(msk);
	}
	if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
		mptcp_pm_nl_fully_established(msk);
	}
	if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
		mptcp_pm_nl_subflow_established(msk);
	}

	spin_unlock_bh(&msk->pm.lock);
}
1733*4882a593Smuzhiyun
__mptcp_close_subflow(struct mptcp_sock * msk)1734*4882a593Smuzhiyun static void __mptcp_close_subflow(struct mptcp_sock *msk)
1735*4882a593Smuzhiyun {
1736*4882a593Smuzhiyun struct mptcp_subflow_context *subflow, *tmp;
1737*4882a593Smuzhiyun
1738*4882a593Smuzhiyun list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
1739*4882a593Smuzhiyun struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1740*4882a593Smuzhiyun
1741*4882a593Smuzhiyun if (inet_sk_state_load(ssk) != TCP_CLOSE)
1742*4882a593Smuzhiyun continue;
1743*4882a593Smuzhiyun
1744*4882a593Smuzhiyun __mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0);
1745*4882a593Smuzhiyun }
1746*4882a593Smuzhiyun }
1747*4882a593Smuzhiyun
/* The msk workqueue handler: performs all deferred MPTCP socket work.
 *
 * Under the msk lock it cleans acked data, flushes joined subflows,
 * closes dead subflows, moves pending rx data, runs path-manager work,
 * checks for EOF/DATA_FIN and, when MPTCP_WORK_RTX is set, retransmits
 * the head-of-line data fragment on a suitable subflow. The final
 * sock_put() releases the reference taken when the work was scheduled.
 */
static void mptcp_worker(struct work_struct *work)
{
	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
	struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
	int orig_len, orig_offset, mss_now = 0, size_goal = 0;
	struct mptcp_data_frag *dfrag;
	u64 orig_write_seq;
	size_t copied = 0;
	struct msghdr msg = {
		.msg_flags = MSG_DONTWAIT,
	};
	long timeo = 0;

	lock_sock(sk);
	mptcp_clean_una(sk);
	mptcp_check_data_fin_ack(sk);
	__mptcp_flush_join_list(msk);
	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
		__mptcp_close_subflow(msk);

	__mptcp_move_skbs(msk);

	if (msk->pm.status)
		pm_work(msk);

	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
		mptcp_check_for_eof(msk);

	mptcp_check_data_fin(sk);

	if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
		goto unlock;

	dfrag = mptcp_rtx_head(sk);
	if (!dfrag)
		goto unlock;

	if (!mptcp_ext_cache_refill(msk))
		goto reset_unlock;

	ssk = mptcp_subflow_get_retrans(msk);
	if (!ssk)
		goto reset_unlock;

	lock_sock(ssk);

	/* save the dfrag bounds: the retransmit loop below consumes them
	 * and they are restored afterwards, as the data is still unacked
	 */
	orig_len = dfrag->data_len;
	orig_offset = dfrag->offset;
	orig_write_seq = dfrag->data_seq;
	while (dfrag->data_len > 0) {
		int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo,
					     &mss_now, &size_goal);
		if (ret < 0)
			break;

		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
		copied += ret;
		dfrag->data_len -= ret;
		dfrag->offset += ret;

		if (!mptcp_ext_cache_refill(msk))
			break;
	}
	if (copied)
		tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
			 size_goal);

	dfrag->data_seq = orig_write_seq;
	dfrag->offset = orig_offset;
	dfrag->data_len = orig_len;

	mptcp_set_timeout(sk, ssk);
	release_sock(ssk);

reset_unlock:
	/* re-arm the MPTCP retransmit timer for another attempt */
	if (!mptcp_timer_pending(sk))
		mptcp_reset_timer(sk);

unlock:
	release_sock(sk);
	sock_put(sk);
}
1830*4882a593Smuzhiyun
/* Common msk initialization: lists, flags, worker, PM data and the
 * MPTCP-level retransmit timer. Always succeeds (returns 0).
 */
static int __mptcp_init_sock(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	spin_lock_init(&msk->join_list_lock);

	INIT_LIST_HEAD(&msk->conn_list);
	INIT_LIST_HEAD(&msk->join_list);
	INIT_LIST_HEAD(&msk->rtx_queue);
	/* a fresh socket has send space available */
	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
	INIT_WORK(&msk->work, mptcp_worker);
	msk->out_of_order_queue = RB_ROOT;

	msk->first = NULL;
	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;

	mptcp_pm_data_init(msk);

	/* re-use the csk retrans timer for MPTCP-level retrans */
	timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);

	return 0;
}
1854*4882a593Smuzhiyun
/* Protocol .init hook for MPTCP sockets.
 *
 * Performs the common msk init, enforces the per-netns enable sysctl,
 * allocates the MPTCP MIB counters on first use and creates the first
 * (in-kernel) subflow socket. Buffer sizes are seeded from the TCP
 * sysctls. Returns 0 or a negative errno.
 */
static int mptcp_init_sock(struct sock *sk)
{
	struct net *net = sock_net(sk);
	int ret;

	ret = __mptcp_init_sock(sk);
	if (ret)
		return ret;

	if (!mptcp_is_enabled(net))
		return -ENOPROTOOPT;

	if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
		return -ENOMEM;

	ret = __mptcp_socket_create(mptcp_sk(sk));
	if (ret)
		return ret;

	sk_sockets_allocated_inc(sk);
	sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
	sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);

	return 0;
}
1880*4882a593Smuzhiyun
__mptcp_clear_xmit(struct sock * sk)1881*4882a593Smuzhiyun static void __mptcp_clear_xmit(struct sock *sk)
1882*4882a593Smuzhiyun {
1883*4882a593Smuzhiyun struct mptcp_sock *msk = mptcp_sk(sk);
1884*4882a593Smuzhiyun struct mptcp_data_frag *dtmp, *dfrag;
1885*4882a593Smuzhiyun
1886*4882a593Smuzhiyun sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
1887*4882a593Smuzhiyun
1888*4882a593Smuzhiyun list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
1889*4882a593Smuzhiyun dfrag_clear(sk, dfrag);
1890*4882a593Smuzhiyun }
1891*4882a593Smuzhiyun
mptcp_cancel_work(struct sock * sk)1892*4882a593Smuzhiyun static void mptcp_cancel_work(struct sock *sk)
1893*4882a593Smuzhiyun {
1894*4882a593Smuzhiyun struct mptcp_sock *msk = mptcp_sk(sk);
1895*4882a593Smuzhiyun
1896*4882a593Smuzhiyun if (cancel_work_sync(&msk->work))
1897*4882a593Smuzhiyun sock_put(sk);
1898*4882a593Smuzhiyun }
1899*4882a593Smuzhiyun
/* Shut down a single subflow according to its current TCP state.
 *
 * Listening/half-open subflows are simply disconnected; established ones
 * either propagate the shutdown directly (fallback) or trigger a
 * DATA_FIN-carrying ack at the MPTCP level. @sk is the msk, @ssk the
 * subflow, @how the SHUT_* mask.
 */
void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
{
	lock_sock(ssk);

	switch (ssk->sk_state) {
	case TCP_LISTEN:
		/* a listener only needs teardown for RCV_SHUTDOWN */
		if (!(how & RCV_SHUTDOWN))
			break;
		fallthrough;
	case TCP_SYN_SENT:
		tcp_disconnect(ssk, O_NONBLOCK);
		break;
	default:
		if (__mptcp_check_fallback(mptcp_sk(sk))) {
			pr_debug("Fallback");
			ssk->sk_shutdown |= how;
			tcp_shutdown(ssk, how);
		} else {
			pr_debug("Sending DATA_FIN on subflow %p", ssk);
			mptcp_set_timeout(sk, ssk);
			/* the ack carries the DATA_FIN option */
			tcp_send_ack(ssk);
		}
		break;
	}

	release_sock(ssk);
}
1927*4882a593Smuzhiyun
1928*4882a593Smuzhiyun static const unsigned char new_state[16] = {
1929*4882a593Smuzhiyun /* current state: new state: action: */
1930*4882a593Smuzhiyun [0 /* (Invalid) */] = TCP_CLOSE,
1931*4882a593Smuzhiyun [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1932*4882a593Smuzhiyun [TCP_SYN_SENT] = TCP_CLOSE,
1933*4882a593Smuzhiyun [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1934*4882a593Smuzhiyun [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
1935*4882a593Smuzhiyun [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
1936*4882a593Smuzhiyun [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */
1937*4882a593Smuzhiyun [TCP_CLOSE] = TCP_CLOSE,
1938*4882a593Smuzhiyun [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
1939*4882a593Smuzhiyun [TCP_LAST_ACK] = TCP_LAST_ACK,
1940*4882a593Smuzhiyun [TCP_LISTEN] = TCP_CLOSE,
1941*4882a593Smuzhiyun [TCP_CLOSING] = TCP_CLOSING,
1942*4882a593Smuzhiyun [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
1943*4882a593Smuzhiyun };
1944*4882a593Smuzhiyun
mptcp_close_state(struct sock * sk)1945*4882a593Smuzhiyun static int mptcp_close_state(struct sock *sk)
1946*4882a593Smuzhiyun {
1947*4882a593Smuzhiyun int next = (int)new_state[sk->sk_state];
1948*4882a593Smuzhiyun int ns = next & TCP_STATE_MASK;
1949*4882a593Smuzhiyun
1950*4882a593Smuzhiyun inet_sk_state_store(sk, ns);
1951*4882a593Smuzhiyun
1952*4882a593Smuzhiyun return next & TCP_ACTION_FIN;
1953*4882a593Smuzhiyun }
1954*4882a593Smuzhiyun
/* Close the MPTCP socket: send DATA_FIN (unless listening/closed or in
 * fallback), wait for the close to complete up to @timeout, then tear
 * down every subflow and release the msk itself.
 */
static void mptcp_close(struct sock *sk, long timeout)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct mptcp_sock *msk = mptcp_sk(sk);
	LIST_HEAD(conn_list);

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		inet_sk_state_store(sk, TCP_CLOSE);
		goto cleanup;
	} else if (sk->sk_state == TCP_CLOSE) {
		goto cleanup;
	}

	if (__mptcp_check_fallback(msk)) {
		/* fallback: no MPTCP-level FIN exchange is needed */
		goto update_state;
	} else if (mptcp_close_state(sk)) {
		pr_debug("Sending DATA_FIN sk=%p", sk);
		WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
		WRITE_ONCE(msk->snd_data_fin_enable, 1);

		mptcp_for_each_subflow(msk, subflow) {
			struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

			mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK);
		}
	}

	sk_stream_wait_close(sk, timeout);

update_state:
	inet_sk_state_store(sk, TCP_CLOSE);

cleanup:
	/* be sure to always acquire the join list lock, to sync vs
	 * mptcp_finish_join().
	 */
	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
	/* steal the whole subflow list, so it can be walked without the
	 * msk lock after release_sock() below
	 */
	list_splice_init(&msk->conn_list, &conn_list);

	__mptcp_clear_xmit(sk);

	release_sock(sk);

	list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		__mptcp_close_ssk(sk, ssk, subflow, timeout);
	}

	mptcp_cancel_work(sk);

	__skb_queue_purge(&sk->sk_receive_queue);

	sk_common_release(sk);
}
2014*4882a593Smuzhiyun
/* Copy the subflow's local/remote addresses and ports onto the msk,
 * including the IPv6 fields when CONFIG_MPTCP_IPV6 is enabled.
 */
static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
	struct ipv6_pinfo *msk6 = inet6_sk(msk);

	msk->sk_v6_daddr = ssk->sk_v6_daddr;
	msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;

	/* inet6_sk() can be NULL for a v4-only socket */
	if (msk6 && ssk6) {
		msk6->saddr = ssk6->saddr;
		msk6->flow_label = ssk6->flow_label;
	}
#endif

	inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
	inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
	inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
	inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
	inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
	inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}
2037*4882a593Smuzhiyun
/* The MPTCP master socket is never disconnected through this hook:
 * inet_stream_connect() invokes ->disconnect on the subflow socket,
 * not on the mptcp one, so reaching this handler indicates a bug.
 */
static int mptcp_disconnect(struct sock *sk, int flags)
{
	WARN_ON_ONCE(1);
	return 0;
}
2047*4882a593Smuzhiyun
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* Return a pointer to the ipv6_pinfo area of a struct mptcp6_sock.
 * The offset is deliberately computed from the struct sizes (i.e. the
 * pinfo area sits exactly sizeof(struct ipv6_pinfo) bytes before the
 * end of the object); inet_sk(sk)->pinet6 is later pointed here, see
 * mptcp_sk_clone().
 */
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
	unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);

	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
#endif
2056*4882a593Smuzhiyun
/* Create the MPTCP socket for an incoming connection: clone the
 * listener socket @sk and initialize the MPTCP-level state from the
 * request socket @req and the parsed options @mp_opt.
 * Returns the new socket, unlocked and holding a single reference,
 * or NULL if the clone allocation failed.
 */
struct sock *mptcp_sk_clone(const struct sock *sk,
			    const struct mptcp_options_received *mp_opt,
			    struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
	struct mptcp_sock *msk;
	u64 ack_seq;

	if (!nsk)
		return NULL;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	/* point pinet6 at the ipv6_pinfo area embedded in the new socket */
	if (nsk->sk_family == AF_INET6)
		inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif

	__mptcp_init_sock(nsk);

	msk = mptcp_sk(nsk);
	msk->local_key = subflow_req->local_key;
	msk->token = subflow_req->token;
	msk->subflow = NULL;
	WRITE_ONCE(msk->fully_established, false);

	/* data sent so far is acked: snd_una starts at write_seq */
	msk->write_seq = subflow_req->idsn + 1;
	atomic64_set(&msk->snd_una, msk->write_seq);
	if (mp_opt->mp_capable) {
		msk->can_ack = true;
		msk->remote_key = mp_opt->sndr_key;
		/* initial ack_seq is derived from the hash of the peer key */
		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
		ack_seq++;
		WRITE_ONCE(msk->ack_seq, ack_seq);
	}

	sock_reset_flag(nsk, SOCK_RCU_FREE);
	/* will be fully established after successful MPC subflow creation */
	inet_sk_state_store(nsk, TCP_SYN_RECV);

	security_inet_csk_clone(nsk, req);
	bh_unlock_sock(nsk);

	/* keep a single reference */
	__sock_put(nsk);
	return nsk;
}
2103*4882a593Smuzhiyun
mptcp_rcv_space_init(struct mptcp_sock * msk,const struct sock * ssk)2104*4882a593Smuzhiyun void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
2105*4882a593Smuzhiyun {
2106*4882a593Smuzhiyun const struct tcp_sock *tp = tcp_sk(ssk);
2107*4882a593Smuzhiyun
2108*4882a593Smuzhiyun msk->rcvq_space.copied = 0;
2109*4882a593Smuzhiyun msk->rcvq_space.rtt_us = 0;
2110*4882a593Smuzhiyun
2111*4882a593Smuzhiyun msk->rcvq_space.time = tp->tcp_mstamp;
2112*4882a593Smuzhiyun
2113*4882a593Smuzhiyun /* initial rcv_space offering made to peer */
2114*4882a593Smuzhiyun msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
2115*4882a593Smuzhiyun TCP_INIT_CWND * tp->advmss);
2116*4882a593Smuzhiyun if (msk->rcvq_space.space == 0)
2117*4882a593Smuzhiyun msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
2118*4882a593Smuzhiyun }
2119*4882a593Smuzhiyun
/* accept() for the MPTCP master socket: accept a connection on the
 * first/listener subflow and, when the peer completed the MP_CAPABLE
 * handshake, return the associated MPTCP socket instead of the plain
 * TCP one.
 */
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
				 bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *listener;
	struct sock *newsk;

	listener = __mptcp_nmpc_socket(msk);
	if (WARN_ON_ONCE(!listener)) {
		*err = -EINVAL;
		return NULL;
	}

	pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
	newsk = inet_csk_accept(listener->sk, flags, err, kern);
	if (!newsk)
		return NULL;

	pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
	if (sk_is_mptcp(newsk)) {
		struct mptcp_subflow_context *subflow;
		struct sock *new_mptcp_sock;
		struct sock *ssk = newsk;

		subflow = mptcp_subflow_ctx(newsk);
		new_mptcp_sock = subflow->conn;

		/* is_mptcp should be false if subflow->conn is missing, see
		 * subflow_syn_recv_sock()
		 */
		if (WARN_ON_ONCE(!new_mptcp_sock)) {
			tcp_sk(newsk)->is_mptcp = 0;
			goto out;
		}

		/* acquire the 2nd reference for the owning socket */
		sock_hold(new_mptcp_sock);

		local_bh_disable();
		bh_lock_sock(new_mptcp_sock);
		msk = mptcp_sk(new_mptcp_sock);
		msk->first = newsk;

		/* return the msk to the caller; the accepted TCP socket
		 * becomes the first subflow on conn_list
		 */
		newsk = new_mptcp_sock;
		mptcp_copy_inaddrs(newsk, ssk);
		list_add(&subflow->node, &msk->conn_list);

		mptcp_rcv_space_init(msk, ssk);
		bh_unlock_sock(new_mptcp_sock);

		__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
		local_bh_enable();
	} else {
		/* handshake fell back to plain TCP: account it and hand
		 * the TCP socket back unchanged
		 */
		MPTCP_INC_STATS(sock_net(sk),
				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
	}

out:
	newsk->sk_kern_sock = kern;
	return newsk;
}
2181*4882a593Smuzhiyun
/* Common MPTCP-level teardown for @msk: drop any skb still queued
 * waiting for in-order data, release the connection token and free the
 * path-manager announce list.
 */
void mptcp_destroy_common(struct mptcp_sock *msk)
{
	skb_rbtree_purge(&msk->out_of_order_queue);
	mptcp_token_destroy(msk);
	mptcp_pm_free_anno_list(msk);
}
2188*4882a593Smuzhiyun
/* ->destroy() callback of the MPTCP master socket: release the cached
 * skb extension (if any), tear down the common MPTCP state and update
 * the sockets-allocated accounting.
 */
static void mptcp_destroy(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (msk->cached_ext)
		__skb_ext_put(msk->cached_ext);

	mptcp_destroy_common(msk);
	sk_sockets_allocated_dec(sk);
}
2199*4882a593Smuzhiyun
/* SOL_SOCKET option handling for the MPTCP socket: SO_REUSEPORT and
 * SO_REUSEADDR are applied to the initial subflow and, on success,
 * mirrored into the master socket; all other options are applied to
 * the master socket directly.
 */
static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
				       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int ret;

	if (optname != SO_REUSEPORT && optname != SO_REUSEADDR)
		return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
				       optval, optlen);

	lock_sock(sk);
	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		release_sock(sk);
		return -EINVAL;
	}

	ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
	if (!ret) {
		/* keep the master socket's view in sync with the subflow */
		if (optname == SO_REUSEPORT)
			sk->sk_reuseport = ssock->sk->sk_reuseport;
		else
			sk->sk_reuse = ssock->sk->sk_reuse;
	}
	release_sock(sk);
	return ret;
}
2230*4882a593Smuzhiyun
/* SOL_IPV6 option handling for the MPTCP socket: only IPV6_V6ONLY is
 * supported; it is applied to the initial subflow and, on success,
 * mirrored into the master socket.
 */
static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
			       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int ret;

	if (optname != IPV6_V6ONLY)
		return -EOPNOTSUPP;

	lock_sock(sk);
	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		release_sock(sk);
		return -EINVAL;
	}

	ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
	if (!ret)
		sk->sk_ipv6only = ssock->sk->sk_ipv6only;

	release_sock(sk);
	return ret;
}
2257*4882a593Smuzhiyun
mptcp_unsupported(int level,int optname)2258*4882a593Smuzhiyun static bool mptcp_unsupported(int level, int optname)
2259*4882a593Smuzhiyun {
2260*4882a593Smuzhiyun if (level == SOL_IP) {
2261*4882a593Smuzhiyun switch (optname) {
2262*4882a593Smuzhiyun case IP_ADD_MEMBERSHIP:
2263*4882a593Smuzhiyun case IP_ADD_SOURCE_MEMBERSHIP:
2264*4882a593Smuzhiyun case IP_DROP_MEMBERSHIP:
2265*4882a593Smuzhiyun case IP_DROP_SOURCE_MEMBERSHIP:
2266*4882a593Smuzhiyun case IP_BLOCK_SOURCE:
2267*4882a593Smuzhiyun case IP_UNBLOCK_SOURCE:
2268*4882a593Smuzhiyun case MCAST_JOIN_GROUP:
2269*4882a593Smuzhiyun case MCAST_LEAVE_GROUP:
2270*4882a593Smuzhiyun case MCAST_JOIN_SOURCE_GROUP:
2271*4882a593Smuzhiyun case MCAST_LEAVE_SOURCE_GROUP:
2272*4882a593Smuzhiyun case MCAST_BLOCK_SOURCE:
2273*4882a593Smuzhiyun case MCAST_UNBLOCK_SOURCE:
2274*4882a593Smuzhiyun case MCAST_MSFILTER:
2275*4882a593Smuzhiyun return true;
2276*4882a593Smuzhiyun }
2277*4882a593Smuzhiyun return false;
2278*4882a593Smuzhiyun }
2279*4882a593Smuzhiyun if (level == SOL_IPV6) {
2280*4882a593Smuzhiyun switch (optname) {
2281*4882a593Smuzhiyun case IPV6_ADDRFORM:
2282*4882a593Smuzhiyun case IPV6_ADD_MEMBERSHIP:
2283*4882a593Smuzhiyun case IPV6_DROP_MEMBERSHIP:
2284*4882a593Smuzhiyun case IPV6_JOIN_ANYCAST:
2285*4882a593Smuzhiyun case IPV6_LEAVE_ANYCAST:
2286*4882a593Smuzhiyun case MCAST_JOIN_GROUP:
2287*4882a593Smuzhiyun case MCAST_LEAVE_GROUP:
2288*4882a593Smuzhiyun case MCAST_JOIN_SOURCE_GROUP:
2289*4882a593Smuzhiyun case MCAST_LEAVE_SOURCE_GROUP:
2290*4882a593Smuzhiyun case MCAST_BLOCK_SOURCE:
2291*4882a593Smuzhiyun case MCAST_UNBLOCK_SOURCE:
2292*4882a593Smuzhiyun case MCAST_MSFILTER:
2293*4882a593Smuzhiyun return true;
2294*4882a593Smuzhiyun }
2295*4882a593Smuzhiyun return false;
2296*4882a593Smuzhiyun }
2297*4882a593Smuzhiyun return false;
2298*4882a593Smuzhiyun }
2299*4882a593Smuzhiyun
/* setsockopt() entry point of the MPTCP socket: reject unsupported
 * options, dispatch SOL_SOCKET and SOL_IPV6 to their helpers and, once
 * the connection has fallen back to plain TCP, pass everything through
 * to the remaining subflow.
 */
static int mptcp_setsockopt(struct sock *sk, int level, int optname,
			    sockptr_t optval, unsigned int optlen)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	if (mptcp_unsupported(level, optname))
		return -ENOPROTOOPT;

	if (level == SOL_SOCKET)
		return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);

	/* @@ the meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when TCP socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_setsockopt(ssk, level, optname, optval, optlen);

	if (level == SOL_IPV6)
		return mptcp_setsockopt_v6(msk, optname, optval, optlen);

	return -EOPNOTSUPP;
}
2331*4882a593Smuzhiyun
/* getsockopt() entry point of the MPTCP socket: only supported after a
 * fallback to plain TCP, when the option is read from the one
 * remaining subflow; otherwise -EOPNOTSUPP.
 */
static int mptcp_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *option)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	/* @@ the meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_getsockopt(ssk, level, optname, optval, option);

	return -EOPNOTSUPP;
}
2354*4882a593Smuzhiyun
/* set of deferred events handled at socket-lock release time */
#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
			    TCPF_WRITE_TIMER_DEFERRED)

/* this is very alike tcp_release_cb() but we must handle differently a
 * different set of events
 */
static void mptcp_release_cb(struct sock *sk)
{
	unsigned long flags, nflags;

	/* atomically fetch-and-clear the deferred event flags; bail out
	 * early when nothing was deferred
	 */
	do {
		flags = sk->sk_tsq_flags;
		if (!(flags & MPTCP_DEFERRED_ALL))
			return;
		nflags = flags & ~MPTCP_DEFERRED_ALL;
	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

	sock_release_ownership(sk);

	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
		struct mptcp_sock *msk = mptcp_sk(sk);
		struct sock *ssk;

		/* if a subflow has data to receive, let the worker handle
		 * it; on lookup failure or if the work is already pending,
		 * just drop the reference held for the deferred event
		 */
		ssk = mptcp_subflow_recv_lookup(msk);
		if (!ssk || !schedule_work(&msk->work))
			__sock_put(sk);
	}

	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
		mptcp_retransmit_handler(sk);
		__sock_put(sk);
	}
}
2388*4882a593Smuzhiyun
/* The MPTCP master socket is never hashed — only the TCP subflows
 * enter the inet hash tables — so this hook must never be reached.
 */
static int mptcp_hash(struct sock *sk)
{
	WARN_ON_ONCE(1);
	return 0;
}
2397*4882a593Smuzhiyun
/* ->unhash() of the MPTCP master socket.
 * called from sk_common_release(), but nothing to do here: the master
 * socket was never hashed (see mptcp_hash()).
 */
static void mptcp_unhash(struct sock *sk)
{
}
2402*4882a593Smuzhiyun
mptcp_get_port(struct sock * sk,unsigned short snum)2403*4882a593Smuzhiyun static int mptcp_get_port(struct sock *sk, unsigned short snum)
2404*4882a593Smuzhiyun {
2405*4882a593Smuzhiyun struct mptcp_sock *msk = mptcp_sk(sk);
2406*4882a593Smuzhiyun struct socket *ssock;
2407*4882a593Smuzhiyun
2408*4882a593Smuzhiyun ssock = __mptcp_nmpc_socket(msk);
2409*4882a593Smuzhiyun pr_debug("msk=%p, subflow=%p", msk, ssock);
2410*4882a593Smuzhiyun if (WARN_ON_ONCE(!ssock))
2411*4882a593Smuzhiyun return -EINVAL;
2412*4882a593Smuzhiyun
2413*4882a593Smuzhiyun return inet_csk_get_port(ssock->sk, snum);
2414*4882a593Smuzhiyun }
2415*4882a593Smuzhiyun
/* Called when the MP_CAPABLE handshake completes on the initial
 * subflow @ssk: derive the initial data sequence numbers from the
 * exchanged keys and publish them on the owning MPTCP socket.
 */
void mptcp_finish_connect(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk;
	struct sock *sk;
	u64 ack_seq;

	subflow = mptcp_subflow_ctx(ssk);
	sk = subflow->conn;
	msk = mptcp_sk(sk);

	pr_debug("msk=%p, token=%u", sk, subflow->token);

	/* initial ack_seq is derived from the hash of the peer key */
	mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
	ack_seq++;
	subflow->map_seq = ack_seq;
	subflow->map_subflow_seq = 1;

	/* the socket is not connected yet, no msk/subflow ops can access/race
	 * accessing the field below
	 */
	WRITE_ONCE(msk->remote_key, subflow->remote_key);
	WRITE_ONCE(msk->local_key, subflow->local_key);
	WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
	WRITE_ONCE(msk->ack_seq, ack_seq);
	WRITE_ONCE(msk->can_ack, 1);
	/* nothing sent yet: everything up to write_seq is acked */
	atomic64_set(&msk->snd_una, msk->write_seq);

	mptcp_pm_new_connection(msk, 0);

	mptcp_rcv_space_init(msk, ssk);
}
2448*4882a593Smuzhiyun
/* Attach socket @sk to the struct socket @parent: redirect its wait
 * queue, owning socket and uid, all under sk_callback_lock.
 */
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
	write_lock_bh(&sk->sk_callback_lock);
	rcu_assign_pointer(sk->sk_wq, &parent->wq);
	sk_set_socket(sk, parent);
	sk->sk_uid = SOCK_INODE(parent)->i_uid;
	write_unlock_bh(&sk->sk_callback_lock);
}
2457*4882a593Smuzhiyun
/* Called when subflow @sk completes the MP_JOIN handshake: link it to
 * the owning MPTCP socket, if the latter can still accept new
 * subflows. Returns true when the join was accepted.
 */
bool mptcp_finish_join(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct sock *parent = (void *)msk;
	struct socket *parent_sock;
	bool ret;

	pr_debug("msk=%p, subflow=%p", msk, subflow);

	/* mptcp socket already closing? */
	if (!mptcp_is_fully_established(parent))
		return false;

	/* on the client side active subflows are already on conn_list,
	 * nothing more to do
	 */
	if (!msk->pm.server_side)
		return true;

	/* let the path manager veto the new subflow */
	if (!mptcp_pm_allow_new_subflow(msk))
		return false;

	/* active connections are already on conn_list, and we can't acquire
	 * msk lock here.
	 * use the join list lock as synchronization point and double-check
	 * msk status to avoid racing with mptcp_close()
	 */
	spin_lock_bh(&msk->join_list_lock);
	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
	if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node)))
		list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);
	if (!ret)
		return false;

	/* attach to msk socket only after we are sure he will deal with us
	 * at close time
	 */
	parent_sock = READ_ONCE(parent->sk_socket);
	if (parent_sock && !sk->sk_socket)
		mptcp_sock_graft(sk, parent_sock);
	subflow->map_seq = READ_ONCE(msk->ack_seq);
	return true;
}
2500*4882a593Smuzhiyun
mptcp_memory_free(const struct sock * sk,int wake)2501*4882a593Smuzhiyun static bool mptcp_memory_free(const struct sock *sk, int wake)
2502*4882a593Smuzhiyun {
2503*4882a593Smuzhiyun struct mptcp_sock *msk = mptcp_sk(sk);
2504*4882a593Smuzhiyun
2505*4882a593Smuzhiyun return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
2506*4882a593Smuzhiyun }
2507*4882a593Smuzhiyun
/* struct proto of the MPTCP master socket.
 * Memory accounting (allocated/pressure/limits) is shared with plain
 * TCP; hash/unhash/disconnect are stubs since only subflows are hashed.
 */
static struct proto mptcp_prot = {
	.name		= "MPTCP",
	.owner		= THIS_MODULE,
	.init		= mptcp_init_sock,
	.disconnect	= mptcp_disconnect,
	.close		= mptcp_close,
	.accept		= mptcp_accept,
	.setsockopt	= mptcp_setsockopt,
	.getsockopt	= mptcp_getsockopt,
	.shutdown	= tcp_shutdown,
	.destroy	= mptcp_destroy,
	.sendmsg	= mptcp_sendmsg,
	.recvmsg	= mptcp_recvmsg,
	.release_cb	= mptcp_release_cb,
	.hash		= mptcp_hash,
	.unhash		= mptcp_unhash,
	.get_port	= mptcp_get_port,
	.sockets_allocated	= &mptcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.stream_memory_free	= mptcp_memory_free,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.sysctl_mem	= sysctl_tcp_mem,
	.obj_size	= sizeof(struct mptcp_sock),
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
	.no_autobind	= true,
};
2536*4882a593Smuzhiyun
mptcp_bind(struct socket * sock,struct sockaddr * uaddr,int addr_len)2537*4882a593Smuzhiyun static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2538*4882a593Smuzhiyun {
2539*4882a593Smuzhiyun struct mptcp_sock *msk = mptcp_sk(sock->sk);
2540*4882a593Smuzhiyun struct socket *ssock;
2541*4882a593Smuzhiyun int err;
2542*4882a593Smuzhiyun
2543*4882a593Smuzhiyun lock_sock(sock->sk);
2544*4882a593Smuzhiyun ssock = __mptcp_nmpc_socket(msk);
2545*4882a593Smuzhiyun if (!ssock) {
2546*4882a593Smuzhiyun err = -EINVAL;
2547*4882a593Smuzhiyun goto unlock;
2548*4882a593Smuzhiyun }
2549*4882a593Smuzhiyun
2550*4882a593Smuzhiyun err = ssock->ops->bind(ssock, uaddr, addr_len);
2551*4882a593Smuzhiyun if (!err)
2552*4882a593Smuzhiyun mptcp_copy_inaddrs(sock->sk, ssock->sk);
2553*4882a593Smuzhiyun
2554*4882a593Smuzhiyun unlock:
2555*4882a593Smuzhiyun release_sock(sock->sk);
2556*4882a593Smuzhiyun return err;
2557*4882a593Smuzhiyun }
2558*4882a593Smuzhiyun
/* Give up on MPTCP for this connection before connect() is issued:
 * clear the MP_CAPABLE request on the initial subflow and mark the msk
 * as fallen back to plain TCP.
 */
static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
					 struct mptcp_subflow_context *subflow)
{
	subflow->request_mptcp = 0;
	__mptcp_do_fallback(msk);
}
2565*4882a593Smuzhiyun
/* connect() for the MPTCP socket: perform the connect on the initial
 * subflow, falling back to plain TCP early when MPTCP cannot be used
 * (MD5SIG enabled, or token allocation failure).
 */
static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
				int addr_len, int flags)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct mptcp_subflow_context *subflow;
	struct socket *ssock;
	int err;

	lock_sock(sock->sk);
	if (sock->state != SS_UNCONNECTED && msk->subflow) {
		/* pending connection or invalid state, let existing subflow
		 * cope with that
		 */
		ssock = msk->subflow;
		goto do_connect;
	}

	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	mptcp_token_destroy(msk);
	inet_sk_state_store(sock->sk, TCP_SYN_SENT);
	subflow = mptcp_subflow_ctx(ssock->sk);
#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
		mptcp_subflow_early_fallback(msk, subflow);
#endif
	/* without a token the MP_CAPABLE handshake cannot proceed */
	if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
		mptcp_subflow_early_fallback(msk, subflow);

do_connect:
	err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
	sock->state = ssock->state;

	/* on successful connect, the msk state will be moved to established by
	 * subflow_finish_connect()
	 */
	if (!err || err == -EINPROGRESS)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);
	else
		inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));

unlock:
	release_sock(sock->sk);
	return err;
}
2618*4882a593Smuzhiyun
mptcp_listen(struct socket * sock,int backlog)2619*4882a593Smuzhiyun static int mptcp_listen(struct socket *sock, int backlog)
2620*4882a593Smuzhiyun {
2621*4882a593Smuzhiyun struct mptcp_sock *msk = mptcp_sk(sock->sk);
2622*4882a593Smuzhiyun struct socket *ssock;
2623*4882a593Smuzhiyun int err;
2624*4882a593Smuzhiyun
2625*4882a593Smuzhiyun pr_debug("msk=%p", msk);
2626*4882a593Smuzhiyun
2627*4882a593Smuzhiyun lock_sock(sock->sk);
2628*4882a593Smuzhiyun ssock = __mptcp_nmpc_socket(msk);
2629*4882a593Smuzhiyun if (!ssock) {
2630*4882a593Smuzhiyun err = -EINVAL;
2631*4882a593Smuzhiyun goto unlock;
2632*4882a593Smuzhiyun }
2633*4882a593Smuzhiyun
2634*4882a593Smuzhiyun mptcp_token_destroy(msk);
2635*4882a593Smuzhiyun inet_sk_state_store(sock->sk, TCP_LISTEN);
2636*4882a593Smuzhiyun sock_set_flag(sock->sk, SOCK_RCU_FREE);
2637*4882a593Smuzhiyun
2638*4882a593Smuzhiyun err = ssock->ops->listen(ssock, backlog);
2639*4882a593Smuzhiyun inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
2640*4882a593Smuzhiyun if (!err)
2641*4882a593Smuzhiyun mptcp_copy_inaddrs(sock->sk, ssock->sk);
2642*4882a593Smuzhiyun
2643*4882a593Smuzhiyun unlock:
2644*4882a593Smuzhiyun release_sock(sock->sk);
2645*4882a593Smuzhiyun return err;
2646*4882a593Smuzhiyun }
2647*4882a593Smuzhiyun
/* accept() entry point of the MPTCP socket: delegate to the listener
 * subflow's ->accept() and graft the accepted connection's subflows
 * onto the new struct socket.
 */
static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
			       int flags, bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	if (sock->sk->sk_state != TCP_LISTEN)
		goto unlock_fail;

	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock)
		goto unlock_fail;

	/* keep the listener subflow alive while accepting without the
	 * msk socket lock held
	 */
	clear_bit(MPTCP_DATA_READY, &msk->flags);
	sock_hold(ssock->sk);
	release_sock(sock->sk);

	err = ssock->ops->accept(sock, newsock, flags, kern);
	if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
		struct mptcp_sock *msk = mptcp_sk(newsock->sk);
		struct mptcp_subflow_context *subflow;

		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
		 * This is needed so NOSPACE flag can be set from tcp stack.
		 */
		__mptcp_flush_join_list(msk);
		mptcp_for_each_subflow(msk, subflow) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

			if (!ssk->sk_socket)
				mptcp_sock_graft(ssk, newsock);
		}
	}

	/* re-arm DATA_READY if more connections are pending */
	if (inet_csk_listen_poll(ssock->sk))
		set_bit(MPTCP_DATA_READY, &msk->flags);
	sock_put(ssock->sk);
	return err;

unlock_fail:
	release_sock(sock->sk);
	return -EINVAL;
}
2695*4882a593Smuzhiyun
mptcp_check_readable(struct mptcp_sock * msk)2696*4882a593Smuzhiyun static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
2697*4882a593Smuzhiyun {
2698*4882a593Smuzhiyun return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM :
2699*4882a593Smuzhiyun 0;
2700*4882a593Smuzhiyun }
2701*4882a593Smuzhiyun
mptcp_poll(struct file * file,struct socket * sock,struct poll_table_struct * wait)2702*4882a593Smuzhiyun static __poll_t mptcp_poll(struct file *file, struct socket *sock,
2703*4882a593Smuzhiyun struct poll_table_struct *wait)
2704*4882a593Smuzhiyun {
2705*4882a593Smuzhiyun struct sock *sk = sock->sk;
2706*4882a593Smuzhiyun struct mptcp_sock *msk;
2707*4882a593Smuzhiyun __poll_t mask = 0;
2708*4882a593Smuzhiyun int state;
2709*4882a593Smuzhiyun
2710*4882a593Smuzhiyun msk = mptcp_sk(sk);
2711*4882a593Smuzhiyun sock_poll_wait(file, sock, wait);
2712*4882a593Smuzhiyun
2713*4882a593Smuzhiyun state = inet_sk_state_load(sk);
2714*4882a593Smuzhiyun pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
2715*4882a593Smuzhiyun if (state == TCP_LISTEN)
2716*4882a593Smuzhiyun return mptcp_check_readable(msk);
2717*4882a593Smuzhiyun
2718*4882a593Smuzhiyun if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
2719*4882a593Smuzhiyun mask |= mptcp_check_readable(msk);
2720*4882a593Smuzhiyun if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
2721*4882a593Smuzhiyun mask |= EPOLLOUT | EPOLLWRNORM;
2722*4882a593Smuzhiyun }
2723*4882a593Smuzhiyun if (sk->sk_shutdown & RCV_SHUTDOWN)
2724*4882a593Smuzhiyun mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
2725*4882a593Smuzhiyun
2726*4882a593Smuzhiyun return mask;
2727*4882a593Smuzhiyun }
2728*4882a593Smuzhiyun
mptcp_shutdown(struct socket * sock,int how)2729*4882a593Smuzhiyun static int mptcp_shutdown(struct socket *sock, int how)
2730*4882a593Smuzhiyun {
2731*4882a593Smuzhiyun struct mptcp_sock *msk = mptcp_sk(sock->sk);
2732*4882a593Smuzhiyun struct mptcp_subflow_context *subflow;
2733*4882a593Smuzhiyun int ret = 0;
2734*4882a593Smuzhiyun
2735*4882a593Smuzhiyun pr_debug("sk=%p, how=%d", msk, how);
2736*4882a593Smuzhiyun
2737*4882a593Smuzhiyun lock_sock(sock->sk);
2738*4882a593Smuzhiyun
2739*4882a593Smuzhiyun how++;
2740*4882a593Smuzhiyun if ((how & ~SHUTDOWN_MASK) || !how) {
2741*4882a593Smuzhiyun ret = -EINVAL;
2742*4882a593Smuzhiyun goto out_unlock;
2743*4882a593Smuzhiyun }
2744*4882a593Smuzhiyun
2745*4882a593Smuzhiyun if (sock->state == SS_CONNECTING) {
2746*4882a593Smuzhiyun if ((1 << sock->sk->sk_state) &
2747*4882a593Smuzhiyun (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
2748*4882a593Smuzhiyun sock->state = SS_DISCONNECTING;
2749*4882a593Smuzhiyun else
2750*4882a593Smuzhiyun sock->state = SS_CONNECTED;
2751*4882a593Smuzhiyun }
2752*4882a593Smuzhiyun
2753*4882a593Smuzhiyun /* If we've already sent a FIN, or it's a closed state, skip this. */
2754*4882a593Smuzhiyun if (__mptcp_check_fallback(msk)) {
2755*4882a593Smuzhiyun if (how == SHUT_WR || how == SHUT_RDWR)
2756*4882a593Smuzhiyun inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
2757*4882a593Smuzhiyun
2758*4882a593Smuzhiyun mptcp_for_each_subflow(msk, subflow) {
2759*4882a593Smuzhiyun struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
2760*4882a593Smuzhiyun
2761*4882a593Smuzhiyun mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
2762*4882a593Smuzhiyun }
2763*4882a593Smuzhiyun } else if ((how & SEND_SHUTDOWN) &&
2764*4882a593Smuzhiyun ((1 << sock->sk->sk_state) &
2765*4882a593Smuzhiyun (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2766*4882a593Smuzhiyun TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) &&
2767*4882a593Smuzhiyun mptcp_close_state(sock->sk)) {
2768*4882a593Smuzhiyun __mptcp_flush_join_list(msk);
2769*4882a593Smuzhiyun
2770*4882a593Smuzhiyun WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
2771*4882a593Smuzhiyun WRITE_ONCE(msk->snd_data_fin_enable, 1);
2772*4882a593Smuzhiyun
2773*4882a593Smuzhiyun mptcp_for_each_subflow(msk, subflow) {
2774*4882a593Smuzhiyun struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
2775*4882a593Smuzhiyun
2776*4882a593Smuzhiyun mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
2777*4882a593Smuzhiyun }
2778*4882a593Smuzhiyun }
2779*4882a593Smuzhiyun
2780*4882a593Smuzhiyun /* Wake up anyone sleeping in poll. */
2781*4882a593Smuzhiyun sock->sk->sk_state_change(sock->sk);
2782*4882a593Smuzhiyun
2783*4882a593Smuzhiyun out_unlock:
2784*4882a593Smuzhiyun release_sock(sock->sk);
2785*4882a593Smuzhiyun
2786*4882a593Smuzhiyun return ret;
2787*4882a593Smuzhiyun }
2788*4882a593Smuzhiyun
2789*4882a593Smuzhiyun static const struct proto_ops mptcp_stream_ops = {
2790*4882a593Smuzhiyun .family = PF_INET,
2791*4882a593Smuzhiyun .owner = THIS_MODULE,
2792*4882a593Smuzhiyun .release = inet_release,
2793*4882a593Smuzhiyun .bind = mptcp_bind,
2794*4882a593Smuzhiyun .connect = mptcp_stream_connect,
2795*4882a593Smuzhiyun .socketpair = sock_no_socketpair,
2796*4882a593Smuzhiyun .accept = mptcp_stream_accept,
2797*4882a593Smuzhiyun .getname = inet_getname,
2798*4882a593Smuzhiyun .poll = mptcp_poll,
2799*4882a593Smuzhiyun .ioctl = inet_ioctl,
2800*4882a593Smuzhiyun .gettstamp = sock_gettstamp,
2801*4882a593Smuzhiyun .listen = mptcp_listen,
2802*4882a593Smuzhiyun .shutdown = mptcp_shutdown,
2803*4882a593Smuzhiyun .setsockopt = sock_common_setsockopt,
2804*4882a593Smuzhiyun .getsockopt = sock_common_getsockopt,
2805*4882a593Smuzhiyun .sendmsg = inet_sendmsg,
2806*4882a593Smuzhiyun .recvmsg = inet_recvmsg,
2807*4882a593Smuzhiyun .mmap = sock_no_mmap,
2808*4882a593Smuzhiyun .sendpage = inet_sendpage,
2809*4882a593Smuzhiyun };
2810*4882a593Smuzhiyun
2811*4882a593Smuzhiyun static struct inet_protosw mptcp_protosw = {
2812*4882a593Smuzhiyun .type = SOCK_STREAM,
2813*4882a593Smuzhiyun .protocol = IPPROTO_MPTCP,
2814*4882a593Smuzhiyun .prot = &mptcp_prot,
2815*4882a593Smuzhiyun .ops = &mptcp_stream_ops,
2816*4882a593Smuzhiyun .flags = INET_PROTOSW_ICSK,
2817*4882a593Smuzhiyun };
2818*4882a593Smuzhiyun
mptcp_proto_init(void)2819*4882a593Smuzhiyun void __init mptcp_proto_init(void)
2820*4882a593Smuzhiyun {
2821*4882a593Smuzhiyun mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
2822*4882a593Smuzhiyun
2823*4882a593Smuzhiyun if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
2824*4882a593Smuzhiyun panic("Failed to allocate MPTCP pcpu counter\n");
2825*4882a593Smuzhiyun
2826*4882a593Smuzhiyun mptcp_subflow_init();
2827*4882a593Smuzhiyun mptcp_pm_init();
2828*4882a593Smuzhiyun mptcp_token_init();
2829*4882a593Smuzhiyun
2830*4882a593Smuzhiyun if (proto_register(&mptcp_prot, 1) != 0)
2831*4882a593Smuzhiyun panic("Failed to register MPTCP proto.\n");
2832*4882a593Smuzhiyun
2833*4882a593Smuzhiyun inet_register_protosw(&mptcp_protosw);
2834*4882a593Smuzhiyun
2835*4882a593Smuzhiyun BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
2836*4882a593Smuzhiyun }
2837*4882a593Smuzhiyun
2838*4882a593Smuzhiyun #if IS_ENABLED(CONFIG_MPTCP_IPV6)
2839*4882a593Smuzhiyun static const struct proto_ops mptcp_v6_stream_ops = {
2840*4882a593Smuzhiyun .family = PF_INET6,
2841*4882a593Smuzhiyun .owner = THIS_MODULE,
2842*4882a593Smuzhiyun .release = inet6_release,
2843*4882a593Smuzhiyun .bind = mptcp_bind,
2844*4882a593Smuzhiyun .connect = mptcp_stream_connect,
2845*4882a593Smuzhiyun .socketpair = sock_no_socketpair,
2846*4882a593Smuzhiyun .accept = mptcp_stream_accept,
2847*4882a593Smuzhiyun .getname = inet6_getname,
2848*4882a593Smuzhiyun .poll = mptcp_poll,
2849*4882a593Smuzhiyun .ioctl = inet6_ioctl,
2850*4882a593Smuzhiyun .gettstamp = sock_gettstamp,
2851*4882a593Smuzhiyun .listen = mptcp_listen,
2852*4882a593Smuzhiyun .shutdown = mptcp_shutdown,
2853*4882a593Smuzhiyun .setsockopt = sock_common_setsockopt,
2854*4882a593Smuzhiyun .getsockopt = sock_common_getsockopt,
2855*4882a593Smuzhiyun .sendmsg = inet6_sendmsg,
2856*4882a593Smuzhiyun .recvmsg = inet6_recvmsg,
2857*4882a593Smuzhiyun .mmap = sock_no_mmap,
2858*4882a593Smuzhiyun .sendpage = inet_sendpage,
2859*4882a593Smuzhiyun #ifdef CONFIG_COMPAT
2860*4882a593Smuzhiyun .compat_ioctl = inet6_compat_ioctl,
2861*4882a593Smuzhiyun #endif
2862*4882a593Smuzhiyun };
2863*4882a593Smuzhiyun
2864*4882a593Smuzhiyun static struct proto mptcp_v6_prot;
2865*4882a593Smuzhiyun
/* sk_prot->destroy for IPv6 MPTCP sockets: run the common MPTCP teardown
 * first, then the IPv6-specific socket cleanup.
 */
static void mptcp_v6_destroy(struct sock *sk)
{
	mptcp_destroy(sk);
	inet6_destroy_sock(sk);
}
2871*4882a593Smuzhiyun
2872*4882a593Smuzhiyun static struct inet_protosw mptcp_v6_protosw = {
2873*4882a593Smuzhiyun .type = SOCK_STREAM,
2874*4882a593Smuzhiyun .protocol = IPPROTO_MPTCP,
2875*4882a593Smuzhiyun .prot = &mptcp_v6_prot,
2876*4882a593Smuzhiyun .ops = &mptcp_v6_stream_ops,
2877*4882a593Smuzhiyun .flags = INET_PROTOSW_ICSK,
2878*4882a593Smuzhiyun };
2879*4882a593Smuzhiyun
mptcp_proto_v6_init(void)2880*4882a593Smuzhiyun int __init mptcp_proto_v6_init(void)
2881*4882a593Smuzhiyun {
2882*4882a593Smuzhiyun int err;
2883*4882a593Smuzhiyun
2884*4882a593Smuzhiyun mptcp_v6_prot = mptcp_prot;
2885*4882a593Smuzhiyun strcpy(mptcp_v6_prot.name, "MPTCPv6");
2886*4882a593Smuzhiyun mptcp_v6_prot.slab = NULL;
2887*4882a593Smuzhiyun mptcp_v6_prot.destroy = mptcp_v6_destroy;
2888*4882a593Smuzhiyun mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
2889*4882a593Smuzhiyun
2890*4882a593Smuzhiyun err = proto_register(&mptcp_v6_prot, 1);
2891*4882a593Smuzhiyun if (err)
2892*4882a593Smuzhiyun return err;
2893*4882a593Smuzhiyun
2894*4882a593Smuzhiyun err = inet6_register_protosw(&mptcp_v6_protosw);
2895*4882a593Smuzhiyun if (err)
2896*4882a593Smuzhiyun proto_unregister(&mptcp_v6_prot);
2897*4882a593Smuzhiyun
2898*4882a593Smuzhiyun return err;
2899*4882a593Smuzhiyun }
2900*4882a593Smuzhiyun #endif
2901