// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() calls
 *		Alan Cox	:	Set the ACK bit on a reset
 *		Alan Cox	:	Stopped it crashing if it closed while
 *					sk->inuse=1 and was trying to connect
 *					(tcp_err()).
 *		Alan Cox	:	All icmp error handling was broken
 *					pointers passed where wrong and the
 *					socket was looked up backwards. Nobody
 *					tested any icmp error code obviously.
 *		Alan Cox	:	tcp_err() now handled properly. It
 *					wakes people on errors. poll
 *					behaves and the icmp error race
 *					has gone by moving it into sock.c
 *		Alan Cox	:	tcp_send_reset() fixed to work for
 *					everything not just packets for
 *					unknown sockets.
 *		Alan Cox	:	tcp option processing.
 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
 *					syn rule wrong]
 *		Herp Rosmanith	:	More reset fixes
 *		Alan Cox	:	No longer acks invalid rst frames.
 *					Acking any kind of RST is right out.
 *		Alan Cox	:	Sets an ignore me flag on an rst
 *					receive otherwise odd bits of prattle
 *					escape still
 *		Alan Cox	:	Fixed another acking RST frame bug.
 *					Should stop LAN workplace lockups.
 *		Alan Cox	:	Some tidyups using the new skb list
 *					facilities
 *		Alan Cox	:	sk->keepopen now seems to work
 *		Alan Cox	:	Pulls options out correctly on accepts
 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
 *					bit to skb ops.
 *		Alan Cox	:	Tidied tcp_data to avoid a potential
 *					nasty.
 *		Alan Cox	:	Added some better commenting, as the
 *					tcp is hard to follow
 *		Alan Cox	:	Removed incorrect check for 20 * psh
 *		Michael O'Reilly	:	ack < copied bug fix.
 *		Johannes Stille		:	Misc tcp fixes (not all in yet).
 *		Alan Cox	:	FIN with no memory -> CRASH
 *		Alan Cox	:	Added socket option proto entries.
 *					Also added awareness of them to accept.
 *		Alan Cox	:	Added TCP options (SOL_TCP)
 *		Alan Cox	:	Switched wakeup calls to callbacks,
 *					so the kernel can layer network
 *					sockets.
 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
 *		Alan Cox	:	Handle FIN (more) properly (we hope).
 *		Alan Cox	:	RST frames sent on unsynchronised
 *					state ack error.
 *		Alan Cox	:	Put in missing check for SYN bit.
 *		Alan Cox	:	Added tcp_select_window() aka NET2E
 *					window non shrink trick.
 *		Alan Cox	:	Added a couple of small NET2E timer
 *					fixes
 *		Charles Hedrick	:	TCP fixes
 *		Toomas Tamm	:	TCP window fixes
 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *		Charles Hedrick	:	Rewrote most of it to actually work
 *		Linus		:	Rewrote tcp_read() and URG handling
 *					completely
 *		Gerhard Koerting:	Fixed some missing timer handling
 *		Matthew Dillon	:	Reworked TCP machine states as per RFC
 *		Gerhard Koerting:	PC/TCP workarounds
 *		Adam Caldwell	:	Assorted timer/timing errors
 *		Matthew Dillon	:	Fixed another RST bug
 *		Alan Cox	:	Move to kernel side addressing changes.
 *		Alan Cox	:	Beginning work on TCP fastpathing
 *					(not yet usable)
 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *		Alan Cox	:	TCP fast path debugging
 *		Alan Cox	:	Window clamping
 *		Michael Riepe	:	Bug in tcp_check()
 *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the
 *					TCP code (Be very nice to this man if
 *					tcp finally works 100%) 8)
 *		Alan Cox	:	BSD accept semantics.
 *		Alan Cox	:	Reset on closedown bug.
 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
 *		Michael Pall	:	Handle poll() after URG properly in
 *					all cases.
 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
 *					(multi URG PUSH broke rlogin).
 *		Michael Pall	:	Fix the multi URG PUSH problem in
 *					tcp_readable(), poll() after URG
 *					works now.
 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 *					BSD api.
 *		Alan Cox	:	Changed the semantics of sk->socket to
 *					fix a race and a signal problem with
 *					accept() and async I/O.
 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
 *		Craig I. Hagan	:	Allow for BSD compatible TIME_WAIT for
 *					clients/servers which listen in on
 *					fixed ports.
 *		Alan Cox	:	Cleaned the above up and shrank it to
 *					a sensible code size.
 *		Alan Cox	:	Self connect lockup fix.
 *		Alan Cox	:	No connect to multicast.
 *		Ross Biro	:	Close unaccepted children on master
 *					socket close.
 *		Alan Cox	:	Reset tracing code.
 *		Alan Cox	:	Spurious resets on shutdown.
 *		Alan Cox	:	Giant 15 minute/60 second timer error
 *		Alan Cox	:	Small whoops in polling before an
 *					accept.
 *		Alan Cox	:	Kept the state trace facility since
 *					it's handy for debugging.
 *		Alan Cox	:	More reset handler fixes.
 *		Alan Cox	:	Started rewriting the code based on
 *					the RFC's for other useful protocol
 *					references see: Comer, KA9Q NOS, and
 *					for a reference on the difference
 *					between specifications and how BSD
 *					works see the 4.4lite source.
 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *					close.
 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *		Alan Cox	:	Reimplemented timers as per the RFC
 *					and using multiple timers for sanity.
 *		Alan Cox	:	Small bug fixes, and a lot of new
 *					comments.
 *		Alan Cox	:	Fixed dual reader crash by locking
 *					the buffers (much like datagram.c)
 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 *					now gets fed up of retrying without
 *					(even a no space) answer.
 *		Alan Cox	:	Extracted closing code better
 *		Alan Cox	:	Fixed the closing state machine to
 *					resemble the RFC.
 *		Alan Cox	:	More 'per spec' fixes.
 *		Jorge Cwik	:	Even faster checksumming.
 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
 *					only frames. At least one pc tcp stack
 *					generates them.
 *		Alan Cox	:	Cache last socket.
 *		Alan Cox	:	Per route irtt.
 *		Matt Day	:	poll()->select() match BSD precisely on error
 *		Alan Cox	:	New buffers
 *		Marc Tamsky	:	Various sk->prot->retransmits and
 *					sk->retransmits misupdating fixed.
 *					Fixed tcp_write_timeout: stuck close,
 *					and TCP syn retries gets used now.
 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *					ack if state is TCP_CLOSED.
 *		Alan Cox	:	Look up device on a retransmit - routes may
 *					change. Doesn't yet cope with MSS shrink right
 *					but it's a start!
 *		Marc Tamsky	:	Closing in closing fixes.
 *		Mike Shaver	:	RFC1122 verifications.
 *		Alan Cox	:	rcv_saddr errors.
 *		Alan Cox	:	Block double connect().
 *		Alan Cox	:	Small hooks for enSKIP.
 *		Alexey Kuznetsov:	Path MTU discovery.
 *		Alan Cox	:	Support soft errors.
 *		Alan Cox	:	Fix MTU discovery pathological case
 *					when the remote claims no mtu!
 *		Marc Tamsky	:	TCP_CLOSE fix.
 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
 *					window but wrong (fixes NT lpd problems)
 *		Pedro Roque	:	Better TCP window handling, delayed ack.
 *		Joerg Reuter	:	No modification of locked buffers in
 *					tcp_do_retransmit()
 *		Eric Schenk	:	Changed receiver side silly window
 *					avoidance algorithm to BSD style
 *					algorithm. This doubles throughput
 *					against machines running Solaris,
 *					and seems to result in general
 *					improvement.
 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
 *	Willy Konynenberg	:	Transparent proxying support.
 *	Mike McLagan		:	Routing by source
 *		Keith Owens	:	Do proper merging with partial SKB's in
 *					tcp_do_sendmsg to avoid burstiness.
 *		Eric Schenk	:	Fix fast close down bug with
 *					shutdown() followed by close().
 *		Andi Kleen	:	Make poll agree with SIGIO
 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
 *					lingertime == 0 (RFC 793 ABORT Call)
 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
 *					csum_and_copy_from_user() if possible.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

#include <trace/hooks/ipv4.h>

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);

atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);

#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

/*
 * Current number of TCP sockets.
 */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

/*
 * TCP splice context
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
EXPORT_SYMBOL(tcp_rx_skb_cache_key);

DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);

void tcp_enter_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (READ_ONCE(tcp_memory_pressure))
		return;
	val = jiffies;

	if (!val)
		val--;
	if (!cmpxchg(&tcp_memory_pressure, 0, val))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);

void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!READ_ONCE(tcp_memory_pressure))
		return;
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}
EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);

/* Convert seconds to retransmits based on initial and max timeout */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

/* Convert retransmits to seconds based on initial and max timeout */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

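/* Most recent delivery rate estimate, in bytes per second: packets
 * delivered over the last rate sample interval, scaled by the cached MSS.
 * Returns 0 if no rate sample is available yet.
 */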
static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
{
	u32 rate = READ_ONCE(tp->rate_delivered);
	u32 intv = READ_ONCE(tp->rate_interval_us);
	u64 rate64 = 0;

	if (rate && intv) {
		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
		do_div(rate64, intv);
	}
	return rate64;
}

/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	icsk->icsk_rto_min = TCP_RTO_MIN;
	icsk->icsk_delack_max = TCP_DELACK_MAX;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;
	tp->rack.reo_wnd_steps = 1;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));

	sk_sockets_allocated_inc(sk);
	sk->sk_route_forced_caps = NETIF_F_GSO;
}
EXPORT_SYMBOL(tcp_init_sock);

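/* Note transmit timestamp requests on the skb at the tail of the write
 * queue: propagate the requested SO_TIMESTAMPING tx_flags, flag a pending
 * ACK timestamp if asked for, and record the last byte of this send as
 * the timestamp key.
 */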
static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
			tcb->txstamp_ack = 1;
		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
	}
}

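/* A stream is readable once enough bytes are queued to satisfy @target
 * (the caller's low-water mark), or sooner when we are under receive
 * memory pressure or the receive window has shrunk to at most one MSS;
 * a protocol may add its own check via ->stream_memory_read().
 */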
static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
					  int target, struct sock *sk)
{
	int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);

	if (avail > 0) {
		if (avail >= target)
			return true;
		if (tcp_rmem_pressure(sk))
			return true;
		if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
			return true;
	}
	if (sk->sk_prot->stream_memory_read)
		return sk->sk_prot->stream_memory_read(sk);
	return false;
}

/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	int state;

	sock_poll_wait(file, sock, wait);

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */

	mask = 0;

	/*
	 * EPOLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that EPOLLHUP is incompatible
	 * with the EPOLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	 *
	 * Check-me.
	 *
	 * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making impossible poll() on write()
	 * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
	 * if and only if shutdown has been made in both directions.
	 * Actually, it is interesting to look how Solaris and DUX
	 * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
	 * then we could set it on SND_SHUTDOWN. BTW examples given
	 * in Stevens' books assume exactly this behaviour, it explains
	 * why EPOLLHUP is incompatible with EPOLLOUT.	--ANK
	 *
	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
	 * blocking on fresh not-connected or disconnected socket. --ANK
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (state != TCP_SYN_SENT &&
	    (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		if (tcp_stream_is_readable(tp, target, sk))
			mask |= EPOLLIN | EPOLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (__sk_stream_is_writeable(sk, 1)) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {  /* send SIGIO later */
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost. Memory barrier
				 * pairs with the input side.
				 */
				smp_mb__after_atomic();
				if (__sk_stream_is_writeable(sk, 1))
					mask |= EPOLLOUT | EPOLLWRNORM;
			}
		} else
			mask |= EPOLLOUT | EPOLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= EPOLLPRI;
	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
		/* Active TCP fastopen socket with defer_connect
		 * Return EPOLLOUT so application can call write()
		 * in order for kernel to generate SYN+data
		 */
		mask |= EPOLLOUT | EPOLLWRNORM;
	}
	/* This barrier is coupled with smp_wmb() in tcp_reset() */
	smp_rmb();
	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		answ = tcp_inq(sk);
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = tp->urg_data &&
		       READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = READ_ONCE(tp->write_seq) - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = READ_ONCE(tp->write_seq) -
			       READ_ONCE(tp->snd_nxt);
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

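/* Queue a freshly allocated skb at the tail of the write queue and charge
 * its truesize to the socket write-memory accounting. The skb takes the
 * current write_seq, is marked to carry an ACK, and consumes any pending
 * one-shot TCP_NAGLE_PUSH hint.
 */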
static void skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	skb->csum = 0;
	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	tcb->sacked = 0;
	__skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk_wmem_queued_add(sk, skb->truesize);
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;

	tcp_slow_start_after_idle_check(sk);
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

/* If a not yet filled skb is pushed, do not send it if
 * we have data packets in Qdisc or NIC queues :
 * Because TX completion will happen shortly, it gives a chance
 * to coalesce future sendmsg() payload into this skb, without
 * need for a timer, and with no latency trade off.
 * As packets containing data payload have a bigger truesize
 * than pure acks (dataless) packets, the last checks prevent
 * autocorking if we only have an ACK in Qdisc/NIC queues,
 * or if TX completion was delayed after we processed ACK packet.
 */
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
	       !tcp_rtx_queue_empty(sk) &&
	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}

void tcp_push(struct sock *sk, int flags, int mss_now,
	      int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	skb = tcp_write_queue_tail(sk);
	if (!skb)
		return;
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {

		/* avoid atomic op if TSQ_THROTTLED bit is already set */
		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		}
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED.
		 */
		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}

	if (flags & MSG_MORE)
		nonagle = TCP_NAGLE_CORK;

	__tcp_push_pending_frames(sk, mss_now, nonagle);
}

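/* tcp_read_sock() callback used by splice: move up to the remaining pipe
 * budget from this skb into the destination pipe and account for the
 * amount actually spliced.
 */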
static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
			      min(rd_desc->count, len), tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	/* Store TCP splice context information in read_descriptor_t. */
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count	  = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 * @sock:	socket to splice from
 * @ppos:	position (not valid)
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);
	/*
	 * We can't seek on a socket input
	 */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when user tries to read
				 * from never connected socket.
				 */
				ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			/* if __tcp_splice_read() got nothing while we have
			 * an skb in receive queue, we do not want to loop.
			 * This might happen with URG data.
			 */
			if (!skb_queue_empty(&sk->sk_receive_queue))
				break;
			sk_wait_data(sk, &timeo, NULL);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}
EXPORT_SYMBOL(tcp_splice_read);

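/* Allocate an skb for transmission. Zero-sized requests may reuse the
 * per-socket tx skb cache. The requested size is rounded up so the TCP
 * header stays 32-bit aligned, write memory is (force-)scheduled, and
 * room for the maximal protocol header is reserved. Returns NULL (and
 * signals memory pressure) on failure.
 */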
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
				    bool force_schedule)
{
	struct sk_buff *skb;

	if (likely(!size)) {
		skb = sk->sk_tx_skb_cache;
		if (skb) {
			skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
			sk->sk_tx_skb_cache = NULL;
			pskb_trim(skb, 0);
			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
			skb_shinfo(skb)->tx_flags = 0;
			memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
			return skb;
		}
	}
	/* The TCP header must be at least 32-bit aligned.  */
	size = ALIGN(size, 4);

	if (unlikely(tcp_under_memory_pressure(sk)))
		sk_mem_reclaim_partial(sk);

	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
	if (likely(skb)) {
		bool mem_scheduled;

		if (force_schedule) {
			mem_scheduled = true;
			sk_forced_mem_schedule(sk, skb->truesize);
		} else {
			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
		}
		if (likely(mem_scheduled)) {
			skb_reserve(skb, sk->sk_prot->max_header);
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
			skb->reserved_tailroom = skb->end - skb->tail - size;
			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

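/* Size goal for each skb on the write queue: plain mss_now when GSO is
 * not allowed, otherwise roughly the GSO max size clamped to half the
 * largest window seen, recomputed (and cached via tp->gso_segs) only when
 * it has drifted by at least one MSS.
 */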
tcp_xmit_size_goal(struct sock * sk,u32 mss_now,int large_allowed)918*4882a593Smuzhiyun static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
919*4882a593Smuzhiyun int large_allowed)
920*4882a593Smuzhiyun {
921*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
922*4882a593Smuzhiyun u32 new_size_goal, size_goal;
923*4882a593Smuzhiyun
924*4882a593Smuzhiyun if (!large_allowed)
925*4882a593Smuzhiyun return mss_now;
926*4882a593Smuzhiyun
927*4882a593Smuzhiyun /* Note : tcp_tso_autosize() will eventually split this later */
928*4882a593Smuzhiyun new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
929*4882a593Smuzhiyun new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
930*4882a593Smuzhiyun
931*4882a593Smuzhiyun /* We try hard to avoid divides here */
932*4882a593Smuzhiyun size_goal = tp->gso_segs * mss_now;
933*4882a593Smuzhiyun if (unlikely(new_size_goal < size_goal ||
934*4882a593Smuzhiyun new_size_goal >= size_goal + mss_now)) {
935*4882a593Smuzhiyun tp->gso_segs = min_t(u16, new_size_goal / mss_now,
936*4882a593Smuzhiyun sk->sk_gso_max_segs);
937*4882a593Smuzhiyun size_goal = tp->gso_segs * mss_now;
938*4882a593Smuzhiyun }
939*4882a593Smuzhiyun
940*4882a593Smuzhiyun return max(size_goal, mss_now);
941*4882a593Smuzhiyun }
942*4882a593Smuzhiyun
tcp_send_mss(struct sock * sk,int * size_goal,int flags)943*4882a593Smuzhiyun int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
944*4882a593Smuzhiyun {
945*4882a593Smuzhiyun int mss_now;
946*4882a593Smuzhiyun
947*4882a593Smuzhiyun mss_now = tcp_current_mss(sk);
948*4882a593Smuzhiyun *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
949*4882a593Smuzhiyun
950*4882a593Smuzhiyun return mss_now;
951*4882a593Smuzhiyun }
952*4882a593Smuzhiyun
953*4882a593Smuzhiyun /* In some cases, both sendpage() and sendmsg() could have added
954*4882a593Smuzhiyun * an skb to the write queue, but failed adding payload on it.
955*4882a593Smuzhiyun * We need to remove it to consume less memory, but more
956*4882a593Smuzhiyun * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
957*4882a593Smuzhiyun * users.
958*4882a593Smuzhiyun */
tcp_remove_empty_skb(struct sock * sk,struct sk_buff * skb)959*4882a593Smuzhiyun static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
960*4882a593Smuzhiyun {
961*4882a593Smuzhiyun if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
962*4882a593Smuzhiyun tcp_unlink_write_queue(skb, sk);
963*4882a593Smuzhiyun if (tcp_write_queue_empty(sk))
964*4882a593Smuzhiyun tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
965*4882a593Smuzhiyun sk_wmem_free_skb(sk, skb);
966*4882a593Smuzhiyun }
967*4882a593Smuzhiyun }
968*4882a593Smuzhiyun
do_tcp_sendpages(struct sock * sk,struct page * page,int offset,size_t size,int flags)969*4882a593Smuzhiyun ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
970*4882a593Smuzhiyun size_t size, int flags)
971*4882a593Smuzhiyun {
972*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
973*4882a593Smuzhiyun int mss_now, size_goal;
974*4882a593Smuzhiyun int err;
975*4882a593Smuzhiyun ssize_t copied;
976*4882a593Smuzhiyun long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
977*4882a593Smuzhiyun
978*4882a593Smuzhiyun if (IS_ENABLED(CONFIG_DEBUG_VM) &&
979*4882a593Smuzhiyun WARN_ONCE(!sendpage_ok(page),
980*4882a593Smuzhiyun "page must not be a Slab one and have page_count > 0"))
981*4882a593Smuzhiyun return -EINVAL;
982*4882a593Smuzhiyun
983*4882a593Smuzhiyun /* Wait for a connection to finish. One exception is TCP Fast Open
984*4882a593Smuzhiyun * (passive side) where data is allowed to be sent before a connection
985*4882a593Smuzhiyun * is fully established.
986*4882a593Smuzhiyun */
987*4882a593Smuzhiyun if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
988*4882a593Smuzhiyun !tcp_passive_fastopen(sk)) {
989*4882a593Smuzhiyun err = sk_stream_wait_connect(sk, &timeo);
990*4882a593Smuzhiyun if (err != 0)
991*4882a593Smuzhiyun goto out_err;
992*4882a593Smuzhiyun }
993*4882a593Smuzhiyun
994*4882a593Smuzhiyun sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
995*4882a593Smuzhiyun
996*4882a593Smuzhiyun mss_now = tcp_send_mss(sk, &size_goal, flags);
997*4882a593Smuzhiyun copied = 0;
998*4882a593Smuzhiyun
999*4882a593Smuzhiyun err = -EPIPE;
1000*4882a593Smuzhiyun if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1001*4882a593Smuzhiyun goto out_err;
1002*4882a593Smuzhiyun
1003*4882a593Smuzhiyun while (size > 0) {
1004*4882a593Smuzhiyun struct sk_buff *skb = tcp_write_queue_tail(sk);
1005*4882a593Smuzhiyun int copy, i;
1006*4882a593Smuzhiyun bool can_coalesce;
1007*4882a593Smuzhiyun
1008*4882a593Smuzhiyun if (!skb || (copy = size_goal - skb->len) <= 0 ||
1009*4882a593Smuzhiyun !tcp_skb_can_collapse_to(skb)) {
1010*4882a593Smuzhiyun new_segment:
1011*4882a593Smuzhiyun if (!sk_stream_memory_free(sk))
1012*4882a593Smuzhiyun goto wait_for_space;
1013*4882a593Smuzhiyun
1014*4882a593Smuzhiyun skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
1015*4882a593Smuzhiyun tcp_rtx_and_write_queues_empty(sk));
1016*4882a593Smuzhiyun if (!skb)
1017*4882a593Smuzhiyun goto wait_for_space;
1018*4882a593Smuzhiyun
1019*4882a593Smuzhiyun #ifdef CONFIG_TLS_DEVICE
1020*4882a593Smuzhiyun skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
1021*4882a593Smuzhiyun #endif
1022*4882a593Smuzhiyun skb_entail(sk, skb);
1023*4882a593Smuzhiyun copy = size_goal;
1024*4882a593Smuzhiyun }
1025*4882a593Smuzhiyun
1026*4882a593Smuzhiyun if (copy > size)
1027*4882a593Smuzhiyun copy = size;
1028*4882a593Smuzhiyun
1029*4882a593Smuzhiyun i = skb_shinfo(skb)->nr_frags;
1030*4882a593Smuzhiyun can_coalesce = skb_can_coalesce(skb, i, page, offset);
1031*4882a593Smuzhiyun if (!can_coalesce && i >= sysctl_max_skb_frags) {
1032*4882a593Smuzhiyun tcp_mark_push(tp, skb);
1033*4882a593Smuzhiyun goto new_segment;
1034*4882a593Smuzhiyun }
1035*4882a593Smuzhiyun if (!sk_wmem_schedule(sk, copy))
1036*4882a593Smuzhiyun goto wait_for_space;
1037*4882a593Smuzhiyun
1038*4882a593Smuzhiyun if (can_coalesce) {
1039*4882a593Smuzhiyun skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1040*4882a593Smuzhiyun } else {
1041*4882a593Smuzhiyun get_page(page);
1042*4882a593Smuzhiyun skb_fill_page_desc(skb, i, page, offset, copy);
1043*4882a593Smuzhiyun }
1044*4882a593Smuzhiyun
1045*4882a593Smuzhiyun if (!(flags & MSG_NO_SHARED_FRAGS))
1046*4882a593Smuzhiyun skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
1047*4882a593Smuzhiyun
1048*4882a593Smuzhiyun skb->len += copy;
1049*4882a593Smuzhiyun skb->data_len += copy;
1050*4882a593Smuzhiyun skb->truesize += copy;
1051*4882a593Smuzhiyun sk_wmem_queued_add(sk, copy);
1052*4882a593Smuzhiyun sk_mem_charge(sk, copy);
1053*4882a593Smuzhiyun skb->ip_summed = CHECKSUM_PARTIAL;
1054*4882a593Smuzhiyun WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1055*4882a593Smuzhiyun TCP_SKB_CB(skb)->end_seq += copy;
1056*4882a593Smuzhiyun tcp_skb_pcount_set(skb, 0);
1057*4882a593Smuzhiyun
1058*4882a593Smuzhiyun if (!copied)
1059*4882a593Smuzhiyun TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1060*4882a593Smuzhiyun
1061*4882a593Smuzhiyun copied += copy;
1062*4882a593Smuzhiyun offset += copy;
1063*4882a593Smuzhiyun size -= copy;
1064*4882a593Smuzhiyun if (!size)
1065*4882a593Smuzhiyun goto out;
1066*4882a593Smuzhiyun
1067*4882a593Smuzhiyun if (skb->len < size_goal || (flags & MSG_OOB))
1068*4882a593Smuzhiyun continue;
1069*4882a593Smuzhiyun
1070*4882a593Smuzhiyun if (forced_push(tp)) {
1071*4882a593Smuzhiyun tcp_mark_push(tp, skb);
1072*4882a593Smuzhiyun __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1073*4882a593Smuzhiyun } else if (skb == tcp_send_head(sk))
1074*4882a593Smuzhiyun tcp_push_one(sk, mss_now);
1075*4882a593Smuzhiyun continue;
1076*4882a593Smuzhiyun
1077*4882a593Smuzhiyun wait_for_space:
1078*4882a593Smuzhiyun set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1079*4882a593Smuzhiyun tcp_push(sk, flags & ~MSG_MORE, mss_now,
1080*4882a593Smuzhiyun TCP_NAGLE_PUSH, size_goal);
1081*4882a593Smuzhiyun
1082*4882a593Smuzhiyun err = sk_stream_wait_memory(sk, &timeo);
1083*4882a593Smuzhiyun if (err != 0)
1084*4882a593Smuzhiyun goto do_error;
1085*4882a593Smuzhiyun
1086*4882a593Smuzhiyun mss_now = tcp_send_mss(sk, &size_goal, flags);
1087*4882a593Smuzhiyun }
1088*4882a593Smuzhiyun
1089*4882a593Smuzhiyun out:
1090*4882a593Smuzhiyun if (copied) {
1091*4882a593Smuzhiyun tcp_tx_timestamp(sk, sk->sk_tsflags);
1092*4882a593Smuzhiyun if (!(flags & MSG_SENDPAGE_NOTLAST))
1093*4882a593Smuzhiyun tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1094*4882a593Smuzhiyun }
1095*4882a593Smuzhiyun return copied;
1096*4882a593Smuzhiyun
1097*4882a593Smuzhiyun do_error:
1098*4882a593Smuzhiyun tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
1099*4882a593Smuzhiyun if (copied)
1100*4882a593Smuzhiyun goto out;
1101*4882a593Smuzhiyun out_err:
1102*4882a593Smuzhiyun /* make sure we wake any epoll edge trigger waiter */
1103*4882a593Smuzhiyun if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1104*4882a593Smuzhiyun sk->sk_write_space(sk);
1105*4882a593Smuzhiyun tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1106*4882a593Smuzhiyun }
1107*4882a593Smuzhiyun return sk_stream_error(sk, flags, err);
1108*4882a593Smuzhiyun }
1109*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(do_tcp_sendpages);
1110*4882a593Smuzhiyun
tcp_sendpage_locked(struct sock * sk,struct page * page,int offset,size_t size,int flags)1111*4882a593Smuzhiyun int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
1112*4882a593Smuzhiyun size_t size, int flags)
1113*4882a593Smuzhiyun {
1114*4882a593Smuzhiyun if (!(sk->sk_route_caps & NETIF_F_SG))
1115*4882a593Smuzhiyun return sock_no_sendpage_locked(sk, page, offset, size, flags);
1116*4882a593Smuzhiyun
1117*4882a593Smuzhiyun tcp_rate_check_app_limited(sk); /* is sending application-limited? */
1118*4882a593Smuzhiyun
1119*4882a593Smuzhiyun return do_tcp_sendpages(sk, page, offset, size, flags);
1120*4882a593Smuzhiyun }
1121*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
1122*4882a593Smuzhiyun
1123*4882a593Smuzhiyun int tcp_sendpage(struct sock *sk, struct page *page, int offset,
1124*4882a593Smuzhiyun size_t size, int flags)
1125*4882a593Smuzhiyun {
1126*4882a593Smuzhiyun int ret;
1127*4882a593Smuzhiyun
1128*4882a593Smuzhiyun lock_sock(sk);
1129*4882a593Smuzhiyun ret = tcp_sendpage_locked(sk, page, offset, size, flags);
1130*4882a593Smuzhiyun release_sock(sk);
1131*4882a593Smuzhiyun
1132*4882a593Smuzhiyun return ret;
1133*4882a593Smuzhiyun }
1134*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_sendpage);
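
/*
 * Illustrative only (not part of this file): tcp_sendpage() is the path
 * taken by zero-copy transmit interfaces such as sendfile(2) on a TCP
 * socket, provided the route supports scatter/gather (NETIF_F_SG).
 * A minimal user-space sketch; the helper name and error handling are
 * hypothetical, only sendfile(2) itself is the real interface.
 */
#if 0
#include <sys/sendfile.h>
#include <unistd.h>

/* Send 'file_size' bytes of file 'fd' over the connected TCP socket 'sock'. */
static int send_file_over_tcp(int sock, int fd, size_t file_size)
{
	off_t off = 0;

	while ((size_t)off < file_size) {
		ssize_t n = sendfile(sock, fd, &off, file_size - off);

		if (n <= 0)
			return -1;	/* error or unexpected EOF; errno is set */
	}
	return 0;
}
#endif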
1135*4882a593Smuzhiyun
1136*4882a593Smuzhiyun void tcp_free_fastopen_req(struct tcp_sock *tp)
1137*4882a593Smuzhiyun {
1138*4882a593Smuzhiyun if (tp->fastopen_req) {
1139*4882a593Smuzhiyun kfree(tp->fastopen_req);
1140*4882a593Smuzhiyun tp->fastopen_req = NULL;
1141*4882a593Smuzhiyun }
1142*4882a593Smuzhiyun }
1143*4882a593Smuzhiyun
1144*4882a593Smuzhiyun static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1145*4882a593Smuzhiyun int *copied, size_t size,
1146*4882a593Smuzhiyun struct ubuf_info *uarg)
1147*4882a593Smuzhiyun {
1148*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
1149*4882a593Smuzhiyun struct inet_sock *inet = inet_sk(sk);
1150*4882a593Smuzhiyun struct sockaddr *uaddr = msg->msg_name;
1151*4882a593Smuzhiyun int err, flags;
1152*4882a593Smuzhiyun
1153*4882a593Smuzhiyun if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
1154*4882a593Smuzhiyun TFO_CLIENT_ENABLE) ||
1155*4882a593Smuzhiyun (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
1156*4882a593Smuzhiyun uaddr->sa_family == AF_UNSPEC))
1157*4882a593Smuzhiyun return -EOPNOTSUPP;
1158*4882a593Smuzhiyun if (tp->fastopen_req)
1159*4882a593Smuzhiyun return -EALREADY; /* Another Fast Open is in progress */
1160*4882a593Smuzhiyun
1161*4882a593Smuzhiyun tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1162*4882a593Smuzhiyun sk->sk_allocation);
1163*4882a593Smuzhiyun if (unlikely(!tp->fastopen_req))
1164*4882a593Smuzhiyun return -ENOBUFS;
1165*4882a593Smuzhiyun tp->fastopen_req->data = msg;
1166*4882a593Smuzhiyun tp->fastopen_req->size = size;
1167*4882a593Smuzhiyun tp->fastopen_req->uarg = uarg;
1168*4882a593Smuzhiyun
1169*4882a593Smuzhiyun if (inet->defer_connect) {
1170*4882a593Smuzhiyun err = tcp_connect(sk);
1171*4882a593Smuzhiyun /* Same failure procedure as in tcp_v4/6_connect */
1172*4882a593Smuzhiyun if (err) {
1173*4882a593Smuzhiyun tcp_set_state(sk, TCP_CLOSE);
1174*4882a593Smuzhiyun inet->inet_dport = 0;
1175*4882a593Smuzhiyun sk->sk_route_caps = 0;
1176*4882a593Smuzhiyun }
1177*4882a593Smuzhiyun }
1178*4882a593Smuzhiyun flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1179*4882a593Smuzhiyun err = __inet_stream_connect(sk->sk_socket, uaddr,
1180*4882a593Smuzhiyun msg->msg_namelen, flags, 1);
1181*4882a593Smuzhiyun /* fastopen_req could already be freed in __inet_stream_connect
1182*4882a593Smuzhiyun * if the connection times out or gets rst
1183*4882a593Smuzhiyun */
1184*4882a593Smuzhiyun if (tp->fastopen_req) {
1185*4882a593Smuzhiyun *copied = tp->fastopen_req->copied;
1186*4882a593Smuzhiyun tcp_free_fastopen_req(tp);
1187*4882a593Smuzhiyun inet->defer_connect = 0;
1188*4882a593Smuzhiyun }
1189*4882a593Smuzhiyun return err;
1190*4882a593Smuzhiyun }
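
/*
 * Illustrative only (not part of this file): tcp_sendmsg_fastopen() is the
 * client half of TCP Fast Open; data passed to sendmsg()/sendto() with
 * MSG_FASTOPEN is carried in the SYN once the client holds a valid TFO
 * cookie.  A hedged user-space sketch assuming net.ipv4.tcp_fastopen has
 * the client bit (TFO_CLIENT_ENABLE) set; the helper name is hypothetical.
 */
#if 0
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

/* Connect to 'addr' with TCP Fast Open and send 'buf' in (or right after)
 * the SYN.  On success the caller owns '*fd' and must close() it.
 */
static ssize_t tfo_connect_and_send(int *fd, const struct sockaddr_in *addr,
				    const void *buf, size_t len)
{
	ssize_t n;

	*fd = socket(AF_INET, SOCK_STREAM, 0);
	if (*fd < 0)
		return -1;
	/* sendto() + MSG_FASTOPEN performs the connect() implicitly. */
	n = sendto(*fd, buf, len, MSG_FASTOPEN,
		   (const struct sockaddr *)addr, sizeof(*addr));
	if (n < 0)
		close(*fd);
	return n;
}
#endif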
1191*4882a593Smuzhiyun
1192*4882a593Smuzhiyun int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
1193*4882a593Smuzhiyun {
1194*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
1195*4882a593Smuzhiyun struct ubuf_info *uarg = NULL;
1196*4882a593Smuzhiyun struct sk_buff *skb;
1197*4882a593Smuzhiyun struct sockcm_cookie sockc;
1198*4882a593Smuzhiyun int flags, err, copied = 0;
1199*4882a593Smuzhiyun int mss_now = 0, size_goal, copied_syn = 0;
1200*4882a593Smuzhiyun int process_backlog = 0;
1201*4882a593Smuzhiyun bool zc = false;
1202*4882a593Smuzhiyun long timeo;
1203*4882a593Smuzhiyun
1204*4882a593Smuzhiyun trace_android_rvh_tcp_sendmsg_locked(sk, size);
1205*4882a593Smuzhiyun flags = msg->msg_flags;
1206*4882a593Smuzhiyun
1207*4882a593Smuzhiyun if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
1208*4882a593Smuzhiyun skb = tcp_write_queue_tail(sk);
1209*4882a593Smuzhiyun uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
1210*4882a593Smuzhiyun if (!uarg) {
1211*4882a593Smuzhiyun err = -ENOBUFS;
1212*4882a593Smuzhiyun goto out_err;
1213*4882a593Smuzhiyun }
1214*4882a593Smuzhiyun
1215*4882a593Smuzhiyun zc = sk->sk_route_caps & NETIF_F_SG;
1216*4882a593Smuzhiyun if (!zc)
1217*4882a593Smuzhiyun uarg->zerocopy = 0;
1218*4882a593Smuzhiyun }
1219*4882a593Smuzhiyun
1220*4882a593Smuzhiyun if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
1221*4882a593Smuzhiyun !tp->repair) {
1222*4882a593Smuzhiyun err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
1223*4882a593Smuzhiyun if (err == -EINPROGRESS && copied_syn > 0)
1224*4882a593Smuzhiyun goto out;
1225*4882a593Smuzhiyun else if (err)
1226*4882a593Smuzhiyun goto out_err;
1227*4882a593Smuzhiyun }
1228*4882a593Smuzhiyun
1229*4882a593Smuzhiyun timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1230*4882a593Smuzhiyun
1231*4882a593Smuzhiyun tcp_rate_check_app_limited(sk); /* is sending application-limited? */
1232*4882a593Smuzhiyun
1233*4882a593Smuzhiyun /* Wait for a connection to finish. One exception is TCP Fast Open
1234*4882a593Smuzhiyun * (passive side) where data is allowed to be sent before a connection
1235*4882a593Smuzhiyun * is fully established.
1236*4882a593Smuzhiyun */
1237*4882a593Smuzhiyun if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1238*4882a593Smuzhiyun !tcp_passive_fastopen(sk)) {
1239*4882a593Smuzhiyun err = sk_stream_wait_connect(sk, &timeo);
1240*4882a593Smuzhiyun if (err != 0)
1241*4882a593Smuzhiyun goto do_error;
1242*4882a593Smuzhiyun }
1243*4882a593Smuzhiyun
1244*4882a593Smuzhiyun if (unlikely(tp->repair)) {
1245*4882a593Smuzhiyun if (tp->repair_queue == TCP_RECV_QUEUE) {
1246*4882a593Smuzhiyun copied = tcp_send_rcvq(sk, msg, size);
1247*4882a593Smuzhiyun goto out_nopush;
1248*4882a593Smuzhiyun }
1249*4882a593Smuzhiyun
1250*4882a593Smuzhiyun err = -EINVAL;
1251*4882a593Smuzhiyun if (tp->repair_queue == TCP_NO_QUEUE)
1252*4882a593Smuzhiyun goto out_err;
1253*4882a593Smuzhiyun
1254*4882a593Smuzhiyun /* 'common' sending to sendq */
1255*4882a593Smuzhiyun }
1256*4882a593Smuzhiyun
1257*4882a593Smuzhiyun sockcm_init(&sockc, sk);
1258*4882a593Smuzhiyun if (msg->msg_controllen) {
1259*4882a593Smuzhiyun err = sock_cmsg_send(sk, msg, &sockc);
1260*4882a593Smuzhiyun if (unlikely(err)) {
1261*4882a593Smuzhiyun err = -EINVAL;
1262*4882a593Smuzhiyun goto out_err;
1263*4882a593Smuzhiyun }
1264*4882a593Smuzhiyun }
1265*4882a593Smuzhiyun
1266*4882a593Smuzhiyun /* This should be in poll */
1267*4882a593Smuzhiyun sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1268*4882a593Smuzhiyun
1269*4882a593Smuzhiyun /* Ok commence sending. */
1270*4882a593Smuzhiyun copied = 0;
1271*4882a593Smuzhiyun
1272*4882a593Smuzhiyun restart:
1273*4882a593Smuzhiyun mss_now = tcp_send_mss(sk, &size_goal, flags);
1274*4882a593Smuzhiyun
1275*4882a593Smuzhiyun err = -EPIPE;
1276*4882a593Smuzhiyun if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1277*4882a593Smuzhiyun goto do_error;
1278*4882a593Smuzhiyun
1279*4882a593Smuzhiyun while (msg_data_left(msg)) {
1280*4882a593Smuzhiyun int copy = 0;
1281*4882a593Smuzhiyun
1282*4882a593Smuzhiyun skb = tcp_write_queue_tail(sk);
1283*4882a593Smuzhiyun if (skb)
1284*4882a593Smuzhiyun copy = size_goal - skb->len;
1285*4882a593Smuzhiyun
1286*4882a593Smuzhiyun if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1287*4882a593Smuzhiyun bool first_skb;
1288*4882a593Smuzhiyun
1289*4882a593Smuzhiyun new_segment:
1290*4882a593Smuzhiyun if (!sk_stream_memory_free(sk))
1291*4882a593Smuzhiyun goto wait_for_space;
1292*4882a593Smuzhiyun
1293*4882a593Smuzhiyun if (unlikely(process_backlog >= 16)) {
1294*4882a593Smuzhiyun process_backlog = 0;
1295*4882a593Smuzhiyun if (sk_flush_backlog(sk))
1296*4882a593Smuzhiyun goto restart;
1297*4882a593Smuzhiyun }
1298*4882a593Smuzhiyun first_skb = tcp_rtx_and_write_queues_empty(sk);
1299*4882a593Smuzhiyun skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
1300*4882a593Smuzhiyun first_skb);
1301*4882a593Smuzhiyun if (!skb)
1302*4882a593Smuzhiyun goto wait_for_space;
1303*4882a593Smuzhiyun
1304*4882a593Smuzhiyun process_backlog++;
1305*4882a593Smuzhiyun skb->ip_summed = CHECKSUM_PARTIAL;
1306*4882a593Smuzhiyun
1307*4882a593Smuzhiyun skb_entail(sk, skb);
1308*4882a593Smuzhiyun copy = size_goal;
1309*4882a593Smuzhiyun
1310*4882a593Smuzhiyun /* All packets are restored as if they have
1311*4882a593Smuzhiyun * already been sent. skb_mstamp_ns isn't set to
1312*4882a593Smuzhiyun * avoid wrong rtt estimation.
1313*4882a593Smuzhiyun */
1314*4882a593Smuzhiyun if (tp->repair)
1315*4882a593Smuzhiyun TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1316*4882a593Smuzhiyun }
1317*4882a593Smuzhiyun
1318*4882a593Smuzhiyun /* Try to append data to the end of skb. */
1319*4882a593Smuzhiyun if (copy > msg_data_left(msg))
1320*4882a593Smuzhiyun copy = msg_data_left(msg);
1321*4882a593Smuzhiyun
1322*4882a593Smuzhiyun /* Where to copy to? */
1323*4882a593Smuzhiyun if (skb_availroom(skb) > 0 && !zc) {
1324*4882a593Smuzhiyun /* We have some space in skb head. Superb! */
1325*4882a593Smuzhiyun copy = min_t(int, copy, skb_availroom(skb));
1326*4882a593Smuzhiyun err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1327*4882a593Smuzhiyun if (err)
1328*4882a593Smuzhiyun goto do_fault;
1329*4882a593Smuzhiyun } else if (!zc) {
1330*4882a593Smuzhiyun bool merge = true;
1331*4882a593Smuzhiyun int i = skb_shinfo(skb)->nr_frags;
1332*4882a593Smuzhiyun struct page_frag *pfrag = sk_page_frag(sk);
1333*4882a593Smuzhiyun
1334*4882a593Smuzhiyun if (!sk_page_frag_refill(sk, pfrag))
1335*4882a593Smuzhiyun goto wait_for_space;
1336*4882a593Smuzhiyun
1337*4882a593Smuzhiyun if (!skb_can_coalesce(skb, i, pfrag->page,
1338*4882a593Smuzhiyun pfrag->offset)) {
1339*4882a593Smuzhiyun if (i >= sysctl_max_skb_frags) {
1340*4882a593Smuzhiyun tcp_mark_push(tp, skb);
1341*4882a593Smuzhiyun goto new_segment;
1342*4882a593Smuzhiyun }
1343*4882a593Smuzhiyun merge = false;
1344*4882a593Smuzhiyun }
1345*4882a593Smuzhiyun
1346*4882a593Smuzhiyun copy = min_t(int, copy, pfrag->size - pfrag->offset);
1347*4882a593Smuzhiyun
1348*4882a593Smuzhiyun if (!sk_wmem_schedule(sk, copy))
1349*4882a593Smuzhiyun goto wait_for_space;
1350*4882a593Smuzhiyun
1351*4882a593Smuzhiyun err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1352*4882a593Smuzhiyun pfrag->page,
1353*4882a593Smuzhiyun pfrag->offset,
1354*4882a593Smuzhiyun copy);
1355*4882a593Smuzhiyun if (err)
1356*4882a593Smuzhiyun goto do_error;
1357*4882a593Smuzhiyun
1358*4882a593Smuzhiyun /* Update the skb. */
1359*4882a593Smuzhiyun if (merge) {
1360*4882a593Smuzhiyun skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1361*4882a593Smuzhiyun } else {
1362*4882a593Smuzhiyun skb_fill_page_desc(skb, i, pfrag->page,
1363*4882a593Smuzhiyun pfrag->offset, copy);
1364*4882a593Smuzhiyun page_ref_inc(pfrag->page);
1365*4882a593Smuzhiyun }
1366*4882a593Smuzhiyun pfrag->offset += copy;
1367*4882a593Smuzhiyun } else {
1368*4882a593Smuzhiyun if (!sk_wmem_schedule(sk, copy))
1369*4882a593Smuzhiyun goto wait_for_space;
1370*4882a593Smuzhiyun
1371*4882a593Smuzhiyun err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
1372*4882a593Smuzhiyun if (err == -EMSGSIZE || err == -EEXIST) {
1373*4882a593Smuzhiyun tcp_mark_push(tp, skb);
1374*4882a593Smuzhiyun goto new_segment;
1375*4882a593Smuzhiyun }
1376*4882a593Smuzhiyun if (err < 0)
1377*4882a593Smuzhiyun goto do_error;
1378*4882a593Smuzhiyun copy = err;
1379*4882a593Smuzhiyun }
1380*4882a593Smuzhiyun
1381*4882a593Smuzhiyun if (!copied)
1382*4882a593Smuzhiyun TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1383*4882a593Smuzhiyun
1384*4882a593Smuzhiyun WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1385*4882a593Smuzhiyun TCP_SKB_CB(skb)->end_seq += copy;
1386*4882a593Smuzhiyun tcp_skb_pcount_set(skb, 0);
1387*4882a593Smuzhiyun
1388*4882a593Smuzhiyun copied += copy;
1389*4882a593Smuzhiyun if (!msg_data_left(msg)) {
1390*4882a593Smuzhiyun if (unlikely(flags & MSG_EOR))
1391*4882a593Smuzhiyun TCP_SKB_CB(skb)->eor = 1;
1392*4882a593Smuzhiyun goto out;
1393*4882a593Smuzhiyun }
1394*4882a593Smuzhiyun
1395*4882a593Smuzhiyun if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
1396*4882a593Smuzhiyun continue;
1397*4882a593Smuzhiyun
1398*4882a593Smuzhiyun if (forced_push(tp)) {
1399*4882a593Smuzhiyun tcp_mark_push(tp, skb);
1400*4882a593Smuzhiyun __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1401*4882a593Smuzhiyun } else if (skb == tcp_send_head(sk))
1402*4882a593Smuzhiyun tcp_push_one(sk, mss_now);
1403*4882a593Smuzhiyun continue;
1404*4882a593Smuzhiyun
1405*4882a593Smuzhiyun wait_for_space:
1406*4882a593Smuzhiyun set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1407*4882a593Smuzhiyun if (copied)
1408*4882a593Smuzhiyun tcp_push(sk, flags & ~MSG_MORE, mss_now,
1409*4882a593Smuzhiyun TCP_NAGLE_PUSH, size_goal);
1410*4882a593Smuzhiyun
1411*4882a593Smuzhiyun err = sk_stream_wait_memory(sk, &timeo);
1412*4882a593Smuzhiyun if (err != 0)
1413*4882a593Smuzhiyun goto do_error;
1414*4882a593Smuzhiyun
1415*4882a593Smuzhiyun mss_now = tcp_send_mss(sk, &size_goal, flags);
1416*4882a593Smuzhiyun }
1417*4882a593Smuzhiyun
1418*4882a593Smuzhiyun out:
1419*4882a593Smuzhiyun if (copied) {
1420*4882a593Smuzhiyun tcp_tx_timestamp(sk, sockc.tsflags);
1421*4882a593Smuzhiyun tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1422*4882a593Smuzhiyun }
1423*4882a593Smuzhiyun out_nopush:
1424*4882a593Smuzhiyun sock_zerocopy_put(uarg);
1425*4882a593Smuzhiyun return copied + copied_syn;
1426*4882a593Smuzhiyun
1427*4882a593Smuzhiyun do_error:
1428*4882a593Smuzhiyun skb = tcp_write_queue_tail(sk);
1429*4882a593Smuzhiyun do_fault:
1430*4882a593Smuzhiyun tcp_remove_empty_skb(sk, skb);
1431*4882a593Smuzhiyun
1432*4882a593Smuzhiyun if (copied + copied_syn)
1433*4882a593Smuzhiyun goto out;
1434*4882a593Smuzhiyun out_err:
1435*4882a593Smuzhiyun sock_zerocopy_put_abort(uarg, true);
1436*4882a593Smuzhiyun err = sk_stream_error(sk, flags, err);
1437*4882a593Smuzhiyun /* make sure we wake any epoll edge trigger waiter */
1438*4882a593Smuzhiyun if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1439*4882a593Smuzhiyun sk->sk_write_space(sk);
1440*4882a593Smuzhiyun tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1441*4882a593Smuzhiyun }
1442*4882a593Smuzhiyun return err;
1443*4882a593Smuzhiyun }
1444*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
1445*4882a593Smuzhiyun
1446*4882a593Smuzhiyun int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1447*4882a593Smuzhiyun {
1448*4882a593Smuzhiyun int ret;
1449*4882a593Smuzhiyun
1450*4882a593Smuzhiyun lock_sock(sk);
1451*4882a593Smuzhiyun ret = tcp_sendmsg_locked(sk, msg, size);
1452*4882a593Smuzhiyun release_sock(sk);
1453*4882a593Smuzhiyun
1454*4882a593Smuzhiyun return ret;
1455*4882a593Smuzhiyun }
1456*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_sendmsg);
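
/*
 * Illustrative only (not part of this file): the MSG_ZEROCOPY branch in
 * tcp_sendmsg_locked() pins user pages instead of copying them and reports
 * completion through the socket error queue.  A hedged user-space sketch
 * assuming a libc that exposes SO_ZEROCOPY and MSG_ZEROCOPY; the helper
 * name is hypothetical.  The caller must later drain completions with
 * recvmsg(fd, ..., MSG_ERRQUEUE) before reusing or freeing 'buf'.
 */
#if 0
#include <sys/socket.h>

static ssize_t send_zerocopy(int fd, const void *buf, size_t len)
{
	int one = 1;

	/* SOCK_ZEROCOPY must be enabled once per socket. */
	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
		return -1;
	return send(fd, buf, len, MSG_ZEROCOPY);
}
#endif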
1457*4882a593Smuzhiyun
1458*4882a593Smuzhiyun /*
1459*4882a593Smuzhiyun * Handle reading urgent data. BSD has very simple semantics for
1460*4882a593Smuzhiyun * this, no blocking and very strange errors 8)
1461*4882a593Smuzhiyun */
1462*4882a593Smuzhiyun
1463*4882a593Smuzhiyun static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1464*4882a593Smuzhiyun {
1465*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
1466*4882a593Smuzhiyun
1467*4882a593Smuzhiyun /* No URG data to read. */
1468*4882a593Smuzhiyun if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1469*4882a593Smuzhiyun tp->urg_data == TCP_URG_READ)
1470*4882a593Smuzhiyun return -EINVAL; /* Yes this is right ! */
1471*4882a593Smuzhiyun
1472*4882a593Smuzhiyun if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1473*4882a593Smuzhiyun return -ENOTCONN;
1474*4882a593Smuzhiyun
1475*4882a593Smuzhiyun if (tp->urg_data & TCP_URG_VALID) {
1476*4882a593Smuzhiyun int err = 0;
1477*4882a593Smuzhiyun char c = tp->urg_data;
1478*4882a593Smuzhiyun
1479*4882a593Smuzhiyun if (!(flags & MSG_PEEK))
1480*4882a593Smuzhiyun tp->urg_data = TCP_URG_READ;
1481*4882a593Smuzhiyun
1482*4882a593Smuzhiyun /* Read urgent data. */
1483*4882a593Smuzhiyun msg->msg_flags |= MSG_OOB;
1484*4882a593Smuzhiyun
1485*4882a593Smuzhiyun if (len > 0) {
1486*4882a593Smuzhiyun if (!(flags & MSG_TRUNC))
1487*4882a593Smuzhiyun err = memcpy_to_msg(msg, &c, 1);
1488*4882a593Smuzhiyun len = 1;
1489*4882a593Smuzhiyun } else
1490*4882a593Smuzhiyun msg->msg_flags |= MSG_TRUNC;
1491*4882a593Smuzhiyun
1492*4882a593Smuzhiyun return err ? -EFAULT : len;
1493*4882a593Smuzhiyun }
1494*4882a593Smuzhiyun
1495*4882a593Smuzhiyun if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1496*4882a593Smuzhiyun return 0;
1497*4882a593Smuzhiyun
1498*4882a593Smuzhiyun /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1499*4882a593Smuzhiyun * the available implementations agree in this case:
1500*4882a593Smuzhiyun * this call should never block, independent of the
1501*4882a593Smuzhiyun * blocking state of the socket.
1502*4882a593Smuzhiyun * Mike <pall@rz.uni-karlsruhe.de>
1503*4882a593Smuzhiyun */
1504*4882a593Smuzhiyun return -EAGAIN;
1505*4882a593Smuzhiyun }
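
/*
 * Illustrative only (not part of this file): a hedged user-space sketch of
 * the semantics described above - read in-band data up to the urgent mark,
 * then fetch the single urgent byte with MSG_OOB, which never blocks.
 * The helper name and scratch buffer size are hypothetical.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/socket.h>

static int read_up_to_mark(int fd, char *oob)
{
	char scratch[512];
	int at_mark = 0;

	while (!at_mark) {
		if (ioctl(fd, SIOCATMARK, &at_mark) < 0)
			return -1;
		if (!at_mark && recv(fd, scratch, sizeof(scratch), 0) <= 0)
			return -1;
	}
	/* Fails with EAGAIN/EINVAL if no urgent byte is pending. */
	return recv(fd, oob, 1, MSG_OOB);
}
#endif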
1506*4882a593Smuzhiyun
1507*4882a593Smuzhiyun static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1508*4882a593Smuzhiyun {
1509*4882a593Smuzhiyun struct sk_buff *skb;
1510*4882a593Smuzhiyun int copied = 0, err = 0;
1511*4882a593Smuzhiyun
1512*4882a593Smuzhiyun /* XXX -- need to support SO_PEEK_OFF */
1513*4882a593Smuzhiyun
1514*4882a593Smuzhiyun skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1515*4882a593Smuzhiyun err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1516*4882a593Smuzhiyun if (err)
1517*4882a593Smuzhiyun return err;
1518*4882a593Smuzhiyun copied += skb->len;
1519*4882a593Smuzhiyun }
1520*4882a593Smuzhiyun
1521*4882a593Smuzhiyun skb_queue_walk(&sk->sk_write_queue, skb) {
1522*4882a593Smuzhiyun err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1523*4882a593Smuzhiyun if (err)
1524*4882a593Smuzhiyun break;
1525*4882a593Smuzhiyun
1526*4882a593Smuzhiyun copied += skb->len;
1527*4882a593Smuzhiyun }
1528*4882a593Smuzhiyun
1529*4882a593Smuzhiyun return err ?: copied;
1530*4882a593Smuzhiyun }
1531*4882a593Smuzhiyun
1532*4882a593Smuzhiyun /* Clean up the receive buffer for full frames taken by the user,
1533*4882a593Smuzhiyun * then send an ACK if necessary. COPIED is the number of bytes
1534*4882a593Smuzhiyun * tcp_recvmsg has given to the user so far, it speeds up the
1535*4882a593Smuzhiyun * calculation of whether or not we must ACK for the sake of
1536*4882a593Smuzhiyun * a window update.
1537*4882a593Smuzhiyun */
1538*4882a593Smuzhiyun void tcp_cleanup_rbuf(struct sock *sk, int copied)
1539*4882a593Smuzhiyun {
1540*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
1541*4882a593Smuzhiyun bool time_to_ack = false;
1542*4882a593Smuzhiyun
1543*4882a593Smuzhiyun struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1544*4882a593Smuzhiyun
1545*4882a593Smuzhiyun WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1546*4882a593Smuzhiyun "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1547*4882a593Smuzhiyun tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1548*4882a593Smuzhiyun
1549*4882a593Smuzhiyun if (inet_csk_ack_scheduled(sk)) {
1550*4882a593Smuzhiyun const struct inet_connection_sock *icsk = inet_csk(sk);
1551*4882a593Smuzhiyun
1552*4882a593Smuzhiyun if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
1553*4882a593Smuzhiyun tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1554*4882a593Smuzhiyun /*
1555*4882a593Smuzhiyun * If this read emptied the read buffer, we send an ACK when
1556*4882a593Smuzhiyun * the connection is not bidirectional, the user has drained
1557*4882a593Smuzhiyun * the receive buffer, and there was a small segment
1558*4882a593Smuzhiyun * in the queue.
1559*4882a593Smuzhiyun */
1560*4882a593Smuzhiyun (copied > 0 &&
1561*4882a593Smuzhiyun ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1562*4882a593Smuzhiyun ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1563*4882a593Smuzhiyun !inet_csk_in_pingpong_mode(sk))) &&
1564*4882a593Smuzhiyun !atomic_read(&sk->sk_rmem_alloc)))
1565*4882a593Smuzhiyun time_to_ack = true;
1566*4882a593Smuzhiyun }
1567*4882a593Smuzhiyun
1568*4882a593Smuzhiyun /* We send an ACK if we can now advertise a non-zero window
1569*4882a593Smuzhiyun * which has been raised "significantly".
1570*4882a593Smuzhiyun *
1571*4882a593Smuzhiyun * Even if the window is raised up to infinity, do not send a window-opening
1572*4882a593Smuzhiyun * ACK in states where we will not receive any more data. It is useless.
1573*4882a593Smuzhiyun */
1574*4882a593Smuzhiyun if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1575*4882a593Smuzhiyun __u32 rcv_window_now = tcp_receive_window(tp);
1576*4882a593Smuzhiyun
1577*4882a593Smuzhiyun /* Optimize, __tcp_select_window() is not cheap. */
1578*4882a593Smuzhiyun if (2*rcv_window_now <= tp->window_clamp) {
1579*4882a593Smuzhiyun __u32 new_window = __tcp_select_window(sk);
1580*4882a593Smuzhiyun
1581*4882a593Smuzhiyun /* Send an ACK now if this read freed lots of space
1582*4882a593Smuzhiyun * in our buffer. new_window is the window we could advertise
1583*4882a593Smuzhiyun * now; do so only if it is not less than the current one.
1584*4882a593Smuzhiyun * "Lots" means "at least twice" here.
1585*4882a593Smuzhiyun */
1586*4882a593Smuzhiyun if (new_window && new_window >= 2 * rcv_window_now)
1587*4882a593Smuzhiyun time_to_ack = true;
1588*4882a593Smuzhiyun }
1589*4882a593Smuzhiyun }
1590*4882a593Smuzhiyun if (time_to_ack)
1591*4882a593Smuzhiyun tcp_send_ack(sk);
1592*4882a593Smuzhiyun }
1593*4882a593Smuzhiyun
1594*4882a593Smuzhiyun static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1595*4882a593Smuzhiyun {
1596*4882a593Smuzhiyun struct sk_buff *skb;
1597*4882a593Smuzhiyun u32 offset;
1598*4882a593Smuzhiyun
1599*4882a593Smuzhiyun while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1600*4882a593Smuzhiyun offset = seq - TCP_SKB_CB(skb)->seq;
1601*4882a593Smuzhiyun if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1602*4882a593Smuzhiyun pr_err_once("%s: found a SYN, please report !\n", __func__);
1603*4882a593Smuzhiyun offset--;
1604*4882a593Smuzhiyun }
1605*4882a593Smuzhiyun if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1606*4882a593Smuzhiyun *off = offset;
1607*4882a593Smuzhiyun return skb;
1608*4882a593Smuzhiyun }
1609*4882a593Smuzhiyun /* This looks weird, but this can happen if TCP collapsing
1610*4882a593Smuzhiyun * split a fat GRO packet while we released the socket lock
1611*4882a593Smuzhiyun * in skb_splice_bits()
1612*4882a593Smuzhiyun */
1613*4882a593Smuzhiyun sk_eat_skb(sk, skb);
1614*4882a593Smuzhiyun }
1615*4882a593Smuzhiyun return NULL;
1616*4882a593Smuzhiyun }
1617*4882a593Smuzhiyun
1618*4882a593Smuzhiyun /*
1619*4882a593Smuzhiyun * This routine provides an alternative to tcp_recvmsg() for routines
1620*4882a593Smuzhiyun * that would like to handle copying from skbuffs directly in 'sendfile'
1621*4882a593Smuzhiyun * fashion.
1622*4882a593Smuzhiyun * Note:
1623*4882a593Smuzhiyun * - It is assumed that the socket was locked by the caller.
1624*4882a593Smuzhiyun * - The routine does not block.
1625*4882a593Smuzhiyun * - At present, there is no support for reading OOB data
1626*4882a593Smuzhiyun * or for 'peeking' the socket using this routine
1627*4882a593Smuzhiyun * (although both would be easy to implement).
1628*4882a593Smuzhiyun */
1629*4882a593Smuzhiyun int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1630*4882a593Smuzhiyun sk_read_actor_t recv_actor)
1631*4882a593Smuzhiyun {
1632*4882a593Smuzhiyun struct sk_buff *skb;
1633*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
1634*4882a593Smuzhiyun u32 seq = tp->copied_seq;
1635*4882a593Smuzhiyun u32 offset;
1636*4882a593Smuzhiyun int copied = 0;
1637*4882a593Smuzhiyun
1638*4882a593Smuzhiyun if (sk->sk_state == TCP_LISTEN)
1639*4882a593Smuzhiyun return -ENOTCONN;
1640*4882a593Smuzhiyun while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1641*4882a593Smuzhiyun if (offset < skb->len) {
1642*4882a593Smuzhiyun int used;
1643*4882a593Smuzhiyun size_t len;
1644*4882a593Smuzhiyun
1645*4882a593Smuzhiyun len = skb->len - offset;
1646*4882a593Smuzhiyun /* Stop reading if we hit a patch of urgent data */
1647*4882a593Smuzhiyun if (tp->urg_data) {
1648*4882a593Smuzhiyun u32 urg_offset = tp->urg_seq - seq;
1649*4882a593Smuzhiyun if (urg_offset < len)
1650*4882a593Smuzhiyun len = urg_offset;
1651*4882a593Smuzhiyun if (!len)
1652*4882a593Smuzhiyun break;
1653*4882a593Smuzhiyun }
1654*4882a593Smuzhiyun used = recv_actor(desc, skb, offset, len);
1655*4882a593Smuzhiyun if (used <= 0) {
1656*4882a593Smuzhiyun if (!copied)
1657*4882a593Smuzhiyun copied = used;
1658*4882a593Smuzhiyun break;
1659*4882a593Smuzhiyun }
1660*4882a593Smuzhiyun if (WARN_ON_ONCE(used > len))
1661*4882a593Smuzhiyun used = len;
1662*4882a593Smuzhiyun seq += used;
1663*4882a593Smuzhiyun copied += used;
1664*4882a593Smuzhiyun offset += used;
1665*4882a593Smuzhiyun
1666*4882a593Smuzhiyun /* If recv_actor drops the lock (e.g. TCP splice
1667*4882a593Smuzhiyun * receive) the skb pointer might be invalid when
1668*4882a593Smuzhiyun * getting here: tcp_collapse might have deleted it
1669*4882a593Smuzhiyun * while aggregating skbs from the socket queue.
1670*4882a593Smuzhiyun */
1671*4882a593Smuzhiyun skb = tcp_recv_skb(sk, seq - 1, &offset);
1672*4882a593Smuzhiyun if (!skb)
1673*4882a593Smuzhiyun break;
1674*4882a593Smuzhiyun /* TCP coalescing might have appended data to the skb.
1675*4882a593Smuzhiyun * Try to splice more frags
1676*4882a593Smuzhiyun */
1677*4882a593Smuzhiyun if (offset + 1 != skb->len)
1678*4882a593Smuzhiyun continue;
1679*4882a593Smuzhiyun }
1680*4882a593Smuzhiyun if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1681*4882a593Smuzhiyun sk_eat_skb(sk, skb);
1682*4882a593Smuzhiyun ++seq;
1683*4882a593Smuzhiyun break;
1684*4882a593Smuzhiyun }
1685*4882a593Smuzhiyun sk_eat_skb(sk, skb);
1686*4882a593Smuzhiyun if (!desc->count)
1687*4882a593Smuzhiyun break;
1688*4882a593Smuzhiyun WRITE_ONCE(tp->copied_seq, seq);
1689*4882a593Smuzhiyun }
1690*4882a593Smuzhiyun WRITE_ONCE(tp->copied_seq, seq);
1691*4882a593Smuzhiyun
1692*4882a593Smuzhiyun tcp_rcv_space_adjust(sk);
1693*4882a593Smuzhiyun
1694*4882a593Smuzhiyun /* Clean up data we have read: This will do ACK frames. */
1695*4882a593Smuzhiyun if (copied > 0) {
1696*4882a593Smuzhiyun tcp_recv_skb(sk, seq, &offset);
1697*4882a593Smuzhiyun tcp_cleanup_rbuf(sk, copied);
1698*4882a593Smuzhiyun }
1699*4882a593Smuzhiyun return copied;
1700*4882a593Smuzhiyun }
1701*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_read_sock);
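
/*
 * Illustrative only (not part of this file): tcp_read_sock() is the engine
 * behind tcp_splice_read(), so splice(2) from a TCP socket moves queued
 * payload into a pipe without copying it through user memory.  A hedged
 * user-space sketch; the helper name is hypothetical.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* Relay up to 'len' bytes from TCP socket 'sock' to descriptor 'out'. */
static ssize_t splice_from_tcp(int sock, int out, size_t len)
{
	int pipefd[2];
	ssize_t done = 0;

	if (pipe(pipefd) < 0)
		return -1;
	while ((size_t)done < len) {
		ssize_t in_pipe = splice(sock, NULL, pipefd[1], NULL,
					 len - done, SPLICE_F_MOVE);

		if (in_pipe <= 0)
			break;
		while (in_pipe > 0) {
			ssize_t n = splice(pipefd[0], NULL, out, NULL,
					   in_pipe, SPLICE_F_MOVE);

			if (n <= 0)
				goto out;
			in_pipe -= n;
			done += n;
		}
	}
out:
	close(pipefd[0]);
	close(pipefd[1]);
	return done;
}
#endif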
1702*4882a593Smuzhiyun
1703*4882a593Smuzhiyun int tcp_peek_len(struct socket *sock)
1704*4882a593Smuzhiyun {
1705*4882a593Smuzhiyun return tcp_inq(sock->sk);
1706*4882a593Smuzhiyun }
1707*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_peek_len);
1708*4882a593Smuzhiyun
1709*4882a593Smuzhiyun /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
1710*4882a593Smuzhiyun int tcp_set_rcvlowat(struct sock *sk, int val)
1711*4882a593Smuzhiyun {
1712*4882a593Smuzhiyun int cap;
1713*4882a593Smuzhiyun
1714*4882a593Smuzhiyun if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1715*4882a593Smuzhiyun cap = sk->sk_rcvbuf >> 1;
1716*4882a593Smuzhiyun else
1717*4882a593Smuzhiyun cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
1718*4882a593Smuzhiyun val = min(val, cap);
1719*4882a593Smuzhiyun WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1720*4882a593Smuzhiyun
1721*4882a593Smuzhiyun /* Check if we need to signal EPOLLIN right now */
1722*4882a593Smuzhiyun tcp_data_ready(sk);
1723*4882a593Smuzhiyun
1724*4882a593Smuzhiyun if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1725*4882a593Smuzhiyun return 0;
1726*4882a593Smuzhiyun
1727*4882a593Smuzhiyun val <<= 1;
1728*4882a593Smuzhiyun if (val > sk->sk_rcvbuf) {
1729*4882a593Smuzhiyun WRITE_ONCE(sk->sk_rcvbuf, val);
1730*4882a593Smuzhiyun tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1731*4882a593Smuzhiyun }
1732*4882a593Smuzhiyun return 0;
1733*4882a593Smuzhiyun }
1734*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_set_rcvlowat);
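
/*
 * Illustrative only (not part of this file): tcp_set_rcvlowat() backs
 * setsockopt(SO_RCVLOWAT) and, unless SO_RCVBUF was locked by the user,
 * also grows the receive buffer so poll()/epoll only reports readability
 * once the requested amount is queued.  A minimal user-space sketch; the
 * helper name is hypothetical.
 */
#if 0
#include <sys/socket.h>

static int set_rcv_low_watermark(int fd, int bytes)
{
	/* The kernel caps the value at half of the receive buffer limit. */
	return setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &bytes, sizeof(bytes));
}
#endif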
1735*4882a593Smuzhiyun
1736*4882a593Smuzhiyun #ifdef CONFIG_MMU
1737*4882a593Smuzhiyun static const struct vm_operations_struct tcp_vm_ops = {
1738*4882a593Smuzhiyun };
1739*4882a593Smuzhiyun
1740*4882a593Smuzhiyun int tcp_mmap(struct file *file, struct socket *sock,
1741*4882a593Smuzhiyun struct vm_area_struct *vma)
1742*4882a593Smuzhiyun {
1743*4882a593Smuzhiyun if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1744*4882a593Smuzhiyun return -EPERM;
1745*4882a593Smuzhiyun vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
1746*4882a593Smuzhiyun
1747*4882a593Smuzhiyun /* Instruct vm_insert_page() to not mmap_read_lock(mm) */
1748*4882a593Smuzhiyun vma->vm_flags |= VM_MIXEDMAP;
1749*4882a593Smuzhiyun
1750*4882a593Smuzhiyun vma->vm_ops = &tcp_vm_ops;
1751*4882a593Smuzhiyun return 0;
1752*4882a593Smuzhiyun }
1753*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_mmap);
1754*4882a593Smuzhiyun
1755*4882a593Smuzhiyun static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
1756*4882a593Smuzhiyun u32 *offset_frag)
1757*4882a593Smuzhiyun {
1758*4882a593Smuzhiyun skb_frag_t *frag;
1759*4882a593Smuzhiyun
1760*4882a593Smuzhiyun if (unlikely(offset_skb >= skb->len))
1761*4882a593Smuzhiyun return NULL;
1762*4882a593Smuzhiyun
1763*4882a593Smuzhiyun offset_skb -= skb_headlen(skb);
1764*4882a593Smuzhiyun if ((int)offset_skb < 0 || skb_has_frag_list(skb))
1765*4882a593Smuzhiyun return NULL;
1766*4882a593Smuzhiyun
1767*4882a593Smuzhiyun frag = skb_shinfo(skb)->frags;
1768*4882a593Smuzhiyun while (offset_skb) {
1769*4882a593Smuzhiyun if (skb_frag_size(frag) > offset_skb) {
1770*4882a593Smuzhiyun *offset_frag = offset_skb;
1771*4882a593Smuzhiyun return frag;
1772*4882a593Smuzhiyun }
1773*4882a593Smuzhiyun offset_skb -= skb_frag_size(frag);
1774*4882a593Smuzhiyun ++frag;
1775*4882a593Smuzhiyun }
1776*4882a593Smuzhiyun *offset_frag = 0;
1777*4882a593Smuzhiyun return frag;
1778*4882a593Smuzhiyun }
1779*4882a593Smuzhiyun
1780*4882a593Smuzhiyun static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
1781*4882a593Smuzhiyun struct sk_buff *skb, u32 copylen,
1782*4882a593Smuzhiyun u32 *offset, u32 *seq)
1783*4882a593Smuzhiyun {
1784*4882a593Smuzhiyun unsigned long copy_address = (unsigned long)zc->copybuf_address;
1785*4882a593Smuzhiyun struct msghdr msg = {};
1786*4882a593Smuzhiyun struct iovec iov;
1787*4882a593Smuzhiyun int err;
1788*4882a593Smuzhiyun
1789*4882a593Smuzhiyun if (copy_address != zc->copybuf_address)
1790*4882a593Smuzhiyun return -EINVAL;
1791*4882a593Smuzhiyun
1792*4882a593Smuzhiyun err = import_single_range(READ, (void __user *)copy_address,
1793*4882a593Smuzhiyun copylen, &iov, &msg.msg_iter);
1794*4882a593Smuzhiyun if (err)
1795*4882a593Smuzhiyun return err;
1796*4882a593Smuzhiyun err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
1797*4882a593Smuzhiyun if (err)
1798*4882a593Smuzhiyun return err;
1799*4882a593Smuzhiyun zc->recv_skip_hint -= copylen;
1800*4882a593Smuzhiyun *offset += copylen;
1801*4882a593Smuzhiyun *seq += copylen;
1802*4882a593Smuzhiyun return (__s32)copylen;
1803*4882a593Smuzhiyun }
1804*4882a593Smuzhiyun
1805*4882a593Smuzhiyun static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
1806*4882a593Smuzhiyun struct sock *sk,
1807*4882a593Smuzhiyun struct sk_buff *skb,
1808*4882a593Smuzhiyun u32 *seq,
1809*4882a593Smuzhiyun s32 copybuf_len)
1810*4882a593Smuzhiyun {
1811*4882a593Smuzhiyun u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
1812*4882a593Smuzhiyun
1813*4882a593Smuzhiyun if (!copylen)
1814*4882a593Smuzhiyun return 0;
1815*4882a593Smuzhiyun /* skb is null if inq < PAGE_SIZE. */
1816*4882a593Smuzhiyun if (skb)
1817*4882a593Smuzhiyun offset = *seq - TCP_SKB_CB(skb)->seq;
1818*4882a593Smuzhiyun else
1819*4882a593Smuzhiyun skb = tcp_recv_skb(sk, *seq, &offset);
1820*4882a593Smuzhiyun
1821*4882a593Smuzhiyun zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
1822*4882a593Smuzhiyun seq);
1823*4882a593Smuzhiyun return zc->copybuf_len < 0 ? 0 : copylen;
1824*4882a593Smuzhiyun }
1825*4882a593Smuzhiyun
1826*4882a593Smuzhiyun static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
1827*4882a593Smuzhiyun struct page **pages,
1828*4882a593Smuzhiyun unsigned long pages_to_map,
1829*4882a593Smuzhiyun unsigned long *insert_addr,
1830*4882a593Smuzhiyun u32 *length_with_pending,
1831*4882a593Smuzhiyun u32 *seq,
1832*4882a593Smuzhiyun struct tcp_zerocopy_receive *zc)
1833*4882a593Smuzhiyun {
1834*4882a593Smuzhiyun unsigned long pages_remaining = pages_to_map;
1835*4882a593Smuzhiyun int bytes_mapped;
1836*4882a593Smuzhiyun int ret;
1837*4882a593Smuzhiyun
1838*4882a593Smuzhiyun ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
1839*4882a593Smuzhiyun bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
1840*4882a593Smuzhiyun /* Even if vm_insert_pages fails, it may have partially succeeded in
1841*4882a593Smuzhiyun * mapping (some but not all of the pages).
1842*4882a593Smuzhiyun */
1843*4882a593Smuzhiyun *seq += bytes_mapped;
1844*4882a593Smuzhiyun *insert_addr += bytes_mapped;
1845*4882a593Smuzhiyun if (ret) {
1846*4882a593Smuzhiyun /* But if vm_insert_pages did fail, we have to unroll some state
1847*4882a593Smuzhiyun * we speculatively touched before.
1848*4882a593Smuzhiyun */
1849*4882a593Smuzhiyun const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
1850*4882a593Smuzhiyun *length_with_pending -= bytes_not_mapped;
1851*4882a593Smuzhiyun zc->recv_skip_hint += bytes_not_mapped;
1852*4882a593Smuzhiyun }
1853*4882a593Smuzhiyun return ret;
1854*4882a593Smuzhiyun }
1855*4882a593Smuzhiyun
1856*4882a593Smuzhiyun static int tcp_zerocopy_receive(struct sock *sk,
1857*4882a593Smuzhiyun struct tcp_zerocopy_receive *zc)
1858*4882a593Smuzhiyun {
1859*4882a593Smuzhiyun u32 length = 0, offset, vma_len, avail_len, aligned_len, copylen = 0;
1860*4882a593Smuzhiyun unsigned long address = (unsigned long)zc->address;
1861*4882a593Smuzhiyun s32 copybuf_len = zc->copybuf_len;
1862*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
1863*4882a593Smuzhiyun #define PAGE_BATCH_SIZE 8
1864*4882a593Smuzhiyun struct page *pages[PAGE_BATCH_SIZE];
1865*4882a593Smuzhiyun const skb_frag_t *frags = NULL;
1866*4882a593Smuzhiyun struct vm_area_struct *vma;
1867*4882a593Smuzhiyun struct sk_buff *skb = NULL;
1868*4882a593Smuzhiyun unsigned long pg_idx = 0;
1869*4882a593Smuzhiyun unsigned long curr_addr;
1870*4882a593Smuzhiyun u32 seq = tp->copied_seq;
1871*4882a593Smuzhiyun int inq = tcp_inq(sk);
1872*4882a593Smuzhiyun int ret;
1873*4882a593Smuzhiyun
1874*4882a593Smuzhiyun zc->copybuf_len = 0;
1875*4882a593Smuzhiyun
1876*4882a593Smuzhiyun if (address & (PAGE_SIZE - 1) || address != zc->address)
1877*4882a593Smuzhiyun return -EINVAL;
1878*4882a593Smuzhiyun
1879*4882a593Smuzhiyun if (sk->sk_state == TCP_LISTEN)
1880*4882a593Smuzhiyun return -ENOTCONN;
1881*4882a593Smuzhiyun
1882*4882a593Smuzhiyun sock_rps_record_flow(sk);
1883*4882a593Smuzhiyun
1884*4882a593Smuzhiyun mmap_read_lock(current->mm);
1885*4882a593Smuzhiyun
1886*4882a593Smuzhiyun vma = find_vma(current->mm, address);
1887*4882a593Smuzhiyun if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) {
1888*4882a593Smuzhiyun mmap_read_unlock(current->mm);
1889*4882a593Smuzhiyun return -EINVAL;
1890*4882a593Smuzhiyun }
1891*4882a593Smuzhiyun vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
1892*4882a593Smuzhiyun avail_len = min_t(u32, vma_len, inq);
1893*4882a593Smuzhiyun aligned_len = avail_len & ~(PAGE_SIZE - 1);
1894*4882a593Smuzhiyun if (aligned_len) {
1895*4882a593Smuzhiyun zap_page_range(vma, address, aligned_len);
1896*4882a593Smuzhiyun zc->length = aligned_len;
1897*4882a593Smuzhiyun zc->recv_skip_hint = 0;
1898*4882a593Smuzhiyun } else {
1899*4882a593Smuzhiyun zc->length = avail_len;
1900*4882a593Smuzhiyun zc->recv_skip_hint = avail_len;
1901*4882a593Smuzhiyun }
1902*4882a593Smuzhiyun ret = 0;
1903*4882a593Smuzhiyun curr_addr = address;
1904*4882a593Smuzhiyun while (length + PAGE_SIZE <= zc->length) {
1905*4882a593Smuzhiyun if (zc->recv_skip_hint < PAGE_SIZE) {
1906*4882a593Smuzhiyun u32 offset_frag;
1907*4882a593Smuzhiyun
1908*4882a593Smuzhiyun /* If we're here, finish the current batch. */
1909*4882a593Smuzhiyun if (pg_idx) {
1910*4882a593Smuzhiyun ret = tcp_zerocopy_vm_insert_batch(vma, pages,
1911*4882a593Smuzhiyun pg_idx,
1912*4882a593Smuzhiyun &curr_addr,
1913*4882a593Smuzhiyun &length,
1914*4882a593Smuzhiyun &seq, zc);
1915*4882a593Smuzhiyun if (ret)
1916*4882a593Smuzhiyun goto out;
1917*4882a593Smuzhiyun pg_idx = 0;
1918*4882a593Smuzhiyun }
1919*4882a593Smuzhiyun if (skb) {
1920*4882a593Smuzhiyun if (zc->recv_skip_hint > 0)
1921*4882a593Smuzhiyun break;
1922*4882a593Smuzhiyun skb = skb->next;
1923*4882a593Smuzhiyun offset = seq - TCP_SKB_CB(skb)->seq;
1924*4882a593Smuzhiyun } else {
1925*4882a593Smuzhiyun skb = tcp_recv_skb(sk, seq, &offset);
1926*4882a593Smuzhiyun }
1927*4882a593Smuzhiyun zc->recv_skip_hint = skb->len - offset;
1928*4882a593Smuzhiyun frags = skb_advance_to_frag(skb, offset, &offset_frag);
1929*4882a593Smuzhiyun if (!frags || offset_frag)
1930*4882a593Smuzhiyun break;
1931*4882a593Smuzhiyun }
1932*4882a593Smuzhiyun if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
1933*4882a593Smuzhiyun int remaining = zc->recv_skip_hint;
1934*4882a593Smuzhiyun
1935*4882a593Smuzhiyun while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
1936*4882a593Smuzhiyun skb_frag_off(frags))) {
1937*4882a593Smuzhiyun remaining -= skb_frag_size(frags);
1938*4882a593Smuzhiyun frags++;
1939*4882a593Smuzhiyun }
1940*4882a593Smuzhiyun zc->recv_skip_hint -= remaining;
1941*4882a593Smuzhiyun break;
1942*4882a593Smuzhiyun }
1943*4882a593Smuzhiyun pages[pg_idx] = skb_frag_page(frags);
1944*4882a593Smuzhiyun pg_idx++;
1945*4882a593Smuzhiyun length += PAGE_SIZE;
1946*4882a593Smuzhiyun zc->recv_skip_hint -= PAGE_SIZE;
1947*4882a593Smuzhiyun frags++;
1948*4882a593Smuzhiyun if (pg_idx == PAGE_BATCH_SIZE) {
1949*4882a593Smuzhiyun ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
1950*4882a593Smuzhiyun &curr_addr, &length,
1951*4882a593Smuzhiyun &seq, zc);
1952*4882a593Smuzhiyun if (ret)
1953*4882a593Smuzhiyun goto out;
1954*4882a593Smuzhiyun pg_idx = 0;
1955*4882a593Smuzhiyun }
1956*4882a593Smuzhiyun }
1957*4882a593Smuzhiyun if (pg_idx) {
1958*4882a593Smuzhiyun ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
1959*4882a593Smuzhiyun &curr_addr, &length, &seq,
1960*4882a593Smuzhiyun zc);
1961*4882a593Smuzhiyun }
1962*4882a593Smuzhiyun out:
1963*4882a593Smuzhiyun mmap_read_unlock(current->mm);
1964*4882a593Smuzhiyun /* Try to copy straggler data. */
1965*4882a593Smuzhiyun if (!ret)
1966*4882a593Smuzhiyun copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
1967*4882a593Smuzhiyun copybuf_len);
1968*4882a593Smuzhiyun
1969*4882a593Smuzhiyun if (length + copylen) {
1970*4882a593Smuzhiyun WRITE_ONCE(tp->copied_seq, seq);
1971*4882a593Smuzhiyun tcp_rcv_space_adjust(sk);
1972*4882a593Smuzhiyun
1973*4882a593Smuzhiyun /* Clean up data we have read: This will do ACK frames. */
1974*4882a593Smuzhiyun tcp_recv_skb(sk, seq, &offset);
1975*4882a593Smuzhiyun tcp_cleanup_rbuf(sk, length + copylen);
1976*4882a593Smuzhiyun ret = 0;
1977*4882a593Smuzhiyun if (length == zc->length)
1978*4882a593Smuzhiyun zc->recv_skip_hint = 0;
1979*4882a593Smuzhiyun } else {
1980*4882a593Smuzhiyun if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
1981*4882a593Smuzhiyun ret = -EIO;
1982*4882a593Smuzhiyun }
1983*4882a593Smuzhiyun zc->length = length;
1984*4882a593Smuzhiyun return ret;
1985*4882a593Smuzhiyun }
1986*4882a593Smuzhiyun #endif
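
/*
 * Illustrative only (not part of this file): tcp_zerocopy_receive() backs
 * getsockopt(TCP_ZEROCOPY_RECEIVE).  User space mmap()s the socket (see
 * tcp_mmap() above) and the queued payload pages are inserted into that
 * mapping; any sub-page tail can be copied into copybuf_address.  A hedged
 * sketch against this kernel's UAPI; the fields of struct
 * tcp_zerocopy_receive may differ on other kernel versions and the helper
 * name is hypothetical.
 */
#if 0
#include <linux/tcp.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

/* 'addr' must come from mmap(NULL, map_len, PROT_READ, MAP_SHARED, fd, 0).
 * Returns the number of bytes now readable through the mapping, or -1.
 */
static ssize_t tcp_zc_receive(int fd, void *addr, size_t map_len)
{
	struct tcp_zerocopy_receive zc;
	socklen_t zc_len = sizeof(zc);

	memset(&zc, 0, sizeof(zc));
	zc.address = (__u64)(unsigned long)addr;
	zc.length = map_len;
	if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len) < 0)
		return -1;
	/* zc.recv_skip_hint bytes (if any) must still be read with recv(). */
	return zc.length;
}
#endif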
1987*4882a593Smuzhiyun
1988*4882a593Smuzhiyun static void tcp_update_recv_tstamps(struct sk_buff *skb,
1989*4882a593Smuzhiyun struct scm_timestamping_internal *tss)
1990*4882a593Smuzhiyun {
1991*4882a593Smuzhiyun if (skb->tstamp)
1992*4882a593Smuzhiyun tss->ts[0] = ktime_to_timespec64(skb->tstamp);
1993*4882a593Smuzhiyun else
1994*4882a593Smuzhiyun tss->ts[0] = (struct timespec64) {0};
1995*4882a593Smuzhiyun
1996*4882a593Smuzhiyun if (skb_hwtstamps(skb)->hwtstamp)
1997*4882a593Smuzhiyun tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
1998*4882a593Smuzhiyun else
1999*4882a593Smuzhiyun tss->ts[2] = (struct timespec64) {0};
2000*4882a593Smuzhiyun }
2001*4882a593Smuzhiyun
2002*4882a593Smuzhiyun /* Similar to __sock_recv_timestamp, but does not require an skb */
2003*4882a593Smuzhiyun static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
2004*4882a593Smuzhiyun struct scm_timestamping_internal *tss)
2005*4882a593Smuzhiyun {
2006*4882a593Smuzhiyun int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
2007*4882a593Smuzhiyun bool has_timestamping = false;
2008*4882a593Smuzhiyun
2009*4882a593Smuzhiyun if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
2010*4882a593Smuzhiyun if (sock_flag(sk, SOCK_RCVTSTAMP)) {
2011*4882a593Smuzhiyun if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
2012*4882a593Smuzhiyun if (new_tstamp) {
2013*4882a593Smuzhiyun struct __kernel_timespec kts = {
2014*4882a593Smuzhiyun .tv_sec = tss->ts[0].tv_sec,
2015*4882a593Smuzhiyun .tv_nsec = tss->ts[0].tv_nsec,
2016*4882a593Smuzhiyun };
2017*4882a593Smuzhiyun put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
2018*4882a593Smuzhiyun sizeof(kts), &kts);
2019*4882a593Smuzhiyun } else {
2020*4882a593Smuzhiyun struct __kernel_old_timespec ts_old = {
2021*4882a593Smuzhiyun .tv_sec = tss->ts[0].tv_sec,
2022*4882a593Smuzhiyun .tv_nsec = tss->ts[0].tv_nsec,
2023*4882a593Smuzhiyun };
2024*4882a593Smuzhiyun put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
2025*4882a593Smuzhiyun sizeof(ts_old), &ts_old);
2026*4882a593Smuzhiyun }
2027*4882a593Smuzhiyun } else {
2028*4882a593Smuzhiyun if (new_tstamp) {
2029*4882a593Smuzhiyun struct __kernel_sock_timeval stv = {
2030*4882a593Smuzhiyun .tv_sec = tss->ts[0].tv_sec,
2031*4882a593Smuzhiyun .tv_usec = tss->ts[0].tv_nsec / 1000,
2032*4882a593Smuzhiyun };
2033*4882a593Smuzhiyun put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
2034*4882a593Smuzhiyun sizeof(stv), &stv);
2035*4882a593Smuzhiyun } else {
2036*4882a593Smuzhiyun struct __kernel_old_timeval tv = {
2037*4882a593Smuzhiyun .tv_sec = tss->ts[0].tv_sec,
2038*4882a593Smuzhiyun .tv_usec = tss->ts[0].tv_nsec / 1000,
2039*4882a593Smuzhiyun };
2040*4882a593Smuzhiyun put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
2041*4882a593Smuzhiyun sizeof(tv), &tv);
2042*4882a593Smuzhiyun }
2043*4882a593Smuzhiyun }
2044*4882a593Smuzhiyun }
2045*4882a593Smuzhiyun
2046*4882a593Smuzhiyun if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
2047*4882a593Smuzhiyun has_timestamping = true;
2048*4882a593Smuzhiyun else
2049*4882a593Smuzhiyun tss->ts[0] = (struct timespec64) {0};
2050*4882a593Smuzhiyun }
2051*4882a593Smuzhiyun
2052*4882a593Smuzhiyun if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
2053*4882a593Smuzhiyun if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
2054*4882a593Smuzhiyun has_timestamping = true;
2055*4882a593Smuzhiyun else
2056*4882a593Smuzhiyun tss->ts[2] = (struct timespec64) {0};
2057*4882a593Smuzhiyun }
2058*4882a593Smuzhiyun
2059*4882a593Smuzhiyun if (has_timestamping) {
2060*4882a593Smuzhiyun tss->ts[1] = (struct timespec64) {0};
2061*4882a593Smuzhiyun if (sock_flag(sk, SOCK_TSTAMP_NEW))
2062*4882a593Smuzhiyun put_cmsg_scm_timestamping64(msg, tss);
2063*4882a593Smuzhiyun else
2064*4882a593Smuzhiyun put_cmsg_scm_timestamping(msg, tss);
2065*4882a593Smuzhiyun }
2066*4882a593Smuzhiyun }
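
/*
 * Illustrative only (not part of this file): a hedged user-space sketch of
 * consuming the SCM_TIMESTAMPING control message built above, assuming
 * software receive timestamps and the "old" (pre-2038) cmsg layout; the
 * helper name is hypothetical.
 */
#if 0
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <string.h>
#include <sys/socket.h>
#include <time.h>

static ssize_t recv_with_sw_tstamp(int fd, void *buf, size_t len,
				   struct timespec *stamp)
{
	int val = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
	char ctrl[CMSG_SPACE(sizeof(struct scm_timestamping))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cm;
	ssize_t n;

	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)) < 0)
		return -1;
	n = recvmsg(fd, &msg, 0);
	for (cm = CMSG_FIRSTHDR(&msg); n > 0 && cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_TIMESTAMPING) {
			struct scm_timestamping tss;

			memcpy(&tss, CMSG_DATA(cm), sizeof(tss));
			*stamp = tss.ts[0];	/* ts[2] would be the HW stamp */
		}
	}
	return n;
}
#endif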
2067*4882a593Smuzhiyun
2068*4882a593Smuzhiyun static int tcp_inq_hint(struct sock *sk)
2069*4882a593Smuzhiyun {
2070*4882a593Smuzhiyun const struct tcp_sock *tp = tcp_sk(sk);
2071*4882a593Smuzhiyun u32 copied_seq = READ_ONCE(tp->copied_seq);
2072*4882a593Smuzhiyun u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
2073*4882a593Smuzhiyun int inq;
2074*4882a593Smuzhiyun
2075*4882a593Smuzhiyun inq = rcv_nxt - copied_seq;
2076*4882a593Smuzhiyun if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
2077*4882a593Smuzhiyun lock_sock(sk);
2078*4882a593Smuzhiyun inq = tp->rcv_nxt - tp->copied_seq;
2079*4882a593Smuzhiyun release_sock(sk);
2080*4882a593Smuzhiyun }
2081*4882a593Smuzhiyun /* After receiving a FIN, tell the user-space to continue reading
2082*4882a593Smuzhiyun * by returning a non-zero inq.
2083*4882a593Smuzhiyun */
2084*4882a593Smuzhiyun if (inq == 0 && sock_flag(sk, SOCK_DONE))
2085*4882a593Smuzhiyun inq = 1;
2086*4882a593Smuzhiyun return inq;
2087*4882a593Smuzhiyun }
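
/*
 * Illustrative only (not part of this file): with the TCP_INQ socket option
 * enabled, recvmsg() attaches a TCP_CM_INQ control message carrying the
 * value computed by tcp_inq_hint() above.  A hedged sketch assuming a libc
 * whose headers expose TCP_INQ/TCP_CM_INQ (otherwise take them from
 * linux/tcp.h); the helper name is hypothetical.
 */
#if 0
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

static ssize_t recv_with_inq(int fd, void *buf, size_t len, int *inq)
{
	char ctrl[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cm;
	int one = 1;
	ssize_t n;

	if (setsockopt(fd, IPPROTO_TCP, TCP_INQ, &one, sizeof(one)) < 0)
		return -1;
	n = recvmsg(fd, &msg, 0);
	for (cm = CMSG_FIRSTHDR(&msg); n >= 0 && cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == IPPROTO_TCP && cm->cmsg_type == TCP_CM_INQ)
			memcpy(inq, CMSG_DATA(cm), sizeof(*inq));
	}
	return n;
}
#endif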
2088*4882a593Smuzhiyun
2089*4882a593Smuzhiyun /*
2090*4882a593Smuzhiyun * This routine copies from a sock struct into the user buffer.
2091*4882a593Smuzhiyun *
2092*4882a593Smuzhiyun * Technical note: in 2.3 we work on _locked_ socket, so that
2093*4882a593Smuzhiyun * tricks with *seq access order and skb->users are not required.
2094*4882a593Smuzhiyun * Probably, code can be easily improved even more.
2095*4882a593Smuzhiyun */
2096*4882a593Smuzhiyun
2097*4882a593Smuzhiyun int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
2098*4882a593Smuzhiyun int flags, int *addr_len)
2099*4882a593Smuzhiyun {
2100*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
2101*4882a593Smuzhiyun int copied = 0;
2102*4882a593Smuzhiyun u32 peek_seq;
2103*4882a593Smuzhiyun u32 *seq;
2104*4882a593Smuzhiyun unsigned long used;
2105*4882a593Smuzhiyun int err, inq;
2106*4882a593Smuzhiyun int target; /* Read at least this many bytes */
2107*4882a593Smuzhiyun long timeo;
2108*4882a593Smuzhiyun struct sk_buff *skb, *last;
2109*4882a593Smuzhiyun u32 urg_hole = 0;
2110*4882a593Smuzhiyun struct scm_timestamping_internal tss;
2111*4882a593Smuzhiyun int cmsg_flags;
2112*4882a593Smuzhiyun
2113*4882a593Smuzhiyun if (unlikely(flags & MSG_ERRQUEUE))
2114*4882a593Smuzhiyun return inet_recv_error(sk, msg, len, addr_len);
2115*4882a593Smuzhiyun trace_android_rvh_tcp_recvmsg(sk);
2116*4882a593Smuzhiyun
2117*4882a593Smuzhiyun if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
2118*4882a593Smuzhiyun (sk->sk_state == TCP_ESTABLISHED))
2119*4882a593Smuzhiyun sk_busy_loop(sk, nonblock);
2120*4882a593Smuzhiyun
2121*4882a593Smuzhiyun lock_sock(sk);
2122*4882a593Smuzhiyun
2123*4882a593Smuzhiyun err = -ENOTCONN;
2124*4882a593Smuzhiyun if (sk->sk_state == TCP_LISTEN)
2125*4882a593Smuzhiyun goto out;
2126*4882a593Smuzhiyun
2127*4882a593Smuzhiyun cmsg_flags = tp->recvmsg_inq ? 1 : 0;
2128*4882a593Smuzhiyun timeo = sock_rcvtimeo(sk, nonblock);
2129*4882a593Smuzhiyun
2130*4882a593Smuzhiyun /* Urgent data needs to be handled specially. */
2131*4882a593Smuzhiyun if (flags & MSG_OOB)
2132*4882a593Smuzhiyun goto recv_urg;
2133*4882a593Smuzhiyun
2134*4882a593Smuzhiyun if (unlikely(tp->repair)) {
2135*4882a593Smuzhiyun err = -EPERM;
2136*4882a593Smuzhiyun if (!(flags & MSG_PEEK))
2137*4882a593Smuzhiyun goto out;
2138*4882a593Smuzhiyun
2139*4882a593Smuzhiyun if (tp->repair_queue == TCP_SEND_QUEUE)
2140*4882a593Smuzhiyun goto recv_sndq;
2141*4882a593Smuzhiyun
2142*4882a593Smuzhiyun err = -EINVAL;
2143*4882a593Smuzhiyun if (tp->repair_queue == TCP_NO_QUEUE)
2144*4882a593Smuzhiyun goto out;
2145*4882a593Smuzhiyun
2146*4882a593Smuzhiyun /* 'common' recv queue MSG_PEEK-ing */
2147*4882a593Smuzhiyun }
2148*4882a593Smuzhiyun
2149*4882a593Smuzhiyun seq = &tp->copied_seq;
2150*4882a593Smuzhiyun if (flags & MSG_PEEK) {
2151*4882a593Smuzhiyun peek_seq = tp->copied_seq;
2152*4882a593Smuzhiyun seq = &peek_seq;
2153*4882a593Smuzhiyun }
2154*4882a593Smuzhiyun
2155*4882a593Smuzhiyun target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
2156*4882a593Smuzhiyun
2157*4882a593Smuzhiyun do {
2158*4882a593Smuzhiyun u32 offset;
2159*4882a593Smuzhiyun
2160*4882a593Smuzhiyun /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
2161*4882a593Smuzhiyun if (tp->urg_data && tp->urg_seq == *seq) {
2162*4882a593Smuzhiyun if (copied)
2163*4882a593Smuzhiyun break;
2164*4882a593Smuzhiyun if (signal_pending(current)) {
2165*4882a593Smuzhiyun copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
2166*4882a593Smuzhiyun break;
2167*4882a593Smuzhiyun }
2168*4882a593Smuzhiyun }
2169*4882a593Smuzhiyun
2170*4882a593Smuzhiyun /* Next get a buffer. */
2171*4882a593Smuzhiyun
2172*4882a593Smuzhiyun last = skb_peek_tail(&sk->sk_receive_queue);
2173*4882a593Smuzhiyun skb_queue_walk(&sk->sk_receive_queue, skb) {
2174*4882a593Smuzhiyun last = skb;
2175*4882a593Smuzhiyun /* Now that we have two receive queues this
2176*4882a593Smuzhiyun * shouldn't happen.
2177*4882a593Smuzhiyun */
2178*4882a593Smuzhiyun if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
2179*4882a593Smuzhiyun "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
2180*4882a593Smuzhiyun *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
2181*4882a593Smuzhiyun flags))
2182*4882a593Smuzhiyun break;
2183*4882a593Smuzhiyun
2184*4882a593Smuzhiyun offset = *seq - TCP_SKB_CB(skb)->seq;
2185*4882a593Smuzhiyun if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2186*4882a593Smuzhiyun pr_err_once("%s: found a SYN, please report !\n", __func__);
2187*4882a593Smuzhiyun offset--;
2188*4882a593Smuzhiyun }
2189*4882a593Smuzhiyun if (offset < skb->len)
2190*4882a593Smuzhiyun goto found_ok_skb;
2191*4882a593Smuzhiyun if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2192*4882a593Smuzhiyun goto found_fin_ok;
2193*4882a593Smuzhiyun WARN(!(flags & MSG_PEEK),
2194*4882a593Smuzhiyun "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2195*4882a593Smuzhiyun *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2196*4882a593Smuzhiyun }
2197*4882a593Smuzhiyun
2198*4882a593Smuzhiyun /* Well, if we have backlog, try to process it now. */
2199*4882a593Smuzhiyun
2200*4882a593Smuzhiyun if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
2201*4882a593Smuzhiyun break;
2202*4882a593Smuzhiyun
2203*4882a593Smuzhiyun if (copied) {
2204*4882a593Smuzhiyun if (sk->sk_err ||
2205*4882a593Smuzhiyun sk->sk_state == TCP_CLOSE ||
2206*4882a593Smuzhiyun (sk->sk_shutdown & RCV_SHUTDOWN) ||
2207*4882a593Smuzhiyun !timeo ||
2208*4882a593Smuzhiyun signal_pending(current))
2209*4882a593Smuzhiyun break;
2210*4882a593Smuzhiyun } else {
2211*4882a593Smuzhiyun if (sock_flag(sk, SOCK_DONE))
2212*4882a593Smuzhiyun break;
2213*4882a593Smuzhiyun
2214*4882a593Smuzhiyun if (sk->sk_err) {
2215*4882a593Smuzhiyun copied = sock_error(sk);
2216*4882a593Smuzhiyun break;
2217*4882a593Smuzhiyun }
2218*4882a593Smuzhiyun
2219*4882a593Smuzhiyun if (sk->sk_shutdown & RCV_SHUTDOWN)
2220*4882a593Smuzhiyun break;
2221*4882a593Smuzhiyun
2222*4882a593Smuzhiyun if (sk->sk_state == TCP_CLOSE) {
2223*4882a593Smuzhiyun /* This occurs when user tries to read
2224*4882a593Smuzhiyun * from a never-connected socket.
2225*4882a593Smuzhiyun */
2226*4882a593Smuzhiyun copied = -ENOTCONN;
2227*4882a593Smuzhiyun break;
2228*4882a593Smuzhiyun }
2229*4882a593Smuzhiyun
2230*4882a593Smuzhiyun if (!timeo) {
2231*4882a593Smuzhiyun copied = -EAGAIN;
2232*4882a593Smuzhiyun break;
2233*4882a593Smuzhiyun }
2234*4882a593Smuzhiyun
2235*4882a593Smuzhiyun if (signal_pending(current)) {
2236*4882a593Smuzhiyun copied = sock_intr_errno(timeo);
2237*4882a593Smuzhiyun break;
2238*4882a593Smuzhiyun }
2239*4882a593Smuzhiyun }
2240*4882a593Smuzhiyun
2241*4882a593Smuzhiyun tcp_cleanup_rbuf(sk, copied);
2242*4882a593Smuzhiyun
2243*4882a593Smuzhiyun if (copied >= target) {
2244*4882a593Smuzhiyun /* Do not sleep, just process backlog. */
2245*4882a593Smuzhiyun release_sock(sk);
2246*4882a593Smuzhiyun lock_sock(sk);
2247*4882a593Smuzhiyun } else {
2248*4882a593Smuzhiyun sk_wait_data(sk, &timeo, last);
2249*4882a593Smuzhiyun }
2250*4882a593Smuzhiyun
2251*4882a593Smuzhiyun if ((flags & MSG_PEEK) &&
2252*4882a593Smuzhiyun (peek_seq - copied - urg_hole != tp->copied_seq)) {
2253*4882a593Smuzhiyun net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
2254*4882a593Smuzhiyun current->comm,
2255*4882a593Smuzhiyun task_pid_nr(current));
2256*4882a593Smuzhiyun peek_seq = tp->copied_seq;
2257*4882a593Smuzhiyun }
2258*4882a593Smuzhiyun continue;
2259*4882a593Smuzhiyun
2260*4882a593Smuzhiyun found_ok_skb:
2261*4882a593Smuzhiyun /* Ok so how much can we use? */
2262*4882a593Smuzhiyun used = skb->len - offset;
2263*4882a593Smuzhiyun if (len < used)
2264*4882a593Smuzhiyun used = len;
2265*4882a593Smuzhiyun
2266*4882a593Smuzhiyun /* Do we have urgent data here? */
2267*4882a593Smuzhiyun if (tp->urg_data) {
2268*4882a593Smuzhiyun u32 urg_offset = tp->urg_seq - *seq;
2269*4882a593Smuzhiyun if (urg_offset < used) {
2270*4882a593Smuzhiyun if (!urg_offset) {
2271*4882a593Smuzhiyun if (!sock_flag(sk, SOCK_URGINLINE)) {
2272*4882a593Smuzhiyun WRITE_ONCE(*seq, *seq + 1);
2273*4882a593Smuzhiyun urg_hole++;
2274*4882a593Smuzhiyun offset++;
2275*4882a593Smuzhiyun used--;
2276*4882a593Smuzhiyun if (!used)
2277*4882a593Smuzhiyun goto skip_copy;
2278*4882a593Smuzhiyun }
2279*4882a593Smuzhiyun } else
2280*4882a593Smuzhiyun used = urg_offset;
2281*4882a593Smuzhiyun }
2282*4882a593Smuzhiyun }
2283*4882a593Smuzhiyun
2284*4882a593Smuzhiyun if (!(flags & MSG_TRUNC)) {
2285*4882a593Smuzhiyun err = skb_copy_datagram_msg(skb, offset, msg, used);
2286*4882a593Smuzhiyun if (err) {
2287*4882a593Smuzhiyun /* Exception. Bailout! */
2288*4882a593Smuzhiyun if (!copied)
2289*4882a593Smuzhiyun copied = -EFAULT;
2290*4882a593Smuzhiyun break;
2291*4882a593Smuzhiyun }
2292*4882a593Smuzhiyun }
2293*4882a593Smuzhiyun
2294*4882a593Smuzhiyun WRITE_ONCE(*seq, *seq + used);
2295*4882a593Smuzhiyun copied += used;
2296*4882a593Smuzhiyun len -= used;
2297*4882a593Smuzhiyun
2298*4882a593Smuzhiyun tcp_rcv_space_adjust(sk);
2299*4882a593Smuzhiyun
2300*4882a593Smuzhiyun skip_copy:
2301*4882a593Smuzhiyun if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
2302*4882a593Smuzhiyun tp->urg_data = 0;
2303*4882a593Smuzhiyun tcp_fast_path_check(sk);
2304*4882a593Smuzhiyun }
2305*4882a593Smuzhiyun
2306*4882a593Smuzhiyun if (TCP_SKB_CB(skb)->has_rxtstamp) {
2307*4882a593Smuzhiyun tcp_update_recv_tstamps(skb, &tss);
2308*4882a593Smuzhiyun cmsg_flags |= 2;
2309*4882a593Smuzhiyun }
2310*4882a593Smuzhiyun
2311*4882a593Smuzhiyun if (used + offset < skb->len)
2312*4882a593Smuzhiyun continue;
2313*4882a593Smuzhiyun
2314*4882a593Smuzhiyun if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2315*4882a593Smuzhiyun goto found_fin_ok;
2316*4882a593Smuzhiyun if (!(flags & MSG_PEEK))
2317*4882a593Smuzhiyun sk_eat_skb(sk, skb);
2318*4882a593Smuzhiyun continue;
2319*4882a593Smuzhiyun
2320*4882a593Smuzhiyun found_fin_ok:
2321*4882a593Smuzhiyun /* Process the FIN. */
2322*4882a593Smuzhiyun WRITE_ONCE(*seq, *seq + 1);
2323*4882a593Smuzhiyun if (!(flags & MSG_PEEK))
2324*4882a593Smuzhiyun sk_eat_skb(sk, skb);
2325*4882a593Smuzhiyun break;
2326*4882a593Smuzhiyun } while (len > 0);
2327*4882a593Smuzhiyun
2328*4882a593Smuzhiyun trace_android_rvh_tcp_recvmsg_stat(sk, copied);
2329*4882a593Smuzhiyun /* According to UNIX98, msg_name/msg_namelen are ignored
2330*4882a593Smuzhiyun * on connected socket. I was just happy when found this 8) --ANK
2331*4882a593Smuzhiyun */
2332*4882a593Smuzhiyun
2333*4882a593Smuzhiyun /* Clean up data we have read: This will do ACK frames. */
2334*4882a593Smuzhiyun tcp_cleanup_rbuf(sk, copied);
2335*4882a593Smuzhiyun
2336*4882a593Smuzhiyun release_sock(sk);
2337*4882a593Smuzhiyun
2338*4882a593Smuzhiyun if (cmsg_flags) {
2339*4882a593Smuzhiyun if (cmsg_flags & 2)
2340*4882a593Smuzhiyun tcp_recv_timestamp(msg, sk, &tss);
2341*4882a593Smuzhiyun if (cmsg_flags & 1) {
2342*4882a593Smuzhiyun inq = tcp_inq_hint(sk);
2343*4882a593Smuzhiyun put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
2344*4882a593Smuzhiyun }
2345*4882a593Smuzhiyun }
2346*4882a593Smuzhiyun
2347*4882a593Smuzhiyun return copied;
2348*4882a593Smuzhiyun
2349*4882a593Smuzhiyun out:
2350*4882a593Smuzhiyun release_sock(sk);
2351*4882a593Smuzhiyun return err;
2352*4882a593Smuzhiyun
2353*4882a593Smuzhiyun recv_urg:
2354*4882a593Smuzhiyun err = tcp_recv_urg(sk, msg, len, flags);
2355*4882a593Smuzhiyun goto out;
2356*4882a593Smuzhiyun
2357*4882a593Smuzhiyun recv_sndq:
2358*4882a593Smuzhiyun err = tcp_peek_sndq(sk, msg, len);
2359*4882a593Smuzhiyun goto out;
2360*4882a593Smuzhiyun }
2361*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_recvmsg);
2362*4882a593Smuzhiyun
2363*4882a593Smuzhiyun void tcp_set_state(struct sock *sk, int state)
2364*4882a593Smuzhiyun {
2365*4882a593Smuzhiyun int oldstate = sk->sk_state;
2366*4882a593Smuzhiyun
2367*4882a593Smuzhiyun /* We defined a new enum for TCP states that are exported in BPF
2368*4882a593Smuzhiyun * so as not to force the internal TCP states to be frozen. The
2369*4882a593Smuzhiyun * following checks will detect if an internal state value ever
2370*4882a593Smuzhiyun * differs from the BPF value. If this ever happens, then we will
2371*4882a593Smuzhiyun * need to remap the internal value to the BPF value before calling
2372*4882a593Smuzhiyun * tcp_call_bpf_2arg.
2373*4882a593Smuzhiyun */
2374*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2375*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2376*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2377*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2378*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2379*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2380*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2381*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2382*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2383*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2384*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2385*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2386*4882a593Smuzhiyun BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2387*4882a593Smuzhiyun
2388*4882a593Smuzhiyun if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2389*4882a593Smuzhiyun tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
2390*4882a593Smuzhiyun
2391*4882a593Smuzhiyun switch (state) {
2392*4882a593Smuzhiyun case TCP_ESTABLISHED:
2393*4882a593Smuzhiyun if (oldstate != TCP_ESTABLISHED)
2394*4882a593Smuzhiyun TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2395*4882a593Smuzhiyun break;
2396*4882a593Smuzhiyun
2397*4882a593Smuzhiyun case TCP_CLOSE:
2398*4882a593Smuzhiyun if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
2399*4882a593Smuzhiyun TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2400*4882a593Smuzhiyun
2401*4882a593Smuzhiyun sk->sk_prot->unhash(sk);
2402*4882a593Smuzhiyun if (inet_csk(sk)->icsk_bind_hash &&
2403*4882a593Smuzhiyun !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
2404*4882a593Smuzhiyun inet_put_port(sk);
2405*4882a593Smuzhiyun fallthrough;
2406*4882a593Smuzhiyun default:
2407*4882a593Smuzhiyun if (oldstate == TCP_ESTABLISHED)
2408*4882a593Smuzhiyun TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2409*4882a593Smuzhiyun }
2410*4882a593Smuzhiyun
2411*4882a593Smuzhiyun /* Change state AFTER the socket is unhashed to avoid a closed
2412*4882a593Smuzhiyun * socket sitting in the hash tables.
2413*4882a593Smuzhiyun */
2414*4882a593Smuzhiyun inet_sk_state_store(sk, state);
2415*4882a593Smuzhiyun }
2416*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(tcp_set_state);
2417*4882a593Smuzhiyun
2418*4882a593Smuzhiyun /*
2419*4882a593Smuzhiyun * State processing on a close. This implements the state shift for
2420*4882a593Smuzhiyun * sending our FIN frame. Note that we only send a FIN for some
2421*4882a593Smuzhiyun * states. A shutdown() may have already sent the FIN, or we may be
2422*4882a593Smuzhiyun * closed.
2423*4882a593Smuzhiyun */
2424*4882a593Smuzhiyun
2425*4882a593Smuzhiyun static const unsigned char new_state[16] = {
2426*4882a593Smuzhiyun /* current state: new state: action: */
2427*4882a593Smuzhiyun [0 /* (Invalid) */] = TCP_CLOSE,
2428*4882a593Smuzhiyun [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2429*4882a593Smuzhiyun [TCP_SYN_SENT] = TCP_CLOSE,
2430*4882a593Smuzhiyun [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2431*4882a593Smuzhiyun [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2432*4882a593Smuzhiyun [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2433*4882a593Smuzhiyun [TCP_TIME_WAIT] = TCP_CLOSE,
2434*4882a593Smuzhiyun [TCP_CLOSE] = TCP_CLOSE,
2435*4882a593Smuzhiyun [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
2436*4882a593Smuzhiyun [TCP_LAST_ACK] = TCP_LAST_ACK,
2437*4882a593Smuzhiyun [TCP_LISTEN] = TCP_CLOSE,
2438*4882a593Smuzhiyun [TCP_CLOSING] = TCP_CLOSING,
2439*4882a593Smuzhiyun [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
2440*4882a593Smuzhiyun };
2441*4882a593Smuzhiyun
2442*4882a593Smuzhiyun static int tcp_close_state(struct sock *sk)
2443*4882a593Smuzhiyun {
2444*4882a593Smuzhiyun int next = (int)new_state[sk->sk_state];
2445*4882a593Smuzhiyun int ns = next & TCP_STATE_MASK;
2446*4882a593Smuzhiyun
2447*4882a593Smuzhiyun tcp_set_state(sk, ns);
2448*4882a593Smuzhiyun
2449*4882a593Smuzhiyun return next & TCP_ACTION_FIN;
2450*4882a593Smuzhiyun }
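/* Illustrative walk-through (editor's sketch, not kernel code): for a socket
* in TCP_ESTABLISHED, new_state[TCP_ESTABLISHED] above is
* TCP_FIN_WAIT1 | TCP_ACTION_FIN, so tcp_close_state() does:
*
*	next = TCP_FIN_WAIT1 | TCP_ACTION_FIN;
*	ns   = next & TCP_STATE_MASK;		-> TCP_FIN_WAIT1
*	tcp_set_state(sk, TCP_FIN_WAIT1);
*	return next & TCP_ACTION_FIN;		-> non-zero, caller sends a FIN
*
* For TCP_SYN_SENT the table entry is plain TCP_CLOSE, so the return value is
* zero and no FIN is emitted.
*/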
2451*4882a593Smuzhiyun
2452*4882a593Smuzhiyun /*
2453*4882a593Smuzhiyun * Shutdown the sending side of a connection. Much like close except
2454*4882a593Smuzhiyun * that we don't shut down receiving or sock_set_flag(sk, SOCK_DEAD).
2455*4882a593Smuzhiyun */
2456*4882a593Smuzhiyun
2457*4882a593Smuzhiyun void tcp_shutdown(struct sock *sk, int how)
2458*4882a593Smuzhiyun {
2459*4882a593Smuzhiyun /* We need to grab some memory, and put together a FIN,
2460*4882a593Smuzhiyun * and then put it into the queue to be sent.
2461*4882a593Smuzhiyun * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2462*4882a593Smuzhiyun */
2463*4882a593Smuzhiyun if (!(how & SEND_SHUTDOWN))
2464*4882a593Smuzhiyun return;
2465*4882a593Smuzhiyun
2466*4882a593Smuzhiyun /* If we've already sent a FIN, or it's a closed state, skip this. */
2467*4882a593Smuzhiyun if ((1 << sk->sk_state) &
2468*4882a593Smuzhiyun (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2469*4882a593Smuzhiyun TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2470*4882a593Smuzhiyun /* Clear out any half completed packets. FIN if needed. */
2471*4882a593Smuzhiyun if (tcp_close_state(sk))
2472*4882a593Smuzhiyun tcp_send_fin(sk);
2473*4882a593Smuzhiyun }
2474*4882a593Smuzhiyun }
2475*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_shutdown);
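/* Userspace sketch (editor's addition; fd and consume() are hypothetical):
* shutdown() with SHUT_WR reaches tcp_shutdown() with SEND_SHUTDOWN set, so an
* established connection moves to FIN_WAIT1 and sends a FIN while the receive
* side stays open and can still drain data from the peer:
*
*	shutdown(fd, SHUT_WR);
*	while ((n = read(fd, buf, sizeof(buf))) > 0)
*		consume(buf, n);
*	close(fd);
*/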
2476*4882a593Smuzhiyun
2477*4882a593Smuzhiyun bool tcp_check_oom(struct sock *sk, int shift)
2478*4882a593Smuzhiyun {
2479*4882a593Smuzhiyun bool too_many_orphans, out_of_socket_memory;
2480*4882a593Smuzhiyun
2481*4882a593Smuzhiyun too_many_orphans = tcp_too_many_orphans(sk, shift);
2482*4882a593Smuzhiyun out_of_socket_memory = tcp_out_of_memory(sk);
2483*4882a593Smuzhiyun
2484*4882a593Smuzhiyun if (too_many_orphans)
2485*4882a593Smuzhiyun net_info_ratelimited("too many orphaned sockets\n");
2486*4882a593Smuzhiyun if (out_of_socket_memory)
2487*4882a593Smuzhiyun net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2488*4882a593Smuzhiyun return too_many_orphans || out_of_socket_memory;
2489*4882a593Smuzhiyun }
2490*4882a593Smuzhiyun
2491*4882a593Smuzhiyun void tcp_close(struct sock *sk, long timeout)
2492*4882a593Smuzhiyun {
2493*4882a593Smuzhiyun struct sk_buff *skb;
2494*4882a593Smuzhiyun int data_was_unread = 0;
2495*4882a593Smuzhiyun int state;
2496*4882a593Smuzhiyun
2497*4882a593Smuzhiyun lock_sock(sk);
2498*4882a593Smuzhiyun sk->sk_shutdown = SHUTDOWN_MASK;
2499*4882a593Smuzhiyun
2500*4882a593Smuzhiyun if (sk->sk_state == TCP_LISTEN) {
2501*4882a593Smuzhiyun tcp_set_state(sk, TCP_CLOSE);
2502*4882a593Smuzhiyun
2503*4882a593Smuzhiyun /* Special case. */
2504*4882a593Smuzhiyun inet_csk_listen_stop(sk);
2505*4882a593Smuzhiyun
2506*4882a593Smuzhiyun goto adjudge_to_death;
2507*4882a593Smuzhiyun }
2508*4882a593Smuzhiyun
2509*4882a593Smuzhiyun /* We need to flush the recv. buffs. We do this only on the
2510*4882a593Smuzhiyun * descriptor close, not protocol-sourced closes, because the
2511*4882a593Smuzhiyun * reader process may not have drained the data yet!
2512*4882a593Smuzhiyun */
2513*4882a593Smuzhiyun while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2514*4882a593Smuzhiyun u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2515*4882a593Smuzhiyun
2516*4882a593Smuzhiyun if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2517*4882a593Smuzhiyun len--;
2518*4882a593Smuzhiyun data_was_unread += len;
2519*4882a593Smuzhiyun __kfree_skb(skb);
2520*4882a593Smuzhiyun }
2521*4882a593Smuzhiyun
2522*4882a593Smuzhiyun sk_mem_reclaim(sk);
2523*4882a593Smuzhiyun
2524*4882a593Smuzhiyun /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
2525*4882a593Smuzhiyun if (sk->sk_state == TCP_CLOSE)
2526*4882a593Smuzhiyun goto adjudge_to_death;
2527*4882a593Smuzhiyun
2528*4882a593Smuzhiyun /* As outlined in RFC 2525, section 2.17, we send a RST here because
2529*4882a593Smuzhiyun * data was lost. To witness the awful effects of the old behavior of
2530*4882a593Smuzhiyun * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
2531*4882a593Smuzhiyun * GET in an FTP client, suspend the process, wait for the client to
2532*4882a593Smuzhiyun * advertise a zero window, then kill -9 the FTP client, wheee...
2533*4882a593Smuzhiyun * Note: timeout is always zero in such a case.
2534*4882a593Smuzhiyun */
2535*4882a593Smuzhiyun if (unlikely(tcp_sk(sk)->repair)) {
2536*4882a593Smuzhiyun sk->sk_prot->disconnect(sk, 0);
2537*4882a593Smuzhiyun } else if (data_was_unread) {
2538*4882a593Smuzhiyun /* Unread data was tossed, zap the connection. */
2539*4882a593Smuzhiyun NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2540*4882a593Smuzhiyun tcp_set_state(sk, TCP_CLOSE);
2541*4882a593Smuzhiyun tcp_send_active_reset(sk, sk->sk_allocation);
2542*4882a593Smuzhiyun } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2543*4882a593Smuzhiyun /* Check zero linger _after_ checking for unread data. */
2544*4882a593Smuzhiyun sk->sk_prot->disconnect(sk, 0);
2545*4882a593Smuzhiyun NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2546*4882a593Smuzhiyun } else if (tcp_close_state(sk)) {
2547*4882a593Smuzhiyun /* We FIN if the application ate all the data before
2548*4882a593Smuzhiyun * zapping the connection.
2549*4882a593Smuzhiyun */
2550*4882a593Smuzhiyun
2551*4882a593Smuzhiyun /* RED-PEN. Formally speaking, we have broken TCP state
2552*4882a593Smuzhiyun * machine. State transitions:
2553*4882a593Smuzhiyun *
2554*4882a593Smuzhiyun * TCP_ESTABLISHED -> TCP_FIN_WAIT1
2555*4882a593Smuzhiyun * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
2556*4882a593Smuzhiyun * TCP_CLOSE_WAIT -> TCP_LAST_ACK
2557*4882a593Smuzhiyun *
2558*4882a593Smuzhiyun * are legal only when FIN has been sent (i.e. in window),
2559*4882a593Smuzhiyun * rather than queued out of window. Purists blame.
2560*4882a593Smuzhiyun *
2561*4882a593Smuzhiyun * F.e. "RFC state" is ESTABLISHED,
2562*4882a593Smuzhiyun * if Linux state is FIN-WAIT-1, but FIN is still not sent.
2563*4882a593Smuzhiyun *
2564*4882a593Smuzhiyun * The visible deviations are that we sometimes
2565*4882a593Smuzhiyun * enter the time-wait state when it is not really required
2566*4882a593Smuzhiyun * (harmless), and do not send active resets when they are
2567*4882a593Smuzhiyun * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
2568*4882a593Smuzhiyun * they look like CLOSING or LAST_ACK to Linux).
2569*4882a593Smuzhiyun * Probably, I missed some more holelets.
2570*4882a593Smuzhiyun * --ANK
2571*4882a593Smuzhiyun * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2572*4882a593Smuzhiyun * in a single packet! (May consider it later but will
2573*4882a593Smuzhiyun * probably need API support or TCP_CORK SYN-ACK until
2574*4882a593Smuzhiyun * data is written and socket is closed.)
2575*4882a593Smuzhiyun */
2576*4882a593Smuzhiyun tcp_send_fin(sk);
2577*4882a593Smuzhiyun }
2578*4882a593Smuzhiyun
2579*4882a593Smuzhiyun sk_stream_wait_close(sk, timeout);
2580*4882a593Smuzhiyun
2581*4882a593Smuzhiyun adjudge_to_death:
2582*4882a593Smuzhiyun state = sk->sk_state;
2583*4882a593Smuzhiyun sock_hold(sk);
2584*4882a593Smuzhiyun sock_orphan(sk);
2585*4882a593Smuzhiyun
2586*4882a593Smuzhiyun local_bh_disable();
2587*4882a593Smuzhiyun bh_lock_sock(sk);
2588*4882a593Smuzhiyun /* remove backlog if any, without releasing ownership. */
2589*4882a593Smuzhiyun __release_sock(sk);
2590*4882a593Smuzhiyun
2591*4882a593Smuzhiyun percpu_counter_inc(sk->sk_prot->orphan_count);
2592*4882a593Smuzhiyun
2593*4882a593Smuzhiyun /* Have we already been destroyed by a softirq or backlog? */
2594*4882a593Smuzhiyun if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2595*4882a593Smuzhiyun goto out;
2596*4882a593Smuzhiyun
2597*4882a593Smuzhiyun /* This is a (useful) BSD violation of the RFC. There is a
2598*4882a593Smuzhiyun * problem with TCP as specified in that the other end could
2599*4882a593Smuzhiyun * keep a socket open forever with no application left at this end.
2600*4882a593Smuzhiyun * We use a 1 minute timeout (about the same as BSD) then kill
2601*4882a593Smuzhiyun * our end. If they send after that then tough - BUT: long enough
2602*4882a593Smuzhiyun * that we won't make the old 4*rto = almost no time - whoops
2603*4882a593Smuzhiyun * reset mistake.
2604*4882a593Smuzhiyun *
2605*4882a593Smuzhiyun * Nope, it was not a mistake. It is really the desired behaviour,
2606*4882a593Smuzhiyun * e.g. on HTTP servers, where such sockets are useless but
2607*4882a593Smuzhiyun * consume significant resources. Let's do it with the special
2608*4882a593Smuzhiyun * linger2 option. --ANK
2609*4882a593Smuzhiyun */
2610*4882a593Smuzhiyun
2611*4882a593Smuzhiyun if (sk->sk_state == TCP_FIN_WAIT2) {
2612*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
2613*4882a593Smuzhiyun if (tp->linger2 < 0) {
2614*4882a593Smuzhiyun tcp_set_state(sk, TCP_CLOSE);
2615*4882a593Smuzhiyun tcp_send_active_reset(sk, GFP_ATOMIC);
2616*4882a593Smuzhiyun __NET_INC_STATS(sock_net(sk),
2617*4882a593Smuzhiyun LINUX_MIB_TCPABORTONLINGER);
2618*4882a593Smuzhiyun } else {
2619*4882a593Smuzhiyun const int tmo = tcp_fin_time(sk);
2620*4882a593Smuzhiyun
2621*4882a593Smuzhiyun if (tmo > TCP_TIMEWAIT_LEN) {
2622*4882a593Smuzhiyun inet_csk_reset_keepalive_timer(sk,
2623*4882a593Smuzhiyun tmo - TCP_TIMEWAIT_LEN);
2624*4882a593Smuzhiyun } else {
2625*4882a593Smuzhiyun tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2626*4882a593Smuzhiyun goto out;
2627*4882a593Smuzhiyun }
2628*4882a593Smuzhiyun }
2629*4882a593Smuzhiyun }
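/* Worked example (editor's note, hypothetical numbers): with TCP_TIMEWAIT_LEN
* of 60 seconds, a tcp_fin_time() of 75s arms the keepalive timer for the
* remaining 15s before the socket is handed to the timewait machinery, while
* a tcp_fin_time() of 40s goes to tcp_time_wait() immediately for 40s and
* jumps to the "out" label below.
*/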
2630*4882a593Smuzhiyun if (sk->sk_state != TCP_CLOSE) {
2631*4882a593Smuzhiyun sk_mem_reclaim(sk);
2632*4882a593Smuzhiyun if (tcp_check_oom(sk, 0)) {
2633*4882a593Smuzhiyun tcp_set_state(sk, TCP_CLOSE);
2634*4882a593Smuzhiyun tcp_send_active_reset(sk, GFP_ATOMIC);
2635*4882a593Smuzhiyun __NET_INC_STATS(sock_net(sk),
2636*4882a593Smuzhiyun LINUX_MIB_TCPABORTONMEMORY);
2637*4882a593Smuzhiyun } else if (!check_net(sock_net(sk))) {
2638*4882a593Smuzhiyun /* Not possible to send reset; just close */
2639*4882a593Smuzhiyun tcp_set_state(sk, TCP_CLOSE);
2640*4882a593Smuzhiyun }
2641*4882a593Smuzhiyun }
2642*4882a593Smuzhiyun
2643*4882a593Smuzhiyun if (sk->sk_state == TCP_CLOSE) {
2644*4882a593Smuzhiyun struct request_sock *req;
2645*4882a593Smuzhiyun
2646*4882a593Smuzhiyun req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
2647*4882a593Smuzhiyun lockdep_sock_is_held(sk));
2648*4882a593Smuzhiyun /* We could get here with a non-NULL req if the socket is
2649*4882a593Smuzhiyun * aborted (e.g., closed with unread data) before 3WHS
2650*4882a593Smuzhiyun * finishes.
2651*4882a593Smuzhiyun */
2652*4882a593Smuzhiyun if (req)
2653*4882a593Smuzhiyun reqsk_fastopen_remove(sk, req, false);
2654*4882a593Smuzhiyun inet_csk_destroy_sock(sk);
2655*4882a593Smuzhiyun }
2656*4882a593Smuzhiyun /* Otherwise, socket is reprieved until protocol close. */
2657*4882a593Smuzhiyun
2658*4882a593Smuzhiyun out:
2659*4882a593Smuzhiyun bh_unlock_sock(sk);
2660*4882a593Smuzhiyun local_bh_enable();
2661*4882a593Smuzhiyun release_sock(sk);
2662*4882a593Smuzhiyun sock_put(sk);
2663*4882a593Smuzhiyun }
2664*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_close);
2665*4882a593Smuzhiyun
2666*4882a593Smuzhiyun /* These states need RST on ABORT according to RFC793 */
2667*4882a593Smuzhiyun
2668*4882a593Smuzhiyun static inline bool tcp_need_reset(int state)
2669*4882a593Smuzhiyun {
2670*4882a593Smuzhiyun return (1 << state) &
2671*4882a593Smuzhiyun (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2672*4882a593Smuzhiyun TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2673*4882a593Smuzhiyun }
2674*4882a593Smuzhiyun
2675*4882a593Smuzhiyun static void tcp_rtx_queue_purge(struct sock *sk)
2676*4882a593Smuzhiyun {
2677*4882a593Smuzhiyun struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
2678*4882a593Smuzhiyun
2679*4882a593Smuzhiyun tcp_sk(sk)->highest_sack = NULL;
2680*4882a593Smuzhiyun while (p) {
2681*4882a593Smuzhiyun struct sk_buff *skb = rb_to_skb(p);
2682*4882a593Smuzhiyun
2683*4882a593Smuzhiyun p = rb_next(p);
2684*4882a593Smuzhiyun /* Since we are deleting the whole queue, there is no need to
2685*4882a593Smuzhiyun * list_del(&skb->tcp_tsorted_anchor).
2686*4882a593Smuzhiyun */
2687*4882a593Smuzhiyun tcp_rtx_queue_unlink(skb, sk);
2688*4882a593Smuzhiyun sk_wmem_free_skb(sk, skb);
2689*4882a593Smuzhiyun }
2690*4882a593Smuzhiyun }
2691*4882a593Smuzhiyun
2692*4882a593Smuzhiyun void tcp_write_queue_purge(struct sock *sk)
2693*4882a593Smuzhiyun {
2694*4882a593Smuzhiyun struct sk_buff *skb;
2695*4882a593Smuzhiyun
2696*4882a593Smuzhiyun tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
2697*4882a593Smuzhiyun while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
2698*4882a593Smuzhiyun tcp_skb_tsorted_anchor_cleanup(skb);
2699*4882a593Smuzhiyun sk_wmem_free_skb(sk, skb);
2700*4882a593Smuzhiyun }
2701*4882a593Smuzhiyun tcp_rtx_queue_purge(sk);
2702*4882a593Smuzhiyun skb = sk->sk_tx_skb_cache;
2703*4882a593Smuzhiyun if (skb) {
2704*4882a593Smuzhiyun __kfree_skb(skb);
2705*4882a593Smuzhiyun sk->sk_tx_skb_cache = NULL;
2706*4882a593Smuzhiyun }
2707*4882a593Smuzhiyun INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
2708*4882a593Smuzhiyun sk_mem_reclaim(sk);
2709*4882a593Smuzhiyun tcp_clear_all_retrans_hints(tcp_sk(sk));
2710*4882a593Smuzhiyun tcp_sk(sk)->packets_out = 0;
2711*4882a593Smuzhiyun inet_csk(sk)->icsk_backoff = 0;
2712*4882a593Smuzhiyun }
2713*4882a593Smuzhiyun
2714*4882a593Smuzhiyun int tcp_disconnect(struct sock *sk, int flags)
2715*4882a593Smuzhiyun {
2716*4882a593Smuzhiyun struct inet_sock *inet = inet_sk(sk);
2717*4882a593Smuzhiyun struct inet_connection_sock *icsk = inet_csk(sk);
2718*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
2719*4882a593Smuzhiyun int old_state = sk->sk_state;
2720*4882a593Smuzhiyun u32 seq;
2721*4882a593Smuzhiyun
2722*4882a593Smuzhiyun if (old_state != TCP_CLOSE)
2723*4882a593Smuzhiyun tcp_set_state(sk, TCP_CLOSE);
2724*4882a593Smuzhiyun
2725*4882a593Smuzhiyun /* ABORT function of RFC793 */
2726*4882a593Smuzhiyun if (old_state == TCP_LISTEN) {
2727*4882a593Smuzhiyun inet_csk_listen_stop(sk);
2728*4882a593Smuzhiyun } else if (unlikely(tp->repair)) {
2729*4882a593Smuzhiyun sk->sk_err = ECONNABORTED;
2730*4882a593Smuzhiyun } else if (tcp_need_reset(old_state) ||
2731*4882a593Smuzhiyun (tp->snd_nxt != tp->write_seq &&
2732*4882a593Smuzhiyun (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2733*4882a593Smuzhiyun /* The last check adjusts for the discrepancy between Linux and RFC
2734*4882a593Smuzhiyun * states.
2735*4882a593Smuzhiyun */
2736*4882a593Smuzhiyun tcp_send_active_reset(sk, gfp_any());
2737*4882a593Smuzhiyun sk->sk_err = ECONNRESET;
2738*4882a593Smuzhiyun } else if (old_state == TCP_SYN_SENT)
2739*4882a593Smuzhiyun sk->sk_err = ECONNRESET;
2740*4882a593Smuzhiyun
2741*4882a593Smuzhiyun tcp_clear_xmit_timers(sk);
2742*4882a593Smuzhiyun __skb_queue_purge(&sk->sk_receive_queue);
2743*4882a593Smuzhiyun if (sk->sk_rx_skb_cache) {
2744*4882a593Smuzhiyun __kfree_skb(sk->sk_rx_skb_cache);
2745*4882a593Smuzhiyun sk->sk_rx_skb_cache = NULL;
2746*4882a593Smuzhiyun }
2747*4882a593Smuzhiyun WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
2748*4882a593Smuzhiyun tp->urg_data = 0;
2749*4882a593Smuzhiyun tcp_write_queue_purge(sk);
2750*4882a593Smuzhiyun tcp_fastopen_active_disable_ofo_check(sk);
2751*4882a593Smuzhiyun skb_rbtree_purge(&tp->out_of_order_queue);
2752*4882a593Smuzhiyun
2753*4882a593Smuzhiyun inet->inet_dport = 0;
2754*4882a593Smuzhiyun
2755*4882a593Smuzhiyun if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2756*4882a593Smuzhiyun inet_reset_saddr(sk);
2757*4882a593Smuzhiyun
2758*4882a593Smuzhiyun sk->sk_shutdown = 0;
2759*4882a593Smuzhiyun sock_reset_flag(sk, SOCK_DONE);
2760*4882a593Smuzhiyun tp->srtt_us = 0;
2761*4882a593Smuzhiyun tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
2762*4882a593Smuzhiyun tp->rcv_rtt_last_tsecr = 0;
2763*4882a593Smuzhiyun
2764*4882a593Smuzhiyun seq = tp->write_seq + tp->max_window + 2;
2765*4882a593Smuzhiyun if (!seq)
2766*4882a593Smuzhiyun seq = 1;
2767*4882a593Smuzhiyun WRITE_ONCE(tp->write_seq, seq);
2768*4882a593Smuzhiyun
2769*4882a593Smuzhiyun icsk->icsk_backoff = 0;
2770*4882a593Smuzhiyun icsk->icsk_probes_out = 0;
2771*4882a593Smuzhiyun icsk->icsk_probes_tstamp = 0;
2772*4882a593Smuzhiyun icsk->icsk_rto = TCP_TIMEOUT_INIT;
2773*4882a593Smuzhiyun icsk->icsk_rto_min = TCP_RTO_MIN;
2774*4882a593Smuzhiyun icsk->icsk_delack_max = TCP_DELACK_MAX;
2775*4882a593Smuzhiyun tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2776*4882a593Smuzhiyun tp->snd_cwnd = TCP_INIT_CWND;
2777*4882a593Smuzhiyun tp->snd_cwnd_cnt = 0;
2778*4882a593Smuzhiyun tp->is_cwnd_limited = 0;
2779*4882a593Smuzhiyun tp->max_packets_out = 0;
2780*4882a593Smuzhiyun tp->window_clamp = 0;
2781*4882a593Smuzhiyun tp->delivered = 0;
2782*4882a593Smuzhiyun tp->delivered_ce = 0;
2783*4882a593Smuzhiyun if (icsk->icsk_ca_ops->release)
2784*4882a593Smuzhiyun icsk->icsk_ca_ops->release(sk);
2785*4882a593Smuzhiyun memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
2786*4882a593Smuzhiyun icsk->icsk_ca_initialized = 0;
2787*4882a593Smuzhiyun tcp_set_ca_state(sk, TCP_CA_Open);
2788*4882a593Smuzhiyun tp->is_sack_reneg = 0;
2789*4882a593Smuzhiyun tcp_clear_retrans(tp);
2790*4882a593Smuzhiyun tp->total_retrans = 0;
2791*4882a593Smuzhiyun inet_csk_delack_init(sk);
2792*4882a593Smuzhiyun /* Initialize rcv_mss to TCP_MIN_MSS to avoid a division by 0
2793*4882a593Smuzhiyun * issue in __tcp_select_window()
2794*4882a593Smuzhiyun */
2795*4882a593Smuzhiyun icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
2796*4882a593Smuzhiyun memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2797*4882a593Smuzhiyun __sk_dst_reset(sk);
2798*4882a593Smuzhiyun dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
2799*4882a593Smuzhiyun tcp_saved_syn_free(tp);
2800*4882a593Smuzhiyun tp->compressed_ack = 0;
2801*4882a593Smuzhiyun tp->segs_in = 0;
2802*4882a593Smuzhiyun tp->segs_out = 0;
2803*4882a593Smuzhiyun tp->bytes_sent = 0;
2804*4882a593Smuzhiyun tp->bytes_acked = 0;
2805*4882a593Smuzhiyun tp->bytes_received = 0;
2806*4882a593Smuzhiyun tp->bytes_retrans = 0;
2807*4882a593Smuzhiyun tp->data_segs_in = 0;
2808*4882a593Smuzhiyun tp->data_segs_out = 0;
2809*4882a593Smuzhiyun tp->duplicate_sack[0].start_seq = 0;
2810*4882a593Smuzhiyun tp->duplicate_sack[0].end_seq = 0;
2811*4882a593Smuzhiyun tp->dsack_dups = 0;
2812*4882a593Smuzhiyun tp->reord_seen = 0;
2813*4882a593Smuzhiyun tp->retrans_out = 0;
2814*4882a593Smuzhiyun tp->sacked_out = 0;
2815*4882a593Smuzhiyun tp->tlp_high_seq = 0;
2816*4882a593Smuzhiyun tp->last_oow_ack_time = 0;
2817*4882a593Smuzhiyun /* There's a bubble in the pipe until at least the first ACK. */
2818*4882a593Smuzhiyun tp->app_limited = ~0U;
2819*4882a593Smuzhiyun tp->rack.mstamp = 0;
2820*4882a593Smuzhiyun tp->rack.advanced = 0;
2821*4882a593Smuzhiyun tp->rack.reo_wnd_steps = 1;
2822*4882a593Smuzhiyun tp->rack.last_delivered = 0;
2823*4882a593Smuzhiyun tp->rack.reo_wnd_persist = 0;
2824*4882a593Smuzhiyun tp->rack.dsack_seen = 0;
2825*4882a593Smuzhiyun tp->syn_data_acked = 0;
2826*4882a593Smuzhiyun tp->rx_opt.saw_tstamp = 0;
2827*4882a593Smuzhiyun tp->rx_opt.dsack = 0;
2828*4882a593Smuzhiyun tp->rx_opt.num_sacks = 0;
2829*4882a593Smuzhiyun tp->rcv_ooopack = 0;
2830*4882a593Smuzhiyun
2831*4882a593Smuzhiyun
2832*4882a593Smuzhiyun /* Clean up fastopen related fields */
2833*4882a593Smuzhiyun tcp_free_fastopen_req(tp);
2834*4882a593Smuzhiyun inet->defer_connect = 0;
2835*4882a593Smuzhiyun tp->fastopen_client_fail = 0;
2836*4882a593Smuzhiyun
2837*4882a593Smuzhiyun WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2838*4882a593Smuzhiyun
2839*4882a593Smuzhiyun if (sk->sk_frag.page) {
2840*4882a593Smuzhiyun put_page(sk->sk_frag.page);
2841*4882a593Smuzhiyun sk->sk_frag.page = NULL;
2842*4882a593Smuzhiyun sk->sk_frag.offset = 0;
2843*4882a593Smuzhiyun }
2844*4882a593Smuzhiyun
2845*4882a593Smuzhiyun sk->sk_error_report(sk);
2846*4882a593Smuzhiyun return 0;
2847*4882a593Smuzhiyun }
2848*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_disconnect);
2849*4882a593Smuzhiyun
2850*4882a593Smuzhiyun static inline bool tcp_can_repair_sock(const struct sock *sk)
2851*4882a593Smuzhiyun {
2852*4882a593Smuzhiyun return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2853*4882a593Smuzhiyun (sk->sk_state != TCP_LISTEN);
2854*4882a593Smuzhiyun }
2855*4882a593Smuzhiyun
2856*4882a593Smuzhiyun static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
2857*4882a593Smuzhiyun {
2858*4882a593Smuzhiyun struct tcp_repair_window opt;
2859*4882a593Smuzhiyun
2860*4882a593Smuzhiyun if (!tp->repair)
2861*4882a593Smuzhiyun return -EPERM;
2862*4882a593Smuzhiyun
2863*4882a593Smuzhiyun if (len != sizeof(opt))
2864*4882a593Smuzhiyun return -EINVAL;
2865*4882a593Smuzhiyun
2866*4882a593Smuzhiyun if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
2867*4882a593Smuzhiyun return -EFAULT;
2868*4882a593Smuzhiyun
2869*4882a593Smuzhiyun if (opt.max_window < opt.snd_wnd)
2870*4882a593Smuzhiyun return -EINVAL;
2871*4882a593Smuzhiyun
2872*4882a593Smuzhiyun if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
2873*4882a593Smuzhiyun return -EINVAL;
2874*4882a593Smuzhiyun
2875*4882a593Smuzhiyun if (after(opt.rcv_wup, tp->rcv_nxt))
2876*4882a593Smuzhiyun return -EINVAL;
2877*4882a593Smuzhiyun
2878*4882a593Smuzhiyun tp->snd_wl1 = opt.snd_wl1;
2879*4882a593Smuzhiyun tp->snd_wnd = opt.snd_wnd;
2880*4882a593Smuzhiyun tp->max_window = opt.max_window;
2881*4882a593Smuzhiyun
2882*4882a593Smuzhiyun tp->rcv_wnd = opt.rcv_wnd;
2883*4882a593Smuzhiyun tp->rcv_wup = opt.rcv_wup;
2884*4882a593Smuzhiyun
2885*4882a593Smuzhiyun return 0;
2886*4882a593Smuzhiyun }
2887*4882a593Smuzhiyun
2888*4882a593Smuzhiyun static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
2889*4882a593Smuzhiyun unsigned int len)
2890*4882a593Smuzhiyun {
2891*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
2892*4882a593Smuzhiyun struct tcp_repair_opt opt;
2893*4882a593Smuzhiyun size_t offset = 0;
2894*4882a593Smuzhiyun
2895*4882a593Smuzhiyun while (len >= sizeof(opt)) {
2896*4882a593Smuzhiyun if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
2897*4882a593Smuzhiyun return -EFAULT;
2898*4882a593Smuzhiyun
2899*4882a593Smuzhiyun offset += sizeof(opt);
2900*4882a593Smuzhiyun len -= sizeof(opt);
2901*4882a593Smuzhiyun
2902*4882a593Smuzhiyun switch (opt.opt_code) {
2903*4882a593Smuzhiyun case TCPOPT_MSS:
2904*4882a593Smuzhiyun tp->rx_opt.mss_clamp = opt.opt_val;
2905*4882a593Smuzhiyun tcp_mtup_init(sk);
2906*4882a593Smuzhiyun break;
2907*4882a593Smuzhiyun case TCPOPT_WINDOW:
2908*4882a593Smuzhiyun {
2909*4882a593Smuzhiyun u16 snd_wscale = opt.opt_val & 0xFFFF;
2910*4882a593Smuzhiyun u16 rcv_wscale = opt.opt_val >> 16;
2911*4882a593Smuzhiyun
2912*4882a593Smuzhiyun if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
2913*4882a593Smuzhiyun return -EFBIG;
2914*4882a593Smuzhiyun
2915*4882a593Smuzhiyun tp->rx_opt.snd_wscale = snd_wscale;
2916*4882a593Smuzhiyun tp->rx_opt.rcv_wscale = rcv_wscale;
2917*4882a593Smuzhiyun tp->rx_opt.wscale_ok = 1;
2918*4882a593Smuzhiyun }
2919*4882a593Smuzhiyun break;
2920*4882a593Smuzhiyun case TCPOPT_SACK_PERM:
2921*4882a593Smuzhiyun if (opt.opt_val != 0)
2922*4882a593Smuzhiyun return -EINVAL;
2923*4882a593Smuzhiyun
2924*4882a593Smuzhiyun tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2925*4882a593Smuzhiyun break;
2926*4882a593Smuzhiyun case TCPOPT_TIMESTAMP:
2927*4882a593Smuzhiyun if (opt.opt_val != 0)
2928*4882a593Smuzhiyun return -EINVAL;
2929*4882a593Smuzhiyun
2930*4882a593Smuzhiyun tp->rx_opt.tstamp_ok = 1;
2931*4882a593Smuzhiyun break;
2932*4882a593Smuzhiyun }
2933*4882a593Smuzhiyun }
2934*4882a593Smuzhiyun
2935*4882a593Smuzhiyun return 0;
2936*4882a593Smuzhiyun }
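/* Illustrative sketch (editor's addition, not kernel code): a checkpoint/
* restore tool replays the negotiated options through TCP_REPAIR_OPTIONS as an
* array of struct tcp_repair_opt; for TCPOPT_WINDOW the two scales are packed
* into one value exactly as parsed above. The MSS and scale values here are
* placeholders:
*
*	struct tcp_repair_opt opts[] = {
*		{ TCPOPT_MSS,       1460 },
*		{ TCPOPT_WINDOW,    snd_wscale | (rcv_wscale << 16) },
*		{ TCPOPT_SACK_PERM, 0 },
*		{ TCPOPT_TIMESTAMP, 0 },
*	};
*
*	setsockopt(fd, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sizeof(opts));
*/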
2937*4882a593Smuzhiyun
2938*4882a593Smuzhiyun DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
2939*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_tx_delay_enabled);
2940*4882a593Smuzhiyun
2941*4882a593Smuzhiyun static void tcp_enable_tx_delay(void)
2942*4882a593Smuzhiyun {
2943*4882a593Smuzhiyun if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
2944*4882a593Smuzhiyun static int __tcp_tx_delay_enabled = 0;
2945*4882a593Smuzhiyun
2946*4882a593Smuzhiyun if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
2947*4882a593Smuzhiyun static_branch_enable(&tcp_tx_delay_enabled);
2948*4882a593Smuzhiyun pr_info("TCP_TX_DELAY enabled\n");
2949*4882a593Smuzhiyun }
2950*4882a593Smuzhiyun }
2951*4882a593Smuzhiyun }
2952*4882a593Smuzhiyun
2953*4882a593Smuzhiyun /* When set, this option tells TCP to always queue non-full frames. Later the user clears
2954*4882a593Smuzhiyun * this option and we transmit any pending partial frames in the queue. This is
2955*4882a593Smuzhiyun * meant to be used alongside sendfile() to get properly filled frames when the
2956*4882a593Smuzhiyun * user (for example) must write out headers with a write() call first and then
2957*4882a593Smuzhiyun * use sendfile to send out the data parts.
2958*4882a593Smuzhiyun *
2959*4882a593Smuzhiyun * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
2960*4882a593Smuzhiyun * TCP_NODELAY.
2961*4882a593Smuzhiyun */
2962*4882a593Smuzhiyun static void __tcp_sock_set_cork(struct sock *sk, bool on)
2963*4882a593Smuzhiyun {
2964*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
2965*4882a593Smuzhiyun
2966*4882a593Smuzhiyun if (on) {
2967*4882a593Smuzhiyun tp->nonagle |= TCP_NAGLE_CORK;
2968*4882a593Smuzhiyun } else {
2969*4882a593Smuzhiyun tp->nonagle &= ~TCP_NAGLE_CORK;
2970*4882a593Smuzhiyun if (tp->nonagle & TCP_NAGLE_OFF)
2971*4882a593Smuzhiyun tp->nonagle |= TCP_NAGLE_PUSH;
2972*4882a593Smuzhiyun tcp_push_pending_frames(sk);
2973*4882a593Smuzhiyun }
2974*4882a593Smuzhiyun }
2975*4882a593Smuzhiyun
2976*4882a593Smuzhiyun void tcp_sock_set_cork(struct sock *sk, bool on)
2977*4882a593Smuzhiyun {
2978*4882a593Smuzhiyun lock_sock(sk);
2979*4882a593Smuzhiyun __tcp_sock_set_cork(sk, on);
2980*4882a593Smuzhiyun release_sock(sk);
2981*4882a593Smuzhiyun }
2982*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_sock_set_cork);
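/* Userspace sketch of the header-plus-sendfile() pattern described above
* (editor's addition; fd, hdr, hdr_len, file_fd and file_len are placeholders):
*
*	int on = 1, off = 0;
*
*	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
*	write(fd, hdr, hdr_len);
*	sendfile(fd, file_fd, NULL, file_len);
*	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
*
* Clearing TCP_CORK pushes whatever partial frame is still queued.
*/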
2983*4882a593Smuzhiyun
2984*4882a593Smuzhiyun /* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is
2985*4882a593Smuzhiyun * remembered, but it is not activated until cork is cleared.
2986*4882a593Smuzhiyun *
2987*4882a593Smuzhiyun * However, when TCP_NODELAY is set we make an explicit push, which overrides
2988*4882a593Smuzhiyun * even TCP_CORK for currently queued segments.
2989*4882a593Smuzhiyun */
2990*4882a593Smuzhiyun static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
2991*4882a593Smuzhiyun {
2992*4882a593Smuzhiyun if (on) {
2993*4882a593Smuzhiyun tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2994*4882a593Smuzhiyun tcp_push_pending_frames(sk);
2995*4882a593Smuzhiyun } else {
2996*4882a593Smuzhiyun tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
2997*4882a593Smuzhiyun }
2998*4882a593Smuzhiyun }
2999*4882a593Smuzhiyun
3000*4882a593Smuzhiyun void tcp_sock_set_nodelay(struct sock *sk)
3001*4882a593Smuzhiyun {
3002*4882a593Smuzhiyun lock_sock(sk);
3003*4882a593Smuzhiyun __tcp_sock_set_nodelay(sk, true);
3004*4882a593Smuzhiyun release_sock(sk);
3005*4882a593Smuzhiyun }
3006*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_sock_set_nodelay);
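/* Minimal usage sketch (editor's note): in-kernel transports call
* tcp_sock_set_nodelay(sock->sk) directly; userspace gets the same effect with
* (fd is a placeholder):
*
*	int one = 1;
*
*	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
*/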
3007*4882a593Smuzhiyun
3008*4882a593Smuzhiyun static void __tcp_sock_set_quickack(struct sock *sk, int val)
3009*4882a593Smuzhiyun {
3010*4882a593Smuzhiyun if (!val) {
3011*4882a593Smuzhiyun inet_csk_enter_pingpong_mode(sk);
3012*4882a593Smuzhiyun return;
3013*4882a593Smuzhiyun }
3014*4882a593Smuzhiyun
3015*4882a593Smuzhiyun inet_csk_exit_pingpong_mode(sk);
3016*4882a593Smuzhiyun if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3017*4882a593Smuzhiyun inet_csk_ack_scheduled(sk)) {
3018*4882a593Smuzhiyun inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
3019*4882a593Smuzhiyun tcp_cleanup_rbuf(sk, 1);
3020*4882a593Smuzhiyun if (!(val & 1))
3021*4882a593Smuzhiyun inet_csk_enter_pingpong_mode(sk);
3022*4882a593Smuzhiyun }
3023*4882a593Smuzhiyun }
3024*4882a593Smuzhiyun
3025*4882a593Smuzhiyun void tcp_sock_set_quickack(struct sock *sk, int val)
3026*4882a593Smuzhiyun {
3027*4882a593Smuzhiyun lock_sock(sk);
3028*4882a593Smuzhiyun __tcp_sock_set_quickack(sk, val);
3029*4882a593Smuzhiyun release_sock(sk);
3030*4882a593Smuzhiyun }
3031*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_sock_set_quickack);
3032*4882a593Smuzhiyun
3033*4882a593Smuzhiyun int tcp_sock_set_syncnt(struct sock *sk, int val)
3034*4882a593Smuzhiyun {
3035*4882a593Smuzhiyun if (val < 1 || val > MAX_TCP_SYNCNT)
3036*4882a593Smuzhiyun return -EINVAL;
3037*4882a593Smuzhiyun
3038*4882a593Smuzhiyun lock_sock(sk);
3039*4882a593Smuzhiyun inet_csk(sk)->icsk_syn_retries = val;
3040*4882a593Smuzhiyun release_sock(sk);
3041*4882a593Smuzhiyun return 0;
3042*4882a593Smuzhiyun }
3043*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_sock_set_syncnt);
3044*4882a593Smuzhiyun
3045*4882a593Smuzhiyun void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
3046*4882a593Smuzhiyun {
3047*4882a593Smuzhiyun lock_sock(sk);
3048*4882a593Smuzhiyun inet_csk(sk)->icsk_user_timeout = val;
3049*4882a593Smuzhiyun release_sock(sk);
3050*4882a593Smuzhiyun }
3051*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_sock_set_user_timeout);
3052*4882a593Smuzhiyun
3053*4882a593Smuzhiyun int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
3054*4882a593Smuzhiyun {
3055*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
3056*4882a593Smuzhiyun
3057*4882a593Smuzhiyun if (val < 1 || val > MAX_TCP_KEEPIDLE)
3058*4882a593Smuzhiyun return -EINVAL;
3059*4882a593Smuzhiyun
3060*4882a593Smuzhiyun tp->keepalive_time = val * HZ;
3061*4882a593Smuzhiyun if (sock_flag(sk, SOCK_KEEPOPEN) &&
3062*4882a593Smuzhiyun !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
3063*4882a593Smuzhiyun u32 elapsed = keepalive_time_elapsed(tp);
3064*4882a593Smuzhiyun
3065*4882a593Smuzhiyun if (tp->keepalive_time > elapsed)
3066*4882a593Smuzhiyun elapsed = tp->keepalive_time - elapsed;
3067*4882a593Smuzhiyun else
3068*4882a593Smuzhiyun elapsed = 0;
3069*4882a593Smuzhiyun inet_csk_reset_keepalive_timer(sk, elapsed);
3070*4882a593Smuzhiyun }
3071*4882a593Smuzhiyun
3072*4882a593Smuzhiyun return 0;
3073*4882a593Smuzhiyun }
3074*4882a593Smuzhiyun
3075*4882a593Smuzhiyun int tcp_sock_set_keepidle(struct sock *sk, int val)
3076*4882a593Smuzhiyun {
3077*4882a593Smuzhiyun int err;
3078*4882a593Smuzhiyun
3079*4882a593Smuzhiyun lock_sock(sk);
3080*4882a593Smuzhiyun err = tcp_sock_set_keepidle_locked(sk, val);
3081*4882a593Smuzhiyun release_sock(sk);
3082*4882a593Smuzhiyun return err;
3083*4882a593Smuzhiyun }
3084*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_sock_set_keepidle);
3085*4882a593Smuzhiyun
3086*4882a593Smuzhiyun int tcp_sock_set_keepintvl(struct sock *sk, int val)
3087*4882a593Smuzhiyun {
3088*4882a593Smuzhiyun if (val < 1 || val > MAX_TCP_KEEPINTVL)
3089*4882a593Smuzhiyun return -EINVAL;
3090*4882a593Smuzhiyun
3091*4882a593Smuzhiyun lock_sock(sk);
3092*4882a593Smuzhiyun tcp_sk(sk)->keepalive_intvl = val * HZ;
3093*4882a593Smuzhiyun release_sock(sk);
3094*4882a593Smuzhiyun return 0;
3095*4882a593Smuzhiyun }
3096*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3097*4882a593Smuzhiyun
3098*4882a593Smuzhiyun int tcp_sock_set_keepcnt(struct sock *sk, int val)
3099*4882a593Smuzhiyun {
3100*4882a593Smuzhiyun if (val < 1 || val > MAX_TCP_KEEPCNT)
3101*4882a593Smuzhiyun return -EINVAL;
3102*4882a593Smuzhiyun
3103*4882a593Smuzhiyun lock_sock(sk);
3104*4882a593Smuzhiyun tcp_sk(sk)->keepalive_probes = val;
3105*4882a593Smuzhiyun release_sock(sk);
3106*4882a593Smuzhiyun return 0;
3107*4882a593Smuzhiyun }
3108*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_sock_set_keepcnt);
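/* Userspace sketch of the keepalive knobs wrapped above (editor's addition,
* illustrative values only): probe an idle connection after 60s, every 10s,
* and give up after 5 unanswered probes.
*
*	int idle = 60, intvl = 10, cnt = 5, on = 1;
*
*	setsockopt(fd, SOL_SOCKET,  SO_KEEPALIVE,  &on,    sizeof(on));
*	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE,  &idle,  sizeof(idle));
*	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
*	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT,   &cnt,   sizeof(cnt));
*/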
3109*4882a593Smuzhiyun
3110*4882a593Smuzhiyun /*
3111*4882a593Smuzhiyun * Socket option code for TCP.
3112*4882a593Smuzhiyun */
3113*4882a593Smuzhiyun static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
3114*4882a593Smuzhiyun sockptr_t optval, unsigned int optlen)
3115*4882a593Smuzhiyun {
3116*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
3117*4882a593Smuzhiyun struct inet_connection_sock *icsk = inet_csk(sk);
3118*4882a593Smuzhiyun struct net *net = sock_net(sk);
3119*4882a593Smuzhiyun int val;
3120*4882a593Smuzhiyun int err = 0;
3121*4882a593Smuzhiyun
3122*4882a593Smuzhiyun /* These are data/string values, all the others are ints */
3123*4882a593Smuzhiyun switch (optname) {
3124*4882a593Smuzhiyun case TCP_CONGESTION: {
3125*4882a593Smuzhiyun char name[TCP_CA_NAME_MAX];
3126*4882a593Smuzhiyun
3127*4882a593Smuzhiyun if (optlen < 1)
3128*4882a593Smuzhiyun return -EINVAL;
3129*4882a593Smuzhiyun
3130*4882a593Smuzhiyun val = strncpy_from_sockptr(name, optval,
3131*4882a593Smuzhiyun min_t(long, TCP_CA_NAME_MAX-1, optlen));
3132*4882a593Smuzhiyun if (val < 0)
3133*4882a593Smuzhiyun return -EFAULT;
3134*4882a593Smuzhiyun name[val] = 0;
3135*4882a593Smuzhiyun
3136*4882a593Smuzhiyun lock_sock(sk);
3137*4882a593Smuzhiyun err = tcp_set_congestion_control(sk, name, true,
3138*4882a593Smuzhiyun ns_capable(sock_net(sk)->user_ns,
3139*4882a593Smuzhiyun CAP_NET_ADMIN));
3140*4882a593Smuzhiyun release_sock(sk);
3141*4882a593Smuzhiyun return err;
3142*4882a593Smuzhiyun }
3143*4882a593Smuzhiyun case TCP_ULP: {
3144*4882a593Smuzhiyun char name[TCP_ULP_NAME_MAX];
3145*4882a593Smuzhiyun
3146*4882a593Smuzhiyun if (optlen < 1)
3147*4882a593Smuzhiyun return -EINVAL;
3148*4882a593Smuzhiyun
3149*4882a593Smuzhiyun val = strncpy_from_sockptr(name, optval,
3150*4882a593Smuzhiyun min_t(long, TCP_ULP_NAME_MAX - 1,
3151*4882a593Smuzhiyun optlen));
3152*4882a593Smuzhiyun if (val < 0)
3153*4882a593Smuzhiyun return -EFAULT;
3154*4882a593Smuzhiyun name[val] = 0;
3155*4882a593Smuzhiyun
3156*4882a593Smuzhiyun lock_sock(sk);
3157*4882a593Smuzhiyun err = tcp_set_ulp(sk, name);
3158*4882a593Smuzhiyun release_sock(sk);
3159*4882a593Smuzhiyun return err;
3160*4882a593Smuzhiyun }
3161*4882a593Smuzhiyun case TCP_FASTOPEN_KEY: {
3162*4882a593Smuzhiyun __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3163*4882a593Smuzhiyun __u8 *backup_key = NULL;
3164*4882a593Smuzhiyun
3165*4882a593Smuzhiyun /* Allow a backup key as well to facilitate key rotation.
3166*4882a593Smuzhiyun * The first key is the active one.
3167*4882a593Smuzhiyun */
3168*4882a593Smuzhiyun if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3169*4882a593Smuzhiyun optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
3170*4882a593Smuzhiyun return -EINVAL;
3171*4882a593Smuzhiyun
3172*4882a593Smuzhiyun if (copy_from_sockptr(key, optval, optlen))
3173*4882a593Smuzhiyun return -EFAULT;
3174*4882a593Smuzhiyun
3175*4882a593Smuzhiyun if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3176*4882a593Smuzhiyun backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3177*4882a593Smuzhiyun
3178*4882a593Smuzhiyun return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
3179*4882a593Smuzhiyun }
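/* Key rotation sketch (editor's addition, not kernel code): the new primary
* key is placed first and the previous key becomes the backup, so cookies
* minted with the old key keep validating. new_key, old_key and fd are
* placeholders:
*
*	__u8 keys[TCP_FASTOPEN_KEY_BUF_LENGTH];
*
*	memcpy(keys, new_key, TCP_FASTOPEN_KEY_LENGTH);
*	memcpy(keys + TCP_FASTOPEN_KEY_LENGTH, old_key, TCP_FASTOPEN_KEY_LENGTH);
*	setsockopt(fd, SOL_TCP, TCP_FASTOPEN_KEY, keys, sizeof(keys));
*/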
3180*4882a593Smuzhiyun default:
3181*4882a593Smuzhiyun /* fallthru */
3182*4882a593Smuzhiyun break;
3183*4882a593Smuzhiyun }
3184*4882a593Smuzhiyun
3185*4882a593Smuzhiyun if (optlen < sizeof(int))
3186*4882a593Smuzhiyun return -EINVAL;
3187*4882a593Smuzhiyun
3188*4882a593Smuzhiyun if (copy_from_sockptr(&val, optval, sizeof(val)))
3189*4882a593Smuzhiyun return -EFAULT;
3190*4882a593Smuzhiyun
3191*4882a593Smuzhiyun lock_sock(sk);
3192*4882a593Smuzhiyun
3193*4882a593Smuzhiyun switch (optname) {
3194*4882a593Smuzhiyun case TCP_MAXSEG:
3195*4882a593Smuzhiyun /* Values greater than the interface MTU won't take effect. However,
3196*4882a593Smuzhiyun * at the point when this call is done we typically don't yet
3197*4882a593Smuzhiyun * know which interface is going to be used.
3198*4882a593Smuzhiyun */
3199*4882a593Smuzhiyun if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
3200*4882a593Smuzhiyun err = -EINVAL;
3201*4882a593Smuzhiyun break;
3202*4882a593Smuzhiyun }
3203*4882a593Smuzhiyun tp->rx_opt.user_mss = val;
3204*4882a593Smuzhiyun break;
3205*4882a593Smuzhiyun
3206*4882a593Smuzhiyun case TCP_NODELAY:
3207*4882a593Smuzhiyun __tcp_sock_set_nodelay(sk, val);
3208*4882a593Smuzhiyun break;
3209*4882a593Smuzhiyun
3210*4882a593Smuzhiyun case TCP_THIN_LINEAR_TIMEOUTS:
3211*4882a593Smuzhiyun if (val < 0 || val > 1)
3212*4882a593Smuzhiyun err = -EINVAL;
3213*4882a593Smuzhiyun else
3214*4882a593Smuzhiyun tp->thin_lto = val;
3215*4882a593Smuzhiyun break;
3216*4882a593Smuzhiyun
3217*4882a593Smuzhiyun case TCP_THIN_DUPACK:
3218*4882a593Smuzhiyun if (val < 0 || val > 1)
3219*4882a593Smuzhiyun err = -EINVAL;
3220*4882a593Smuzhiyun break;
3221*4882a593Smuzhiyun
3222*4882a593Smuzhiyun case TCP_REPAIR:
3223*4882a593Smuzhiyun if (!tcp_can_repair_sock(sk))
3224*4882a593Smuzhiyun err = -EPERM;
3225*4882a593Smuzhiyun else if (val == TCP_REPAIR_ON) {
3226*4882a593Smuzhiyun tp->repair = 1;
3227*4882a593Smuzhiyun sk->sk_reuse = SK_FORCE_REUSE;
3228*4882a593Smuzhiyun tp->repair_queue = TCP_NO_QUEUE;
3229*4882a593Smuzhiyun } else if (val == TCP_REPAIR_OFF) {
3230*4882a593Smuzhiyun tp->repair = 0;
3231*4882a593Smuzhiyun sk->sk_reuse = SK_NO_REUSE;
3232*4882a593Smuzhiyun tcp_send_window_probe(sk);
3233*4882a593Smuzhiyun } else if (val == TCP_REPAIR_OFF_NO_WP) {
3234*4882a593Smuzhiyun tp->repair = 0;
3235*4882a593Smuzhiyun sk->sk_reuse = SK_NO_REUSE;
3236*4882a593Smuzhiyun } else
3237*4882a593Smuzhiyun err = -EINVAL;
3238*4882a593Smuzhiyun
3239*4882a593Smuzhiyun break;
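/* Restore sequence sketch (editor's addition, not kernel code): the order a
* CRIU-style tool typically uses around TCP_REPAIR; all values are
* placeholders and the socket must be bound but still closed:
*
*	int on = TCP_REPAIR_ON, off = TCP_REPAIR_OFF, q;
*
*	setsockopt(fd, SOL_TCP, TCP_REPAIR, &on, sizeof(on));
*	q = TCP_SEND_QUEUE;
*	setsockopt(fd, SOL_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
*	setsockopt(fd, SOL_TCP, TCP_QUEUE_SEQ, &snd_seq, sizeof(snd_seq));
*	q = TCP_RECV_QUEUE;
*	setsockopt(fd, SOL_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
*	setsockopt(fd, SOL_TCP, TCP_QUEUE_SEQ, &rcv_seq, sizeof(rcv_seq));
*	connect(fd, (struct sockaddr *)&peer, sizeof(peer));
*	setsockopt(fd, SOL_TCP, TCP_REPAIR, &off, sizeof(off));
*/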
3240*4882a593Smuzhiyun
3241*4882a593Smuzhiyun case TCP_REPAIR_QUEUE:
3242*4882a593Smuzhiyun if (!tp->repair)
3243*4882a593Smuzhiyun err = -EPERM;
3244*4882a593Smuzhiyun else if ((unsigned int)val < TCP_QUEUES_NR)
3245*4882a593Smuzhiyun tp->repair_queue = val;
3246*4882a593Smuzhiyun else
3247*4882a593Smuzhiyun err = -EINVAL;
3248*4882a593Smuzhiyun break;
3249*4882a593Smuzhiyun
3250*4882a593Smuzhiyun case TCP_QUEUE_SEQ:
3251*4882a593Smuzhiyun if (sk->sk_state != TCP_CLOSE) {
3252*4882a593Smuzhiyun err = -EPERM;
3253*4882a593Smuzhiyun } else if (tp->repair_queue == TCP_SEND_QUEUE) {
3254*4882a593Smuzhiyun if (!tcp_rtx_queue_empty(sk))
3255*4882a593Smuzhiyun err = -EPERM;
3256*4882a593Smuzhiyun else
3257*4882a593Smuzhiyun WRITE_ONCE(tp->write_seq, val);
3258*4882a593Smuzhiyun } else if (tp->repair_queue == TCP_RECV_QUEUE) {
3259*4882a593Smuzhiyun if (tp->rcv_nxt != tp->copied_seq) {
3260*4882a593Smuzhiyun err = -EPERM;
3261*4882a593Smuzhiyun } else {
3262*4882a593Smuzhiyun WRITE_ONCE(tp->rcv_nxt, val);
3263*4882a593Smuzhiyun WRITE_ONCE(tp->copied_seq, val);
3264*4882a593Smuzhiyun }
3265*4882a593Smuzhiyun } else {
3266*4882a593Smuzhiyun err = -EINVAL;
3267*4882a593Smuzhiyun }
3268*4882a593Smuzhiyun break;
3269*4882a593Smuzhiyun
3270*4882a593Smuzhiyun case TCP_REPAIR_OPTIONS:
3271*4882a593Smuzhiyun if (!tp->repair)
3272*4882a593Smuzhiyun err = -EINVAL;
3273*4882a593Smuzhiyun else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
3274*4882a593Smuzhiyun err = tcp_repair_options_est(sk, optval, optlen);
3275*4882a593Smuzhiyun else
3276*4882a593Smuzhiyun err = -EPERM;
3277*4882a593Smuzhiyun break;
3278*4882a593Smuzhiyun
3279*4882a593Smuzhiyun case TCP_CORK:
3280*4882a593Smuzhiyun __tcp_sock_set_cork(sk, val);
3281*4882a593Smuzhiyun break;
3282*4882a593Smuzhiyun
3283*4882a593Smuzhiyun case TCP_KEEPIDLE:
3284*4882a593Smuzhiyun err = tcp_sock_set_keepidle_locked(sk, val);
3285*4882a593Smuzhiyun break;
3286*4882a593Smuzhiyun case TCP_KEEPINTVL:
3287*4882a593Smuzhiyun if (val < 1 || val > MAX_TCP_KEEPINTVL)
3288*4882a593Smuzhiyun err = -EINVAL;
3289*4882a593Smuzhiyun else
3290*4882a593Smuzhiyun tp->keepalive_intvl = val * HZ;
3291*4882a593Smuzhiyun break;
3292*4882a593Smuzhiyun case TCP_KEEPCNT:
3293*4882a593Smuzhiyun if (val < 1 || val > MAX_TCP_KEEPCNT)
3294*4882a593Smuzhiyun err = -EINVAL;
3295*4882a593Smuzhiyun else
3296*4882a593Smuzhiyun tp->keepalive_probes = val;
3297*4882a593Smuzhiyun break;
3298*4882a593Smuzhiyun case TCP_SYNCNT:
3299*4882a593Smuzhiyun if (val < 1 || val > MAX_TCP_SYNCNT)
3300*4882a593Smuzhiyun err = -EINVAL;
3301*4882a593Smuzhiyun else
3302*4882a593Smuzhiyun icsk->icsk_syn_retries = val;
3303*4882a593Smuzhiyun break;
3304*4882a593Smuzhiyun
3305*4882a593Smuzhiyun case TCP_SAVE_SYN:
3306*4882a593Smuzhiyun /* 0: disable, 1: enable, 2: start from ether_header */
3307*4882a593Smuzhiyun if (val < 0 || val > 2)
3308*4882a593Smuzhiyun err = -EINVAL;
3309*4882a593Smuzhiyun else
3310*4882a593Smuzhiyun tp->save_syn = val;
3311*4882a593Smuzhiyun break;
3312*4882a593Smuzhiyun
3313*4882a593Smuzhiyun case TCP_LINGER2:
3314*4882a593Smuzhiyun if (val < 0)
3315*4882a593Smuzhiyun tp->linger2 = -1;
3316*4882a593Smuzhiyun else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3317*4882a593Smuzhiyun tp->linger2 = TCP_FIN_TIMEOUT_MAX;
3318*4882a593Smuzhiyun else
3319*4882a593Smuzhiyun tp->linger2 = val * HZ;
3320*4882a593Smuzhiyun break;
3321*4882a593Smuzhiyun
3322*4882a593Smuzhiyun case TCP_DEFER_ACCEPT:
3323*4882a593Smuzhiyun /* Translate value in seconds to number of retransmits */
3324*4882a593Smuzhiyun icsk->icsk_accept_queue.rskq_defer_accept =
3325*4882a593Smuzhiyun secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3326*4882a593Smuzhiyun TCP_RTO_MAX / HZ);
3327*4882a593Smuzhiyun break;
3328*4882a593Smuzhiyun
3329*4882a593Smuzhiyun case TCP_WINDOW_CLAMP:
3330*4882a593Smuzhiyun if (!val) {
3331*4882a593Smuzhiyun if (sk->sk_state != TCP_CLOSE) {
3332*4882a593Smuzhiyun err = -EINVAL;
3333*4882a593Smuzhiyun break;
3334*4882a593Smuzhiyun }
3335*4882a593Smuzhiyun tp->window_clamp = 0;
3336*4882a593Smuzhiyun } else
3337*4882a593Smuzhiyun tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
3338*4882a593Smuzhiyun SOCK_MIN_RCVBUF / 2 : val;
3339*4882a593Smuzhiyun break;
3340*4882a593Smuzhiyun
3341*4882a593Smuzhiyun case TCP_QUICKACK:
3342*4882a593Smuzhiyun __tcp_sock_set_quickack(sk, val);
3343*4882a593Smuzhiyun break;
3344*4882a593Smuzhiyun
3345*4882a593Smuzhiyun #ifdef CONFIG_TCP_MD5SIG
3346*4882a593Smuzhiyun case TCP_MD5SIG:
3347*4882a593Smuzhiyun case TCP_MD5SIG_EXT:
3348*4882a593Smuzhiyun err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
3349*4882a593Smuzhiyun break;
3350*4882a593Smuzhiyun #endif
3351*4882a593Smuzhiyun case TCP_USER_TIMEOUT:
3352*4882a593Smuzhiyun /* Cap the max time in ms TCP will retry or probe the window
3353*4882a593Smuzhiyun * before giving up and aborting (ETIMEDOUT) a connection.
3354*4882a593Smuzhiyun */
3355*4882a593Smuzhiyun if (val < 0)
3356*4882a593Smuzhiyun err = -EINVAL;
3357*4882a593Smuzhiyun else
3358*4882a593Smuzhiyun icsk->icsk_user_timeout = val;
3359*4882a593Smuzhiyun break;
3360*4882a593Smuzhiyun
3361*4882a593Smuzhiyun case TCP_FASTOPEN:
3362*4882a593Smuzhiyun if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3363*4882a593Smuzhiyun TCPF_LISTEN))) {
3364*4882a593Smuzhiyun tcp_fastopen_init_key_once(net);
3365*4882a593Smuzhiyun
3366*4882a593Smuzhiyun fastopen_queue_tune(sk, val);
3367*4882a593Smuzhiyun } else {
3368*4882a593Smuzhiyun err = -EINVAL;
3369*4882a593Smuzhiyun }
3370*4882a593Smuzhiyun break;
3371*4882a593Smuzhiyun case TCP_FASTOPEN_CONNECT:
3372*4882a593Smuzhiyun if (val > 1 || val < 0) {
3373*4882a593Smuzhiyun err = -EINVAL;
3374*4882a593Smuzhiyun } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
3375*4882a593Smuzhiyun TFO_CLIENT_ENABLE) {
3376*4882a593Smuzhiyun if (sk->sk_state == TCP_CLOSE)
3377*4882a593Smuzhiyun tp->fastopen_connect = val;
3378*4882a593Smuzhiyun else
3379*4882a593Smuzhiyun err = -EINVAL;
3380*4882a593Smuzhiyun } else {
3381*4882a593Smuzhiyun err = -EOPNOTSUPP;
3382*4882a593Smuzhiyun }
3383*4882a593Smuzhiyun break;
3384*4882a593Smuzhiyun case TCP_FASTOPEN_NO_COOKIE:
3385*4882a593Smuzhiyun if (val > 1 || val < 0)
3386*4882a593Smuzhiyun err = -EINVAL;
3387*4882a593Smuzhiyun else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3388*4882a593Smuzhiyun err = -EINVAL;
3389*4882a593Smuzhiyun else
3390*4882a593Smuzhiyun tp->fastopen_no_cookie = val;
3391*4882a593Smuzhiyun break;
3392*4882a593Smuzhiyun case TCP_TIMESTAMP:
3393*4882a593Smuzhiyun if (!tp->repair)
3394*4882a593Smuzhiyun err = -EPERM;
3395*4882a593Smuzhiyun else
3396*4882a593Smuzhiyun tp->tsoffset = val - tcp_time_stamp_raw();
3397*4882a593Smuzhiyun break;
3398*4882a593Smuzhiyun case TCP_REPAIR_WINDOW:
3399*4882a593Smuzhiyun err = tcp_repair_set_window(tp, optval, optlen);
3400*4882a593Smuzhiyun break;
3401*4882a593Smuzhiyun case TCP_NOTSENT_LOWAT:
3402*4882a593Smuzhiyun tp->notsent_lowat = val;
3403*4882a593Smuzhiyun sk->sk_write_space(sk);
3404*4882a593Smuzhiyun break;
3405*4882a593Smuzhiyun case TCP_INQ:
3406*4882a593Smuzhiyun if (val > 1 || val < 0)
3407*4882a593Smuzhiyun err = -EINVAL;
3408*4882a593Smuzhiyun else
3409*4882a593Smuzhiyun tp->recvmsg_inq = val;
3410*4882a593Smuzhiyun break;
3411*4882a593Smuzhiyun case TCP_TX_DELAY:
3412*4882a593Smuzhiyun if (val)
3413*4882a593Smuzhiyun tcp_enable_tx_delay();
3414*4882a593Smuzhiyun tp->tcp_tx_delay = val;
3415*4882a593Smuzhiyun break;
3416*4882a593Smuzhiyun default:
3417*4882a593Smuzhiyun err = -ENOPROTOOPT;
3418*4882a593Smuzhiyun break;
3419*4882a593Smuzhiyun }
3420*4882a593Smuzhiyun
3421*4882a593Smuzhiyun release_sock(sk);
3422*4882a593Smuzhiyun return err;
3423*4882a593Smuzhiyun }
3424*4882a593Smuzhiyun
3425*4882a593Smuzhiyun int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
3426*4882a593Smuzhiyun unsigned int optlen)
3427*4882a593Smuzhiyun {
3428*4882a593Smuzhiyun const struct inet_connection_sock *icsk = inet_csk(sk);
3429*4882a593Smuzhiyun
3430*4882a593Smuzhiyun if (level != SOL_TCP)
3431*4882a593Smuzhiyun return icsk->icsk_af_ops->setsockopt(sk, level, optname,
3432*4882a593Smuzhiyun optval, optlen);
3433*4882a593Smuzhiyun return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3434*4882a593Smuzhiyun }
3435*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_setsockopt);
3436*4882a593Smuzhiyun
3437*4882a593Smuzhiyun static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3438*4882a593Smuzhiyun struct tcp_info *info)
3439*4882a593Smuzhiyun {
3440*4882a593Smuzhiyun u64 stats[__TCP_CHRONO_MAX], total = 0;
3441*4882a593Smuzhiyun enum tcp_chrono i;
3442*4882a593Smuzhiyun
3443*4882a593Smuzhiyun for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3444*4882a593Smuzhiyun stats[i] = tp->chrono_stat[i - 1];
3445*4882a593Smuzhiyun if (i == tp->chrono_type)
3446*4882a593Smuzhiyun stats[i] += tcp_jiffies32 - tp->chrono_start;
3447*4882a593Smuzhiyun stats[i] *= USEC_PER_SEC / HZ;
3448*4882a593Smuzhiyun total += stats[i];
3449*4882a593Smuzhiyun }
3450*4882a593Smuzhiyun
3451*4882a593Smuzhiyun info->tcpi_busy_time = total;
3452*4882a593Smuzhiyun info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3453*4882a593Smuzhiyun info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3454*4882a593Smuzhiyun }
3455*4882a593Smuzhiyun
3456*4882a593Smuzhiyun /* Return information about state of tcp endpoint in API format. */
3457*4882a593Smuzhiyun void tcp_get_info(struct sock *sk, struct tcp_info *info)
3458*4882a593Smuzhiyun {
3459*4882a593Smuzhiyun const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
3460*4882a593Smuzhiyun const struct inet_connection_sock *icsk = inet_csk(sk);
3461*4882a593Smuzhiyun unsigned long rate;
3462*4882a593Smuzhiyun u32 now;
3463*4882a593Smuzhiyun u64 rate64;
3464*4882a593Smuzhiyun bool slow;
3465*4882a593Smuzhiyun
3466*4882a593Smuzhiyun memset(info, 0, sizeof(*info));
3467*4882a593Smuzhiyun if (sk->sk_type != SOCK_STREAM)
3468*4882a593Smuzhiyun return;
3469*4882a593Smuzhiyun
3470*4882a593Smuzhiyun info->tcpi_state = inet_sk_state_load(sk);
3471*4882a593Smuzhiyun
3472*4882a593Smuzhiyun /* Report meaningful fields for all TCP states, including listeners */
3473*4882a593Smuzhiyun rate = READ_ONCE(sk->sk_pacing_rate);
3474*4882a593Smuzhiyun rate64 = (rate != ~0UL) ? rate : ~0ULL;
3475*4882a593Smuzhiyun info->tcpi_pacing_rate = rate64;
3476*4882a593Smuzhiyun
3477*4882a593Smuzhiyun rate = READ_ONCE(sk->sk_max_pacing_rate);
3478*4882a593Smuzhiyun rate64 = (rate != ~0UL) ? rate : ~0ULL;
3479*4882a593Smuzhiyun info->tcpi_max_pacing_rate = rate64;
3480*4882a593Smuzhiyun
3481*4882a593Smuzhiyun info->tcpi_reordering = tp->reordering;
3482*4882a593Smuzhiyun info->tcpi_snd_cwnd = tp->snd_cwnd;
3483*4882a593Smuzhiyun
3484*4882a593Smuzhiyun if (info->tcpi_state == TCP_LISTEN) {
3485*4882a593Smuzhiyun /* Listener-aliased fields:
3486*4882a593Smuzhiyun * tcpi_unacked -> Number of children ready for accept()
3487*4882a593Smuzhiyun * tcpi_sacked -> max backlog
3488*4882a593Smuzhiyun */
3489*4882a593Smuzhiyun info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
3490*4882a593Smuzhiyun info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
3491*4882a593Smuzhiyun return;
3492*4882a593Smuzhiyun }
3493*4882a593Smuzhiyun
3494*4882a593Smuzhiyun slow = lock_sock_fast(sk);
3495*4882a593Smuzhiyun
3496*4882a593Smuzhiyun info->tcpi_ca_state = icsk->icsk_ca_state;
3497*4882a593Smuzhiyun info->tcpi_retransmits = icsk->icsk_retransmits;
3498*4882a593Smuzhiyun info->tcpi_probes = icsk->icsk_probes_out;
3499*4882a593Smuzhiyun info->tcpi_backoff = icsk->icsk_backoff;
3500*4882a593Smuzhiyun
3501*4882a593Smuzhiyun if (tp->rx_opt.tstamp_ok)
3502*4882a593Smuzhiyun info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3503*4882a593Smuzhiyun if (tcp_is_sack(tp))
3504*4882a593Smuzhiyun info->tcpi_options |= TCPI_OPT_SACK;
3505*4882a593Smuzhiyun if (tp->rx_opt.wscale_ok) {
3506*4882a593Smuzhiyun info->tcpi_options |= TCPI_OPT_WSCALE;
3507*4882a593Smuzhiyun info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3508*4882a593Smuzhiyun info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3509*4882a593Smuzhiyun }
3510*4882a593Smuzhiyun
3511*4882a593Smuzhiyun if (tp->ecn_flags & TCP_ECN_OK)
3512*4882a593Smuzhiyun info->tcpi_options |= TCPI_OPT_ECN;
3513*4882a593Smuzhiyun if (tp->ecn_flags & TCP_ECN_SEEN)
3514*4882a593Smuzhiyun info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3515*4882a593Smuzhiyun if (tp->syn_data_acked)
3516*4882a593Smuzhiyun info->tcpi_options |= TCPI_OPT_SYN_DATA;
3517*4882a593Smuzhiyun
3518*4882a593Smuzhiyun info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3519*4882a593Smuzhiyun info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3520*4882a593Smuzhiyun info->tcpi_snd_mss = tp->mss_cache;
3521*4882a593Smuzhiyun info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3522*4882a593Smuzhiyun
3523*4882a593Smuzhiyun info->tcpi_unacked = tp->packets_out;
3524*4882a593Smuzhiyun info->tcpi_sacked = tp->sacked_out;
3525*4882a593Smuzhiyun
3526*4882a593Smuzhiyun info->tcpi_lost = tp->lost_out;
3527*4882a593Smuzhiyun info->tcpi_retrans = tp->retrans_out;
3528*4882a593Smuzhiyun
3529*4882a593Smuzhiyun now = tcp_jiffies32;
3530*4882a593Smuzhiyun info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3531*4882a593Smuzhiyun info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3532*4882a593Smuzhiyun info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3533*4882a593Smuzhiyun
3534*4882a593Smuzhiyun info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3535*4882a593Smuzhiyun info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3536*4882a593Smuzhiyun info->tcpi_rtt = tp->srtt_us >> 3;
3537*4882a593Smuzhiyun info->tcpi_rttvar = tp->mdev_us >> 2;
3538*4882a593Smuzhiyun info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3539*4882a593Smuzhiyun info->tcpi_advmss = tp->advmss;
3540*4882a593Smuzhiyun
3541*4882a593Smuzhiyun info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3542*4882a593Smuzhiyun info->tcpi_rcv_space = tp->rcvq_space.space;
3543*4882a593Smuzhiyun
3544*4882a593Smuzhiyun info->tcpi_total_retrans = tp->total_retrans;
3545*4882a593Smuzhiyun
3546*4882a593Smuzhiyun info->tcpi_bytes_acked = tp->bytes_acked;
3547*4882a593Smuzhiyun info->tcpi_bytes_received = tp->bytes_received;
3548*4882a593Smuzhiyun info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3549*4882a593Smuzhiyun tcp_get_info_chrono_stats(tp, info);
3550*4882a593Smuzhiyun
3551*4882a593Smuzhiyun info->tcpi_segs_out = tp->segs_out;
3552*4882a593Smuzhiyun info->tcpi_segs_in = tp->segs_in;
3553*4882a593Smuzhiyun
3554*4882a593Smuzhiyun info->tcpi_min_rtt = tcp_min_rtt(tp);
3555*4882a593Smuzhiyun info->tcpi_data_segs_in = tp->data_segs_in;
3556*4882a593Smuzhiyun info->tcpi_data_segs_out = tp->data_segs_out;
3557*4882a593Smuzhiyun
3558*4882a593Smuzhiyun info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3559*4882a593Smuzhiyun rate64 = tcp_compute_delivery_rate(tp);
3560*4882a593Smuzhiyun if (rate64)
3561*4882a593Smuzhiyun info->tcpi_delivery_rate = rate64;
3562*4882a593Smuzhiyun info->tcpi_delivered = tp->delivered;
3563*4882a593Smuzhiyun info->tcpi_delivered_ce = tp->delivered_ce;
3564*4882a593Smuzhiyun info->tcpi_bytes_sent = tp->bytes_sent;
3565*4882a593Smuzhiyun info->tcpi_bytes_retrans = tp->bytes_retrans;
3566*4882a593Smuzhiyun info->tcpi_dsack_dups = tp->dsack_dups;
3567*4882a593Smuzhiyun info->tcpi_reord_seen = tp->reord_seen;
3568*4882a593Smuzhiyun info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3569*4882a593Smuzhiyun info->tcpi_snd_wnd = tp->snd_wnd;
3570*4882a593Smuzhiyun info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
3571*4882a593Smuzhiyun unlock_sock_fast(sk, slow);
3572*4882a593Smuzhiyun }
3573*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(tcp_get_info);
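/*
 * Hedged userspace sketch (illustrative, not part of this file): the usual
 * consumer of tcp_get_info() is a plain getsockopt(TCP_INFO) call; "fd" below
 * is an assumed connected TCP socket descriptor.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>	// TCP_INFO, struct tcp_info
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt=%u us cwnd=%u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */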
3574*4882a593Smuzhiyun
3575*4882a593Smuzhiyun static size_t tcp_opt_stats_get_size(void)
3576*4882a593Smuzhiyun {
3577*4882a593Smuzhiyun return
3578*4882a593Smuzhiyun nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
3579*4882a593Smuzhiyun nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
3580*4882a593Smuzhiyun nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
3581*4882a593Smuzhiyun nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
3582*4882a593Smuzhiyun nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
3583*4882a593Smuzhiyun nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
3584*4882a593Smuzhiyun nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
3585*4882a593Smuzhiyun nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
3586*4882a593Smuzhiyun nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
3587*4882a593Smuzhiyun nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
3588*4882a593Smuzhiyun nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
3589*4882a593Smuzhiyun nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
3590*4882a593Smuzhiyun nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
3591*4882a593Smuzhiyun nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
3592*4882a593Smuzhiyun nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
3593*4882a593Smuzhiyun nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
3594*4882a593Smuzhiyun nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
3595*4882a593Smuzhiyun nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
3596*4882a593Smuzhiyun nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
3597*4882a593Smuzhiyun nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
3598*4882a593Smuzhiyun nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
3599*4882a593Smuzhiyun nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
3600*4882a593Smuzhiyun nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
3601*4882a593Smuzhiyun nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
3602*4882a593Smuzhiyun nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
3603*4882a593Smuzhiyun 0;
3604*4882a593Smuzhiyun }
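/*
 * Sizing note (editorial, hedged): nla_total_size(payload) accounts for the
 * 4-byte attribute header plus the aligned payload, e.g.
 * nla_total_size(sizeof(u32)) == 8.  nla_total_size_64bit() additionally
 * reserves room for a pad attribute (here TCP_NLA_PAD) on architectures
 * without efficient unaligned access, which is why the u64 attributes above
 * use the 64-bit variant.
 */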
3605*4882a593Smuzhiyun
3606*4882a593Smuzhiyun struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
3607*4882a593Smuzhiyun const struct sk_buff *orig_skb)
3608*4882a593Smuzhiyun {
3609*4882a593Smuzhiyun const struct tcp_sock *tp = tcp_sk(sk);
3610*4882a593Smuzhiyun struct sk_buff *stats;
3611*4882a593Smuzhiyun struct tcp_info info;
3612*4882a593Smuzhiyun unsigned long rate;
3613*4882a593Smuzhiyun u64 rate64;
3614*4882a593Smuzhiyun
3615*4882a593Smuzhiyun stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3616*4882a593Smuzhiyun if (!stats)
3617*4882a593Smuzhiyun return NULL;
3618*4882a593Smuzhiyun
3619*4882a593Smuzhiyun tcp_get_info_chrono_stats(tp, &info);
3620*4882a593Smuzhiyun nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3621*4882a593Smuzhiyun info.tcpi_busy_time, TCP_NLA_PAD);
3622*4882a593Smuzhiyun nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3623*4882a593Smuzhiyun info.tcpi_rwnd_limited, TCP_NLA_PAD);
3624*4882a593Smuzhiyun nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3625*4882a593Smuzhiyun info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3626*4882a593Smuzhiyun nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3627*4882a593Smuzhiyun tp->data_segs_out, TCP_NLA_PAD);
3628*4882a593Smuzhiyun nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3629*4882a593Smuzhiyun tp->total_retrans, TCP_NLA_PAD);
3630*4882a593Smuzhiyun
3631*4882a593Smuzhiyun rate = READ_ONCE(sk->sk_pacing_rate);
3632*4882a593Smuzhiyun rate64 = (rate != ~0UL) ? rate : ~0ULL;
3633*4882a593Smuzhiyun nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3634*4882a593Smuzhiyun
3635*4882a593Smuzhiyun rate64 = tcp_compute_delivery_rate(tp);
3636*4882a593Smuzhiyun nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3637*4882a593Smuzhiyun
3638*4882a593Smuzhiyun nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
3639*4882a593Smuzhiyun nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3640*4882a593Smuzhiyun nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
3641*4882a593Smuzhiyun
3642*4882a593Smuzhiyun nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3643*4882a593Smuzhiyun nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3644*4882a593Smuzhiyun nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3645*4882a593Smuzhiyun nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3646*4882a593Smuzhiyun nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3647*4882a593Smuzhiyun
3648*4882a593Smuzhiyun nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3649*4882a593Smuzhiyun nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3650*4882a593Smuzhiyun
3651*4882a593Smuzhiyun nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
3652*4882a593Smuzhiyun TCP_NLA_PAD);
3653*4882a593Smuzhiyun nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
3654*4882a593Smuzhiyun TCP_NLA_PAD);
3655*4882a593Smuzhiyun nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
3656*4882a593Smuzhiyun nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3657*4882a593Smuzhiyun nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
3658*4882a593Smuzhiyun nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
3659*4882a593Smuzhiyun nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
3660*4882a593Smuzhiyun max_t(int, 0, tp->write_seq - tp->snd_nxt));
3661*4882a593Smuzhiyun nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
3662*4882a593Smuzhiyun TCP_NLA_PAD);
3663*4882a593Smuzhiyun
3664*4882a593Smuzhiyun return stats;
3665*4882a593Smuzhiyun }
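/*
 * Hedged userspace sketch (illustrative): these opt_stats reach applications
 * as a nested netlink blob in a SCM_TIMESTAMPING_OPT_STATS control message
 * once SOF_TIMESTAMPING_OPT_STATS is enabled.  "fd" and the error-queue read
 * loop are assumptions, not code from this file.
 *
 *	#include <sys/socket.h>
 *	#include <linux/net_tstamp.h>	// SOF_TIMESTAMPING_* flags
 *
 *	unsigned int flags = SOF_TIMESTAMPING_SOFTWARE |
 *			     SOF_TIMESTAMPING_TX_ACK |
 *			     SOF_TIMESTAMPING_OPT_STATS;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
 *	// Control messages with cmsg_level == SOL_SOCKET and
 *	// cmsg_type == SCM_TIMESTAMPING_OPT_STATS then carry the TCP_NLA_*
 *	// attributes built above.
 */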
3666*4882a593Smuzhiyun
3667*4882a593Smuzhiyun static int do_tcp_getsockopt(struct sock *sk, int level,
3668*4882a593Smuzhiyun int optname, char __user *optval, int __user *optlen)
3669*4882a593Smuzhiyun {
3670*4882a593Smuzhiyun struct inet_connection_sock *icsk = inet_csk(sk);
3671*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
3672*4882a593Smuzhiyun struct net *net = sock_net(sk);
3673*4882a593Smuzhiyun int val, len;
3674*4882a593Smuzhiyun
3675*4882a593Smuzhiyun if (get_user(len, optlen))
3676*4882a593Smuzhiyun return -EFAULT;
3677*4882a593Smuzhiyun
3678*4882a593Smuzhiyun len = min_t(unsigned int, len, sizeof(int));
3679*4882a593Smuzhiyun
3680*4882a593Smuzhiyun if (len < 0)
3681*4882a593Smuzhiyun return -EINVAL;
3682*4882a593Smuzhiyun
3683*4882a593Smuzhiyun switch (optname) {
3684*4882a593Smuzhiyun case TCP_MAXSEG:
3685*4882a593Smuzhiyun val = tp->mss_cache;
3686*4882a593Smuzhiyun if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3687*4882a593Smuzhiyun val = tp->rx_opt.user_mss;
3688*4882a593Smuzhiyun if (tp->repair)
3689*4882a593Smuzhiyun val = tp->rx_opt.mss_clamp;
3690*4882a593Smuzhiyun break;
3691*4882a593Smuzhiyun case TCP_NODELAY:
3692*4882a593Smuzhiyun val = !!(tp->nonagle&TCP_NAGLE_OFF);
3693*4882a593Smuzhiyun break;
3694*4882a593Smuzhiyun case TCP_CORK:
3695*4882a593Smuzhiyun val = !!(tp->nonagle&TCP_NAGLE_CORK);
3696*4882a593Smuzhiyun break;
3697*4882a593Smuzhiyun case TCP_KEEPIDLE:
3698*4882a593Smuzhiyun val = keepalive_time_when(tp) / HZ;
3699*4882a593Smuzhiyun break;
3700*4882a593Smuzhiyun case TCP_KEEPINTVL:
3701*4882a593Smuzhiyun val = keepalive_intvl_when(tp) / HZ;
3702*4882a593Smuzhiyun break;
3703*4882a593Smuzhiyun case TCP_KEEPCNT:
3704*4882a593Smuzhiyun val = keepalive_probes(tp);
3705*4882a593Smuzhiyun break;
3706*4882a593Smuzhiyun case TCP_SYNCNT:
3707*4882a593Smuzhiyun val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
3708*4882a593Smuzhiyun break;
3709*4882a593Smuzhiyun case TCP_LINGER2:
3710*4882a593Smuzhiyun val = tp->linger2;
3711*4882a593Smuzhiyun if (val >= 0)
3712*4882a593Smuzhiyun val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
3713*4882a593Smuzhiyun break;
3714*4882a593Smuzhiyun case TCP_DEFER_ACCEPT:
3715*4882a593Smuzhiyun val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3716*4882a593Smuzhiyun TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
3717*4882a593Smuzhiyun break;
3718*4882a593Smuzhiyun case TCP_WINDOW_CLAMP:
3719*4882a593Smuzhiyun val = tp->window_clamp;
3720*4882a593Smuzhiyun break;
3721*4882a593Smuzhiyun case TCP_INFO: {
3722*4882a593Smuzhiyun struct tcp_info info;
3723*4882a593Smuzhiyun
3724*4882a593Smuzhiyun if (get_user(len, optlen))
3725*4882a593Smuzhiyun return -EFAULT;
3726*4882a593Smuzhiyun
3727*4882a593Smuzhiyun tcp_get_info(sk, &info);
3728*4882a593Smuzhiyun
3729*4882a593Smuzhiyun len = min_t(unsigned int, len, sizeof(info));
3730*4882a593Smuzhiyun if (put_user(len, optlen))
3731*4882a593Smuzhiyun return -EFAULT;
3732*4882a593Smuzhiyun if (copy_to_user(optval, &info, len))
3733*4882a593Smuzhiyun return -EFAULT;
3734*4882a593Smuzhiyun return 0;
3735*4882a593Smuzhiyun }
3736*4882a593Smuzhiyun case TCP_CC_INFO: {
3737*4882a593Smuzhiyun const struct tcp_congestion_ops *ca_ops;
3738*4882a593Smuzhiyun union tcp_cc_info info;
3739*4882a593Smuzhiyun size_t sz = 0;
3740*4882a593Smuzhiyun int attr;
3741*4882a593Smuzhiyun
3742*4882a593Smuzhiyun if (get_user(len, optlen))
3743*4882a593Smuzhiyun return -EFAULT;
3744*4882a593Smuzhiyun
3745*4882a593Smuzhiyun ca_ops = icsk->icsk_ca_ops;
3746*4882a593Smuzhiyun if (ca_ops && ca_ops->get_info)
3747*4882a593Smuzhiyun sz = ca_ops->get_info(sk, ~0U, &attr, &info);
3748*4882a593Smuzhiyun
3749*4882a593Smuzhiyun len = min_t(unsigned int, len, sz);
3750*4882a593Smuzhiyun if (put_user(len, optlen))
3751*4882a593Smuzhiyun return -EFAULT;
3752*4882a593Smuzhiyun if (copy_to_user(optval, &info, len))
3753*4882a593Smuzhiyun return -EFAULT;
3754*4882a593Smuzhiyun return 0;
3755*4882a593Smuzhiyun }
3756*4882a593Smuzhiyun case TCP_QUICKACK:
3757*4882a593Smuzhiyun val = !inet_csk_in_pingpong_mode(sk);
3758*4882a593Smuzhiyun break;
3759*4882a593Smuzhiyun
3760*4882a593Smuzhiyun case TCP_CONGESTION:
3761*4882a593Smuzhiyun if (get_user(len, optlen))
3762*4882a593Smuzhiyun return -EFAULT;
3763*4882a593Smuzhiyun len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
3764*4882a593Smuzhiyun if (put_user(len, optlen))
3765*4882a593Smuzhiyun return -EFAULT;
3766*4882a593Smuzhiyun if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
3767*4882a593Smuzhiyun return -EFAULT;
3768*4882a593Smuzhiyun return 0;
3769*4882a593Smuzhiyun
3770*4882a593Smuzhiyun case TCP_ULP:
3771*4882a593Smuzhiyun if (get_user(len, optlen))
3772*4882a593Smuzhiyun return -EFAULT;
3773*4882a593Smuzhiyun len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
3774*4882a593Smuzhiyun if (!icsk->icsk_ulp_ops) {
3775*4882a593Smuzhiyun if (put_user(0, optlen))
3776*4882a593Smuzhiyun return -EFAULT;
3777*4882a593Smuzhiyun return 0;
3778*4882a593Smuzhiyun }
3779*4882a593Smuzhiyun if (put_user(len, optlen))
3780*4882a593Smuzhiyun return -EFAULT;
3781*4882a593Smuzhiyun if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
3782*4882a593Smuzhiyun return -EFAULT;
3783*4882a593Smuzhiyun return 0;
3784*4882a593Smuzhiyun
3785*4882a593Smuzhiyun case TCP_FASTOPEN_KEY: {
3786*4882a593Smuzhiyun u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
3787*4882a593Smuzhiyun unsigned int key_len;
3788*4882a593Smuzhiyun
3789*4882a593Smuzhiyun if (get_user(len, optlen))
3790*4882a593Smuzhiyun return -EFAULT;
3791*4882a593Smuzhiyun
3792*4882a593Smuzhiyun key_len = tcp_fastopen_get_cipher(net, icsk, key) *
3793*4882a593Smuzhiyun TCP_FASTOPEN_KEY_LENGTH;
3794*4882a593Smuzhiyun len = min_t(unsigned int, len, key_len);
3795*4882a593Smuzhiyun if (put_user(len, optlen))
3796*4882a593Smuzhiyun return -EFAULT;
3797*4882a593Smuzhiyun if (copy_to_user(optval, key, len))
3798*4882a593Smuzhiyun return -EFAULT;
3799*4882a593Smuzhiyun return 0;
3800*4882a593Smuzhiyun }
3801*4882a593Smuzhiyun case TCP_THIN_LINEAR_TIMEOUTS:
3802*4882a593Smuzhiyun val = tp->thin_lto;
3803*4882a593Smuzhiyun break;
3804*4882a593Smuzhiyun
3805*4882a593Smuzhiyun case TCP_THIN_DUPACK:
3806*4882a593Smuzhiyun val = 0;
3807*4882a593Smuzhiyun break;
3808*4882a593Smuzhiyun
3809*4882a593Smuzhiyun case TCP_REPAIR:
3810*4882a593Smuzhiyun val = tp->repair;
3811*4882a593Smuzhiyun break;
3812*4882a593Smuzhiyun
3813*4882a593Smuzhiyun case TCP_REPAIR_QUEUE:
3814*4882a593Smuzhiyun if (tp->repair)
3815*4882a593Smuzhiyun val = tp->repair_queue;
3816*4882a593Smuzhiyun else
3817*4882a593Smuzhiyun return -EINVAL;
3818*4882a593Smuzhiyun break;
3819*4882a593Smuzhiyun
3820*4882a593Smuzhiyun case TCP_REPAIR_WINDOW: {
3821*4882a593Smuzhiyun struct tcp_repair_window opt;
3822*4882a593Smuzhiyun
3823*4882a593Smuzhiyun if (get_user(len, optlen))
3824*4882a593Smuzhiyun return -EFAULT;
3825*4882a593Smuzhiyun
3826*4882a593Smuzhiyun if (len != sizeof(opt))
3827*4882a593Smuzhiyun return -EINVAL;
3828*4882a593Smuzhiyun
3829*4882a593Smuzhiyun if (!tp->repair)
3830*4882a593Smuzhiyun return -EPERM;
3831*4882a593Smuzhiyun
3832*4882a593Smuzhiyun opt.snd_wl1 = tp->snd_wl1;
3833*4882a593Smuzhiyun opt.snd_wnd = tp->snd_wnd;
3834*4882a593Smuzhiyun opt.max_window = tp->max_window;
3835*4882a593Smuzhiyun opt.rcv_wnd = tp->rcv_wnd;
3836*4882a593Smuzhiyun opt.rcv_wup = tp->rcv_wup;
3837*4882a593Smuzhiyun
3838*4882a593Smuzhiyun if (copy_to_user(optval, &opt, len))
3839*4882a593Smuzhiyun return -EFAULT;
3840*4882a593Smuzhiyun return 0;
3841*4882a593Smuzhiyun }
3842*4882a593Smuzhiyun case TCP_QUEUE_SEQ:
3843*4882a593Smuzhiyun if (tp->repair_queue == TCP_SEND_QUEUE)
3844*4882a593Smuzhiyun val = tp->write_seq;
3845*4882a593Smuzhiyun else if (tp->repair_queue == TCP_RECV_QUEUE)
3846*4882a593Smuzhiyun val = tp->rcv_nxt;
3847*4882a593Smuzhiyun else
3848*4882a593Smuzhiyun return -EINVAL;
3849*4882a593Smuzhiyun break;
3850*4882a593Smuzhiyun
3851*4882a593Smuzhiyun case TCP_USER_TIMEOUT:
3852*4882a593Smuzhiyun val = icsk->icsk_user_timeout;
3853*4882a593Smuzhiyun break;
3854*4882a593Smuzhiyun
3855*4882a593Smuzhiyun case TCP_FASTOPEN:
3856*4882a593Smuzhiyun val = icsk->icsk_accept_queue.fastopenq.max_qlen;
3857*4882a593Smuzhiyun break;
3858*4882a593Smuzhiyun
3859*4882a593Smuzhiyun case TCP_FASTOPEN_CONNECT:
3860*4882a593Smuzhiyun val = tp->fastopen_connect;
3861*4882a593Smuzhiyun break;
3862*4882a593Smuzhiyun
3863*4882a593Smuzhiyun case TCP_FASTOPEN_NO_COOKIE:
3864*4882a593Smuzhiyun val = tp->fastopen_no_cookie;
3865*4882a593Smuzhiyun break;
3866*4882a593Smuzhiyun
3867*4882a593Smuzhiyun case TCP_TX_DELAY:
3868*4882a593Smuzhiyun val = tp->tcp_tx_delay;
3869*4882a593Smuzhiyun break;
3870*4882a593Smuzhiyun
3871*4882a593Smuzhiyun case TCP_TIMESTAMP:
3872*4882a593Smuzhiyun val = tcp_time_stamp_raw() + tp->tsoffset;
3873*4882a593Smuzhiyun break;
3874*4882a593Smuzhiyun case TCP_NOTSENT_LOWAT:
3875*4882a593Smuzhiyun val = tp->notsent_lowat;
3876*4882a593Smuzhiyun break;
3877*4882a593Smuzhiyun case TCP_INQ:
3878*4882a593Smuzhiyun val = tp->recvmsg_inq;
3879*4882a593Smuzhiyun break;
3880*4882a593Smuzhiyun case TCP_SAVE_SYN:
3881*4882a593Smuzhiyun val = tp->save_syn;
3882*4882a593Smuzhiyun break;
3883*4882a593Smuzhiyun case TCP_SAVED_SYN: {
3884*4882a593Smuzhiyun if (get_user(len, optlen))
3885*4882a593Smuzhiyun return -EFAULT;
3886*4882a593Smuzhiyun
3887*4882a593Smuzhiyun lock_sock(sk);
3888*4882a593Smuzhiyun if (tp->saved_syn) {
3889*4882a593Smuzhiyun if (len < tcp_saved_syn_len(tp->saved_syn)) {
3890*4882a593Smuzhiyun if (put_user(tcp_saved_syn_len(tp->saved_syn),
3891*4882a593Smuzhiyun optlen)) {
3892*4882a593Smuzhiyun release_sock(sk);
3893*4882a593Smuzhiyun return -EFAULT;
3894*4882a593Smuzhiyun }
3895*4882a593Smuzhiyun release_sock(sk);
3896*4882a593Smuzhiyun return -EINVAL;
3897*4882a593Smuzhiyun }
3898*4882a593Smuzhiyun len = tcp_saved_syn_len(tp->saved_syn);
3899*4882a593Smuzhiyun if (put_user(len, optlen)) {
3900*4882a593Smuzhiyun release_sock(sk);
3901*4882a593Smuzhiyun return -EFAULT;
3902*4882a593Smuzhiyun }
3903*4882a593Smuzhiyun if (copy_to_user(optval, tp->saved_syn->data, len)) {
3904*4882a593Smuzhiyun release_sock(sk);
3905*4882a593Smuzhiyun return -EFAULT;
3906*4882a593Smuzhiyun }
3907*4882a593Smuzhiyun tcp_saved_syn_free(tp);
3908*4882a593Smuzhiyun release_sock(sk);
3909*4882a593Smuzhiyun } else {
3910*4882a593Smuzhiyun release_sock(sk);
3911*4882a593Smuzhiyun len = 0;
3912*4882a593Smuzhiyun if (put_user(len, optlen))
3913*4882a593Smuzhiyun return -EFAULT;
3914*4882a593Smuzhiyun }
3915*4882a593Smuzhiyun return 0;
3916*4882a593Smuzhiyun }
3917*4882a593Smuzhiyun #ifdef CONFIG_MMU
3918*4882a593Smuzhiyun case TCP_ZEROCOPY_RECEIVE: {
3919*4882a593Smuzhiyun struct tcp_zerocopy_receive zc = {};
3920*4882a593Smuzhiyun int err;
3921*4882a593Smuzhiyun
3922*4882a593Smuzhiyun if (get_user(len, optlen))
3923*4882a593Smuzhiyun return -EFAULT;
3924*4882a593Smuzhiyun if (len < 0 ||
3925*4882a593Smuzhiyun len < offsetofend(struct tcp_zerocopy_receive, length))
3926*4882a593Smuzhiyun return -EINVAL;
3927*4882a593Smuzhiyun if (len > sizeof(zc)) {
3928*4882a593Smuzhiyun len = sizeof(zc);
3929*4882a593Smuzhiyun if (put_user(len, optlen))
3930*4882a593Smuzhiyun return -EFAULT;
3931*4882a593Smuzhiyun }
3932*4882a593Smuzhiyun if (copy_from_user(&zc, optval, len))
3933*4882a593Smuzhiyun return -EFAULT;
3934*4882a593Smuzhiyun lock_sock(sk);
3935*4882a593Smuzhiyun err = tcp_zerocopy_receive(sk, &zc);
3936*4882a593Smuzhiyun release_sock(sk);
3937*4882a593Smuzhiyun if (len >= offsetofend(struct tcp_zerocopy_receive, err))
3938*4882a593Smuzhiyun goto zerocopy_rcv_sk_err;
3939*4882a593Smuzhiyun switch (len) {
3940*4882a593Smuzhiyun case offsetofend(struct tcp_zerocopy_receive, err):
3941*4882a593Smuzhiyun goto zerocopy_rcv_sk_err;
3942*4882a593Smuzhiyun case offsetofend(struct tcp_zerocopy_receive, inq):
3943*4882a593Smuzhiyun goto zerocopy_rcv_inq;
3944*4882a593Smuzhiyun case offsetofend(struct tcp_zerocopy_receive, length):
3945*4882a593Smuzhiyun default:
3946*4882a593Smuzhiyun goto zerocopy_rcv_out;
3947*4882a593Smuzhiyun }
3948*4882a593Smuzhiyun zerocopy_rcv_sk_err:
3949*4882a593Smuzhiyun if (!err)
3950*4882a593Smuzhiyun zc.err = sock_error(sk);
3951*4882a593Smuzhiyun zerocopy_rcv_inq:
3952*4882a593Smuzhiyun zc.inq = tcp_inq_hint(sk);
3953*4882a593Smuzhiyun zerocopy_rcv_out:
3954*4882a593Smuzhiyun if (!err && copy_to_user(optval, &zc, len))
3955*4882a593Smuzhiyun err = -EFAULT;
3956*4882a593Smuzhiyun return err;
3957*4882a593Smuzhiyun }
3958*4882a593Smuzhiyun #endif
3959*4882a593Smuzhiyun default:
3960*4882a593Smuzhiyun return -ENOPROTOOPT;
3961*4882a593Smuzhiyun }
3962*4882a593Smuzhiyun
3963*4882a593Smuzhiyun if (put_user(len, optlen))
3964*4882a593Smuzhiyun return -EFAULT;
3965*4882a593Smuzhiyun if (copy_to_user(optval, &val, len))
3966*4882a593Smuzhiyun return -EFAULT;
3967*4882a593Smuzhiyun return 0;
3968*4882a593Smuzhiyun }
3969*4882a593Smuzhiyun
3970*4882a593Smuzhiyun int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
3971*4882a593Smuzhiyun int __user *optlen)
3972*4882a593Smuzhiyun {
3973*4882a593Smuzhiyun struct inet_connection_sock *icsk = inet_csk(sk);
3974*4882a593Smuzhiyun
3975*4882a593Smuzhiyun if (level != SOL_TCP)
3976*4882a593Smuzhiyun return icsk->icsk_af_ops->getsockopt(sk, level, optname,
3977*4882a593Smuzhiyun optval, optlen);
3978*4882a593Smuzhiyun return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3979*4882a593Smuzhiyun }
3980*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_getsockopt);
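/*
 * Hedged userspace sketch (illustrative): a typical path into
 * do_tcp_getsockopt() is an ordinary getsockopt() call, e.g. querying the
 * congestion control module in use.  "fd" is an assumed open TCP socket.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>	// TCP_CONGESTION
 *
 *	char ca[16];			// TCP_CA_NAME_MAX is 16
 *	socklen_t len = sizeof(ca);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, ca, &len) == 0)
 *		printf("congestion control: %.*s\n", (int)len, ca);
 */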
3981*4882a593Smuzhiyun
3982*4882a593Smuzhiyun #ifdef CONFIG_TCP_MD5SIG
3983*4882a593Smuzhiyun static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
3984*4882a593Smuzhiyun static DEFINE_MUTEX(tcp_md5sig_mutex);
3985*4882a593Smuzhiyun static bool tcp_md5sig_pool_populated = false;
3986*4882a593Smuzhiyun
3987*4882a593Smuzhiyun static void __tcp_alloc_md5sig_pool(void)
3988*4882a593Smuzhiyun {
3989*4882a593Smuzhiyun struct crypto_ahash *hash;
3990*4882a593Smuzhiyun int cpu;
3991*4882a593Smuzhiyun
3992*4882a593Smuzhiyun hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
3993*4882a593Smuzhiyun if (IS_ERR(hash))
3994*4882a593Smuzhiyun return;
3995*4882a593Smuzhiyun
3996*4882a593Smuzhiyun for_each_possible_cpu(cpu) {
3997*4882a593Smuzhiyun void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
3998*4882a593Smuzhiyun struct ahash_request *req;
3999*4882a593Smuzhiyun
4000*4882a593Smuzhiyun if (!scratch) {
4001*4882a593Smuzhiyun scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
4002*4882a593Smuzhiyun sizeof(struct tcphdr),
4003*4882a593Smuzhiyun GFP_KERNEL,
4004*4882a593Smuzhiyun cpu_to_node(cpu));
4005*4882a593Smuzhiyun if (!scratch)
4006*4882a593Smuzhiyun return;
4007*4882a593Smuzhiyun per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
4008*4882a593Smuzhiyun }
4009*4882a593Smuzhiyun if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
4010*4882a593Smuzhiyun continue;
4011*4882a593Smuzhiyun
4012*4882a593Smuzhiyun req = ahash_request_alloc(hash, GFP_KERNEL);
4013*4882a593Smuzhiyun if (!req)
4014*4882a593Smuzhiyun return;
4015*4882a593Smuzhiyun
4016*4882a593Smuzhiyun ahash_request_set_callback(req, 0, NULL, NULL);
4017*4882a593Smuzhiyun
4018*4882a593Smuzhiyun per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
4019*4882a593Smuzhiyun }
4020*4882a593Smuzhiyun /* before setting tcp_md5sig_pool_populated, we must commit all writes
4021*4882a593Smuzhiyun * to memory. See smp_rmb() in tcp_get_md5sig_pool()
4022*4882a593Smuzhiyun */
4023*4882a593Smuzhiyun smp_wmb();
4024*4882a593Smuzhiyun /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool()
4025*4882a593Smuzhiyun * and tcp_get_md5sig_pool().
4026*4882a593Smuzhiyun */
4027*4882a593Smuzhiyun WRITE_ONCE(tcp_md5sig_pool_populated, true);
4028*4882a593Smuzhiyun }
4029*4882a593Smuzhiyun
4030*4882a593Smuzhiyun bool tcp_alloc_md5sig_pool(void)
4031*4882a593Smuzhiyun {
4032*4882a593Smuzhiyun /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4033*4882a593Smuzhiyun if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) {
4034*4882a593Smuzhiyun mutex_lock(&tcp_md5sig_mutex);
4035*4882a593Smuzhiyun
4036*4882a593Smuzhiyun if (!tcp_md5sig_pool_populated) {
4037*4882a593Smuzhiyun __tcp_alloc_md5sig_pool();
4038*4882a593Smuzhiyun if (tcp_md5sig_pool_populated)
4039*4882a593Smuzhiyun static_branch_inc(&tcp_md5_needed);
4040*4882a593Smuzhiyun }
4041*4882a593Smuzhiyun
4042*4882a593Smuzhiyun mutex_unlock(&tcp_md5sig_mutex);
4043*4882a593Smuzhiyun }
4044*4882a593Smuzhiyun /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4045*4882a593Smuzhiyun return READ_ONCE(tcp_md5sig_pool_populated);
4046*4882a593Smuzhiyun }
4047*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
4048*4882a593Smuzhiyun
4049*4882a593Smuzhiyun
4050*4882a593Smuzhiyun /**
4051*4882a593Smuzhiyun * tcp_get_md5sig_pool - get md5sig_pool for this user
4052*4882a593Smuzhiyun *
4053*4882a593Smuzhiyun * We use percpu structure, so if we succeed, we exit with preemption
4054*4882a593Smuzhiyun * and BH disabled, to make sure another thread or softirq handling
4055*4882a593Smuzhiyun * won't try to get the same context.
4056*4882a593Smuzhiyun */
4057*4882a593Smuzhiyun struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
4058*4882a593Smuzhiyun {
4059*4882a593Smuzhiyun local_bh_disable();
4060*4882a593Smuzhiyun
4061*4882a593Smuzhiyun /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4062*4882a593Smuzhiyun if (READ_ONCE(tcp_md5sig_pool_populated)) {
4063*4882a593Smuzhiyun /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
4064*4882a593Smuzhiyun smp_rmb();
4065*4882a593Smuzhiyun return this_cpu_ptr(&tcp_md5sig_pool);
4066*4882a593Smuzhiyun }
4067*4882a593Smuzhiyun local_bh_enable();
4068*4882a593Smuzhiyun return NULL;
4069*4882a593Smuzhiyun }
4070*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_get_md5sig_pool);
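/*
 * Minimal in-kernel usage sketch (hedged): callers pair tcp_get_md5sig_pool()
 * with tcp_put_md5sig_pool() (a static inline in include/net/tcp.h that
 * re-enables BH); the hashing helpers used in between are the ones defined
 * below.
 *
 *	struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
 *
 *	if (hp) {
 *		// crypto_ahash_init(hp->md5_req), tcp_md5_hash_key(), ...
 *		tcp_put_md5sig_pool();
 *	}
 */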
4071*4882a593Smuzhiyun
4072*4882a593Smuzhiyun int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
4073*4882a593Smuzhiyun const struct sk_buff *skb, unsigned int header_len)
4074*4882a593Smuzhiyun {
4075*4882a593Smuzhiyun struct scatterlist sg;
4076*4882a593Smuzhiyun const struct tcphdr *tp = tcp_hdr(skb);
4077*4882a593Smuzhiyun struct ahash_request *req = hp->md5_req;
4078*4882a593Smuzhiyun unsigned int i;
4079*4882a593Smuzhiyun const unsigned int head_data_len = skb_headlen(skb) > header_len ?
4080*4882a593Smuzhiyun skb_headlen(skb) - header_len : 0;
4081*4882a593Smuzhiyun const struct skb_shared_info *shi = skb_shinfo(skb);
4082*4882a593Smuzhiyun struct sk_buff *frag_iter;
4083*4882a593Smuzhiyun
4084*4882a593Smuzhiyun sg_init_table(&sg, 1);
4085*4882a593Smuzhiyun
4086*4882a593Smuzhiyun sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
4087*4882a593Smuzhiyun ahash_request_set_crypt(req, &sg, NULL, head_data_len);
4088*4882a593Smuzhiyun if (crypto_ahash_update(req))
4089*4882a593Smuzhiyun return 1;
4090*4882a593Smuzhiyun
4091*4882a593Smuzhiyun for (i = 0; i < shi->nr_frags; ++i) {
4092*4882a593Smuzhiyun const skb_frag_t *f = &shi->frags[i];
4093*4882a593Smuzhiyun unsigned int offset = skb_frag_off(f);
4094*4882a593Smuzhiyun struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
4095*4882a593Smuzhiyun
4096*4882a593Smuzhiyun sg_set_page(&sg, page, skb_frag_size(f),
4097*4882a593Smuzhiyun offset_in_page(offset));
4098*4882a593Smuzhiyun ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
4099*4882a593Smuzhiyun if (crypto_ahash_update(req))
4100*4882a593Smuzhiyun return 1;
4101*4882a593Smuzhiyun }
4102*4882a593Smuzhiyun
4103*4882a593Smuzhiyun skb_walk_frags(skb, frag_iter)
4104*4882a593Smuzhiyun if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
4105*4882a593Smuzhiyun return 1;
4106*4882a593Smuzhiyun
4107*4882a593Smuzhiyun return 0;
4108*4882a593Smuzhiyun }
4109*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_md5_hash_skb_data);
4110*4882a593Smuzhiyun
4111*4882a593Smuzhiyun int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
4112*4882a593Smuzhiyun {
4113*4882a593Smuzhiyun u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
4114*4882a593Smuzhiyun struct scatterlist sg;
4115*4882a593Smuzhiyun
4116*4882a593Smuzhiyun sg_init_one(&sg, key->key, keylen);
4117*4882a593Smuzhiyun ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
4118*4882a593Smuzhiyun
4119*4882a593Smuzhiyun /* We use data_race() because tcp_md5_do_add() might change key->key under us */
4120*4882a593Smuzhiyun return data_race(crypto_ahash_update(hp->md5_req));
4121*4882a593Smuzhiyun }
4122*4882a593Smuzhiyun EXPORT_SYMBOL(tcp_md5_hash_key);
4123*4882a593Smuzhiyun
4124*4882a593Smuzhiyun #endif
4125*4882a593Smuzhiyun
4126*4882a593Smuzhiyun void tcp_done(struct sock *sk)
4127*4882a593Smuzhiyun {
4128*4882a593Smuzhiyun struct request_sock *req;
4129*4882a593Smuzhiyun
4130*4882a593Smuzhiyun /* We might be called with a new socket, after
4131*4882a593Smuzhiyun * inet_csk_prepare_forced_close() has been called,
4132*4882a593Smuzhiyun * so we cannot use lockdep_sock_is_held(sk).
4133*4882a593Smuzhiyun */
4134*4882a593Smuzhiyun req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
4135*4882a593Smuzhiyun
4136*4882a593Smuzhiyun if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
4137*4882a593Smuzhiyun TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
4138*4882a593Smuzhiyun
4139*4882a593Smuzhiyun tcp_set_state(sk, TCP_CLOSE);
4140*4882a593Smuzhiyun tcp_clear_xmit_timers(sk);
4141*4882a593Smuzhiyun if (req)
4142*4882a593Smuzhiyun reqsk_fastopen_remove(sk, req, false);
4143*4882a593Smuzhiyun
4144*4882a593Smuzhiyun sk->sk_shutdown = SHUTDOWN_MASK;
4145*4882a593Smuzhiyun
4146*4882a593Smuzhiyun if (!sock_flag(sk, SOCK_DEAD))
4147*4882a593Smuzhiyun sk->sk_state_change(sk);
4148*4882a593Smuzhiyun else
4149*4882a593Smuzhiyun inet_csk_destroy_sock(sk);
4150*4882a593Smuzhiyun }
4151*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(tcp_done);
4152*4882a593Smuzhiyun
4153*4882a593Smuzhiyun int tcp_abort(struct sock *sk, int err)
4154*4882a593Smuzhiyun {
4155*4882a593Smuzhiyun if (!sk_fullsock(sk)) {
4156*4882a593Smuzhiyun if (sk->sk_state == TCP_NEW_SYN_RECV) {
4157*4882a593Smuzhiyun struct request_sock *req = inet_reqsk(sk);
4158*4882a593Smuzhiyun
4159*4882a593Smuzhiyun local_bh_disable();
4160*4882a593Smuzhiyun inet_csk_reqsk_queue_drop(req->rsk_listener, req);
4161*4882a593Smuzhiyun local_bh_enable();
4162*4882a593Smuzhiyun return 0;
4163*4882a593Smuzhiyun }
4164*4882a593Smuzhiyun return -EOPNOTSUPP;
4165*4882a593Smuzhiyun }
4166*4882a593Smuzhiyun
4167*4882a593Smuzhiyun /* Don't race with userspace socket closes such as tcp_close. */
4168*4882a593Smuzhiyun lock_sock(sk);
4169*4882a593Smuzhiyun
4170*4882a593Smuzhiyun if (sk->sk_state == TCP_LISTEN) {
4171*4882a593Smuzhiyun tcp_set_state(sk, TCP_CLOSE);
4172*4882a593Smuzhiyun inet_csk_listen_stop(sk);
4173*4882a593Smuzhiyun }
4174*4882a593Smuzhiyun
4175*4882a593Smuzhiyun /* Don't race with BH socket closes such as inet_csk_listen_stop. */
4176*4882a593Smuzhiyun local_bh_disable();
4177*4882a593Smuzhiyun bh_lock_sock(sk);
4178*4882a593Smuzhiyun
4179*4882a593Smuzhiyun if (!sock_flag(sk, SOCK_DEAD)) {
4180*4882a593Smuzhiyun sk->sk_err = err;
4181*4882a593Smuzhiyun /* This barrier is coupled with smp_rmb() in tcp_poll() */
4182*4882a593Smuzhiyun smp_wmb();
4183*4882a593Smuzhiyun sk->sk_error_report(sk);
4184*4882a593Smuzhiyun if (tcp_need_reset(sk->sk_state))
4185*4882a593Smuzhiyun tcp_send_active_reset(sk, GFP_ATOMIC);
4186*4882a593Smuzhiyun tcp_done(sk);
4187*4882a593Smuzhiyun }
4188*4882a593Smuzhiyun
4189*4882a593Smuzhiyun bh_unlock_sock(sk);
4190*4882a593Smuzhiyun local_bh_enable();
4191*4882a593Smuzhiyun tcp_write_queue_purge(sk);
4192*4882a593Smuzhiyun release_sock(sk);
4193*4882a593Smuzhiyun return 0;
4194*4882a593Smuzhiyun }
4195*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(tcp_abort);
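/*
 * Editorial note (hedged): tcp_abort() is installed as tcp_prot.diag_destroy,
 * so the inet_diag SOCK_DESTROY command ends up here on kernels built with
 * CONFIG_INET_DIAG_DESTROY; an administrator can exercise that path with,
 * for example, "ss -K dst 192.0.2.1" (destination address is an example).
 */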
4196*4882a593Smuzhiyun
4197*4882a593Smuzhiyun extern struct tcp_congestion_ops tcp_reno;
4198*4882a593Smuzhiyun
4199*4882a593Smuzhiyun static __initdata unsigned long thash_entries;
4200*4882a593Smuzhiyun static int __init set_thash_entries(char *str)
4201*4882a593Smuzhiyun {
4202*4882a593Smuzhiyun ssize_t ret;
4203*4882a593Smuzhiyun
4204*4882a593Smuzhiyun if (!str)
4205*4882a593Smuzhiyun return 0;
4206*4882a593Smuzhiyun
4207*4882a593Smuzhiyun ret = kstrtoul(str, 0, &thash_entries);
4208*4882a593Smuzhiyun if (ret)
4209*4882a593Smuzhiyun return 0;
4210*4882a593Smuzhiyun
4211*4882a593Smuzhiyun return 1;
4212*4882a593Smuzhiyun }
4213*4882a593Smuzhiyun __setup("thash_entries=", set_thash_entries);
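/*
 * Illustrative boot-parameter usage (value is an example only): passing
 * "thash_entries=131072" on the kernel command line pre-sizes the established
 * hash table that alloc_large_system_hash() builds in tcp_init() below.
 */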
4214*4882a593Smuzhiyun
4215*4882a593Smuzhiyun static void __init tcp_init_mem(void)
4216*4882a593Smuzhiyun {
4217*4882a593Smuzhiyun unsigned long limit = nr_free_buffer_pages() / 16;
4218*4882a593Smuzhiyun
4219*4882a593Smuzhiyun limit = max(limit, 128UL);
4220*4882a593Smuzhiyun sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */
4221*4882a593Smuzhiyun sysctl_tcp_mem[1] = limit; /* 6.25 % */
4222*4882a593Smuzhiyun sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */
4223*4882a593Smuzhiyun }
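/*
 * Worked example (hypothetical numbers): with nr_free_buffer_pages()
 * returning 2,000,000 pages (~8 GB of 4 KiB pages), limit = 125,000, so
 * sysctl_tcp_mem becomes { 93750, 125000, 187500 } pages, i.e. the
 * 4.68% / 6.25% / 9.37% of memory noted in the comments above.
 */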
4224*4882a593Smuzhiyun
4225*4882a593Smuzhiyun void __init tcp_init(void)
4226*4882a593Smuzhiyun {
4227*4882a593Smuzhiyun int max_rshare, max_wshare, cnt;
4228*4882a593Smuzhiyun unsigned long limit;
4229*4882a593Smuzhiyun unsigned int i;
4230*4882a593Smuzhiyun
4231*4882a593Smuzhiyun BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
4232*4882a593Smuzhiyun BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
4233*4882a593Smuzhiyun sizeof_field(struct sk_buff, cb));
4234*4882a593Smuzhiyun
4235*4882a593Smuzhiyun percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
4236*4882a593Smuzhiyun percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
4237*4882a593Smuzhiyun inet_hashinfo_init(&tcp_hashinfo);
4238*4882a593Smuzhiyun inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
4239*4882a593Smuzhiyun thash_entries, 21, /* one slot per 2 MB */
4240*4882a593Smuzhiyun 0, 64 * 1024);
4241*4882a593Smuzhiyun tcp_hashinfo.bind_bucket_cachep =
4242*4882a593Smuzhiyun kmem_cache_create("tcp_bind_bucket",
4243*4882a593Smuzhiyun sizeof(struct inet_bind_bucket), 0,
4244*4882a593Smuzhiyun SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
4245*4882a593Smuzhiyun
4246*4882a593Smuzhiyun /* Size and allocate the main established and bind bucket
4247*4882a593Smuzhiyun * hash tables.
4248*4882a593Smuzhiyun *
4249*4882a593Smuzhiyun * The methodology is similar to that of the buffer cache.
4250*4882a593Smuzhiyun */
4251*4882a593Smuzhiyun tcp_hashinfo.ehash =
4252*4882a593Smuzhiyun alloc_large_system_hash("TCP established",
4253*4882a593Smuzhiyun sizeof(struct inet_ehash_bucket),
4254*4882a593Smuzhiyun thash_entries,
4255*4882a593Smuzhiyun 17, /* one slot per 128 KB of memory */
4256*4882a593Smuzhiyun 0,
4257*4882a593Smuzhiyun NULL,
4258*4882a593Smuzhiyun &tcp_hashinfo.ehash_mask,
4259*4882a593Smuzhiyun 0,
4260*4882a593Smuzhiyun thash_entries ? 0 : 512 * 1024);
4261*4882a593Smuzhiyun for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
4262*4882a593Smuzhiyun INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
4263*4882a593Smuzhiyun
4264*4882a593Smuzhiyun if (inet_ehash_locks_alloc(&tcp_hashinfo))
4265*4882a593Smuzhiyun panic("TCP: failed to alloc ehash_locks");
4266*4882a593Smuzhiyun tcp_hashinfo.bhash =
4267*4882a593Smuzhiyun alloc_large_system_hash("TCP bind",
4268*4882a593Smuzhiyun sizeof(struct inet_bind_hashbucket),
4269*4882a593Smuzhiyun tcp_hashinfo.ehash_mask + 1,
4270*4882a593Smuzhiyun 17, /* one slot per 128 KB of memory */
4271*4882a593Smuzhiyun 0,
4272*4882a593Smuzhiyun &tcp_hashinfo.bhash_size,
4273*4882a593Smuzhiyun NULL,
4274*4882a593Smuzhiyun 0,
4275*4882a593Smuzhiyun 64 * 1024);
4276*4882a593Smuzhiyun tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
4277*4882a593Smuzhiyun for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
4278*4882a593Smuzhiyun spin_lock_init(&tcp_hashinfo.bhash[i].lock);
4279*4882a593Smuzhiyun INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
4280*4882a593Smuzhiyun }
4281*4882a593Smuzhiyun
4282*4882a593Smuzhiyun
4283*4882a593Smuzhiyun cnt = tcp_hashinfo.ehash_mask + 1;
4284*4882a593Smuzhiyun sysctl_tcp_max_orphans = cnt / 2;
4285*4882a593Smuzhiyun
4286*4882a593Smuzhiyun tcp_init_mem();
4287*4882a593Smuzhiyun /* Set per-socket limits to no more than 1/128 the pressure threshold */
4288*4882a593Smuzhiyun limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
4289*4882a593Smuzhiyun max_wshare = min(4UL*1024*1024, limit);
4290*4882a593Smuzhiyun max_rshare = min(6UL*1024*1024, limit);
4291*4882a593Smuzhiyun
4292*4882a593Smuzhiyun init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
4293*4882a593Smuzhiyun init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
4294*4882a593Smuzhiyun init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
4295*4882a593Smuzhiyun
4296*4882a593Smuzhiyun init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
4297*4882a593Smuzhiyun init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
4298*4882a593Smuzhiyun init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
4299*4882a593Smuzhiyun
4300*4882a593Smuzhiyun pr_info("Hash tables configured (established %u bind %u)\n",
4301*4882a593Smuzhiyun tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
4302*4882a593Smuzhiyun
4303*4882a593Smuzhiyun tcp_v4_init();
4304*4882a593Smuzhiyun tcp_metrics_init();
4305*4882a593Smuzhiyun BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
4306*4882a593Smuzhiyun tcp_tasklet_init();
4307*4882a593Smuzhiyun mptcp_init();
4308*4882a593Smuzhiyun }
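/*
 * Editorial note (hedged): on a 4 KiB-page machine with enough memory that
 * the min()/max() clamps above do not bite, the per-netns defaults come out
 * as tcp_wmem = 4096 16384 4194304 and tcp_rmem = 4096 131072 6291456,
 * matching the values documented in ip-sysctl.rst.
 */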
4309