xref: /OK3568_Linux_fs/kernel/tools/testing/selftests/net/tcp_mmap.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-only
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Copyright 2018 Google Inc.
4*4882a593Smuzhiyun  * Author: Eric Dumazet (edumazet@google.com)
5*4882a593Smuzhiyun  *
6*4882a593Smuzhiyun  * Reference program demonstrating tcp mmap() usage,
7*4882a593Smuzhiyun  * and SO_RCVLOWAT hints for receiver.
8*4882a593Smuzhiyun  *
9*4882a593Smuzhiyun  * Note : NIC with header split is needed to use mmap() on TCP :
10*4882a593Smuzhiyun  * Each incoming frame must be a multiple of PAGE_SIZE bytes of TCP payload.
11*4882a593Smuzhiyun  *
12*4882a593Smuzhiyun  * How to use on loopback interface :
13*4882a593Smuzhiyun  *
14*4882a593Smuzhiyun  *  ifconfig lo mtu 61512  # 15*4096 + 40 (ipv6 header) + 32 (TCP with TS option header)
15*4882a593Smuzhiyun  *  tcp_mmap -s -z &
16*4882a593Smuzhiyun  *  tcp_mmap -H ::1 -z
17*4882a593Smuzhiyun  *
18*4882a593Smuzhiyun  *  Or leave default lo mtu, but use -M option to set TCP_MAXSEG option to (4096 + 12)
19*4882a593Smuzhiyun  *      (4096 : page size on x86, 12: TCP TS option length)
20*4882a593Smuzhiyun  *  tcp_mmap -s -z -M $((4096+12)) &
21*4882a593Smuzhiyun  *  tcp_mmap -H ::1 -z -M $((4096+12))
22*4882a593Smuzhiyun  *
23*4882a593Smuzhiyun  * Note: -z option on sender uses MSG_ZEROCOPY, which forces a copy when packets go through loopback interface.
24*4882a593Smuzhiyun  *       We might use sendfile() instead, but really this test program is about mmap(), for receivers ;)
25*4882a593Smuzhiyun  *
26*4882a593Smuzhiyun  * $ ./tcp_mmap -s &                                 # Without mmap()
27*4882a593Smuzhiyun  * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done
28*4882a593Smuzhiyun  * received 32768 MB (0 % mmap'ed) in 14.1157 s, 19.4732 Gbit
29*4882a593Smuzhiyun  *   cpu usage user:0.057 sys:7.815, 240.234 usec per MB, 65531 c-switches
30*4882a593Smuzhiyun  * received 32768 MB (0 % mmap'ed) in 14.6833 s, 18.7204 Gbit
31*4882a593Smuzhiyun  *  cpu usage user:0.043 sys:8.103, 248.596 usec per MB, 65524 c-switches
32*4882a593Smuzhiyun  * received 32768 MB (0 % mmap'ed) in 11.143 s, 24.6682 Gbit
33*4882a593Smuzhiyun  *   cpu usage user:0.044 sys:6.576, 202.026 usec per MB, 65519 c-switches
34*4882a593Smuzhiyun  * received 32768 MB (0 % mmap'ed) in 14.9056 s, 18.4413 Gbit
35*4882a593Smuzhiyun  *   cpu usage user:0.036 sys:8.193, 251.129 usec per MB, 65530 c-switches
36*4882a593Smuzhiyun  * $ kill %1   # kill tcp_mmap server
37*4882a593Smuzhiyun  *
38*4882a593Smuzhiyun  * $ ./tcp_mmap -s -z &                              # With mmap()
39*4882a593Smuzhiyun  * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done
40*4882a593Smuzhiyun  * received 32768 MB (99.9939 % mmap'ed) in 6.73792 s, 40.7956 Gbit
41*4882a593Smuzhiyun  *   cpu usage user:0.045 sys:2.827, 87.6465 usec per MB, 65532 c-switches
42*4882a593Smuzhiyun  * received 32768 MB (99.9939 % mmap'ed) in 7.26732 s, 37.8238 Gbit
43*4882a593Smuzhiyun  *   cpu usage user:0.037 sys:3.087, 95.3369 usec per MB, 65532 c-switches
44*4882a593Smuzhiyun  * received 32768 MB (99.9939 % mmap'ed) in 7.61661 s, 36.0893 Gbit
45*4882a593Smuzhiyun  *   cpu usage user:0.046 sys:3.559, 110.016 usec per MB, 65529 c-switches
46*4882a593Smuzhiyun  * received 32768 MB (99.9939 % mmap'ed) in 7.43764 s, 36.9577 Gbit
47*4882a593Smuzhiyun  *   cpu usage user:0.035 sys:3.467, 106.873 usec per MB, 65530 c-switches
48*4882a593Smuzhiyun  */
49*4882a593Smuzhiyun #define _GNU_SOURCE
50*4882a593Smuzhiyun #include <pthread.h>
51*4882a593Smuzhiyun #include <sys/types.h>
52*4882a593Smuzhiyun #include <fcntl.h>
53*4882a593Smuzhiyun #include <error.h>
54*4882a593Smuzhiyun #include <sys/socket.h>
55*4882a593Smuzhiyun #include <sys/mman.h>
56*4882a593Smuzhiyun #include <sys/resource.h>
57*4882a593Smuzhiyun #include <unistd.h>
58*4882a593Smuzhiyun #include <string.h>
59*4882a593Smuzhiyun #include <stdlib.h>
60*4882a593Smuzhiyun #include <stdio.h>
61*4882a593Smuzhiyun #include <errno.h>
62*4882a593Smuzhiyun #include <time.h>
63*4882a593Smuzhiyun #include <sys/time.h>
64*4882a593Smuzhiyun #include <netinet/in.h>
65*4882a593Smuzhiyun #include <arpa/inet.h>
66*4882a593Smuzhiyun #include <poll.h>
67*4882a593Smuzhiyun #include <linux/tcp.h>
68*4882a593Smuzhiyun #include <assert.h>
69*4882a593Smuzhiyun 
/* Fallback for libc headers that do not define MSG_ZEROCOPY yet. */
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY    0x4000000
#endif

/* Number of bytes the client pushes to the server before closing (1 << 35). */
#define FILE_SZ (1ULL << 35)
static int cfg_family = AF_INET6;	/* -4 / -6 options */
static socklen_t cfg_alen = sizeof(struct sockaddr_in6);
static int cfg_port = 8787;		/* -p option */

static int rcvbuf; /* Default: autotuning.  Can be set with -r <integer> option */
static int sndbuf; /* Default: autotuning.  Can be set with -w <integer> option */
static int zflg; /* zero copy option (-z): MSG_ZEROCOPY for sender, mmap() for receiver */
static int xflg; /* hash received data (simple xor) (-x option) */
static int keepflag; /* -k option: receiver shall keep all received file in memory (no munmap() calls) */

/* Size of one send()/receive unit, also used for SO_RCVLOWAT (-C option). */
static size_t chunk_size  = 512*1024;

/* Alignment of the receive mapping (-a option); main() picks a default when 0. */
static size_t map_align;

/* Running xor of all hashed data, updated by hash_zone(). */
unsigned long htotal;
90*4882a593Smuzhiyun 
/*
 * Hint the CPU to bring the cache line holding @x into cache before it is read.
 *
 * __builtin_prefetch(x, 0, 3) requests a read prefetch with maximum temporal
 * locality, which is the prefetcht0 hint on x86_64. Unlike a raw `asm`
 * statement it also builds in strict ISO C modes and on other architectures,
 * where the compiler emits the local equivalent or nothing at all.
 */
static inline void prefetch(const void *x)
{
	__builtin_prefetch(x, 0, 3);
}
97*4882a593Smuzhiyun 
hash_zone(void * zone,unsigned int length)98*4882a593Smuzhiyun void hash_zone(void *zone, unsigned int length)
99*4882a593Smuzhiyun {
100*4882a593Smuzhiyun 	unsigned long temp = htotal;
101*4882a593Smuzhiyun 
102*4882a593Smuzhiyun 	while (length >= 8*sizeof(long)) {
103*4882a593Smuzhiyun 		prefetch(zone + 384);
104*4882a593Smuzhiyun 		temp ^= *(unsigned long *)zone;
105*4882a593Smuzhiyun 		temp ^= *(unsigned long *)(zone + sizeof(long));
106*4882a593Smuzhiyun 		temp ^= *(unsigned long *)(zone + 2*sizeof(long));
107*4882a593Smuzhiyun 		temp ^= *(unsigned long *)(zone + 3*sizeof(long));
108*4882a593Smuzhiyun 		temp ^= *(unsigned long *)(zone + 4*sizeof(long));
109*4882a593Smuzhiyun 		temp ^= *(unsigned long *)(zone + 5*sizeof(long));
110*4882a593Smuzhiyun 		temp ^= *(unsigned long *)(zone + 6*sizeof(long));
111*4882a593Smuzhiyun 		temp ^= *(unsigned long *)(zone + 7*sizeof(long));
112*4882a593Smuzhiyun 		zone += 8*sizeof(long);
113*4882a593Smuzhiyun 		length -= 8*sizeof(long);
114*4882a593Smuzhiyun 	}
115*4882a593Smuzhiyun 	while (length >= 1) {
116*4882a593Smuzhiyun 		temp ^= *(unsigned char *)zone;
117*4882a593Smuzhiyun 		zone += 1;
118*4882a593Smuzhiyun 		length--;
119*4882a593Smuzhiyun 	}
120*4882a593Smuzhiyun 	htotal = temp;
121*4882a593Smuzhiyun }
122*4882a593Smuzhiyun 
/* Round x up to the next multiple of align_to; align_to must be a power of two. */
#define ALIGN_UP(x, align_to)	(((x) + ((align_to)-1)) & ~((align_to)-1))
/*
 * Pointer flavour of ALIGN_UP(): the result has the same type as p.
 * __typeof__ is used because plain typeof is not a keyword in strict ISO modes.
 */
#define ALIGN_PTR_UP(p, ptr_align_to)	((__typeof__(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
125*4882a593Smuzhiyun 
126*4882a593Smuzhiyun 
mmap_large_buffer(size_t need,size_t * allocated)127*4882a593Smuzhiyun static void *mmap_large_buffer(size_t need, size_t *allocated)
128*4882a593Smuzhiyun {
129*4882a593Smuzhiyun 	void *buffer;
130*4882a593Smuzhiyun 	size_t sz;
131*4882a593Smuzhiyun 
132*4882a593Smuzhiyun 	/* Attempt to use huge pages if possible. */
133*4882a593Smuzhiyun 	sz = ALIGN_UP(need, map_align);
134*4882a593Smuzhiyun 	buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE,
135*4882a593Smuzhiyun 		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
136*4882a593Smuzhiyun 
137*4882a593Smuzhiyun 	if (buffer == (void *)-1) {
138*4882a593Smuzhiyun 		sz = need;
139*4882a593Smuzhiyun 		buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE,
140*4882a593Smuzhiyun 			      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
141*4882a593Smuzhiyun 		if (buffer != (void *)-1)
142*4882a593Smuzhiyun 			fprintf(stderr, "MAP_HUGETLB attempt failed, look at /sys/kernel/mm/hugepages for optimal performance\n");
143*4882a593Smuzhiyun 	}
144*4882a593Smuzhiyun 	*allocated = sz;
145*4882a593Smuzhiyun 	return buffer;
146*4882a593Smuzhiyun }
147*4882a593Smuzhiyun 
child_thread(void * arg)148*4882a593Smuzhiyun void *child_thread(void *arg)
149*4882a593Smuzhiyun {
150*4882a593Smuzhiyun 	unsigned long total_mmap = 0, total = 0;
151*4882a593Smuzhiyun 	struct tcp_zerocopy_receive zc;
152*4882a593Smuzhiyun 	unsigned long delta_usec;
153*4882a593Smuzhiyun 	int flags = MAP_SHARED;
154*4882a593Smuzhiyun 	struct timeval t0, t1;
155*4882a593Smuzhiyun 	char *buffer = NULL;
156*4882a593Smuzhiyun 	void *raddr = NULL;
157*4882a593Smuzhiyun 	void *addr = NULL;
158*4882a593Smuzhiyun 	double throughput;
159*4882a593Smuzhiyun 	struct rusage ru;
160*4882a593Smuzhiyun 	size_t buffer_sz;
161*4882a593Smuzhiyun 	int lu, fd;
162*4882a593Smuzhiyun 
163*4882a593Smuzhiyun 	fd = (int)(unsigned long)arg;
164*4882a593Smuzhiyun 
165*4882a593Smuzhiyun 	gettimeofday(&t0, NULL);
166*4882a593Smuzhiyun 
167*4882a593Smuzhiyun 	fcntl(fd, F_SETFL, O_NDELAY);
168*4882a593Smuzhiyun 	buffer = mmap_large_buffer(chunk_size, &buffer_sz);
169*4882a593Smuzhiyun 	if (buffer == (void *)-1) {
170*4882a593Smuzhiyun 		perror("mmap");
171*4882a593Smuzhiyun 		goto error;
172*4882a593Smuzhiyun 	}
173*4882a593Smuzhiyun 	if (zflg) {
174*4882a593Smuzhiyun 		raddr = mmap(NULL, chunk_size + map_align, PROT_READ, flags, fd, 0);
175*4882a593Smuzhiyun 		if (raddr == (void *)-1) {
176*4882a593Smuzhiyun 			perror("mmap");
177*4882a593Smuzhiyun 			zflg = 0;
178*4882a593Smuzhiyun 		} else {
179*4882a593Smuzhiyun 			addr = ALIGN_PTR_UP(raddr, map_align);
180*4882a593Smuzhiyun 		}
181*4882a593Smuzhiyun 	}
182*4882a593Smuzhiyun 	while (1) {
183*4882a593Smuzhiyun 		struct pollfd pfd = { .fd = fd, .events = POLLIN, };
184*4882a593Smuzhiyun 		int sub;
185*4882a593Smuzhiyun 
186*4882a593Smuzhiyun 		poll(&pfd, 1, 10000);
187*4882a593Smuzhiyun 		if (zflg) {
188*4882a593Smuzhiyun 			socklen_t zc_len = sizeof(zc);
189*4882a593Smuzhiyun 			int res;
190*4882a593Smuzhiyun 
191*4882a593Smuzhiyun 			memset(&zc, 0, sizeof(zc));
192*4882a593Smuzhiyun 			zc.address = (__u64)((unsigned long)addr);
193*4882a593Smuzhiyun 			zc.length = chunk_size;
194*4882a593Smuzhiyun 
195*4882a593Smuzhiyun 			res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
196*4882a593Smuzhiyun 					 &zc, &zc_len);
197*4882a593Smuzhiyun 			if (res == -1)
198*4882a593Smuzhiyun 				break;
199*4882a593Smuzhiyun 
200*4882a593Smuzhiyun 			if (zc.length) {
201*4882a593Smuzhiyun 				assert(zc.length <= chunk_size);
202*4882a593Smuzhiyun 				total_mmap += zc.length;
203*4882a593Smuzhiyun 				if (xflg)
204*4882a593Smuzhiyun 					hash_zone(addr, zc.length);
205*4882a593Smuzhiyun 				/* It is more efficient to unmap the pages right now,
206*4882a593Smuzhiyun 				 * instead of doing this in next TCP_ZEROCOPY_RECEIVE.
207*4882a593Smuzhiyun 				 */
208*4882a593Smuzhiyun 				madvise(addr, zc.length, MADV_DONTNEED);
209*4882a593Smuzhiyun 				total += zc.length;
210*4882a593Smuzhiyun 			}
211*4882a593Smuzhiyun 			if (zc.recv_skip_hint) {
212*4882a593Smuzhiyun 				assert(zc.recv_skip_hint <= chunk_size);
213*4882a593Smuzhiyun 				lu = read(fd, buffer, zc.recv_skip_hint);
214*4882a593Smuzhiyun 				if (lu > 0) {
215*4882a593Smuzhiyun 					if (xflg)
216*4882a593Smuzhiyun 						hash_zone(buffer, lu);
217*4882a593Smuzhiyun 					total += lu;
218*4882a593Smuzhiyun 				}
219*4882a593Smuzhiyun 			}
220*4882a593Smuzhiyun 			continue;
221*4882a593Smuzhiyun 		}
222*4882a593Smuzhiyun 		sub = 0;
223*4882a593Smuzhiyun 		while (sub < chunk_size) {
224*4882a593Smuzhiyun 			lu = read(fd, buffer + sub, chunk_size - sub);
225*4882a593Smuzhiyun 			if (lu == 0)
226*4882a593Smuzhiyun 				goto end;
227*4882a593Smuzhiyun 			if (lu < 0)
228*4882a593Smuzhiyun 				break;
229*4882a593Smuzhiyun 			if (xflg)
230*4882a593Smuzhiyun 				hash_zone(buffer + sub, lu);
231*4882a593Smuzhiyun 			total += lu;
232*4882a593Smuzhiyun 			sub += lu;
233*4882a593Smuzhiyun 		}
234*4882a593Smuzhiyun 	}
235*4882a593Smuzhiyun end:
236*4882a593Smuzhiyun 	gettimeofday(&t1, NULL);
237*4882a593Smuzhiyun 	delta_usec = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
238*4882a593Smuzhiyun 
239*4882a593Smuzhiyun 	throughput = 0;
240*4882a593Smuzhiyun 	if (delta_usec)
241*4882a593Smuzhiyun 		throughput = total * 8.0 / (double)delta_usec / 1000.0;
242*4882a593Smuzhiyun 	getrusage(RUSAGE_THREAD, &ru);
243*4882a593Smuzhiyun 	if (total > 1024*1024) {
244*4882a593Smuzhiyun 		unsigned long total_usec;
245*4882a593Smuzhiyun 		unsigned long mb = total >> 20;
246*4882a593Smuzhiyun 		total_usec = 1000000*ru.ru_utime.tv_sec + ru.ru_utime.tv_usec +
247*4882a593Smuzhiyun 			     1000000*ru.ru_stime.tv_sec + ru.ru_stime.tv_usec;
248*4882a593Smuzhiyun 		printf("received %lg MB (%lg %% mmap'ed) in %lg s, %lg Gbit\n"
249*4882a593Smuzhiyun 		       "  cpu usage user:%lg sys:%lg, %lg usec per MB, %lu c-switches\n",
250*4882a593Smuzhiyun 				total / (1024.0 * 1024.0),
251*4882a593Smuzhiyun 				100.0*total_mmap/total,
252*4882a593Smuzhiyun 				(double)delta_usec / 1000000.0,
253*4882a593Smuzhiyun 				throughput,
254*4882a593Smuzhiyun 				(double)ru.ru_utime.tv_sec + (double)ru.ru_utime.tv_usec / 1000000.0,
255*4882a593Smuzhiyun 				(double)ru.ru_stime.tv_sec + (double)ru.ru_stime.tv_usec / 1000000.0,
256*4882a593Smuzhiyun 				(double)total_usec/mb,
257*4882a593Smuzhiyun 				ru.ru_nvcsw);
258*4882a593Smuzhiyun 	}
259*4882a593Smuzhiyun error:
260*4882a593Smuzhiyun 	munmap(buffer, buffer_sz);
261*4882a593Smuzhiyun 	close(fd);
262*4882a593Smuzhiyun 	if (zflg)
263*4882a593Smuzhiyun 		munmap(raddr, chunk_size + map_align);
264*4882a593Smuzhiyun 	pthread_exit(0);
265*4882a593Smuzhiyun }
266*4882a593Smuzhiyun 
apply_rcvsnd_buf(int fd)267*4882a593Smuzhiyun static void apply_rcvsnd_buf(int fd)
268*4882a593Smuzhiyun {
269*4882a593Smuzhiyun 	if (rcvbuf && setsockopt(fd, SOL_SOCKET,
270*4882a593Smuzhiyun 				 SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) == -1) {
271*4882a593Smuzhiyun 		perror("setsockopt SO_RCVBUF");
272*4882a593Smuzhiyun 	}
273*4882a593Smuzhiyun 
274*4882a593Smuzhiyun 	if (sndbuf && setsockopt(fd, SOL_SOCKET,
275*4882a593Smuzhiyun 				 SO_SNDBUF, &sndbuf, sizeof(sndbuf)) == -1) {
276*4882a593Smuzhiyun 		perror("setsockopt SO_SNDBUF");
277*4882a593Smuzhiyun 	}
278*4882a593Smuzhiyun }
279*4882a593Smuzhiyun 
280*4882a593Smuzhiyun 
setup_sockaddr(int domain,const char * str_addr,struct sockaddr_storage * sockaddr)281*4882a593Smuzhiyun static void setup_sockaddr(int domain, const char *str_addr,
282*4882a593Smuzhiyun 			   struct sockaddr_storage *sockaddr)
283*4882a593Smuzhiyun {
284*4882a593Smuzhiyun 	struct sockaddr_in6 *addr6 = (void *) sockaddr;
285*4882a593Smuzhiyun 	struct sockaddr_in *addr4 = (void *) sockaddr;
286*4882a593Smuzhiyun 
287*4882a593Smuzhiyun 	switch (domain) {
288*4882a593Smuzhiyun 	case PF_INET:
289*4882a593Smuzhiyun 		memset(addr4, 0, sizeof(*addr4));
290*4882a593Smuzhiyun 		addr4->sin_family = AF_INET;
291*4882a593Smuzhiyun 		addr4->sin_port = htons(cfg_port);
292*4882a593Smuzhiyun 		if (str_addr &&
293*4882a593Smuzhiyun 		    inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
294*4882a593Smuzhiyun 			error(1, 0, "ipv4 parse error: %s", str_addr);
295*4882a593Smuzhiyun 		break;
296*4882a593Smuzhiyun 	case PF_INET6:
297*4882a593Smuzhiyun 		memset(addr6, 0, sizeof(*addr6));
298*4882a593Smuzhiyun 		addr6->sin6_family = AF_INET6;
299*4882a593Smuzhiyun 		addr6->sin6_port = htons(cfg_port);
300*4882a593Smuzhiyun 		if (str_addr &&
301*4882a593Smuzhiyun 		    inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
302*4882a593Smuzhiyun 			error(1, 0, "ipv6 parse error: %s", str_addr);
303*4882a593Smuzhiyun 		break;
304*4882a593Smuzhiyun 	default:
305*4882a593Smuzhiyun 		error(1, 0, "illegal domain");
306*4882a593Smuzhiyun 	}
307*4882a593Smuzhiyun }
308*4882a593Smuzhiyun 
do_accept(int fdlisten)309*4882a593Smuzhiyun static void do_accept(int fdlisten)
310*4882a593Smuzhiyun {
311*4882a593Smuzhiyun 	pthread_attr_t attr;
312*4882a593Smuzhiyun 	int rcvlowat;
313*4882a593Smuzhiyun 
314*4882a593Smuzhiyun 	pthread_attr_init(&attr);
315*4882a593Smuzhiyun 	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
316*4882a593Smuzhiyun 
317*4882a593Smuzhiyun 	rcvlowat = chunk_size;
318*4882a593Smuzhiyun 	if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT,
319*4882a593Smuzhiyun 		       &rcvlowat, sizeof(rcvlowat)) == -1) {
320*4882a593Smuzhiyun 		perror("setsockopt SO_RCVLOWAT");
321*4882a593Smuzhiyun 	}
322*4882a593Smuzhiyun 
323*4882a593Smuzhiyun 	apply_rcvsnd_buf(fdlisten);
324*4882a593Smuzhiyun 
325*4882a593Smuzhiyun 	while (1) {
326*4882a593Smuzhiyun 		struct sockaddr_in addr;
327*4882a593Smuzhiyun 		socklen_t addrlen = sizeof(addr);
328*4882a593Smuzhiyun 		pthread_t th;
329*4882a593Smuzhiyun 		int fd, res;
330*4882a593Smuzhiyun 
331*4882a593Smuzhiyun 		fd = accept(fdlisten, (struct sockaddr *)&addr, &addrlen);
332*4882a593Smuzhiyun 		if (fd == -1) {
333*4882a593Smuzhiyun 			perror("accept");
334*4882a593Smuzhiyun 			continue;
335*4882a593Smuzhiyun 		}
336*4882a593Smuzhiyun 		res = pthread_create(&th, &attr, child_thread,
337*4882a593Smuzhiyun 				     (void *)(unsigned long)fd);
338*4882a593Smuzhiyun 		if (res) {
339*4882a593Smuzhiyun 			errno = res;
340*4882a593Smuzhiyun 			perror("pthread_create");
341*4882a593Smuzhiyun 			close(fd);
342*4882a593Smuzhiyun 		}
343*4882a593Smuzhiyun 	}
344*4882a593Smuzhiyun }
345*4882a593Smuzhiyun 
/* Each thread should reserve a big enough vma to avoid
 * spinlock collisions in ptl locks.
 * This size is 2MB on x86_64, and is exported in /proc/meminfo.
 *
 * Returns the "Hugepagesize:" value of /proc/meminfo converted from kB to
 * bytes, or 0 when the file cannot be opened or has no such line.
 */
static unsigned long default_huge_page_size(void)
{
	unsigned long size_bytes = 0;
	char *text = NULL;
	size_t cap = 0;
	FILE *meminfo;

	meminfo = fopen("/proc/meminfo", "r");
	if (meminfo == NULL)
		return 0;

	for (;;) {
		if (getline(&text, &cap, meminfo) <= 0)
			break;
		if (sscanf(text, "Hugepagesize:       %lu kB", &size_bytes) != 1)
			continue;
		/* The value is reported in kB. */
		size_bytes <<= 10;
		break;
	}

	free(text);
	fclose(meminfo);
	return size_bytes;
}
369*4882a593Smuzhiyun 
main(int argc,char * argv[])370*4882a593Smuzhiyun int main(int argc, char *argv[])
371*4882a593Smuzhiyun {
372*4882a593Smuzhiyun 	struct sockaddr_storage listenaddr, addr;
373*4882a593Smuzhiyun 	unsigned int max_pacing_rate = 0;
374*4882a593Smuzhiyun 	uint64_t total = 0;
375*4882a593Smuzhiyun 	char *host = NULL;
376*4882a593Smuzhiyun 	int fd, c, on = 1;
377*4882a593Smuzhiyun 	size_t buffer_sz;
378*4882a593Smuzhiyun 	char *buffer;
379*4882a593Smuzhiyun 	int sflg = 0;
380*4882a593Smuzhiyun 	int mss = 0;
381*4882a593Smuzhiyun 
382*4882a593Smuzhiyun 	while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:C:a:")) != -1) {
383*4882a593Smuzhiyun 		switch (c) {
384*4882a593Smuzhiyun 		case '4':
385*4882a593Smuzhiyun 			cfg_family = PF_INET;
386*4882a593Smuzhiyun 			cfg_alen = sizeof(struct sockaddr_in);
387*4882a593Smuzhiyun 			break;
388*4882a593Smuzhiyun 		case '6':
389*4882a593Smuzhiyun 			cfg_family = PF_INET6;
390*4882a593Smuzhiyun 			cfg_alen = sizeof(struct sockaddr_in6);
391*4882a593Smuzhiyun 			break;
392*4882a593Smuzhiyun 		case 'p':
393*4882a593Smuzhiyun 			cfg_port = atoi(optarg);
394*4882a593Smuzhiyun 			break;
395*4882a593Smuzhiyun 		case 'H':
396*4882a593Smuzhiyun 			host = optarg;
397*4882a593Smuzhiyun 			break;
398*4882a593Smuzhiyun 		case 's': /* server : listen for incoming connections */
399*4882a593Smuzhiyun 			sflg++;
400*4882a593Smuzhiyun 			break;
401*4882a593Smuzhiyun 		case 'r':
402*4882a593Smuzhiyun 			rcvbuf = atoi(optarg);
403*4882a593Smuzhiyun 			break;
404*4882a593Smuzhiyun 		case 'w':
405*4882a593Smuzhiyun 			sndbuf = atoi(optarg);
406*4882a593Smuzhiyun 			break;
407*4882a593Smuzhiyun 		case 'z':
408*4882a593Smuzhiyun 			zflg = 1;
409*4882a593Smuzhiyun 			break;
410*4882a593Smuzhiyun 		case 'M':
411*4882a593Smuzhiyun 			mss = atoi(optarg);
412*4882a593Smuzhiyun 			break;
413*4882a593Smuzhiyun 		case 'x':
414*4882a593Smuzhiyun 			xflg = 1;
415*4882a593Smuzhiyun 			break;
416*4882a593Smuzhiyun 		case 'k':
417*4882a593Smuzhiyun 			keepflag = 1;
418*4882a593Smuzhiyun 			break;
419*4882a593Smuzhiyun 		case 'P':
420*4882a593Smuzhiyun 			max_pacing_rate = atoi(optarg) ;
421*4882a593Smuzhiyun 			break;
422*4882a593Smuzhiyun 		case 'C':
423*4882a593Smuzhiyun 			chunk_size = atol(optarg);
424*4882a593Smuzhiyun 			break;
425*4882a593Smuzhiyun 		case 'a':
426*4882a593Smuzhiyun 			map_align = atol(optarg);
427*4882a593Smuzhiyun 			break;
428*4882a593Smuzhiyun 		default:
429*4882a593Smuzhiyun 			exit(1);
430*4882a593Smuzhiyun 		}
431*4882a593Smuzhiyun 	}
432*4882a593Smuzhiyun 	if (!map_align) {
433*4882a593Smuzhiyun 		map_align = default_huge_page_size();
434*4882a593Smuzhiyun 		/* if really /proc/meminfo is not helping,
435*4882a593Smuzhiyun 		 * we use the default x86_64 hugepagesize.
436*4882a593Smuzhiyun 		 */
437*4882a593Smuzhiyun 		if (!map_align)
438*4882a593Smuzhiyun 			map_align = 2*1024*1024;
439*4882a593Smuzhiyun 	}
440*4882a593Smuzhiyun 	if (sflg) {
441*4882a593Smuzhiyun 		int fdlisten = socket(cfg_family, SOCK_STREAM, 0);
442*4882a593Smuzhiyun 
443*4882a593Smuzhiyun 		if (fdlisten == -1) {
444*4882a593Smuzhiyun 			perror("socket");
445*4882a593Smuzhiyun 			exit(1);
446*4882a593Smuzhiyun 		}
447*4882a593Smuzhiyun 		apply_rcvsnd_buf(fdlisten);
448*4882a593Smuzhiyun 		setsockopt(fdlisten, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
449*4882a593Smuzhiyun 
450*4882a593Smuzhiyun 		setup_sockaddr(cfg_family, host, &listenaddr);
451*4882a593Smuzhiyun 
452*4882a593Smuzhiyun 		if (mss &&
453*4882a593Smuzhiyun 		    setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG,
454*4882a593Smuzhiyun 			       &mss, sizeof(mss)) == -1) {
455*4882a593Smuzhiyun 			perror("setsockopt TCP_MAXSEG");
456*4882a593Smuzhiyun 			exit(1);
457*4882a593Smuzhiyun 		}
458*4882a593Smuzhiyun 		if (bind(fdlisten, (const struct sockaddr *)&listenaddr, cfg_alen) == -1) {
459*4882a593Smuzhiyun 			perror("bind");
460*4882a593Smuzhiyun 			exit(1);
461*4882a593Smuzhiyun 		}
462*4882a593Smuzhiyun 		if (listen(fdlisten, 128) == -1) {
463*4882a593Smuzhiyun 			perror("listen");
464*4882a593Smuzhiyun 			exit(1);
465*4882a593Smuzhiyun 		}
466*4882a593Smuzhiyun 		do_accept(fdlisten);
467*4882a593Smuzhiyun 	}
468*4882a593Smuzhiyun 
469*4882a593Smuzhiyun 	buffer = mmap_large_buffer(chunk_size, &buffer_sz);
470*4882a593Smuzhiyun 	if (buffer == (char *)-1) {
471*4882a593Smuzhiyun 		perror("mmap");
472*4882a593Smuzhiyun 		exit(1);
473*4882a593Smuzhiyun 	}
474*4882a593Smuzhiyun 
475*4882a593Smuzhiyun 	fd = socket(cfg_family, SOCK_STREAM, 0);
476*4882a593Smuzhiyun 	if (fd == -1) {
477*4882a593Smuzhiyun 		perror("socket");
478*4882a593Smuzhiyun 		exit(1);
479*4882a593Smuzhiyun 	}
480*4882a593Smuzhiyun 	apply_rcvsnd_buf(fd);
481*4882a593Smuzhiyun 
482*4882a593Smuzhiyun 	setup_sockaddr(cfg_family, host, &addr);
483*4882a593Smuzhiyun 
484*4882a593Smuzhiyun 	if (mss &&
485*4882a593Smuzhiyun 	    setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
486*4882a593Smuzhiyun 		perror("setsockopt TCP_MAXSEG");
487*4882a593Smuzhiyun 		exit(1);
488*4882a593Smuzhiyun 	}
489*4882a593Smuzhiyun 	if (connect(fd, (const struct sockaddr *)&addr, cfg_alen) == -1) {
490*4882a593Smuzhiyun 		perror("connect");
491*4882a593Smuzhiyun 		exit(1);
492*4882a593Smuzhiyun 	}
493*4882a593Smuzhiyun 	if (max_pacing_rate &&
494*4882a593Smuzhiyun 	    setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
495*4882a593Smuzhiyun 		       &max_pacing_rate, sizeof(max_pacing_rate)) == -1)
496*4882a593Smuzhiyun 		perror("setsockopt SO_MAX_PACING_RATE");
497*4882a593Smuzhiyun 
498*4882a593Smuzhiyun 	if (zflg && setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY,
499*4882a593Smuzhiyun 			       &on, sizeof(on)) == -1) {
500*4882a593Smuzhiyun 		perror("setsockopt SO_ZEROCOPY, (-z option disabled)");
501*4882a593Smuzhiyun 		zflg = 0;
502*4882a593Smuzhiyun 	}
503*4882a593Smuzhiyun 	while (total < FILE_SZ) {
504*4882a593Smuzhiyun 		int64_t wr = FILE_SZ - total;
505*4882a593Smuzhiyun 
506*4882a593Smuzhiyun 		if (wr > chunk_size)
507*4882a593Smuzhiyun 			wr = chunk_size;
508*4882a593Smuzhiyun 		/* Note : we just want to fill the pipe with 0 bytes */
509*4882a593Smuzhiyun 		wr = send(fd, buffer, (size_t)wr, zflg ? MSG_ZEROCOPY : 0);
510*4882a593Smuzhiyun 		if (wr <= 0)
511*4882a593Smuzhiyun 			break;
512*4882a593Smuzhiyun 		total += wr;
513*4882a593Smuzhiyun 	}
514*4882a593Smuzhiyun 	close(fd);
515*4882a593Smuzhiyun 	munmap(buffer, buffer_sz);
516*4882a593Smuzhiyun 	return 0;
517*4882a593Smuzhiyun }
518