fix broken dependency
[openadk.git] target/linux/patches/3.14.45/mptcp.patch
blob af2dc7837e597a9ad69e1304a1cb3b4729938e03
1 diff -Nur linux-3.14.45.orig/drivers/infiniband/hw/cxgb4/cm.c linux-3.14.45/drivers/infiniband/hw/cxgb4/cm.c
2 --- linux-3.14.45.orig/drivers/infiniband/hw/cxgb4/cm.c 2015-06-23 02:01:36.000000000 +0200
3 +++ linux-3.14.45/drivers/infiniband/hw/cxgb4/cm.c 2015-06-24 14:15:48.871862463 +0200
4 @@ -3162,7 +3162,7 @@
5 */
6 memset(&tmp_opt, 0, sizeof(tmp_opt));
7 tcp_clear_options(&tmp_opt);
8 - tcp_parse_options(skb, &tmp_opt, 0, NULL);
9 + tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL);
11 req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
12 memset(req, 0, sizeof(*req));
13 diff -Nur linux-3.14.45.orig/include/linux/ipv6.h linux-3.14.45/include/linux/ipv6.h
14 --- linux-3.14.45.orig/include/linux/ipv6.h 2015-06-23 02:01:36.000000000 +0200
15 +++ linux-3.14.45/include/linux/ipv6.h 2015-06-24 14:15:48.871862463 +0200
16 @@ -309,12 +309,6 @@
17 return NULL;
20 -static inline struct inet6_request_sock *
21 - inet6_rsk(const struct request_sock *rsk)
23 - return NULL;
26 static inline struct raw6_sock *raw6_sk(const struct sock *sk)
28 return NULL;
29 diff -Nur linux-3.14.45.orig/include/linux/tcp.h linux-3.14.45/include/linux/tcp.h
30 --- linux-3.14.45.orig/include/linux/tcp.h 2015-06-23 02:01:36.000000000 +0200
31 +++ linux-3.14.45/include/linux/tcp.h 2015-06-24 14:15:48.871862463 +0200
32 @@ -72,6 +72,53 @@
33 u32 end_seq;
36 +struct tcp_out_options {
37 + u16 options; /* bit field of OPTION_* */
38 + u8 ws; /* window scale, 0 to disable */
39 + u8 num_sack_blocks;/* number of SACK blocks to include */
40 + u8 hash_size; /* bytes in hash_location */
41 + u16 mss; /* 0 to disable */
42 + __u8 *hash_location; /* temporary pointer, overloaded */
43 + __u32 tsval, tsecr; /* need to include OPTION_TS */
44 + struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
45 +#ifdef CONFIG_MPTCP
46 + u16 mptcp_options; /* bit field of MPTCP related OPTION_* */
47 + u8 dss_csum:1,
48 + add_addr_v4:1,
49 + add_addr_v6:1; /* dss-checksum required? */
51 + __u32 data_seq; /* data sequence number, for MPTCP */
52 + __u32 data_ack; /* data ack, for MPTCP */
54 + union {
55 + struct {
56 + __u64 sender_key; /* sender's key for mptcp */
57 + __u64 receiver_key; /* receiver's key for mptcp */
58 + } mp_capable;
60 + struct {
61 + __u64 sender_truncated_mac;
62 + __u32 sender_nonce;
63 + /* random number of the sender */
64 + __u32 token; /* token for mptcp */
65 + } mp_join_syns;
66 + };
68 + struct {
69 + struct in_addr addr;
70 + u8 addr_id;
71 + } add_addr4;
73 + struct {
74 + struct in6_addr addr;
75 + u8 addr_id;
76 + } add_addr6;
78 + u16 remove_addrs; /* list of address id */
79 + u8 addr_id; /* address id (mp_join or add_address) */
80 +#endif /* CONFIG_MPTCP */
81 +};
83 /*These are used to set the sack_ok field in struct tcp_options_received */
84 #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */
85 #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/
86 @@ -95,6 +142,9 @@
87 u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
90 +struct mptcp_cb;
91 +struct mptcp_tcp_sock;
93 static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
95 rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
96 @@ -123,6 +173,7 @@
97 * FastOpen it's the seq#
98 * after data-in-SYN.
100 + u8 saw_mpc:1;
103 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
104 @@ -130,6 +181,8 @@
105 return (struct tcp_request_sock *)req;
108 +struct tcp_md5sig_key;
110 struct tcp_sock {
111 /* inet_connection_sock has to be the first member of tcp_sock */
112 struct inet_connection_sock inet_conn;
113 @@ -323,6 +376,45 @@
114 * socket. Used to retransmit SYNACKs etc.
116 struct request_sock *fastopen_rsk;
119 + struct mptcp_cb *mpcb;
120 + struct sock *meta_sk;
121 + /* We keep these flags even if CONFIG_MPTCP is not checked, because
122 + * it allows checking MPTCP capability just by checking the mpc flag,
123 + * rather than adding ifdefs everywhere.
124 + */
125 + u16 mpc:1, /* Other end is multipath capable */
126 + inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
127 + send_mp_fclose:1,
128 + request_mptcp:1, /* Did we send out an MP_CAPABLE?
129 + * (this speeds up mptcp_doit() in tcp_recvmsg)
130 + */
131 + mptcp_enabled:1, /* Is MPTCP enabled from the application ? */
132 + pf:1, /* Potentially Failed state: when this flag is set, we
133 + * stop using the subflow
134 + */
135 + mp_killed:1, /* Killed with a tcp_done in mptcp? */
136 + was_meta_sk:1, /* This was a meta sk (in case of reuse) */
137 + close_it:1, /* Must close socket in mptcp_data_ready? */
138 + closing:1;
139 + struct mptcp_tcp_sock *mptcp;
140 +#ifdef CONFIG_MPTCP
141 + struct hlist_nulls_node tk_table;
142 + u32 mptcp_loc_token;
143 + u64 mptcp_loc_key;
144 +#endif /* CONFIG_MPTCP */
146 + /* Functions that depend on the value of the mpc flag */
147 + u32 (*__select_window)(struct sock *sk);
148 + u16 (*select_window)(struct sock *sk);
149 + void (*select_initial_window)(int __space, __u32 mss, __u32 *rcv_wnd,
150 + __u32 *window_clamp, int wscale_ok,
151 + __u8 *rcv_wscale, __u32 init_rcv_wnd,
152 + const struct sock *sk);
153 + void (*init_buffer_space)(struct sock *sk);
154 + void (*set_rto)(struct sock *sk);
155 + bool (*should_expand_sndbuf)(const struct sock *sk);
158 enum tsq_flags {
159 @@ -334,6 +426,8 @@
160 TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
161 * tcp_v{4|6}_mtu_reduced()
163 + MPTCP_PATH_MANAGER, /* MPTCP deferred creation of new subflows */
164 + MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
167 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
168 @@ -352,6 +446,7 @@
169 #ifdef CONFIG_TCP_MD5SIG
170 struct tcp_md5sig_key *tw_md5_key;
171 #endif
172 + struct mptcp_tw *mptcp_tw;
175 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
176 diff -Nur linux-3.14.45.orig/include/net/inet6_connection_sock.h linux-3.14.45/include/net/inet6_connection_sock.h
177 --- linux-3.14.45.orig/include/net/inet6_connection_sock.h 2015-06-23 02:01:36.000000000 +0200
178 +++ linux-3.14.45/include/net/inet6_connection_sock.h 2015-06-24 14:15:48.871862463 +0200
179 @@ -27,6 +27,8 @@
181 struct dst_entry *inet6_csk_route_req(struct sock *sk, struct flowi6 *fl6,
182 const struct request_sock *req);
183 +u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
184 + const u32 rnd, const u32 synq_hsize);
186 struct request_sock *inet6_csk_search_req(const struct sock *sk,
187 struct request_sock ***prevp,
188 diff -Nur linux-3.14.45.orig/include/net/inet_common.h linux-3.14.45/include/net/inet_common.h
189 --- linux-3.14.45.orig/include/net/inet_common.h 2015-06-23 02:01:36.000000000 +0200
190 +++ linux-3.14.45/include/net/inet_common.h 2015-06-24 14:15:48.871862463 +0200
191 @@ -1,6 +1,8 @@
192 #ifndef _INET_COMMON_H
193 #define _INET_COMMON_H
195 +#include <net/sock.h>
197 extern const struct proto_ops inet_stream_ops;
198 extern const struct proto_ops inet_dgram_ops;
200 @@ -13,6 +15,8 @@
201 struct sockaddr;
202 struct socket;
204 +int inet_create(struct net *net, struct socket *sock, int protocol, int kern);
205 +int inet6_create(struct net *net, struct socket *sock, int protocol, int kern);
206 int inet_release(struct socket *sock);
207 int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
208 int addr_len, int flags);
209 diff -Nur linux-3.14.45.orig/include/net/inet_connection_sock.h linux-3.14.45/include/net/inet_connection_sock.h
210 --- linux-3.14.45.orig/include/net/inet_connection_sock.h 2015-06-23 02:01:36.000000000 +0200
211 +++ linux-3.14.45/include/net/inet_connection_sock.h 2015-06-24 14:15:48.871862463 +0200
212 @@ -244,6 +244,9 @@
214 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
216 +u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
217 + const u32 synq_hsize);
219 struct request_sock *inet_csk_search_req(const struct sock *sk,
220 struct request_sock ***prevp,
221 const __be16 rport,
222 diff -Nur linux-3.14.45.orig/include/net/mptcp.h linux-3.14.45/include/net/mptcp.h
223 --- linux-3.14.45.orig/include/net/mptcp.h 1970-01-01 01:00:00.000000000 +0100
224 +++ linux-3.14.45/include/net/mptcp.h 2015-06-24 14:15:48.871862463 +0200
225 @@ -0,0 +1,1471 @@
227 + * MPTCP implementation
229 + * Initial Design & Implementation:
230 + * Sébastien Barré <sebastien.barre@uclouvain.be>
232 + * Current Maintainer & Author:
233 + * Christoph Paasch <christoph.paasch@uclouvain.be>
235 + * Additional authors:
236 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
237 + * Gregory Detal <gregory.detal@uclouvain.be>
238 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
239 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
240 + * Lavkesh Lahngir <lavkesh51@gmail.com>
241 + * Andreas Ripke <ripke@neclab.eu>
242 + * Vlad Dogaru <vlad.dogaru@intel.com>
243 + * Octavian Purdila <octavian.purdila@intel.com>
244 + * John Ronan <jronan@tssg.org>
245 + * Catalin Nicutar <catalin.nicutar@gmail.com>
246 + * Brandon Heller <brandonh@stanford.edu>
249 + * This program is free software; you can redistribute it and/or
250 + * modify it under the terms of the GNU General Public License
251 + * as published by the Free Software Foundation; either version
252 + * 2 of the License, or (at your option) any later version.
253 + */
255 +#ifndef _MPTCP_H
256 +#define _MPTCP_H
258 +#include <linux/inetdevice.h>
259 +#include <linux/ipv6.h>
260 +#include <linux/list.h>
261 +#include <linux/net.h>
262 +#include <linux/netpoll.h>
263 +#include <linux/skbuff.h>
264 +#include <linux/socket.h>
265 +#include <linux/tcp.h>
266 +#include <linux/kernel.h>
268 +#include <asm/byteorder.h>
269 +#include <asm/unaligned.h>
270 +#include <crypto/hash.h>
271 +#include <net/tcp.h>
273 +#if defined(__LITTLE_ENDIAN_BITFIELD)
274 + #define ntohll(x) be64_to_cpu(x)
275 + #define htonll(x) cpu_to_be64(x)
276 +#elif defined(__BIG_ENDIAN_BITFIELD)
277 + #define ntohll(x) (x)
278 + #define htonll(x) (x)
279 +#endif
281 +/* Max number of local or remote addresses we can store.
282 + * When changing, see the bitfield below in mptcp_loc4/6. */
283 +#define MPTCP_MAX_ADDR 8
285 +#define MPTCP_SUBFLOW_RETRY_DELAY 1000
287 +struct mptcp_loc4 {
288 + u8 loc4_id;
289 + u8 low_prio:1;
290 + struct in_addr addr;
293 +struct mptcp_rem4 {
294 + u8 rem4_id;
295 + u8 bitfield;
296 + u8 retry_bitfield;
297 + __be16 port;
298 + struct in_addr addr;
301 +struct mptcp_loc6 {
302 + u8 loc6_id;
303 + u8 low_prio:1;
304 + struct in6_addr addr;
307 +struct mptcp_rem6 {
308 + u8 rem6_id;
309 + u8 bitfield;
310 + u8 retry_bitfield;
311 + __be16 port;
312 + struct in6_addr addr;
315 +struct mptcp_request_sock {
316 + struct tcp_request_sock req;
317 + struct mptcp_cb *mpcb;
318 + /* Collision list in the tuple hashtable. We need to find
319 + * the req sock when receiving the third msg of the 3-way handshake,
320 + * since that one does not contain the token. If this makes
321 + * the request sock too long, we can use kmalloc'ed specific entries for
322 + * that tuple hashtable. At the moment, though, I extend the
323 + * request_sock.
324 + */
325 + struct list_head collide_tuple;
326 + struct hlist_nulls_node collide_tk;
327 + u32 mptcp_rem_nonce;
328 + u32 mptcp_loc_token;
329 + u64 mptcp_loc_key;
330 + u64 mptcp_rem_key;
331 + u64 mptcp_hash_tmac;
332 + u32 mptcp_loc_nonce;
333 + u8 loc_id;
334 + u8 rem_id; /* Address-id in the MP_JOIN */
335 + u8 dss_csum:1,
336 + low_prio:1;
339 +struct mptcp_options_received {
340 + u16 saw_mpc:1,
341 + dss_csum:1,
342 + drop_me:1,
344 + is_mp_join:1,
345 + join_ack:1,
347 + saw_low_prio:2, /* 0x1 - low-prio set for this subflow
348 + * 0x2 - low-prio set for another subflow
349 + */
350 + low_prio:1,
352 + saw_add_addr:2, /* Saw at least one add_addr option:
353 + * 0x1: IPv4 - 0x2: IPv6
354 + */
355 + more_add_addr:1, /* Saw one more add-addr. */
357 + saw_rem_addr:1, /* Saw at least one rem_addr option */
358 + more_rem_addr:1, /* Saw one more rem-addr. */
360 + mp_fail:1,
361 + mp_fclose:1;
362 + u8 rem_id; /* Address-id in the MP_JOIN */
363 + u8 prio_addr_id; /* Address-id in the MP_PRIO */
365 + const unsigned char *add_addr_ptr; /* Pointer to add-address option */
366 + const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */
368 + u32 data_ack;
369 + u32 data_seq;
370 + u16 data_len;
372 + u32 mptcp_rem_token;/* Remote token */
374 + /* Key inside the option (from mp_capable or fast_close) */
375 + u64 mptcp_key;
377 + u32 mptcp_recv_nonce;
378 + u64 mptcp_recv_tmac;
379 + u8 mptcp_recv_mac[20];
382 +struct mptcp_tcp_sock {
383 + struct tcp_sock *next; /* Next subflow socket */
384 + struct list_head cb_list;
385 + struct mptcp_options_received rx_opt;
387 + /* Those three fields record the current mapping */
388 + u64 map_data_seq;
389 + u32 map_subseq;
390 + u16 map_data_len;
391 + u16 slave_sk:1,
392 + fully_established:1,
393 + establish_increased:1,
394 + second_packet:1,
395 + attached:1,
396 + send_mp_fail:1,
397 + include_mpc:1,
398 + mapping_present:1,
399 + map_data_fin:1,
400 + low_prio:1, /* use this socket as backup */
401 + rcv_low_prio:1, /* Peer sent low-prio option to us */
402 + send_mp_prio:1, /* Trigger to send mp_prio on this socket */
403 + pre_established:1; /* State between sending 3rd ACK and
404 + * receiving the fourth ack of new subflows.
405 + */
407 + /* isn: needed to translate abs to relative subflow seqnums */
408 + u32 snt_isn;
409 + u32 rcv_isn;
410 + u32 last_data_seq;
411 + u8 path_index;
412 + u8 loc_id;
413 + u8 rem_id;
415 + u32 last_rbuf_opti; /* Timestamp of last rbuf optimization */
416 + unsigned int sent_pkts;
418 + struct sk_buff *shortcut_ofoqueue; /* Shortcut to the current modified
419 + * skb in the ofo-queue.
420 + */
422 + int init_rcv_wnd;
423 + u32 infinite_cutoff_seq;
424 + struct delayed_work work;
425 + u32 mptcp_loc_nonce;
426 + struct tcp_sock *tp; /* Where is my daddy? */
427 + u32 last_end_data_seq;
429 + /* MP_JOIN subflow: timer for retransmitting the 3rd ack */
430 + struct timer_list mptcp_ack_timer;
432 + /* HMAC of the third ack */
433 + char sender_mac[20];
436 +struct mptcp_tw {
437 + struct list_head list;
438 + u64 loc_key;
439 + u64 rcv_nxt;
440 + struct mptcp_cb __rcu *mpcb;
441 + u8 meta_tw:1,
442 + in_list:1;
445 +#define MPTCP_PM_NAME_MAX 16
446 +struct mptcp_pm_ops {
447 + struct list_head list;
449 + /* Signal the creation of a new MPTCP-session. */
450 + void (*new_session)(struct sock *meta_sk, int index);
451 + void (*release_sock)(struct sock *meta_sk);
452 + void (*fully_established)(struct sock *meta_sk);
453 + void (*new_remote_address)(struct sock *meta_sk);
454 + int (*get_local_index)(sa_family_t family, union inet_addr *addr,
455 + struct net *net);
456 + int (*get_local_id)(sa_family_t family, union inet_addr *addr,
457 + struct net *net);
458 + void (*addr_signal)(struct sock *sk, unsigned *size,
459 + struct tcp_out_options *opts, struct sk_buff *skb);
461 + char name[MPTCP_PM_NAME_MAX];
462 + struct module *owner;
465 +struct mptcp_cb {
466 + struct sock *meta_sk;
468 + /* list of sockets in this multipath connection */
469 + struct tcp_sock *connection_list;
470 + /* list of sockets that need a call to release_cb */
471 + struct list_head callback_list;
473 + spinlock_t tw_lock;
474 + struct list_head tw_list;
475 + unsigned char mptw_state;
477 + atomic_t mpcb_refcnt;
479 + /* High-order bits of 64-bit sequence numbers */
480 + u32 snd_high_order[2];
481 + u32 rcv_high_order[2];
483 + u16 send_infinite_mapping:1,
484 + in_time_wait:1,
485 + list_rcvd:1, /* XXX TO REMOVE */
486 + dss_csum:1,
487 + server_side:1,
488 + infinite_mapping_rcv:1,
489 + infinite_mapping_snd:1,
490 + dfin_combined:1, /* Was the DFIN combined with subflow-fin? */
491 + passive_close:1,
492 + snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */
493 + rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */
495 + /* socket count in this connection */
496 + u8 cnt_subflows;
497 + u8 cnt_established;
499 + u32 noneligible; /* Path mask of temporarily non
500 + * eligible subflows by the scheduler
501 + */
503 + struct sk_buff_head reinject_queue;
505 + u8 dfin_path_index;
507 +#define MPTCP_PM_SIZE 320
508 + u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8);
509 + struct mptcp_pm_ops *pm_ops;
511 + /* Mutex needed, because otherwise mptcp_close will complain that the
512 + * socket is owned by the user.
513 + * E.g., mptcp_sub_close_wq is taking the meta-lock.
514 + */
515 + struct mutex mpcb_mutex;
517 + /* Master socket, also part of the connection_list, this
518 + * socket is the one that the application sees.
519 + */
520 + struct sock *master_sk;
522 + u64 csum_cutoff_seq;
524 + __u64 mptcp_loc_key;
525 + __u32 mptcp_loc_token;
526 + __u64 mptcp_rem_key;
527 + __u32 mptcp_rem_token;
529 + /* Create a new subflow - necessary because the meta-sk may be IPv4, but
530 + * the new subflow can be IPv6
531 + */
532 + struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
533 + struct request_sock *req,
534 + struct dst_entry *dst);
536 + /* Remote addresses */
537 + struct mptcp_rem4 remaddr4[MPTCP_MAX_ADDR];
538 + u8 rem4_bits;
540 + struct mptcp_rem6 remaddr6[MPTCP_MAX_ADDR];
541 + u8 rem6_bits;
543 + u32 path_index_bits;
544 + /* Next pi to pick up in case a new path becomes available */
545 + u8 next_path_index;
547 + /* Original snd/rcvbuf of the initial subflow.
548 + * Used for the new subflows on the server-side to allow correct
549 + * autotuning
550 + */
551 + int orig_sk_rcvbuf;
552 + int orig_sk_sndbuf;
553 + u32 orig_window_clamp;
556 +#define MPTCP_SUB_CAPABLE 0
557 +#define MPTCP_SUB_LEN_CAPABLE_SYN 12
558 +#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12
559 +#define MPTCP_SUB_LEN_CAPABLE_ACK 20
560 +#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20
562 +#define MPTCP_SUB_JOIN 1
563 +#define MPTCP_SUB_LEN_JOIN_SYN 12
564 +#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12
565 +#define MPTCP_SUB_LEN_JOIN_SYNACK 16
566 +#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16
567 +#define MPTCP_SUB_LEN_JOIN_ACK 24
568 +#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24
570 +#define MPTCP_SUB_DSS 2
571 +#define MPTCP_SUB_LEN_DSS 4
572 +#define MPTCP_SUB_LEN_DSS_ALIGN 4
574 +/* Lengths for seq and ack are the ones without the generic MPTCP-option header,
575 + * as they are part of the DSS-option.
576 + * To get the total length, just add the different options together.
577 + */
578 +#define MPTCP_SUB_LEN_SEQ 10
579 +#define MPTCP_SUB_LEN_SEQ_CSUM 12
580 +#define MPTCP_SUB_LEN_SEQ_ALIGN 12
582 +#define MPTCP_SUB_LEN_SEQ_64 14
583 +#define MPTCP_SUB_LEN_SEQ_CSUM_64 16
584 +#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16
586 +#define MPTCP_SUB_LEN_ACK 4
587 +#define MPTCP_SUB_LEN_ACK_ALIGN 4
589 +#define MPTCP_SUB_LEN_ACK_64 8
590 +#define MPTCP_SUB_LEN_ACK_64_ALIGN 8
592 +/* This is the "default" option-length we will send out most often.
593 + * MPTCP DSS-header
594 + * 32-bit data sequence number
595 + * 32-bit data ack
597 + * It is necessary to calculate the effective MSS we will be using when
598 + * sending data.
599 + */
600 +#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \
601 + MPTCP_SUB_LEN_SEQ_ALIGN + \
602 + MPTCP_SUB_LEN_ACK_ALIGN)
604 +#define MPTCP_SUB_ADD_ADDR 3
605 +#define MPTCP_SUB_LEN_ADD_ADDR4 8
606 +#define MPTCP_SUB_LEN_ADD_ADDR6 20
607 +#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8
608 +#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20
610 +#define MPTCP_SUB_REMOVE_ADDR 4
611 +#define MPTCP_SUB_LEN_REMOVE_ADDR 4
613 +#define MPTCP_SUB_PRIO 5
614 +#define MPTCP_SUB_LEN_PRIO 3
615 +#define MPTCP_SUB_LEN_PRIO_ADDR 4
616 +#define MPTCP_SUB_LEN_PRIO_ALIGN 4
618 +#define MPTCP_SUB_FAIL 6
619 +#define MPTCP_SUB_LEN_FAIL 12
620 +#define MPTCP_SUB_LEN_FAIL_ALIGN 12
622 +#define MPTCP_SUB_FCLOSE 7
623 +#define MPTCP_SUB_LEN_FCLOSE 12
624 +#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12
627 +#define OPTION_MPTCP (1 << 5)
629 +static inline void reset_mpc(struct tcp_sock *tp)
631 + tp->mpc = 0;
633 + tp->__select_window = __tcp_select_window;
634 + tp->select_window = tcp_select_window;
635 + tp->select_initial_window = tcp_select_initial_window;
636 + tp->init_buffer_space = tcp_init_buffer_space;
637 + tp->set_rto = tcp_set_rto;
638 + tp->should_expand_sndbuf = tcp_should_expand_sndbuf;
641 +/* Initializes MPTCP flags in tcp_sock (and other tcp_sock members that depend
642 + * on those flags).
643 + */
644 +static inline void mptcp_init_tcp_sock(struct tcp_sock *tp)
646 + reset_mpc(tp);
649 +#ifdef CONFIG_MPTCP
651 +/* Used for checking if the mptcp initialization has been successful */
652 +extern bool mptcp_init_failed;
654 +/* MPTCP options */
655 +#define OPTION_TYPE_SYN (1 << 0)
656 +#define OPTION_TYPE_SYNACK (1 << 1)
657 +#define OPTION_TYPE_ACK (1 << 2)
658 +#define OPTION_MP_CAPABLE (1 << 3)
659 +#define OPTION_DATA_ACK (1 << 4)
660 +#define OPTION_ADD_ADDR (1 << 5)
661 +#define OPTION_MP_JOIN (1 << 6)
662 +#define OPTION_MP_FAIL (1 << 7)
663 +#define OPTION_MP_FCLOSE (1 << 8)
664 +#define OPTION_REMOVE_ADDR (1 << 9)
665 +#define OPTION_MP_PRIO (1 << 10)
667 +/* MPTCP flags */
668 +#define MPTCPHDR_ACK 0x01
669 +#define MPTCPHDR_SEQ 0x02
670 +#define MPTCPHDR_FIN 0x04
671 +#define MPTCPHDR_INF 0x08
672 +#define MPTCPHDR_SEQ64_SET 0x10 /* Did we receive a 64-bit seq number */
673 +#define MPTCPHDR_SEQ64_OFO 0x20 /* Is it not in our circular array? */
674 +#define MPTCPHDR_SEQ64_INDEX 0x40 /* Index of seq in mpcb->snd_high_order */
675 +#define MPTCPHDR_DSS_CSUM 0x80
677 +/* It is impossible that all 8 bits of mptcp_flags are set to 1 with the above
678 + * Thus, defining MPTCPHDR_JOIN as 0xFF is safe.
679 + */
680 +#define MPTCPHDR_JOIN 0xFF
682 +struct mptcp_option {
683 + __u8 kind;
684 + __u8 len;
685 +#if defined(__LITTLE_ENDIAN_BITFIELD)
686 + __u8 ver:4,
687 + sub:4;
688 +#elif defined(__BIG_ENDIAN_BITFIELD)
689 + __u8 sub:4,
690 + ver:4;
691 +#else
692 +#error "Adjust your <asm/byteorder.h> defines"
693 +#endif
696 +struct mp_capable {
697 + __u8 kind;
698 + __u8 len;
699 +#if defined(__LITTLE_ENDIAN_BITFIELD)
700 + __u8 ver:4,
701 + sub:4;
702 + __u8 h:1,
703 + rsv:5,
704 + b:1,
705 + a:1;
706 +#elif defined(__BIG_ENDIAN_BITFIELD)
707 + __u8 sub:4,
708 + ver:4;
709 + __u8 a:1,
710 + b:1,
711 + rsv:5,
712 + h:1;
713 +#else
714 +#error "Adjust your <asm/byteorder.h> defines"
715 +#endif
716 + __u64 sender_key;
717 + __u64 receiver_key;
718 +} __attribute__((__packed__));
720 +struct mp_join {
721 + __u8 kind;
722 + __u8 len;
723 +#if defined(__LITTLE_ENDIAN_BITFIELD)
724 + __u8 b:1,
725 + rsv:3,
726 + sub:4;
727 +#elif defined(__BIG_ENDIAN_BITFIELD)
728 + __u8 sub:4,
729 + rsv:3,
730 + b:1;
731 +#else
732 +#error "Adjust your <asm/byteorder.h> defines"
733 +#endif
734 + __u8 addr_id;
735 + union {
736 + struct {
737 + u32 token;
738 + u32 nonce;
739 + } syn;
740 + struct {
741 + __u64 mac;
742 + u32 nonce;
743 + } synack;
744 + struct {
745 + __u8 mac[20];
746 + } ack;
747 + } u;
748 +} __attribute__((__packed__));
750 +struct mp_dss {
751 + __u8 kind;
752 + __u8 len;
753 +#if defined(__LITTLE_ENDIAN_BITFIELD)
754 + __u16 rsv1:4,
755 + sub:4,
756 + A:1,
757 + a:1,
758 + M:1,
759 + m:1,
760 + F:1,
761 + rsv2:3;
762 +#elif defined(__BIG_ENDIAN_BITFIELD)
763 + __u16 sub:4,
764 + rsv1:4,
765 + rsv2:3,
766 + F:1,
767 + m:1,
768 + M:1,
769 + a:1,
770 + A:1;
771 +#else
772 +#error "Adjust your <asm/byteorder.h> defines"
773 +#endif
776 +struct mp_add_addr {
777 + __u8 kind;
778 + __u8 len;
779 +#if defined(__LITTLE_ENDIAN_BITFIELD)
780 + __u8 ipver:4,
781 + sub:4;
782 +#elif defined(__BIG_ENDIAN_BITFIELD)
783 + __u8 sub:4,
784 + ipver:4;
785 +#else
786 +#error "Adjust your <asm/byteorder.h> defines"
787 +#endif
788 + __u8 addr_id;
789 + union {
790 + struct {
791 + struct in_addr addr;
792 + __be16 port;
793 + } v4;
794 + struct {
795 + struct in6_addr addr;
796 + __be16 port;
797 + } v6;
798 + } u;
799 +} __attribute__((__packed__));
801 +struct mp_remove_addr {
802 + __u8 kind;
803 + __u8 len;
804 +#if defined(__LITTLE_ENDIAN_BITFIELD)
805 + __u8 rsv:4,
806 + sub:4;
807 +#elif defined(__BIG_ENDIAN_BITFIELD)
808 + __u8 sub:4,
809 + rsv:4;
810 +#else
811 +#error "Adjust your <asm/byteorder.h> defines"
812 +#endif
813 + /* list of addr_id */
814 + __u8 addrs_id;
817 +struct mp_fail {
818 + __u8 kind;
819 + __u8 len;
820 +#if defined(__LITTLE_ENDIAN_BITFIELD)
821 + __u16 rsv1:4,
822 + sub:4,
823 + rsv2:8;
824 +#elif defined(__BIG_ENDIAN_BITFIELD)
825 + __u16 sub:4,
826 + rsv1:4,
827 + rsv2:8;
828 +#else
829 +#error "Adjust your <asm/byteorder.h> defines"
830 +#endif
831 + __be64 data_seq;
832 +} __attribute__((__packed__));
834 +struct mp_fclose {
835 + __u8 kind;
836 + __u8 len;
837 +#if defined(__LITTLE_ENDIAN_BITFIELD)
838 + __u16 rsv1:4,
839 + sub:4,
840 + rsv2:8;
841 +#elif defined(__BIG_ENDIAN_BITFIELD)
842 + __u16 sub:4,
843 + rsv1:4,
844 + rsv2:8;
845 +#else
846 +#error "Adjust your <asm/byteorder.h> defines"
847 +#endif
848 + __u64 key;
849 +} __attribute__((__packed__));
851 +struct mp_prio {
852 + __u8 kind;
853 + __u8 len;
854 +#if defined(__LITTLE_ENDIAN_BITFIELD)
855 + __u8 b:1,
856 + rsv:3,
857 + sub:4;
858 +#elif defined(__BIG_ENDIAN_BITFIELD)
859 + __u8 sub:4,
860 + rsv:3,
861 + b:1;
862 +#else
863 +#error "Adjust your <asm/byteorder.h> defines"
864 +#endif
865 + __u8 addr_id;
866 +} __attribute__((__packed__));
868 +static inline int mptcp_sub_len_dss(struct mp_dss *m, int csum)
870 + return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2);
873 +#define MPTCP_APP 2
875 +extern int sysctl_mptcp_enabled;
876 +extern int sysctl_mptcp_checksum;
877 +extern int sysctl_mptcp_debug;
878 +extern int sysctl_mptcp_syn_retries;
880 +extern struct workqueue_struct *mptcp_wq;
882 +#define mptcp_debug(fmt, args...) \
883 + do { \
884 + if (unlikely(sysctl_mptcp_debug)) \
885 + pr_err(__FILE__ ": " fmt, ##args); \
886 + } while (0)
888 +/* Iterates over all subflows */
889 +#define mptcp_for_each_tp(mpcb, tp) \
890 + for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next)
892 +#define mptcp_for_each_sk(mpcb, sk) \
893 + for ((sk) = (struct sock *)(mpcb)->connection_list; \
894 + sk; \
895 + sk = (struct sock *)tcp_sk(sk)->mptcp->next)
897 +#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) \
898 + for (__sk = (struct sock *)(__mpcb)->connection_list, \
899 + __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \
900 + __sk; \
901 + __sk = __temp, \
902 + __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL)
904 +/* Iterates over all bits set to 1 in a bitset */
905 +#define mptcp_for_each_bit_set(b, i) \
906 + for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
908 +#define mptcp_for_each_bit_unset(b, i) \
909 + mptcp_for_each_bit_set(~b, i)
911 +extern struct lock_class_key meta_key;
912 +extern struct lock_class_key meta_slock_key;
913 +extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4];
915 +/* This is needed to ensure that two subsequent key-generation result in
916 + * different keys if the IPs and ports are the same.
917 + */
918 +extern u32 mptcp_key_seed;
920 +#define MPTCP_HASH_SIZE 1024
922 +extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
924 +/* This second hashtable is needed to retrieve request socks
925 + * created as a result of a join request. While the SYN contains
926 + * the token, the final ack does not, so we need a separate hashtable
927 + * to retrieve the mpcb.
928 + */
929 +extern struct list_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
930 +extern spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
932 +/* Lock, protecting the two hash-tables that hold the token. Namely,
933 + * mptcp_reqsk_tk_htb and tk_hashtable
934 + */
935 +extern spinlock_t mptcp_tk_hashlock; /* hashtable protection */
937 +void mptcp_data_ready(struct sock *sk, int bytes);
938 +void mptcp_write_space(struct sock *sk);
940 +void mptcp_add_meta_ofo_queue(struct sock *meta_sk, struct sk_buff *skb,
941 + struct sock *sk);
942 +void mptcp_ofo_queue(struct sock *meta_sk);
943 +void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp);
944 +void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied);
945 +int mptcp_alloc_mpcb(struct sock *master_sk, __u64 remote_key, u32 window);
946 +int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
947 + gfp_t flags);
948 +void mptcp_del_sock(struct sock *sk);
949 +void mptcp_update_metasocket(struct sock *sock, struct sock *meta_sk);
950 +void mptcp_reinject_data(struct sock *orig_sk, int clone_it);
951 +void mptcp_update_sndbuf(struct mptcp_cb *mpcb);
952 +struct sk_buff *mptcp_next_segment(struct sock *sk, int *reinject);
953 +void mptcp_send_fin(struct sock *meta_sk);
954 +void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority);
955 +int mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
956 + int push_one, gfp_t gfp);
957 +void mptcp_parse_options(const uint8_t *ptr, int opsize,
958 + struct tcp_options_received *opt_rx,
959 + struct mptcp_options_received *mopt,
960 + const struct sk_buff *skb);
961 +void mptcp_syn_options(struct sock *sk, struct tcp_out_options *opts,
962 + unsigned *remaining);
963 +void mptcp_synack_options(struct request_sock *req,
964 + struct tcp_out_options *opts,
965 + unsigned *remaining);
966 +void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
967 + struct tcp_out_options *opts, unsigned *size);
968 +void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
969 + struct tcp_out_options *opts,
970 + struct sk_buff *skb);
971 +void mptcp_close(struct sock *meta_sk, long timeout);
972 +int mptcp_doit(struct sock *sk);
973 +int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window);
974 +int mptcp_check_req_master(struct sock *sk, struct sock *child,
975 + struct request_sock *req,
976 + struct request_sock **prev,
977 + struct mptcp_options_received *mopt);
978 +struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child,
979 + struct request_sock *req,
980 + struct request_sock **prev,
981 + struct mptcp_options_received *mopt);
982 +u32 __mptcp_select_window(struct sock *sk);
983 +void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
984 + __u32 *window_clamp, int wscale_ok,
985 + __u8 *rcv_wscale, __u32 init_rcv_wnd,
986 + const struct sock *sk);
987 +unsigned int mptcp_current_mss(struct sock *meta_sk);
988 +int mptcp_select_size(const struct sock *meta_sk, bool sg);
989 +void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn);
990 +void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
991 + u32 *hash_out);
992 +void mptcp_clean_rtx_infinite(struct sk_buff *skb, struct sock *sk);
993 +void mptcp_fin(struct sock *meta_sk);
994 +void mptcp_retransmit_timer(struct sock *meta_sk);
995 +int mptcp_write_wakeup(struct sock *meta_sk);
996 +void mptcp_sub_close_wq(struct work_struct *work);
997 +void mptcp_sub_close(struct sock *sk, unsigned long delay);
998 +struct sock *mptcp_select_ack_sock(const struct sock *meta_sk, int copied);
999 +void mptcp_fallback_meta_sk(struct sock *meta_sk);
1000 +int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
1001 +struct sock *mptcp_sk_clone(const struct sock *sk, int family, const gfp_t priority);
1002 +void mptcp_ack_handler(unsigned long);
1003 +int mptcp_check_rtt(const struct tcp_sock *tp, int time);
1004 +int mptcp_check_snd_buf(const struct tcp_sock *tp);
1005 +int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, struct sk_buff *skb);
1006 +void __init mptcp_init(void);
1007 +int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len);
1008 +int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1009 + unsigned int mss_now, int reinject);
1010 +int mptso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1011 + unsigned int mss_now, gfp_t gfp, int reinject);
1012 +void mptcp_destroy_sock(struct sock *sk);
1013 +int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
1014 + struct sk_buff *skb,
1015 + struct mptcp_options_received *mopt);
1016 +unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, u32 mss_now,
1017 + int large_allowed);
1018 +int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw);
1019 +void mptcp_twsk_destructor(struct tcp_timewait_sock *tw);
1020 +void mptcp_update_tw_socks(const struct tcp_sock *tp, int state);
1021 +void mptcp_disconnect(struct sock *sk);
1022 +bool mptcp_should_expand_sndbuf(const struct sock *sk);
1023 +int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb);
1024 +void mptcp_tsq_flags(struct sock *sk);
1025 +void mptcp_tsq_sub_deferred(struct sock *meta_sk);
1026 +struct mp_join *mptcp_find_join(struct sk_buff *skb);
1027 +void mptcp_hash_remove_bh(struct tcp_sock *meta_tp);
1028 +void mptcp_hash_remove(struct tcp_sock *meta_tp);
1029 +struct sock *mptcp_hash_find(struct net *net, u32 token);
1030 +int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw);
1031 +int mptcp_do_join_short(struct sk_buff *skb, struct mptcp_options_received *mopt,
1032 + struct tcp_options_received *tmp_opt, struct net *net);
1033 +void mptcp_reqsk_destructor(struct request_sock *req);
1034 +void mptcp_reqsk_new_mptcp(struct request_sock *req,
1035 + const struct tcp_options_received *rx_opt,
1036 + const struct mptcp_options_received *mopt,
1037 + const struct sk_buff *skb);
1038 +int mptcp_check_req(struct sk_buff *skb, struct net *net);
1039 +void mptcp_connect_init(struct sock *sk);
1040 +void mptcp_sub_force_close(struct sock *sk);
1041 +int mptcp_sub_len_remove_addr_align(u16 bitfield);
1042 +void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
1043 + const struct sk_buff *skb);
1044 +void mptcp_init_buffer_space(struct sock *sk);
1046 +/* MPTCP-path-manager registration/initialization functions */
1047 +int mptcp_register_path_manager(struct mptcp_pm_ops *pm);
1048 +void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm);
1049 +void mptcp_init_path_manager(struct mptcp_cb *mpcb);
1050 +void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb);
1051 +void mptcp_fallback_default(struct mptcp_cb *mpcb);
1052 +void mptcp_get_default_path_manager(char *name);
1053 +int mptcp_set_default_path_manager(const char *name);
1054 +extern struct mptcp_pm_ops mptcp_pm_default;
1056 +static inline
1057 +struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req)
1059 + return (struct mptcp_request_sock *)req;
1062 +static inline
1063 +struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req)
1065 + return (struct request_sock *)req;
1068 +static inline bool mptcp_can_sendpage(struct sock *sk)
1070 + struct sock *sk_it;
1072 + if (tcp_sk(sk)->mpcb->dss_csum)
1073 + return false;
1075 + mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
1076 + if (!(sk_it->sk_route_caps & NETIF_F_SG) ||
1077 + !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM))
1078 + return false;
1081 + return true;
1084 +static inline void mptcp_push_pending_frames(struct sock *meta_sk)
1086 + if (mptcp_next_segment(meta_sk, NULL)) {
1087 + struct tcp_sock *tp = tcp_sk(meta_sk);
1089 + /* We don't care about the MSS, because it will be set in
1090 + * mptcp_write_xmit.
1091 + */
1092 + __tcp_push_pending_frames(meta_sk, 0, tp->nonagle);
1096 +static inline void mptcp_send_reset(struct sock *sk)
1098 + tcp_send_active_reset(sk, GFP_ATOMIC);
1099 + mptcp_sub_force_close(sk);
1102 +static inline int mptcp_is_data_seq(const struct sk_buff *skb)
1104 + return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
1107 +static inline int mptcp_is_data_fin(const struct sk_buff *skb)
1109 + return mptcp_is_data_seq(skb) &&
1110 + (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN);
1113 +/* Is it a data-fin while in infinite mapping mode?
1114 + * In infinite mode, a subflow-fin is in fact a data-fin.
1115 + */
1116 +static inline int mptcp_is_data_fin2(const struct sk_buff *skb,
1117 + const struct tcp_sock *tp)
1119 + return mptcp_is_data_fin(skb) ||
1120 + (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin);
1123 +static inline void mptcp_skb_entail_init(const struct tcp_sock *tp,
1124 + struct sk_buff *skb)
1126 + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_SEQ;
1129 +static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
1131 + u64 data_seq_high = (u32)(data_seq >> 32);
1133 + if (mpcb->rcv_high_order[0] == data_seq_high)
1134 + return 0;
1135 + else if (mpcb->rcv_high_order[1] == data_seq_high)
1136 + return MPTCPHDR_SEQ64_INDEX;
1137 + else
1138 + return MPTCPHDR_SEQ64_OFO;
1141 +/* Sets the data_seq and returns pointer to the in-skb field of the data_seq.
1142 + * If the packet has a 64-bit dseq, the pointer points to the last 32 bits.
1143 + */
1144 +static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
1145 + u32 *data_seq,
1146 + struct mptcp_cb *mpcb)
1148 + __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
1150 + if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
1151 + u64 data_seq64 = get_unaligned_be64(ptr);
1153 + if (mpcb)
1154 + TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);
1156 + *data_seq = (u32)data_seq64;
1157 + ptr++;
1158 + } else {
1159 + *data_seq = get_unaligned_be32(ptr);
1162 + return ptr;
1165 +static inline struct sock *mptcp_meta_sk(const struct sock *sk)
1167 + return tcp_sk(sk)->meta_sk;
1170 +static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
1172 + return tcp_sk(tp->meta_sk);
1175 +static inline int is_meta_tp(const struct tcp_sock *tp)
1177 + return tp->mpcb && mptcp_meta_tp(tp) == tp;
1180 +static inline int is_meta_sk(const struct sock *sk)
1182 + return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
1183 + tcp_sk(sk)->mpc && mptcp_meta_sk(sk) == sk;
1186 +static inline int is_master_tp(const struct tcp_sock *tp)
1188 + return !tp->mpc || (!tp->mptcp->slave_sk && !is_meta_tp(tp));
1191 +static inline void mptcp_hash_request_remove(struct request_sock *req)
1193 + int in_softirq = 0;
1195 + if (list_empty(&mptcp_rsk(req)->collide_tuple))
1196 + return;
1198 + if (in_softirq()) {
1199 + spin_lock(&mptcp_reqsk_hlock);
1200 + in_softirq = 1;
1201 + } else {
1202 + spin_lock_bh(&mptcp_reqsk_hlock);
1205 + list_del(&mptcp_rsk(req)->collide_tuple);
1207 + if (in_softirq)
1208 + spin_unlock(&mptcp_reqsk_hlock);
1209 + else
1210 + spin_unlock_bh(&mptcp_reqsk_hlock);
1213 +static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
1215 + mopt->saw_mpc = 0;
1216 + mopt->dss_csum = 0;
1217 + mopt->drop_me = 0;
1219 + mopt->is_mp_join = 0;
1220 + mopt->join_ack = 0;
1222 + mopt->saw_low_prio = 0;
1223 + mopt->low_prio = 0;
1225 + mopt->saw_add_addr = 0;
1226 + mopt->more_add_addr = 0;
1228 + mopt->saw_rem_addr = 0;
1229 + mopt->more_rem_addr = 0;
1231 + mopt->mp_fail = 0;
1232 + mopt->mp_fclose = 0;
1235 +static inline void mptcp_reset_mopt(struct tcp_sock *tp)
1237 + struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
1239 + mopt->saw_low_prio = 0;
1240 + mopt->saw_add_addr = 0;
1241 + mopt->more_add_addr = 0;
1242 + mopt->saw_rem_addr = 0;
1243 + mopt->more_rem_addr = 0;
1244 + mopt->join_ack = 0;
1245 + mopt->mp_fail = 0;
1246 + mopt->mp_fclose = 0;
1249 +static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb,
1250 + const struct mptcp_cb *mpcb)
1252 + return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
1253 + MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
1256 +static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index,
1257 + u32 data_seq_32)
1259 + return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
1262 +static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
1264 + struct mptcp_cb *mpcb = meta_tp->mpcb;
1265 + return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
1266 + meta_tp->rcv_nxt);
1269 +static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
1271 + if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
1272 + struct mptcp_cb *mpcb = meta_tp->mpcb;
1273 + mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
1274 + mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
1278 +static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp,
1279 + u32 old_rcv_nxt)
1281 + if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
1282 + struct mptcp_cb *mpcb = meta_tp->mpcb;
1283 + mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
1284 + mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
1288 +static inline int mptcp_sk_can_send(const struct sock *sk)
1290 + return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1291 + !tcp_sk(sk)->mptcp->pre_established;
1294 +static inline int mptcp_sk_can_recv(const struct sock *sk)
1296 + return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCP_FIN_WAIT1 | TCP_FIN_WAIT2);
1299 +static inline int mptcp_sk_can_send_ack(const struct sock *sk)
1301 + return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV |
1302 + TCPF_CLOSE | TCPF_LISTEN)) &&
1303 + !tcp_sk(sk)->mptcp->pre_established;
1306 +/* Only support GSO if all subflows support it */
1307 +static inline bool mptcp_sk_can_gso(const struct sock *meta_sk)
1309 + struct sock *sk;
1311 + if (tcp_sk(meta_sk)->mpcb->dss_csum)
1312 + return 0;
1314 + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1315 + if (!mptcp_sk_can_send(sk))
1316 + continue;
1317 + if (!sk_can_gso(sk))
1318 + return false;
1320 + return true;
1323 +static inline bool mptcp_can_sg(const struct sock *meta_sk)
1325 + struct sock *sk;
1327 + if (tcp_sk(meta_sk)->mpcb->dss_csum)
1328 + return 0;
1330 + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1331 + if (!mptcp_sk_can_send(sk))
1332 + continue;
1333 + if (!(sk->sk_route_caps & NETIF_F_SG))
1334 + return false;
1336 + return true;
1339 +static inline void mptcp_set_rto(struct sock *sk)
1341 + struct tcp_sock *tp = tcp_sk(sk);
1342 + struct sock *sk_it;
1343 + struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk));
1344 + __u32 max_rto = 0;
1346 + /* We are in recovery-phase on the MPTCP-level. Do not update the
1347 + * RTO, because this would kill exponential backoff.
1348 + */
1349 + if (micsk->icsk_retransmits)
1350 + return;
1352 + mptcp_for_each_sk(tp->mpcb, sk_it) {
1353 + if (mptcp_sk_can_send(sk_it) &&
1354 + inet_csk(sk_it)->icsk_rto > max_rto)
1355 + max_rto = inet_csk(sk_it)->icsk_rto;
1357 + if (max_rto) {
1358 + micsk->icsk_rto = max_rto << 1;
1360 + /* A successful rto-measurement - reset backoff counter */
1361 + micsk->icsk_backoff = 0;
1365 +static inline int mptcp_sysctl_syn_retries(void)
1367 + return sysctl_mptcp_syn_retries;
1370 +static inline void mptcp_sub_close_passive(struct sock *sk)
1372 + struct sock *meta_sk = mptcp_meta_sk(sk);
1373 + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk);
1375 + /* Only close, if the app did a send-shutdown (passive close), and we
1376 + * received the data-ack of the data-fin.
1377 + */
1378 + if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq)
1379 + mptcp_sub_close(sk, 0);
1382 +static inline bool mptcp_fallback_infinite(struct sock *sk, int flag)
1384 + struct tcp_sock *tp = tcp_sk(sk);
1386 + /* If data has been acknowledged on the meta-level, fully_established
1387 + * will have been set before and thus we will not fall back to infinite
1388 + * mapping.
1389 + */
1390 + if (likely(tp->mptcp->fully_established))
1391 + return false;
1393 + if (!(flag & MPTCP_FLAG_DATA_ACKED))
1394 + return false;
1396 + /* Don't fallback twice ;) */
1397 + if (tp->mpcb->infinite_mapping_snd)
1398 + return false;
1400 + pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n",
1401 + __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index,
1402 + &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr,
1403 + __builtin_return_address(0));
1404 + if (!is_master_tp(tp))
1405 + return true;
1407 + tp->mpcb->infinite_mapping_snd = 1;
1408 + tp->mpcb->infinite_mapping_rcv = 1;
1409 + tp->mptcp->fully_established = 1;
1411 + return false;
1414 +/* Find the first free index in the bitfield */
1415 +static inline int __mptcp_find_free_index(u8 bitfield, int j, u8 base)
1417 + int i;
1418 + mptcp_for_each_bit_unset(bitfield >> base, i) {
1419 + /* We wrapped at the bitfield - try from 0 on */
1420 + if (i + base >= sizeof(bitfield) * 8) {
1421 + mptcp_for_each_bit_unset(bitfield, i) {
1422 + if (i >= sizeof(bitfield) * 8)
1423 + goto exit;
1425 + if (i != j)
1426 + return i;
1428 + goto exit;
1430 + if (i + base >= sizeof(bitfield) * 8)
1431 + break;
1433 + if (i + base != j)
1434 + return i + base;
1436 +exit:
1437 + return -1;
1440 +static inline int mptcp_find_free_index(u8 bitfield)
1442 + return __mptcp_find_free_index(bitfield, -1, 0);
1445 +/* Find the first index whose bit in the bit-field == 0 */
1446 +static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb)
1448 + u8 base = mpcb->next_path_index;
1449 + int i;
1451 + /* Start at 1, because 0 is reserved for the meta-sk */
1452 + mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) {
1453 + if (i + base < 1)
1454 + continue;
1455 + if (i + base >= sizeof(mpcb->path_index_bits) * 8)
1456 + break;
1457 + i += base;
1458 + mpcb->path_index_bits |= (1 << i);
1459 + mpcb->next_path_index = i + 1;
1460 + return i;
1462 + mptcp_for_each_bit_unset(mpcb->path_index_bits, i) {
1463 + if (i >= sizeof(mpcb->path_index_bits) * 8)
1464 + break;
1465 + if (i < 1)
1466 + continue;
1467 + mpcb->path_index_bits |= (1 << i);
1468 + mpcb->next_path_index = i + 1;
1469 + return i;
1472 + return 0;
1475 +static inline int mptcp_v6_is_v4_mapped(struct sock *sk)
1477 + return sk->sk_family == AF_INET6 &&
1478 + ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED;
1481 +/* TCP and MPTCP mpc flag-depending functions */
1482 +u16 mptcp_select_window(struct sock *sk);
1483 +void mptcp_init_buffer_space(struct sock *sk);
1484 +void mptcp_tcp_set_rto(struct sock *sk);
1486 +static inline void set_mpc(struct tcp_sock *tp)
1488 + tp->mpc = 1;
1490 + tp->__select_window = __mptcp_select_window;
1491 + tp->select_window = mptcp_select_window;
1492 + tp->select_initial_window = mptcp_select_initial_window;
1493 + tp->init_buffer_space = mptcp_init_buffer_space;
1494 + tp->set_rto = mptcp_tcp_set_rto;
1495 + tp->should_expand_sndbuf = mptcp_should_expand_sndbuf;
1498 +#else /* CONFIG_MPTCP */
1499 +#define mptcp_debug(fmt, args...) \
1500 + do { \
1501 + } while (0)
1503 +/* Without MPTCP, we just do one iteration
1504 + * over the only socket available. This assumes that
1505 + * the sk/tp arg is the socket in that case.
1506 + */
1507 +#define mptcp_for_each_sk(mpcb, sk)
1508 +#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp)
1510 +static inline int mptcp_is_data_fin(const struct sk_buff *skb)
1512 + return 0;
1514 +static inline int mptcp_is_data_seq(const struct sk_buff *skb)
1516 + return 0;
1518 +static inline struct sock *mptcp_meta_sk(const struct sock *sk)
1520 + return NULL;
1522 +static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
1524 + return NULL;
1526 +static inline int is_meta_sk(const struct sock *sk)
1528 + return 0;
1530 +static inline int is_master_tp(const struct tcp_sock *tp)
1532 + return 0;
1534 +static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {}
1535 +static inline void mptcp_cleanup_rbuf(const struct sock *meta_sk, int copied) {}
1536 +static inline void mptcp_del_sock(const struct sock *sk) {}
1537 +static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {}
1538 +static inline void mptcp_update_sndbuf(const struct mptcp_cb *mpcb) {}
1539 +static inline void mptcp_skb_entail_init(const struct tcp_sock *tp,
1540 + const struct sk_buff *skb) {}
1541 +static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb,
1542 + const struct sock *sk) {}
1543 +static inline void mptcp_retransmit_timer(const struct sock *meta_sk) {}
1544 +static inline int mptcp_write_wakeup(struct sock *meta_sk)
1546 + return 0;
1548 +static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {}
1549 +static inline void mptcp_set_rto(const struct sock *sk) {}
1550 +static inline void mptcp_send_fin(const struct sock *meta_sk) {}
1551 +static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize,
1552 + const struct tcp_options_received *opt_rx,
1553 + const struct mptcp_options_received *mopt,
1554 + const struct sk_buff *skb) {}
1555 +static inline void mptcp_syn_options(struct sock *sk,
1556 + struct tcp_out_options *opts,
1557 + unsigned *remaining) {}
1558 +static inline void mptcp_synack_options(struct request_sock *req,
1559 + struct tcp_out_options *opts,
1560 + unsigned *remaining) {}
1562 +static inline void mptcp_established_options(struct sock *sk,
1563 + struct sk_buff *skb,
1564 + struct tcp_out_options *opts,
1565 + unsigned *size) {}
1566 +static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
1567 + struct tcp_out_options *opts,
1568 + struct sk_buff *skb) {}
1569 +static inline void mptcp_close(struct sock *meta_sk, long timeout) {}
1570 +static inline int mptcp_doit(struct sock *sk)
1572 + return 0;
1574 +static inline int mptcp_check_req_master(const struct sock *sk,
1575 + const struct sock *child,
1576 + struct request_sock *req,
1577 + struct request_sock **prev,
1578 + const struct mptcp_options_received *mopt)
1580 + return 1;
1582 +static inline struct sock *mptcp_check_req_child(struct sock *sk,
1583 + struct sock *child,
1584 + struct request_sock *req,
1585 + struct request_sock **prev,
1586 + struct mptcp_options_received *mopt)
1588 + return NULL;
1590 +static inline unsigned int mptcp_current_mss(struct sock *meta_sk)
1592 + return 0;
1594 +static inline int mptcp_select_size(const struct sock *meta_sk, bool sg)
1596 + return 0;
1598 +static inline void mptcp_sub_close_passive(struct sock *sk) {}
1599 +static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag)
1601 + return false;
1603 +static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {}
1604 +static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time)
1606 + return 0;
1608 +static inline int mptcp_check_snd_buf(const struct tcp_sock *tp)
1610 + return 0;
1612 +static inline int mptcp_sysctl_syn_retries(void)
1614 + return 0;
1616 +static inline void mptcp_send_reset(const struct sock *sk) {}
1617 +static inline void mptcp_send_active_reset(struct sock *meta_sk,
1618 + gfp_t priority) {}
1619 +static inline int mptcp_write_xmit(struct sock *sk, unsigned int mss_now,
1620 + int nonagle, int push_one, gfp_t gfp)
1622 + return 0;
1624 +static inline struct sock *mptcp_sk_clone(const struct sock *sk, int family,
1625 + const gfp_t priority)
1627 + return NULL;
1629 +static inline int mptcp_handle_options(struct sock *sk,
1630 + const struct tcphdr *th,
1631 + struct sk_buff *skb)
1633 + return 0;
1635 +static inline void mptcp_reset_mopt(struct tcp_sock *tp) {}
1636 +static inline void __init mptcp_init(void) {}
1637 +static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1639 + return 0;
1641 +static inline int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1642 + unsigned int mss_now, int reinject)
1644 + return 0;
1646 +static inline int mptso_fragment(struct sock *sk, struct sk_buff *skb,
1647 + unsigned int len, unsigned int mss_now,
1648 + gfp_t gfp, int reinject)
1650 + return 0;
1652 +static inline bool mptcp_sk_can_gso(const struct sock *sk)
1654 + return false;
1656 +static inline bool mptcp_can_sg(const struct sock *meta_sk)
1658 + return false;
1660 +static inline unsigned int mptcp_xmit_size_goal(struct sock *meta_sk,
1661 + u32 mss_now, int large_allowed)
1663 + return 0;
1665 +static inline void mptcp_destroy_sock(struct sock *sk) {}
1666 +static inline int mptcp_rcv_synsent_state_process(struct sock *sk,
1667 + struct sock **skptr,
1668 + struct sk_buff *skb,
1669 + struct mptcp_options_received *mopt)
1671 + return 0;
1673 +static inline bool mptcp_can_sendpage(struct sock *sk)
1675 + return false;
1677 +static inline int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw)
1679 + return 0;
1681 +static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {}
1682 +static inline void mptcp_update_tw_socks(const struct tcp_sock *tp, int state) {}
1683 +static inline void mptcp_disconnect(struct sock *sk) {}
1684 +static inline void mptcp_tsq_flags(struct sock *sk) {}
1685 +static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {}
1686 +static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {}
1687 +static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {}
1688 +static inline void mptcp_reqsk_new_mptcp(struct request_sock *req,
1689 + const struct tcp_options_received *rx_opt,
1690 + const struct mptcp_options_received *mopt,
1691 + const struct sk_buff *skb) {}
1692 +static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
1693 + const struct sk_buff *skb) {}
1694 +#endif /* CONFIG_MPTCP */
1696 +#endif /* _MPTCP_H */
1697 diff -Nur linux-3.14.45.orig/include/net/mptcp_v4.h linux-3.14.45/include/net/mptcp_v4.h
1698 --- linux-3.14.45.orig/include/net/mptcp_v4.h 1970-01-01 01:00:00.000000000 +0100
1699 +++ linux-3.14.45/include/net/mptcp_v4.h 2015-06-24 14:15:48.871862463 +0200
1700 @@ -0,0 +1,69 @@
1702 + * MPTCP implementation
1704 + * Initial Design & Implementation:
1705 + * Sébastien Barré <sebastien.barre@uclouvain.be>
1707 + * Current Maintainer & Author:
1708 + * Christoph Paasch <christoph.paasch@uclouvain.be>
1710 + * Additional authors:
1711 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
1712 + * Gregory Detal <gregory.detal@uclouvain.be>
1713 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
1714 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
1715 + * Lavkesh Lahngir <lavkesh51@gmail.com>
1716 + * Andreas Ripke <ripke@neclab.eu>
1717 + * Vlad Dogaru <vlad.dogaru@intel.com>
1718 + * Octavian Purdila <octavian.purdila@intel.com>
1719 + * John Ronan <jronan@tssg.org>
1720 + * Catalin Nicutar <catalin.nicutar@gmail.com>
1721 + * Brandon Heller <brandonh@stanford.edu>
1724 + * This program is free software; you can redistribute it and/or
1725 + * modify it under the terms of the GNU General Public License
1726 + * as published by the Free Software Foundation; either version
1727 + * 2 of the License, or (at your option) any later version.
1728 + */
1730 +#ifndef MPTCP_V4_H_
1731 +#define MPTCP_V4_H_
1734 +#include <linux/in.h>
1735 +#include <linux/skbuff.h>
1736 +#include <net/mptcp.h>
1737 +#include <net/request_sock.h>
1738 +#include <net/sock.h>
1740 +extern struct request_sock_ops mptcp_request_sock_ops;
1742 +#ifdef CONFIG_MPTCP
1744 +int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
1745 +int mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id);
1746 +int mptcp_v4_add_raddress(struct mptcp_cb *mpcb, const struct in_addr *addr,
1747 + __be16 port, u8 id);
1748 +void mptcp_v4_set_init_addr_bit(struct mptcp_cb *mpcb, __be32 daddr, int index);
1749 +struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
1750 + const __be32 laddr, const struct net *net);
1751 +int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
1752 + struct mptcp_rem4 *rem);
1753 +int mptcp_pm_v4_init(void);
1754 +void mptcp_pm_v4_undo(void);
1755 +u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
1756 + u32 seq);
1757 +u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
1759 +#else
1761 +static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
1762 + const struct sk_buff *skb)
1764 + return 0;
1767 +#endif /* CONFIG_MPTCP */
1769 +#endif /* MPTCP_V4_H_ */
1770 diff -Nur linux-3.14.45.orig/include/net/mptcp_v6.h linux-3.14.45/include/net/mptcp_v6.h
1771 --- linux-3.14.45.orig/include/net/mptcp_v6.h 1970-01-01 01:00:00.000000000 +0100
1772 +++ linux-3.14.45/include/net/mptcp_v6.h 2015-06-24 14:15:48.871862463 +0200
1773 @@ -0,0 +1,72 @@
1775 + * MPTCP implementation
1777 + * Initial Design & Implementation:
1778 + * Sébastien Barré <sebastien.barre@uclouvain.be>
1780 + * Current Maintainer & Author:
1781 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
1783 + * Additional authors:
1784 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
1785 + * Gregory Detal <gregory.detal@uclouvain.be>
1786 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
1787 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
1788 + * Lavkesh Lahngir <lavkesh51@gmail.com>
1789 + * Andreas Ripke <ripke@neclab.eu>
1790 + * Vlad Dogaru <vlad.dogaru@intel.com>
1791 + * Octavian Purdila <octavian.purdila@intel.com>
1792 + * John Ronan <jronan@tssg.org>
1793 + * Catalin Nicutar <catalin.nicutar@gmail.com>
1794 + * Brandon Heller <brandonh@stanford.edu>
1797 + * This program is free software; you can redistribute it and/or
1798 + * modify it under the terms of the GNU General Public License
1799 + * as published by the Free Software Foundation; either version
1800 + * 2 of the License, or (at your option) any later version.
1801 + */
1803 +#ifndef _MPTCP_V6_H
1804 +#define _MPTCP_V6_H
1806 +#include <linux/in6.h>
1807 +#include <net/if_inet6.h>
1809 +#include <net/mptcp.h>
1811 +extern struct request_sock_ops mptcp6_request_sock_ops;
1812 +extern struct proto mptcpv6_prot;
1814 +#ifdef CONFIG_MPTCP
1816 +int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
1817 +int mptcp_v6_rem_raddress(struct mptcp_cb *mpcb, u8 id);
1818 +int mptcp_v6_add_raddress(struct mptcp_cb *mpcb, const struct in6_addr *addr,
1819 + __be16 port, u8 id);
1820 +void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
1821 + const struct in6_addr *daddr, int index);
1822 +struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
1823 + const struct in6_addr *laddr, const struct net *net);
1824 +int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
1825 + struct mptcp_rem6 *rem);
1826 +int mptcp_pm_v6_init(void);
1827 +void mptcp_pm_v6_undo(void);
1828 +struct sock *mptcp_v6v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1829 + struct request_sock *req,
1830 + struct dst_entry *dst);
1831 +__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
1832 + __be16 sport, __be16 dport, u32 seq);
1833 +u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
1834 + __be16 sport, __be16 dport);
1836 +#else /* CONFIG_MPTCP */
1838 +static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
1839 +{
1840 +	return 0;
1841 +}
1843 +#endif /* CONFIG_MPTCP */
1845 +#endif /* _MPTCP_V6_H */
1846 diff -Nur linux-3.14.45.orig/include/net/net_namespace.h linux-3.14.45/include/net/net_namespace.h
1847 --- linux-3.14.45.orig/include/net/net_namespace.h 2015-06-23 02:01:36.000000000 +0200
1848 +++ linux-3.14.45/include/net/net_namespace.h 2015-06-24 14:15:48.871862463 +0200
1849 @@ -15,6 +15,7 @@
1850 #include <net/netns/packet.h>
1851 #include <net/netns/ipv4.h>
1852 #include <net/netns/ipv6.h>
1853 +#include <net/netns/mptcp.h>
1854 #include <net/netns/sctp.h>
1855 #include <net/netns/dccp.h>
1856 #include <net/netns/netfilter.h>
1857 @@ -90,6 +91,9 @@
1858 #if IS_ENABLED(CONFIG_IPV6)
1859 struct netns_ipv6 ipv6;
1860 #endif
1861 +#if IS_ENABLED(CONFIG_MPTCP)
1862 + struct netns_mptcp mptcp;
1863 +#endif
1864 #if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
1865 struct netns_sctp sctp;
1866 #endif
1867 diff -Nur linux-3.14.45.orig/include/net/netns/mptcp.h linux-3.14.45/include/net/netns/mptcp.h
1868 --- linux-3.14.45.orig/include/net/netns/mptcp.h 1970-01-01 01:00:00.000000000 +0100
1869 +++ linux-3.14.45/include/net/netns/mptcp.h 2015-06-24 14:15:48.871862463 +0200
1870 @@ -0,0 +1,44 @@
1871 +/*
1872 + * MPTCP implementation - MPTCP namespace
1874 + * Initial Design & Implementation:
1875 + * Sébastien Barré <sebastien.barre@uclouvain.be>
1877 + * Current Maintainer:
1878 + * Christoph Paasch <christoph.paasch@uclouvain.be>
1880 + * Additional authors:
1881 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
1882 + * Gregory Detal <gregory.detal@uclouvain.be>
1883 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
1884 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
1885 + * Lavkesh Lahngir <lavkesh51@gmail.com>
1886 + * Andreas Ripke <ripke@neclab.eu>
1887 + * Vlad Dogaru <vlad.dogaru@intel.com>
1888 + * Octavian Purdila <octavian.purdila@intel.com>
1889 + * John Ronan <jronan@tssg.org>
1890 + * Catalin Nicutar <catalin.nicutar@gmail.com>
1891 + * Brandon Heller <brandonh@stanford.edu>
1894 + * This program is free software; you can redistribute it and/or
1895 + * modify it under the terms of the GNU General Public License
1896 + * as published by the Free Software Foundation; either version
1897 + * 2 of the License, or (at your option) any later version.
1898 + */
1900 +#ifndef __NETNS_MPTCP_H__
1901 +#define __NETNS_MPTCP_H__
1903 +#include <linux/compiler.h>
1905 +enum {
1906 + MPTCP_PM_FULLMESH = 0,
1907 +	MPTCP_PM_MAX
1908 +};
1910 +struct netns_mptcp {
1911 +	void *path_managers[MPTCP_PM_MAX];
1912 +};
1914 +#endif /* __NETNS_MPTCP_H__ */
1915 diff -Nur linux-3.14.45.orig/include/net/request_sock.h linux-3.14.45/include/net/request_sock.h
1916 --- linux-3.14.45.orig/include/net/request_sock.h 2015-06-23 02:01:36.000000000 +0200
1917 +++ linux-3.14.45/include/net/request_sock.h 2015-06-24 14:15:48.871862463 +0200
1918 @@ -164,7 +164,7 @@
1921 int reqsk_queue_alloc(struct request_sock_queue *queue,
1922 - unsigned int nr_table_entries);
1923 + unsigned int nr_table_entries, gfp_t flags);
1925 void __reqsk_queue_destroy(struct request_sock_queue *queue);
1926 void reqsk_queue_destroy(struct request_sock_queue *queue);
1927 diff -Nur linux-3.14.45.orig/include/net/sock.h linux-3.14.45/include/net/sock.h
1928 --- linux-3.14.45.orig/include/net/sock.h 2015-06-23 02:01:36.000000000 +0200
1929 +++ linux-3.14.45/include/net/sock.h 2015-06-24 14:15:48.871862463 +0200
1930 @@ -899,6 +899,16 @@
1932 int sk_wait_data(struct sock *sk, long *timeo);
1934 +/* START - needed for MPTCP */
1935 +extern void sock_def_error_report(struct sock *sk);
1936 +extern struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1937 + int family);
1938 +extern void sock_lock_init(struct sock *sk);
1940 +extern struct lock_class_key af_callback_keys[AF_MAX];
1941 +extern char *const af_family_clock_key_strings[AF_MAX+1];
1942 +/* END - needed for MPTCP */
1944 struct request_sock_ops;
1945 struct timewait_sock_ops;
1946 struct inet_hashinfo;
1947 diff -Nur linux-3.14.45.orig/include/net/tcp.h linux-3.14.45/include/net/tcp.h
1948 --- linux-3.14.45.orig/include/net/tcp.h 2015-06-23 02:01:36.000000000 +0200
1949 +++ linux-3.14.45/include/net/tcp.h 2015-06-24 14:15:48.875862469 +0200
1950 @@ -176,6 +176,7 @@
1951 #define TCPOPT_SACK 5 /* SACK Block */
1952 #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
1953 #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
1954 +#define TCPOPT_MPTCP 30
1955 #define TCPOPT_EXP 254 /* Experimental */
1956 /* Magic number to be after the option value for sharing TCP
1957 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
1958 @@ -234,6 +235,27 @@
1960 #define TFO_SERVER_ALWAYS 0x1000
1962 +/* Flags from tcp_input.c for tcp_ack */
1963 +#define FLAG_DATA 0x01 /* Incoming frame contained data. */
1964 +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
1965 +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
1966 +#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
1967 +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
1968 +#define FLAG_DATA_SACKED 0x20 /* New SACK. */
1969 +#define FLAG_ECE 0x40 /* ECE in this ACK */
1970 +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
1971 +#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
1972 +#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
1973 +#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
1974 +#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
1975 +#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
1976 +#define MPTCP_FLAG_DATA_ACKED 0x8000
1978 +#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
1979 +#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
1980 +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
1981 +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
1983 extern struct inet_timewait_death_row tcp_death_row;
1985 /* sysctl variables for tcp */
1986 @@ -349,6 +371,112 @@
1987 #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
1988 #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
1990 +/**** START - Exports needed for MPTCP ****/
1991 +extern const struct inet_connection_sock_af_ops ipv4_specific;
1992 +extern const struct inet_connection_sock_af_ops ipv6_specific;
1993 +extern const struct inet_connection_sock_af_ops ipv6_mapped;
1994 +extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
1995 +extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
1997 +struct mptcp_options_received;
1999 +int tcp_close_state(struct sock *sk);
2000 +void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int
2001 + size_goal);
2002 +void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
2003 + const struct sk_buff *skb);
2004 +int tcp_xmit_probe_skb(struct sock *sk, int urgent);
2005 +void tcp_cwnd_validate(struct sock *sk);
2006 +void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb);
2007 +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
2008 + gfp_t gfp_mask);
2009 +unsigned int tcp_mss_split_point(const struct sock *sk,
2010 + const struct sk_buff *skb,
2011 + unsigned int mss_now,
2012 + unsigned int max_segs,
2013 + int nonagle);
2014 +bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb);
2015 +bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2016 + unsigned int cur_mss, int nonagle);
2017 +bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2018 + unsigned int cur_mss);
2019 +unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
2020 +int tcp_mtu_probe(struct sock *sk);
2021 +int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
2022 + unsigned int mss_now);
2023 +void __pskb_trim_head(struct sk_buff *skb, int len);
2024 +void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
2025 +void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
2026 +void tcp_reset(struct sock *sk);
2027 +bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
2028 + const u32 ack_seq, const u32 nwin);
2029 +bool tcp_urg_mode(const struct tcp_sock *tp);
2030 +void tcp_ack_probe(struct sock *sk);
2031 +void tcp_rearm_rto(struct sock *sk);
2032 +int tcp_write_timeout(struct sock *sk);
2033 +bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
2034 + unsigned int timeout, bool syn_set);
2035 +void tcp_write_err(struct sock *sk);
2036 +void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
2037 +void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
2038 + unsigned int mss_now);
2040 +int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req);
2041 +void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
2042 + struct request_sock *req);
2043 +__u32 tcp_v4_init_sequence(const struct sk_buff *skb);
2044 +int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
2045 + struct request_sock *req,
2046 + u16 queue_mapping);
2047 +void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb);
2048 +struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb);
2049 +struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb);
2050 +void tcp_v4_reqsk_destructor(struct request_sock *req);
2052 +int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req);
2053 +void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
2054 + struct request_sock *req);
2055 +__u32 tcp_v6_init_sequence(const struct sk_buff *skb);
2056 +int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
2057 + struct flowi6 *fl6, struct request_sock *req,
2058 + u16 queue_mapping);
2059 +void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
2060 +int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
2061 +int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
2062 +void tcp_v6_destroy_sock(struct sock *sk);
2063 +void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
2064 +void tcp_v6_hash(struct sock *sk);
2065 +struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb);
2066 +struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
2067 + struct request_sock *req,
2068 + struct dst_entry *dst);
2069 +void tcp_v6_reqsk_destructor(struct request_sock *req);
2071 +void sock_valbool_flag(struct sock *sk, int bit, int valbool);
2072 +unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
2073 + int large_allowed);
2074 +u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
2076 +void skb_clone_fraglist(struct sk_buff *skb);
2077 +void copy_skb_header(struct sk_buff *new, const struct sk_buff *old);
2079 +void inet_twsk_free(struct inet_timewait_sock *tw);
2080 +/* These states need RST on ABORT according to RFC793 */
2081 +static inline bool tcp_need_reset(int state)
2082 +{
2083 +	return (1 << state) &
2084 +		(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2085 +		 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2086 +}
2088 +bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
2089 + int hlen);
2090 +int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
2091 + bool *fragstolen);
2092 +bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to,
2093 + struct sk_buff *from, bool *fragstolen);
2094 +/**** END - Exports needed for MPTCP ****/
2096 void tcp_tasklet_init(void);
2098 void tcp_v4_err(struct sk_buff *skb, u32);
2099 @@ -445,6 +573,7 @@
2100 size_t len, int nonblock, int flags, int *addr_len);
2101 void tcp_parse_options(const struct sk_buff *skb,
2102 struct tcp_options_received *opt_rx,
2103 + struct mptcp_options_received *mopt_rx,
2104 int estab, struct tcp_fastopen_cookie *foc);
2105 const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
2107 @@ -558,11 +687,15 @@
2108 void tcp_send_loss_probe(struct sock *sk);
2109 bool tcp_schedule_loss_probe(struct sock *sk);
2111 +u16 tcp_select_window(struct sock *sk);
2113 /* tcp_input.c */
2114 void tcp_cwnd_application_limited(struct sock *sk);
2115 void tcp_resume_early_retransmit(struct sock *sk);
2116 void tcp_rearm_rto(struct sock *sk);
2117 void tcp_reset(struct sock *sk);
2118 +void tcp_set_rto(struct sock *sk);
2119 +bool tcp_should_expand_sndbuf(const struct sock *sk);
2121 /* tcp_timer.c */
2122 void tcp_init_xmit_timers(struct sock *);
2123 @@ -706,14 +839,24 @@
2125 struct tcp_skb_cb {
2126 union {
2127 - struct inet_skb_parm h4;
2128 + union {
2129 + struct inet_skb_parm h4;
2130 #if IS_ENABLED(CONFIG_IPV6)
2131 - struct inet6_skb_parm h6;
2132 + struct inet6_skb_parm h6;
2133 +#endif
2134 + } header; /* For incoming frames */
2135 +#ifdef CONFIG_MPTCP
2136 + __u32 path_mask; /* path indices that tried to send this skb */
2137 #endif
2138 - } header; /* For incoming frames */
2139 + };
2140 __u32 seq; /* Starting sequence number */
2141 __u32 end_seq; /* SEQ + FIN + SYN + datalen */
2142 __u32 when; /* used to compute rtt's */
2143 +#ifdef CONFIG_MPTCP
2144 + __u8 mptcp_flags; /* flags for the MPTCP layer */
2145 + __u8 dss_off; /* Number of 4-byte words until
2146 + * seq-number */
2147 +#endif
2148 __u8 tcp_flags; /* TCP header flags. (tcp[13]) */
2150 __u8 sacked; /* State flags for SACK/FACK. */
2151 @@ -1061,7 +1204,8 @@
2152 /* Determine a window scaling and initial window to offer. */
2153 void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
2154 __u32 *window_clamp, int wscale_ok,
2155 - __u8 *rcv_wscale, __u32 init_rcv_wnd);
2156 + __u8 *rcv_wscale, __u32 init_rcv_wnd,
2157 + const struct sock *sk);
2159 static inline int tcp_win_from_space(int space)
2161 @@ -1073,12 +1217,18 @@
2162 /* Note: caller must be prepared to deal with negative returns */
2163 static inline int tcp_space(const struct sock *sk)
2164 {
2165 +	if (tcp_sk(sk)->mpc)
2166 +		sk = tcp_sk(sk)->meta_sk;
2167 +
2168 	return tcp_win_from_space(sk->sk_rcvbuf -
2169 				  atomic_read(&sk->sk_rmem_alloc));
2170 }
2171 
2172 static inline int tcp_full_space(const struct sock *sk)
2173 {
2174 +	if (tcp_sk(sk)->mpc)
2175 +		sk = tcp_sk(sk)->meta_sk;
2176 +
2177 	return tcp_win_from_space(sk->sk_rcvbuf);
2178 }
2180 @@ -1093,6 +1243,7 @@
2181 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
2182 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
2183 tcp_rsk(req)->snt_synack = 0;
2184 + tcp_rsk(req)->saw_mpc = 0;
2185 req->mss = rx_opt->mss_clamp;
2186 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
2187 ireq->tstamp_ok = rx_opt->tstamp_ok;
2188 diff -Nur linux-3.14.45.orig/include/uapi/linux/if.h linux-3.14.45/include/uapi/linux/if.h
2189 --- linux-3.14.45.orig/include/uapi/linux/if.h 2015-06-23 02:01:36.000000000 +0200
2190 +++ linux-3.14.45/include/uapi/linux/if.h 2015-06-24 14:15:48.875862469 +0200
2191 @@ -53,6 +53,9 @@
2193 #define IFF_ECHO 0x40000 /* echo sent packets */
2195 +#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */
2196 +#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */
2198 #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
2199 IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
2201 diff -Nur linux-3.14.45.orig/include/uapi/linux/tcp.h linux-3.14.45/include/uapi/linux/tcp.h
2202 --- linux-3.14.45.orig/include/uapi/linux/tcp.h 2015-06-23 02:01:36.000000000 +0200
2203 +++ linux-3.14.45/include/uapi/linux/tcp.h 2015-06-24 14:15:48.875862469 +0200
2204 @@ -112,6 +112,7 @@
2205 #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
2206 #define TCP_TIMESTAMP 24
2207 #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
2208 +#define MPTCP_ENABLED 26
2210 struct tcp_repair_opt {
2211 __u32 opt_code;
2212 diff -Nur linux-3.14.45.orig/net/Kconfig linux-3.14.45/net/Kconfig
2213 --- linux-3.14.45.orig/net/Kconfig 2015-06-23 02:01:36.000000000 +0200
2214 +++ linux-3.14.45/net/Kconfig 2015-06-24 14:15:48.875862469 +0200
2215 @@ -79,6 +79,7 @@
2216 source "net/ipv4/Kconfig"
2217 source "net/ipv6/Kconfig"
2218 source "net/netlabel/Kconfig"
2219 +source "net/mptcp/Kconfig"
2221 endif # if INET
2223 diff -Nur linux-3.14.45.orig/net/Makefile linux-3.14.45/net/Makefile
2224 --- linux-3.14.45.orig/net/Makefile 2015-06-23 02:01:36.000000000 +0200
2225 +++ linux-3.14.45/net/Makefile 2015-06-24 14:15:48.875862469 +0200
2226 @@ -20,6 +20,7 @@
2227 obj-$(CONFIG_XFRM) += xfrm/
2228 obj-$(CONFIG_UNIX) += unix/
2229 obj-$(CONFIG_NET) += ipv6/
2230 +obj-$(CONFIG_MPTCP) += mptcp/
2231 obj-$(CONFIG_PACKET) += packet/
2232 obj-$(CONFIG_NET_KEY) += key/
2233 obj-$(CONFIG_BRIDGE) += bridge/
2234 diff -Nur linux-3.14.45.orig/net/core/dev.c linux-3.14.45/net/core/dev.c
2235 --- linux-3.14.45.orig/net/core/dev.c 2015-06-23 02:01:36.000000000 +0200
2236 +++ linux-3.14.45/net/core/dev.c 2015-06-24 14:15:48.875862469 +0200
2237 @@ -5399,7 +5399,7 @@
2239 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2240 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2241 - IFF_AUTOMEDIA)) |
2242 + IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) |
2243 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2244 IFF_ALLMULTI));
2246 diff -Nur linux-3.14.45.orig/net/core/request_sock.c linux-3.14.45/net/core/request_sock.c
2247 --- linux-3.14.45.orig/net/core/request_sock.c 2015-06-23 02:01:36.000000000 +0200
2248 +++ linux-3.14.45/net/core/request_sock.c 2015-06-24 14:15:48.875862469 +0200
2249 @@ -38,7 +38,8 @@
2250 EXPORT_SYMBOL(sysctl_max_syn_backlog);
2252 int reqsk_queue_alloc(struct request_sock_queue *queue,
2253 - unsigned int nr_table_entries)
2254 + unsigned int nr_table_entries,
2255 + gfp_t flags)
2257 size_t lopt_size = sizeof(struct listen_sock);
2258 struct listen_sock *lopt;
2259 @@ -48,9 +49,11 @@
2260 nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
2261 lopt_size += nr_table_entries * sizeof(struct request_sock *);
2262 if (lopt_size > PAGE_SIZE)
2263 - lopt = vzalloc(lopt_size);
2264 + lopt = __vmalloc(lopt_size,
2265 + flags | __GFP_HIGHMEM | __GFP_ZERO,
2266 + PAGE_KERNEL);
2267 else
2268 - lopt = kzalloc(lopt_size, GFP_KERNEL);
2269 + lopt = kzalloc(lopt_size, flags);
2270 if (lopt == NULL)
2271 return -ENOMEM;
2273 diff -Nur linux-3.14.45.orig/net/core/skbuff.c linux-3.14.45/net/core/skbuff.c
2274 --- linux-3.14.45.orig/net/core/skbuff.c 2015-06-23 02:01:36.000000000 +0200
2275 +++ linux-3.14.45/net/core/skbuff.c 2015-06-24 14:15:48.875862469 +0200
2276 @@ -491,7 +491,7 @@
2277 skb_drop_list(&skb_shinfo(skb)->frag_list);
2280 -static void skb_clone_fraglist(struct sk_buff *skb)
2281 +void skb_clone_fraglist(struct sk_buff *skb)
2283 struct sk_buff *list;
2285 @@ -913,7 +913,7 @@
2286 skb->inner_mac_header += off;
2289 -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
2290 +void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
2292 __copy_skb_header(new, old);
2294 diff -Nur linux-3.14.45.orig/net/core/sock.c linux-3.14.45/net/core/sock.c
2295 --- linux-3.14.45.orig/net/core/sock.c 2015-06-23 02:01:36.000000000 +0200
2296 +++ linux-3.14.45/net/core/sock.c 2015-06-24 14:15:48.875862469 +0200
2297 @@ -280,7 +280,7 @@
2298 "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
2299 "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
2301 -static const char *const af_family_clock_key_strings[AF_MAX+1] = {
2302 +char *const af_family_clock_key_strings[AF_MAX+1] = {
2303 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
2304 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
2305 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
2306 @@ -301,7 +301,7 @@
2307 * sk_callback_lock locking rules are per-address-family,
2308 * so split the lock classes by using a per-AF key:
2310 -static struct lock_class_key af_callback_keys[AF_MAX];
2311 +struct lock_class_key af_callback_keys[AF_MAX];
2313 /* Take into consideration the size of the struct sk_buff overhead in the
2314 * determination of these values, since that is non-constant across
2315 @@ -651,7 +651,7 @@
2316 return ret;
2319 -static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
2320 +void sock_valbool_flag(struct sock *sk, int bit, int valbool)
2322 if (valbool)
2323 sock_set_flag(sk, bit);
2324 @@ -1272,7 +1272,7 @@
2326 * (We also register the sk_lock with the lock validator.)
2328 -static inline void sock_lock_init(struct sock *sk)
2329 +void sock_lock_init(struct sock *sk)
2331 sock_lock_init_class_and_name(sk,
2332 af_family_slock_key_strings[sk->sk_family],
2333 @@ -1320,7 +1320,7 @@
2335 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
2337 -static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2338 +struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2339 int family)
2341 struct sock *sk;
2342 @@ -2252,7 +2252,7 @@
2343 rcu_read_unlock();
2346 -static void sock_def_error_report(struct sock *sk)
2347 +void sock_def_error_report(struct sock *sk)
2349 struct socket_wq *wq;
2351 diff -Nur linux-3.14.45.orig/net/ipv4/Kconfig linux-3.14.45/net/ipv4/Kconfig
2352 --- linux-3.14.45.orig/net/ipv4/Kconfig 2015-06-23 02:01:36.000000000 +0200
2353 +++ linux-3.14.45/net/ipv4/Kconfig 2015-06-24 14:15:48.875862469 +0200
2354 @@ -556,6 +556,30 @@
2355 For further details see:
2356 http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
2358 +config TCP_CONG_COUPLED
2359 + tristate "MPTCP COUPLED CONGESTION CONTROL"
2360 + depends on MPTCP
2361 + default n
2362 + ---help---
2363 + MultiPath TCP Coupled Congestion Control
2364 + To enable it, just put 'coupled' in tcp_congestion_control
2366 +config TCP_CONG_OLIA
2367 + tristate "MPTCP Opportunistic Linked Increase"
2368 + depends on MPTCP
2369 + default n
2370 + ---help---
2371 + MultiPath TCP Opportunistic Linked Increase Congestion Control
2372 + To enable it, just put 'olia' in tcp_congestion_control
2374 +config TCP_CONG_WVEGAS
2375 + tristate "MPTCP WVEGAS CONGESTION CONTROL"
2376 + depends on MPTCP
2377 + default n
2378 + ---help---
2379 + wVegas congestion control for MPTCP
2380 + To enable it, just put 'wvegas' in tcp_congestion_control
2382 choice
2383 prompt "Default TCP congestion control"
2384 default DEFAULT_CUBIC
2385 @@ -584,6 +608,15 @@
2386 config DEFAULT_WESTWOOD
2387 bool "Westwood" if TCP_CONG_WESTWOOD=y
2389 + config DEFAULT_COUPLED
2390 + bool "Coupled" if TCP_CONG_COUPLED=y
2392 + config DEFAULT_OLIA
2393 + bool "Olia" if TCP_CONG_OLIA=y
2395 + config DEFAULT_WVEGAS
2396 + bool "Wvegas" if TCP_CONG_WVEGAS=y
2398 config DEFAULT_RENO
2399 bool "Reno"
2401 @@ -605,6 +638,8 @@
2402 default "vegas" if DEFAULT_VEGAS
2403 default "westwood" if DEFAULT_WESTWOOD
2404 default "veno" if DEFAULT_VENO
2405 + default "coupled" if DEFAULT_COUPLED
2406 + default "wvegas" if DEFAULT_WVEGAS
2407 default "reno" if DEFAULT_RENO
2408 default "cubic"
2410 diff -Nur linux-3.14.45.orig/net/ipv4/af_inet.c linux-3.14.45/net/ipv4/af_inet.c
2411 --- linux-3.14.45.orig/net/ipv4/af_inet.c 2015-06-23 02:01:36.000000000 +0200
2412 +++ linux-3.14.45/net/ipv4/af_inet.c 2015-06-24 14:15:48.875862469 +0200
2413 @@ -104,6 +104,7 @@
2414 #include <net/ip_fib.h>
2415 #include <net/inet_connection_sock.h>
2416 #include <net/tcp.h>
2417 +#include <net/mptcp.h>
2418 #include <net/udp.h>
2419 #include <net/udplite.h>
2420 #include <net/ping.h>
2421 @@ -246,8 +247,7 @@
2422 * Create an inet socket.
2425 -static int inet_create(struct net *net, struct socket *sock, int protocol,
2426 - int kern)
2427 +int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
2429 struct sock *sk;
2430 struct inet_protosw *answer;
2431 @@ -679,6 +679,23 @@
2432 lock_sock(sk2);
2434 sock_rps_record_flow(sk2);
2436 + if (sk2->sk_protocol == IPPROTO_TCP && tcp_sk(sk2)->mpc) {
2437 + struct sock *sk_it = sk2;
2439 + mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it)
2440 + sock_rps_record_flow(sk_it);
2442 + if (tcp_sk(sk2)->mpcb->master_sk) {
2443 + sk_it = tcp_sk(sk2)->mpcb->master_sk;
2445 + write_lock_bh(&sk_it->sk_callback_lock);
2446 + sk_it->sk_wq = newsock->wq;
2447 + sk_it->sk_socket = newsock;
2448 +			write_unlock_bh(&sk_it->sk_callback_lock);
2449 +		}
2450 +	}
2451 +
2452 WARN_ON(!((1 << sk2->sk_state) &
2453 (TCPF_ESTABLISHED | TCPF_SYN_RECV |
2454 TCPF_CLOSE_WAIT | TCPF_CLOSE)));
2455 @@ -1770,6 +1787,9 @@
2457 ip_init();
2459 + /* We must initialize MPTCP before TCP. */
2460 + mptcp_init();
2462 tcp_v4_init();
2464 /* Setup TCP slab cache for open requests. */
2465 diff -Nur linux-3.14.45.orig/net/ipv4/inet_connection_sock.c linux-3.14.45/net/ipv4/inet_connection_sock.c
2466 --- linux-3.14.45.orig/net/ipv4/inet_connection_sock.c 2015-06-23 02:01:36.000000000 +0200
2467 +++ linux-3.14.45/net/ipv4/inet_connection_sock.c 2015-06-24 14:15:48.875862469 +0200
2468 @@ -23,6 +23,7 @@
2469 #include <net/route.h>
2470 #include <net/tcp_states.h>
2471 #include <net/xfrm.h>
2472 +#include <net/mptcp.h>
2474 #ifdef INET_CSK_DEBUG
2475 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
2476 @@ -468,8 +469,8 @@
2478 EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
2480 -static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
2481 - const u32 rnd, const u32 synq_hsize)
2482 +u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
2483 + const u32 synq_hsize)
2485 return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
2487 @@ -667,7 +668,12 @@
2488 const struct request_sock *req,
2489 const gfp_t priority)
2491 - struct sock *newsk = sk_clone_lock(sk, priority);
2492 + struct sock *newsk;
2494 + if (sk->sk_protocol == IPPROTO_TCP && tcp_sk(sk)->mpc)
2495 + newsk = mptcp_sk_clone(sk, req->rsk_ops->family, priority);
2496 + else
2497 + newsk = sk_clone_lock(sk, priority);
2499 if (newsk != NULL) {
2500 struct inet_connection_sock *newicsk = inet_csk(newsk);
2501 @@ -744,7 +750,8 @@
2503 struct inet_sock *inet = inet_sk(sk);
2504 struct inet_connection_sock *icsk = inet_csk(sk);
2505 - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
2506 + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries,
2507 + GFP_KERNEL);
2509 if (rc != 0)
2510 return rc;
2511 @@ -802,9 +809,14 @@
2513 while ((req = acc_req) != NULL) {
2514 struct sock *child = req->sk;
2515 + bool mutex_taken = false;
2517 acc_req = req->dl_next;
2519 + if (is_meta_sk(child)) {
2520 + mutex_lock(&tcp_sk(child)->mpcb->mpcb_mutex);
2521 +			mutex_taken = true;
2522 +		}
2523 local_bh_disable();
2524 bh_lock_sock(child);
2525 WARN_ON(sock_owned_by_user(child));
2526 @@ -833,6 +845,8 @@
2528 bh_unlock_sock(child);
2529 local_bh_enable();
2530 + if (mutex_taken)
2531 + mutex_unlock(&tcp_sk(child)->mpcb->mpcb_mutex);
2532 sock_put(child);
2534 sk_acceptq_removed(sk);
2535 diff -Nur linux-3.14.45.orig/net/ipv4/syncookies.c linux-3.14.45/net/ipv4/syncookies.c
2536 --- linux-3.14.45.orig/net/ipv4/syncookies.c 2015-06-23 02:01:36.000000000 +0200
2537 +++ linux-3.14.45/net/ipv4/syncookies.c 2015-06-24 14:15:48.875862469 +0200
2538 @@ -284,7 +284,7 @@
2540 /* check for timestamp cookie support */
2541 memset(&tcp_opt, 0, sizeof(tcp_opt));
2542 - tcp_parse_options(skb, &tcp_opt, 0, NULL);
2543 + tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
2545 if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
2546 goto out;
2547 @@ -354,10 +354,10 @@
2548 /* Try to redo what tcp_v4_send_synack did. */
2549 req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
2551 - tcp_select_initial_window(tcp_full_space(sk), req->mss,
2552 + tp->select_initial_window(tcp_full_space(sk), req->mss,
2553 &req->rcv_wnd, &req->window_clamp,
2554 ireq->wscale_ok, &rcv_wscale,
2555 - dst_metric(&rt->dst, RTAX_INITRWND));
2556 + dst_metric(&rt->dst, RTAX_INITRWND), sk);
2558 ireq->rcv_wscale = rcv_wscale;
2560 diff -Nur linux-3.14.45.orig/net/ipv4/tcp.c linux-3.14.45/net/ipv4/tcp.c
2561 --- linux-3.14.45.orig/net/ipv4/tcp.c 2015-06-23 02:01:36.000000000 +0200
2562 +++ linux-3.14.45/net/ipv4/tcp.c 2015-06-24 14:15:48.879862472 +0200
2563 @@ -271,6 +271,7 @@
2565 #include <net/icmp.h>
2566 #include <net/inet_common.h>
2567 +#include <net/mptcp.h>
2568 #include <net/tcp.h>
2569 #include <net/xfrm.h>
2570 #include <net/ip.h>
2571 @@ -419,6 +420,9 @@
2572 sk->sk_sndbuf = sysctl_tcp_wmem[1];
2573 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2575 + /* Set function pointers in tcp_sock to tcp functions. */
2576 + mptcp_init_tcp_sock(tp);
2578 local_bh_disable();
2579 sock_update_memcg(sk);
2580 sk_sockets_allocated_inc(sk);
2581 @@ -607,6 +611,8 @@
2582 tcb->seq = tcb->end_seq = tp->write_seq;
2583 tcb->tcp_flags = TCPHDR_ACK;
2584 tcb->sacked = 0;
2585 + if (tp->mpc)
2586 + mptcp_skb_entail_init(tp, skb);
2587 skb_header_release(skb);
2588 tcp_add_write_queue_tail(sk, skb);
2589 sk->sk_wmem_queued += skb->truesize;
2590 @@ -640,8 +646,8 @@
2591 atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
2594 -static void tcp_push(struct sock *sk, int flags, int mss_now,
2595 - int nonagle, int size_goal)
2596 +void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
2597 + int size_goal)
2599 struct tcp_sock *tp = tcp_sk(sk);
2600 struct sk_buff *skb;
2601 @@ -726,6 +732,14 @@
2602 int ret;
2604 sock_rps_record_flow(sk);
2606 +#ifdef CONFIG_MPTCP
2607 + if (tcp_sk(sk)->mpc) {
2608 + struct sock *sk_it;
2609 + mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it)
2610 +			sock_rps_record_flow(sk_it);
2611 +	}
2612 +#endif
2614 * We can't seek on a socket input
2616 @@ -821,8 +835,7 @@
2617 return NULL;
2620 -static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
2621 - int large_allowed)
2622 +unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
2624 struct tcp_sock *tp = tcp_sk(sk);
2625 u32 xmit_size_goal, old_size_goal;
2626 @@ -872,8 +885,13 @@
2628 int mss_now;
2630 - mss_now = tcp_current_mss(sk);
2631 - *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2632 + if (tcp_sk(sk)->mpc) {
2633 + mss_now = mptcp_current_mss(sk);
2634 + *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2635 + } else {
2636 + mss_now = tcp_current_mss(sk);
2637 +		*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2638 +	}
2640 return mss_now;
2642 @@ -897,6 +915,26 @@
2643 goto out_err;
2646 + if (tp->mpc) {
2647 + struct sock *sk_it = sk;
2649 +		/* We must check this with the socket lock held because we iterate
2650 + * over the subflows.
2651 + */
2652 + if (!mptcp_can_sendpage(sk)) {
2653 + ssize_t ret;
2655 + release_sock(sk);
2656 + ret = sock_no_sendpage(sk->sk_socket, page, offset,
2657 + size, flags);
2658 + lock_sock(sk);
2659 +			return ret;
2660 +		}
2661 +
2662 + mptcp_for_each_sk(tp->mpcb, sk_it)
2663 +			sock_rps_record_flow(sk_it);
2664 +	}
2665 +
2666 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2668 mss_now = tcp_send_mss(sk, &size_goal, flags);
2669 @@ -1001,8 +1039,9 @@
2671 ssize_t res;
2673 - if (!(sk->sk_route_caps & NETIF_F_SG) ||
2674 - !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
2675 + /* If MPTCP is enabled, we check it later after establishment */
2676 + if (!tcp_sk(sk)->mpc && (!(sk->sk_route_caps & NETIF_F_SG) ||
2677 + !(sk->sk_route_caps & NETIF_F_ALL_CSUM)))
2678 return sock_no_sendpage(sk->sk_socket, page, offset, size,
2679 flags);
2681 @@ -1018,6 +1057,9 @@
2682 const struct tcp_sock *tp = tcp_sk(sk);
2683 int tmp = tp->mss_cache;
2685 + if (tp->mpc)
2686 + return mptcp_select_size(sk, sg);
2688 if (sg) {
2689 if (sk_can_gso(sk)) {
2690 /* Small frames wont use a full page:
2691 @@ -1105,6 +1147,12 @@
2692 goto do_error;
2695 + if (tp->mpc) {
2696 + struct sock *sk_it = sk;
2697 + mptcp_for_each_sk(tp->mpcb, sk_it)
2698 +			sock_rps_record_flow(sk_it);
2699 +	}
2700 +
2701 if (unlikely(tp->repair)) {
2702 if (tp->repair_queue == TCP_RECV_QUEUE) {
2703 copied = tcp_send_rcvq(sk, msg, size);
2704 @@ -1132,7 +1180,10 @@
2705 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
2706 goto out_err;
2708 - sg = !!(sk->sk_route_caps & NETIF_F_SG);
2709 + if (tp->mpc)
2710 + sg = mptcp_can_sg(sk);
2711 + else
2712 + sg = !!(sk->sk_route_caps & NETIF_F_SG);
2714 while (--iovlen >= 0) {
2715 size_t seglen = iov->iov_len;
2716 @@ -1176,8 +1227,15 @@
2719 * Check whether we can use HW checksum.
2721 + * If dss-csum is enabled, we do not do hw-csum.
2722 + * In case of non-mptcp we check the
2723 + * device-capabilities.
2724 +			 * In case of mptcp, hw-csums will be handled
2725 + * later in mptcp_write_xmit.
2727 - if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
2728 + if (((tp->mpc && !tp->mpcb->dss_csum) || !tp->mpc) &&
2729 + (tp->mpc || sk->sk_route_caps & NETIF_F_ALL_CSUM))
2730 skb->ip_summed = CHECKSUM_PARTIAL;
2732 skb_entail(sk, skb);
2733 @@ -1386,6 +1444,11 @@
2735 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
2737 + if (is_meta_sk(sk)) {
2738 + mptcp_cleanup_rbuf(sk, copied);
2739 +		return;
2740 +	}
2741 +
2742 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
2743 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
2744 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
2745 @@ -1422,7 +1485,7 @@
2747 /* Optimize, __tcp_select_window() is not cheap. */
2748 if (2*rcv_window_now <= tp->window_clamp) {
2749 - __u32 new_window = __tcp_select_window(sk);
2750 + __u32 new_window = tp->__select_window(sk);
2752 /* Send ACK now, if this read freed lots of space
2753 * in our buffer. Certainly, new_window is new window.
2754 @@ -1623,6 +1686,14 @@
2756 lock_sock(sk);
2758 +#ifdef CONFIG_MPTCP
2759 + if (tp->mpc) {
2760 + struct sock *sk_it;
2761 + mptcp_for_each_sk(tp->mpcb, sk_it)
2762 +			sock_rps_record_flow(sk_it);
2763 +	}
2764 +#endif
2766 err = -ENOTCONN;
2767 if (sk->sk_state == TCP_LISTEN)
2768 goto out;
2769 @@ -2070,7 +2141,7 @@
2770 /* TCP_CLOSING */ TCP_CLOSING,
2773 -static int tcp_close_state(struct sock *sk)
2774 +int tcp_close_state(struct sock *sk)
2776 int next = (int)new_state[sk->sk_state];
2777 int ns = next & TCP_STATE_MASK;
2778 @@ -2099,8 +2170,12 @@
2779 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2780 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2781 /* Clear out any half completed packets. FIN if needed. */
2782 - if (tcp_close_state(sk))
2783 - tcp_send_fin(sk);
2784 + if (tcp_close_state(sk)) {
2785 + if (!is_meta_sk(sk))
2786 + tcp_send_fin(sk);
2787 + else
2788 +				mptcp_send_fin(sk);
2789 +		}
2790 	}
2791 }
2792 EXPORT_SYMBOL(tcp_shutdown);
2793 @@ -2125,6 +2200,11 @@
2794 int data_was_unread = 0;
2795 int state;
2797 + if (is_meta_sk(sk)) {
2798 + mptcp_close(sk, timeout);
2799 +		return;
2800 +	}
2801 +
2802 lock_sock(sk);
2803 sk->sk_shutdown = SHUTDOWN_MASK;
2805 @@ -2291,15 +2371,6 @@
2807 EXPORT_SYMBOL(tcp_close);
2809 -/* These states need RST on ABORT according to RFC793 */
2811 -static inline bool tcp_need_reset(int state)
2813 - return (1 << state) &
2814 - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2815 - TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2818 int tcp_disconnect(struct sock *sk, int flags)
2820 struct inet_sock *inet = inet_sk(sk);
2821 @@ -2340,6 +2411,13 @@
2822 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2823 inet_reset_saddr(sk);
2825 + if (is_meta_sk(sk)) {
2826 + mptcp_disconnect(sk);
2827 + } else {
2828 + if (tp->inside_tk_table)
2829 +			mptcp_hash_remove_bh(tp);
2830 +	}
2831 +
2832 sk->sk_shutdown = 0;
2833 sock_reset_flag(sk, SOCK_DONE);
2834 tp->srtt = 0;
2835 @@ -2699,6 +2777,18 @@
2836 tp->notsent_lowat = val;
2837 sk->sk_write_space(sk);
2838 break;
2839 +#ifdef CONFIG_MPTCP
2840 + case MPTCP_ENABLED:
2841 + if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_LISTEN) {
2842 + if (val)
2843 + tp->mptcp_enabled = 1;
2844 + else
2845 + tp->mptcp_enabled = 0;
2846 + } else {
2847 +			err = -EPERM;
2848 +		}
2849 + break;
2850 +#endif
2851 default:
2852 err = -ENOPROTOOPT;
2853 break;
2854 @@ -2918,6 +3008,11 @@
2855 case TCP_NOTSENT_LOWAT:
2856 val = tp->notsent_lowat;
2857 break;
2858 +#ifdef CONFIG_MPTCP
2859 + case MPTCP_ENABLED:
2860 + val = tp->mptcp_enabled;
2861 + break;
2862 +#endif
2863 default:
2864 return -ENOPROTOOPT;
2866 @@ -3088,8 +3183,11 @@
2867 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2868 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
2870 + WARN_ON(sk->sk_state == TCP_CLOSE);
2871 tcp_set_state(sk, TCP_CLOSE);
2873 tcp_clear_xmit_timers(sk);
2875 if (req != NULL)
2876 reqsk_fastopen_remove(sk, req, false);
2878 diff -Nur linux-3.14.45.orig/net/ipv4/tcp_input.c linux-3.14.45/net/ipv4/tcp_input.c
2879 --- linux-3.14.45.orig/net/ipv4/tcp_input.c 2015-06-23 02:01:36.000000000 +0200
2880 +++ linux-3.14.45/net/ipv4/tcp_input.c 2015-06-24 14:15:48.883862476 +0200
2881 @@ -74,6 +74,9 @@
2882 #include <linux/ipsec.h>
2883 #include <asm/unaligned.h>
2884 #include <net/netdma.h>
2885 +#include <net/mptcp.h>
2886 +#include <net/mptcp_v4.h>
2887 +#include <net/mptcp_v6.h>
2889 int sysctl_tcp_timestamps __read_mostly = 1;
2890 int sysctl_tcp_window_scaling __read_mostly = 1;
2891 @@ -99,25 +102,6 @@
2892 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
2893 int sysctl_tcp_early_retrans __read_mostly = 3;
2895 -#define FLAG_DATA 0x01 /* Incoming frame contained data. */
2896 -#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
2897 -#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
2898 -#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
2899 -#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
2900 -#define FLAG_DATA_SACKED 0x20 /* New SACK. */
2901 -#define FLAG_ECE 0x40 /* ECE in this ACK */
2902 -#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
2903 -#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
2904 -#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
2905 -#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
2906 -#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
2907 -#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
2909 -#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
2910 -#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
2911 -#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
2912 -#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
2914 #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
2915 #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
2917 @@ -283,8 +267,12 @@
2918 per_mss = roundup_pow_of_two(per_mss) +
2919 SKB_DATA_ALIGN(sizeof(struct sk_buff));
2921 - nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
2922 - nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
2923 + if (tp->mpc) {
2924 + nr_segs = mptcp_check_snd_buf(tp);
2925 + } else {
2926 + nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
2927 +		nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
2928 +	}
2930 /* Fast Recovery (RFC 5681 3.2) :
2931 * Cubic needs 1.7 factor, rounded to 2 to include
2932 @@ -292,8 +280,16 @@
2934 sndmem = 2 * nr_segs * per_mss;
2936 - if (sk->sk_sndbuf < sndmem)
2937 + /* MPTCP: after this sndmem is the new contribution of the
2938 + * current subflow to the aggregated sndbuf */
2939 + if (sk->sk_sndbuf < sndmem) {
2940 + int old_sndbuf = sk->sk_sndbuf;
2941 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
2942 + /* MPTCP: ok, the subflow sndbuf has grown, reflect
2943 + * this in the aggregate buffer.*/
2944 + if (tp->mpc && old_sndbuf != sk->sk_sndbuf)
2945 +			mptcp_update_sndbuf(tp->mpcb);
2946 +	}
2947 }
2949 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
2950 @@ -342,10 +338,12 @@
2951 static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
2953 struct tcp_sock *tp = tcp_sk(sk);
2954 + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
2955 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
2957 /* Check #1 */
2958 - if (tp->rcv_ssthresh < tp->window_clamp &&
2959 - (int)tp->rcv_ssthresh < tcp_space(sk) &&
2960 + if (meta_tp->rcv_ssthresh < meta_tp->window_clamp &&
2961 + (int)meta_tp->rcv_ssthresh < tcp_space(sk) &&
2962 !sk_under_memory_pressure(sk)) {
2963 int incr;
2965 @@ -353,14 +351,14 @@
2966 * will fit to rcvbuf in future.
2968 if (tcp_win_from_space(skb->truesize) <= skb->len)
2969 - incr = 2 * tp->advmss;
2970 + incr = 2 * meta_tp->advmss;
2971 else
2972 - incr = __tcp_grow_window(sk, skb);
2973 + incr = __tcp_grow_window(meta_sk, skb);
2975 if (incr) {
2976 incr = max_t(int, incr, 2 * skb->len);
2977 - tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
2978 - tp->window_clamp);
2979 + meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr,
2980 + meta_tp->window_clamp);
2981 inet_csk(sk)->icsk_ack.quick |= 1;
2984 @@ -543,7 +541,10 @@
2985 int copied;
2987 time = tcp_time_stamp - tp->rcvq_space.time;
2988 - if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
2989 + if (tp->mpc) {
2990 + if (mptcp_check_rtt(tp, time))
2991 + return;
2992 + } else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
2993 return;
2995 /* Number of bytes copied to user in last RTT */
2996 @@ -768,7 +769,7 @@
2997 /* Calculate rto without backoff. This is the second half of Van Jacobson's
2998 * routine referred to above.
3000 -static void tcp_set_rto(struct sock *sk)
3001 +void tcp_set_rto(struct sock *sk)
3003 const struct tcp_sock *tp = tcp_sk(sk);
3004 /* Old crap is replaced with new one. 8)
3005 @@ -2909,7 +2910,7 @@
3006 return false;
3008 tcp_rtt_estimator(sk, seq_rtt);
3009 - tcp_set_rto(sk);
3010 + tp->set_rto(sk);
3012 /* RFC6298: only reset backoff on valid RTT measurement. */
3013 inet_csk(sk)->icsk_backoff = 0;
3014 @@ -2993,7 +2994,7 @@
3017 /* If we get here, the whole TSO packet has not been acked. */
3018 -static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3019 +u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3021 struct tcp_sock *tp = tcp_sk(sk);
3022 u32 packets_acked;
3023 @@ -3088,6 +3089,8 @@
3025 if (!(scb->tcp_flags & TCPHDR_SYN)) {
3026 flag |= FLAG_DATA_ACKED;
3027 + if (tp->mpc && mptcp_is_data_seq(skb))
3028 + flag |= MPTCP_FLAG_DATA_ACKED;
3029 } else {
3030 flag |= FLAG_SYN_ACKED;
3031 tp->retrans_stamp = 0;
3032 @@ -3190,7 +3193,7 @@
3033 return flag;
3036 -static void tcp_ack_probe(struct sock *sk)
3037 +void tcp_ack_probe(struct sock *sk)
3039 const struct tcp_sock *tp = tcp_sk(sk);
3040 struct inet_connection_sock *icsk = inet_csk(sk);
3041 @@ -3237,9 +3240,8 @@
3042 /* Check that window update is acceptable.
3043 * The function assumes that snd_una<=ack<=snd_next.
3045 -static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3046 - const u32 ack, const u32 ack_seq,
3047 - const u32 nwin)
3048 +bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
3049 + const u32 ack_seq, const u32 nwin)
3051 return after(ack, tp->snd_una) ||
3052 after(ack_seq, tp->snd_wl1) ||
3053 @@ -3358,7 +3360,7 @@
3056 /* This routine deals with incoming acks, but not outgoing ones. */
3057 -static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3058 +static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3060 struct inet_connection_sock *icsk = inet_csk(sk);
3061 struct tcp_sock *tp = tcp_sk(sk);
3062 @@ -3453,6 +3455,16 @@
3063 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
3064 acked -= tp->packets_out;
3066 + if (tp->mpc) {
3067 + if (mptcp_fallback_infinite(sk, flag)) {
3068 + pr_err("%s resetting flow\n", __func__);
3069 + mptcp_send_reset(sk);
3070 +			goto invalid_ack;
3071 +		}
3072 +
3073 +		mptcp_clean_rtx_infinite(skb, sk);
3074 +	}
3075 +
3076 /* Advance cwnd if state allows */
3077 if (tcp_may_raise_cwnd(sk, flag))
3078 tcp_cong_avoid(sk, ack, acked, prior_in_flight);
3079 @@ -3517,8 +3529,9 @@
3080 * the fast version below fails.
3082 void tcp_parse_options(const struct sk_buff *skb,
3083 - struct tcp_options_received *opt_rx, int estab,
3084 - struct tcp_fastopen_cookie *foc)
3085 + struct tcp_options_received *opt_rx,
3086 + struct mptcp_options_received *mopt,
3087 + int estab, struct tcp_fastopen_cookie *foc)
3089 const unsigned char *ptr;
3090 const struct tcphdr *th = tcp_hdr(skb);
3091 @@ -3601,6 +3614,10 @@
3093 break;
3094 #endif
3095 + case TCPOPT_MPTCP:
3096 + mptcp_parse_options(ptr - 2, opsize, opt_rx,
3097 + mopt, skb);
3098 + break;
3099 case TCPOPT_EXP:
3100 /* Fast Open option shares code 254 using a
3101 * 16 bits magic number. It's valid only in
3102 @@ -3662,8 +3679,8 @@
3103 if (tcp_parse_aligned_timestamp(tp, th))
3104 return true;
3107 - tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3108 + tcp_parse_options(skb, &tp->rx_opt, tp->mpc ? &tp->mptcp->rx_opt : NULL,
3109 + 1, NULL);
3110 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3111 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3113 @@ -3836,6 +3853,8 @@
3114 dst = __sk_dst_get(sk);
3115 if (!dst || !dst_metric(dst, RTAX_QUICKACK))
3116 inet_csk(sk)->icsk_ack.pingpong = 1;
3117 + if (tp->mpc)
3118 + mptcp_sub_close_passive(sk);
3119 break;
3121 case TCP_CLOSE_WAIT:
3122 @@ -3857,6 +3876,13 @@
3123 tcp_set_state(sk, TCP_CLOSING);
3124 break;
3125 case TCP_FIN_WAIT2:
3126 + if (tp->mpc) {
3127 + /* The socket will get closed by mptcp_data_ready.
3128 + * We first have to process all data-sequences.
3129 + */
3130 + tp->close_it = 1;
3131 +			break;
3132 +		}
3133 /* Received a FIN -- send ACK and enter TIME_WAIT. */
3134 tcp_send_ack(sk);
3135 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
3136 @@ -3881,6 +3907,10 @@
3137 if (!sock_flag(sk, SOCK_DEAD)) {
3138 sk->sk_state_change(sk);
3140 + /* Don't wake up MPTCP-subflows */
3141 + if (tp->mpc)
3142 + return;
3144 /* Do not send POLL_HUP for half duplex close. */
3145 if (sk->sk_shutdown == SHUTDOWN_MASK ||
3146 sk->sk_state == TCP_CLOSE)
3147 @@ -4078,7 +4108,11 @@
3148 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
3151 - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
3152 + /* In case of MPTCP, the segment may be empty if it's a
3153 + * non-data DATA_FIN. (see beginning of tcp_data_queue)
3154 + */
3155 + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
3156 + !(tp->mpc && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) {
3157 SOCK_DEBUG(sk, "ofo packet was already received\n");
3158 __skb_unlink(skb, &tp->out_of_order_queue);
3159 __kfree_skb(skb);
3160 @@ -4102,6 +4136,9 @@
3161 static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3162 unsigned int size)
3164 + if (tcp_sk(sk)->mpc)
3165 + sk = mptcp_meta_sk(sk);
3167 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
3168 !sk_rmem_schedule(sk, skb, size)) {
3170 @@ -4132,15 +4169,16 @@
3171 * Better try to coalesce them right now to avoid future collapses.
3172 * Returns true if caller should free @from instead of queueing it
3174 -static bool tcp_try_coalesce(struct sock *sk,
3175 - struct sk_buff *to,
3176 - struct sk_buff *from,
3177 - bool *fragstolen)
3178 +bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from,
3179 + bool *fragstolen)
3181 int delta;
3183 *fragstolen = false;
3185 + if (tcp_sk(sk)->mpc && !is_meta_sk(sk))
3186 + return false;
3188 if (tcp_hdr(from)->fin)
3189 return false;
3191 @@ -4230,7 +4268,9 @@
3193 /* Do skb overlap to previous one? */
3194 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
3195 - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
3196 + /* MPTCP allows non-data data-fin to be in the ofo-queue */
3197 + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) &&
3198 + !(tp->mpc && end_seq == seq)) {
3199 /* All the bits are present. Drop. */
3200 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
3201 __kfree_skb(skb);
3202 @@ -4268,6 +4308,9 @@
3203 end_seq);
3204 break;
3206 + /* MPTCP allows non-data data-fin to be in the ofo-queue */
3207 + if (tp->mpc && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq)
3208 + continue;
3209 __skb_unlink(skb1, &tp->out_of_order_queue);
3210 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
3211 TCP_SKB_CB(skb1)->end_seq);
3212 @@ -4285,8 +4328,8 @@
3216 -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
3217 - bool *fragstolen)
3218 +int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
3219 + bool *fragstolen)
3221 int eaten;
3222 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
3223 @@ -4348,7 +4391,10 @@
3224 int eaten = -1;
3225 bool fragstolen = false;
3227 - if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
3228 + /* If no data is present, but a data_fin is in the options, we still
3229 + * have to call mptcp_queue_skb later on. */
3230 + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
3231 + !(tp->mpc && mptcp_is_data_fin(skb)))
3232 goto drop;
3234 skb_dst_drop(skb);
3235 @@ -4394,7 +4440,7 @@
3236 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
3238 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3239 - if (skb->len)
3240 + if (skb->len || mptcp_is_data_fin(skb))
3241 tcp_event_data_recv(sk, skb);
3242 if (th->fin)
3243 tcp_fin(sk);
3244 @@ -4416,7 +4462,11 @@
3246 if (eaten > 0)
3247 kfree_skb_partial(skb, fragstolen);
3248 - if (!sock_flag(sk, SOCK_DEAD))
3249 + if (!sock_flag(sk, SOCK_DEAD) || tp->mpc)
3250 + /* MPTCP: we always have to call data_ready, because
3251 + * we may be about to receive a data-fin, which still
3252 + * must get queued.
3253 + */
3254 sk->sk_data_ready(sk, 0);
3255 return;
3257 @@ -4468,6 +4518,8 @@
3258 next = skb_queue_next(list, skb);
3260 __skb_unlink(skb, list);
3261 + if (tcp_sk(sk)->mpc)
3262 + mptcp_remove_shortcuts(tcp_sk(sk)->mpcb, skb);
3263 __kfree_skb(skb);
3264 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
3266 @@ -4640,6 +4692,18 @@
3267 struct tcp_sock *tp = tcp_sk(sk);
3268 bool res = false;
3270 + if (is_meta_sk(sk)) {
3271 + if (!skb_queue_empty(&tp->out_of_order_queue)) {
3272 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
3273 +			mptcp_purge_ofo_queue(tp);
3274 +
3275 +			/* No sack at the mptcp-level */
3276 +			sk_mem_reclaim(sk);
3277 +			res = true;
3278 +		}
3279 +		return res;
3280 +	}
3281 +
3282 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3283 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
3284 __skb_queue_purge(&tp->out_of_order_queue);
3285 @@ -4729,7 +4793,7 @@
3286 tp->snd_cwnd_stamp = tcp_time_stamp;
3289 -static bool tcp_should_expand_sndbuf(const struct sock *sk)
3290 +bool tcp_should_expand_sndbuf(const struct sock *sk)
3292 const struct tcp_sock *tp = tcp_sk(sk);
3294 @@ -4764,7 +4828,7 @@
3296 struct tcp_sock *tp = tcp_sk(sk);
3298 - if (tcp_should_expand_sndbuf(sk)) {
3299 + if (tp->should_expand_sndbuf(sk)) {
3300 tcp_sndbuf_expand(sk);
3301 tp->snd_cwnd_stamp = tcp_time_stamp;
3303 @@ -4776,8 +4840,9 @@
3305 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
3306 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
3307 - if (sk->sk_socket &&
3308 - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
3309 + if (tcp_sk(sk)->mpc ||
3310 + (sk->sk_socket &&
3311 + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)))
3312 tcp_new_space(sk);
3315 @@ -4800,7 +4865,7 @@
3316 /* ... and right edge of window advances far enough.
3317 * (tcp_recvmsg() will send ACK otherwise). Or...
3319 - __tcp_select_window(sk) >= tp->rcv_wnd) ||
3320 + tp->__select_window(sk) >= tp->rcv_wnd) ||
3321 /* We ACK each frame or... */
3322 tcp_in_quickack_mode(sk) ||
3323 /* We have out of order data. */
3324 @@ -4902,6 +4967,10 @@
3326 struct tcp_sock *tp = tcp_sk(sk);
3328 + /* MPTCP urgent data is not yet supported */
3329 + if (tp->mpc)
3330 + return;
3332 /* Check if we get a new urgent pointer - normally not. */
3333 if (th->urg)
3334 tcp_check_urg(sk, th);
3335 @@ -4969,8 +5038,7 @@
3338 #ifdef CONFIG_NET_DMA
3339 -static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
3340 - int hlen)
3341 +bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
3343 struct tcp_sock *tp = tcp_sk(sk);
3344 int chunk = skb->len - hlen;
3345 @@ -5079,9 +5147,15 @@
3346 goto discard;
3349 + /* If valid: post process the received MPTCP options. */
3350 + if (tp->mpc && mptcp_handle_options(sk, th, skb))
3351 + goto discard;
3353 return true;
3355 discard:
3356 + if (tp->mpc)
3357 + mptcp_reset_mopt(tp);
3358 __kfree_skb(skb);
3359 return false;
3361 @@ -5133,6 +5207,10 @@
3363 tp->rx_opt.saw_tstamp = 0;
3365 + /* MPTCP: force slowpath. */
3366 + if (tp->mpc)
3367 + goto slow_path;
3369 /* pred_flags is 0xS?10 << 16 + snd_wnd
3370 * if header_prediction is to be made
3371 * 'S' will always be tp->tcp_header_len >> 2
3372 @@ -5347,7 +5425,7 @@
3374 tp->lsndtime = tcp_time_stamp;
3376 - tcp_init_buffer_space(sk);
3377 + tp->init_buffer_space(sk);
3379 if (sock_flag(sk, SOCK_KEEPOPEN))
3380 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
3381 @@ -5377,7 +5455,7 @@
3382 /* Get original SYNACK MSS value if user MSS sets mss_clamp */
3383 tcp_clear_options(&opt);
3384 opt.user_mss = opt.mss_clamp = 0;
3385 - tcp_parse_options(synack, &opt, 0, NULL);
3386 + tcp_parse_options(synack, &opt, NULL, 0, NULL);
3387 mss = opt.mss_clamp;
3390 @@ -5412,8 +5490,11 @@
3391 struct tcp_sock *tp = tcp_sk(sk);
3392 struct tcp_fastopen_cookie foc = { .len = -1 };
3393 int saved_clamp = tp->rx_opt.mss_clamp;
3394 + struct mptcp_options_received mopt;
3395 + mptcp_init_mp_opt(&mopt);
3397 - tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
3398 + tcp_parse_options(skb, &tp->rx_opt,
3399 + tp->mpc ? &tp->mptcp->rx_opt : &mopt, 0, &foc);
3400 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3401 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3403 @@ -5460,6 +5541,21 @@
3404 if (!th->syn)
3405 goto discard_and_undo;
3407 + if (tp->request_mptcp || tp->mpc) {
3408 + int ret;
3409 + ret = mptcp_rcv_synsent_state_process(sk, &sk,
3410 + skb, &mopt);
3412 + /* May have changed if we support MPTCP */
3413 + tp = tcp_sk(sk);
3414 + icsk = inet_csk(sk);
3416 + if (ret == 1)
3417 + goto reset_and_undo;
3418 + if (ret == 2)
3419 +			goto discard;
3420 +	}
3421 +
3422 /* rfc793:
3423 * "If the SYN bit is on ...
3424 * are acceptable then ...
3425 @@ -5472,6 +5568,15 @@
3426 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
3427 tcp_ack(sk, skb, FLAG_SLOWPATH);
3429 + if (tp->mpc && !is_master_tp(tp)) {
3430 + /* Timer for repeating the ACK until an answer
3431 + * arrives. Used only when establishing an additional
3432 + * subflow inside of an MPTCP connection.
3433 + */
3434 + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
3435 +				       jiffies + icsk->icsk_rto);
3436 +	}
3437 +
3438 /* Ok.. it's good. Set up sequence numbers and
3439 * move to established.
3441 @@ -5498,6 +5603,11 @@
3442 tp->tcp_header_len = sizeof(struct tcphdr);
3445 + if (tp->mpc) {
3446 + tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
3447 +		tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
3448 +	}
3449 +
3450 if (tcp_is_sack(tp) && sysctl_tcp_fack)
3451 tcp_enable_fack(tp);
3453 @@ -5518,7 +5628,9 @@
3454 tcp_rcv_fastopen_synack(sk, skb, &foc))
3455 return -1;
3457 - if (sk->sk_write_pending ||
3458 + /* With MPTCP we cannot send data on the third ack due to the
3459 + * lack of option-space */
3460 + if ((sk->sk_write_pending && !tp->mpc) ||
3461 icsk->icsk_accept_queue.rskq_defer_accept ||
3462 icsk->icsk_ack.pingpong) {
3463 /* Save one ACK. Data will be ready after
3464 @@ -5560,6 +5672,7 @@
3465 tcp_paws_reject(&tp->rx_opt, 0))
3466 goto discard_and_undo;
3468 + /* TODO - check this here for MPTCP */
3469 if (th->syn) {
3470 /* We see SYN without ACK. It is attempt of
3471 * simultaneous connect with crossed SYNs.
3472 @@ -5576,6 +5689,11 @@
3473 tp->tcp_header_len = sizeof(struct tcphdr);
3476 + if (tp->mpc) {
3477 + tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
3478 + tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
3481 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
3482 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
3484 @@ -5634,6 +5752,7 @@
3486 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
3487 const struct tcphdr *th, unsigned int len)
3488 + __releases(&sk->sk_lock.slock)
3490 struct tcp_sock *tp = tcp_sk(sk);
3491 struct inet_connection_sock *icsk = inet_csk(sk);
3492 @@ -5685,6 +5804,10 @@
3494 case TCP_SYN_SENT:
3495 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
3496 + if (is_meta_sk(sk)) {
3497 + sk = tcp_sk(sk)->mpcb->master_sk;
3498 + tp = tcp_sk(sk);
3500 if (queued >= 0)
3501 return queued;
3503 @@ -5692,6 +5815,8 @@
3504 tcp_urg(sk, skb, th);
3505 __kfree_skb(skb);
3506 tcp_data_snd_check(sk);
3507 + if (tp->mpc && is_master_tp(tp))
3508 + bh_unlock_sock(sk);
3509 return 0;
3512 @@ -5734,7 +5859,7 @@
3514 tcp_mtup_init(sk);
3515 tp->copied_seq = tp->rcv_nxt;
3516 - tcp_init_buffer_space(sk);
3517 + tp->init_buffer_space(sk);
3519 smp_mb();
3520 tcp_set_state(sk, TCP_ESTABLISHED);
3521 @@ -5754,6 +5879,8 @@
3523 if (tp->rx_opt.tstamp_ok)
3524 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
3525 + if (tp->mpc)
3526 + tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
3528 if (req) {
3529 /* Re-arm the timer because data may have been sent out.
3530 @@ -5775,6 +5902,12 @@
3532 tcp_initialize_rcv_mss(sk);
3533 tcp_fast_path_on(tp);
3534 + /* Send an ACK when establishing a new
3535 + * MPTCP subflow, i.e. using an MP_JOIN
3536 + * subtype.
3537 + */
3538 + if (tp->mpc && !is_master_tp(tp))
3539 + tcp_send_ack(sk);
3540 break;
3542 case TCP_FIN_WAIT1: {
3543 @@ -5826,7 +5959,8 @@
3544 tmo = tcp_fin_time(sk);
3545 if (tmo > TCP_TIMEWAIT_LEN) {
3546 inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
3547 - } else if (th->fin || sock_owned_by_user(sk)) {
3548 + } else if (th->fin || mptcp_is_data_fin(skb) ||
3549 + sock_owned_by_user(sk)) {
3550 /* Bad case. We could lose such FIN otherwise.
3551 * It is not a big problem, but it looks confusing
3552 * and not so rare event. We still can lose it now,
3553 @@ -5855,6 +5989,9 @@
3554 goto discard;
3556 break;
3557 + case TCP_CLOSE:
3558 + if (tp->mp_killed)
3559 + goto discard;
3562 /* step 6: check the URG bit */
3563 @@ -5875,7 +6012,11 @@
3565 if (sk->sk_shutdown & RCV_SHUTDOWN) {
3566 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
3567 - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
3568 + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
3569 + !tp->mpc) {
3570 + /* In case of mptcp, the reset is handled by
3571 + * mptcp_rcv_state_process
3572 + */
3573 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
3574 tcp_reset(sk);
3575 return 1;
3576 diff -Nur linux-3.14.45.orig/net/ipv4/tcp_ipv4.c linux-3.14.45/net/ipv4/tcp_ipv4.c
3577 --- linux-3.14.45.orig/net/ipv4/tcp_ipv4.c 2015-06-23 02:01:36.000000000 +0200
3578 +++ linux-3.14.45/net/ipv4/tcp_ipv4.c 2015-06-24 14:15:48.883862476 +0200
3579 @@ -67,6 +67,8 @@
3580 #include <net/icmp.h>
3581 #include <net/inet_hashtables.h>
3582 #include <net/tcp.h>
3583 +#include <net/mptcp.h>
3584 +#include <net/mptcp_v4.h>
3585 #include <net/transp_v6.h>
3586 #include <net/ipv6.h>
3587 #include <net/inet_common.h>
3588 @@ -99,7 +101,7 @@
3589 struct inet_hashinfo tcp_hashinfo;
3590 EXPORT_SYMBOL(tcp_hashinfo);
3592 -static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
3593 +__u32 tcp_v4_init_sequence(const struct sk_buff *skb)
3595 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
3596 ip_hdr(skb)->saddr,
3597 @@ -335,7 +337,7 @@
3598 struct inet_sock *inet;
3599 const int type = icmp_hdr(icmp_skb)->type;
3600 const int code = icmp_hdr(icmp_skb)->code;
3601 - struct sock *sk;
3602 + struct sock *sk, *meta_sk;
3603 struct sk_buff *skb;
3604 struct request_sock *req;
3605 __u32 seq;
3606 @@ -359,13 +361,19 @@
3607 return;
3610 - bh_lock_sock(sk);
3611 + tp = tcp_sk(sk);
3612 + if (tp->mpc)
3613 + meta_sk = mptcp_meta_sk(sk);
3614 + else
3615 + meta_sk = sk;
3617 + bh_lock_sock(meta_sk);
3618 /* If too many ICMPs get dropped on busy
3619 * servers this needs to be solved differently.
3620 * We do take care of PMTU discovery (RFC1191) special case :
3621 * we can receive locally generated ICMP messages while socket is held.
3623 - if (sock_owned_by_user(sk)) {
3624 + if (sock_owned_by_user(meta_sk)) {
3625 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
3626 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
3628 @@ -378,7 +386,6 @@
3631 icsk = inet_csk(sk);
3632 - tp = tcp_sk(sk);
3633 req = tp->fastopen_rsk;
3634 seq = ntohl(th->seq);
3635 if (sk->sk_state != TCP_LISTEN &&
3636 @@ -412,11 +419,13 @@
3637 goto out;
3639 tp->mtu_info = info;
3640 - if (!sock_owned_by_user(sk)) {
3641 + if (!sock_owned_by_user(meta_sk)) {
3642 tcp_v4_mtu_reduced(sk);
3643 } else {
3644 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
3645 sock_hold(sk);
3646 + if (tp->mpc)
3647 + mptcp_tsq_flags(sk);
3649 goto out;
3651 @@ -432,7 +441,7 @@
3653 /* XXX (TFO) - revisit the following logic for TFO */
3655 - if (sock_owned_by_user(sk))
3656 + if (sock_owned_by_user(meta_sk))
3657 break;
3659 icsk->icsk_backoff--;
3660 @@ -474,7 +483,7 @@
3661 switch (sk->sk_state) {
3662 struct request_sock *req, **prev;
3663 case TCP_LISTEN:
3664 - if (sock_owned_by_user(sk))
3665 + if (sock_owned_by_user(meta_sk))
3666 goto out;
3668 req = inet_csk_search_req(sk, &prev, th->dest,
3669 @@ -507,7 +516,7 @@
3670 It can f.e. if SYNs crossed,
3671 or Fast Open.
3673 - if (!sock_owned_by_user(sk)) {
3674 + if (!sock_owned_by_user(meta_sk)) {
3675 sk->sk_err = err;
3677 sk->sk_error_report(sk);
3678 @@ -536,7 +545,7 @@
3681 inet = inet_sk(sk);
3682 - if (!sock_owned_by_user(sk) && inet->recverr) {
3683 + if (!sock_owned_by_user(meta_sk) && inet->recverr) {
3684 sk->sk_err = err;
3685 sk->sk_error_report(sk);
3686 } else { /* Only an error on timeout */
3687 @@ -544,7 +553,7 @@
3690 out:
3691 - bh_unlock_sock(sk);
3692 + bh_unlock_sock(meta_sk);
3693 sock_put(sk);
3696 @@ -586,7 +595,7 @@
3697 * Exception: precedence violation. We do not implement it in any case.
3700 -static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
3701 +void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
3703 const struct tcphdr *th = tcp_hdr(skb);
3704 struct {
3705 @@ -711,10 +720,10 @@
3706 outside socket context is ugly, certainly. What can I do?
3709 -static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
3710 +static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
3711 u32 win, u32 tsval, u32 tsecr, int oif,
3712 struct tcp_md5sig_key *key,
3713 - int reply_flags, u8 tos)
3714 + int reply_flags, u8 tos, int mptcp)
3716 const struct tcphdr *th = tcp_hdr(skb);
3717 struct {
3718 @@ -723,6 +732,10 @@
3719 #ifdef CONFIG_TCP_MD5SIG
3720 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
3721 #endif
3722 +#ifdef CONFIG_MPTCP
3723 + + ((MPTCP_SUB_LEN_DSS >> 2) +
3724 + (MPTCP_SUB_LEN_ACK >> 2))
3725 +#endif
3727 } rep;
3728 struct ip_reply_arg arg;
3729 @@ -767,6 +780,21 @@
3730 ip_hdr(skb)->daddr, &rep.th);
3732 #endif
3733 +#ifdef CONFIG_MPTCP
3734 + if (mptcp) {
3735 + int offset = (tsecr) ? 3 : 0;
3736 + /* Construction of 32-bit data_ack */
3737 + rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
3738 + ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
3739 + (0x20 << 8) |
3740 + (0x01));
3741 + rep.opt[offset] = htonl(data_ack);
3743 + arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
3744 + rep.th.doff = arg.iov[0].iov_len / 4;
3746 +#endif /* CONFIG_MPTCP */
3748 arg.flags = reply_flags;
3749 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
3750 ip_hdr(skb)->saddr, /* XXX */
3751 @@ -786,36 +814,44 @@
3753 struct inet_timewait_sock *tw = inet_twsk(sk);
3754 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
3755 + u32 data_ack = 0;
3756 + int mptcp = 0;
3758 + if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
3759 + data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
3760 + mptcp = 1;
3763 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
3764 + data_ack,
3765 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
3766 tcp_time_stamp + tcptw->tw_ts_offset,
3767 tcptw->tw_ts_recent,
3768 tw->tw_bound_dev_if,
3769 tcp_twsk_md5_key(tcptw),
3770 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
3771 - tw->tw_tos
3772 + tw->tw_tos, mptcp
3775 inet_twsk_put(tw);
3778 -static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
3779 - struct request_sock *req)
3780 +void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
3781 + struct request_sock *req)
3783 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
3784 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
3786 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
3787 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
3788 - tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
3789 + tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd,
3790 tcp_time_stamp,
3791 req->ts_recent,
3793 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
3794 AF_INET),
3795 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
3796 - ip_hdr(skb)->tos);
3797 + ip_hdr(skb)->tos, 0);
3801 @@ -823,9 +859,9 @@
3802 * This still operates on a request_sock only, not on a big
3803 * socket.
3805 -static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
3806 - struct request_sock *req,
3807 - u16 queue_mapping)
3808 +int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
3809 + struct request_sock *req,
3810 + u16 queue_mapping)
3812 const struct inet_request_sock *ireq = inet_rsk(req);
3813 struct flowi4 fl4;
3814 @@ -853,7 +889,7 @@
3815 return err;
3818 -static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
3819 +int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
3821 int res = tcp_v4_send_synack(sk, NULL, req, 0);
3823 @@ -865,7 +901,7 @@
3825 * IPv4 request_sock destructor.
3827 -static void tcp_v4_reqsk_destructor(struct request_sock *req)
3828 +void tcp_v4_reqsk_destructor(struct request_sock *req)
3830 kfree(inet_rsk(req)->opt);
3832 @@ -905,7 +941,7 @@
3834 * Save and compile IPv4 options into the request_sock if needed.
3836 -static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
3837 +struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
3839 const struct ip_options *opt = &(IPCB(skb)->opt);
3840 struct ip_options_rcu *dopt = NULL;
3841 @@ -1257,7 +1293,7 @@
3844 #ifdef CONFIG_TCP_MD5SIG
3845 -static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
3846 +const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
3847 .md5_lookup = tcp_v4_reqsk_md5_lookup,
3848 .calc_md5_hash = tcp_v4_md5_hash_skb,
3850 @@ -1415,7 +1451,7 @@
3851 tcp_init_congestion_control(child);
3852 tcp_mtup_init(child);
3853 tcp_init_metrics(child);
3854 - tcp_init_buffer_space(child);
3855 + tp->init_buffer_space(child);
3857 /* Queue the data carried in the SYN packet. We need to first
3858 * bump skb's refcnt because the caller will attempt to free it.
3859 @@ -1447,6 +1483,7 @@
3860 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
3862 struct tcp_options_received tmp_opt;
3863 + struct mptcp_options_received mopt;
3864 struct request_sock *req;
3865 struct inet_request_sock *ireq;
3866 struct tcp_sock *tp = tcp_sk(sk);
3867 @@ -1461,6 +1498,22 @@
3868 struct sk_buff *skb_synack;
3869 int do_fastopen;
3871 + tcp_clear_options(&tmp_opt);
3872 + tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
3873 + tmp_opt.user_mss = tp->rx_opt.user_mss;
3874 + mptcp_init_mp_opt(&mopt);
3875 + tcp_parse_options(skb, &tmp_opt, &mopt, 0, want_cookie ? NULL : &foc);
3877 +#ifdef CONFIG_MPTCP
3878 + /* MPTCP structures not initialized, so clear MPTCP fields */
3879 + if (mptcp_init_failed)
3880 + mptcp_init_mp_opt(&mopt);
3882 + if (mopt.is_mp_join)
3883 + return mptcp_do_join_short(skb, &mopt, &tmp_opt, sock_net(sk));
3884 + if (mopt.drop_me)
3885 + goto drop;
3886 +#endif
3887 /* Never answer to SYNs send to broadcast or multicast */
3888 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
3889 goto drop;
3890 @@ -1486,7 +1539,22 @@
3891 goto drop;
3894 - req = inet_reqsk_alloc(&tcp_request_sock_ops);
3895 +#ifdef CONFIG_MPTCP
3896 + if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled)
3897 + mopt.saw_mpc = 0;
3898 + if (mopt.saw_mpc && !want_cookie) {
3899 + req = inet_reqsk_alloc(&mptcp_request_sock_ops);
3901 + if (!req)
3902 + goto drop;
3904 + mptcp_rsk(req)->mpcb = NULL;
3905 + mptcp_rsk(req)->dss_csum = mopt.dss_csum;
3906 + mptcp_rsk(req)->collide_tk.pprev = NULL;
3907 + } else
3908 +#endif
3909 + req = inet_reqsk_alloc(&tcp_request_sock_ops);
3911 if (!req)
3912 goto drop;
3914 @@ -1494,17 +1562,15 @@
3915 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
3916 #endif
3918 - tcp_clear_options(&tmp_opt);
3919 - tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
3920 - tmp_opt.user_mss = tp->rx_opt.user_mss;
3921 - tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
3923 if (want_cookie && !tmp_opt.saw_tstamp)
3924 tcp_clear_options(&tmp_opt);
3926 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
3927 tcp_openreq_init(req, &tmp_opt, skb);
3929 + if (mopt.saw_mpc && !want_cookie)
3930 + mptcp_reqsk_new_mptcp(req, &tmp_opt, &mopt, skb);
3932 ireq = inet_rsk(req);
3933 ireq->ir_loc_addr = daddr;
3934 ireq->ir_rmt_addr = saddr;
3935 @@ -1716,7 +1782,7 @@
3937 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
3939 -static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
3940 +struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
3942 struct tcphdr *th = tcp_hdr(skb);
3943 const struct iphdr *iph = ip_hdr(skb);
3944 @@ -1733,8 +1799,15 @@
3946 if (nsk) {
3947 if (nsk->sk_state != TCP_TIME_WAIT) {
3948 + /* Don't lock the meta-sk again. It has been locked
3949 + * before mptcp_v4_do_rcv.
3950 + */
3951 + if (tcp_sk(nsk)->mpc && !is_meta_sk(sk))
3952 + bh_lock_sock(mptcp_meta_sk(nsk));
3953 bh_lock_sock(nsk);
3955 return nsk;
3958 inet_twsk_put(inet_twsk(nsk));
3959 return NULL;
3960 @@ -1791,6 +1864,9 @@
3961 goto discard;
3962 #endif
3964 + if (is_meta_sk(sk))
3965 + return mptcp_v4_do_rcv(sk, skb);
3967 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
3968 struct dst_entry *dst = sk->sk_rx_dst;
3970 @@ -1922,7 +1998,7 @@
3971 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
3972 wake_up_interruptible_sync_poll(sk_sleep(sk),
3973 POLLIN | POLLRDNORM | POLLRDBAND);
3974 - if (!inet_csk_ack_scheduled(sk))
3975 + if (!inet_csk_ack_scheduled(sk) && !tp->mpc)
3976 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3977 (3 * tcp_rto_min(sk)) / 4,
3978 TCP_RTO_MAX);
3979 @@ -1939,7 +2015,7 @@
3981 const struct iphdr *iph;
3982 const struct tcphdr *th;
3983 - struct sock *sk;
3984 + struct sock *sk, *meta_sk = NULL;
3985 int ret;
3986 struct net *net = dev_net(skb->dev);
3988 @@ -1972,18 +2048,42 @@
3989 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
3990 skb->len - th->doff * 4);
3991 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
3992 +#ifdef CONFIG_MPTCP
3993 + TCP_SKB_CB(skb)->mptcp_flags = 0;
3994 + TCP_SKB_CB(skb)->dss_off = 0;
3995 +#endif
3996 TCP_SKB_CB(skb)->when = 0;
3997 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
3998 TCP_SKB_CB(skb)->sacked = 0;
4000 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
4001 - if (!sk)
4002 - goto no_tcp_socket;
4004 process:
4005 - if (sk->sk_state == TCP_TIME_WAIT)
4006 + if (sk && sk->sk_state == TCP_TIME_WAIT)
4007 goto do_time_wait;
4009 +#ifdef CONFIG_MPTCP
4010 + if (!sk && th->syn && !th->ack) {
4011 + int ret = mptcp_lookup_join(skb, NULL);
4013 + if (ret < 0) {
4014 + tcp_v4_send_reset(NULL, skb);
4015 + goto discard_it;
4016 + } else if (ret > 0) {
4017 + return 0;
4021 + /* Is there a pending request sock for this segment ? */
4022 + if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
4023 + if (sk)
4024 + sock_put(sk);
4025 + return 0;
4027 +#endif
4028 + if (!sk)
4029 + goto no_tcp_socket;
4031 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
4032 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
4033 goto discard_and_relse;
4034 @@ -1999,11 +2099,21 @@
4035 sk_mark_napi_id(sk, skb);
4036 skb->dev = NULL;
4038 - bh_lock_sock_nested(sk);
4039 + if (tcp_sk(sk)->mpc) {
4040 + meta_sk = mptcp_meta_sk(sk);
4042 + bh_lock_sock_nested(meta_sk);
4043 + if (sock_owned_by_user(meta_sk))
4044 + skb->sk = sk;
4045 + } else {
4046 + meta_sk = sk;
4047 + bh_lock_sock_nested(sk);
4050 ret = 0;
4051 - if (!sock_owned_by_user(sk)) {
4052 + if (!sock_owned_by_user(meta_sk)) {
4053 #ifdef CONFIG_NET_DMA
4054 - struct tcp_sock *tp = tcp_sk(sk);
4055 + struct tcp_sock *tp = tcp_sk(meta_sk);
4056 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
4057 tp->ucopy.dma_chan = net_dma_find_channel();
4058 if (tp->ucopy.dma_chan)
4059 @@ -2011,16 +2121,16 @@
4060 else
4061 #endif
4063 - if (!tcp_prequeue(sk, skb))
4064 + if (!tcp_prequeue(meta_sk, skb))
4065 ret = tcp_v4_do_rcv(sk, skb);
4067 - } else if (unlikely(sk_add_backlog(sk, skb,
4068 - sk->sk_rcvbuf + sk->sk_sndbuf))) {
4069 - bh_unlock_sock(sk);
4070 + } else if (unlikely(sk_add_backlog(meta_sk, skb,
4071 + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
4072 + bh_unlock_sock(meta_sk);
4073 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
4074 goto discard_and_relse;
4076 - bh_unlock_sock(sk);
4077 + bh_unlock_sock(meta_sk);
4079 sock_put(sk);
4081 @@ -2075,6 +2185,18 @@
4082 sk = sk2;
4083 goto process;
4085 +#ifdef CONFIG_MPTCP
4086 + if (th->syn && !th->ack) {
4087 + int ret = mptcp_lookup_join(skb, inet_twsk(sk));
4089 + if (ret < 0) {
4090 + tcp_v4_send_reset(NULL, skb);
4091 + goto discard_it;
4092 + } else if (ret > 0) {
4093 + return 0;
4096 +#endif
4097 /* Fall through to ACK */
4099 case TCP_TW_ACK:
4100 @@ -2158,6 +2280,11 @@
4102 tcp_cleanup_congestion_control(sk);
4104 + if (tp->mpc)
4105 + mptcp_destroy_sock(sk);
4106 + if (tp->inside_tk_table)
4107 + mptcp_hash_remove(tp);
4109 /* Cleanup up the write buffer. */
4110 tcp_write_queue_purge(sk);
4112 diff -Nur linux-3.14.45.orig/net/ipv4/tcp_minisocks.c linux-3.14.45/net/ipv4/tcp_minisocks.c
4113 --- linux-3.14.45.orig/net/ipv4/tcp_minisocks.c 2015-06-23 02:01:36.000000000 +0200
4114 +++ linux-3.14.45/net/ipv4/tcp_minisocks.c 2015-06-24 14:15:48.887862480 +0200
4115 @@ -18,11 +18,13 @@
4116 * Jorge Cwik, <jorge@laser.satlink.net>
4119 +#include <linux/kconfig.h>
4120 #include <linux/mm.h>
4121 #include <linux/module.h>
4122 #include <linux/slab.h>
4123 #include <linux/sysctl.h>
4124 #include <linux/workqueue.h>
4125 +#include <net/mptcp.h>
4126 #include <net/tcp.h>
4127 #include <net/inet_common.h>
4128 #include <net/xfrm.h>
4129 @@ -95,10 +97,13 @@
4130 struct tcp_options_received tmp_opt;
4131 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
4132 bool paws_reject = false;
4133 + struct mptcp_options_received mopt;
4135 tmp_opt.saw_tstamp = 0;
4136 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
4137 - tcp_parse_options(skb, &tmp_opt, 0, NULL);
4138 + mptcp_init_mp_opt(&mopt);
4140 + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
4142 if (tmp_opt.saw_tstamp) {
4143 tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
4144 @@ -106,6 +111,11 @@
4145 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
4146 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
4149 + if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
4150 + if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key)
4151 + goto kill_with_rst;
4155 if (tw->tw_substate == TCP_FIN_WAIT2) {
4156 @@ -128,6 +138,16 @@
4157 if (!th->ack ||
4158 !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
4159 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
4160 + /* If mptcp_is_data_fin() returns true, we are sure that
4161 + * mopt has been initialized - otherwise it would not
4162 + * be a DATA_FIN.
4163 + */
4164 + if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
4165 + mptcp_is_data_fin(skb) &&
4166 + TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
4167 + mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
4168 + return TCP_TW_ACK;
4170 inet_twsk_put(tw);
4171 return TCP_TW_SUCCESS;
4173 @@ -270,6 +290,11 @@
4174 const struct tcp_sock *tp = tcp_sk(sk);
4175 bool recycle_ok = false;
4177 + if (is_meta_sk(sk)) {
4178 + mptcp_update_tw_socks(tp, state);
4179 + goto tcp_done;
4182 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
4183 recycle_ok = tcp_remember_stamp(sk);
4185 @@ -290,6 +315,15 @@
4186 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
4187 tcptw->tw_ts_offset = tp->tsoffset;
4189 + if (tp->mpc) {
4190 + if (mptcp_time_wait(sk, tcptw)) {
4191 + inet_twsk_free(tw);
4192 + goto exit;
4194 + } else {
4195 + tcptw->mptcp_tw = NULL;
4198 #if IS_ENABLED(CONFIG_IPV6)
4199 if (tw->tw_family == PF_INET6) {
4200 struct ipv6_pinfo *np = inet6_sk(sk);
4201 @@ -347,15 +381,19 @@
4202 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
4205 +exit:
4206 tcp_update_metrics(sk);
4207 +tcp_done:
4208 tcp_done(sk);
4211 void tcp_twsk_destructor(struct sock *sk)
4213 -#ifdef CONFIG_TCP_MD5SIG
4214 struct tcp_timewait_sock *twsk = tcp_twsk(sk);
4216 + if (twsk->mptcp_tw)
4217 + mptcp_twsk_destructor(twsk);
4218 +#ifdef CONFIG_TCP_MD5SIG
4219 if (twsk->tw_md5_key)
4220 kfree_rcu(twsk->tw_md5_key, rcu);
4221 #endif
4222 @@ -392,6 +430,9 @@
4224 newtp->snd_sml = newtp->snd_una =
4225 newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
4226 +#ifdef CONFIG_MPTCP
4227 + memset(&newtp->rcvq_space, 0, sizeof(newtp->rcvq_space));
4228 +#endif
4230 tcp_prequeue_init(newtp);
4231 INIT_LIST_HEAD(&newtp->tsq_node);
4232 @@ -436,7 +477,11 @@
4234 newtp->urg_data = 0;
4236 - if (sock_flag(newsk, SOCK_KEEPOPEN))
4237 + /* MPTCP: If we are creating a subflow, KEEPOPEN might have been
4238 + * set on the meta. But, keepalive is entirely handled at the
4239 + * meta-socket, so let's keep it there.
4240 + */
4241 + if (sock_flag(newsk, SOCK_KEEPOPEN) && is_meta_sk(sk))
4242 inet_csk_reset_keepalive_timer(newsk,
4243 keepalive_time_when(newtp));
4245 @@ -468,6 +513,8 @@
4246 newtp->rx_opt.ts_recent_stamp = 0;
4247 newtp->tcp_header_len = sizeof(struct tcphdr);
4249 + if (treq->saw_mpc)
4250 + newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
4251 newtp->tsoffset = 0;
4252 #ifdef CONFIG_TCP_MD5SIG
4253 newtp->md5sig_info = NULL; /*XXX*/
4254 @@ -504,16 +551,20 @@
4255 bool fastopen)
4257 struct tcp_options_received tmp_opt;
4258 + struct mptcp_options_received mopt;
4259 struct sock *child;
4260 const struct tcphdr *th = tcp_hdr(skb);
4261 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
4262 bool paws_reject = false;
4264 - BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
4265 + BUG_ON(!tcp_sk(sk)->mpc && fastopen == (sk->sk_state == TCP_LISTEN));
4267 tmp_opt.saw_tstamp = 0;
4269 + mptcp_init_mp_opt(&mopt);
4271 if (th->doff > (sizeof(struct tcphdr)>>2)) {
4272 - tcp_parse_options(skb, &tmp_opt, 0, NULL);
4273 + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
4275 if (tmp_opt.saw_tstamp) {
4276 tmp_opt.ts_recent = req->ts_recent;
4277 @@ -552,7 +603,14 @@
4279 * Reset timer after retransmitting SYNACK, similar to
4280 * the idea of fast retransmit in recovery.
4282 + * Fall back to TCP if MP_CAPABLE is not set.
4285 + if (tcp_rsk(req)->saw_mpc && !mopt.saw_mpc)
4286 + tcp_rsk(req)->saw_mpc = false;
4289 if (!inet_rtx_syn_ack(sk, req))
4290 req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
4291 TCP_RTO_MAX) + jiffies;
4292 @@ -674,7 +732,20 @@
4294 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
4295 if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
4296 - TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
4297 + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1 &&
4298 + /* TODO MPTCP:
4299 + * We do this here, because otherwise options sent in the third ack,
4300 + * or duplicate fourth ack will get lost. Options like MP_PRIO, ADD_ADDR,...
4302 + * We could store them in request_sock, but this would mean that we
4303 + * have to put tcp_options_received and mptcp_options_received in there,
4304 + * increasing considerably the size of the request-sock.
4306 + * As soon as we have reworked the request-sock MPTCP-fields and
4307 + * created a mptcp_request_sock structure, we can handle options
4308 + * correctly there without increasing request_sock.
4309 + */
4310 + !tcp_rsk(req)->saw_mpc) {
4311 inet_rsk(req)->acked = 1;
4312 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
4313 return NULL;
4314 @@ -686,10 +757,29 @@
4315 * ESTABLISHED STATE. If it will be dropped after
4316 * socket is created, wait for troubles.
4318 - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
4319 +#ifdef CONFIG_MPTCP
4320 + if (tcp_sk(sk)->mpc)
4321 + /* MPTCP: We call the mptcp-specific syn_recv_sock */
4322 + child = tcp_sk(sk)->mpcb->syn_recv_sock(sk, skb, req, NULL);
4323 + else
4324 +#endif
4325 + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
4326 + req, NULL);
4328 if (child == NULL)
4329 goto listen_overflow;
4331 + if (!is_meta_sk(sk)) {
4332 + int ret = mptcp_check_req_master(sk, child, req, prev, &mopt);
4333 + if (ret < 0)
4334 + goto listen_overflow;
4336 + /* MPTCP-supported */
4337 + if (!ret)
4338 + return tcp_sk(child)->mpcb->master_sk;
4339 + } else {
4340 + return mptcp_check_req_child(sk, child, req, prev, &mopt);
4342 inet_csk_reqsk_queue_unlink(sk, req, prev);
4343 inet_csk_reqsk_queue_removed(sk, req);
4345 @@ -739,8 +829,9 @@
4347 int ret = 0;
4348 int state = child->sk_state;
4349 + struct sock *meta_sk = tcp_sk(child)->mpc ? mptcp_meta_sk(child) : child;
4351 - if (!sock_owned_by_user(child)) {
4352 + if (!sock_owned_by_user(meta_sk)) {
4353 ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
4354 skb->len);
4355 /* Wakeup parent, send SIGIO */
4356 @@ -751,10 +842,14 @@
4357 * in main socket hash table and lock on listening
4358 * socket does not protect us more.
4360 - __sk_add_backlog(child, skb);
4361 + if (tcp_sk(child)->mpc)
4362 + skb->sk = child;
4363 + __sk_add_backlog(meta_sk, skb);
4366 - bh_unlock_sock(child);
4367 + if (tcp_sk(child)->mpc)
4368 + bh_unlock_sock(child);
4369 + bh_unlock_sock(meta_sk);
4370 sock_put(child);
4371 return ret;
4373 diff -Nur linux-3.14.45.orig/net/ipv4/tcp_output.c linux-3.14.45/net/ipv4/tcp_output.c
4374 --- linux-3.14.45.orig/net/ipv4/tcp_output.c 2015-06-23 02:01:36.000000000 +0200
4375 +++ linux-3.14.45/net/ipv4/tcp_output.c 2015-06-24 14:15:48.887862480 +0200
4376 @@ -36,6 +36,12 @@
4378 #define pr_fmt(fmt) "TCP: " fmt
4380 +#include <net/mptcp.h>
4381 +#include <net/mptcp_v4.h>
4382 +#if IS_ENABLED(CONFIG_IPV6)
4383 +#include <net/mptcp_v6.h>
4384 +#endif
4385 +#include <net/ipv6.h>
4386 #include <net/tcp.h>
4388 #include <linux/compiler.h>
4389 @@ -72,7 +78,7 @@
4390 int push_one, gfp_t gfp);
4392 /* Account for new data that has been sent to the network. */
4393 -static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
4394 +void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
4396 struct inet_connection_sock *icsk = inet_csk(sk);
4397 struct tcp_sock *tp = tcp_sk(sk);
4398 @@ -211,7 +217,7 @@
4399 void tcp_select_initial_window(int __space, __u32 mss,
4400 __u32 *rcv_wnd, __u32 *window_clamp,
4401 int wscale_ok, __u8 *rcv_wscale,
4402 - __u32 init_rcv_wnd)
4403 + __u32 init_rcv_wnd, const struct sock *sk)
4405 unsigned int space = (__space < 0 ? 0 : __space);
4407 @@ -266,11 +272,15 @@
4408 * value can be stuffed directly into th->window for an outgoing
4409 * frame.
4411 -static u16 tcp_select_window(struct sock *sk)
4412 +u16 tcp_select_window(struct sock *sk)
4414 struct tcp_sock *tp = tcp_sk(sk);
4415 - u32 cur_win = tcp_receive_window(tp);
4416 - u32 new_win = __tcp_select_window(sk);
4417 + /* The window must never shrink at the meta-level. At the subflow we
4418 + * have to allow this. Otherwise we may announce a window too large
4419 + * for the current meta-level sk_rcvbuf.
4420 + */
4421 + u32 cur_win = tcp_receive_window(tp->mpc ? tcp_sk(mptcp_meta_sk(sk)) : tp);
4422 + u32 new_win = tp->__select_window(sk);
4424 /* Never shrink the offered window */
4425 if (new_win < cur_win) {
4426 @@ -283,6 +293,7 @@
4428 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
4431 tp->rcv_wnd = new_win;
4432 tp->rcv_wup = tp->rcv_nxt;
4434 @@ -361,7 +372,7 @@
4435 /* Constructs common control bits of non-data skb. If SYN/FIN is present,
4436 * auto increment end seqno.
4438 -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
4439 +void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
4441 struct skb_shared_info *shinfo = skb_shinfo(skb);
4443 @@ -381,7 +392,7 @@
4444 TCP_SKB_CB(skb)->end_seq = seq;
4447 -static inline bool tcp_urg_mode(const struct tcp_sock *tp)
4448 +bool tcp_urg_mode(const struct tcp_sock *tp)
4450 return tp->snd_una != tp->snd_up;
4452 @@ -391,17 +402,7 @@
4453 #define OPTION_MD5 (1 << 2)
4454 #define OPTION_WSCALE (1 << 3)
4455 #define OPTION_FAST_OPEN_COOKIE (1 << 8)
4457 -struct tcp_out_options {
4458 - u16 options; /* bit field of OPTION_* */
4459 - u16 mss; /* 0 to disable */
4460 - u8 ws; /* window scale, 0 to disable */
4461 - u8 num_sack_blocks; /* number of SACK blocks to include */
4462 - u8 hash_size; /* bytes in hash_location */
4463 - __u8 *hash_location; /* temporary pointer, overloaded */
4464 - __u32 tsval, tsecr; /* need to include OPTION_TS */
4465 - struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
4467 +/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */
4469 /* Write previously computed TCP options to the packet.
4471 @@ -417,7 +418,7 @@
4472 * (but it may well be that other scenarios fail similarly).
4474 static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
4475 - struct tcp_out_options *opts)
4476 + struct tcp_out_options *opts, struct sk_buff *skb)
4478 u16 options = opts->options; /* mungable copy */
4480 @@ -500,6 +501,9 @@
4482 ptr += (foc->len + 3) >> 2;
4485 + if (unlikely(OPTION_MPTCP & opts->options))
4486 + mptcp_options_write(ptr, tp, opts, skb);
4489 /* Compute TCP options for SYN packets. This is not the final
4490 @@ -551,6 +555,8 @@
4491 if (unlikely(!(OPTION_TS & opts->options)))
4492 remaining -= TCPOLEN_SACKPERM_ALIGNED;
4494 + if (tp->request_mptcp || tp->mpc)
4495 + mptcp_syn_options(sk, opts, &remaining);
4497 if (fastopen && fastopen->cookie.len >= 0) {
4498 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
4499 @@ -624,6 +630,9 @@
4503 + if (tcp_rsk(req)->saw_mpc)
4504 + mptcp_synack_options(req, opts, &remaining);
4506 return MAX_TCP_OPTION_SPACE - remaining;
4509 @@ -657,16 +666,22 @@
4510 opts->tsecr = tp->rx_opt.ts_recent;
4511 size += TCPOLEN_TSTAMP_ALIGNED;
4513 + if (tp->mpc)
4514 + mptcp_established_options(sk, skb, opts, &size);
4516 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
4517 if (unlikely(eff_sacks)) {
4518 - const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
4519 - opts->num_sack_blocks =
4520 - min_t(unsigned int, eff_sacks,
4521 - (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
4522 - TCPOLEN_SACK_PERBLOCK);
4523 - size += TCPOLEN_SACK_BASE_ALIGNED +
4524 - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
4525 + const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
4526 + if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
4527 + opts->num_sack_blocks = 0;
4528 + else
4529 + opts->num_sack_blocks =
4530 + min_t(unsigned int, eff_sacks,
4531 + (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
4532 + TCPOLEN_SACK_PERBLOCK);
4533 + if (opts->num_sack_blocks)
4534 + size += TCPOLEN_SACK_BASE_ALIGNED +
4535 + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
4538 return size;
4539 @@ -714,7 +729,7 @@
4540 unsigned long flags;
4541 struct list_head *q, *n;
4542 struct tcp_sock *tp;
4543 - struct sock *sk;
4544 + struct sock *sk, *meta_sk;
4546 local_irq_save(flags);
4547 list_splice_init(&tsq->head, &list);
4548 @@ -725,15 +740,27 @@
4549 list_del(&tp->tsq_node);
4551 sk = (struct sock *)tp;
4552 - bh_lock_sock(sk);
4553 + meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
4554 + bh_lock_sock(meta_sk);
4556 - if (!sock_owned_by_user(sk)) {
4557 + if (!sock_owned_by_user(meta_sk)) {
4558 tcp_tsq_handler(sk);
4559 + if (tp->mpc)
4560 + tcp_tsq_handler(meta_sk);
4561 } else {
4562 /* defer the work to tcp_release_cb() */
4563 set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
4565 + /* For MPTCP, we set the tsq-bit on the meta, and the
4566 + * subflow as we don't know if the limitation happened
4567 + * while inside mptcp_write_xmit or during tcp_write_xmit.
4568 + */
4569 + if (tp->mpc) {
4570 + set_bit(TCP_TSQ_DEFERRED, &tcp_sk(meta_sk)->tsq_flags);
4571 + mptcp_tsq_flags(sk);
4574 - bh_unlock_sock(sk);
4575 + bh_unlock_sock(meta_sk);
4577 clear_bit(TSQ_QUEUED, &tp->tsq_flags);
4578 sk_free(sk);
4579 @@ -743,7 +770,10 @@
4580 #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
4581 (1UL << TCP_WRITE_TIMER_DEFERRED) | \
4582 (1UL << TCP_DELACK_TIMER_DEFERRED) | \
4583 - (1UL << TCP_MTU_REDUCED_DEFERRED))
4584 + (1UL << TCP_MTU_REDUCED_DEFERRED) | \
4585 + (1UL << MPTCP_PATH_MANAGER) | \
4586 + (1UL << MPTCP_SUB_DEFERRED))
4589 * tcp_release_cb - tcp release_sock() callback
4590 * @sk: socket
4591 @@ -790,6 +820,13 @@
4592 inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
4593 __sock_put(sk);
4595 + if (flags & (1UL << MPTCP_PATH_MANAGER)) {
4596 + if (tcp_sk(sk)->mpcb->pm_ops->release_sock)
4597 + tcp_sk(sk)->mpcb->pm_ops->release_sock(sk);
4598 + __sock_put(sk);
4600 + if (flags & (1UL << MPTCP_SUB_DEFERRED))
4601 + mptcp_tsq_sub_deferred(sk);
4603 EXPORT_SYMBOL(tcp_release_cb);
4605 @@ -849,8 +886,8 @@
4606 * We are working here with either a clone of the original
4607 * SKB, or a fresh unique copy made by the retransmit engine.
4609 -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
4610 - gfp_t gfp_mask)
4611 +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
4612 + gfp_t gfp_mask)
4614 const struct inet_connection_sock *icsk = inet_csk(sk);
4615 struct inet_sock *inet;
4616 @@ -878,10 +915,28 @@
4617 NET_INC_STATS(sock_net(sk),
4618 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
4620 - if (unlikely(skb_cloned(skb)))
4621 - skb = pskb_copy(skb, gfp_mask);
4622 - else
4623 + if (unlikely(skb_cloned(skb))) {
4624 + struct sk_buff *newskb;
4625 + if (mptcp_is_data_seq(skb))
4626 + skb_push(skb, MPTCP_SUB_LEN_DSS_ALIGN +
4627 + MPTCP_SUB_LEN_ACK_ALIGN +
4628 + MPTCP_SUB_LEN_SEQ_ALIGN);
4630 + newskb = pskb_copy(skb, gfp_mask);
4632 + if (mptcp_is_data_seq(skb)) {
4633 + skb_pull(skb, MPTCP_SUB_LEN_DSS_ALIGN +
4634 + MPTCP_SUB_LEN_ACK_ALIGN +
4635 + MPTCP_SUB_LEN_SEQ_ALIGN);
4636 + if (newskb)
4637 + skb_pull(newskb, MPTCP_SUB_LEN_DSS_ALIGN +
4638 + MPTCP_SUB_LEN_ACK_ALIGN +
4639 + MPTCP_SUB_LEN_SEQ_ALIGN);
4641 + skb = newskb;
4642 + } else {
4643 skb = skb_clone(skb, gfp_mask);
4645 if (unlikely(!skb))
4646 return -ENOBUFS;
4648 @@ -929,7 +984,7 @@
4650 th->window = htons(min(tp->rcv_wnd, 65535U));
4651 } else {
4652 - th->window = htons(tcp_select_window(sk));
4653 + th->window = htons(tp->select_window(sk));
4655 th->check = 0;
4656 th->urg_ptr = 0;
4657 @@ -945,7 +1000,7 @@
4661 - tcp_options_write((__be32 *)(th + 1), tp, &opts);
4662 + tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
4663 if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
4664 TCP_ECN_send(sk, skb, tcp_header_size);
4666 @@ -984,7 +1039,7 @@
4667 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
4668 * otherwise socket can stall.
4670 -static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
4671 +void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
4673 struct tcp_sock *tp = tcp_sk(sk);
4675 @@ -997,15 +1052,16 @@
4678 /* Initialize TSO segments for a packet. */
4679 -static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
4680 - unsigned int mss_now)
4681 +void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
4682 + unsigned int mss_now)
4684 struct skb_shared_info *shinfo = skb_shinfo(skb);
4686 /* Make sure we own this skb before messing gso_size/gso_segs */
4687 WARN_ON_ONCE(skb_cloned(skb));
4689 - if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
4690 + if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) ||
4691 + (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) {
4692 /* Avoid the costly divide in the normal
4693 * non-TSO case.
4695 @@ -1037,7 +1093,7 @@
4696 /* Pcount in the middle of the write queue got changed, we need to do various
4697 * tweaks to fix counters
4699 -static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
4700 +void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
4702 struct tcp_sock *tp = tcp_sk(sk);
4704 @@ -1078,6 +1134,9 @@
4705 int nlen;
4706 u8 flags;
4708 + if (tcp_sk(sk)->mpc && mptcp_is_data_seq(skb))
4709 + mptcp_fragment(sk, skb, len, mss_now, 0);
4711 if (WARN_ON(len > skb->len))
4712 return -EINVAL;
4714 @@ -1160,7 +1219,7 @@
4715 * eventually). The difference is that pulled data not copied, but
4716 * immediately discarded.
4718 -static void __pskb_trim_head(struct sk_buff *skb, int len)
4719 +void __pskb_trim_head(struct sk_buff *skb, int len)
4721 struct skb_shared_info *shinfo;
4722 int i, k, eat;
4723 @@ -1201,6 +1260,9 @@
4724 /* Remove acked data from a packet in the transmit queue. */
4725 int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
4727 + if (tcp_sk(sk)->mpc && !is_meta_sk(sk) && mptcp_is_data_seq(skb))
4728 + return mptcp_trim_head(sk, skb, len);
4730 if (skb_unclone(skb, GFP_ATOMIC))
4731 return -ENOMEM;
4733 @@ -1218,6 +1280,15 @@
4734 if (tcp_skb_pcount(skb) > 1)
4735 tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
4737 +#ifdef CONFIG_MPTCP
4738 + /* Some data got acked - we assume that the seq-number reached the dest.
4739 + * Anyway, our MPTCP-option has been trimmed above - we lost it here.
4740 + * Only remove the SEQ if the call does not come from a meta retransmit.
4741 + */
4742 + if (tcp_sk(sk)->mpc && !is_meta_sk(sk))
4743 + TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ;
4744 +#endif
4746 return 0;
4749 @@ -1377,7 +1448,7 @@
4752 /* Congestion window validation. (RFC2861) */
4753 -static void tcp_cwnd_validate(struct sock *sk)
4754 +void tcp_cwnd_validate(struct sock *sk)
4756 struct tcp_sock *tp = tcp_sk(sk);
4758 @@ -1411,8 +1482,8 @@
4759 * But we can avoid doing the divide again given we already have
4760 * skb_pcount = skb->len / mss_now
4762 -static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
4763 - const struct sk_buff *skb)
4764 +void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
4765 + const struct sk_buff *skb)
4767 if (skb->len < tcp_skb_pcount(skb) * mss_now)
4768 tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
4769 @@ -1433,19 +1504,28 @@
4770 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
4772 /* Returns the portion of skb which can be sent right away */
4773 -static unsigned int tcp_mss_split_point(const struct sock *sk,
4774 - const struct sk_buff *skb,
4775 - unsigned int mss_now,
4776 - unsigned int max_segs,
4777 - int nonagle)
4778 +unsigned int tcp_mss_split_point(const struct sock *sk,
4779 + const struct sk_buff *skb,
4780 + unsigned int mss_now,
4781 + unsigned int max_segs,
4782 + int nonagle)
4784 const struct tcp_sock *tp = tcp_sk(sk);
4785 + const struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
4786 u32 partial, needed, window, max_len;
4788 - window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
4789 + if (!tp->mpc)
4790 + window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
4791 + else
4792 + /* We need to evaluate the available space in the sending window
4793 + * at the subflow level. However, the subflow seq has not yet
4794 + * been set. Nevertheless we know that the caller will set it to
4795 + * write_seq.
4796 + */
4797 + window = tcp_wnd_end(tp) - tp->write_seq;
4798 max_len = mss_now * max_segs;
4800 - if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
4801 + if (likely(max_len <= window && skb != tcp_write_queue_tail(meta_sk)))
4802 return max_len;
4804 needed = min(skb->len, window);
4805 @@ -1467,13 +1547,14 @@
4806 /* Can at least one segment of SKB be sent right now, according to the
4807 * congestion window rules? If so, return how many segments are allowed.
4809 -static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
4810 - const struct sk_buff *skb)
4811 +unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
4812 + const struct sk_buff *skb)
4814 u32 in_flight, cwnd;
4816 /* Don't be strict about the congestion window for the final FIN. */
4817 - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
4818 + if (skb &&
4819 + ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb)) &&
4820 tcp_skb_pcount(skb) == 1)
4821 return 1;
4823 @@ -1489,8 +1570,8 @@
4824 * This must be invoked the first time we consider transmitting
4825 * SKB onto the wire.
4827 -static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
4828 - unsigned int mss_now)
4829 +int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
4830 + unsigned int mss_now)
4832 int tso_segs = tcp_skb_pcount(skb);
4834 @@ -1505,8 +1586,8 @@
4835 /* Return true if the Nagle test allows this packet to be
4836 * sent now.
4838 -static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
4839 - unsigned int cur_mss, int nonagle)
4840 +bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
4841 + unsigned int cur_mss, int nonagle)
4843 /* Nagle rule does not apply to frames, which sit in the middle of the
4844 * write_queue (they have no chances to get new data).
4845 @@ -1518,7 +1599,8 @@
4846 return true;
4848 /* Don't use the nagle rule for urgent data (or for the final FIN). */
4849 - if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
4850 + if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
4851 + mptcp_is_data_fin(skb))
4852 return true;
4854 if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle))
4855 @@ -1528,9 +1610,8 @@
4858 /* Does at least the first segment of SKB fit into the send window? */
4859 -static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
4860 - const struct sk_buff *skb,
4861 - unsigned int cur_mss)
4862 +bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
4863 + unsigned int cur_mss)
4865 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4867 @@ -1549,14 +1630,16 @@
4869 const struct tcp_sock *tp = tcp_sk(sk);
4870 unsigned int cwnd_quota;
4871 + const struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
4872 + const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
4874 - tcp_init_tso_segs(sk, skb, cur_mss);
4875 + tcp_init_tso_segs(meta_sk, skb, cur_mss);
4877 - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
4878 + if (!tcp_nagle_test(meta_tp, skb, cur_mss, nonagle))
4879 return 0;
4881 cwnd_quota = tcp_cwnd_test(tp, skb);
4882 - if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
4883 + if (cwnd_quota && !tcp_snd_wnd_test(meta_tp, skb, cur_mss))
4884 cwnd_quota = 0;
4886 return cwnd_quota;
4887 @@ -1566,12 +1649,16 @@
4888 bool tcp_may_send_now(struct sock *sk)
4890 const struct tcp_sock *tp = tcp_sk(sk);
4891 - struct sk_buff *skb = tcp_send_head(sk);
4892 + struct sk_buff *skb;
4893 + const struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
4894 + const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
4896 + skb = tcp_send_head(meta_sk);
4898 return skb &&
4899 tcp_snd_test(sk, skb, tcp_current_mss(sk),
4900 - (tcp_skb_is_last(sk, skb) ?
4901 - tp->nonagle : TCP_NAGLE_PUSH));
4902 + (tcp_skb_is_last(meta_sk, skb) ?
4903 + meta_tp->nonagle : TCP_NAGLE_PUSH));
4906 /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
4907 @@ -1588,6 +1675,9 @@
4908 int nlen = skb->len - len;
4909 u8 flags;
4911 + if (tcp_sk(sk)->mpc && mptcp_is_data_seq(skb))
4912 + mptso_fragment(sk, skb, len, mss_now, gfp, 0);
4914 /* All of a TSO frame must be composed of paged data. */
4915 if (skb->len != skb->data_len)
4916 return tcp_fragment(sk, skb, len, mss_now);
4917 @@ -1633,29 +1723,39 @@
4919 * This algorithm is from John Heffner.
4921 -static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
4922 +bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
4924 struct tcp_sock *tp = tcp_sk(sk);
4925 + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
4926 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
4927 const struct inet_connection_sock *icsk = inet_csk(sk);
4928 u32 send_win, cong_win, limit, in_flight;
4929 int win_divisor;
4931 - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
4932 + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb))
4933 goto send_now;
4935 if (icsk->icsk_ca_state != TCP_CA_Open)
4936 goto send_now;
4938 /* Defer for less than two clock ticks. */
4939 - if (tp->tso_deferred &&
4940 - (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
4941 + if (meta_tp->tso_deferred &&
4942 + (((u32)jiffies << 1) >> 1) - (meta_tp->tso_deferred >> 1) > 1)
4943 goto send_now;
4945 in_flight = tcp_packets_in_flight(tp);
4947 BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
4949 - send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
4950 + if (!tp->mpc)
4951 + send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
4952 + else
4953 + /* We need to evaluate the available space in the sending window
4954 + * at the subflow level. However, the subflow seq has not yet
4955 + * been set. Nevertheless we know that the caller will set it to
4956 + * write_seq.
4957 + */
4958 + send_win = tcp_wnd_end(tp) - tp->write_seq;
4960 /* From in_flight test above, we know that cwnd > in_flight. */
4961 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
4962 @@ -1668,7 +1768,7 @@
4963 goto send_now;
4965 /* Middle in queue won't get any more data, full sendable already? */
4966 - if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
4967 + if ((skb != tcp_write_queue_tail(meta_sk)) && (limit >= skb->len))
4968 goto send_now;
4970 win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
4971 @@ -1694,13 +1794,13 @@
4972 /* Ok, it looks like it is advisable to defer.
4973 * Do not rearm the timer if already set to not break TCP ACK clocking.
4975 - if (!tp->tso_deferred)
4976 - tp->tso_deferred = 1 | (jiffies << 1);
4977 + if (!meta_tp->tso_deferred)
4978 + meta_tp->tso_deferred = 1 | (jiffies << 1);
4980 return true;
4982 send_now:
4983 - tp->tso_deferred = 0;
4984 + meta_tp->tso_deferred = 0;
4985 return false;
4988 @@ -1713,7 +1813,7 @@
4989 * 1 if a probe was sent,
4990 * -1 otherwise
4992 -static int tcp_mtu_probe(struct sock *sk)
4993 +int tcp_mtu_probe(struct sock *sk)
4995 struct tcp_sock *tp = tcp_sk(sk);
4996 struct inet_connection_sock *icsk = inet_csk(sk);
4997 @@ -1858,6 +1958,9 @@
4998 int cwnd_quota;
4999 int result;
5001 + if (is_meta_sk(sk))
5002 + return mptcp_write_xmit(sk, mss_now, nonagle, push_one, gfp);
5004 sent_pkts = 0;
5006 if (!push_one) {
5007 @@ -2314,6 +2417,10 @@
5008 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
5009 return;
5011 + /* Currently not supported for MPTCP - but it should be possible */
5012 + if (tp->mpc)
5013 + return;
5015 tcp_for_write_queue_from_safe(skb, tmp, sk) {
5016 if (!tcp_can_collapse(sk, skb))
5017 break;
5018 @@ -2411,10 +2518,26 @@
5020 if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
5021 skb_headroom(skb) >= 0xFFFF)) {
5022 - struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
5023 - GFP_ATOMIC);
5024 + struct sk_buff *nskb;
5026 + if (mptcp_is_data_seq(skb))
5027 + skb_push(skb, MPTCP_SUB_LEN_DSS_ALIGN +
5028 + MPTCP_SUB_LEN_ACK_ALIGN +
5029 + MPTCP_SUB_LEN_SEQ_ALIGN);
5031 + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
5033 + if (mptcp_is_data_seq(skb)) {
5034 + skb_pull(skb, MPTCP_SUB_LEN_DSS_ALIGN +
5035 + MPTCP_SUB_LEN_ACK_ALIGN +
5036 + MPTCP_SUB_LEN_SEQ_ALIGN);
5037 + if (nskb)
5038 + skb_pull(nskb, MPTCP_SUB_LEN_DSS_ALIGN +
5039 + MPTCP_SUB_LEN_ACK_ALIGN +
5040 + MPTCP_SUB_LEN_SEQ_ALIGN);
5042 err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
5043 - -ENOBUFS;
5044 + -ENOBUFS;
5045 } else {
5046 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
5048 @@ -2665,6 +2788,11 @@
5050 struct sk_buff *skb;
5052 + if (is_meta_sk(sk)) {
5053 + mptcp_send_active_reset(sk, priority);
5054 + return;
5057 /* NOTE: No TCP options attached and we never retransmit this. */
5058 skb = alloc_skb(MAX_TCP_HEADER, priority);
5059 if (!skb) {
5060 @@ -2767,14 +2895,14 @@
5061 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
5062 req->window_clamp = tcp_full_space(sk);
5064 - /* tcp_full_space because it is guaranteed to be the first packet */
5065 - tcp_select_initial_window(tcp_full_space(sk),
5066 - mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
5067 + tp->select_initial_window(tcp_full_space(sk),
5068 + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
5069 + (tcp_rsk(req)->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
5070 &req->rcv_wnd,
5071 &req->window_clamp,
5072 ireq->wscale_ok,
5073 &rcv_wscale,
5074 - dst_metric(dst, RTAX_INITRWND));
5075 + dst_metric(dst, RTAX_INITRWND), sk);
5076 ireq->rcv_wscale = rcv_wscale;
5079 @@ -2810,7 +2938,7 @@
5081 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
5082 th->window = htons(min(req->rcv_wnd, 65535U));
5083 - tcp_options_write((__be32 *)(th + 1), tp, &opts);
5084 + tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
5085 th->doff = (tcp_header_size >> 2);
5086 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
5088 @@ -2866,13 +2994,13 @@
5089 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
5090 tp->window_clamp = tcp_full_space(sk);
5092 - tcp_select_initial_window(tcp_full_space(sk),
5093 + tp->select_initial_window(tcp_full_space(sk),
5094 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
5095 &tp->rcv_wnd,
5096 &tp->window_clamp,
5097 sysctl_tcp_window_scaling,
5098 &rcv_wscale,
5099 - dst_metric(dst, RTAX_INITRWND));
5100 + dst_metric(dst, RTAX_INITRWND), sk);
5102 tp->rx_opt.rcv_wscale = rcv_wscale;
5103 tp->rcv_ssthresh = tp->rcv_wnd;
5104 @@ -2896,6 +3024,38 @@
5105 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
5106 inet_csk(sk)->icsk_retransmits = 0;
5107 tcp_clear_retrans(tp);
5109 +#ifdef CONFIG_MPTCP
5110 + if (sysctl_mptcp_enabled && mptcp_doit(sk)) {
5111 + if (is_master_tp(tp)) {
5112 + tp->request_mptcp = 1;
5113 + mptcp_connect_init(sk);
5114 + } else if (tp->mptcp) {
5115 + struct inet_sock *inet = inet_sk(sk);
5117 + tp->mptcp->snt_isn = tp->write_seq;
5118 + tp->mptcp->init_rcv_wnd = tp->rcv_wnd;
5120 + /* Set nonce for new subflows */
5121 + if (sk->sk_family == AF_INET)
5122 + tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(
5123 + inet->inet_saddr,
5124 + inet->inet_daddr,
5125 + inet->inet_sport,
5126 + inet->inet_dport,
5127 + tp->write_seq);
5128 +#if IS_ENABLED(CONFIG_IPV6)
5129 + else
5130 + tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(
5131 + inet6_sk(sk)->saddr.s6_addr32,
5132 + sk->sk_v6_daddr.s6_addr32,
5133 + inet->inet_sport,
5134 + inet->inet_dport,
5135 + tp->write_seq);
5136 +#endif
5139 +#endif
5142 static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
5143 @@ -3131,6 +3291,7 @@
5144 TCP_SKB_CB(buff)->when = tcp_time_stamp;
5145 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
5147 +EXPORT_SYMBOL(tcp_send_ack);
5149 /* This routine sends a packet with an out of date sequence
5150 * number. It assumes the other end will try to ack it.
5151 @@ -3143,7 +3304,7 @@
5152 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
5153 * out-of-date with SND.UNA-1 to probe window.
5155 -static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
5156 +int tcp_xmit_probe_skb(struct sock *sk, int urgent)
5158 struct tcp_sock *tp = tcp_sk(sk);
5159 struct sk_buff *skb;
5160 @@ -3181,6 +3342,9 @@
5161 if (sk->sk_state == TCP_CLOSE)
5162 return -1;
5164 + if (is_meta_sk(sk))
5165 + return mptcp_write_wakeup(sk);
5167 if ((skb = tcp_send_head(sk)) != NULL &&
5168 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
5169 int err;
5170 diff -Nur linux-3.14.45.orig/net/ipv4/tcp_timer.c linux-3.14.45/net/ipv4/tcp_timer.c
5171 --- linux-3.14.45.orig/net/ipv4/tcp_timer.c 2015-06-23 02:01:36.000000000 +0200
5172 +++ linux-3.14.45/net/ipv4/tcp_timer.c 2015-06-24 14:15:48.891862483 +0200
5173 @@ -20,6 +20,7 @@
5175 #include <linux/module.h>
5176 #include <linux/gfp.h>
5177 +#include <net/mptcp.h>
5178 #include <net/tcp.h>
5180 int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
5181 @@ -32,7 +33,7 @@
5182 int sysctl_tcp_orphan_retries __read_mostly;
5183 int sysctl_tcp_thin_linear_timeouts __read_mostly;
5185 -static void tcp_write_err(struct sock *sk)
5186 +void tcp_write_err(struct sock *sk)
5188 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
5189 sk->sk_error_report(sk);
5190 @@ -124,10 +125,8 @@
5191 * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
5192 * syn_set flag is set.
5194 -static bool retransmits_timed_out(struct sock *sk,
5195 - unsigned int boundary,
5196 - unsigned int timeout,
5197 - bool syn_set)
5198 +bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
5199 + unsigned int timeout, bool syn_set)
5201 unsigned int linear_backoff_thresh, start_ts;
5202 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
5203 @@ -153,7 +152,7 @@
5206 /* A write timeout has occurred. Process the after effects. */
5207 -static int tcp_write_timeout(struct sock *sk)
5208 +int tcp_write_timeout(struct sock *sk)
5210 struct inet_connection_sock *icsk = inet_csk(sk);
5211 struct tcp_sock *tp = tcp_sk(sk);
5212 @@ -168,6 +167,10 @@
5214 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
5215 syn_set = true;
5216 + /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */
5217 + if (tcp_sk(sk)->request_mptcp &&
5218 + icsk->icsk_retransmits >= mptcp_sysctl_syn_retries())
5219 + tcp_sk(sk)->request_mptcp = 0;
5220 } else {
5221 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
5222 /* Black hole detection */
5223 @@ -248,18 +251,22 @@
5224 static void tcp_delack_timer(unsigned long data)
5226 struct sock *sk = (struct sock *)data;
5227 + struct tcp_sock *tp = tcp_sk(sk);
5228 + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
5230 - bh_lock_sock(sk);
5231 - if (!sock_owned_by_user(sk)) {
5232 + bh_lock_sock(meta_sk);
5233 + if (!sock_owned_by_user(meta_sk)) {
5234 tcp_delack_timer_handler(sk);
5235 } else {
5236 inet_csk(sk)->icsk_ack.blocked = 1;
5237 - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
5238 + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED);
5239 /* deleguate our work to tcp_release_cb() */
5240 if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
5241 sock_hold(sk);
5242 + if (tp->mpc)
5243 + mptcp_tsq_flags(sk);
5245 - bh_unlock_sock(sk);
5246 + bh_unlock_sock(meta_sk);
5247 sock_put(sk);
5250 @@ -421,6 +428,9 @@
5252 tcp_enter_loss(sk, 0);
5254 + if (tp->mpc)
5255 + mptcp_reinject_data(sk, 1);
5257 if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
5258 /* Retransmission failed because of local congestion,
5259 * do not backoff.
5260 @@ -471,6 +481,8 @@
5261 /* Use normal (exponential) backoff */
5262 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
5264 + if (tp->mpc)
5265 + mptcp_set_rto(sk);
5266 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
5267 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
5268 __sk_dst_reset(sk);
5269 @@ -502,7 +514,10 @@
5270 break;
5271 case ICSK_TIME_RETRANS:
5272 icsk->icsk_pending = 0;
5273 - tcp_retransmit_timer(sk);
5274 + if (is_meta_sk(sk))
5275 + mptcp_retransmit_timer(sk);
5276 + else
5277 + tcp_retransmit_timer(sk);
5278 break;
5279 case ICSK_TIME_PROBE0:
5280 icsk->icsk_pending = 0;
5281 @@ -517,16 +532,19 @@
5282 static void tcp_write_timer(unsigned long data)
5284 struct sock *sk = (struct sock *)data;
5285 + struct sock *meta_sk = tcp_sk(sk)->mpc ? mptcp_meta_sk(sk) : sk;
5287 - bh_lock_sock(sk);
5288 - if (!sock_owned_by_user(sk)) {
5289 + bh_lock_sock(meta_sk);
5290 + if (!sock_owned_by_user(meta_sk)) {
5291 tcp_write_timer_handler(sk);
5292 } else {
5293 /* deleguate our work to tcp_release_cb() */
5294 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
5295 sock_hold(sk);
5296 + if (tcp_sk(sk)->mpc)
5297 + mptcp_tsq_flags(sk);
5299 - bh_unlock_sock(sk);
5300 + bh_unlock_sock(meta_sk);
5301 sock_put(sk);
5304 @@ -563,11 +581,12 @@
5305 struct sock *sk = (struct sock *) data;
5306 struct inet_connection_sock *icsk = inet_csk(sk);
5307 struct tcp_sock *tp = tcp_sk(sk);
5308 + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
5309 u32 elapsed;
5311 /* Only process if socket is not in use. */
5312 - bh_lock_sock(sk);
5313 - if (sock_owned_by_user(sk)) {
5314 + bh_lock_sock(meta_sk);
5315 + if (sock_owned_by_user(meta_sk)) {
5316 /* Try again later. */
5317 inet_csk_reset_keepalive_timer (sk, HZ/20);
5318 goto out;
5319 @@ -578,6 +597,29 @@
5320 goto out;
5323 + if (tp->send_mp_fclose) {
5324 + /* MUST do this before tcp_write_timeout, because retrans_stamp
5325 + * may have been set to 0 in another part while we are
5326 + * retransmitting MP_FASTCLOSE. Then, we would crash, because
5327 + * retransmits_timed_out accesses the meta-write-queue.
5329 + * We make sure that the timestamp is != 0.
5330 + */
5331 + if (!tp->retrans_stamp)
5332 + tp->retrans_stamp = tcp_time_stamp ? : 1;
5334 + if (tcp_write_timeout(sk))
5335 + goto out;
5337 + tcp_send_ack(sk);
5338 + icsk->icsk_backoff++;
5339 + icsk->icsk_retransmits++;
5341 + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
5342 + elapsed = icsk->icsk_rto;
5343 + goto resched;
5346 if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
5347 if (tp->linger2 >= 0) {
5348 const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
5349 @@ -639,7 +681,7 @@
5350 tcp_done(sk);
5352 out:
5353 - bh_unlock_sock(sk);
5354 + bh_unlock_sock(meta_sk);
5355 sock_put(sk);
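
The tcp_timer.c hunks above all follow one pattern: when a subflow belongs to an MPTCP connection (tp->mpc), the handler locks and tests ownership of the meta-socket instead of the subflow socket, so deferred work is serialised at the connection level. Below is a minimal user-space sketch of that pattern only; the struct and helpers are hypothetical stand-ins for the kernel's struct sock, mptcp_meta_sk(), bh_lock_sock() and sock_owned_by_user(), not the real implementation.

/* Illustration only: "lock the meta-socket" pattern from the timer hunks. */
#include <stdio.h>
#include <stdbool.h>

struct fake_sock {
	bool mpc;                 /* subflow belongs to an MPTCP connection */
	bool owned_by_user;       /* user context currently holds the lock  */
	struct fake_sock *meta;   /* meta-socket, valid only when mpc       */
};

static struct fake_sock *pick_lock_target(struct fake_sock *sk)
{
	/* Mirrors: meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk; */
	return sk->mpc ? sk->meta : sk;
}

static void timer_handler(struct fake_sock *sk)
{
	struct fake_sock *meta_sk = pick_lock_target(sk);

	if (!meta_sk->owned_by_user)
		printf("run handler directly on subflow %p\n", (void *)sk);
	else
		printf("defer work, meta-socket %p is owned by user\n",
		       (void *)meta_sk);
}

int main(void)
{
	struct fake_sock meta = { .mpc = false, .owned_by_user = true };
	struct fake_sock sub  = { .mpc = true,  .owned_by_user = false,
				  .meta = &meta };

	timer_handler(&sub);   /* defers: meta-level lock is held */
	meta.owned_by_user = false;
	timer_handler(&sub);   /* runs: meta-level lock is free   */
	return 0;
}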
5358 diff -Nur linux-3.14.45.orig/net/ipv6/addrconf.c linux-3.14.45/net/ipv6/addrconf.c
5359 --- linux-3.14.45.orig/net/ipv6/addrconf.c 2015-06-23 02:01:36.000000000 +0200
5360 +++ linux-3.14.45/net/ipv6/addrconf.c 2015-06-24 14:15:48.891862483 +0200
5361 @@ -765,6 +765,7 @@
5363 kfree_rcu(ifp, rcu);
5365 +EXPORT_SYMBOL(inet6_ifa_finish_destroy);
5367 static void
5368 ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
5369 diff -Nur linux-3.14.45.orig/net/ipv6/af_inet6.c linux-3.14.45/net/ipv6/af_inet6.c
5370 --- linux-3.14.45.orig/net/ipv6/af_inet6.c 2015-06-23 02:01:36.000000000 +0200
5371 +++ linux-3.14.45/net/ipv6/af_inet6.c 2015-06-24 14:15:48.891862483 +0200
5372 @@ -97,8 +97,7 @@
5373 return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
5376 -static int inet6_create(struct net *net, struct socket *sock, int protocol,
5377 - int kern)
5378 +int inet6_create(struct net *net, struct socket *sock, int protocol, int kern)
5380 struct inet_sock *inet;
5381 struct ipv6_pinfo *np;
5382 diff -Nur linux-3.14.45.orig/net/ipv6/inet6_connection_sock.c linux-3.14.45/net/ipv6/inet6_connection_sock.c
5383 --- linux-3.14.45.orig/net/ipv6/inet6_connection_sock.c 2015-06-23 02:01:36.000000000 +0200
5384 +++ linux-3.14.45/net/ipv6/inet6_connection_sock.c 2015-06-24 14:15:48.891862483 +0200
5385 @@ -96,8 +96,8 @@
5387 * request_sock (formerly open request) hash tables.
5389 -static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
5390 - const u32 rnd, const u32 synq_hsize)
5391 +u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
5392 + const u32 rnd, const u32 synq_hsize)
5394 u32 c;
5396 diff -Nur linux-3.14.45.orig/net/ipv6/syncookies.c linux-3.14.45/net/ipv6/syncookies.c
5397 --- linux-3.14.45.orig/net/ipv6/syncookies.c 2015-06-23 02:01:36.000000000 +0200
5398 +++ linux-3.14.45/net/ipv6/syncookies.c 2015-06-24 14:15:48.891862483 +0200
5399 @@ -181,7 +181,7 @@
5401 /* check for timestamp cookie support */
5402 memset(&tcp_opt, 0, sizeof(tcp_opt));
5403 - tcp_parse_options(skb, &tcp_opt, 0, NULL);
5404 + tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
5406 if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
5407 goto out;
5408 @@ -253,10 +253,10 @@
5411 req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
5412 - tcp_select_initial_window(tcp_full_space(sk), req->mss,
5413 + tp->select_initial_window(tcp_full_space(sk), req->mss,
5414 &req->rcv_wnd, &req->window_clamp,
5415 ireq->wscale_ok, &rcv_wscale,
5416 - dst_metric(dst, RTAX_INITRWND));
5417 + dst_metric(dst, RTAX_INITRWND), sk);
5419 ireq->rcv_wscale = rcv_wscale;
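
The syncookies.c hunk shows the two interface changes this patch threads through the stack: tcp_parse_options() gains a struct mptcp_options_received * argument (callers that do not care about MPTCP, such as the cxgb4 and syncookie paths, pass NULL), and the initial-window computation goes through tp->select_initial_window(), which now also receives the socket. The sketch below only illustrates the optional-out-parameter idea; the types and the parse function are hypothetical, not the kernel API.

/* Illustration only: growing a parser by an optional MPTCP out-parameter. */
#include <stdio.h>
#include <stddef.h>

struct tcp_opts   { int saw_tstamp; };
struct mptcp_opts { int saw_mpc; };

/* NULL for the extra argument means "skip MPTCP option parsing". */
static void parse_options(const char *wire, struct tcp_opts *opt,
			  struct mptcp_opts *mopt)
{
	opt->saw_tstamp = (wire[0] == 'T');
	if (mopt)
		mopt->saw_mpc = (wire[1] == 'M');
}

int main(void)
{
	struct tcp_opts opt;
	struct mptcp_opts mopt;

	parse_options("TM", &opt, &mopt);   /* MPTCP-aware caller            */
	parse_options("T-", &opt, NULL);    /* legacy caller, as in the hunks */
	printf("tstamp=%d mpc=%d\n", opt.saw_tstamp, mopt.saw_mpc);
	return 0;
}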
5421 diff -Nur linux-3.14.45.orig/net/ipv6/tcp_ipv6.c linux-3.14.45/net/ipv6/tcp_ipv6.c
5422 --- linux-3.14.45.orig/net/ipv6/tcp_ipv6.c 2015-06-23 02:01:36.000000000 +0200
5423 +++ linux-3.14.45/net/ipv6/tcp_ipv6.c 2015-06-24 14:44:57.517799806 +0200
5424 @@ -63,6 +63,8 @@
5425 #include <net/inet_common.h>
5426 #include <net/secure_seq.h>
5427 #include <net/tcp_memcontrol.h>
5428 +#include <net/mptcp.h>
5429 +#include <net/mptcp_v6.h>
5430 #include <net/busy_poll.h>
5432 #include <asm/uaccess.h>
5433 @@ -73,14 +75,6 @@
5434 #include <linux/crypto.h>
5435 #include <linux/scatterlist.h>
5437 -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
5438 -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
5439 - struct request_sock *req);
5441 -static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
5443 -static const struct inet_connection_sock_af_ops ipv6_mapped;
5444 -static const struct inet_connection_sock_af_ops ipv6_specific;
5445 #ifdef CONFIG_TCP_MD5SIG
5446 static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
5447 static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
5448 @@ -92,7 +86,7 @@
5450 #endif
5452 -static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5453 +void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5455 struct dst_entry *dst = skb_dst(skb);
5456 const struct rt6_info *rt = (const struct rt6_info *)dst;
5457 @@ -104,7 +98,7 @@
5458 inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
5461 -static void tcp_v6_hash(struct sock *sk)
5462 +void tcp_v6_hash(struct sock *sk)
5464 if (sk->sk_state != TCP_CLOSE) {
5465 if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
5466 @@ -117,7 +111,7 @@
5470 -static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
5471 +__u32 tcp_v6_init_sequence(const struct sk_buff *skb)
5473 return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
5474 ipv6_hdr(skb)->saddr.s6_addr32,
5475 @@ -125,7 +119,7 @@
5476 tcp_hdr(skb)->source);
5479 -static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
5480 +int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
5481 int addr_len)
5483 struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
5484 @@ -339,7 +333,7 @@
5485 const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
5486 const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
5487 struct ipv6_pinfo *np;
5488 - struct sock *sk;
5489 + struct sock *sk, *meta_sk;
5490 int err;
5491 struct tcp_sock *tp;
5492 __u32 seq;
5493 @@ -359,8 +353,14 @@
5494 return;
5497 - bh_lock_sock(sk);
5498 - if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
5499 + tp = tcp_sk(sk);
5500 + if (tp->mpc)
5501 + meta_sk = mptcp_meta_sk(sk);
5502 + else
5503 + meta_sk = sk;
5505 + bh_lock_sock(meta_sk);
5506 + if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG)
5507 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
5509 if (sk->sk_state == TCP_CLOSE)
5510 @@ -371,7 +371,6 @@
5511 goto out;
5514 - tp = tcp_sk(sk);
5515 seq = ntohl(th->seq);
5516 if (sk->sk_state != TCP_LISTEN &&
5517 !between(seq, tp->snd_una, tp->snd_nxt)) {
5518 @@ -401,11 +400,15 @@
5519 goto out;
5521 tp->mtu_info = ntohl(info);
5522 - if (!sock_owned_by_user(sk))
5523 + if (!sock_owned_by_user(meta_sk))
5524 tcp_v6_mtu_reduced(sk);
5525 - else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
5526 + else {
5527 + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
5528 &tp->tsq_flags))
5529 - sock_hold(sk);
5530 + sock_hold(sk);
5531 + if (tp->mpc)
5532 + mptcp_tsq_flags(sk);
5534 goto out;
5537 @@ -415,7 +418,7 @@
5538 switch (sk->sk_state) {
5539 struct request_sock *req, **prev;
5540 case TCP_LISTEN:
5541 - if (sock_owned_by_user(sk))
5542 + if (sock_owned_by_user(meta_sk))
5543 goto out;
5545 req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
5546 @@ -440,7 +443,7 @@
5547 case TCP_SYN_SENT:
5548 case TCP_SYN_RECV: /* Cannot happen.
5549 It can, it SYNs are crossed. --ANK */
5550 - if (!sock_owned_by_user(sk)) {
5551 + if (!sock_owned_by_user(meta_sk)) {
5552 sk->sk_err = err;
5553 sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
5555 @@ -450,22 +453,22 @@
5556 goto out;
5559 - if (!sock_owned_by_user(sk) && np->recverr) {
5560 + if (!sock_owned_by_user(meta_sk) && np->recverr) {
5561 sk->sk_err = err;
5562 sk->sk_error_report(sk);
5563 } else
5564 sk->sk_err_soft = err;
5566 out:
5567 - bh_unlock_sock(sk);
5568 + bh_unlock_sock(meta_sk);
5569 sock_put(sk);
5573 -static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
5574 - struct flowi6 *fl6,
5575 - struct request_sock *req,
5576 - u16 queue_mapping)
5577 +int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
5578 + struct flowi6 *fl6,
5579 + struct request_sock *req,
5580 + u16 queue_mapping)
5582 struct inet_request_sock *ireq = inet_rsk(req);
5583 struct ipv6_pinfo *np = inet6_sk(sk);
5584 @@ -495,7 +498,7 @@
5585 return err;
5588 -static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
5589 +int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
5591 struct flowi6 fl6;
5592 int res;
5593 @@ -506,7 +509,7 @@
5594 return res;
5597 -static void tcp_v6_reqsk_destructor(struct request_sock *req)
5598 +void tcp_v6_reqsk_destructor(struct request_sock *req)
5600 kfree_skb(inet_rsk(req)->pktopts);
5602 @@ -719,16 +722,16 @@
5605 #ifdef CONFIG_TCP_MD5SIG
5606 -static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
5607 +const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
5608 .md5_lookup = tcp_v6_reqsk_md5_lookup,
5609 .calc_md5_hash = tcp_v6_md5_hash_skb,
5611 #endif
5613 -static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
5614 - u32 tsval, u32 tsecr,
5615 +static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack,
5616 + u32 data_ack, u32 win, u32 tsval, u32 tsecr,
5617 struct tcp_md5sig_key *key, int rst, u8 tclass,
5618 - u32 label)
5619 + u32 label, int mptcp)
5621 const struct tcphdr *th = tcp_hdr(skb);
5622 struct tcphdr *t1;
5623 @@ -746,7 +749,10 @@
5624 if (key)
5625 tot_len += TCPOLEN_MD5SIG_ALIGNED;
5626 #endif
5628 +#ifdef CONFIG_MPTCP
5629 + if (mptcp)
5630 + tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
5631 +#endif
5632 buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
5633 GFP_ATOMIC);
5634 if (buff == NULL)
5635 @@ -784,6 +790,17 @@
5636 tcp_v6_md5_hash_hdr((__u8 *)topt, key,
5637 &ipv6_hdr(skb)->saddr,
5638 &ipv6_hdr(skb)->daddr, t1);
5639 + topt += 4;
5641 +#endif
5642 +#ifdef CONFIG_MPTCP
5643 + if (mptcp) {
5644 + /* Construction of 32-bit data_ack */
5645 + *topt++ = htonl((TCPOPT_MPTCP << 24) |
5646 + ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
5647 + (0x20 << 8) |
5648 + (0x01));
5649 + *topt++ = htonl(data_ack);
5651 #endif
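
The two 32-bit words appended in this hunk form a minimal DSS option that carries only a data-level ACK: kind, length, subtype and flag bytes in the first word, the data_ack in the second. The check below reproduces the byte layout of that first word; the numeric values (TCPOPT_MPTCP = 30, MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK = 8) are assumptions taken from the MPTCP option format, since the macros are defined elsewhere in the patch.

/* Illustration only: wire layout of
 * htonl((TCPOPT_MPTCP << 24) | (len << 16) | (0x20 << 8) | 0x01). */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	const uint8_t kind = 30, len = 4 + 4, sub = 0x20, flags = 0x01;
	uint32_t word = htonl((uint32_t)kind << 24 | (uint32_t)len << 16 |
			      (uint32_t)sub << 8 | flags);
	const uint8_t *b = (const uint8_t *)&word;

	/* On the wire: option kind, option length, DSS subtype byte, then the
	 * "data ACK present" flag - in that order on any host endianness. */
	printf("%u %u 0x%02x 0x%02x\n", b[0], b[1], b[2], b[3]);
	return 0;
}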
5653 @@ -821,7 +838,7 @@
5654 kfree_skb(buff);
5657 -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
5658 +void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
5660 const struct tcphdr *th = tcp_hdr(skb);
5661 u32 seq = 0, ack_seq = 0;
5662 @@ -876,7 +893,7 @@
5663 ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
5664 (th->doff << 2);
5666 - tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, key, 1, 0, 0);
5667 + tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, key, 1, 0, 0, 0);
5669 #ifdef CONFIG_TCP_MD5SIG
5670 release_sk1:
5671 @@ -887,40 +904,48 @@
5672 #endif
5675 -static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
5676 +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
5677 u32 win, u32 tsval, u32 tsecr,
5678 - struct tcp_md5sig_key *key, u8 tclass,
5679 - u32 label)
5680 + struct tcp_md5sig_key *key, u8 tclass, u32 label,
5681 + int mptcp)
5683 tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, key, 0, tclass,
5684 - label);
5685 + label, mptcp);
5688 static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
5690 struct inet_timewait_sock *tw = inet_twsk(sk);
5691 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
5692 + u32 data_ack = 0;
5693 + int mptcp = 0;
5695 + if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
5696 + data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
5697 + mptcp = 1;
5700 tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
5701 + data_ack,
5702 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
5703 tcp_time_stamp + tcptw->tw_ts_offset,
5704 tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw),
5705 - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel));
5706 + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), mptcp);
5708 inet_twsk_put(tw);
5711 -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
5712 +void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
5713 struct request_sock *req)
5715 tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1,
5716 - req->rcv_wnd, tcp_time_stamp, req->ts_recent,
5717 + 0, req->rcv_wnd, tcp_time_stamp, req->ts_recent,
5718 tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
5719 - 0, 0);
5720 + 0, 0, 0);
5724 -static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
5725 +struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
5727 struct request_sock *req, **prev;
5728 const struct tcphdr *th = tcp_hdr(skb);
5729 @@ -939,7 +964,13 @@
5731 if (nsk) {
5732 if (nsk->sk_state != TCP_TIME_WAIT) {
5733 + /* Don't lock again the meta-sk. It has been locked
5734 + * before mptcp_v6_do_rcv.
5735 + */
5736 + if (tcp_sk(nsk)->mpc && !is_meta_sk(sk))
5737 + bh_lock_sock(mptcp_meta_sk(nsk));
5738 bh_lock_sock(nsk);
5740 return nsk;
5742 inet_twsk_put(inet_twsk(nsk));
5743 @@ -959,6 +990,7 @@
5744 static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
5746 struct tcp_options_received tmp_opt;
5747 + struct mptcp_options_received mopt;
5748 struct request_sock *req;
5749 struct inet_request_sock *ireq;
5750 struct ipv6_pinfo *np = inet6_sk(sk);
5751 @@ -971,6 +1003,23 @@
5752 if (skb->protocol == htons(ETH_P_IP))
5753 return tcp_v4_conn_request(sk, skb);
5755 + tcp_clear_options(&tmp_opt);
5756 + tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
5757 + tmp_opt.user_mss = tp->rx_opt.user_mss;
5758 + mptcp_init_mp_opt(&mopt);
5759 + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
5761 +#ifdef CONFIG_MPTCP
5762 + /* MPTCP structures not initialized, so return error */
5763 + if (mptcp_init_failed)
5764 + mptcp_init_mp_opt(&mopt);
5766 + if (mopt.is_mp_join)
5767 + return mptcp_do_join_short(skb, &mopt, &tmp_opt, sock_net(sk));
5768 + if (mopt.drop_me)
5769 + goto drop;
5770 +#endif
5772 if (!ipv6_unicast_destination(skb))
5773 goto drop;
5775 @@ -986,7 +1035,22 @@
5776 goto drop;
5779 - req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
5780 +#ifdef CONFIG_MPTCP
5781 + if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled)
5782 + mopt.saw_mpc = 0;
5783 + if (mopt.saw_mpc && !want_cookie) {
5784 + req = inet6_reqsk_alloc(&mptcp6_request_sock_ops);
5786 + if (req == NULL)
5787 + goto drop;
5789 + mptcp_rsk(req)->mpcb = NULL;
5790 + mptcp_rsk(req)->dss_csum = mopt.dss_csum;
5791 + mptcp_rsk(req)->collide_tk.pprev = NULL;
5792 + } else
5793 +#endif
5794 + req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
5796 if (req == NULL)
5797 goto drop;
5799 @@ -994,17 +1058,15 @@
5800 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
5801 #endif
5803 - tcp_clear_options(&tmp_opt);
5804 - tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
5805 - tmp_opt.user_mss = tp->rx_opt.user_mss;
5806 - tcp_parse_options(skb, &tmp_opt, 0, NULL);
5808 if (want_cookie && !tmp_opt.saw_tstamp)
5809 tcp_clear_options(&tmp_opt);
5811 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
5812 tcp_openreq_init(req, &tmp_opt, skb);
5814 + if (mopt.saw_mpc && !want_cookie)
5815 + mptcp_reqsk_new_mptcp(req, &tmp_opt, &mopt, skb);
5817 ireq = inet_rsk(req);
5818 ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
5819 ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
5820 @@ -1094,9 +1156,9 @@
5821 return 0; /* don't send reset */
5824 -static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
5825 - struct request_sock *req,
5826 - struct dst_entry *dst)
5827 +struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
5828 + struct request_sock *req,
5829 + struct dst_entry *dst)
5831 struct inet_request_sock *ireq;
5832 struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
5833 @@ -1317,7 +1379,7 @@
5834 * This is because we cannot sleep with the original spinlock
5835 * held.
5837 -static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
5838 +int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
5840 struct ipv6_pinfo *np = inet6_sk(sk);
5841 struct tcp_sock *tp;
5842 @@ -1339,6 +1401,9 @@
5843 goto discard;
5844 #endif
5846 + if (is_meta_sk(sk))
5847 + return mptcp_v6_do_rcv(sk, skb);
5849 if (sk_filter(sk, skb))
5850 goto discard;
5852 @@ -1460,7 +1525,7 @@
5854 const struct tcphdr *th;
5855 const struct ipv6hdr *hdr;
5856 - struct sock *sk;
5857 + struct sock *sk, *meta_sk = NULL;
5858 int ret;
5859 struct net *net = dev_net(skb->dev);
5861 @@ -1491,18 +1556,43 @@
5862 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
5863 skb->len - th->doff*4);
5864 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
5865 +#ifdef CONFIG_MPTCP
5866 + TCP_SKB_CB(skb)->mptcp_flags = 0;
5867 + TCP_SKB_CB(skb)->dss_off = 0;
5868 +#endif
5869 TCP_SKB_CB(skb)->when = 0;
5870 TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
5871 TCP_SKB_CB(skb)->sacked = 0;
5873 sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
5874 - if (!sk)
5875 - goto no_tcp_socket;
5877 process:
5878 - if (sk->sk_state == TCP_TIME_WAIT)
5879 + if (sk && sk->sk_state == TCP_TIME_WAIT)
5880 goto do_time_wait;
5882 +#ifdef CONFIG_MPTCP
5883 + if (!sk && th->syn && !th->ack) {
5884 + int ret = mptcp_lookup_join(skb, NULL);
5886 + if (ret < 0) {
5887 + tcp_v6_send_reset(NULL, skb);
5888 + goto discard_it;
5889 + } else if (ret > 0) {
5890 + return 0;
5894 + /* Is there a pending request sock for this segment ? */
5895 + if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
5896 + if (sk)
5897 + sock_put(sk);
5898 + return 0;
5900 +#endif
5902 + if (!sk)
5903 + goto no_tcp_socket;
5905 if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
5906 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
5907 goto discard_and_relse;
5908 @@ -1517,11 +1607,21 @@
5909 sk_mark_napi_id(sk, skb);
5910 skb->dev = NULL;
5912 - bh_lock_sock_nested(sk);
5913 + if (tcp_sk(sk)->mpc) {
5914 + meta_sk = mptcp_meta_sk(sk);
5916 + bh_lock_sock_nested(meta_sk);
5917 + if (sock_owned_by_user(meta_sk))
5918 + skb->sk = sk;
5919 + } else {
5920 + meta_sk = sk;
5921 + bh_lock_sock_nested(sk);
5924 ret = 0;
5925 - if (!sock_owned_by_user(sk)) {
5926 + if (!sock_owned_by_user(meta_sk)) {
5927 #ifdef CONFIG_NET_DMA
5928 - struct tcp_sock *tp = tcp_sk(sk);
5929 + struct tcp_sock *tp = tcp_sk(meta_sk);
5930 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
5931 tp->ucopy.dma_chan = net_dma_find_channel();
5932 if (tp->ucopy.dma_chan)
5933 @@ -1529,16 +1629,17 @@
5934 else
5935 #endif
5937 - if (!tcp_prequeue(sk, skb))
5938 + if (!tcp_prequeue(meta_sk, skb))
5939 ret = tcp_v6_do_rcv(sk, skb);
5941 - } else if (unlikely(sk_add_backlog(sk, skb,
5942 - sk->sk_rcvbuf + sk->sk_sndbuf))) {
5943 - bh_unlock_sock(sk);
5944 + } else if (unlikely(sk_add_backlog(meta_sk, skb,
5945 + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
5946 + bh_unlock_sock(meta_sk);
5947 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
5948 goto discard_and_relse;
5950 - bh_unlock_sock(sk);
5952 + bh_unlock_sock(meta_sk);
5954 sock_put(sk);
5955 return ret ? -1 : 0;
5956 @@ -1595,6 +1696,18 @@
5957 sk = sk2;
5958 goto process;
5960 +#ifdef CONFIG_MPTCP
5961 + if (th->syn && !th->ack) {
5962 + int ret = mptcp_lookup_join(skb, inet_twsk(sk));
5964 + if (ret < 0) {
5965 + tcp_v6_send_reset(NULL, skb);
5966 + goto discard_it;
5967 + } else if (ret > 0) {
5968 + return 0;
5971 +#endif
5972 /* Fall through to ACK */
5974 case TCP_TW_ACK:
5975 @@ -1644,13 +1757,13 @@
5979 -static struct timewait_sock_ops tcp6_timewait_sock_ops = {
5980 +struct timewait_sock_ops tcp6_timewait_sock_ops = {
5981 .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
5982 .twsk_unique = tcp_twsk_unique,
5983 .twsk_destructor= tcp_twsk_destructor,
5986 -static const struct inet_connection_sock_af_ops ipv6_specific = {
5987 +const struct inet_connection_sock_af_ops ipv6_specific = {
5988 .queue_xmit = inet6_csk_xmit,
5989 .send_check = tcp_v6_send_check,
5990 .rebuild_header = inet6_sk_rebuild_header,
5991 @@ -1683,7 +1796,7 @@
5992 * TCP over IPv4 via INET6 API
5995 -static const struct inet_connection_sock_af_ops ipv6_mapped = {
5996 +const struct inet_connection_sock_af_ops ipv6_mapped = {
5997 .queue_xmit = ip_queue_xmit,
5998 .send_check = tcp_v4_send_check,
5999 .rebuild_header = inet_sk_rebuild_header,
6000 @@ -1729,7 +1842,7 @@
6001 return 0;
6004 -static void tcp_v6_destroy_sock(struct sock *sk)
6005 +void tcp_v6_destroy_sock(struct sock *sk)
6007 tcp_v4_destroy_sock(sk);
6008 inet6_destroy_sock(sk);
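
The tcp_ipv6.c receive-path hunks add three MPTCP-specific steps before normal socket processing: an unmatched pure SYN is offered to mptcp_lookup_join() in case it is an MP_JOIN for an existing connection, pending MPTCP request socks are checked via mptcp_check_req(), and once a subflow is found, locking, prequeueing and backlogging happen on the meta-socket. The sketch below only models that decision order; every helper is a hypothetical stand-in for the kernel calls named above.

/* Illustration only: ordering of the MPTCP hooks added to tcp_v6_rcv(). */
#include <stdio.h>
#include <stdbool.h>

struct pkt { bool syn; bool ack; };

static const char *rcv_decide(bool sk_found, bool sk_is_mptcp_subflow,
			      struct pkt p)
{
	if (!sk_found && p.syn && !p.ack)
		return "try mptcp_lookup_join(): maybe an MP_JOIN SYN";
	if (!sk_found)
		return "check pending MPTCP request socks, else no_tcp_socket";
	if (sk_is_mptcp_subflow)
		return "lock meta-socket, process or backlog at meta level";
	return "plain TCP: lock and process the socket itself";
}

int main(void)
{
	struct pkt join_syn = { .syn = true, .ack = false };
	struct pkt data_seg = { .syn = false, .ack = true };

	printf("%s\n", rcv_decide(false, false, join_syn));
	printf("%s\n", rcv_decide(true, true, data_seg));
	printf("%s\n", rcv_decide(true, false, data_seg));
	return 0;
}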
6009 diff -Nur linux-3.14.45.orig/net/mptcp/Kconfig linux-3.14.45/net/mptcp/Kconfig
6010 --- linux-3.14.45.orig/net/mptcp/Kconfig 1970-01-01 01:00:00.000000000 +0100
6011 +++ linux-3.14.45/net/mptcp/Kconfig 2015-06-24 14:15:48.891862483 +0200
6012 @@ -0,0 +1,58 @@
6014 +# MPTCP configuration
6016 +config MPTCP
6017 + bool "MPTCP protocol"
6018 + depends on (IPV6=y || IPV6=n)
6019 + ---help---
6020 + This replaces the normal TCP stack with a Multipath TCP stack,
6021 + able to use several paths at once.
6023 +menuconfig MPTCP_PM_ADVANCED
6024 + bool "MPTCP: advanced path-manager control"
6025 + depends on MPTCP=y
6026 + ---help---
6027 + Support for selection of different path-managers. You should choose 'Y' here,
6028 + because otherwise you will not actively create new MPTCP-subflows.
6030 +if MPTCP_PM_ADVANCED
6032 +config MPTCP_FULLMESH
6033 + tristate "MPTCP Full-Mesh Path-Manager"
6034 + depends on MPTCP=y
6035 + ---help---
6036 + This path-management module will create a full-mesh among all IP-addresses.
6038 +config MPTCP_NDIFFPORTS
6039 + tristate "MPTCP ndiff-ports"
6040 + depends on MPTCP=y
6041 + ---help---
6042 + This path-management module will create multiple subflows between the same
6043 + pair of IP-addresses, modifying the source-port. You can set the number
6044 + of subflows via the mptcp_ndiffports-sysctl.
6046 +choice
6047 + prompt "Default MPTCP Path-Manager"
6048 + default DEFAULT
6049 + help
6050 + Select the Path-Manager of your choice
6052 + config DEFAULT_FULLMESH
6053 + bool "Full mesh" if MPTCP_FULLMESH=y
6055 + config DEFAULT_NDIFFPORTS
6056 + bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
6058 + config DEFAULT_DUMMY
6059 + bool "Default"
6061 +endchoice
6063 +endif
6065 +config DEFAULT_MPTCP_PM
6066 + string
6067 + default "default" if DEFAULT_DUMMY
6068 + default "fullmesh" if DEFAULT_FULLMESH
6069 + default "ndiffports" if DEFAULT_NDIFFPORTS
6070 + default "default"
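
The Kconfig above maps the chosen DEFAULT_* option to the string CONFIG_DEFAULT_MPTCP_PM ("default", "fullmesh" or "ndiffports"), which the control code later exposes through the mptcp_path_manager sysctl. The sketch below only illustrates how such a default string might be resolved against registered path managers; the table and lookup are hypothetical, the real registration lives in mptcp_pm.c, which is not part of this excerpt.

/* Illustration only: resolving a default path-manager name. */
#include <stdio.h>
#include <string.h>

static const char *registered_pms[] = { "default", "fullmesh", "ndiffports" };

static const char *resolve_pm(const char *wanted)
{
	size_t i;

	for (i = 0; i < sizeof(registered_pms) / sizeof(registered_pms[0]); i++)
		if (!strcmp(registered_pms[i], wanted))
			return registered_pms[i];
	return "default";	/* the Kconfig above also falls back to "default" */
}

int main(void)
{
	printf("%s\n", resolve_pm("fullmesh"));
	printf("%s\n", resolve_pm("unknown-pm"));
	return 0;
}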
6071 diff -Nur linux-3.14.45.orig/net/mptcp/Makefile linux-3.14.45/net/mptcp/Makefile
6072 --- linux-3.14.45.orig/net/mptcp/Makefile 1970-01-01 01:00:00.000000000 +0100
6073 +++ linux-3.14.45/net/mptcp/Makefile 2015-06-24 14:15:48.891862483 +0200
6074 @@ -0,0 +1,18 @@
6076 +## Makefile for MultiPath TCP support code.
6080 +obj-$(CONFIG_MPTCP) += mptcp.o
6082 +mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \
6083 + mptcp_output.o mptcp_input.o
6085 +obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o
6086 +obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
6087 +obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
6088 +obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
6089 +obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
6091 +mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
6093 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_coupled.c linux-3.14.45/net/mptcp/mptcp_coupled.c
6094 --- linux-3.14.45.orig/net/mptcp/mptcp_coupled.c 1970-01-01 01:00:00.000000000 +0100
6095 +++ linux-3.14.45/net/mptcp/mptcp_coupled.c 2015-06-24 14:15:48.891862483 +0200
6096 @@ -0,0 +1,273 @@
6098 + * MPTCP implementation - Coupled Congestion Control
6100 + * Initial Design & Implementation:
6101 + * Sébastien Barré <sebastien.barre@uclouvain.be>
6103 + * Current Maintainer & Author:
6104 + * Christoph Paasch <christoph.paasch@uclouvain.be>
6106 + * Additional authors:
6107 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
6108 + * Gregory Detal <gregory.detal@uclouvain.be>
6109 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
6110 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
6111 + * Lavkesh Lahngir <lavkesh51@gmail.com>
6112 + * Andreas Ripke <ripke@neclab.eu>
6113 + * Vlad Dogaru <vlad.dogaru@intel.com>
6114 + * Octavian Purdila <octavian.purdila@intel.com>
6115 + * John Ronan <jronan@tssg.org>
6116 + * Catalin Nicutar <catalin.nicutar@gmail.com>
6117 + * Brandon Heller <brandonh@stanford.edu>
6120 + * This program is free software; you can redistribute it and/or
6121 + * modify it under the terms of the GNU General Public License
6122 + * as published by the Free Software Foundation; either version
6123 + * 2 of the License, or (at your option) any later version.
6124 + */
6125 +#include <net/tcp.h>
6126 +#include <net/mptcp.h>
6128 +#include <linux/module.h>
6130 +/* Scaling is done in the numerator with alpha_scale_num and in the denominator
6131 + * with alpha_scale_den.
6133 + * To downscale, we just need to use alpha_scale.
6135 + * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2)
6136 + */
6137 +static int alpha_scale_den = 10;
6138 +static int alpha_scale_num = 32;
6139 +static int alpha_scale = 12;
6141 +struct mptcp_ccc {
6142 + u64 alpha;
6143 + bool forced_update;
6146 +static inline int mptcp_ccc_sk_can_send(const struct sock *sk)
6148 + return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt;
6151 +static inline u64 mptcp_get_alpha(struct sock *meta_sk)
6153 + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
6154 + return mptcp_ccc->alpha;
6157 +static inline void mptcp_set_alpha(struct sock *meta_sk, u64 alpha)
6159 + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
6160 + mptcp_ccc->alpha = alpha;
6163 +static inline u64 mptcp_ccc_scale(u32 val, int scale)
6165 + return (u64) val << scale;
6168 +static inline bool mptcp_get_forced(struct sock *meta_sk)
6170 + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
6171 + return mptcp_ccc->forced_update;
6174 +static inline void mptcp_set_forced(struct sock *meta_sk, bool force)
6176 + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
6177 + mptcp_ccc->forced_update = force;
6180 +static void mptcp_ccc_recalc_alpha(struct sock *sk)
6182 + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
6183 + struct sock *sub_sk;
6184 + int best_cwnd = 0, best_rtt = 0, can_send = 0;
6185 + u64 max_numerator = 0, sum_denominator = 0, alpha = 1;
6187 + if (!mpcb)
6188 + return;
6190 + /* Only one subflow left - fall back to normal reno-behavior
6191 + * (set alpha to 1) */
6192 + if (mpcb->cnt_established <= 1)
6193 + goto exit;
6195 + /* Do regular alpha-calculation for multiple subflows */
6197 + /* Find the max numerator of the alpha-calculation */
6198 + mptcp_for_each_sk(mpcb, sub_sk) {
6199 + struct tcp_sock *sub_tp = tcp_sk(sub_sk);
6200 + u64 tmp;
6202 + if (!mptcp_ccc_sk_can_send(sub_sk))
6203 + continue;
6205 + can_send++;
6207 + /* We need to look for the path that provides the max-value.
6208 + * Integer-overflow is not possible here, because
6209 + * tmp will be in u64.
6210 + */
6211 + tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
6212 + alpha_scale_num), (u64)sub_tp->srtt * sub_tp->srtt);
6214 + if (tmp >= max_numerator) {
6215 + max_numerator = tmp;
6216 + best_cwnd = sub_tp->snd_cwnd;
6217 + best_rtt = sub_tp->srtt;
6221 + /* No subflow is able to send - we don't care anymore */
6222 + if (unlikely(!can_send))
6223 + goto exit;
6225 + /* Calculate the denominator */
6226 + mptcp_for_each_sk(mpcb, sub_sk) {
6227 + struct tcp_sock *sub_tp = tcp_sk(sub_sk);
6229 + if (!mptcp_ccc_sk_can_send(sub_sk))
6230 + continue;
6232 + sum_denominator += div_u64(
6233 + mptcp_ccc_scale(sub_tp->snd_cwnd,
6234 + alpha_scale_den) * best_rtt,
6235 + sub_tp->srtt);
6237 + sum_denominator *= sum_denominator;
6238 + if (unlikely(!sum_denominator)) {
6239 + pr_err("%s: sum_denominator == 0, cnt_established:%d\n",
6240 + __func__, mpcb->cnt_established);
6241 + mptcp_for_each_sk(mpcb, sub_sk) {
6242 + struct tcp_sock *sub_tp = tcp_sk(sub_sk);
6243 + pr_err("%s: pi:%d, state:%d, rtt:%u, cwnd: %u\n",
6244 + __func__, sub_tp->mptcp->path_index,
6245 + sub_sk->sk_state, sub_tp->srtt,
6246 + sub_tp->snd_cwnd);
6250 + alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator);
6252 + if (unlikely(!alpha))
6253 + alpha = 1;
6255 +exit:
6256 + mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
6259 +static void mptcp_ccc_init(struct sock *sk)
6261 + if (tcp_sk(sk)->mpc) {
6262 + mptcp_set_forced(mptcp_meta_sk(sk), 0);
6263 + mptcp_set_alpha(mptcp_meta_sk(sk), 1);
6265 + /* If we do not mptcp, behave like reno: return */
6268 +static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event)
6270 + if (event == CA_EVENT_LOSS)
6271 + mptcp_ccc_recalc_alpha(sk);
6274 +static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state)
6276 + if (!tcp_sk(sk)->mpc)
6277 + return;
6279 + mptcp_set_forced(mptcp_meta_sk(sk), 1);
6282 +static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
6284 + struct tcp_sock *tp = tcp_sk(sk);
6285 + struct mptcp_cb *mpcb = tp->mpcb;
6286 + int snd_cwnd;
6288 + if (!tp->mpc) {
6289 + tcp_reno_cong_avoid(sk, ack, acked, in_flight);
6290 + return;
6293 + if (!tcp_is_cwnd_limited(sk, in_flight))
6294 + return;
6296 + if (tp->snd_cwnd <= tp->snd_ssthresh) {
6297 + /* In "safe" area, increase. */
6298 + tcp_slow_start(tp, acked);
6299 + mptcp_ccc_recalc_alpha(sk);
6300 + return;
6303 + if (mptcp_get_forced(mptcp_meta_sk(sk))) {
6304 + mptcp_ccc_recalc_alpha(sk);
6305 + mptcp_set_forced(mptcp_meta_sk(sk), 0);
6308 + if (mpcb->cnt_established > 1) {
6309 + u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk));
6311 + /* This may happen, if at the initialization, the mpcb
6312 + * was not yet attached to the sock, and thus
6313 + * initializing alpha failed.
6314 + */
6315 + if (unlikely(!alpha))
6316 + alpha = 1;
6318 + snd_cwnd = (int) div_u64 ((u64) mptcp_ccc_scale(1, alpha_scale),
6319 + alpha);
6321 + /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd)
6322 + * Thus, we select here the max value. */
6323 + if (snd_cwnd < tp->snd_cwnd)
6324 + snd_cwnd = tp->snd_cwnd;
6325 + } else {
6326 + snd_cwnd = tp->snd_cwnd;
6329 + if (tp->snd_cwnd_cnt >= snd_cwnd) {
6330 + if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
6331 + tp->snd_cwnd++;
6332 + mptcp_ccc_recalc_alpha(sk);
6335 + tp->snd_cwnd_cnt = 0;
6336 + } else {
6337 + tp->snd_cwnd_cnt++;
6341 +static struct tcp_congestion_ops mptcp_ccc = {
6342 + .init = mptcp_ccc_init,
6343 + .ssthresh = tcp_reno_ssthresh,
6344 + .cong_avoid = mptcp_ccc_cong_avoid,
6345 + .cwnd_event = mptcp_ccc_cwnd_event,
6346 + .set_state = mptcp_ccc_set_state,
6347 + .min_cwnd = tcp_reno_min_cwnd,
6348 + .owner = THIS_MODULE,
6349 + .name = "coupled",
6352 +static int __init mptcp_ccc_register(void)
6354 + BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE);
6355 + return tcp_register_congestion_control(&mptcp_ccc);
6358 +static void __exit mptcp_ccc_unregister(void)
6360 + tcp_unregister_congestion_control(&mptcp_ccc);
6363 +module_init(mptcp_ccc_register);
6364 +module_exit(mptcp_ccc_unregister);
6366 +MODULE_AUTHOR("Christoph Paasch, Sébastien Barré");
6367 +MODULE_LICENSE("GPL");
6368 +MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
6369 +MODULE_VERSION("0.1");
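
mptcp_ccc_recalc_alpha() above computes the coupling factor of the Linked-Increases algorithm in fixed point: the numerator takes the subflow maximising cwnd/rtt^2 (scaled by 2^alpha_scale_num), the denominator is the squared sum of the scaled cwnd/rtt terms, and mptcp_ccc_cong_avoid() then grows the window by one every max(2^alpha_scale / alpha, cwnd) acked segments instead of every cwnd. The user-space rerun below mirrors that arithmetic for two made-up subflows (cwnd/srtt values are only examples, and plain 64-bit division replaces the kernel's div64_u64/div_u64 helpers).

/* Illustration only: the fixed-point alpha computation for two subflows. */
#include <stdio.h>
#include <stdint.h>

#define ALPHA_SCALE_NUM 32   /* same constants as in mptcp_coupled.c */
#define ALPHA_SCALE_DEN 10
#define ALPHA_SCALE     12

struct subflow { uint64_t cwnd, srtt; };

int main(void)
{
	struct subflow sf[2] = { { 10, 100 }, { 20, 200 } };
	uint64_t max_num = 0, best_cwnd = 0, best_rtt = 0, sum_den = 0;
	uint64_t alpha, thresh;
	int i;

	/* Numerator: subflow maximising cwnd / rtt^2, scaled by 2^32. */
	for (i = 0; i < 2; i++) {
		uint64_t tmp = (sf[i].cwnd << ALPHA_SCALE_NUM) /
			       (sf[i].srtt * sf[i].srtt);
		if (tmp >= max_num) {
			max_num = tmp;
			best_cwnd = sf[i].cwnd;
			best_rtt = sf[i].srtt;
		}
	}

	/* Denominator: (sum of cwnd/rtt terms, scaled by 2^10 and best_rtt)^2. */
	for (i = 0; i < 2; i++)
		sum_den += (sf[i].cwnd << ALPHA_SCALE_DEN) * best_rtt /
			   sf[i].srtt;
	sum_den *= sum_den;

	alpha = (best_cwnd << ALPHA_SCALE_NUM) / sum_den;

	/* Per-subflow growth period, as in mptcp_ccc_cong_avoid(): the first
	 * subflow (cwnd = 10) now needs max(2^12/alpha, cwnd) acks per
	 * increment, rather than Reno's cwnd acks. */
	thresh = (1ULL << ALPHA_SCALE) / alpha;
	if (thresh < sf[0].cwnd)
		thresh = sf[0].cwnd;
	printf("alpha = %llu, grow cwnd every %llu acks (Reno: every %llu)\n",
	       (unsigned long long)alpha,
	       (unsigned long long)thresh,
	       (unsigned long long)sf[0].cwnd);
	return 0;
}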
6370 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_ctrl.c linux-3.14.45/net/mptcp/mptcp_ctrl.c
6371 --- linux-3.14.45.orig/net/mptcp/mptcp_ctrl.c 1970-01-01 01:00:00.000000000 +0100
6372 +++ linux-3.14.45/net/mptcp/mptcp_ctrl.c 2015-06-24 14:15:48.891862483 +0200
6373 @@ -0,0 +1,2270 @@
6375 + * MPTCP implementation - MPTCP-control
6377 + * Initial Design & Implementation:
6378 + * Sébastien Barré <sebastien.barre@uclouvain.be>
6380 + * Current Maintainer & Author:
6381 + * Christoph Paasch <christoph.paasch@uclouvain.be>
6383 + * Additional authors:
6384 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
6385 + * Gregory Detal <gregory.detal@uclouvain.be>
6386 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
6387 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
6388 + * Lavkesh Lahngir <lavkesh51@gmail.com>
6389 + * Andreas Ripke <ripke@neclab.eu>
6390 + * Vlad Dogaru <vlad.dogaru@intel.com>
6391 + * Octavian Purdila <octavian.purdila@intel.com>
6392 + * John Ronan <jronan@tssg.org>
6393 + * Catalin Nicutar <catalin.nicutar@gmail.com>
6394 + * Brandon Heller <brandonh@stanford.edu>
6397 + * This program is free software; you can redistribute it and/or
6398 + * modify it under the terms of the GNU General Public License
6399 + * as published by the Free Software Foundation; either version
6400 + * 2 of the License, or (at your option) any later version.
6401 + */
6403 +#include <net/inet_common.h>
6404 +#include <net/inet6_hashtables.h>
6405 +#include <net/ipv6.h>
6406 +#include <net/ip6_checksum.h>
6407 +#include <net/mptcp.h>
6408 +#include <net/mptcp_v4.h>
6409 +#if IS_ENABLED(CONFIG_IPV6)
6410 +#include <net/mptcp_v6.h>
6411 +#endif
6412 +#include <net/sock.h>
6413 +#include <net/tcp.h>
6414 +#include <net/tcp_states.h>
6415 +#include <net/transp_v6.h>
6416 +#include <net/xfrm.h>
6418 +#include <linux/cryptohash.h>
6419 +#include <linux/kconfig.h>
6420 +#include <linux/module.h>
6421 +#include <linux/netpoll.h>
6422 +#include <linux/list.h>
6423 +#include <linux/jhash.h>
6424 +#include <linux/tcp.h>
6425 +#include <linux/net.h>
6426 +#include <linux/in.h>
6427 +#include <linux/random.h>
6428 +#include <linux/inetdevice.h>
6429 +#include <linux/workqueue.h>
6430 +#include <linux/atomic.h>
6431 +#include <linux/sysctl.h>
6433 +static struct kmem_cache *mptcp_sock_cache __read_mostly;
6434 +static struct kmem_cache *mptcp_cb_cache __read_mostly;
6435 +static struct kmem_cache *mptcp_tw_cache __read_mostly;
6437 +int sysctl_mptcp_enabled __read_mostly = 1;
6438 +int sysctl_mptcp_checksum __read_mostly = 1;
6439 +int sysctl_mptcp_debug __read_mostly;
6440 +EXPORT_SYMBOL(sysctl_mptcp_debug);
6441 +int sysctl_mptcp_syn_retries __read_mostly = 3;
6443 +bool mptcp_init_failed __read_mostly;
6445 +static int proc_mptcp_path_manager(ctl_table *ctl, int write,
6446 + void __user *buffer, size_t *lenp,
6447 + loff_t *ppos)
6449 + char val[MPTCP_PM_NAME_MAX];
6450 + ctl_table tbl = {
6451 + .data = val,
6452 + .maxlen = MPTCP_PM_NAME_MAX,
6453 + };
6454 + int ret;
6456 + mptcp_get_default_path_manager(val);
6458 + ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
6459 + if (write && ret == 0)
6460 + ret = mptcp_set_default_path_manager(val);
6461 + return ret;
6464 +static struct ctl_table mptcp_table[] = {
6466 + .procname = "mptcp_enabled",
6467 + .data = &sysctl_mptcp_enabled,
6468 + .maxlen = sizeof(int),
6469 + .mode = 0644,
6470 + .proc_handler = &proc_dointvec
6471 + },
6473 + .procname = "mptcp_checksum",
6474 + .data = &sysctl_mptcp_checksum,
6475 + .maxlen = sizeof(int),
6476 + .mode = 0644,
6477 + .proc_handler = &proc_dointvec
6478 + },
6480 + .procname = "mptcp_debug",
6481 + .data = &sysctl_mptcp_debug,
6482 + .maxlen = sizeof(int),
6483 + .mode = 0644,
6484 + .proc_handler = &proc_dointvec
6485 + },
6487 + .procname = "mptcp_syn_retries",
6488 + .data = &sysctl_mptcp_syn_retries,
6489 + .maxlen = sizeof(int),
6490 + .mode = 0644,
6491 + .proc_handler = &proc_dointvec
6492 + },
6494 + .procname = "mptcp_path_manager",
6495 + .mode = 0644,
6496 + .maxlen = MPTCP_PM_NAME_MAX,
6497 + .proc_handler = proc_mptcp_path_manager,
6498 + },
6499 + { }
6502 +static inline u32 mptcp_hash_tk(u32 token)
6504 + return token % MPTCP_HASH_SIZE;
6507 +struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
6508 +EXPORT_SYMBOL(tk_hashtable);
6510 +/* This second hashtable is needed to retrieve request socks
6511 + * created as a result of a join request. While the SYN contains
6512 + * the token, the final ack does not, so we need a separate hashtable
6513 + * to retrieve the mpcb.
6514 + */
6515 +struct list_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
6516 +spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
6518 +/* The following hash table is used to avoid collision of token */
6519 +static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE];
6520 +spinlock_t mptcp_tk_hashlock; /* hashtable protection */
6522 +static int mptcp_reqsk_find_tk(u32 token)
6524 + u32 hash = mptcp_hash_tk(token);
6525 + struct mptcp_request_sock *mtreqsk;
6526 + const struct hlist_nulls_node *node;
6528 + hlist_nulls_for_each_entry_rcu(mtreqsk, node,
6529 + &mptcp_reqsk_tk_htb[hash], collide_tk) {
6530 + if (token == mtreqsk->mptcp_loc_token)
6531 + return 1;
6533 + return 0;
6536 +static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, u32 token)
6538 + u32 hash = mptcp_hash_tk(token);
6540 + hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->collide_tk,
6541 + &mptcp_reqsk_tk_htb[hash]);
6544 +static void mptcp_reqsk_remove_tk(struct request_sock *reqsk)
6546 + rcu_read_lock();
6547 + spin_lock(&mptcp_tk_hashlock);
6548 + hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->collide_tk);
6549 + spin_unlock(&mptcp_tk_hashlock);
6550 + rcu_read_unlock();
6553 +void mptcp_reqsk_destructor(struct request_sock *req)
6555 + if (!mptcp_rsk(req)->mpcb) {
6556 + if (in_softirq()) {
6557 + mptcp_reqsk_remove_tk(req);
6558 + } else {
6559 + rcu_read_lock_bh();
6560 + spin_lock(&mptcp_tk_hashlock);
6561 + hlist_nulls_del_init_rcu(&mptcp_rsk(req)->collide_tk);
6562 + spin_unlock(&mptcp_tk_hashlock);
6563 + rcu_read_unlock_bh();
6565 + } else {
6566 + mptcp_hash_request_remove(req);
6570 +static void __mptcp_hash_insert(struct tcp_sock *meta_tp, u32 token)
6572 + u32 hash = mptcp_hash_tk(token);
6573 + hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]);
6574 + meta_tp->inside_tk_table = 1;
6577 +static int mptcp_find_token(u32 token)
6579 + u32 hash = mptcp_hash_tk(token);
6580 + struct tcp_sock *meta_tp;
6581 + const struct hlist_nulls_node *node;
6583 + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) {
6584 + if (token == meta_tp->mptcp_loc_token)
6585 + return 1;
6587 + return 0;
6590 +static void mptcp_set_key_reqsk(struct request_sock *req,
6591 + const struct sk_buff *skb)
6593 + struct inet_request_sock *ireq = inet_rsk(req);
6594 + struct mptcp_request_sock *mtreq = mptcp_rsk(req);
6596 + if (skb->protocol == htons(ETH_P_IP)) {
6597 + mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr,
6598 + ip_hdr(skb)->daddr,
6599 + htons(ireq->ir_num),
6600 + ireq->ir_rmt_port);
6601 +#if IS_ENABLED(CONFIG_IPV6)
6602 + } else {
6603 + mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32,
6604 + ipv6_hdr(skb)->daddr.s6_addr32,
6605 + htons(ireq->ir_num),
6606 + ireq->ir_rmt_port);
6607 +#endif
6610 + mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
6613 +/* New MPTCP-connection request, prepare a new token for the meta-socket that
6614 + * will be created in mptcp_check_req_master(), and store the received token.
6615 + */
6616 +void mptcp_reqsk_new_mptcp(struct request_sock *req,
6617 + const struct tcp_options_received *rx_opt,
6618 + const struct mptcp_options_received *mopt,
6619 + const struct sk_buff *skb)
6621 + struct mptcp_request_sock *mtreq = mptcp_rsk(req);
6623 + tcp_rsk(req)->saw_mpc = 1;
6625 + rcu_read_lock();
6626 + spin_lock(&mptcp_tk_hashlock);
6627 + do {
6628 + mptcp_set_key_reqsk(req, skb);
6629 + } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
6630 + mptcp_find_token(mtreq->mptcp_loc_token));
6632 + mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token);
6633 + spin_unlock(&mptcp_tk_hashlock);
6634 + rcu_read_unlock();
6635 + mtreq->mptcp_rem_key = mopt->mptcp_key;
6638 +static void mptcp_set_key_sk(struct sock *sk)
6640 + struct tcp_sock *tp = tcp_sk(sk);
6641 + struct inet_sock *isk = inet_sk(sk);
6643 + if (sk->sk_family == AF_INET)
6644 + tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr,
6645 + isk->inet_daddr,
6646 + isk->inet_sport,
6647 + isk->inet_dport);
6648 +#if IS_ENABLED(CONFIG_IPV6)
6649 + else
6650 + tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32,
6651 + sk->sk_v6_daddr.s6_addr32,
6652 + isk->inet_sport,
6653 + isk->inet_dport);
6654 +#endif
6656 + mptcp_key_sha1(tp->mptcp_loc_key,
6657 + &tp->mptcp_loc_token, NULL);
6660 +void mptcp_connect_init(struct sock *sk)
6662 + struct tcp_sock *tp = tcp_sk(sk);
6664 + rcu_read_lock_bh();
6665 + spin_lock(&mptcp_tk_hashlock);
6666 + do {
6667 + mptcp_set_key_sk(sk);
6668 + } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) ||
6669 + mptcp_find_token(tp->mptcp_loc_token));
6671 + __mptcp_hash_insert(tp, tp->mptcp_loc_token);
6672 + spin_unlock(&mptcp_tk_hashlock);
6673 + rcu_read_unlock_bh();
6676 +/**
6677 + * This function increments the refcount of the mpcb struct.
6678 + * It is the responsibility of the caller to decrement when releasing
6679 + * the structure.
6680 + */
6681 +struct sock *mptcp_hash_find(struct net *net, u32 token)
6683 + u32 hash = mptcp_hash_tk(token);
6684 + struct tcp_sock *meta_tp;
6685 + struct sock *meta_sk = NULL;
6686 + struct hlist_nulls_node *node;
6688 + rcu_read_lock();
6689 + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash],
6690 + tk_table) {
6691 + meta_sk = (struct sock *)meta_tp;
6692 + if (token == meta_tp->mptcp_loc_token &&
6693 + net_eq(net, sock_net(meta_sk)) &&
6694 + atomic_inc_not_zero(&meta_sk->sk_refcnt))
6695 + break;
6696 + meta_sk = NULL;
6698 + rcu_read_unlock();
6699 + return meta_sk;
6702 +void mptcp_hash_remove_bh(struct tcp_sock *meta_tp)
6704 + /* remove from the token hashtable */
6705 + rcu_read_lock_bh();
6706 + spin_lock(&mptcp_tk_hashlock);
6707 + hlist_nulls_del_init_rcu(&meta_tp->tk_table);
6708 + meta_tp->inside_tk_table = 0;
6709 + spin_unlock(&mptcp_tk_hashlock);
6710 + rcu_read_unlock_bh();
6713 +void mptcp_hash_remove(struct tcp_sock *meta_tp)
6715 + rcu_read_lock();
6716 + spin_lock(&mptcp_tk_hashlock);
6717 + hlist_nulls_del_init_rcu(&meta_tp->tk_table);
6718 + meta_tp->inside_tk_table = 0;
6719 + spin_unlock(&mptcp_tk_hashlock);
6720 + rcu_read_unlock();
6723 +static struct sock *mptcp_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6724 + struct request_sock *req,
6725 + struct dst_entry *dst)
6727 +#if IS_ENABLED(CONFIG_IPV6)
6728 + if (sk->sk_family == AF_INET6)
6729 + return tcp_v6_syn_recv_sock(sk, skb, req, dst);
6731 + /* sk->sk_family == AF_INET */
6732 + if (req->rsk_ops->family == AF_INET6)
6733 + return mptcp_v6v4_syn_recv_sock(sk, skb, req, dst);
6734 +#endif
6736 + /* sk->sk_family == AF_INET && req->rsk_ops->family == AF_INET */
6737 + return tcp_v4_syn_recv_sock(sk, skb, req, dst);
6740 +struct sock *mptcp_select_ack_sock(const struct sock *meta_sk, int copied)
6742 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
6743 + struct sock *sk, *subsk = NULL;
6744 + u32 max_data_seq = 0;
6745 + /* max_data_seq initialized to correct compiler-warning.
6746 + * But the initialization is handled by max_data_seq_set
6747 + */
6748 + short max_data_seq_set = 0;
6749 + u32 min_time = 0xffffffff;
6751 + /* How do we select the subflow to send the window-update on?
6753 + * 1. He has to be in a state where he can send an ack and is
6754 + * operational (pf = 0).
6755 + * 2. He has to be one of those subflow who recently
6756 + * contributed to the received stream
6757 + * (this guarantees a working subflow)
6758 + * a) its latest data_seq received is after the original
6759 + * copied_seq.
6760 + * We select the one with the lowest rtt, so that the
6761 + * window-update reaches our peer the fastest.
6762 + * b) if no subflow has this kind of data_seq (e.g., very
6763 + * strange meta-level retransmissions going on), we take
6764 + * the subflow who last sent the highest data_seq.
6765 + */
6766 + mptcp_for_each_sk(meta_tp->mpcb, sk) {
6767 + struct tcp_sock *tp = tcp_sk(sk);
6769 + if (!mptcp_sk_can_send_ack(sk) || tp->pf)
6770 + continue;
6772 + /* Select among those who contributed to the
6773 + * current receive-queue.
6774 + */
6775 + if (copied && after(tp->mptcp->last_data_seq, meta_tp->copied_seq - copied)) {
6776 + if (tp->srtt < min_time) {
6777 + min_time = tp->srtt;
6778 + subsk = sk;
6779 + max_data_seq_set = 0;
6781 + continue;
6784 + if (!subsk && !max_data_seq_set) {
6785 + max_data_seq = tp->mptcp->last_data_seq;
6786 + max_data_seq_set = 1;
6787 + subsk = sk;
6790 + /* Otherwise, take the one with the highest data_seq */
6791 + if ((!subsk || max_data_seq_set) &&
6792 + after(tp->mptcp->last_data_seq, max_data_seq)) {
6793 + max_data_seq = tp->mptcp->last_data_seq;
6794 + subsk = sk;
6798 + if (!subsk) {
6799 + mptcp_debug("%s subsk is null, copied %d, cseq %u\n", __func__,
6800 + copied, meta_tp->copied_seq);
6801 + mptcp_for_each_sk(meta_tp->mpcb, sk) {
6802 + struct tcp_sock *tp = tcp_sk(sk);
6803 + mptcp_debug("%s pi %d state %u last_dseq %u\n",
6804 + __func__, tp->mptcp->path_index, sk->sk_state,
6805 + tp->mptcp->last_data_seq);
6809 + return subsk;
6811 +EXPORT_SYMBOL(mptcp_select_ack_sock);
6813 +static void mptcp_sock_def_error_report(struct sock *sk)
6815 + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
6817 + if (!sock_flag(sk, SOCK_DEAD))
6818 + mptcp_sub_close(sk, 0);
6820 + if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd ||
6821 + mpcb->send_infinite_mapping) {
6822 + struct sock *meta_sk = mptcp_meta_sk(sk);
6824 + meta_sk->sk_err = sk->sk_err;
6825 + meta_sk->sk_err_soft = sk->sk_err_soft;
6827 + if (!sock_flag(meta_sk, SOCK_DEAD))
6828 + meta_sk->sk_error_report(meta_sk);
6830 + tcp_done(meta_sk);
6833 + sk->sk_err = 0;
6834 + return;
6837 +static void mptcp_mpcb_put(struct mptcp_cb *mpcb)
6839 + if (atomic_dec_and_test(&mpcb->mpcb_refcnt)) {
6840 + mptcp_cleanup_path_manager(mpcb);
6841 + kmem_cache_free(mptcp_cb_cache, mpcb);
6845 +static void mptcp_sock_destruct(struct sock *sk)
6847 + struct tcp_sock *tp = tcp_sk(sk);
6849 + inet_sock_destruct(sk);
6851 + BUG_ON(!list_empty(&tp->mptcp->cb_list));
6853 + kmem_cache_free(mptcp_sock_cache, tp->mptcp);
6854 + tp->mptcp = NULL;
6856 + if (!is_meta_sk(sk) && !tp->was_meta_sk) {
6857 + /* Taken when mpcb pointer was set */
6858 + sock_put(mptcp_meta_sk(sk));
6859 + mptcp_mpcb_put(tp->mpcb);
6860 + } else {
6861 + struct mptcp_cb *mpcb = tp->mpcb;
6862 + struct mptcp_tw *mptw;
6864 + /* The mpcb is disappearing - we can make the final
6865 + * update to the rcv_nxt of the time-wait-sock and remove
6866 + * its reference to the mpcb.
6867 + */
6868 + spin_lock_bh(&mpcb->tw_lock);
6869 + list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) {
6870 + list_del_rcu(&mptw->list);
6871 + mptw->in_list = 0;
6872 + mptcp_mpcb_put(mpcb);
6873 + rcu_assign_pointer(mptw->mpcb, NULL);
6875 + spin_unlock_bh(&mpcb->tw_lock);
6877 + mptcp_mpcb_put(mpcb);
6879 + mptcp_debug("%s destroying meta-sk\n", __func__);
6883 +void mptcp_destroy_sock(struct sock *sk)
6885 + if (is_meta_sk(sk)) {
6886 + struct sock *sk_it, *tmpsk;
6888 + __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue);
6889 + mptcp_purge_ofo_queue(tcp_sk(sk));
6891 + /* We have to close all remaining subflows. Normally, they
6892 + * should all be about to get closed. But, if the kernel is
6893 + * forcing a closure (e.g., tcp_write_err), the subflows might
6894 + * not have been closed properly (as we are waiting for the
6895 + * DATA_ACK of the DATA_FIN).
6896 + */
6897 + mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) {
6898 + /* Already did call tcp_close - waiting for graceful
6899 + * closure, or if we are retransmitting fast-close on
6900 + * the subflow. The reset (or timeout) will kill the
6901 + * subflow..
6902 + */
6903 + if (tcp_sk(sk_it)->closing ||
6904 + tcp_sk(sk_it)->send_mp_fclose)
6905 + continue;
6907 + /* Allow the delayed work first to prevent time-wait state */
6908 + if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work))
6909 + continue;
6911 + mptcp_sub_close(sk_it, 0);
6913 + } else {
6914 + mptcp_del_sock(sk);
6918 +static void mptcp_set_state(struct sock *sk)
6920 + struct sock *meta_sk = mptcp_meta_sk(sk);
6922 + /* Meta is not yet established - wake up the application */
6923 + if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) &&
6924 + sk->sk_state == TCP_ESTABLISHED) {
6925 + tcp_set_state(meta_sk, TCP_ESTABLISHED);
6927 + if (!sock_flag(meta_sk, SOCK_DEAD)) {
6928 + meta_sk->sk_state_change(meta_sk);
6929 + sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT);
6933 + if (sk->sk_state == TCP_ESTABLISHED) {
6934 + tcp_sk(sk)->mptcp->establish_increased = 1;
6935 + tcp_sk(sk)->mpcb->cnt_established++;
6939 +u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned;
6940 +u32 mptcp_key_seed = 0;
6942 +void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn)
6944 + u32 workspace[SHA_WORKSPACE_WORDS];
6945 + u32 mptcp_hashed_key[SHA_DIGEST_WORDS];
6946 + u8 input[64];
6947 + int i;
6949 + memset(workspace, 0, sizeof(workspace));
6951 + /* Initialize input with appropriate padding */
6952 + memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte
6953 + * is explicitly set too */
6954 + memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */
6955 + input[8] = 0x80; /* Padding: First bit after message = 1 */
6956 + input[63] = 0x40; /* Padding: Length of the message = 64 bits */
6958 + sha_init(mptcp_hashed_key);
6959 + sha_transform(mptcp_hashed_key, input, workspace);
6961 + for (i = 0; i < 5; i++)
6962 + mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]);
6964 + if (token)
6965 + *token = mptcp_hashed_key[0];
6966 + if (idsn)
6967 + *idsn = *((u64 *)&mptcp_hashed_key[3]);
6970 +void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
6971 + u32 *hash_out)
6973 + u32 workspace[SHA_WORKSPACE_WORDS];
6974 + u8 input[128]; /* 2 512-bit blocks */
6975 + int i;
6977 + memset(workspace, 0, sizeof(workspace));
6979 + /* Generate key xored with ipad */
6980 + memset(input, 0x36, 64);
6981 + for (i = 0; i < 8; i++)
6982 + input[i] ^= key_1[i];
6983 + for (i = 0; i < 8; i++)
6984 + input[i + 8] ^= key_2[i];
6986 + memcpy(&input[64], rand_1, 4);
6987 + memcpy(&input[68], rand_2, 4);
6988 + input[72] = 0x80; /* Padding: First bit after message = 1 */
6989 + memset(&input[73], 0, 53);
6991 + /* Padding: Length of the message = 512 + 64 bits */
6992 + input[126] = 0x02;
6993 + input[127] = 0x40;
6995 + sha_init(hash_out);
6996 + sha_transform(hash_out, input, workspace);
6997 + memset(workspace, 0, sizeof(workspace));
6999 + sha_transform(hash_out, &input[64], workspace);
7000 + memset(workspace, 0, sizeof(workspace));
7002 + for (i = 0; i < 5; i++)
7003 + hash_out[i] = cpu_to_be32(hash_out[i]);
7005 + /* Prepare second part of hmac */
7006 + memset(input, 0x5C, 64);
7007 + for (i = 0; i < 8; i++)
7008 + input[i] ^= key_1[i];
7009 + for (i = 0; i < 8; i++)
7010 + input[i + 8] ^= key_2[i];
7012 + memcpy(&input[64], hash_out, 20);
7013 + input[84] = 0x80;
7014 + memset(&input[85], 0, 41);
7016 + /* Padding: Length of the message = 512 + 160 bits */
7017 + input[126] = 0x02;
7018 + input[127] = 0xA0;
7020 + sha_init(hash_out);
7021 + sha_transform(hash_out, input, workspace);
7022 + memset(workspace, 0, sizeof(workspace));
7024 + sha_transform(hash_out, &input[64], workspace);
7026 + for (i = 0; i < 5; i++)
7027 + hash_out[i] = cpu_to_be32(hash_out[i]);
7030 +static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk)
7032 + /* Socket-options handled by mptcp_inherit_sk while creating the meta-sk.
7033 + * ======
7034 + * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT,
7035 + * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER,
7036 + * TCP_NODELAY, TCP_CORK
7038 + * Socket-options handled in this function here
7039 + * ======
7040 + * TCP_DEFER_ACCEPT
7042 + * Socket-options on the todo-list
7043 + * ======
7044 + * SO_BINDTODEVICE - should probably prevent creation of new subsocks
7045 + * across other devices. - what about the api-draft?
7046 + * SO_DEBUG
7047 + * SO_REUSEADDR - probably we don't care about this
7048 + * SO_DONTROUTE, SO_BROADCAST
7049 + * SO_OOBINLINE
7050 + * SO_LINGER
7051 + * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM
7052 + * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM
7053 + * SO_RXQ_OVFL
7054 + * TCP_COOKIE_TRANSACTIONS
7055 + * TCP_MAXSEG
7056 + * TCP_THIN_* - Handled by mptcp_inherit_sk, but we need to support this
7057 + * in mptcp_retransmit_timer. AND we need to check what is
7058 + * about the subsockets.
7059 + * TCP_LINGER2
7060 + * TCP_WINDOW_CLAMP
7061 + * TCP_USER_TIMEOUT
7062 + * TCP_MD5SIG
7064 + * Socket-options of no concern for the meta-socket (but for the subsocket)
7065 + * ======
7066 + * SO_PRIORITY
7067 + * SO_MARK
7068 + * TCP_CONGESTION
7069 + * TCP_SYNCNT
7070 + * TCP_QUICKACK
7071 + * SO_KEEPALIVE
7072 + */
7074 + /****** DEFER_ACCEPT-handler ******/
7076 + /* DEFER_ACCEPT is not of concern for new subflows - we always accept
7077 + * them
7078 + */
7079 + inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0;
7082 +static void mptcp_sub_inherit_sockopts(struct sock *meta_sk, struct sock *sub_sk)
7084 + /* IP_TOS also goes to the subflow. */
7085 + if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) {
7086 + inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos;
7087 + sub_sk->sk_priority = meta_sk->sk_priority;
7088 + sk_dst_reset(sub_sk);
7091 + /* Inherit SO_REUSEADDR */
7092 + sub_sk->sk_reuse = meta_sk->sk_reuse;
7094 + /* Inherit snd/rcv-buffer locks */
7095 + sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
7098 +int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb)
7100 + /* skb-sk may be NULL if we receive a packet immediately after the
7101 + * SYN/ACK + MP_CAPABLE.
7102 + */
7103 + struct sock *sk = skb->sk ? skb->sk : meta_sk;
7104 + int ret = 0;
7106 + skb->sk = NULL;
7108 + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
7109 + kfree_skb(skb);
7110 + return 0;
7113 + if (sk->sk_family == AF_INET)
7114 + ret = tcp_v4_do_rcv(sk, skb);
7115 +#if IS_ENABLED(CONFIG_IPV6)
7116 + else
7117 + ret = tcp_v6_do_rcv(sk, skb);
7118 +#endif
7120 + sock_put(sk);
7121 + return ret;
7124 +struct lock_class_key meta_key;
7125 +struct lock_class_key meta_slock_key;
7127 +/* Code heavily inspired from sk_clone() */
7128 +static int mptcp_inherit_sk(const struct sock *sk, struct sock *newsk,
7129 + int family, const gfp_t flags)
7131 + struct sk_filter *filter;
7132 + struct proto *prot = newsk->sk_prot;
7133 + const struct inet_connection_sock_af_ops *af_ops = inet_csk(newsk)->icsk_af_ops;
7134 +#ifdef CONFIG_SECURITY_NETWORK
7135 + void *sptr = newsk->sk_security;
7136 +#endif
7138 + if (sk->sk_family == AF_INET) {
7139 + memcpy(newsk, sk, offsetof(struct sock, sk_dontcopy_begin));
7140 + memcpy(&newsk->sk_dontcopy_end, &sk->sk_dontcopy_end,
7141 + sizeof(struct tcp_sock) - offsetof(struct sock, sk_dontcopy_end));
7142 + } else {
7143 + memcpy(newsk, sk, offsetof(struct sock, sk_dontcopy_begin));
7144 + memcpy(&newsk->sk_dontcopy_end, &sk->sk_dontcopy_end,
7145 + sizeof(struct tcp6_sock) - offsetof(struct sock, sk_dontcopy_end));
7148 +#ifdef CONFIG_SECURITY_NETWORK
7149 + newsk->sk_security = sptr;
7150 + security_sk_clone(sk, newsk);
7151 +#endif
7153 + /* Has been changed by sock_copy above - we may need an IPv6-socket */
7154 + newsk->sk_family = family;
7155 + newsk->sk_prot = prot;
7156 + newsk->sk_prot_creator = prot;
7157 + inet_csk(newsk)->icsk_af_ops = af_ops;
7159 + /* We don't yet have the mptcp-point. Thus we still need inet_sock_destruct */
7160 + newsk->sk_destruct = inet_sock_destruct;
7162 + /* SANITY */
7163 + get_net(sock_net(newsk));
7164 + sk_node_init(&newsk->sk_node);
7165 + sock_lock_init_class_and_name(newsk, "slock-AF_INET-MPTCP",
7166 + &meta_slock_key, "sk_lock-AF_INET-MPTCP",
7167 + &meta_key);
7169 + /* Unlocks are in:
7171 + * 1. If we are creating the master-sk
7172 + * * on client-side in tcp_rcv_state_process, "case TCP_SYN_SENT"
7173 + * * on server-side in tcp_child_process
7174 + * 2. If we are creating another subsock
7175 + * * Also in tcp_child_process
7176 + */
7177 + bh_lock_sock(newsk);
7178 + newsk->sk_backlog.head = NULL;
7179 + newsk->sk_backlog.tail = NULL;
7180 + newsk->sk_backlog.len = 0;
7182 + atomic_set(&newsk->sk_rmem_alloc, 0);
7183 + atomic_set(&newsk->sk_wmem_alloc, 1);
7184 + atomic_set(&newsk->sk_omem_alloc, 0);
7186 + skb_queue_head_init(&newsk->sk_receive_queue);
7187 + skb_queue_head_init(&newsk->sk_write_queue);
7188 +#ifdef CONFIG_NET_DMA
7189 + skb_queue_head_init(&newsk->sk_async_wait_queue);
7190 +#endif
7192 + spin_lock_init(&newsk->sk_dst_lock);
7193 + rwlock_init(&newsk->sk_callback_lock);
7194 + lockdep_set_class_and_name(&newsk->sk_callback_lock,
7195 + af_callback_keys + newsk->sk_family,
7196 + af_family_clock_key_strings[newsk->sk_family]);
7197 + newsk->sk_dst_cache = NULL;
7198 + newsk->sk_rx_dst = NULL;
7199 + newsk->sk_wmem_queued = 0;
7200 + newsk->sk_forward_alloc = 0;
7201 + newsk->sk_send_head = NULL;
7202 + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
7204 + tcp_sk(newsk)->mptcp = NULL;
7206 + sock_reset_flag(newsk, SOCK_DONE);
7207 + skb_queue_head_init(&newsk->sk_error_queue);
7209 + filter = rcu_dereference_protected(newsk->sk_filter, 1);
7210 + if (filter != NULL)
7211 + sk_filter_charge(newsk, filter);
7213 + if (unlikely(xfrm_sk_clone_policy(newsk))) {
7214 + /* It is still raw copy of parent, so invalidate
7215 + * destructor and make plain sk_free()
7216 + */
7217 + newsk->sk_destruct = NULL;
7218 + bh_unlock_sock(newsk);
7219 + sk_free(newsk);
7220 + newsk = NULL;
7221 + return -ENOMEM;
7224 + newsk->sk_err = 0;
7225 + newsk->sk_priority = 0;
7226 + /* Before updating sk_refcnt, we must commit prior changes to memory
7227 + * (Documentation/RCU/rculist_nulls.txt for details)
7228 + */
7229 + smp_wmb();
7230 + atomic_set(&newsk->sk_refcnt, 2);
7232 + /* Increment the counter in the same struct proto as the master
7233 + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
7234 + * is the same as sk->sk_prot->socks, as this field was copied
7235 + * with memcpy).
7237 + * This _changes_ the previous behaviour, where
7238 + * tcp_create_openreq_child always was incrementing the
7239 + * equivalent to tcp_prot->socks (inet_sock_nr), so this has
7240 + * to be taken into account in all callers. -acme
7241 + */
7242 + sk_refcnt_debug_inc(newsk);
7243 + sk_set_socket(newsk, NULL);
7244 + newsk->sk_wq = NULL;
7246 + if (newsk->sk_prot->sockets_allocated)
7247 + percpu_counter_inc(newsk->sk_prot->sockets_allocated);
7249 + if (sock_flag(newsk, SOCK_TIMESTAMP) ||
7250 + sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
7251 + net_enable_timestamp();
7253 + return 0;
7256 +int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window)
7258 + struct mptcp_cb *mpcb;
7259 + struct sock *master_sk;
7260 + struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk);
7261 + struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk);
7262 + struct sk_buff *skb, *tmp;
7263 + u64 idsn;
7265 + master_sk = sk_prot_alloc(meta_sk->sk_prot, GFP_ATOMIC | __GFP_ZERO,
7266 + meta_sk->sk_family);
7267 + if (!master_sk)
7268 + return -ENOBUFS;
7270 + master_tp = tcp_sk(master_sk);
7271 + master_icsk = inet_csk(master_sk);
7273 + /* Need to set this here - it is needed by mptcp_inherit_sk */
7274 + master_sk->sk_prot = meta_sk->sk_prot;
7275 + master_sk->sk_prot_creator = meta_sk->sk_prot;
7276 + master_icsk->icsk_af_ops = meta_icsk->icsk_af_ops;
7278 + mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC);
7279 + if (!mpcb) {
7280 + sk_free(master_sk);
7281 + return -ENOBUFS;
7284 + /* master_sk inherits from meta_sk */
7285 + if (mptcp_inherit_sk(meta_sk, master_sk, meta_sk->sk_family, GFP_ATOMIC)) {
7286 + kmem_cache_free(mptcp_cb_cache, mpcb);
7287 + return -ENOBUFS;
7290 +#if IS_ENABLED(CONFIG_IPV6)
7291 + if (meta_icsk->icsk_af_ops == &ipv6_mapped) {
7292 + struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
7294 + inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
7296 + newnp = inet6_sk(master_sk);
7297 + memcpy(newnp, np, sizeof(struct ipv6_pinfo));
7299 + newnp->ipv6_mc_list = NULL;
7300 + newnp->ipv6_ac_list = NULL;
7301 + newnp->ipv6_fl_list = NULL;
7302 + newnp->opt = NULL;
7303 + newnp->pktoptions = NULL;
7304 + (void)xchg(&newnp->rxpmtu, NULL);
7305 + } else if (meta_sk->sk_family == AF_INET6) {
7306 + struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
7308 + inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
7310 + newnp = inet6_sk(master_sk);
7311 + memcpy(newnp, np, sizeof(struct ipv6_pinfo));
7313 + newnp->hop_limit = -1;
7314 + newnp->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
7315 + newnp->mc_loop = 1;
7316 + newnp->pmtudisc = IPV6_PMTUDISC_WANT;
7317 + newnp->ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only;
7319 +#endif
7321 + meta_tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, GFP_ATOMIC);
7322 + if (!meta_tp->mptcp) {
7323 + kmem_cache_free(mptcp_cb_cache, mpcb);
7324 + sk_free(master_sk);
7325 + return -ENOBUFS;
7328 + INIT_LIST_HEAD(&meta_tp->mptcp->cb_list);
7330 + /* Store the keys and generate the peer's token */
7331 + mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key;
7332 + mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token;
7334 + /* Generate Initial data-sequence-numbers */
7335 + mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn);
7336 + idsn = ntohll(idsn) + 1;
7337 + mpcb->snd_high_order[0] = idsn >> 32;
7338 + mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1;
7340 + meta_tp->write_seq = (u32)idsn;
7341 + meta_tp->snd_sml = meta_tp->write_seq;
7342 + meta_tp->snd_una = meta_tp->write_seq;
7343 + meta_tp->snd_nxt = meta_tp->write_seq;
7344 + meta_tp->pushed_seq = meta_tp->write_seq;
7345 + meta_tp->snd_up = meta_tp->write_seq;
7347 + mpcb->mptcp_rem_key = remote_key;
7348 + mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn);
7349 + idsn = ntohll(idsn) + 1;
7350 + mpcb->rcv_high_order[0] = idsn >> 32;
7351 + mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
7352 + meta_tp->copied_seq = (u32) idsn;
7353 + meta_tp->rcv_nxt = (u32) idsn;
7354 + meta_tp->rcv_wup = (u32) idsn;
7356 + meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
7357 + meta_tp->snd_wnd = window;
7358 + meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */
7360 + meta_tp->packets_out = 0;
7361 + meta_tp->mptcp->snt_isn = meta_tp->write_seq; /* Initial data-sequence-number */
7362 + meta_icsk->icsk_probes_out = 0;
7364 + /* Set mptcp-pointers */
7365 + master_tp->mpcb = mpcb;
7366 + master_tp->meta_sk = meta_sk;
7367 + meta_tp->mpcb = mpcb;
7368 + meta_tp->meta_sk = meta_sk;
7369 + mpcb->meta_sk = meta_sk;
7370 + mpcb->master_sk = master_sk;
7372 + set_mpc(meta_tp);
7373 + meta_tp->mptcp->attached = 0;
7374 + meta_tp->was_meta_sk = 0;
7376 + /* Initialize the queues */
7377 + skb_queue_head_init(&mpcb->reinject_queue);
7378 + skb_queue_head_init(&master_tp->out_of_order_queue);
7379 + tcp_prequeue_init(master_tp);
7380 + INIT_LIST_HEAD(&master_tp->tsq_node);
7382 + master_tp->tsq_flags = 0;
7384 + /* Copy the write-queue from the meta down to the master.
7385 + * This is necessary to get the SYN to the master-write-queue.
7386 + * No other data can be queued, before tcp_sendmsg waits for the
7387 + * connection to finish.
7388 + */
7389 + skb_queue_walk_safe(&meta_sk->sk_write_queue, skb, tmp) {
7390 + skb_unlink(skb, &meta_sk->sk_write_queue);
7391 + skb_queue_tail(&master_sk->sk_write_queue, skb);
7393 + master_sk->sk_wmem_queued += skb->truesize;
7394 + sk_mem_charge(master_sk, skb->truesize);
7397 + meta_sk->sk_wmem_queued = 0;
7398 + meta_sk->sk_forward_alloc = 0;
7400 + mutex_init(&mpcb->mpcb_mutex);
7402 + /* Init the accept_queue structure. We support a queue of 32 pending
7403 + * connections; it does not need to be huge, since we only store
7404 + * pending subflow creations here.
7405 + */
7406 + if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) {
7407 + inet_put_port(master_sk);
7408 + kmem_cache_free(mptcp_sock_cache, meta_tp->mptcp);
7409 + kmem_cache_free(mptcp_cb_cache, mpcb);
7410 + sk_free(master_sk);
7411 + reset_mpc(meta_tp);
7412 + return -ENOMEM;
7415 + /* Redefine function-pointers as the meta-sk is now fully ready */
7416 + meta_sk->sk_backlog_rcv = mptcp_backlog_rcv;
7417 + meta_sk->sk_destruct = mptcp_sock_destruct;
7418 + mpcb->syn_recv_sock = mptcp_syn_recv_sock;
7420 + /* Meta-level retransmit timer */
7421 + meta_icsk->icsk_rto *= 2; /* Double the initial RTO */
7423 + tcp_init_xmit_timers(master_sk);
7424 + /* Has been set for sending out the SYN */
7425 + inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS);
7427 + if (!meta_tp->inside_tk_table) {
7428 + /* Adding the meta_tp in the token hashtable - coming from server-side */
7429 + rcu_read_lock();
7430 + spin_lock(&mptcp_tk_hashlock);
7432 + __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token);
7434 + spin_unlock(&mptcp_tk_hashlock);
7435 + rcu_read_unlock();
7437 + master_tp->inside_tk_table = 0;
7439 + /* Init time-wait stuff */
7440 + INIT_LIST_HEAD(&mpcb->tw_list);
7441 + spin_lock_init(&mpcb->tw_lock);
7443 + INIT_LIST_HEAD(&mpcb->callback_list);
7445 + mptcp_mpcb_inherit_sockopts(meta_sk, master_sk);
7447 + mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf;
7448 + mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf;
7449 + mpcb->orig_window_clamp = meta_tp->window_clamp;
7451 + /* The meta is directly linked - set refcnt to 1 */
7452 + atomic_set(&mpcb->mpcb_refcnt, 1);
7454 + mptcp_init_path_manager(mpcb);
7456 + mptcp_debug("%s: created mpcb with token %#x\n",
7457 + __func__, mpcb->mptcp_loc_token);
7459 + return 0;
7462 +struct sock *mptcp_sk_clone(const struct sock *sk, int family,
7463 + const gfp_t priority)
7465 + struct sock *newsk = NULL;
7467 + if (family == AF_INET && sk->sk_family == AF_INET) {
7468 + newsk = sk_prot_alloc(&tcp_prot, priority, family);
7469 + if (!newsk)
7470 + return NULL;
7472 + /* Set these pointers - they are needed by mptcp_inherit_sk */
7473 + newsk->sk_prot = &tcp_prot;
7474 + newsk->sk_prot_creator = &tcp_prot;
7475 + inet_csk(newsk)->icsk_af_ops = &ipv4_specific;
7476 + newsk->sk_family = AF_INET;
7478 +#if IS_ENABLED(CONFIG_IPV6)
7479 + else {
7480 + newsk = sk_prot_alloc(&tcpv6_prot, priority, family);
7481 + if (!newsk)
7482 + return NULL;
7484 + newsk->sk_prot = &tcpv6_prot;
7485 + newsk->sk_prot_creator = &tcpv6_prot;
7486 + if (family == AF_INET)
7487 + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
7488 + else
7489 + inet_csk(newsk)->icsk_af_ops = &ipv6_specific;
7490 + newsk->sk_family = AF_INET6;
7492 +#endif
7494 + if (mptcp_inherit_sk(sk, newsk, family, priority))
7495 + return NULL;
7497 + return newsk;
7500 +void mptcp_fallback_meta_sk(struct sock *meta_sk)
7502 + kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt);
7503 + kmem_cache_free(mptcp_sock_cache, tcp_sk(meta_sk)->mptcp);
7504 + kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb);
7507 +int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
7508 + gfp_t flags)
7510 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
7511 + struct tcp_sock *tp = tcp_sk(sk);
7513 + tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags);
7514 + if (!tp->mptcp)
7515 + return -ENOMEM;
7517 + tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb);
7518 + /* No more space for more subflows? */
7519 + if (!tp->mptcp->path_index) {
7520 + kmem_cache_free(mptcp_sock_cache, tp->mptcp);
7521 + return -EPERM;
7524 + INIT_LIST_HEAD(&tp->mptcp->cb_list);
7526 + tp->mptcp->tp = tp;
7527 + tp->mpcb = mpcb;
7528 + tp->meta_sk = meta_sk;
7529 + set_mpc(tp);
7530 + tp->mptcp->loc_id = loc_id;
7531 + tp->mptcp->rem_id = rem_id;
7532 + tp->mptcp->last_rbuf_opti = tcp_time_stamp;
7534 + /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be
7535 + * included in mptcp_del_sock(), because the mpcb must remain alive
7536 + * until the last subsocket is completely destroyed.
7537 + */
7538 + sock_hold(meta_sk);
7539 + atomic_inc(&mpcb->mpcb_refcnt);
7541 + tp->mptcp->next = mpcb->connection_list;
7542 + mpcb->connection_list = tp;
7543 + tp->mptcp->attached = 1;
7545 + mpcb->cnt_subflows++;
7546 + atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc),
7547 + &meta_sk->sk_rmem_alloc);
7549 + mptcp_sub_inherit_sockopts(meta_sk, sk);
7550 + INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq);
7552 + /* As we successfully allocated the mptcp_tcp_sock, we have to
7553 + * change the function-pointers here (for sk_destruct to work correctly)
7554 + */
7555 + sk->sk_error_report = mptcp_sock_def_error_report;
7556 + sk->sk_data_ready = mptcp_data_ready;
7557 + sk->sk_write_space = mptcp_write_space;
7558 + sk->sk_state_change = mptcp_set_state;
7559 + sk->sk_destruct = mptcp_sock_destruct;
7561 + if (sk->sk_family == AF_INET)
7562 + mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n",
7563 + __func__ , mpcb->mptcp_loc_token,
7564 + tp->mptcp->path_index,
7565 + &((struct inet_sock *)tp)->inet_saddr,
7566 + ntohs(((struct inet_sock *)tp)->inet_sport),
7567 + &((struct inet_sock *)tp)->inet_daddr,
7568 + ntohs(((struct inet_sock *)tp)->inet_dport),
7569 + mpcb->cnt_subflows);
7570 +#if IS_ENABLED(CONFIG_IPV6)
7571 + else
7572 + mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n",
7573 + __func__ , mpcb->mptcp_loc_token,
7574 + tp->mptcp->path_index, &inet6_sk(sk)->saddr,
7575 + ntohs(((struct inet_sock *)tp)->inet_sport),
7576 + &sk->sk_v6_daddr,
7577 + ntohs(((struct inet_sock *)tp)->inet_dport),
7578 + mpcb->cnt_subflows);
7579 +#endif
7581 + return 0;
7584 +void mptcp_del_sock(struct sock *sk)
7586 + struct tcp_sock *tp = tcp_sk(sk), *tp_prev;
7587 + struct mptcp_cb *mpcb;
7589 + if (!tp->mptcp || !tp->mptcp->attached)
7590 + return;
7592 + mpcb = tp->mpcb;
7593 + tp_prev = mpcb->connection_list;
7595 + mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n",
7596 + __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
7597 + sk->sk_state, is_meta_sk(sk));
7599 + if (tp_prev == tp) {
7600 + mpcb->connection_list = tp->mptcp->next;
7601 + } else {
7602 + for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) {
7603 + if (tp_prev->mptcp->next == tp) {
7604 + tp_prev->mptcp->next = tp->mptcp->next;
7605 + break;
7609 + mpcb->cnt_subflows--;
7610 + if (tp->mptcp->establish_increased)
7611 + mpcb->cnt_established--;
7613 + tp->mptcp->next = NULL;
7614 + tp->mptcp->attached = 0;
7615 + mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index);
7617 + if (!skb_queue_empty(&sk->sk_write_queue))
7618 + mptcp_reinject_data(sk, 0);
7620 + if (is_master_tp(tp))
7621 + mpcb->master_sk = NULL;
7622 + else if (tp->mptcp->pre_established)
7623 + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
7625 + rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL);
7628 +/* Updates the metasocket ULID/port data, based on the given sock.
7629 + * The argument sock must be the sock accessible to the application.
7630 + * In this function, we update the meta socket info, based on the changes
7631 + * in the application socket (bind, address allocation, ...)
7632 + */
7633 +void mptcp_update_metasocket(struct sock *sk, struct sock *meta_sk)
7635 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
7636 + union inet_addr addr;
7637 + int index;
7639 + /* Get the index of the local address */
7640 + if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
7641 + addr.ip = inet_sk(sk)->inet_saddr;
7642 + index = mpcb->pm_ops->get_local_index(AF_INET, &addr, sock_net(meta_sk));
7643 + } else {
7644 + addr.in6 = inet6_sk(sk)->saddr;
7645 + index = mpcb->pm_ops->get_local_index(AF_INET6, &addr, sock_net(meta_sk));
7648 + if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
7649 + mptcp_v4_add_raddress(mpcb,
7650 + (struct in_addr *)&inet_sk(sk)->inet_daddr,
7651 + 0, 0);
7652 + if (index >= 0)
7653 + mptcp_v4_set_init_addr_bit(mpcb, inet_sk(sk)->inet_daddr, index);
7654 + } else {
7655 +#if IS_ENABLED(CONFIG_IPV6)
7656 + mptcp_v6_add_raddress(mpcb, &sk->sk_v6_daddr, 0, 0);
7657 + if (index >= 0)
7658 + mptcp_v6_set_init_addr_bit(mpcb, &sk->sk_v6_daddr, index);
7659 +#endif
7662 + if (mpcb->pm_ops->new_session)
7663 + mpcb->pm_ops->new_session(meta_sk, index);
7665 + tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio;
7668 +/* Clean up the receive buffer for full frames taken by the user,
7669 + * then send an ACK if necessary. COPIED is the number of bytes
7670 + * tcp_recvmsg has given to the user so far, it speeds up the
7671 + * calculation of whether or not we must ACK for the sake of
7672 + * a window update.
7673 + */
7674 +void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied)
7676 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
7677 + struct sock *sk;
7678 + __u32 rcv_window_now = 0;
7680 + if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) {
7681 + rcv_window_now = tcp_receive_window(meta_tp);
7683 + if (2 * rcv_window_now > meta_tp->window_clamp)
7684 + rcv_window_now = 0;
7687 + mptcp_for_each_sk(meta_tp->mpcb, sk) {
7688 + struct tcp_sock *tp = tcp_sk(sk);
7689 + const struct inet_connection_sock *icsk = inet_csk(sk);
7691 + if (!mptcp_sk_can_send_ack(sk))
7692 + continue;
7694 + if (!inet_csk_ack_scheduled(sk))
7695 + goto second_part;
7696 + /* Delayed ACKs frequently hit locked sockets during bulk
7697 + * receive.
7698 + */
7699 + if (icsk->icsk_ack.blocked ||
7700 + /* Once-per-two-segments ACK was not sent by tcp_input.c */
7701 + tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
7702 + /* If this read emptied read buffer, we send ACK, if
7703 + * connection is not bidirectional, user drained
7704 + * receive buffer and there was a small segment
7705 + * in queue.
7706 + */
7707 + (copied > 0 &&
7708 + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
7709 + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
7710 + !icsk->icsk_ack.pingpong)) &&
7711 + !atomic_read(&meta_sk->sk_rmem_alloc))) {
7712 + tcp_send_ack(sk);
7713 + continue;
7716 +second_part:
7717 + /* This is the second part of tcp_cleanup_rbuf */
7718 + if (rcv_window_now) {
7719 + __u32 new_window = tp->__select_window(sk);
7721 + /* Send ACK now, if this read freed lots of space
7722 + * in our buffer. Certainly, new_window is new window.
7723 + * We can advertise it now, if it is not less than
7724 + * current one.
7725 + * "Lots" means "at least twice" here.
7726 + */
7727 + if (new_window && new_window >= 2 * rcv_window_now)
7728 + tcp_send_ack(sk);
7733 +static int mptcp_sub_send_fin(struct sock *sk)
7735 + struct tcp_sock *tp = tcp_sk(sk);
7736 + struct sk_buff *skb = tcp_write_queue_tail(sk);
7737 + int mss_now;
7739 + /* Optimization, tack on the FIN if we have a queue of
7740 + * unsent frames. But be careful about outgoing SACKS
7741 + * and IP options.
7742 + */
7743 + mss_now = tcp_current_mss(sk);
7745 + if (tcp_send_head(sk) != NULL) {
7746 + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
7747 + TCP_SKB_CB(skb)->end_seq++;
7748 + tp->write_seq++;
7749 + } else {
7750 + skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC);
7751 + if (!skb)
7752 + return 1;
7754 + /* Reserve space for headers and prepare control bits. */
7755 + skb_reserve(skb, MAX_TCP_HEADER);
7756 + /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
7757 + tcp_init_nondata_skb(skb, tp->write_seq,
7758 + TCPHDR_ACK | TCPHDR_FIN);
7759 + tcp_queue_skb(sk, skb);
7761 + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
7763 + return 0;
7766 +void mptcp_sub_close_wq(struct work_struct *work)
7768 + struct mptcp_tcp_sock *mptcp = container_of(work, struct mptcp_tcp_sock, work.work);
7769 + struct tcp_sock *tp = mptcp->tp;
7770 + struct sock *sk = (struct sock *)tp;
7771 + struct sock *meta_sk = mptcp_meta_sk(sk);
7773 + mutex_lock(&tp->mpcb->mpcb_mutex);
7774 + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
7776 + if (sock_flag(sk, SOCK_DEAD))
7777 + goto exit;
7779 + /* We come from tcp_disconnect. We are sure that meta_sk is set */
7780 + if (!tp->mpc) {
7781 + tp->closing = 1;
7782 + sock_rps_reset_flow(sk);
7783 + tcp_close(sk, 0);
7784 + goto exit;
7787 + if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) {
7788 + tp->closing = 1;
7789 + sock_rps_reset_flow(sk);
7790 + tcp_close(sk, 0);
7791 + } else if (tcp_close_state(sk)) {
7792 + sk->sk_shutdown |= SEND_SHUTDOWN;
7793 + tcp_send_fin(sk);
7796 +exit:
7797 + release_sock(meta_sk);
7798 + mutex_unlock(&tp->mpcb->mpcb_mutex);
7799 + sock_put(sk);
7802 +void mptcp_sub_close(struct sock *sk, unsigned long delay)
7804 + struct tcp_sock *tp = tcp_sk(sk);
7805 + struct delayed_work *work = &tcp_sk(sk)->mptcp->work;
7807 + /* We are already closing - e.g., call from sock_def_error_report upon
7808 + * tcp_disconnect in tcp_close.
7809 + */
7810 + if (tp->closing)
7811 + return;
7813 + /* Work already scheduled ? */
7814 + if (work_pending(&work->work)) {
7815 + /* Work present - who will be first ? */
7816 + if (jiffies + delay > work->timer.expires)
7817 + return;
7819 + /* Try canceling - if it fails, work will be executed soon */
7820 + if (!cancel_delayed_work(work))
7821 + return;
7822 + sock_put(sk);
7825 + if (!delay) {
7826 + unsigned char old_state = sk->sk_state;
7828 + /* If we are in user-context we can directly do the closing
7829 + * procedure. No need to schedule a work-queue.
7830 + */
7831 + if (!in_softirq()) {
7832 + if (sock_flag(sk, SOCK_DEAD))
7833 + return;
7835 + if (!tp->mpc) {
7836 + tp->closing = 1;
7837 + sock_rps_reset_flow(sk);
7838 + tcp_close(sk, 0);
7839 + return;
7842 + if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK ||
7843 + sk->sk_state == TCP_CLOSE) {
7844 + tp->closing = 1;
7845 + sock_rps_reset_flow(sk);
7846 + tcp_close(sk, 0);
7847 + } else if (tcp_close_state(sk)) {
7848 + sk->sk_shutdown |= SEND_SHUTDOWN;
7849 + tcp_send_fin(sk);
7852 + return;
7855 + /* We directly send the FIN, because it may take a long time
7856 + * until the work-queue gets scheduled...
7858 + * If mptcp_sub_send_fin returns 1, it failed and thus we reset
7859 + * the old state so that tcp_close will finally send the fin
7860 + * in user-context.
7861 + */
7862 + if (!sk->sk_err && old_state != TCP_CLOSE &&
7863 + tcp_close_state(sk) && mptcp_sub_send_fin(sk)) {
7864 + if (old_state == TCP_ESTABLISHED)
7865 + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
7866 + sk->sk_state = old_state;
7870 + sock_hold(sk);
7871 + queue_delayed_work(mptcp_wq, work, delay);
7874 +void mptcp_sub_force_close(struct sock *sk)
7876 + /* The below tcp_done may have freed the socket, if it is already dead.
7877 + * Thus, we are not allowed to access it afterwards. That's why
7878 + * we have to store the dead-state in this local variable.
7879 + */
7880 + int sock_is_dead = sock_flag(sk, SOCK_DEAD);
7882 + tcp_sk(sk)->mp_killed = 1;
7884 + if (sk->sk_state != TCP_CLOSE)
7885 + tcp_done(sk);
7887 + if (!sock_is_dead)
7888 + mptcp_sub_close(sk, 0);
7890 +EXPORT_SYMBOL(mptcp_sub_force_close);
7892 +/* Update the mpcb send buffer, based on the contributions
7893 + * of each subflow
7894 + */
7895 +void mptcp_update_sndbuf(struct mptcp_cb *mpcb)
7897 + struct sock *meta_sk = mpcb->meta_sk, *sk;
7898 + int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf;
7899 + mptcp_for_each_sk(mpcb, sk) {
7900 + if (!mptcp_sk_can_send(sk))
7901 + continue;
7903 + new_sndbuf += sk->sk_sndbuf;
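+ /* Cap the sum at the global tcp_wmem limit; the < 0 test below also
+ * guards against signed overflow while summing the subflow buffers.
+ */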
7905 + if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) {
7906 + new_sndbuf = sysctl_tcp_wmem[2];
7907 + break;
7910 + meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf);
7912 + /* The subflow's call to sk_write_space in tcp_new_space ends up in
7913 + * mptcp_write_space.
7914 + * It has nothing to do with waking up the application.
7915 + * So, we do it here.
7916 + */
7917 + if (old_sndbuf != meta_sk->sk_sndbuf)
7918 + meta_sk->sk_write_space(meta_sk);
7921 +void mptcp_close(struct sock *meta_sk, long timeout)
7923 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
7924 + struct sock *sk_it, *tmpsk;
7925 + struct mptcp_cb *mpcb = meta_tp->mpcb;
7926 + struct sk_buff *skb;
7927 + int data_was_unread = 0;
7928 + int state;
7930 + mptcp_debug("%s: Close of meta_sk with tok %#x\n",
7931 + __func__, mpcb->mptcp_loc_token);
7933 + mutex_lock(&mpcb->mpcb_mutex);
7934 + lock_sock(meta_sk);
7936 + if (meta_tp->inside_tk_table) {
7937 + /* Detach the mpcb from the token hashtable */
7938 + mptcp_hash_remove_bh(meta_tp);
7939 + reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue);
7942 + meta_sk->sk_shutdown = SHUTDOWN_MASK;
7943 + /* We need to flush the recv. buffs. We do this only on the
7944 + * descriptor close, not protocol-sourced closes, because the
7945 + * reader process may not have drained the data yet!
7946 + */
7947 + while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) {
7948 + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
7949 + tcp_hdr(skb)->fin;
7950 + data_was_unread += len;
7951 + __kfree_skb(skb);
7954 + sk_mem_reclaim(meta_sk);
7956 + /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
7957 + if (meta_sk->sk_state == TCP_CLOSE) {
7958 + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
7959 + if (tcp_sk(sk_it)->send_mp_fclose)
7960 + continue;
7961 + mptcp_sub_close(sk_it, 0);
7963 + goto adjudge_to_death;
7966 + if (data_was_unread) {
7967 + /* Unread data was tossed, zap the connection. */
7968 + NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE);
7969 + tcp_set_state(meta_sk, TCP_CLOSE);
7970 + tcp_send_active_reset(meta_sk, meta_sk->sk_allocation);
7971 + } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) {
7972 + /* Check zero linger _after_ checking for unread data. */
7973 + meta_sk->sk_prot->disconnect(meta_sk, 0);
7974 + NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
7975 + } else if (tcp_close_state(meta_sk)) {
7976 + mptcp_send_fin(meta_sk);
7977 + } else if (meta_tp->snd_una == meta_tp->write_seq) {
7978 + /* The DATA_FIN has been sent and acknowledged
7979 + * (e.g., by sk_shutdown). Close all the other subflows
7980 + */
7981 + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
7982 + unsigned long delay = 0;
7983 + /* If we are the passive closer, don't trigger
7984 + * the subflow-FIN until the peer has sent its FIN
7985 + * on the subflow - thus we add a delay
7986 + */
7987 + if (mpcb->passive_close &&
7988 + sk_it->sk_state == TCP_ESTABLISHED)
7989 + delay = inet_csk(sk_it)->icsk_rto << 3;
7991 + mptcp_sub_close(sk_it, delay);
7995 + sk_stream_wait_close(meta_sk, timeout);
7997 +adjudge_to_death:
7998 + state = meta_sk->sk_state;
7999 + sock_hold(meta_sk);
8000 + sock_orphan(meta_sk);
8002 + /* socket will be freed after mptcp_close - we have to prevent
8003 + * access from the subflows.
8004 + */
8005 + mptcp_for_each_sk(mpcb, sk_it) {
8006 + /* Similar to sock_orphan, but we don't set it DEAD, because
8007 + * the callbacks are still set and must be called.
8008 + */
8009 + write_lock_bh(&sk_it->sk_callback_lock);
8010 + sk_set_socket(sk_it, NULL);
8011 + sk_it->sk_wq = NULL;
8012 + write_unlock_bh(&sk_it->sk_callback_lock);
8015 + /* It is the last release_sock in its life. It will remove backlog. */
8016 + release_sock(meta_sk);
8018 + /* Now socket is owned by kernel and we acquire BH lock
8019 + * to finish close. No need to check for user refs.
8020 + */
8021 + local_bh_disable();
8022 + bh_lock_sock(meta_sk);
8023 + WARN_ON(sock_owned_by_user(meta_sk));
8025 + percpu_counter_inc(meta_sk->sk_prot->orphan_count);
8027 + /* Have we already been destroyed by a softirq or backlog? */
8028 + if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE)
8029 + goto out;
8031 + /* This is a (useful) BSD violating of the RFC. There is a
8032 + * problem with TCP as specified in that the other end could
8033 + * keep a socket open forever with no application left this end.
8034 + * We use a 3 minute timeout (about the same as BSD) then kill
8035 + * our end. If they send after that then tough - BUT: long enough
8036 + * that we won't make the old 4*rto = almost no time - whoops
8037 + * reset mistake.
8039 + * Nope, it was not mistake. It is really desired behaviour
8040 + * f.e. on http servers, when such sockets are useless, but
8041 + * consume significant resources. Let's do it with special
8042 + * linger2 option. --ANK
8043 + */
8045 + if (meta_sk->sk_state == TCP_FIN_WAIT2) {
8046 + if (meta_tp->linger2 < 0) {
8047 + tcp_set_state(meta_sk, TCP_CLOSE);
8048 + tcp_send_active_reset(meta_sk, GFP_ATOMIC);
8049 + NET_INC_STATS_BH(sock_net(meta_sk),
8050 + LINUX_MIB_TCPABORTONLINGER);
8051 + } else {
8052 + const int tmo = tcp_fin_time(meta_sk);
8054 + if (tmo > TCP_TIMEWAIT_LEN) {
8055 + inet_csk_reset_keepalive_timer(meta_sk,
8056 + tmo - TCP_TIMEWAIT_LEN);
8057 + } else {
8058 + tcp_time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
8059 + goto out;
8063 + if (meta_sk->sk_state != TCP_CLOSE) {
8064 + sk_mem_reclaim(meta_sk);
8065 + if (tcp_too_many_orphans(meta_sk, 0)) {
8066 + if (net_ratelimit())
8067 + pr_info("MPTCP: too many of orphaned sockets\n");
8068 + tcp_set_state(meta_sk, TCP_CLOSE);
8069 + tcp_send_active_reset(meta_sk, GFP_ATOMIC);
8070 + NET_INC_STATS_BH(sock_net(meta_sk),
8071 + LINUX_MIB_TCPABORTONMEMORY);
8076 + if (meta_sk->sk_state == TCP_CLOSE)
8077 + inet_csk_destroy_sock(meta_sk);
8078 + /* Otherwise, socket is reprieved until protocol close. */
8080 +out:
8081 + bh_unlock_sock(meta_sk);
8082 + local_bh_enable();
8083 + mutex_unlock(&mpcb->mpcb_mutex);
8084 + sock_put(meta_sk); /* Taken by sock_hold */
8087 +void mptcp_disconnect(struct sock *sk)
8089 + struct sock *subsk, *tmpsk;
8090 + struct tcp_sock *tp = tcp_sk(sk);
8092 + __skb_queue_purge(&tp->mpcb->reinject_queue);
8094 + if (tp->inside_tk_table) {
8095 + mptcp_hash_remove_bh(tp);
8096 + reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue);
8099 + local_bh_disable();
8100 + mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) {
8101 + /* The socket will get removed from the subsocket-list
8102 + * and made non-mptcp by setting mpc to 0.
8104 + * This is necessary, because tcp_disconnect assumes
8105 + * that the connection is completely dead afterwards.
8106 + * Thus we need to do a mptcp_del_sock. Due to this call
8107 + * we have to make it non-mptcp.
8109 + * We have to lock the socket, because we set mpc to 0.
8110 + * An incoming packet would take the subsocket's lock
8111 + * and go on into the receive-path.
8112 + * This would be a race.
8113 + */
8115 + bh_lock_sock(subsk);
8116 + mptcp_del_sock(subsk);
8117 + reset_mpc(tcp_sk(subsk));
8118 + mptcp_sub_force_close(subsk);
8119 + bh_unlock_sock(subsk);
8121 + local_bh_enable();
8123 + tp->was_meta_sk = 1;
8124 + reset_mpc(tp);
8128 +/* Returns 1 if we should enable MPTCP for that socket. */
8129 +int mptcp_doit(struct sock *sk)
8131 + /* Do not allow MPTCP enabling if the MPTCP initialization failed */
8132 + if (mptcp_init_failed)
8133 + return 0;
8135 + if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
8136 + return 0;
8138 + /* Socket may already be established (e.g., called from tcp_recvmsg) */
8139 + if (tcp_sk(sk)->mpc || tcp_sk(sk)->request_mptcp)
8140 + return 1;
8142 + /* Don't do mptcp over loopback */
8143 + if (sk->sk_family == AF_INET &&
8144 + (ipv4_is_loopback(inet_sk(sk)->inet_daddr) ||
8145 + ipv4_is_loopback(inet_sk(sk)->inet_saddr)))
8146 + return 0;
8147 +#if IS_ENABLED(CONFIG_IPV6)
8148 + if (sk->sk_family == AF_INET6 &&
8149 + (ipv6_addr_loopback(&sk->sk_v6_daddr) ||
8150 + ipv6_addr_loopback(&inet6_sk(sk)->saddr)))
8151 + return 0;
8152 +#endif
8153 + if (mptcp_v6_is_v4_mapped(sk) &&
8154 + ipv4_is_loopback(inet_sk(sk)->inet_saddr))
8155 + return 0;
8157 +#ifdef CONFIG_TCP_MD5SIG
8158 + /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */
8159 + if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk))
8160 + return 0;
8161 +#endif
8163 + return 1;
8166 +int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window)
8168 + struct tcp_sock *master_tp;
8169 + struct sock *master_sk;
8171 + if (mptcp_alloc_mpcb(meta_sk, remote_key, window))
8172 + goto err_alloc_mpcb;
8174 + master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
8175 + master_tp = tcp_sk(master_sk);
8177 + if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC))
8178 + goto err_add_sock;
8180 + if (__inet_inherit_port(meta_sk, master_sk) < 0)
8181 + goto err_add_sock;
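+ /* The meta-sk leaves the established hash and the master subflow is
+ * hashed in its place, so incoming segments for this connection are
+ * looked up on the subflow.
+ */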
8183 + meta_sk->sk_prot->unhash(meta_sk);
8185 + if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk))
8186 + __inet_hash_nolisten(master_sk, NULL);
8187 +#if IS_ENABLED(CONFIG_IPV6)
8188 + else
8189 + __inet6_hash(master_sk, NULL);
8190 +#endif
8192 + master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd;
8194 + return 0;
8196 +err_add_sock:
8197 + mptcp_fallback_meta_sk(meta_sk);
8199 + inet_csk_prepare_forced_close(master_sk);
8200 + tcp_done(master_sk);
8201 + inet_csk_prepare_forced_close(meta_sk);
8202 + tcp_done(meta_sk);
8204 +err_alloc_mpcb:
8205 + return -ENOBUFS;
8208 +int mptcp_check_req_master(struct sock *sk, struct sock *child,
8209 + struct request_sock *req,
8210 + struct request_sock **prev,
8211 + struct mptcp_options_received *mopt)
8213 + struct tcp_sock *child_tp = tcp_sk(child);
8214 + struct sock *meta_sk = child;
8215 + struct mptcp_cb *mpcb;
8216 + struct mptcp_request_sock *mtreq;
8218 + if (!tcp_rsk(req)->saw_mpc)
8219 + return 1;
8221 + /* Just set these values to pass them to mptcp_alloc_mpcb */
8222 + mtreq = mptcp_rsk(req);
8223 + child_tp->mptcp_loc_key = mtreq->mptcp_loc_key;
8224 + child_tp->mptcp_loc_token = mtreq->mptcp_loc_token;
8226 + if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key,
8227 + child_tp->snd_wnd))
8228 + return -ENOBUFS;
8230 + child = tcp_sk(child)->mpcb->master_sk;
8231 + child_tp = tcp_sk(child);
8232 + mpcb = child_tp->mpcb;
8234 + child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
8235 + child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
8237 + mpcb->dss_csum = mtreq->dss_csum;
8238 + mpcb->server_side = 1;
8240 + /* Will be moved to ESTABLISHED by tcp_rcv_state_process() */
8241 + mptcp_update_metasocket(child, meta_sk);
8243 + /* Needs to be done here additionally, because when accepting a
8244 + * new connection we pass by __reqsk_free and not reqsk_free.
8245 + */
8246 + mptcp_reqsk_remove_tk(req);
8248 + /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */
8249 + sock_put(meta_sk);
8251 + inet_csk_reqsk_queue_unlink(sk, req, prev);
8252 + inet_csk_reqsk_queue_removed(sk, req);
8253 + inet_csk_reqsk_queue_add(sk, req, meta_sk);
8255 + return 0;
8258 +struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child,
8259 + struct request_sock *req,
8260 + struct request_sock **prev,
8261 + struct mptcp_options_received *mopt)
8263 + struct tcp_sock *child_tp = tcp_sk(child);
8264 + struct mptcp_request_sock *mtreq = mptcp_rsk(req);
8265 + struct mptcp_cb *mpcb = mtreq->mpcb;
8266 + u8 hash_mac_check[20];
8268 + child_tp->inside_tk_table = 0;
8270 + if (!mopt->join_ack)
8271 + goto teardown;
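+ /* Recompute the peer's MP_JOIN HMAC (keyed with the remote and local
+ * keys, over the remote and local nonces); the subflow is torn down
+ * below if it does not match the MAC received in the third ACK.
+ */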
8273 + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
8274 + (u8 *)&mpcb->mptcp_loc_key,
8275 + (u8 *)&mtreq->mptcp_rem_nonce,
8276 + (u8 *)&mtreq->mptcp_loc_nonce,
8277 + (u32 *)hash_mac_check);
8279 + if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20))
8280 + goto teardown;
8282 + /* Point it to the same struct socket and wq as the meta_sk */
8283 + sk_set_socket(child, meta_sk->sk_socket);
8284 + child->sk_wq = meta_sk->sk_wq;
8286 + if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) {
8287 + reset_mpc(child_tp); /* Has been inherited, but now
8288 + * child_tp->mptcp is NULL
8289 + */
8290 + /* TODO when we support acking the third ack for new subflows,
8291 + * we should silently discard this third ack, by returning NULL.
8293 + * Maybe, at the retransmission we will have enough memory to
8294 + * fully add the socket to the meta-sk.
8295 + */
8296 + goto teardown;
8299 + /* The child is a clone of the meta socket, we must now reset
8300 + * some of the fields
8301 + */
8302 + child_tp->mptcp->rcv_low_prio = mtreq->low_prio;
8304 + /* We should allow proper increase of the snd/rcv-buffers. Thus, we
8305 + * use the original values instead of the bloated up ones from the
8306 + * clone.
8307 + */
8308 + child->sk_sndbuf = mpcb->orig_sk_sndbuf;
8309 + child->sk_rcvbuf = mpcb->orig_sk_rcvbuf;
8311 + child_tp->mptcp->slave_sk = 1;
8312 + child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
8313 + child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
8314 + child_tp->mptcp->init_rcv_wnd = req->rcv_wnd;
8316 + child_tp->tsq_flags = 0;
8318 + /* Subflows do not use the accept queue, as they
8319 + * are attached immediately to the mpcb.
8320 + */
8321 + inet_csk_reqsk_queue_drop(meta_sk, req, prev);
8322 + return child;
8324 +teardown:
8325 + /* Drop this request - sock creation failed. */
8326 + inet_csk_reqsk_queue_drop(meta_sk, req, prev);
8327 + inet_csk_prepare_forced_close(child);
8328 + tcp_done(child);
8329 + return meta_sk;
8332 +int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw)
8334 + struct mptcp_tw *mptw;
8335 + struct tcp_sock *tp = tcp_sk(sk);
8336 + struct mptcp_cb *mpcb = tp->mpcb;
8338 + /* Alloc MPTCP-tw-sock */
8339 + mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC);
8340 + if (!mptw)
8341 + return -ENOBUFS;
8343 + atomic_inc(&mpcb->mpcb_refcnt);
8345 + tw->mptcp_tw = mptw;
8346 + mptw->loc_key = mpcb->mptcp_loc_key;
8347 + mptw->meta_tw = mpcb->in_time_wait;
8348 + if (mptw->meta_tw) {
8349 + mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
8350 + if (mpcb->mptw_state != TCP_TIME_WAIT)
8351 + mptw->rcv_nxt++;
8353 + rcu_assign_pointer(mptw->mpcb, mpcb);
8355 + spin_lock(&mpcb->tw_lock);
8356 + list_add_rcu(&mptw->list, &tp->mpcb->tw_list);
8357 + mptw->in_list = 1;
8358 + spin_unlock(&mpcb->tw_lock);
8360 + return 0;
8363 +void mptcp_twsk_destructor(struct tcp_timewait_sock *tw)
8365 + struct mptcp_cb *mpcb;
8367 + rcu_read_lock();
8368 + mpcb = rcu_dereference(tw->mptcp_tw->mpcb);
8370 + /* If we are still holding a ref to the mpcb, we have to remove ourselves
8371 + * from the list and drop the ref properly.
8372 + */
8373 + if (mpcb && atomic_inc_not_zero(&mpcb->mpcb_refcnt)) {
8374 + spin_lock(&mpcb->tw_lock);
8375 + if (tw->mptcp_tw->in_list) {
8376 + list_del_rcu(&tw->mptcp_tw->list);
8377 + tw->mptcp_tw->in_list = 0;
8379 + spin_unlock(&mpcb->tw_lock);
8381 + /* Twice, because we increased it above */
8382 + mptcp_mpcb_put(mpcb);
8383 + mptcp_mpcb_put(mpcb);
8386 + rcu_read_unlock();
8388 + kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw);
8391 +/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a
8392 + * data-fin.
8393 + */
8394 +void mptcp_update_tw_socks(const struct tcp_sock *tp, int state)
8396 + struct mptcp_tw *mptw;
8398 + /* Used for sockets that go into tw after the meta
8399 + * (see mptcp_time_wait())
8400 + */
8401 + tp->mpcb->in_time_wait = 1;
8402 + tp->mpcb->mptw_state = state;
8404 + /* Update the time-wait-sock's information */
8405 + rcu_read_lock_bh();
8406 + list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) {
8407 + mptw->meta_tw = 1;
8408 + mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp);
8410 + /* We want to ack a DATA_FIN, but are still in FIN_WAIT_2 -
8411 + * pretend as if the DATA_FIN has already reached us, so that
8412 + * the checks in tcp_timewait_state_process will pass when the
8413 + * DATA_FIN comes in.
8414 + */
8415 + if (state != TCP_TIME_WAIT)
8416 + mptw->rcv_nxt++;
8418 + rcu_read_unlock_bh();
8421 +void mptcp_tsq_flags(struct sock *sk)
8423 + struct tcp_sock *tp = tcp_sk(sk);
8424 + struct sock *meta_sk = mptcp_meta_sk(sk);
8426 + /* It will be handled as a regular deferred-call */
8427 + if (is_meta_sk(sk))
8428 + return;
8430 + if (list_empty(&tp->mptcp->cb_list)) {
8431 + list_add(&tp->mptcp->cb_list, &tp->mpcb->callback_list);
8432 + /* We need to hold it here, as the sock_hold is not assured
8433 + * by the release_sock as it is done in regular TCP.
8435 + * The subsocket may get inet_csk_destroy'd while it is inside
8436 + * the callback_list.
8437 + */
8438 + sock_hold(sk);
8441 + if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &tcp_sk(meta_sk)->tsq_flags))
8442 + sock_hold(meta_sk);
8445 +void mptcp_tsq_sub_deferred(struct sock *meta_sk)
8447 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
8448 + struct mptcp_tcp_sock *mptcp, *tmp;
8450 + BUG_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk);
8452 + __sock_put(meta_sk);
8453 + list_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) {
8454 + struct tcp_sock *tp = mptcp->tp;
8455 + struct sock *sk = (struct sock *)tp;
8457 + list_del_init(&mptcp->cb_list);
8458 + sk->sk_prot->release_cb(sk);
8459 + /* Final sock_put (cf. mptcp_tsq_flags()) */
8460 + sock_put(sk);
8464 +struct workqueue_struct *mptcp_wq;
8465 +EXPORT_SYMBOL(mptcp_wq);
8467 +/* Output /proc/net/mptcp */
8468 +static int mptcp_pm_seq_show(struct seq_file *seq, void *v)
8470 + struct tcp_sock *meta_tp;
8471 + struct net *net = seq->private;
8472 + int i, n = 0;
8474 + seq_printf(seq, " sl loc_tok rem_tok v6 "
8475 + "local_address "
8476 + "remote_address "
8477 + "st ns tx_queue rx_queue inode");
8478 + seq_putc(seq, '\n');
8480 + for (i = 0; i < MPTCP_HASH_SIZE; i++) {
8481 + struct hlist_nulls_node *node;
8482 + rcu_read_lock_bh();
8483 + hlist_nulls_for_each_entry_rcu(meta_tp, node,
8484 + &tk_hashtable[i], tk_table) {
8485 + struct mptcp_cb *mpcb = meta_tp->mpcb;
8486 + struct sock *meta_sk = (struct sock *)meta_tp;
8487 + struct inet_sock *isk = inet_sk(meta_sk);
8489 + if (!meta_tp->mpc || !net_eq(net, sock_net(meta_sk)))
8490 + continue;
8492 + seq_printf(seq, "%4d: %04X %04X ", n++,
8493 + mpcb->mptcp_loc_token,
8494 + mpcb->mptcp_rem_token);
8495 + if (meta_sk->sk_family == AF_INET ||
8496 + mptcp_v6_is_v4_mapped(meta_sk)) {
8497 + seq_printf(seq, " 0 %08X:%04X %08X:%04X ",
8498 + isk->inet_saddr,
8499 + ntohs(isk->inet_sport),
8500 + isk->inet_daddr,
8501 + ntohs(isk->inet_dport));
8502 +#if IS_ENABLED(CONFIG_IPV6)
8503 + } else if (meta_sk->sk_family == AF_INET6) {
8504 + struct in6_addr *src = &isk->pinet6->saddr;
8505 + struct in6_addr *dst = &meta_sk->sk_v6_daddr;
8506 + seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X",
8507 + src->s6_addr32[0], src->s6_addr32[1],
8508 + src->s6_addr32[2], src->s6_addr32[3],
8509 + ntohs(isk->inet_sport),
8510 + dst->s6_addr32[0], dst->s6_addr32[1],
8511 + dst->s6_addr32[2], dst->s6_addr32[3],
8512 + ntohs(isk->inet_dport));
8513 +#endif
8515 + seq_printf(seq, " %02X %02X %08X:%08X %lu",
8516 + meta_sk->sk_state, mpcb->cnt_subflows,
8517 + meta_tp->write_seq - meta_tp->snd_una,
8518 + max_t(int, meta_tp->rcv_nxt -
8519 + meta_tp->copied_seq, 0),
8520 + sock_i_ino(meta_sk));
8521 + seq_putc(seq, '\n');
8523 + rcu_read_unlock_bh();
8526 + return 0;
8529 +static int mptcp_pm_seq_open(struct inode *inode, struct file *file)
8531 + return single_open_net(inode, file, mptcp_pm_seq_show);
8534 +static const struct file_operations mptcp_pm_seq_fops = {
8535 + .owner = THIS_MODULE,
8536 + .open = mptcp_pm_seq_open,
8537 + .read = seq_read,
8538 + .llseek = seq_lseek,
8539 + .release = single_release_net,
8542 +static int mptcp_pm_init_net(struct net *net)
8544 + if (!proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops))
8545 + return -ENOMEM;
8547 + return 0;
8550 +static void mptcp_pm_exit_net(struct net *net)
8552 + remove_proc_entry("mptcp", net->proc_net);
8555 +static struct pernet_operations mptcp_pm_proc_ops = {
8556 + .init = mptcp_pm_init_net,
8557 + .exit = mptcp_pm_exit_net,
8560 +/* General initialization of mptcp */
8561 +void __init mptcp_init(void)
8563 + int i;
8564 + struct ctl_table_header *mptcp_sysctl;
8566 + mptcp_sock_cache = kmem_cache_create("mptcp_sock",
8567 + sizeof(struct mptcp_tcp_sock),
8568 + 0, SLAB_HWCACHE_ALIGN,
8569 + NULL);
8570 + if (!mptcp_sock_cache)
8571 + goto mptcp_sock_cache_failed;
8573 + mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb),
8574 + 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
8575 + NULL);
8576 + if (!mptcp_cb_cache)
8577 + goto mptcp_cb_cache_failed;
8579 + mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw),
8580 + 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
8581 + NULL);
8582 + if (!mptcp_tw_cache)
8583 + goto mptcp_tw_cache_failed;
8585 + get_random_bytes(mptcp_secret, sizeof(mptcp_secret));
8587 + mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
8588 + if (!mptcp_wq)
8589 + goto alloc_workqueue_failed;
8591 + for (i = 0; i < MPTCP_HASH_SIZE; i++) {
8592 + INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i);
8593 + INIT_LIST_HEAD(&mptcp_reqsk_htb[i]);
8594 + INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i);
8597 + spin_lock_init(&mptcp_reqsk_hlock);
8598 + spin_lock_init(&mptcp_tk_hashlock);
8600 + if (register_pernet_subsys(&mptcp_pm_proc_ops))
8601 + goto pernet_failed;
8603 +#if IS_ENABLED(CONFIG_IPV6)
8604 + if (mptcp_pm_v6_init())
8605 + goto mptcp_pm_v6_failed;
8606 +#endif
8607 + if (mptcp_pm_v4_init())
8608 + goto mptcp_pm_v4_failed;
8610 + mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table);
8611 + if (!mptcp_sysctl)
8612 + goto register_sysctl_failed;
8614 + if (mptcp_register_path_manager(&mptcp_pm_default))
8615 + goto register_pm_failed;
8617 + pr_info("MPTCP: Stable release v0.89.0-rc");
8619 + mptcp_init_failed = false;
8621 + return;
8623 +register_pm_failed:
8624 + unregister_net_sysctl_table(mptcp_sysctl);
8625 +register_sysctl_failed:
8626 + mptcp_pm_v4_undo();
8627 +mptcp_pm_v4_failed:
8628 +#if IS_ENABLED(CONFIG_IPV6)
8629 + mptcp_pm_v6_undo();
8630 +mptcp_pm_v6_failed:
8631 +#endif
8632 + unregister_pernet_subsys(&mptcp_pm_proc_ops);
8633 +pernet_failed:
8634 + destroy_workqueue(mptcp_wq);
8635 +alloc_workqueue_failed:
8636 + kmem_cache_destroy(mptcp_tw_cache);
8637 +mptcp_tw_cache_failed:
8638 + kmem_cache_destroy(mptcp_cb_cache);
8639 +mptcp_cb_cache_failed:
8640 + kmem_cache_destroy(mptcp_sock_cache);
8641 +mptcp_sock_cache_failed:
8642 + mptcp_init_failed = true;
8644 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_fullmesh.c linux-3.14.45/net/mptcp/mptcp_fullmesh.c
8645 --- linux-3.14.45.orig/net/mptcp/mptcp_fullmesh.c 1970-01-01 01:00:00.000000000 +0100
8646 +++ linux-3.14.45/net/mptcp/mptcp_fullmesh.c 2015-06-24 14:15:48.891862483 +0200
8647 @@ -0,0 +1,1313 @@
8648 +#include <linux/module.h>
8650 +#include <net/mptcp.h>
8651 +#include <net/mptcp_v4.h>
8653 +#if IS_ENABLED(CONFIG_IPV6)
8654 +#include <net/mptcp_v6.h>
8655 +#include <net/addrconf.h>
8656 +#endif
8658 +enum {
8659 + MPTCP_EVENT_ADD = 1,
8660 + MPTCP_EVENT_DEL,
8661 + MPTCP_EVENT_MOD,
8664 +struct mptcp_loc_addr {
8665 + struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR];
8666 + u8 loc4_bits;
8667 + u8 next_v4_index;
8669 + struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR];
8670 + u8 loc6_bits;
8671 + u8 next_v6_index;
8674 +struct mptcp_addr_event {
8675 + struct list_head list;
8676 + unsigned short family;
8677 + u8 code:7,
8678 + low_prio:1;
8679 + union inet_addr addr;
8682 +struct fullmesh_priv {
8683 + /* Worker struct for subflow establishment */
8684 + struct work_struct subflow_work;
8685 + /* Delayed worker, when the routing-tables are not yet ready. */
8686 + struct delayed_work subflow_retry_work;
8688 + struct mptcp_cb *mpcb;
8690 + u16 remove_addrs; /* Addresses to remove */
8691 + u8 announced_addrs_v4; /* IPv4 Addresses we did announce */
8692 + u8 announced_addrs_v6; /* IPv6 Addresses we did announce */
8694 + u8 add_addr; /* Are we sending an add_addr? */
8697 +struct mptcp_fm_ns {
8698 + struct mptcp_loc_addr __rcu *local;
8699 + spinlock_t local_lock; /* Protecting the above pointer */
8700 + struct list_head events;
8701 + struct delayed_work address_worker;
8703 + struct net *net;
8706 +static struct mptcp_pm_ops full_mesh __read_mostly;
8708 +static struct mptcp_fm_ns *fm_get_ns(struct net *net)
8710 + return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH];
8713 +static void full_mesh_create_subflows(struct sock *meta_sk);
8715 +static void retry_subflow_worker(struct work_struct *work)
8717 + struct delayed_work *delayed_work = container_of(work,
8718 + struct delayed_work,
8719 + work);
8720 + struct fullmesh_priv *pm_priv = container_of(delayed_work,
8721 + struct fullmesh_priv,
8722 + subflow_retry_work);
8723 + struct mptcp_cb *mpcb = pm_priv->mpcb;
8724 + struct sock *meta_sk = mpcb->meta_sk;
8725 + struct mptcp_loc_addr *mptcp_local;
8726 + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
8727 + int iter = 0, i;
8729 + /* We need a local (stable) copy of the address-list. Really, it is not
8730 + * such a big deal, if the address-list is not 100% up-to-date.
8731 + */
8732 + rcu_read_lock_bh();
8733 + mptcp_local = rcu_dereference_bh(fm_ns->local);
8734 + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
8735 + rcu_read_unlock_bh();
8737 + if (!mptcp_local)
8738 + return;
8740 +next_subflow:
8741 + if (iter) {
8742 + release_sock(meta_sk);
8743 + mutex_unlock(&mpcb->mpcb_mutex);
8745 + yield();
8747 + mutex_lock(&mpcb->mpcb_mutex);
8748 + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
8750 + iter++;
8752 + if (sock_flag(meta_sk, SOCK_DEAD))
8753 + goto exit;
8755 + mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
8756 + struct mptcp_rem4 *rem = &mpcb->remaddr4[i];
8757 + /* Do we need to retry establishing a subflow ? */
8758 + if (rem->retry_bitfield) {
8759 + int i = mptcp_find_free_index(~rem->retry_bitfield);
8761 + rem->bitfield |= (1 << i);
8762 + rem->retry_bitfield &= ~(1 << i);
8764 + mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], rem);
8765 + goto next_subflow;
8769 +#if IS_ENABLED(CONFIG_IPV6)
8770 + mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
8771 + struct mptcp_rem6 *rem = &mpcb->remaddr6[i];
8773 + /* Do we need to retry establishing a subflow ? */
8774 + if (rem->retry_bitfield) {
8775 + int i = mptcp_find_free_index(~rem->retry_bitfield);
8777 + rem->bitfield |= (1 << i);
8778 + rem->retry_bitfield &= ~(1 << i);
8780 + mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], rem);
8781 + goto next_subflow;
8784 +#endif
8786 +exit:
8787 + kfree(mptcp_local);
8788 + release_sock(meta_sk);
8789 + mutex_unlock(&mpcb->mpcb_mutex);
8790 + sock_put(meta_sk);
8793 +/**
8794 + * Create all new subflows, by doing calls to mptcp_initX_subsockets
8796 + * This function uses a goto next_subflow, to allow releasing the lock between
8797 + * new subflows and giving other processes a chance to do some work on the
8798 + * socket and potentially finishing the communication.
8799 + **/
8800 +static void create_subflow_worker(struct work_struct *work)
8802 + struct fullmesh_priv *pm_priv = container_of(work,
8803 + struct fullmesh_priv,
8804 + subflow_work);
8805 + struct mptcp_cb *mpcb = pm_priv->mpcb;
8806 + struct sock *meta_sk = mpcb->meta_sk;
8807 + struct mptcp_loc_addr *mptcp_local;
8808 + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
8809 + int iter = 0, retry = 0;
8810 + int i;
8812 + /* We need a local (stable) copy of the address-list. Really, it is not
8813 + * such a big deal, if the address-list is not 100% up-to-date.
8814 + */
8815 + rcu_read_lock_bh();
8816 + mptcp_local = rcu_dereference_bh(fm_ns->local);
8817 + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
8818 + rcu_read_unlock_bh();
8820 + if (!mptcp_local)
8821 + return;
8823 +next_subflow:
8824 + if (iter) {
8825 + release_sock(meta_sk);
8826 + mutex_unlock(&mpcb->mpcb_mutex);
8828 + yield();
8830 + mutex_lock(&mpcb->mpcb_mutex);
8831 + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
8833 + iter++;
8835 + if (sock_flag(meta_sk, SOCK_DEAD))
8836 + goto exit;
8838 + if (mpcb->master_sk &&
8839 + !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
8840 + goto exit;
8842 + mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
8843 + struct mptcp_rem4 *rem;
8844 + u8 remaining_bits;
8846 + rem = &mpcb->remaddr4[i];
8847 + remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits;
8849 + /* Are there still combinations to handle? */
8850 + if (remaining_bits) {
8851 + int i = mptcp_find_free_index(~remaining_bits);
8853 + rem->bitfield |= (1 << i);
8855 + /* If a route is not yet available then retry once */
8856 + if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i],
8857 + rem) == -ENETUNREACH)
8858 + retry = rem->retry_bitfield |= (1 << i);
8859 + goto next_subflow;
8863 +#if IS_ENABLED(CONFIG_IPV6)
8864 + mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
8865 + struct mptcp_rem6 *rem;
8866 + u8 remaining_bits;
8868 + rem = &mpcb->remaddr6[i];
8869 + remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits;
8871 + /* Are there still combinations to handle? */
8872 + if (remaining_bits) {
8873 + int i = mptcp_find_free_index(~remaining_bits);
8875 + rem->bitfield |= (1 << i);
8877 + /* If a route is not yet available then retry once */
8878 + if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i],
8879 + rem) == -ENETUNREACH)
8880 + retry = rem->retry_bitfield |= (1 << i);
8881 + goto next_subflow;
8884 +#endif
8886 + if (retry && !delayed_work_pending(&pm_priv->subflow_retry_work)) {
8887 + sock_hold(meta_sk);
8888 + queue_delayed_work(mptcp_wq, &pm_priv->subflow_retry_work,
8889 + msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY));
8892 +exit:
8893 + kfree(mptcp_local);
8894 + release_sock(meta_sk);
8895 + mutex_unlock(&mpcb->mpcb_mutex);
8896 + sock_put(meta_sk);
8899 +static void update_remove_addrs(u8 addr_id, struct sock *meta_sk,
8900 + struct mptcp_loc_addr *mptcp_local)
8902 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
8903 + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
8904 + struct sock *sk;
8905 + int i;
8907 + fmp->remove_addrs |= (1 << addr_id);
8908 + /* v4 goes from 0 to MPTCP_MAX_ADDR, v6 beyond */
8909 + if (addr_id < MPTCP_MAX_ADDR) {
8910 + fmp->announced_addrs_v4 &= ~(1 << addr_id);
8912 + mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
8913 + mpcb->remaddr4[i].bitfield &= mptcp_local->loc4_bits;
8914 + mpcb->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits;
8916 + } else {
8917 + fmp->announced_addrs_v6 &= ~(1 << (addr_id - MPTCP_MAX_ADDR));
8919 + mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
8920 + mpcb->remaddr6[i].bitfield &= mptcp_local->loc6_bits;
8921 + mpcb->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits;
8925 + sk = mptcp_select_ack_sock(meta_sk, 0);
8926 + if (sk)
8927 + tcp_send_ack(sk);
8930 +static int mptcp_find_address(struct mptcp_loc_addr *mptcp_local,
8931 + sa_family_t family, union inet_addr *addr)
8933 + int i;
8934 + u8 loc_bits;
8935 + bool found = false;
8937 + if (family == AF_INET)
8938 + loc_bits = mptcp_local->loc4_bits;
8939 + else
8940 + loc_bits = mptcp_local->loc6_bits;
8942 + mptcp_for_each_bit_set(loc_bits, i) {
8943 + if (family == AF_INET &&
8944 + mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) {
8945 + found = true;
8946 + break;
8948 + if (family == AF_INET6 &&
8949 + ipv6_addr_equal(&mptcp_local->locaddr6[i].addr,
8950 + &addr->in6)) {
8951 + found = true;
8952 + break;
8956 + if (!found)
8957 + return -1;
8959 + return i;
8962 +static void mptcp_address_worker(struct work_struct *work)
8964 + struct delayed_work *delayed_work = container_of(work,
8965 + struct delayed_work,
8966 + work);
8967 + struct mptcp_fm_ns *fm_ns = container_of(delayed_work,
8968 + struct mptcp_fm_ns,
8969 + address_worker);
8970 + struct net *net = fm_ns->net;
8971 + struct mptcp_addr_event *event = NULL;
8972 + struct mptcp_loc_addr *mptcp_local, *old;
8973 + int i, id = -1; /* id is used in the socket-code on a delete-event */
8974 + bool success; /* Used to indicate if we succeeded handling the event */
8976 +next_event:
8977 + success = false;
8978 + kfree(event);
8980 + /* First, let's dequeue an event from our event-list */
8981 + rcu_read_lock_bh();
8982 + spin_lock(&fm_ns->local_lock);
8984 + event = list_first_entry_or_null(&fm_ns->events,
8985 + struct mptcp_addr_event, list);
8986 + if (!event) {
8987 + spin_unlock(&fm_ns->local_lock);
8988 + rcu_read_unlock_bh();
8989 + return;
8992 + list_del(&event->list);
8994 + mptcp_local = rcu_dereference_bh(fm_ns->local);
8996 + if (event->code == MPTCP_EVENT_DEL) {
8997 + id = mptcp_find_address(mptcp_local, event->family, &event->addr);
8999 + /* Not in the list - so we don't care */
9000 + if (id < 0)
9001 + goto duno;
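+ /* Work on a private copy of the address-list and publish it with
+ * rcu_assign_pointer(), so readers never see a half-updated list.
+ */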
9003 + old = mptcp_local;
9004 + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
9005 + GFP_ATOMIC);
9006 + if (!mptcp_local)
9007 + goto duno;
9009 + if (event->family == AF_INET)
9010 + mptcp_local->loc4_bits &= ~(1 << id);
9011 + else
9012 + mptcp_local->loc6_bits &= ~(1 << id);
9014 + rcu_assign_pointer(fm_ns->local, mptcp_local);
9015 + kfree(old);
9016 + } else {
9017 + int i = mptcp_find_address(mptcp_local, event->family, &event->addr);
9018 + int j = i;
9020 + if (j < 0) {
9021 + /* Not in the list, so we have to find an empty slot */
9022 + if (event->family == AF_INET)
9023 + i = __mptcp_find_free_index(mptcp_local->loc4_bits, -1,
9024 + mptcp_local->next_v4_index);
9025 + if (event->family == AF_INET6)
9026 + i = __mptcp_find_free_index(mptcp_local->loc6_bits, -1,
9027 + mptcp_local->next_v6_index);
9029 + if (i < 0) {
9030 + mptcp_debug("%s no more space\n", __func__);
9031 + goto duno;
9034 + /* It might have been a MOD-event. */
9035 + event->code = MPTCP_EVENT_ADD;
9036 + } else {
9037 + /* Let's check if anything changes */
9038 + if (event->family == AF_INET &&
9039 + event->low_prio == mptcp_local->locaddr4[i].low_prio)
9040 + goto duno;
9042 + if (event->family == AF_INET6 &&
9043 + event->low_prio == mptcp_local->locaddr6[i].low_prio)
9044 + goto duno;
9047 + old = mptcp_local;
9048 + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
9049 + GFP_ATOMIC);
9050 + if (!mptcp_local)
9051 + goto duno;
9053 + if (event->family == AF_INET) {
9054 + mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr;
9055 + mptcp_local->locaddr4[i].loc4_id = i + 1;
9056 + mptcp_local->locaddr4[i].low_prio = event->low_prio;
9057 + } else {
9058 + mptcp_local->locaddr6[i].addr = event->addr.in6;
9059 + mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR;
9060 + mptcp_local->locaddr6[i].low_prio = event->low_prio;
9063 + if (j < 0) {
9064 + if (event->family == AF_INET) {
9065 + mptcp_local->loc4_bits |= (1 << i);
9066 + mptcp_local->next_v4_index = i + 1;
9067 + } else {
9068 + mptcp_local->loc6_bits |= (1 << i);
9069 + mptcp_local->next_v6_index = i + 1;
9073 + rcu_assign_pointer(fm_ns->local, mptcp_local);
9074 + kfree(old);
9076 + success = true;
9078 +duno:
9079 + spin_unlock(&fm_ns->local_lock);
9080 + rcu_read_unlock_bh();
9082 + if (!success)
9083 + goto next_event;
9085 + /* Now we iterate over the MPTCP-sockets and apply the event. */
9086 + for (i = 0; i < MPTCP_HASH_SIZE; i++) {
9087 + struct hlist_nulls_node *node;
9088 + struct tcp_sock *meta_tp;
9090 + rcu_read_lock_bh();
9091 + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i],
9092 + tk_table) {
9093 + struct mptcp_cb *mpcb = meta_tp->mpcb;
9094 + struct sock *meta_sk = (struct sock *)meta_tp, *sk;
9095 + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
9097 + if (sock_net(meta_sk) != net)
9098 + continue;
9100 + if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
9101 + continue;
9103 + bh_lock_sock(meta_sk);
9105 + if (!meta_tp->mpc || !is_meta_sk(meta_sk) ||
9106 + mpcb->infinite_mapping_snd ||
9107 + mpcb->infinite_mapping_rcv ||
9108 + mpcb->send_infinite_mapping)
9109 + goto next;
9111 + /* The pm may have changed in the meantime */
9112 + if (mpcb->pm_ops != &full_mesh)
9113 + goto next;
9115 + if (sock_owned_by_user(meta_sk)) {
9116 + if (!test_and_set_bit(MPTCP_PATH_MANAGER,
9117 + &meta_tp->tsq_flags))
9118 + sock_hold(meta_sk);
9120 + goto next;
9123 + if (event->code == MPTCP_EVENT_ADD) {
9124 + if (event->family == AF_INET)
9125 + fmp->add_addr++;
9126 +#if IS_ENABLED(CONFIG_IPV6)
9127 + if (event->family == AF_INET6)
9128 + fmp->add_addr++;
9129 +#endif
9131 + sk = mptcp_select_ack_sock(meta_sk, 0);
9132 + if (sk)
9133 + tcp_send_ack(sk);
9135 + full_mesh_create_subflows(meta_sk);
9138 + if (event->code == MPTCP_EVENT_DEL) {
9139 + struct sock *sk, *tmpsk;
9140 + struct mptcp_loc_addr *mptcp_local;
9141 + bool found = false;
9143 + mptcp_local = rcu_dereference_bh(fm_ns->local);
9145 + /* Look for the socket and remove him */
9146 + mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
9147 + if ((event->family == AF_INET6 &&
9148 + (sk->sk_family == AF_INET ||
9149 + mptcp_v6_is_v4_mapped(sk))) ||
9150 + (event->family == AF_INET &&
9151 + (sk->sk_family == AF_INET6 &&
9152 + !mptcp_v6_is_v4_mapped(sk))))
9153 + continue;
9155 + if (event->family == AF_INET &&
9156 + (sk->sk_family == AF_INET ||
9157 + mptcp_v6_is_v4_mapped(sk)) &&
9158 + inet_sk(sk)->inet_saddr != event->addr.in.s_addr)
9159 + continue;
9161 + if (event->family == AF_INET6 &&
9162 + sk->sk_family == AF_INET6 &&
9163 + !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6))
9164 + continue;
9166 + /* Reinject, so that pf = 1 and so we
9167 + * won't select this one as the
9168 + * ack-sock.
9169 + */
9170 + mptcp_reinject_data(sk, 0);
9172 + /* A master is special, it has
9173 + * address-id 0
9174 + */
9175 + if (!tcp_sk(sk)->mptcp->loc_id)
9176 + update_remove_addrs(0, meta_sk, mptcp_local);
9177 + else if (tcp_sk(sk)->mptcp->loc_id != id)
9178 + update_remove_addrs(tcp_sk(sk)->mptcp->loc_id, meta_sk, mptcp_local);
9180 + mptcp_sub_force_close(sk);
9181 + found = true;
9184 + if (!found)
9185 + goto next;
9187 + /* The id may have been given by the event,
9188 + * matching on a local address. And it may not
9189 + * have matched on one of the above sockets,
9190 + * because the client never created a subflow.
9191 + * So, we have to finally remove it here.
9192 + */
9193 + if (id > 0)
9194 + update_remove_addrs(id, meta_sk, mptcp_local);
9197 + if (event->code == MPTCP_EVENT_MOD) {
9198 + struct sock *sk;
9200 + mptcp_for_each_sk(mpcb, sk) {
9201 + struct tcp_sock *tp = tcp_sk(sk);
9202 + if (event->family == AF_INET &&
9203 + (sk->sk_family == AF_INET ||
9204 + mptcp_v6_is_v4_mapped(sk)) &&
9205 + inet_sk(sk)->inet_saddr == event->addr.in.s_addr) {
9206 + if (event->low_prio != tp->mptcp->low_prio) {
9207 + tp->mptcp->send_mp_prio = 1;
9208 + tp->mptcp->low_prio = event->low_prio;
9210 + tcp_send_ack(sk);
9214 + if (event->family == AF_INET6 &&
9215 + sk->sk_family == AF_INET6 &&
9216 + !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) {
9217 + if (event->low_prio != tp->mptcp->low_prio) {
9218 + tp->mptcp->send_mp_prio = 1;
9219 + tp->mptcp->low_prio = event->low_prio;
9221 + tcp_send_ack(sk);
9226 +next:
9227 + bh_unlock_sock(meta_sk);
9228 + sock_put(meta_sk);
9230 + rcu_read_unlock_bh();
9232 + goto next_event;
9235 +static struct mptcp_addr_event *lookup_similar_event(struct net *net,
9236 + struct mptcp_addr_event *event)
9238 + struct mptcp_addr_event *eventq;
9239 + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
9241 + list_for_each_entry(eventq, &fm_ns->events, list) {
9242 + if (eventq->family != event->family)
9243 + continue;
9244 + if (event->family == AF_INET) {
9245 + if (eventq->addr.in.s_addr == event->addr.in.s_addr)
9246 + return eventq;
9247 + } else {
9248 + if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6))
9249 + return eventq;
9252 + return NULL;
9255 +/* We already hold the net-namespace MPTCP-lock */
9256 +static void add_pm_event(struct net *net, struct mptcp_addr_event *event)
9258 + struct mptcp_addr_event *eventq = lookup_similar_event(net, event);
9259 + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
9261 + if (eventq) {
9262 + switch (event->code) {
9263 + case MPTCP_EVENT_DEL:
9264 + list_del(&eventq->list);
9265 + kfree(eventq);
9266 + break;
9267 + case MPTCP_EVENT_ADD:
9268 + eventq->low_prio = event->low_prio;
9269 + eventq->code = MPTCP_EVENT_ADD;
9270 + return;
9271 + case MPTCP_EVENT_MOD:
9272 + eventq->low_prio = event->low_prio;
9273 + return;
9277 + /* OK, we have to add the new address to the wait queue */
9278 + eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC);
9279 + if (!eventq)
9280 + return;
9282 + list_add_tail(&eventq->list, &fm_ns->events);
9284 + /* Create work-queue */
9285 + if (!delayed_work_pending(&fm_ns->address_worker))
9286 + queue_delayed_work(mptcp_wq, &fm_ns->address_worker,
9287 + msecs_to_jiffies(500));
9290 +static void addr4_event_handler(struct in_ifaddr *ifa, unsigned long event,
9291 + struct net *net)
9293 + struct net_device *netdev = ifa->ifa_dev->dev;
9294 + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
9295 + struct mptcp_addr_event mpevent;
9297 + if (ifa->ifa_scope > RT_SCOPE_LINK ||
9298 + ipv4_is_loopback(ifa->ifa_local))
9299 + return;
9301 + spin_lock_bh(&fm_ns->local_lock);
9303 + mpevent.family = AF_INET;
9304 + mpevent.addr.in.s_addr = ifa->ifa_local;
9305 + mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
9307 + if (event == NETDEV_DOWN || !netif_running(netdev) ||
9308 + (netdev->flags & IFF_NOMULTIPATH))
9309 + mpevent.code = MPTCP_EVENT_DEL;
9310 + else if (event == NETDEV_UP)
9311 + mpevent.code = MPTCP_EVENT_ADD;
9312 + else if (event == NETDEV_CHANGE)
9313 + mpevent.code = MPTCP_EVENT_MOD;
9315 + add_pm_event(net, &mpevent);
9317 + spin_unlock_bh(&fm_ns->local_lock);
9318 + return;
9321 +/* React to IPv4-addr add/rem-events */
9322 +static int mptcp_pm_inetaddr_event(struct notifier_block *this,
9323 + unsigned long event, void *ptr)
9325 + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
9326 + struct net *net = dev_net(ifa->ifa_dev->dev);
9328 + addr4_event_handler(ifa, event, net);
9330 + return NOTIFY_DONE;
9333 +static struct notifier_block mptcp_pm_inetaddr_notifier = {
9334 + .notifier_call = mptcp_pm_inetaddr_event,
9337 +#if IS_ENABLED(CONFIG_IPV6)
9339 +/* IPV6-related address/interface watchers */
9340 +struct mptcp_dad_data {
9341 + struct timer_list timer;
9342 + struct inet6_ifaddr *ifa;
9345 +static void dad_callback(unsigned long arg);
9346 +static int inet6_addr_event(struct notifier_block *this,
9347 + unsigned long event, void *ptr);
9349 +static int ipv6_is_in_dad_state(struct inet6_ifaddr *ifa)
9351 + return ((ifa->flags & IFA_F_TENTATIVE) &&
9352 + ifa->state == INET6_IFADDR_STATE_DAD);
9355 +static void dad_init_timer(struct mptcp_dad_data *data,
9356 + struct inet6_ifaddr *ifa)
9358 + data->ifa = ifa;
9359 + data->timer.data = (unsigned long)data;
9360 + data->timer.function = dad_callback;
9361 + if (ifa->idev->cnf.rtr_solicit_delay)
9362 + data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay;
9363 + else
9364 + data->timer.expires = jiffies + (HZ/10);
9367 +static void dad_callback(unsigned long arg)
9369 + struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg;
9371 + if (ipv6_is_in_dad_state(data->ifa)) {
9372 + dad_init_timer(data, data->ifa);
9373 + add_timer(&data->timer);
9374 + } else {
9375 + inet6_addr_event(NULL, NETDEV_UP, data->ifa);
9376 + in6_ifa_put(data->ifa);
9377 + kfree(data);
9381 +static inline void dad_setup_timer(struct inet6_ifaddr *ifa)
9383 + struct mptcp_dad_data *data;
9385 + data = kmalloc(sizeof(*data), GFP_ATOMIC);
9387 + if (!data)
9388 + return;
9390 + init_timer(&data->timer);
9391 + dad_init_timer(data, ifa);
9392 + add_timer(&data->timer);
9393 + in6_ifa_hold(ifa);
9396 +static void addr6_event_handler(struct inet6_ifaddr *ifa, unsigned long event,
9397 + struct net *net)
9399 + struct net_device *netdev = ifa->idev->dev;
9400 + int addr_type = ipv6_addr_type(&ifa->addr);
9401 + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
9402 + struct mptcp_addr_event mpevent;
9404 + if (ifa->scope > RT_SCOPE_LINK ||
9405 + addr_type == IPV6_ADDR_ANY ||
9406 + (addr_type & IPV6_ADDR_LOOPBACK) ||
9407 + (addr_type & IPV6_ADDR_LINKLOCAL))
9408 + return;
9410 + spin_lock_bh(&fm_ns->local_lock);
9412 + mpevent.family = AF_INET6;
9413 + mpevent.addr.in6 = ifa->addr;
9414 + mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
9416 + if (event == NETDEV_DOWN || !netif_running(netdev) ||
9417 + (netdev->flags & IFF_NOMULTIPATH))
9418 + mpevent.code = MPTCP_EVENT_DEL;
9419 + else if (event == NETDEV_UP)
9420 + mpevent.code = MPTCP_EVENT_ADD;
9421 + else if (event == NETDEV_CHANGE)
9422 + mpevent.code = MPTCP_EVENT_MOD;
9424 + add_pm_event(net, &mpevent);
9426 + spin_unlock_bh(&fm_ns->local_lock);
9427 + return;
9430 +/* React to IPv6-addr add/rem-events */
9431 +static int inet6_addr_event(struct notifier_block *this, unsigned long event,
9432 + void *ptr)
9434 + struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr;
9435 + struct net *net = dev_net(ifa6->idev->dev);
9437 + if (ipv6_is_in_dad_state(ifa6))
9438 + dad_setup_timer(ifa6);
9439 + else
9440 + addr6_event_handler(ifa6, event, net);
9442 + return NOTIFY_DONE;
9445 +static struct notifier_block inet6_addr_notifier = {
9446 + .notifier_call = inet6_addr_event,
9449 +#endif
9451 +/* React to ifup/down-events */
9452 +static int netdev_event(struct notifier_block *this, unsigned long event,
9453 + void *ptr)
9455 + struct net_device *dev = netdev_notifier_info_to_dev(ptr);
9456 + struct in_device *in_dev;
9457 +#if IS_ENABLED(CONFIG_IPV6)
9458 + struct inet6_dev *in6_dev;
9459 +#endif
9461 + if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
9462 + event == NETDEV_CHANGE))
9463 + return NOTIFY_DONE;
9465 + rcu_read_lock();
9466 + in_dev = __in_dev_get_rtnl(dev);
9468 + if (in_dev) {
9469 + for_ifa(in_dev) {
9470 + mptcp_pm_inetaddr_event(NULL, event, ifa);
9471 + } endfor_ifa(in_dev);
9474 +#if IS_ENABLED(CONFIG_IPV6)
9475 + in6_dev = __in6_dev_get(dev);
9477 + if (in6_dev) {
9478 + struct inet6_ifaddr *ifa6;
9479 + list_for_each_entry(ifa6, &in6_dev->addr_list, if_list)
9480 + inet6_addr_event(NULL, event, ifa6);
9482 +#endif
9484 + rcu_read_unlock();
9485 + return NOTIFY_DONE;
9488 +static struct notifier_block mptcp_pm_netdev_notifier = {
9489 + .notifier_call = netdev_event,
9492 +static void full_mesh_new_session(struct sock *meta_sk, int index)
9494 + struct mptcp_loc_addr *mptcp_local;
9495 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
9496 + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
9497 + struct net *net = sock_net(meta_sk);
9498 + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
9499 + struct sock *sk;
9500 + int i;
9502 + if (index == -1) {
9503 + mptcp_fallback_default(mpcb);
9504 + return;
9507 + /* Initialize workqueue-struct */
9508 + INIT_WORK(&fmp->subflow_work, create_subflow_worker);
9509 + INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker);
9510 + fmp->mpcb = mpcb;
9512 + sk = mptcp_select_ack_sock(meta_sk, 0);
9514 + rcu_read_lock();
9515 + mptcp_local = rcu_dereference(fm_ns->local);
9517 + /* Look for the address among the local addresses */
9518 + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
9519 + __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr;
9521 + /* We do not need to announce the initial subflow's address again */
9522 + if ((meta_sk->sk_family == AF_INET ||
9523 + mptcp_v6_is_v4_mapped(meta_sk)) &&
9524 + inet_sk(meta_sk)->inet_saddr == ifa_address)
9525 + continue;
9527 + fmp->add_addr++;
9529 + if (sk)
9530 + tcp_send_ack(sk);
9533 +#if IS_ENABLED(CONFIG_IPV6)
9534 + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
9535 + struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr;
9537 + /* We do not need to announce the initial subflow's address again */
9538 + if (meta_sk->sk_family == AF_INET6 &&
9539 + ipv6_addr_equal(&inet6_sk(meta_sk)->saddr, ifa6))
9540 + continue;
9542 + fmp->add_addr++;
9544 + if (sk)
9545 + tcp_send_ack(sk);
9547 +#endif
9549 + rcu_read_unlock();
9551 + if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk))
9552 + fmp->announced_addrs_v4 |= (1 << index);
9553 + else
9554 + fmp->announced_addrs_v6 |= (1 << index);
9557 +static void full_mesh_create_subflows(struct sock *meta_sk)
9559 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
9560 + struct fullmesh_priv *pm_priv = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
9562 + if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
9563 + mpcb->send_infinite_mapping ||
9564 + mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
9565 + return;
9567 + /* The master may not yet be fully established (address added through
9568 + * mptcp_update_metasocket). Then, we should not attempt to create new
9569 + * subflows.
9570 + */
9571 + if (mpcb->master_sk &&
9572 + !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
9573 + return;
9575 + if (!work_pending(&pm_priv->subflow_work)) {
9576 + sock_hold(meta_sk);
9577 + queue_work(mptcp_wq, &pm_priv->subflow_work);
9581 +/* Called upon release_sock, if the socket was owned by the user during
9582 + * a path-management event.
9583 + */
9584 +static void full_mesh_release_sock(struct sock *meta_sk)
9586 + struct mptcp_loc_addr *mptcp_local;
9587 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
9588 + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
9589 + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
9590 + struct sock *sk, *tmpsk;
9591 + int i;
9593 + rcu_read_lock();
9594 + mptcp_local = rcu_dereference(fm_ns->local);
9596 + /* First, detect modifications or additions */
9597 + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
9598 + struct in_addr ifa = mptcp_local->locaddr4[i].addr;
9599 + bool found = false;
9601 + mptcp_for_each_sk(mpcb, sk) {
9602 + struct tcp_sock *tp = tcp_sk(sk);
9604 + if (sk->sk_family == AF_INET6 &&
9605 + !mptcp_v6_is_v4_mapped(sk))
9606 + continue;
9608 + if (inet_sk(sk)->inet_saddr != ifa.s_addr)
9609 + continue;
9611 + found = true;
9613 + if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) {
9614 + tp->mptcp->send_mp_prio = 1;
9615 + tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio;
9617 + tcp_send_ack(sk);
9621 + if (!found) {
9622 + fmp->add_addr++;
9624 + sk = mptcp_select_ack_sock(meta_sk, 0);
9625 + if (sk)
9626 + tcp_send_ack(sk);
9627 + full_mesh_create_subflows(meta_sk);
9631 +#if IS_ENABLED(CONFIG_IPV6)
9632 + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
9633 + struct in6_addr ifa = mptcp_local->locaddr6[i].addr;
9634 + bool found = false;
9636 + mptcp_for_each_sk(mpcb, sk) {
9637 + struct tcp_sock *tp = tcp_sk(sk);
9639 + if (sk->sk_family == AF_INET ||
9640 + mptcp_v6_is_v4_mapped(sk))
9641 + continue;
9643 + if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa))
9644 + continue;
9646 + found = true;
9648 + if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) {
9649 + tp->mptcp->send_mp_prio = 1;
9650 + tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio;
9652 + tcp_send_ack(sk);
9656 + if (!found) {
9657 + fmp->add_addr++;
9659 + sk = mptcp_select_ack_sock(meta_sk, 0);
9660 + if (sk)
9661 + tcp_send_ack(sk);
9662 + full_mesh_create_subflows(meta_sk);
9665 +#endif
9667 + /* Now, detect address-removals */
9668 + mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
9669 + bool shall_remove = true;
9671 + if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
9672 + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
9673 + if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) {
9674 + shall_remove = false;
9675 + break;
9678 + } else {
9679 + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
9680 + if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) {
9681 + shall_remove = false;
9682 + break;
9687 + if (shall_remove) {
9688 + /* Reinject, so that pf = 1 and so we
9689 + * won't select this one as the
9690 + * ack-sock.
9691 + */
9692 + mptcp_reinject_data(sk, 0);
9694 + update_remove_addrs(tcp_sk(sk)->mptcp->loc_id, meta_sk,
9695 + mptcp_local);
9697 + if (mpcb->master_sk == sk)
9698 + update_remove_addrs(0, meta_sk, mptcp_local);
9700 + mptcp_sub_force_close(sk);
9703 + rcu_read_unlock();
9706 +static int full_mesh_get_local_index(sa_family_t family, union inet_addr *addr,
9707 + struct net *net)
9709 + struct mptcp_loc_addr *mptcp_local;
9710 + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
9711 + int index;
9713 + /* Handle the backup-flows */
9714 + rcu_read_lock();
9715 + mptcp_local = rcu_dereference(fm_ns->local);
9717 + index = mptcp_find_address(mptcp_local, family, addr);
9719 + rcu_read_unlock();
9721 + return index;
9724 +static int full_mesh_get_local_id(sa_family_t family, union inet_addr *addr,
9725 + struct net *net)
9727 + struct mptcp_loc_addr *mptcp_local;
9728 + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
9729 + int index, id = -1;
9731 + /* Handle the backup-flows */
9732 + rcu_read_lock();
9733 + mptcp_local = rcu_dereference(fm_ns->local);
9735 + index = mptcp_find_address(mptcp_local, family, addr);
9737 + if (index != -1) {
9738 + if (family == AF_INET)
9739 + id = mptcp_local->locaddr4[index].loc4_id;
9740 + else
9741 + id = mptcp_local->locaddr6[index].loc6_id;
9745 + rcu_read_unlock();
9747 + return id;
9750 +static void full_mesh_addr_signal(struct sock *sk, unsigned *size,
9751 + struct tcp_out_options *opts,
9752 + struct sk_buff *skb)
9754 + struct tcp_sock *tp = tcp_sk(sk);
9755 + struct mptcp_cb *mpcb = tp->mpcb;
9756 + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
9757 + struct mptcp_loc_addr *mptcp_local;
9758 + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk));
9759 + int remove_addr_len;
9760 + u8 unannouncedv4, unannouncedv6;
9762 + if (likely(!fmp->add_addr))
9763 + goto remove_addr;
9765 + rcu_read_lock();
9766 + mptcp_local = rcu_dereference(fm_ns->local);
9768 + /* IPv4 */
9769 + unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits;
9770 + if (unannouncedv4 &&
9771 + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) {
9772 + int ind = mptcp_find_free_index(~unannouncedv4);
9774 + opts->options |= OPTION_MPTCP;
9775 + opts->mptcp_options |= OPTION_ADD_ADDR;
9776 + opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id;
9777 + opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr;
9778 + opts->add_addr_v4 = 1;
9780 + if (skb) {
9781 + fmp->announced_addrs_v4 |= (1 << ind);
9782 + fmp->add_addr--;
9784 + *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN;
9787 + /* IPv6 */
9788 + unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits;
9789 + if (unannouncedv6 &&
9790 + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) {
9791 + int ind = mptcp_find_free_index(~unannouncedv6);
9793 + opts->options |= OPTION_MPTCP;
9794 + opts->mptcp_options |= OPTION_ADD_ADDR;
9795 + opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id;
9796 + opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr;
9797 + opts->add_addr_v6 = 1;
9799 + if (skb) {
9800 + fmp->announced_addrs_v6 |= (1 << ind);
9801 + fmp->add_addr--;
9803 + *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN;
9806 + rcu_read_unlock();
9808 + if (!unannouncedv4 && !unannouncedv6 && skb) {
9809 + fmp->add_addr--;
9812 +remove_addr:
9813 + if (likely(!fmp->remove_addrs))
9814 + return;
9816 + remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs);
9817 + if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len)
9818 + return;
9820 + opts->options |= OPTION_MPTCP;
9821 + opts->mptcp_options |= OPTION_REMOVE_ADDR;
9822 + opts->remove_addrs = fmp->remove_addrs;
9823 + *size += remove_addr_len;
9824 + if (skb)
9825 + fmp->remove_addrs = 0;
9828 +static int mptcp_fm_init_net(struct net *net)
9830 + struct mptcp_loc_addr *mptcp_local;
9831 + struct mptcp_fm_ns *fm_ns;
9833 + fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL);
9834 + if (!fm_ns)
9835 + return -ENOBUFS;
9837 + mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL);
9838 + if (!mptcp_local) {
9839 + kfree(fm_ns);
9840 + return -ENOBUFS;
9843 + mptcp_local->next_v4_index = 1;
9845 + rcu_assign_pointer(fm_ns->local, mptcp_local);
9846 + INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker);
9847 + INIT_LIST_HEAD(&fm_ns->events);
9848 + spin_lock_init(&fm_ns->local_lock);
9849 + fm_ns->net = net;
9850 + net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns;
9852 + return 0;
9855 +static void mptcp_fm_exit_net(struct net *net)
9857 + struct mptcp_addr_event *eventq, *tmp;
9858 + struct mptcp_fm_ns *fm_ns;
9859 + struct mptcp_loc_addr *mptcp_local;
9861 + fm_ns = fm_get_ns(net);
9862 + cancel_delayed_work_sync(&fm_ns->address_worker);
9864 + rcu_read_lock_bh();
9866 + mptcp_local = rcu_dereference_bh(fm_ns->local);
9867 + kfree(mptcp_local);
9869 + spin_lock(&fm_ns->local_lock);
9870 + list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) {
9871 + list_del(&eventq->list);
9872 + kfree(eventq);
9874 + spin_unlock(&fm_ns->local_lock);
9876 + rcu_read_unlock_bh();
9878 + kfree(fm_ns);
9881 +static struct pernet_operations full_mesh_net_ops = {
9882 + .init = mptcp_fm_init_net,
9883 + .exit = mptcp_fm_exit_net,
9886 +static struct mptcp_pm_ops full_mesh __read_mostly = {
9887 + .new_session = full_mesh_new_session,
9888 + .release_sock = full_mesh_release_sock,
9889 + .fully_established = full_mesh_create_subflows,
9890 + .new_remote_address = full_mesh_create_subflows,
9891 + .get_local_index = full_mesh_get_local_index,
9892 + .get_local_id = full_mesh_get_local_id,
9893 + .addr_signal = full_mesh_addr_signal,
9894 + .name = "fullmesh",
9895 + .owner = THIS_MODULE,
9898 +/* General initialization of MPTCP_PM */
9899 +static int __init full_mesh_register(void)
9901 + int ret;
9903 + BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE);
9905 + ret = register_pernet_subsys(&full_mesh_net_ops);
9906 + if (ret)
9907 + goto out;
9909 + ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
9910 + if (ret)
9911 + goto err_reg_inetaddr;
9912 + ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier);
9913 + if (ret)
9914 + goto err_reg_netdev;
9916 +#if IS_ENABLED(CONFIG_IPV6)
9917 + ret = register_inet6addr_notifier(&inet6_addr_notifier);
9918 + if (ret)
9919 + goto err_reg_inet6addr;
9920 +#endif
9922 + ret = mptcp_register_path_manager(&full_mesh);
9923 + if (ret)
9924 + goto err_reg_pm;
9926 +out:
9927 + return ret;
9930 +err_reg_pm:
9931 +#if IS_ENABLED(CONFIG_IPV6)
9932 + unregister_inet6addr_notifier(&inet6_addr_notifier);
9933 +err_reg_inet6addr:
9934 +#endif
9935 + unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
9936 +err_reg_netdev:
9937 + unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
9938 +err_reg_inetaddr:
9939 + unregister_pernet_subsys(&full_mesh_net_ops);
9940 + goto out;
9943 +static void full_mesh_unregister(void)
9945 +#if IS_ENABLED(CONFIG_IPV6)
9946 + unregister_inet6addr_notifier(&inet6_addr_notifier);
9947 +#endif
9948 + unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
9949 + unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
9950 + unregister_pernet_subsys(&full_mesh_net_ops);
9951 + mptcp_unregister_path_manager(&full_mesh);
9954 +module_init(full_mesh_register);
9955 +module_exit(full_mesh_unregister);
9957 +MODULE_AUTHOR("Christoph Paasch");
9958 +MODULE_LICENSE("GPL");
9959 +MODULE_DESCRIPTION("Full-Mesh MPTCP");
9960 +MODULE_VERSION("0.88");
9961 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_input.c linux-3.14.45/net/mptcp/mptcp_input.c
9962 --- linux-3.14.45.orig/net/mptcp/mptcp_input.c 1970-01-01 01:00:00.000000000 +0100
9963 +++ linux-3.14.45/net/mptcp/mptcp_input.c 2015-06-24 14:15:48.895862487 +0200
9964 @@ -0,0 +1,2254 @@
9966 + * MPTCP implementation - Receiving side
9968 + * Initial Design & Implementation:
9969 + * Sébastien Barré <sebastien.barre@uclouvain.be>
9971 + * Current Maintainer & Author:
9972 + * Christoph Paasch <christoph.paasch@uclouvain.be>
9974 + * Additional authors:
9975 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
9976 + * Gregory Detal <gregory.detal@uclouvain.be>
9977 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
9978 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
9979 + * Lavkesh Lahngir <lavkesh51@gmail.com>
9980 + * Andreas Ripke <ripke@neclab.eu>
9981 + * Vlad Dogaru <vlad.dogaru@intel.com>
9982 + * Octavian Purdila <octavian.purdila@intel.com>
9983 + * John Ronan <jronan@tssg.org>
9984 + * Catalin Nicutar <catalin.nicutar@gmail.com>
9985 + * Brandon Heller <brandonh@stanford.edu>
9988 + * This program is free software; you can redistribute it and/or
9989 + * modify it under the terms of the GNU General Public License
9990 + * as published by the Free Software Foundation; either version
9991 + * 2 of the License, or (at your option) any later version.
9992 + */
9994 +#include <asm/unaligned.h>
9996 +#include <net/mptcp.h>
9997 +#include <net/mptcp_v4.h>
9998 +#include <net/mptcp_v6.h>
10000 +#include <linux/kconfig.h>
10002 +/* is seq1 < seq2 ? */
10003 +static inline int before64(const u64 seq1, const u64 seq2)
10005 + return (s64)(seq1 - seq2) < 0;
10008 +/* is seq1 > seq2 ? */
10009 +#define after64(seq1, seq2) before64(seq2, seq1)
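
For reference, before64()/after64() are the 64-bit counterparts of TCP's before()/after(): the subtraction is evaluated as a signed 64-bit value, so the comparison stays correct across wrap-around of the data-sequence space. A standalone sketch of the same trick in plain userspace C (not kernel code):

#include <assert.h>
#include <stdint.h>

/* is seq1 < seq2, modulo 2^64 ? (same idea as before64() above) */
static int before64_demo(uint64_t seq1, uint64_t seq2)
{
        return (int64_t)(seq1 - seq2) < 0;
}

int main(void)
{
        /* Ordinary case: 100 comes before 200. */
        assert(before64_demo(100, 200));
        /* Wrap-around case: 0xFFFFFFFFFFFFFFF0 is "just before" 5,
         * even though it is numerically larger. */
        assert(before64_demo(0xFFFFFFFFFFFFFFF0ULL, 5));
        assert(!before64_demo(5, 0xFFFFFFFFFFFFFFF0ULL));
        return 0;
}
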
10011 +static inline void mptcp_become_fully_estab(struct sock *sk)
10013 + tcp_sk(sk)->mptcp->fully_established = 1;
10015 + if (is_master_tp(tcp_sk(sk)) &&
10016 + tcp_sk(sk)->mpcb->pm_ops->fully_established)
10017 + tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk));
10020 +/* Similar to tcp_tso_acked without any memory accounting */
10021 +static inline int mptcp_tso_acked_reinject(struct sock *sk, struct sk_buff *skb)
10023 + struct tcp_sock *tp = tcp_sk(sk);
10024 + u32 packets_acked, len;
10026 + BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
10028 + packets_acked = tcp_skb_pcount(skb);
10030 + if (skb_unclone(skb, GFP_ATOMIC))
10031 + return 0;
10033 + len = tp->snd_una - TCP_SKB_CB(skb)->seq;
10034 + __pskb_trim_head(skb, len);
10036 + TCP_SKB_CB(skb)->seq += len;
10037 + skb->ip_summed = CHECKSUM_PARTIAL;
10038 + skb->truesize -= len;
10040 + /* Any change of skb->len requires recalculation of tso factor. */
10041 + if (tcp_skb_pcount(skb) > 1)
10042 + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
10043 + packets_acked -= tcp_skb_pcount(skb);
10045 + if (packets_acked) {
10046 + BUG_ON(tcp_skb_pcount(skb) == 0);
10047 + BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
10050 + return packets_acked;
10053 +/**
10054 + * Cleans the meta-socket retransmission queue and the reinject-queue.
10055 + * @sk must be the metasocket.
10056 + */
10057 +static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una)
10059 + struct sk_buff *skb, *tmp;
10060 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
10061 + struct mptcp_cb *mpcb = meta_tp->mpcb;
10062 + bool acked = false;
10063 + u32 acked_pcount;
10065 + while ((skb = tcp_write_queue_head(meta_sk)) &&
10066 + skb != tcp_send_head(meta_sk)) {
10067 + bool fully_acked = true;
10069 + if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
10070 + if (tcp_skb_pcount(skb) == 1 ||
10071 + !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
10072 + break;
10074 + acked_pcount = tcp_tso_acked(meta_sk, skb);
10075 + if (!acked_pcount)
10076 + break;
10078 + fully_acked = false;
10079 + } else {
10080 + acked_pcount = tcp_skb_pcount(skb);
10083 + acked = true;
10084 + meta_tp->packets_out -= acked_pcount;
10085 + meta_tp->retrans_stamp = 0;
10087 + if (!fully_acked)
10088 + break;
10090 + tcp_unlink_write_queue(skb, meta_sk);
10092 + if (mptcp_is_data_fin(skb)) {
10093 + struct sock *sk_it;
10095 + /* DATA_FIN has been acknowledged - now we can close
10096 + * the subflows
10097 + */
10098 + mptcp_for_each_sk(mpcb, sk_it) {
10099 + unsigned long delay = 0;
10101 + /* If we are the passive closer, don't trigger
10102 + * subflow-fin until the subflow has been finned
10103 + * by the peer - thus we add a delay.
10104 + */
10105 + if (mpcb->passive_close &&
10106 + sk_it->sk_state == TCP_ESTABLISHED)
10107 + delay = inet_csk(sk_it)->icsk_rto << 3;
10109 + mptcp_sub_close(sk_it, delay);
10112 + sk_wmem_free_skb(meta_sk, skb);
10114 + /* Remove acknowledged data from the reinject queue */
10115 + skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) {
10116 + if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
10117 + if (tcp_skb_pcount(skb) == 1 ||
10118 + !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
10119 + break;
10121 + mptcp_tso_acked_reinject(meta_sk, skb);
10122 + break;
10125 + __skb_unlink(skb, &mpcb->reinject_queue);
10126 + __kfree_skb(skb);
10129 + if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una)))
10130 + meta_tp->snd_up = meta_tp->snd_una;
10132 + if (acked) {
10133 + tcp_rearm_rto(meta_sk);
10134 + /* Normally this is done in tcp_try_undo_loss - but MPTCP
10135 + * does not call this function.
10136 + */
10137 + inet_csk(meta_sk)->icsk_retransmits = 0;
10141 +/* Inspired by tcp_rcv_state_process */
10142 +static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk,
10143 + const struct sk_buff *skb, u32 data_seq,
10144 + u16 data_len)
10146 + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
10147 + struct tcphdr *th = tcp_hdr(skb);
10149 + /* State-machine handling if FIN has been enqueued and it has
10150 + * been acked (snd_una == write_seq) - it's important that this
10151 + * here is after sk_wmem_free_skb because otherwise
10152 + * sk_forward_alloc is wrong upon inet_csk_destroy_sock()
10153 + */
10154 + switch (meta_sk->sk_state) {
10155 + case TCP_FIN_WAIT1:
10156 + if (meta_tp->snd_una == meta_tp->write_seq) {
10157 + struct dst_entry *dst = __sk_dst_get(meta_sk);
10159 + tcp_set_state(meta_sk, TCP_FIN_WAIT2);
10160 + meta_sk->sk_shutdown |= SEND_SHUTDOWN;
10162 + dst = __sk_dst_get(sk);
10163 + if (dst)
10164 + dst_confirm(dst);
10166 + if (!sock_flag(meta_sk, SOCK_DEAD)) {
10167 + /* Wake up lingering close() */
10168 + meta_sk->sk_state_change(meta_sk);
10169 + } else {
10170 + int tmo;
10172 + if (meta_tp->linger2 < 0 ||
10173 + (data_len &&
10174 + after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0),
10175 + meta_tp->rcv_nxt))) {
10176 + mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
10177 + tcp_done(meta_sk);
10178 + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
10179 + return 1;
10182 + tmo = tcp_fin_time(meta_sk);
10183 + if (tmo > TCP_TIMEWAIT_LEN) {
10184 + inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN);
10185 + } else if (mptcp_is_data_fin2(skb, tp) ||
10186 + sock_owned_by_user(meta_sk)) {
10187 + /* Bad case. We could lose such FIN otherwise.
10188 + * It is not a big problem, but it looks confusing
10189 + * and not so rare event. We still can lose it now,
10190 + * if it spins in bh_lock_sock(), but it is really
10191 + * marginal case.
10192 + */
10193 + inet_csk_reset_keepalive_timer(meta_sk, tmo);
10194 + } else {
10195 + tcp_time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
10199 + break;
10200 + case TCP_CLOSING:
10201 + case TCP_LAST_ACK:
10202 + if (meta_tp->snd_una == meta_tp->write_seq) {
10203 + tcp_done(meta_sk);
10204 + return 1;
10206 + break;
10209 + /* step 7: process the segment text */
10210 + switch (meta_sk->sk_state) {
10211 + case TCP_FIN_WAIT1:
10212 + case TCP_FIN_WAIT2:
10213 + /* RFC 793 says to queue data in these states,
10214 + * RFC 1122 says we MUST send a reset.
10215 + * BSD 4.4 also does reset.
10216 + */
10217 + if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
10218 + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
10219 + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
10220 + !mptcp_is_data_fin2(skb, tp)) {
10221 + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
10222 + mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
10223 + tcp_reset(meta_sk);
10224 + return 1;
10227 + break;
10230 + return 0;
10233 +/**
10234 + * @return:
10235 + * i) 1: Everything's fine.
10236 + * ii) -1: A reset has been sent on the subflow - csum-failure
10237 + * iii) 0: csum-failure but no reset sent, because it's the last subflow.
10238 + * Last packet should not be destroyed by the caller because it has
10239 + * been done here.
10240 + */
10241 +static int mptcp_verif_dss_csum(struct sock *sk)
10243 + struct tcp_sock *tp = tcp_sk(sk);
10244 + struct sk_buff *tmp, *tmp1, *last = NULL;
10245 + __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */
10246 + int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0;
10247 + int iter = 0;
10249 + skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) {
10250 + unsigned int csum_len;
10252 + if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq))
10253 + /* Mapping ends in the middle of the packet -
10254 + * csum only these bytes
10255 + */
10256 + csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq;
10257 + else
10258 + csum_len = tmp->len;
10260 + offset = 0;
10261 + if (overflowed) {
10262 + char first_word[4];
10263 + first_word[0] = 0;
10264 + first_word[1] = 0;
10265 + first_word[2] = 0;
10266 + first_word[3] = *(tmp->data);
10267 + csum_tcp = csum_partial(first_word, 4, csum_tcp);
10268 + offset = 1;
10269 + csum_len--;
10270 + overflowed = 0;
10273 + csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp);
10275 + /* Was the length odd? Then we have to merge the next byte
10276 + * correctly (see above)
10277 + */
10278 + if (csum_len != (csum_len & (~1)))
10279 + overflowed = 1;
10281 + if (mptcp_is_data_seq(tmp) && !dss_csum_added) {
10282 + __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32));
10284 + /* If a 64-bit dss is present, we increase the offset
10285 + * by 4 bytes, as the high-order 32 bits will be added
10286 + * in the final csum_partial-call.
10287 + */
10288 + u32 offset = skb_transport_offset(tmp) +
10289 + TCP_SKB_CB(tmp)->dss_off;
10290 + if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET)
10291 + offset += 4;
10293 + csum_tcp = skb_checksum(tmp, offset,
10294 + MPTCP_SUB_LEN_SEQ_CSUM,
10295 + csum_tcp);
10297 + csum_tcp = csum_partial(&data_seq,
10298 + sizeof(data_seq), csum_tcp);
10300 + dss_csum_added = 1; /* Just do it once */
10302 + last = tmp;
10303 + iter++;
10305 + if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) &&
10306 + !before(TCP_SKB_CB(tmp1)->seq,
10307 + tp->mptcp->map_subseq + tp->mptcp->map_data_len))
10308 + break;
10311 + /* Now, checksum must be 0 */
10312 + if (unlikely(csum_fold(csum_tcp))) {
10313 + pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n",
10314 + __func__, csum_fold(csum_tcp),
10315 + TCP_SKB_CB(last)->seq, dss_csum_added, overflowed,
10316 + iter);
10318 + tp->mptcp->send_mp_fail = 1;
10320 + /* map_data_seq is the data-seq number of the
10321 + * mapping we are currently checking
10322 + */
10323 + tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq;
10325 + if (tp->mpcb->cnt_subflows > 1) {
10326 + mptcp_send_reset(sk);
10327 + ans = -1;
10328 + } else {
10329 + tp->mpcb->send_infinite_mapping = 1;
10331 + /* Need to purge the rcv-queue as it's no longer valid */
10332 + while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
10333 + tp->copied_seq = TCP_SKB_CB(tmp)->end_seq;
10334 + kfree_skb(tmp);
10337 + ans = 0;
10341 + return ans;
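
mptcp_verif_dss_csum() feeds every segment of the current mapping into one ones'-complement sum; when a segment ends on an odd byte, that byte is carried over and merged with the first byte of the next segment (the "overflowed" handling above). The kernel does this with skb_checksum()/csum_partial(); the sketch below only illustrates the general carry-the-odd-byte technique in plain userspace C and is not the kernel implementation.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct csum_state {
        uint32_t sum;           /* running 32-bit accumulator */
        int      have_odd;      /* did the previous chunk end on an odd byte? */
        uint8_t  odd;           /* the carried byte */
};

/* Feed one chunk of payload into the running ones'-complement sum. */
static void csum_feed(struct csum_state *s, const uint8_t *p, size_t len)
{
        size_t i = 0;

        if (s->have_odd && len) {
                /* Pair the carried byte with the first byte of this chunk. */
                s->sum += (uint32_t)((s->odd << 8) | p[0]);
                s->have_odd = 0;
                i = 1;
        }
        for (; i + 1 < len; i += 2)
                s->sum += (uint32_t)((p[i] << 8) | p[i + 1]);
        if (i < len) {
                s->odd = p[i];
                s->have_odd = 1;
        }
}

/* Fold the accumulator and return the final 16-bit checksum. */
static uint16_t csum_finish(struct csum_state *s)
{
        uint32_t sum = s->sum;

        if (s->have_odd)
                sum += (uint32_t)(s->odd << 8); /* pad the trailing odd byte */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int main(void)
{
        const char *msg = "hello world";
        struct csum_state a = { 0 }, b = { 0 };
        uint8_t buf[32];

        memcpy(buf, msg, strlen(msg));
        /* Summing in one chunk or in odd-sized pieces gives the same result. */
        csum_feed(&a, buf, strlen(msg));
        csum_feed(&b, buf, 3);
        csum_feed(&b, buf + 3, strlen(msg) - 3);
        printf("%#x %#x\n", csum_finish(&a), csum_finish(&b));
        return 0;
}
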
10344 +static inline void mptcp_prepare_skb(struct sk_buff *skb, struct sk_buff *next,
10345 + struct sock *sk)
10347 + struct tcp_sock *tp = tcp_sk(sk);
10348 + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
10349 + /* Adapt data-seq's to the packet itself. We essentially transform the
10350 + * dss-mapping to a per-packet granularity. This is necessary to
10351 + * correctly handle overlapping mappings coming from different
10352 + * subflows. Otherwise it would be a complete mess.
10353 + */
10354 + tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq;
10355 + tcb->end_seq = tcb->seq + skb->len;
10357 + /* If cur is the last one in the rcv-queue (or the last one for this
10358 + * mapping), and data_fin is enqueued, the end_data_seq is +1.
10359 + */
10360 + if (skb_queue_is_last(&sk->sk_receive_queue, skb) ||
10361 + after(TCP_SKB_CB(next)->end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
10362 + tcb->end_seq += tp->mptcp->map_data_fin;
10364 + /* We manually set the fin-flag if it is a data-fin. For easy
10365 + * processing in tcp_recvmsg.
10366 + */
10367 + if (mptcp_is_data_fin2(skb, tp))
10368 + tcp_hdr(skb)->fin = 1;
10369 + else
10370 + tcp_hdr(skb)->fin = 0;
10371 + } else {
10372 + /* We may have a subflow-fin with data but without data-fin */
10373 + tcp_hdr(skb)->fin = 0;
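
mptcp_prepare_skb() rewrites each skb's sequence numbers from the subflow space into the data space: the skb's offset inside the mapping (seq - map_subseq) is added to the mapping's data-sequence number. A minimal sketch of that translation in plain C, with hypothetical field names:

#include <stdint.h>
#include <stdio.h>

/* Translate a subflow-level sequence number into the data-level space,
 * given the current DSS mapping (map_data_seq, map_subseq). */
static uint32_t subseq_to_dataseq(uint64_t map_data_seq, uint32_t map_subseq,
                                  uint32_t sub_seq)
{
        uint32_t offset = sub_seq - map_subseq; /* offset into the mapping */

        return (uint32_t)map_data_seq + offset;
}

int main(void)
{
        /* A mapping starting at subflow seq 1000 covering data seq 500000:
         * the skb at subflow seq 1100 carries data seq 500100. */
        printf("%u\n", subseq_to_dataseq(500000, 1000, 1100));
        return 0;
}
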
10377 +/**
10378 + * @return: 1 if the segment has been eaten and can be suppressed,
10379 + * otherwise 0.
10380 + */
10381 +static inline int mptcp_direct_copy(struct sk_buff *skb, struct sock *meta_sk)
10383 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
10384 + int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len);
10385 + int eaten = 0;
10387 + __set_current_state(TASK_RUNNING);
10389 + local_bh_enable();
10390 + if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) {
10391 + meta_tp->ucopy.len -= chunk;
10392 + meta_tp->copied_seq += chunk;
10393 + eaten = (chunk == skb->len);
10394 + tcp_rcv_space_adjust(meta_sk);
10396 + local_bh_disable();
10397 + return eaten;
10400 +static inline void mptcp_reset_mapping(struct tcp_sock *tp)
10402 + tp->mptcp->map_data_len = 0;
10403 + tp->mptcp->map_data_seq = 0;
10404 + tp->mptcp->map_subseq = 0;
10405 + tp->mptcp->map_data_fin = 0;
10406 + tp->mptcp->mapping_present = 0;
10409 +/* The DSS-mapping received on the sk only covers the second half of the skb
10410 + * (cut at seq). We trim the head from the skb.
10411 + * Data will be freed upon kfree().
10413 + * Inspired by tcp_trim_head().
10414 + */
10415 +static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq)
10417 + int len = seq - TCP_SKB_CB(skb)->seq;
10418 + u32 new_seq = TCP_SKB_CB(skb)->seq + len;
10420 + if (len < skb_headlen(skb))
10421 + __skb_pull(skb, len);
10422 + else
10423 + __pskb_trim_head(skb, len - skb_headlen(skb));
10425 + TCP_SKB_CB(skb)->seq = new_seq;
10427 + skb->truesize -= len;
10428 + atomic_sub(len, &sk->sk_rmem_alloc);
10429 + sk_mem_uncharge(sk, len);
10432 +/* The DSS-mapping received on the sk only covers the first half of the skb
10433 + * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue
10434 + * as further packets may resolve the mapping of the second half of data.
10436 + * Inspired by tcp_fragment().
10437 + */
10438 +static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq)
10440 + struct sk_buff *buff;
10441 + int nsize;
10442 + int nlen, len;
10444 + len = seq - TCP_SKB_CB(skb)->seq;
10445 + nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len;
10446 + if (nsize < 0)
10447 + nsize = 0;
10449 + /* Get a new skb... force flag on. */
10450 + buff = alloc_skb(nsize, GFP_ATOMIC);
10451 + if (buff == NULL)
10452 + return -ENOMEM;
10454 + skb_reserve(buff, tcp_sk(sk)->tcp_header_len);
10455 + skb_reset_transport_header(buff);
10457 + tcp_hdr(buff)->fin = tcp_hdr(skb)->fin;
10458 + tcp_hdr(skb)->fin = 0;
10460 + /* We absolutely need to call skb_set_owner_r before refreshing the
10461 + * truesize of buff, otherwise the moved data will account twice.
10462 + */
10463 + skb_set_owner_r(buff, sk);
10464 + nlen = skb->len - len - nsize;
10465 + buff->truesize += nlen;
10466 + skb->truesize -= nlen;
10468 + /* Correct the sequence numbers. */
10469 + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
10470 + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
10471 + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
10473 + skb_split(skb, buff, len);
10475 + __skb_queue_after(&sk->sk_receive_queue, skb, buff);
10477 + return 0;
10480 +/* @return: 0 everything is fine. Just continue processing
10481 + * 1 subflow is broken stop everything
10482 + * -1 this packet was broken - continue with the next one.
10483 + */
10484 +static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb)
10486 + struct tcp_sock *tp = tcp_sk(sk);
10488 + /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */
10489 + if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) &&
10490 + !tp->mpcb->infinite_mapping_rcv) {
10491 + /* Remove a pure subflow-fin from the queue and increase
10492 + * copied_seq.
10493 + */
10494 + tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
10495 + __skb_unlink(skb, &sk->sk_receive_queue);
10496 + __kfree_skb(skb);
10497 + return -1;
10500 + /* If we are not yet fully established and do not know the mapping for
10501 + * this segment, this path has to fall back to infinite or be torn down.
10502 + */
10503 + if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
10504 + !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) {
10505 + pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n",
10506 + __func__, tp->mpcb->mptcp_loc_token,
10507 + tp->mptcp->path_index, __builtin_return_address(0),
10508 + TCP_SKB_CB(skb)->seq);
10510 + if (!is_master_tp(tp)) {
10511 + mptcp_send_reset(sk);
10512 + return 1;
10515 + tp->mpcb->infinite_mapping_snd = 1;
10516 + tp->mpcb->infinite_mapping_rcv = 1;
10517 + tp->mptcp->fully_established = 1;
10520 + /* Receiver-side becomes fully established when a whole rcv-window has
10521 + * been received without the need to fall back due to the previous
10522 + * condition. */
10523 + if (!tp->mptcp->fully_established) {
10524 + tp->mptcp->init_rcv_wnd -= skb->len;
10525 + if (tp->mptcp->init_rcv_wnd < 0)
10526 + mptcp_become_fully_estab(sk);
10529 + return 0;
10532 +/* @return: 0 everything is fine. Just continue processing
10533 + * 1 subflow is broken stop everything
10534 + * -1 this packet was broken - continue with the next one.
10535 + */
10536 +static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb)
10538 + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
10539 + struct mptcp_cb *mpcb = tp->mpcb;
10540 + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
10541 + u32 *ptr;
10542 + u32 data_seq, sub_seq, data_len, tcp_end_seq;
10544 + /* If we are in infinite-mapping-mode, the subflow is guaranteed to be
10545 + * in-order at the data-level. Thus data-seq-numbers can be inferred
10546 + * from what is expected at the data-level.
10547 + */
10548 + if (mpcb->infinite_mapping_rcv) {
10549 + tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp);
10550 + tp->mptcp->map_subseq = tcb->seq;
10551 + tp->mptcp->map_data_len = skb->len;
10552 + tp->mptcp->map_data_fin = tcp_hdr(skb)->fin;
10553 + tp->mptcp->mapping_present = 1;
10554 + return 0;
10557 + /* No mapping here? Exit - it is either already set or still on its way */
10558 + if (!mptcp_is_data_seq(skb)) {
10559 + /* Too many packets without a mapping - this subflow is broken */
10560 + if (!tp->mptcp->mapping_present &&
10561 + tp->rcv_nxt - tp->copied_seq > 65536) {
10562 + mptcp_send_reset(sk);
10563 + return 1;
10566 + return 0;
10569 + ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
10570 + ptr++;
10571 + sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
10572 + ptr++;
10573 + data_len = get_unaligned_be16(ptr);
10575 + /* If it's an empty skb with DATA_FIN, sub_seq must get fixed.
10576 + * The draft sets it to 0, but we really would like to have the
10577 + * real value, to have an easy handling afterwards here in this
10578 + * function.
10579 + */
10580 + if (mptcp_is_data_fin(skb) && skb->len == 0)
10581 + sub_seq = TCP_SKB_CB(skb)->seq;
10583 + /* If there is already a mapping - we check if it maps with the current
10584 + * one. If not - we reset.
10585 + */
10586 + if (tp->mptcp->mapping_present &&
10587 + (data_seq != (u32)tp->mptcp->map_data_seq ||
10588 + sub_seq != tp->mptcp->map_subseq ||
10589 + data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin ||
10590 + mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) {
10591 + /* Mapping in packet is different from what we want */
10592 + pr_err("%s Mappings do not match!\n", __func__);
10593 + pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n",
10594 + __func__, data_seq, (u32)tp->mptcp->map_data_seq,
10595 + sub_seq, tp->mptcp->map_subseq, data_len,
10596 + tp->mptcp->map_data_len, mptcp_is_data_fin(skb),
10597 + tp->mptcp->map_data_fin);
10598 + mptcp_send_reset(sk);
10599 + return 1;
10602 + /* If the previous check was good, the current mapping is valid and we exit. */
10603 + if (tp->mptcp->mapping_present)
10604 + return 0;
10606 + /* Mapping not yet set on this subflow - we set it here! */
10608 + if (!data_len) {
10609 + mpcb->infinite_mapping_rcv = 1;
10610 + tp->mptcp->fully_established = 1;
10611 + /* We need to repeat mp_fail's until the sender fell
10612 + * back to infinite-mapping - here we stop repeating it.
10613 + */
10614 + tp->mptcp->send_mp_fail = 0;
10616 + /* We have to fixup data_len - it must be the same as skb->len */
10617 + data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0);
10618 + sub_seq = tcb->seq;
10620 + /* TODO kill all other subflows than this one */
10621 + /* data_seq and so on are set correctly */
10623 + /* At this point, the meta-ofo-queue has to be emptied,
10624 + * as the following data is guaranteed to be in-order at
10625 + * the data and subflow-level
10626 + */
10627 + mptcp_purge_ofo_queue(meta_tp);
10630 + /* We are sending mp-fail's and thus are in fallback mode.
10631 + * Ignore packets which do not announce the fallback and still
10632 + * want to provide a mapping.
10633 + */
10634 + if (tp->mptcp->send_mp_fail) {
10635 + tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
10636 + __skb_unlink(skb, &sk->sk_receive_queue);
10637 + __kfree_skb(skb);
10638 + return -1;
10641 + /* FIN increased the mapping-length by 1 */
10642 + if (mptcp_is_data_fin(skb))
10643 + data_len--;
10645 + /* Subflow-sequences of packet must be
10646 + * (at least partially) be part of the DSS-mapping's
10647 + * subflow-sequence-space.
10649 + * Basically the mapping is not valid, if either of the
10650 + * following conditions is true:
10652 + * 1. It's not a data_fin and
10653 + * MPTCP-sub_seq >= TCP-end_seq
10655 + * 2. It's a data_fin and TCP-end_seq > TCP-seq and
10656 + * MPTCP-sub_seq >= TCP-end_seq
10658 + * The previous two can be merged into:
10659 + * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
10660 + * Because if it's not a data-fin, TCP-end_seq > TCP-seq
10662 + * 3. It's a data_fin and skb->len == 0 and
10663 + * MPTCP-sub_seq > TCP-end_seq
10665 + * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
10666 + * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
10668 + * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq)
10669 + */
10671 + /* subflow-fin is not part of the mapping - ignore it here! */
10672 + tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin;
10673 + if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
10674 + (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
10675 + (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) ||
10676 + before(sub_seq, tp->copied_seq)) {
10677 + /* Subflow-sequences of packet is different from what is in the
10678 + * packet's dss-mapping. The peer is misbehaving - reset
10679 + */
10680 + pr_err("%s Packet's mapping does not map to the DSS sub_seq %u "
10681 + "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u "
10682 + "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb),
10683 + skb->len, data_len, tp->copied_seq);
10684 + mptcp_send_reset(sk);
10685 + return 1;
10688 + /* Does the DSS had 64-bit seqnum's ? */
10689 + if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
10690 + /* Wrapped around? */
10691 + if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
10692 + tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
10693 + } else {
10694 + /* Else, access the default high-order bits */
10695 + tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
10697 + } else {
10698 + tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
10700 + if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
10701 + /* We make sure that the data_seq is invalid.
10702 + * It will be dropped later.
10703 + */
10704 + tp->mptcp->map_data_seq += 0xFFFFFFFF;
10705 + tp->mptcp->map_data_seq += 0xFFFFFFFF;
10709 + tp->mptcp->map_data_len = data_len;
10710 + tp->mptcp->map_subseq = sub_seq;
10711 + tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
10712 + tp->mptcp->mapping_present = 1;
10714 + return 0;
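
The DSS option usually carries only the low 32 bits of the data-sequence number; mptcp_get_data_seq_64(), defined elsewhere in this patch, rebuilds the full 64-bit value from a per-connection pair of high-order words, switching to the other word when the 32-bit space has wrapped. The code below is a rough, simplified paraphrase of that expansion in plain C; the state layout and names are hypothetical, not the kernel helper.

#include <stdint.h>

/* Rebuild a 64-bit data-sequence number from its 32-bit wire form.
 * high_order[] holds the two candidate upper words tracked per connection;
 * cur_index selects the one matching the current rcv_nxt (hypothetical state). */
static uint64_t expand_data_seq(const uint32_t high_order[2], int cur_index,
                                uint32_t rcv_nxt32, uint32_t data_seq)
{
        int idx = cur_index;

        /* data_seq logically after rcv_nxt but numerically below it:
         * the 32-bit space wrapped, so use the other high-order word. */
        if ((int32_t)(data_seq - rcv_nxt32) > 0 && data_seq < rcv_nxt32)
                idx = !cur_index;

        return ((uint64_t)high_order[idx] << 32) | data_seq;
}

int main(void)
{
        const uint32_t hi[2] = { 0, 1 };

        /* Near the wrap point, a small data_seq maps into the next epoch. */
        return expand_data_seq(hi, 0, 0xFFFFFF00u, 0x00000010u) ==
               0x100000010ULL ? 0 : 1;
}
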
10717 +/* Similar to tcp_sequence(...) */
10718 +static inline int mptcp_sequence(const struct tcp_sock *meta_tp,
10719 + u64 data_seq, u64 end_data_seq)
10721 + struct mptcp_cb *mpcb = meta_tp->mpcb;
10722 + u64 rcv_wup64;
10724 + /* Wrap-around? */
10725 + if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
10726 + rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
10727 + meta_tp->rcv_wup;
10728 + } else {
10729 + rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
10730 + meta_tp->rcv_wup);
10733 + return !before64(end_data_seq, rcv_wup64) &&
10734 + !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp));
10737 +/* @return: 0 everything is fine. Just continue processing
10738 + * -1 this packet was broken - continue with the next one.
10739 + */
10740 +static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb)
10742 + struct tcp_sock *tp = tcp_sk(sk);
10743 + struct sk_buff *tmp, *tmp1;
10744 + u32 tcp_end_seq;
10746 + if (!tp->mptcp->mapping_present)
10747 + return 0;
10749 + /* either, the new skb gave us the mapping and the first segment
10750 + * in the sub-rcv-queue has to be trimmed ...
10751 + */
10752 + tmp = skb_peek(&sk->sk_receive_queue);
10753 + if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) &&
10754 + after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq))
10755 + mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq);
10757 + /* ... or the new skb (tail) has to be split at the end. */
10758 + tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0);
10759 + if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
10760 + u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len;
10761 + if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */
10762 + /* TODO : maybe handle this here better.
10763 + * We now just force meta-retransmission.
10764 + */
10765 + tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
10766 + __skb_unlink(skb, &sk->sk_receive_queue);
10767 + __kfree_skb(skb);
10768 + return -1;
10772 + /* Now, remove old sk_buff's from the receive-queue.
10773 + * This may happen if the mapping has been lost for these segments and
10774 + * the next mapping has already been received.
10775 + */
10776 + if (tp->mptcp->mapping_present &&
10777 + before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) {
10778 + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
10779 + if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq))
10780 + break;
10782 + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
10783 + __skb_unlink(tmp1, &sk->sk_receive_queue);
10785 + /* Impossible that we could free skb here, because its
10786 + * mapping is known to be valid from previous checks
10787 + */
10788 + __kfree_skb(tmp1);
10792 + return 0;
10795 +/* @return: 0 everything is fine. Just continue processing
10796 + * 1 subflow is broken stop everything
10797 + * -1 this mapping has been put in the meta-receive-queue
10798 + * -2 this mapping has been eaten by the application
10799 + */
10800 +static int mptcp_queue_skb(struct sock *sk)
10802 + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
10803 + struct sock *meta_sk = mptcp_meta_sk(sk);
10804 + struct mptcp_cb *mpcb = tp->mpcb;
10805 + struct sk_buff *tmp, *tmp1;
10806 + u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp);
10807 + bool data_queued = false;
10809 + /* Have we not yet received the full mapping? */
10810 + if (!tp->mptcp->mapping_present ||
10811 + before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len))
10812 + return 0;
10814 + /* Is this an overlapping mapping? rcv_nxt >= end_data_seq
10815 + * OR
10816 + * This mapping is out of window
10817 + */
10818 + if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) ||
10819 + !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq,
10820 + tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) {
10821 + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
10822 + __skb_unlink(tmp1, &sk->sk_receive_queue);
10823 + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
10824 + __kfree_skb(tmp1);
10826 + if (!skb_queue_empty(&sk->sk_receive_queue) &&
10827 + !before(TCP_SKB_CB(tmp)->seq,
10828 + tp->mptcp->map_subseq + tp->mptcp->map_data_len))
10829 + break;
10832 + mptcp_reset_mapping(tp);
10834 + return -1;
10837 + /* Record it, because we want to send our data_fin on the same path */
10838 + if (tp->mptcp->map_data_fin) {
10839 + mpcb->dfin_path_index = tp->mptcp->path_index;
10840 + mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN);
10843 + /* Verify the checksum */
10844 + if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) {
10845 + int ret = mptcp_verif_dss_csum(sk);
10847 + if (ret <= 0) {
10848 + mptcp_reset_mapping(tp);
10849 + return 1;
10853 + if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) {
10854 + /* Segments have to go to the meta-ofo-queue */
10855 + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
10856 + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
10857 + mptcp_prepare_skb(tmp1, tmp, sk);
10858 + __skb_unlink(tmp1, &sk->sk_receive_queue);
10859 + /* MUST be done here, because fragstolen may be true later.
10860 + * Then, kfree_skb_partial will not account the memory.
10861 + */
10862 + skb_orphan(tmp1);
10864 + if (!mpcb->in_time_wait) /* In time-wait, do not receive data */
10865 + mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk);
10866 + else
10867 + __kfree_skb(tmp1);
10869 + if (!skb_queue_empty(&sk->sk_receive_queue) &&
10870 + !before(TCP_SKB_CB(tmp)->seq,
10871 + tp->mptcp->map_subseq + tp->mptcp->map_data_len))
10872 + break;
10875 + } else {
10876 + /* Ready for the meta-rcv-queue */
10877 + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
10878 + int eaten = 0;
10879 + int copied_early = 0;
10880 + bool fragstolen = false;
10881 + u32 old_rcv_nxt = meta_tp->rcv_nxt;
10883 + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
10884 + mptcp_prepare_skb(tmp1, tmp, sk);
10885 + __skb_unlink(tmp1, &sk->sk_receive_queue);
10886 + /* MUST be done here, because fragstolen may be true.
10887 + * Then, kfree_skb_partial will not account the memory.
10888 + */
10889 + skb_orphan(tmp1);
10891 + /* This segment has already been received */
10892 + if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) {
10893 + __kfree_skb(tmp1);
10894 + goto next;
10897 +#ifdef CONFIG_NET_DMA
10898 + if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
10899 + meta_tp->ucopy.task == current &&
10900 + meta_tp->copied_seq == meta_tp->rcv_nxt &&
10901 + tmp1->len <= meta_tp->ucopy.len &&
10902 + sock_owned_by_user(meta_sk) &&
10903 + tcp_dma_try_early_copy(meta_sk, tmp1, 0)) {
10904 + copied_early = 1;
10905 + eaten = 1;
10907 +#endif
10909 + /* Is direct copy possible ? */
10910 + if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
10911 + meta_tp->ucopy.task == current &&
10912 + meta_tp->copied_seq == meta_tp->rcv_nxt &&
10913 + meta_tp->ucopy.len && sock_owned_by_user(meta_sk) &&
10914 + !copied_early)
10915 + eaten = mptcp_direct_copy(tmp1, meta_sk);
10917 + if (mpcb->in_time_wait) /* In time-wait, do not receive data */
10918 + eaten = 1;
10920 + if (!eaten)
10921 + eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen);
10923 + meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq;
10924 + mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
10926 + if (copied_early)
10927 + tcp_cleanup_rbuf(meta_sk, tmp1->len);
10929 + if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait)
10930 + mptcp_fin(meta_sk);
10932 + /* Check if this fills a gap in the ofo queue */
10933 + if (!skb_queue_empty(&meta_tp->out_of_order_queue))
10934 + mptcp_ofo_queue(meta_sk);
10936 +#ifdef CONFIG_NET_DMA
10937 + if (copied_early)
10938 + __skb_queue_tail(&meta_sk->sk_async_wait_queue,
10939 + tmp1);
10940 + else
10941 +#endif
10942 + if (eaten)
10943 + kfree_skb_partial(tmp1, fragstolen);
10945 + data_queued = true;
10946 +next:
10947 + if (!skb_queue_empty(&sk->sk_receive_queue) &&
10948 + !before(TCP_SKB_CB(tmp)->seq,
10949 + tp->mptcp->map_subseq + tp->mptcp->map_data_len))
10950 + break;
10954 + inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp;
10955 + tp->mptcp->last_data_seq = tp->mptcp->map_data_seq;
10956 + mptcp_reset_mapping(tp);
10958 + return data_queued ? -1 : -2;
10961 +void mptcp_data_ready(struct sock *sk, int bytes)
10963 + struct sock *meta_sk = mptcp_meta_sk(sk);
10964 + struct sk_buff *skb, *tmp;
10965 + int queued = 0;
10967 + /* If the meta is already closed, there is no point in pushing data */
10968 + if (meta_sk->sk_state == TCP_CLOSE && !tcp_sk(sk)->mpcb->in_time_wait) {
10969 + skb_queue_purge(&sk->sk_receive_queue);
10970 + tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt;
10971 + goto exit;
10974 +restart:
10975 + /* Iterate over all segments, detect their mapping (if we don't have
10976 + * one yet), validate them and push everything one level higher.
10977 + */
10978 + skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
10979 + int ret;
10980 + /* Pre-validation - e.g., early fallback */
10981 + ret = mptcp_prevalidate_skb(sk, skb);
10982 + if (ret < 0)
10983 + goto restart;
10984 + else if (ret > 0)
10985 + break;
10987 + /* Set the current mapping */
10988 + ret = mptcp_detect_mapping(sk, skb);
10989 + if (ret < 0)
10990 + goto restart;
10991 + else if (ret > 0)
10992 + break;
10994 + /* Validation */
10995 + if (mptcp_validate_mapping(sk, skb) < 0)
10996 + goto restart;
10998 + /* Push a level higher */
10999 + ret = mptcp_queue_skb(sk);
11000 + if (ret < 0) {
11001 + if (ret == -1)
11002 + queued = ret;
11003 + goto restart;
11004 + } else if (ret == 0) {
11005 + continue;
11006 + } else { /* ret == 1 */
11007 + break;
11011 +exit:
11012 + if (tcp_sk(sk)->close_it) {
11013 + tcp_send_ack(sk);
11014 + tcp_time_wait(sk, TCP_TIME_WAIT, 0);
11017 + if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD))
11018 + meta_sk->sk_data_ready(meta_sk, 0);
11022 +int mptcp_check_req(struct sk_buff *skb, struct net *net)
11024 + struct tcphdr *th = tcp_hdr(skb);
11025 + struct sock *meta_sk = NULL;
11027 + /* MPTCP structures not initialized */
11028 + if (mptcp_init_failed)
11029 + return 0;
11031 + if (skb->protocol == htons(ETH_P_IP))
11032 + meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr,
11033 + ip_hdr(skb)->daddr, net);
11034 +#if IS_ENABLED(CONFIG_IPV6)
11035 + else /* IPv6 */
11036 + meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr,
11037 + &ipv6_hdr(skb)->daddr, net);
11038 +#endif /* CONFIG_IPV6 */
11040 + if (!meta_sk)
11041 + return 0;
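+	/* Tag the skb as an MP_JOIN so that the meta-socket's receive path
+	 * (mptcp_v4_do_rcv / mptcp_v6_do_rcv) treats it as a join request
+	 * instead of regular data.
+	 */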
11043 + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN;
11045 + bh_lock_sock_nested(meta_sk);
11046 + if (sock_owned_by_user(meta_sk)) {
11047 + skb->sk = meta_sk;
11048 + if (unlikely(sk_add_backlog(meta_sk, skb,
11049 + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
11050 + bh_unlock_sock(meta_sk);
11051 + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
11052 + sock_put(meta_sk); /* Taken by mptcp_search_req */
11053 + kfree_skb(skb);
11054 + return 1;
11056 + } else if (skb->protocol == htons(ETH_P_IP)) {
11057 + tcp_v4_do_rcv(meta_sk, skb);
11058 +#if IS_ENABLED(CONFIG_IPV6)
11059 + } else { /* IPv6 */
11060 + tcp_v6_do_rcv(meta_sk, skb);
11061 +#endif /* CONFIG_IPV6 */
11063 + bh_unlock_sock(meta_sk);
11064 + sock_put(meta_sk); /* Taken by mptcp_vX_search_req */
11065 + return 1;
11068 +struct mp_join *mptcp_find_join(struct sk_buff *skb)
11070 + struct tcphdr *th = tcp_hdr(skb);
11071 + unsigned char *ptr;
11072 + int length = (th->doff * 4) - sizeof(struct tcphdr);
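+	/* doff counts 32-bit words of the entire TCP header, so this is the
+	 * number of option bytes that follow the fixed header.
+	 */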
11074 + /* Jump through the options to check whether JOIN is there */
11075 + ptr = (unsigned char *)(th + 1);
11076 + while (length > 0) {
11077 + int opcode = *ptr++;
11078 + int opsize;
11080 + switch (opcode) {
11081 + case TCPOPT_EOL:
11082 + return NULL;
11083 + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
11084 + length--;
11085 + continue;
11086 + default:
11087 + opsize = *ptr++;
11088 + if (opsize < 2) /* "silly options" */
11089 + return NULL;
11090 + if (opsize > length)
11091 + return NULL; /* don't parse partial options */
11092 + if (opcode == TCPOPT_MPTCP &&
11093 + ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) {
11094 + return (struct mp_join *)(ptr - 2);
11096 + ptr += opsize - 2;
11097 + length -= opsize;
11100 + return NULL;
11103 +int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw)
11105 + struct mptcp_cb *mpcb;
11106 + struct sock *meta_sk;
11107 + u32 token;
11108 + struct mp_join *join_opt = mptcp_find_join(skb);
11109 + if (!join_opt)
11110 + return 0;
11112 + /* MPTCP structures were not initialized, so return error */
11113 + if (mptcp_init_failed)
11114 + return -1;
11116 + token = join_opt->u.syn.token;
11117 + meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token);
11118 + if (!meta_sk) {
11119 + mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
11120 + return -1;
11123 + mpcb = tcp_sk(meta_sk)->mpcb;
11124 + if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) {
11125 + /* We are in fallback-mode on the reception-side -
11126 + * no new subflows!
11127 + */
11128 + sock_put(meta_sk); /* Taken by mptcp_hash_find */
11129 + return -1;
11132 + /* Coming from time-wait-sock processing in tcp_v4_rcv.
11133 + * We have to deschedule it before continuing, because otherwise
11134 + * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req.
11135 + */
11136 + if (tw) {
11137 + inet_twsk_deschedule(tw, &tcp_death_row);
11138 + inet_twsk_put(tw);
11141 + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN;
11142 + /* OK, this is a new syn/join, let's create a new open request and
11143 + * send syn+ack
11144 + */
11145 + bh_lock_sock_nested(meta_sk);
11146 + if (sock_owned_by_user(meta_sk)) {
11147 + skb->sk = meta_sk;
11148 + if (unlikely(sk_add_backlog(meta_sk, skb,
11149 + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
11150 + bh_unlock_sock(meta_sk);
11151 + NET_INC_STATS_BH(sock_net(meta_sk),
11152 + LINUX_MIB_TCPBACKLOGDROP);
11153 + sock_put(meta_sk); /* Taken by mptcp_hash_find */
11154 + kfree_skb(skb);
11155 + return 1;
11157 + } else if (skb->protocol == htons(ETH_P_IP)) {
11158 + tcp_v4_do_rcv(meta_sk, skb);
11159 +#if IS_ENABLED(CONFIG_IPV6)
11160 + } else {
11161 + tcp_v6_do_rcv(meta_sk, skb);
11162 +#endif /* CONFIG_IPV6 */
11164 + bh_unlock_sock(meta_sk);
11165 + sock_put(meta_sk); /* Taken by mptcp_hash_find */
11166 + return 1;
11169 +int mptcp_do_join_short(struct sk_buff *skb, struct mptcp_options_received *mopt,
11170 + struct tcp_options_received *tmp_opt, struct net *net)
11172 + struct sock *meta_sk;
11173 + u32 token;
11175 + token = mopt->mptcp_rem_token;
11176 + meta_sk = mptcp_hash_find(net, token);
11177 + if (!meta_sk) {
11178 + mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
11179 + return -1;
11182 + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN;
11184 + /* OK, this is a new syn/join, let's create a new open request and
11185 + * send syn+ack
11186 + */
11187 + bh_lock_sock(meta_sk);
11189 + /* This check is also done in mptcp_vX_do_rcv. But, there we cannot
11190 + * call tcp_vX_send_reset, because we already hold two socket-locks.
11191 + * (the listener and the meta from above)
11193 + * And the send-reset will try to take yet another one (ip_send_reply).
11194 + * Thus, we propagate the reset up to tcp_rcv_state_process.
11195 + */
11196 + if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv ||
11197 + tcp_sk(meta_sk)->mpcb->send_infinite_mapping ||
11198 + meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) {
11199 + bh_unlock_sock(meta_sk);
11200 + sock_put(meta_sk); /* Taken by mptcp_hash_find */
11201 + return -1;
11204 + if (sock_owned_by_user(meta_sk)) {
11205 + skb->sk = meta_sk;
11206 + if (unlikely(sk_add_backlog(meta_sk, skb,
11207 + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf)))
11208 + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
11209 + else
11210 + /* Must make sure that upper layers won't free the
11211 + * skb if it is added to the backlog-queue.
11212 + */
11213 + skb_get(skb);
11214 + } else {
11215 + /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as
11216 + * the skb will finally be freed by tcp_v4_do_rcv (where we are
11217 + * coming from)
11218 + */
11219 + skb_get(skb);
11220 + if (skb->protocol == htons(ETH_P_IP)) {
11221 + tcp_v4_do_rcv(meta_sk, skb);
11222 +#if IS_ENABLED(CONFIG_IPV6)
11223 + } else { /* IPv6 */
11224 + tcp_v6_do_rcv(meta_sk, skb);
11225 +#endif /* CONFIG_IPV6 */
11229 + bh_unlock_sock(meta_sk);
11230 + sock_put(meta_sk); /* Taken by mptcp_hash_find */
11231 + return 0;
11234 +/**
11235 + * Equivalent of tcp_fin() for MPTCP
11236 + * Can be called only when the FIN is validly part
11237 + * of the data seqnum space - not earlier, while there may still be holes.
11238 + */
11239 +void mptcp_fin(struct sock *meta_sk)
11241 + struct sock *sk = NULL, *sk_it;
11242 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
11243 + struct mptcp_cb *mpcb = meta_tp->mpcb;
11245 + mptcp_for_each_sk(mpcb, sk_it) {
11246 + if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) {
11247 + sk = sk_it;
11248 + break;
11252 + if (!sk || sk->sk_state == TCP_CLOSE)
11253 + sk = mptcp_select_ack_sock(meta_sk, 0);
11255 + inet_csk_schedule_ack(sk);
11257 + meta_sk->sk_shutdown |= RCV_SHUTDOWN;
11258 + sock_set_flag(meta_sk, SOCK_DONE);
11260 + switch (meta_sk->sk_state) {
11261 + case TCP_SYN_RECV:
11262 + case TCP_ESTABLISHED:
11263 + /* Move to CLOSE_WAIT */
11264 + tcp_set_state(meta_sk, TCP_CLOSE_WAIT);
11265 + inet_csk(sk)->icsk_ack.pingpong = 1;
11266 + break;
11268 + case TCP_CLOSE_WAIT:
11269 + case TCP_CLOSING:
11270 + /* Received a retransmission of the FIN, do
11271 + * nothing.
11272 + */
11273 + break;
11274 + case TCP_LAST_ACK:
11275 + /* RFC793: Remain in the LAST-ACK state. */
11276 + break;
11278 + case TCP_FIN_WAIT1:
11279 + /* This case occurs when a simultaneous close
11280 + * happens, we must ack the received FIN and
11281 + * enter the CLOSING state.
11282 + */
11283 + tcp_send_ack(sk);
11284 + tcp_set_state(meta_sk, TCP_CLOSING);
11285 + break;
11286 + case TCP_FIN_WAIT2:
11287 + /* Received a FIN -- send ACK and enter TIME_WAIT. */
11288 + tcp_send_ack(sk);
11289 + tcp_time_wait(meta_sk, TCP_TIME_WAIT, 0);
11290 + break;
11291 + default:
11292 + /* Only TCP_LISTEN and TCP_CLOSE are left, in these
11293 + * cases we should never reach this piece of code.
11294 + */
11295 + pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__,
11296 + meta_sk->sk_state);
11297 + break;
11300 + /* It _is_ possible, that we have something out-of-order _after_ FIN.
11301 + * Probably, we should reset in this case. For now drop them.
11302 + */
11303 + mptcp_purge_ofo_queue(meta_tp);
11304 + sk_mem_reclaim(meta_sk);
11306 + if (!sock_flag(meta_sk, SOCK_DEAD)) {
11307 + meta_sk->sk_state_change(meta_sk);
11309 + /* Do not send POLL_HUP for half duplex close. */
11310 + if (meta_sk->sk_shutdown == SHUTDOWN_MASK ||
11311 + meta_sk->sk_state == TCP_CLOSE)
11312 + sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP);
11313 + else
11314 + sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN);
11317 + return;
11320 +static void mptcp_xmit_retransmit_queue(struct sock *meta_sk)
11322 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
11323 + struct sk_buff *skb;
11325 + if (!meta_tp->packets_out)
11326 + return;
11328 + tcp_for_write_queue(skb, meta_sk) {
11329 + if (skb == tcp_send_head(meta_sk))
11330 + break;
11332 + if (mptcp_retransmit_skb(meta_sk, skb))
11333 + return;
11335 + if (skb == tcp_write_queue_head(meta_sk))
11336 + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
11337 + inet_csk(meta_sk)->icsk_rto,
11338 + TCP_RTO_MAX);
11342 +/* Handle the DATA_ACK */
11343 +static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
11345 + struct sock *meta_sk = mptcp_meta_sk(sk);
11346 + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
11347 + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
11348 + u32 prior_snd_una = meta_tp->snd_una;
11349 + int prior_packets;
11350 + u32 nwin, data_ack, data_seq;
11351 + u16 data_len = 0;
11353 + /* A valid packet came in - subflow is operational again */
11354 + tp->pf = 0;
11356 + /* Even if there is no data-ack, we stop retransmitting.
11357 + * Except if this is a SYN/ACK. Then it is just a retransmission
11358 + */
11359 + if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) {
11360 + tp->mptcp->pre_established = 0;
11361 + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
11364 + /* If we are in infinite mapping mode, rx_opt.data_ack has been
11365 + * set by mptcp_clean_rtx_infinite.
11366 + */
11367 + if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
11368 + goto exit;
11370 + data_ack = tp->mptcp->rx_opt.data_ack;
11372 + if (unlikely(!tp->mptcp->fully_established) &&
11373 + (data_ack != meta_tp->mptcp->snt_isn ||
11374 + tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq))
11375 + /* As soon as data has been data-acked,
11376 + * or a subflow-data-ack (not acking syn - thus snt_isn + 1)
11377 + * includes a data-ack, we are fully established
11378 + */
11379 + mptcp_become_fully_estab(sk);
11381 + /* Get the data_seq */
11382 + if (mptcp_is_data_seq(skb)) {
11383 + data_seq = tp->mptcp->rx_opt.data_seq;
11384 + data_len = tp->mptcp->rx_opt.data_len;
11385 + } else {
11386 + data_seq = meta_tp->snd_wl1;
11389 + /* If the ack is older than previous acks
11390 + * then we can probably ignore it.
11391 + */
11392 + if (before(data_ack, prior_snd_una))
11393 + goto exit;
11395 + /* If the ack includes data we haven't sent yet, discard
11396 + * this segment (RFC793 Section 3.9).
11397 + */
11398 + if (after(data_ack, meta_tp->snd_nxt))
11399 + goto exit;
11401 + /*** Now, update the window - inspired by tcp_ack_update_window ***/
11402 + nwin = ntohs(tcp_hdr(skb)->window);
11404 + if (likely(!tcp_hdr(skb)->syn))
11405 + nwin <<= tp->rx_opt.snd_wscale;
11407 + if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) {
11408 + tcp_update_wl(meta_tp, data_seq);
11410 + /* Draft v09, Section 3.3.5:
11411 + * [...] It should only update its local receive window values
11412 + * when the largest sequence number allowed (i.e. DATA_ACK +
11413 + * receive window) increases. [...]
11414 + */
11415 + if (meta_tp->snd_wnd != nwin &&
11416 + !before(data_ack + nwin, tcp_wnd_end(meta_tp))) {
11417 + meta_tp->snd_wnd = nwin;
11419 + if (nwin > meta_tp->max_window)
11420 + meta_tp->max_window = nwin;
11423 + /*** Done, update the window ***/
11425 + /* We passed data and got it acked, remove any soft error
11426 + * log. Something worked...
11427 + */
11428 + sk->sk_err_soft = 0;
11429 + inet_csk(meta_sk)->icsk_probes_out = 0;
11430 + meta_tp->rcv_tstamp = tcp_time_stamp;
11431 + prior_packets = meta_tp->packets_out;
11432 + if (!prior_packets)
11433 + goto no_queue;
11435 + meta_tp->snd_una = data_ack;
11437 + mptcp_clean_rtx_queue(meta_sk, prior_snd_una);
11439 + /* We are in loss-state, and something got acked, retransmit the whole
11440 + * queue now!
11441 + */
11442 + if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss &&
11443 + after(data_ack, prior_snd_una)) {
11444 + mptcp_xmit_retransmit_queue(meta_sk);
11445 + inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open;
11448 + /* Simplified version of tcp_new_space, because the snd-buffer
11449 + * is handled by all the subflows.
11450 + */
11451 + if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) {
11452 + sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK);
11453 + if (meta_sk->sk_socket &&
11454 + test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
11455 + meta_sk->sk_write_space(meta_sk);
11458 + if (meta_sk->sk_state != TCP_ESTABLISHED &&
11459 + mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len))
11460 + return;
11462 +exit:
11463 + mptcp_push_pending_frames(meta_sk);
11465 + return;
11467 +no_queue:
11468 + if (tcp_send_head(meta_sk))
11469 + tcp_ack_probe(meta_sk);
11471 + mptcp_push_pending_frames(meta_sk);
11473 + return;
11476 +void mptcp_clean_rtx_infinite(struct sk_buff *skb, struct sock *sk)
11478 + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk));
11480 + if (!tp->mpcb->infinite_mapping_snd)
11481 + return;
11483 + /* The difference between both write_seq's represents the offset between
11484 + * data-sequence and subflow-sequence. As we are infinite, this must
11485 + * match.
11487 + * Thus, from this difference we can infer the meta snd_una.
11488 + */
11489 + tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt +
11490 + tp->snd_una;
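+	/* Worked example with hypothetical numbers: if the meta snd_nxt is 5000,
+	 * the subflow snd_nxt is 3000 and the subflow snd_una is 2800, the
+	 * constant offset between the two sequence spaces is 2000, so the
+	 * inferred meta-level data_ack is 2800 + 2000 = 4800.
+	 */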
11492 + mptcp_data_ack(sk, skb);
11495 +/**** static functions used by mptcp_parse_options */
11497 +static inline int mptcp_rem_raddress(struct mptcp_cb *mpcb, u8 rem_id)
11499 + if (mptcp_v4_rem_raddress(mpcb, rem_id) < 0) {
11500 +#if IS_ENABLED(CONFIG_IPV6)
11501 + if (mptcp_v6_rem_raddress(mpcb, rem_id) < 0)
11502 + return -1;
11503 +#else
11504 + return -1;
11505 +#endif /* CONFIG_IPV6 */
11507 + return 0;
11510 +static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id)
11512 + struct sock *sk_it, *tmpsk;
11514 + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
11515 + if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) {
11516 + mptcp_reinject_data(sk_it, 0);
11517 + sk_it->sk_err = ECONNRESET;
11518 + if (tcp_need_reset(sk_it->sk_state))
11519 + tcp_send_active_reset(sk_it, GFP_ATOMIC);
11520 + mptcp_sub_force_close(sk_it);
11525 +void mptcp_parse_options(const uint8_t *ptr, int opsize,
11526 + struct tcp_options_received *opt_rx,
11527 + struct mptcp_options_received *mopt,
11528 + const struct sk_buff *skb)
11530 + struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
11532 + /* If the socket is mp-capable we would have a mopt. */
11533 + if (!mopt)
11534 + return;
11536 + switch (mp_opt->sub) {
11537 + case MPTCP_SUB_CAPABLE:
11539 + struct mp_capable *mpcapable = (struct mp_capable *)ptr;
11541 + if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN &&
11542 + opsize != MPTCP_SUB_LEN_CAPABLE_ACK) {
11543 + mptcp_debug("%s: mp_capable: bad option size %d\n",
11544 + __func__, opsize);
11545 + break;
11548 + if (!sysctl_mptcp_enabled)
11549 + break;
11551 + /* We only support MPTCP version 0 */
11552 + if (mpcapable->ver != 0)
11553 + break;
11555 + /* MPTCP-RFC 6824:
11556 + * "If receiving a message with the 'B' flag set to 1, and this
11557 + * is not understood, then this SYN MUST be silently ignored;
11558 + */
11559 + if (mpcapable->b) {
11560 + mopt->drop_me = 1;
11561 + break;
11564 + /* MPTCP-RFC 6824:
11565 + * "An implementation that only supports this method MUST set
11566 + * bit "H" to 1, and bits "C" through "G" to 0."
11567 + */
11568 + if (!mpcapable->h)
11569 + break;
11571 + mopt->saw_mpc = 1;
11572 + mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
11574 + if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN)
11575 + mopt->mptcp_key = mpcapable->sender_key;
11577 + break;
11579 + case MPTCP_SUB_JOIN:
11581 + struct mp_join *mpjoin = (struct mp_join *)ptr;
11583 + if (opsize != MPTCP_SUB_LEN_JOIN_SYN &&
11584 + opsize != MPTCP_SUB_LEN_JOIN_SYNACK &&
11585 + opsize != MPTCP_SUB_LEN_JOIN_ACK) {
11586 + mptcp_debug("%s: mp_join: bad option size %d\n",
11587 + __func__, opsize);
11588 + break;
11591 + /* saw_mpc must be set, because in tcp_check_req we assume that
11592 + * it is set to support falling back to reg. TCP if a rexmitted
11593 + * SYN has no MP_CAPABLE or MP_JOIN
11594 + */
11595 + switch (opsize) {
11596 + case MPTCP_SUB_LEN_JOIN_SYN:
11597 + mopt->is_mp_join = 1;
11598 + mopt->saw_mpc = 1;
11599 + mopt->low_prio = mpjoin->b;
11600 + mopt->rem_id = mpjoin->addr_id;
11601 + mopt->mptcp_rem_token = mpjoin->u.syn.token;
11602 + mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce;
11603 + break;
11604 + case MPTCP_SUB_LEN_JOIN_SYNACK:
11605 + mopt->saw_mpc = 1;
11606 + mopt->low_prio = mpjoin->b;
11607 + mopt->rem_id = mpjoin->addr_id;
11608 + mopt->mptcp_recv_tmac = mpjoin->u.synack.mac;
11609 + mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce;
11610 + break;
11611 + case MPTCP_SUB_LEN_JOIN_ACK:
11612 + mopt->saw_mpc = 1;
11613 + mopt->join_ack = 1;
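+			/* Unlike the truncated MAC in the SYN/ACK, the third
+			 * ACK carries the full 160-bit (20-byte) HMAC.
+			 */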
11614 + memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20);
11615 + break;
11617 + break;
11619 + case MPTCP_SUB_DSS:
11621 + struct mp_dss *mdss = (struct mp_dss *)ptr;
11622 + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
11624 + /* We check opsize for the csum and non-csum case. We do this,
11625 + * because the draft says that the csum SHOULD be ignored if
11626 + * it has not been negotiated in the MP_CAPABLE but still is
11627 + * present in the data.
11629 + * It will get ignored later in mptcp_queue_skb.
11630 + */
11631 + if (opsize != mptcp_sub_len_dss(mdss, 0) &&
11632 + opsize != mptcp_sub_len_dss(mdss, 1)) {
11633 + mptcp_debug("%s: mp_dss: bad option size %d\n",
11634 + __func__, opsize);
11635 + break;
11638 + ptr += 4;
11640 + if (mdss->A) {
11641 + tcb->mptcp_flags |= MPTCPHDR_ACK;
11643 + if (mdss->a) {
11644 + mopt->data_ack = (u32) get_unaligned_be64(ptr);
11645 + ptr += MPTCP_SUB_LEN_ACK_64;
11646 + } else {
11647 + mopt->data_ack = get_unaligned_be32(ptr);
11648 + ptr += MPTCP_SUB_LEN_ACK;
11652 + tcb->dss_off = (ptr - skb_transport_header(skb));
11654 + if (mdss->M) {
11655 + if (mdss->m) {
11656 + u64 data_seq64 = get_unaligned_be64(ptr);
11658 + tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET;
11659 + mopt->data_seq = (u32) data_seq64;
11661 + ptr += 12; /* 64-bit dseq + subseq */
11662 + } else {
11663 + mopt->data_seq = get_unaligned_be32(ptr);
11664 + ptr += 8; /* 32-bit dseq + subseq */
11666 + mopt->data_len = get_unaligned_be16(ptr);
11668 + tcb->mptcp_flags |= MPTCPHDR_SEQ;
11670 + /* Is a check-sum present? */
11671 + if (opsize == mptcp_sub_len_dss(mdss, 1))
11672 + tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
11674 + /* DATA_FIN only possible with DSS-mapping */
11675 + if (mdss->F)
11676 + tcb->mptcp_flags |= MPTCPHDR_FIN;
11679 + break;
11681 + case MPTCP_SUB_ADD_ADDR:
11683 +#if IS_ENABLED(CONFIG_IPV6)
11684 + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
11686 + if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
11687 + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
11688 + (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
11689 + opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) {
11690 +#else
11691 + if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
11692 + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) {
11693 +#endif /* CONFIG_IPV6 */
11694 + mptcp_debug("%s: mp_add_addr: bad option size %d\n",
11695 + __func__, opsize);
11696 + break;
11699 + /* We have to manually parse the options if we got two of them. */
11700 + if (mopt->saw_add_addr) {
11701 + mopt->more_add_addr = 1;
11702 + break;
11704 + mopt->saw_add_addr = 1;
11705 + mopt->add_addr_ptr = ptr;
11706 + break;
11708 + case MPTCP_SUB_REMOVE_ADDR:
11709 + if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
11710 + mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
11711 + __func__, opsize);
11712 + break;
11715 + if (mopt->saw_rem_addr) {
11716 + mopt->more_rem_addr = 1;
11717 + break;
11719 + mopt->saw_rem_addr = 1;
11720 + mopt->rem_addr_ptr = ptr;
11721 + break;
11722 + case MPTCP_SUB_PRIO:
11724 + struct mp_prio *mpprio = (struct mp_prio *)ptr;
11726 + if (opsize != MPTCP_SUB_LEN_PRIO &&
11727 + opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
11728 + mptcp_debug("%s: mp_prio: bad option size %d\n",
11729 + __func__, opsize);
11730 + break;
11733 + mopt->saw_low_prio = 1;
11734 + mopt->low_prio = mpprio->b;
11736 + if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
11737 + mopt->saw_low_prio = 2;
11738 + mopt->prio_addr_id = mpprio->addr_id;
11740 + break;
11742 + case MPTCP_SUB_FAIL:
11743 + if (opsize != MPTCP_SUB_LEN_FAIL) {
11744 + mptcp_debug("%s: mp_fail: bad option size %d\n",
11745 + __func__, opsize);
11746 + break;
11748 + mopt->mp_fail = 1;
11749 + break;
11750 + case MPTCP_SUB_FCLOSE:
11751 + if (opsize != MPTCP_SUB_LEN_FCLOSE) {
11752 + mptcp_debug("%s: mp_fclose: bad option size %d\n",
11753 + __func__, opsize);
11754 + break;
11757 + mopt->mp_fclose = 1;
11758 + mopt->mptcp_key = ((struct mp_fclose *)ptr)->key;
11760 + break;
11761 + default:
11762 + mptcp_debug("%s: Received unknown subtype: %d\n",
11763 + __func__, mp_opt->sub);
11764 + break;
11768 +int mptcp_check_rtt(const struct tcp_sock *tp, int time)
11770 + struct mptcp_cb *mpcb = tp->mpcb;
11771 + struct sock *sk;
11772 + u32 rtt_max = 0;
11774 + /* In MPTCP, we take the max delay across all flows,
11775 + * in order to take into account meta-reordering buffers.
11776 + */
11777 + mptcp_for_each_sk(mpcb, sk) {
11778 + if (!mptcp_sk_can_recv(sk))
11779 + continue;
11781 + if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt)
11782 + rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt;
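+	/* rcv_rtt_est.rtt is kept left-shifted by 3 (see tcp_rcv_rtt_update()),
+	 * so shift it back before comparing it against the elapsed time.
+	 */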
11784 + if (time < (rtt_max >> 3) || !rtt_max)
11785 + return 1;
11787 + return 0;
11790 +static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk)
11792 + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
11794 + if (mpadd->ipver == 4) {
11795 + __be16 port = 0;
11796 + if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2)
11797 + port = mpadd->u.v4.port;
11799 + mptcp_v4_add_raddress(tcp_sk(sk)->mpcb, &mpadd->u.v4.addr, port,
11800 + mpadd->addr_id);
11801 +#if IS_ENABLED(CONFIG_IPV6)
11802 + } else if (mpadd->ipver == 6) {
11803 + __be16 port = 0;
11804 + if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2)
11805 + port = mpadd->u.v6.port;
11807 + mptcp_v6_add_raddress(tcp_sk(sk)->mpcb, &mpadd->u.v6.addr, port,
11808 + mpadd->addr_id);
11809 +#endif /* CONFIG_IPV6 */
11813 +static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk)
11815 + struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
11816 + int i;
11817 + u8 rem_id;
11819 + for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) {
11820 + rem_id = (&mprem->addrs_id)[i];
11821 + if (!mptcp_rem_raddress(tcp_sk(sk)->mpcb, rem_id))
11822 + mptcp_send_reset_rem_id(tcp_sk(sk)->mpcb, rem_id);
11826 +static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk)
11828 + struct tcphdr *th = tcp_hdr(skb);
11829 + unsigned char *ptr;
11830 + int length = (th->doff * 4) - sizeof(struct tcphdr);
11832 + /* Jump through the options to check whether ADD_ADDR is there */
11833 + ptr = (unsigned char *)(th + 1);
11834 + while (length > 0) {
11835 + int opcode = *ptr++;
11836 + int opsize;
11838 + switch (opcode) {
11839 + case TCPOPT_EOL:
11840 + return;
11841 + case TCPOPT_NOP:
11842 + length--;
11843 + continue;
11844 + default:
11845 + opsize = *ptr++;
11846 + if (opsize < 2)
11847 + return;
11848 + if (opsize > length)
11849 + return; /* don't parse partial options */
11850 + if (opcode == TCPOPT_MPTCP &&
11851 + ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) {
11852 +#if IS_ENABLED(CONFIG_IPV6)
11853 + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
11854 + if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
11855 + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
11856 + (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
11857 + opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2))
11858 +#else
11859 + if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
11860 + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2)
11861 +#endif /* CONFIG_IPV6 */
11862 + goto cont;
11864 + mptcp_handle_add_addr(ptr, sk);
11866 + if (opcode == TCPOPT_MPTCP &&
11867 + ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) {
11868 + if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0)
11869 + goto cont;
11871 + mptcp_handle_rem_addr(ptr, sk);
11873 +cont:
11874 + ptr += opsize - 2;
11875 + length -= opsize;
11878 + return;
11881 +static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th)
11883 + struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
11884 + struct sock *meta_sk = mptcp_meta_sk(sk);
11885 + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
11887 + if (unlikely(mptcp->rx_opt.mp_fail)) {
11888 + mptcp->rx_opt.mp_fail = 0;
11890 + if (!th->rst && !mpcb->infinite_mapping_snd) {
11891 + struct sock *sk_it;
11893 + mpcb->send_infinite_mapping = 1;
11894 + /* We resend everything that has not been acknowledged */
11895 + meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
11897 + /* We artificially restart the whole send-queue. Thus,
11898 + * it is as if no packets are in flight
11899 + */
11900 + tcp_sk(meta_sk)->packets_out = 0;
11902 + /* If the snd_nxt already wrapped around, we have to
11903 + * undo the wrapping, as we are restarting from snd_una
11904 + * on.
11905 + */
11906 + if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) {
11907 + mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2;
11908 + mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
11910 + tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una;
11912 + /* Trigger a sending on the meta. */
11913 + mptcp_push_pending_frames(meta_sk);
11915 + mptcp_for_each_sk(mpcb, sk_it) {
11916 + if (sk != sk_it)
11917 + mptcp_sub_force_close(sk_it);
11921 + return 0;
11924 + if (unlikely(mptcp->rx_opt.mp_fclose)) {
11925 + struct sock *sk_it, *tmpsk;
11927 + mptcp->rx_opt.mp_fclose = 0;
11928 + if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key)
11929 + return 0;
11931 + if (tcp_need_reset(sk->sk_state))
11932 + tcp_send_active_reset(sk, GFP_ATOMIC);
11934 + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk)
11935 + mptcp_sub_force_close(sk_it);
11937 + tcp_reset(meta_sk);
11939 + return 1;
11942 + return 0;
11945 +static inline void mptcp_path_array_check(struct sock *meta_sk)
11947 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11949 + if (unlikely(mpcb->list_rcvd)) {
11950 + mpcb->list_rcvd = 0;
11951 + if (mpcb->pm_ops->new_remote_address)
11952 + mpcb->pm_ops->new_remote_address(meta_sk);
11956 +int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, struct sk_buff *skb)
11958 + struct tcp_sock *tp = tcp_sk(sk);
11959 + struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
11961 + if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd)
11962 + return 0;
11964 + if (mptcp_mp_fail_rcvd(sk, th))
11965 + return 1;
11967 + /* RFC 6824, Section 3.3:
11968 + * If a checksum is not present when its use has been negotiated, the
11969 + * receiver MUST close the subflow with a RST as it is considered broken.
11970 + */
11971 + if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum &&
11972 + !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
11973 + if (tcp_need_reset(sk->sk_state))
11974 + tcp_send_active_reset(sk, GFP_ATOMIC);
11976 + mptcp_sub_force_close(sk);
11977 + return 1;
11980 + /* We have to acknowledge retransmissions of the third
11981 + * ack.
11982 + */
11983 + if (mopt->join_ack) {
11984 + tcp_send_delayed_ack(sk);
11985 + mopt->join_ack = 0;
11988 + if (mopt->saw_add_addr || mopt->saw_rem_addr) {
11989 + if (mopt->more_add_addr || mopt->more_rem_addr) {
11990 + mptcp_parse_addropt(skb, sk);
11991 + } else {
11992 + if (mopt->saw_add_addr)
11993 + mptcp_handle_add_addr(mopt->add_addr_ptr, sk);
11994 + if (mopt->saw_rem_addr)
11995 + mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk);
11998 + mopt->more_add_addr = 0;
11999 + mopt->saw_add_addr = 0;
12000 + mopt->more_rem_addr = 0;
12001 + mopt->saw_rem_addr = 0;
12003 + if (mopt->saw_low_prio) {
12004 + if (mopt->saw_low_prio == 1) {
12005 + tp->mptcp->rcv_low_prio = mopt->low_prio;
12006 + } else {
12007 + struct sock *sk_it;
12008 + mptcp_for_each_sk(tp->mpcb, sk_it) {
12009 + struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp;
12010 + if (mptcp->rem_id == mopt->prio_addr_id)
12011 + mptcp->rcv_low_prio = mopt->low_prio;
12014 + mopt->saw_low_prio = 0;
12017 + mptcp_data_ack(sk, skb);
12019 + mptcp_path_array_check(mptcp_meta_sk(sk));
12020 + /* Socket may have been mp_killed by a REMOVE_ADDR */
12021 + if (tp->mp_killed)
12022 + return 1;
12024 + return 0;
12027 +/* The skptr is needed, because if we become MPTCP-capable, we have to switch
12028 + * from meta-socket to master-socket.
12030 + * @return: 1 - we want to reset this connection
12031 + * 2 - we want to discard the received syn/ack
12032 + * 0 - everything is fine - continue
12033 + */
12034 +int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
12035 + struct sk_buff *skb,
12036 + struct mptcp_options_received *mopt)
12038 + struct tcp_sock *tp = tcp_sk(sk);
12040 + if (tp->mpc) {
12041 + u8 hash_mac_check[20];
12042 + struct mptcp_cb *mpcb = tp->mpcb;
12044 + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
12045 + (u8 *)&mpcb->mptcp_loc_key,
12046 + (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
12047 + (u8 *)&tp->mptcp->mptcp_loc_nonce,
12048 + (u32 *)hash_mac_check);
12049 + if (memcmp(hash_mac_check,
12050 + (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) {
12051 + mptcp_sub_force_close(sk);
12052 + return 1;
12055 + /* Set this flag in order to postpone data sending
12056 + * until the 4th ack arrives.
12057 + */
12058 + tp->mptcp->pre_established = 1;
12059 + tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio;
12061 + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
12062 + (u8 *)&mpcb->mptcp_rem_key,
12063 + (u8 *)&tp->mptcp->mptcp_loc_nonce,
12064 + (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
12065 + (u32 *)&tp->mptcp->sender_mac[0]);
12067 + } else if (mopt->saw_mpc) {
12068 + if (mptcp_create_master_sk(sk, mopt->mptcp_key,
12069 + ntohs(tcp_hdr(skb)->window)))
12070 + return 2;
12072 + sk = tcp_sk(sk)->mpcb->master_sk;
12073 + *skptr = sk;
12074 + tp = tcp_sk(sk);
12076 + /* snd_nxt - 1, because it has been incremented
12077 + * by tcp_connect for the SYN
12078 + */
12079 + tp->mptcp->snt_isn = tp->snd_nxt - 1;
12080 + tp->mpcb->dss_csum = mopt->dss_csum;
12081 + tp->mptcp->include_mpc = 1;
12083 + sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket);
12084 + sk->sk_wq = mptcp_meta_sk(sk)->sk_wq;
12086 + mptcp_update_metasocket(sk, mptcp_meta_sk(sk));
12088 + /* hold in mptcp_inherit_sk due to initialization to 2 */
12089 + sock_put(sk);
12090 + } else {
12091 + tp->request_mptcp = 0;
12093 + if (tp->inside_tk_table)
12094 + mptcp_hash_remove(tp);
12097 + if (tp->mpc)
12098 + tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq;
12100 + return 0;
12103 +bool mptcp_should_expand_sndbuf(const struct sock *sk)
12105 + struct sock *sk_it;
12106 + struct sock *meta_sk = mptcp_meta_sk(sk);
12107 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12108 + int cnt_backups = 0;
12109 + int backup_available = 0;
12111 + /* We circumvent this check in tcp_check_space, because we want to
12112 + * always call sk_write_space. So, we reproduce the check here.
12113 + */
12114 + if (!meta_sk->sk_socket ||
12115 + !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
12116 + return false;
12118 + /* If the user specified a specific send buffer setting, do
12119 + * not modify it.
12120 + */
12121 + if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
12122 + return false;
12124 + /* If we are under global TCP memory pressure, do not expand. */
12125 + if (sk_under_memory_pressure(meta_sk))
12126 + return false;
12128 + /* If we are under soft global TCP memory pressure, do not expand. */
12129 + if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0))
12130 + return false;
12133 + /* For MPTCP we look for a subsocket that could send data.
12134 + * If we find one, we update the send-buffer.
12135 + */
12136 + mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
12137 + struct tcp_sock *tp_it = tcp_sk(sk_it);
12139 + if (!mptcp_sk_can_send(sk_it))
12140 + continue;
12142 + /* Backup-flows have to be counted - if there is no other
12143 + * subflow we take the backup-flow into account. */
12144 + if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
12145 + cnt_backups++;
12148 + if (tp_it->packets_out < tp_it->snd_cwnd) {
12149 + if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
12150 + backup_available = 1;
12151 + continue;
12153 + return true;
12157 + /* Backup-flow is available for sending - update send-buffer */
12158 + if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available)
12159 + return true;
12160 + return false;
12163 +void mptcp_init_buffer_space(struct sock *sk)
12165 + struct tcp_sock *tp = tcp_sk(sk);
12166 + struct sock *meta_sk = mptcp_meta_sk(sk);
12167 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12168 + int space;
12170 + tcp_init_buffer_space(sk);
12172 + if (is_master_tp(tp)) {
12173 + /* If there is only one subflow, we just use regular TCP
12174 + * autotuning. User-locks are handled already by
12175 + * tcp_init_buffer_space
12176 + */
12177 + meta_tp->window_clamp = tp->window_clamp;
12178 + meta_tp->rcv_ssthresh = tp->rcv_ssthresh;
12179 + meta_sk->sk_rcvbuf = sk->sk_rcvbuf;
12180 + meta_sk->sk_sndbuf = sk->sk_sndbuf;
12182 + return;
12185 + if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK)
12186 + goto snd_buf;
12188 + /* Adding a new subflow to the rcv-buffer space. We make a simple
12189 + * addition, to give some space to allow traffic on the new subflow.
12190 + * Autotuning will increase it further later on.
12191 + */
12192 + space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]);
12193 + if (space > meta_sk->sk_rcvbuf) {
12194 + meta_tp->window_clamp += tp->window_clamp;
12195 + meta_tp->rcv_ssthresh += tp->rcv_ssthresh;
12196 + meta_sk->sk_rcvbuf = space;
12199 +snd_buf:
12200 + if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
12201 + return;
12203 + /* Adding a new subflow to the send-buffer space. We make a simple
12204 + * addition, to give some space to allow traffic on the new subflow.
12205 + * Autotuning will increase it further later on.
12206 + */
12207 + space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, sysctl_tcp_wmem[2]);
12208 + if (space > meta_sk->sk_sndbuf) {
12209 + meta_sk->sk_sndbuf = space;
12210 + meta_sk->sk_write_space(meta_sk);
12214 +void mptcp_tcp_set_rto(struct sock *sk)
12216 + tcp_set_rto(sk);
12217 + mptcp_set_rto(sk);
12219 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_ipv4.c linux-3.14.45/net/mptcp/mptcp_ipv4.c
12220 --- linux-3.14.45.orig/net/mptcp/mptcp_ipv4.c 1970-01-01 01:00:00.000000000 +0100
12221 +++ linux-3.14.45/net/mptcp/mptcp_ipv4.c 2015-06-24 14:15:48.895862487 +0200
12222 @@ -0,0 +1,603 @@
12224 + * MPTCP implementation - IPv4-specific functions
12226 + * Initial Design & Implementation:
12227 + * Sébastien Barré <sebastien.barre@uclouvain.be>
12229 + * Current Maintainer:
12230 + * Christoph Paasch <christoph.paasch@uclouvain.be>
12232 + * Additional authors:
12233 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12234 + * Gregory Detal <gregory.detal@uclouvain.be>
12235 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
12236 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
12237 + * Lavkesh Lahngir <lavkesh51@gmail.com>
12238 + * Andreas Ripke <ripke@neclab.eu>
12239 + * Vlad Dogaru <vlad.dogaru@intel.com>
12240 + * Octavian Purdila <octavian.purdila@intel.com>
12241 + * John Ronan <jronan@tssg.org>
12242 + * Catalin Nicutar <catalin.nicutar@gmail.com>
12243 + * Brandon Heller <brandonh@stanford.edu>
12246 + * This program is free software; you can redistribute it and/or
12247 + * modify it under the terms of the GNU General Public License
12248 + * as published by the Free Software Foundation; either version
12249 + * 2 of the License, or (at your option) any later version.
12250 + */
12252 +#include <linux/export.h>
12253 +#include <linux/ip.h>
12254 +#include <linux/list.h>
12255 +#include <linux/skbuff.h>
12256 +#include <linux/spinlock.h>
12257 +#include <linux/tcp.h>
12259 +#include <net/inet_common.h>
12260 +#include <net/inet_connection_sock.h>
12261 +#include <net/mptcp.h>
12262 +#include <net/mptcp_v4.h>
12263 +#include <net/request_sock.h>
12264 +#include <net/tcp.h>
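+/* Nonce/key generation: mix the connection 4-tuple (plus the TCP sequence
+ * number or a per-boot seed) with the mptcp_secret through a single
+ * md5_transform() round - essentially the same construction the stack uses
+ * for its secure initial-sequence-number generation.
+ */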
12266 +u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
12267 + u32 seq)
12269 + u32 hash[MD5_DIGEST_WORDS];
12271 + hash[0] = (__force u32)saddr;
12272 + hash[1] = (__force u32)daddr;
12273 + hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
12274 + hash[3] = seq;
12276 + md5_transform(hash, mptcp_secret);
12278 + return hash[0];
12281 +u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
12283 + u32 hash[MD5_DIGEST_WORDS];
12285 + hash[0] = (__force u32)saddr;
12286 + hash[1] = (__force u32)daddr;
12287 + hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
12288 + hash[3] = mptcp_key_seed++;
12290 + md5_transform(hash, mptcp_secret);
12292 + return *((u64 *)hash);
12296 +static void mptcp_v4_reqsk_destructor(struct request_sock *req)
12298 + mptcp_reqsk_destructor(req);
12300 + tcp_v4_reqsk_destructor(req);
12303 +/* Similar to tcp_request_sock_ops */
12304 +struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
12305 + .family = PF_INET,
12306 + .obj_size = sizeof(struct mptcp_request_sock),
12307 + .rtx_syn_ack = tcp_v4_rtx_synack,
12308 + .send_ack = tcp_v4_reqsk_send_ack,
12309 + .destructor = mptcp_v4_reqsk_destructor,
12310 + .send_reset = tcp_v4_send_reset,
12311 + .syn_ack_timeout = tcp_syn_ack_timeout,
12314 +static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk,
12315 + struct request_sock *req,
12316 + unsigned long timeout)
12318 + const u32 h1 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
12319 + inet_rsk(req)->ir_rmt_port,
12320 + 0, MPTCP_HASH_SIZE);
12321 + /* We cannot call inet_csk_reqsk_queue_hash_add(), because we do not
12322 + * want to reset the keepalive-timer (responsible for retransmitting
12323 + * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
12324 + * overload the keepalive timer. Also, it's not a big deal, because the
12325 + * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
12326 + * if the third ACK gets lost, the client will handle the retransmission
12327 + * anyways. If our SYN/ACK gets lost, the client will retransmit the
12328 + * SYN.
12329 + */
12330 + struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
12331 + struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
12332 + const u32 h2 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
12333 + inet_rsk(req)->ir_rmt_port,
12334 + lopt->hash_rnd, lopt->nr_table_entries);
12336 + reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
12337 + reqsk_queue_added(&meta_icsk->icsk_accept_queue);
12339 + spin_lock(&mptcp_reqsk_hlock);
12340 + list_add(&mptcp_rsk(req)->collide_tuple, &mptcp_reqsk_htb[h1]);
12341 + spin_unlock(&mptcp_reqsk_hlock);
12344 +/* Similar to tcp_v4_conn_request */
12345 +static void mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
12347 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
12348 + struct tcp_options_received tmp_opt;
12349 + struct mptcp_options_received mopt;
12350 + struct request_sock *req;
12351 + struct inet_request_sock *ireq;
12352 + struct mptcp_request_sock *mtreq;
12353 + struct dst_entry *dst = NULL;
12354 + u8 mptcp_hash_mac[20];
12355 + __be32 saddr = ip_hdr(skb)->saddr;
12356 + __be32 daddr = ip_hdr(skb)->daddr;
12357 + __u32 isn = TCP_SKB_CB(skb)->when;
12358 + int want_cookie = 0;
12359 + union inet_addr addr;
12361 + tcp_clear_options(&tmp_opt);
12362 + mptcp_init_mp_opt(&mopt);
12363 + tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
12364 + tmp_opt.user_mss = tcp_sk(meta_sk)->rx_opt.user_mss;
12365 + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
12367 + req = inet_reqsk_alloc(&mptcp_request_sock_ops);
12368 + if (!req)
12369 + return;
12371 +#ifdef CONFIG_TCP_MD5SIG
12372 + tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
12373 +#endif
12375 + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
12376 + tcp_openreq_init(req, &tmp_opt, skb);
12378 + ireq = inet_rsk(req);
12379 + ireq->ir_loc_addr = daddr;
12380 + ireq->ir_rmt_addr = saddr;
12381 + ireq->no_srccheck = inet_sk(meta_sk)->transparent;
12382 + ireq->opt = tcp_v4_save_options(skb);
12384 + if (security_inet_conn_request(meta_sk, skb, req))
12385 + goto drop_and_free;
12387 + if (!want_cookie || tmp_opt.tstamp_ok)
12388 + TCP_ECN_create_request(req, skb, sock_net(meta_sk));
12390 + if (!isn) {
12391 + struct flowi4 fl4;
12393 + /* VJ's idea. We save last timestamp seen
12394 + * from the destination in peer table, when entering
12395 + * state TIME-WAIT, and check against it before
12396 + * accepting new connection request.
12398 + * If "isn" is not zero, this request hit alive
12399 + * timewait bucket, so that all the necessary checks
12400 + * are made in the function processing timewait state.
12401 + */
12402 + if (tmp_opt.saw_tstamp &&
12403 + tcp_death_row.sysctl_tw_recycle &&
12404 + (dst = inet_csk_route_req(meta_sk, &fl4, req)) != NULL &&
12405 + fl4.daddr == saddr) {
12406 + if (!tcp_peer_is_proven(req, dst, true)) {
12407 + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_PAWSPASSIVEREJECTED);
12408 + goto drop_and_release;
12411 + /* Kill the following clause, if you dislike this way. */
12412 + else if (!sysctl_tcp_syncookies &&
12413 + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(meta_sk) <
12414 + (sysctl_max_syn_backlog >> 2)) &&
12415 + !tcp_peer_is_proven(req, dst, false)) {
12416 + /* Without syncookies last quarter of
12417 + * backlog is filled with destinations,
12418 + * proven to be alive.
12419 + * It means that we continue to communicate
12420 + * to destinations, already remembered
12421 + * to the moment of synflood.
12422 + */
12423 + LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
12424 + &saddr, ntohs(tcp_hdr(skb)->source));
12425 + goto drop_and_release;
12428 + isn = tcp_v4_init_sequence(skb);
12430 + tcp_rsk(req)->snt_isn = isn;
12431 + tcp_rsk(req)->snt_synack = tcp_time_stamp;
12432 + tcp_rsk(req)->listener = NULL;
12434 + mtreq = mptcp_rsk(req);
12435 + mtreq->mpcb = mpcb;
12436 + INIT_LIST_HEAD(&mtreq->collide_tuple);
12437 + mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
12438 + mtreq->mptcp_rem_key = mpcb->mptcp_rem_key;
12439 + mtreq->mptcp_loc_key = mpcb->mptcp_loc_key;
12440 + mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(saddr, daddr,
12441 + tcp_hdr(skb)->source,
12442 + tcp_hdr(skb)->dest, isn);
12443 + mptcp_hmac_sha1((u8 *)&mtreq->mptcp_loc_key,
12444 + (u8 *)&mtreq->mptcp_rem_key,
12445 + (u8 *)&mtreq->mptcp_loc_nonce,
12446 + (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac);
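+	/* Only the leftmost 64 bits of the HMAC are sent in the SYN/ACK
+	 * (truncated MAC), so keep just those for the reply.
+	 */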
12447 + mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
12449 + addr.ip = ireq->ir_loc_addr;
12450 + mtreq->loc_id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(meta_sk));
12451 + if (mtreq->loc_id == -1) /* Address not part of the allowed ones */
12452 + goto drop_and_release;
12453 + mtreq->rem_id = mopt.rem_id;
12454 + mtreq->low_prio = mopt.low_prio;
12455 + tcp_rsk(req)->saw_mpc = 1;
12457 + if (tcp_v4_send_synack(meta_sk, dst, req, skb_get_queue_mapping(skb)))
12458 + goto drop_and_free;
12460 + /* Adding to request queue in metasocket */
12461 + mptcp_v4_reqsk_queue_hash_add(meta_sk, req, TCP_TIMEOUT_INIT);
12463 + return;
12465 +drop_and_release:
12466 + dst_release(dst);
12467 +drop_and_free:
12468 + reqsk_free(req);
12469 + return;
12472 +int mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id)
12474 + int i;
12476 + for (i = 0; i < MPTCP_MAX_ADDR; i++) {
12477 + if (!((1 << i) & mpcb->rem4_bits))
12478 + continue;
12480 + if (mpcb->remaddr4[i].rem4_id == id) {
12481 + /* remove address from bitfield */
12482 + mpcb->rem4_bits &= ~(1 << i);
12484 + return 0;
12488 + return -1;
12491 +/* Based on function tcp_v4_conn_request (tcp_ipv4.c)
12492 + * Returns -1 if there is no space anymore to store an additional
12493 + * address
12494 + */
12495 +int mptcp_v4_add_raddress(struct mptcp_cb *mpcb, const struct in_addr *addr,
12496 + __be16 port, u8 id)
12498 + int i;
12499 + struct mptcp_rem4 *rem4;
12501 + mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
12502 + rem4 = &mpcb->remaddr4[i];
12504 + /* Address is already in the list --- continue */
12505 + if (rem4->rem4_id == id &&
12506 + rem4->addr.s_addr == addr->s_addr && rem4->port == port)
12507 + return 0;
12509 + /* This may be the case when the peer is behind a NAT. It is
12510 + * trying to JOIN, thus sending the JOIN with a certain ID.
12511 + * However the src_addr of the IP-packet has been changed. We
12512 + * update the addr in the list, because this is the address as
12513 + * OUR BOX sees it.
12514 + */
12515 + if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) {
12516 + /* update the address */
12517 + mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n",
12518 + __func__, &rem4->addr.s_addr,
12519 + &addr->s_addr, id);
12520 + rem4->addr.s_addr = addr->s_addr;
12521 + rem4->port = port;
12522 + mpcb->list_rcvd = 1;
12523 + return 0;
12527 + i = mptcp_find_free_index(mpcb->rem4_bits);
12528 + /* Do we already have the maximum number of local/remote addresses? */
12529 + if (i < 0) {
12530 + mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n",
12531 + __func__, MPTCP_MAX_ADDR, &addr->s_addr);
12532 + return -1;
12535 + rem4 = &mpcb->remaddr4[i];
12537 + /* Address is not known yet, store it */
12538 + rem4->addr.s_addr = addr->s_addr;
12539 + rem4->port = port;
12540 + rem4->bitfield = 0;
12541 + rem4->retry_bitfield = 0;
12542 + rem4->rem4_id = id;
12543 + mpcb->list_rcvd = 1;
12544 + mpcb->rem4_bits |= (1 << i);
12546 + return 0;
12549 +/* Sets the bitfield of the remote-address field
12550 + * The local address is not set, as it will disappear with the global address-list.
12551 + */
12552 +void mptcp_v4_set_init_addr_bit(struct mptcp_cb *mpcb, __be32 daddr, int index)
12554 + int i;
12556 + mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
12557 + if (mpcb->remaddr4[i].addr.s_addr == daddr) {
12558 + mpcb->remaddr4[i].bitfield |= (1 << index);
12559 + return;
12564 +/* We only process join requests here. (either the SYN or the final ACK) */
12565 +int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
12567 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
12568 + struct sock *child, *rsk = NULL;
12569 + int ret;
12571 + if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
12572 + struct tcphdr *th = tcp_hdr(skb);
12573 + const struct iphdr *iph = ip_hdr(skb);
12574 + struct sock *sk;
12576 + sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
12577 + iph->saddr, th->source, iph->daddr,
12578 + th->dest, inet_iif(skb));
12580 + if (!sk) {
12581 + kfree_skb(skb);
12582 + return 0;
12584 + if (is_meta_sk(sk)) {
12585 + WARN("%s Did not find a sub-sk - but found the meta!\n", __func__);
12586 + kfree_skb(skb);
12587 + sock_put(sk);
12588 + return 0;
12591 + if (sk->sk_state == TCP_TIME_WAIT) {
12592 + inet_twsk_put(inet_twsk(sk));
12593 + kfree_skb(skb);
12594 + return 0;
12597 + ret = tcp_v4_do_rcv(sk, skb);
12598 + sock_put(sk);
12600 + return ret;
12602 + TCP_SKB_CB(skb)->mptcp_flags = 0;
12604 + /* Has been removed from the tk-table. Thus, no new subflows.
12606 + * Check for close-state is necessary, because we may have been closed
12607 + * without passing by mptcp_close().
12609 + * When falling back, no new subflows are allowed either.
12610 + */
12611 + if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
12612 + mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
12613 + goto reset_and_discard;
12615 + child = tcp_v4_hnd_req(meta_sk, skb);
12617 + if (!child)
12618 + goto discard;
12620 + if (child != meta_sk) {
12621 + sock_rps_save_rxhash(child, skb);
12622 + /* We don't call tcp_child_process here, because we already
12623 + * hold the meta-sk-lock and are sure that it is not owned
12624 + * by the user.
12625 + */
12626 + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
12627 + bh_unlock_sock(child);
12628 + sock_put(child);
12629 + if (ret) {
12630 + rsk = child;
12631 + goto reset_and_discard;
12633 + } else {
12634 + if (tcp_hdr(skb)->syn) {
12635 + struct mp_join *join_opt = mptcp_find_join(skb);
12636 + /* Currently we make two calls to mptcp_find_join(). This
12637 + * can probably be optimized.
12638 + */
12639 + if (mptcp_v4_add_raddress(mpcb,
12640 + (struct in_addr *)&ip_hdr(skb)->saddr,
12641 + 0,
12642 + join_opt->addr_id) < 0)
12643 + goto reset_and_discard;
12644 + mpcb->list_rcvd = 0;
12646 + mptcp_v4_join_request(meta_sk, skb);
12647 + goto discard;
12649 + goto reset_and_discard;
12651 + return 0;
12653 +reset_and_discard:
12654 + tcp_v4_send_reset(rsk, skb);
12655 +discard:
12656 + kfree_skb(skb);
12657 + return 0;
12660 +/* After this, the ref count of the meta_sk associated with the request_sock
12661 + * is incremented. Thus it is the responsibility of the caller
12662 + * to call sock_put() when the reference is not needed anymore.
12663 + */
12664 +struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
12665 + const __be32 laddr, const struct net *net)
12667 + struct mptcp_request_sock *mtreq;
12668 + struct sock *meta_sk = NULL;
12670 + spin_lock(&mptcp_reqsk_hlock);
12671 + list_for_each_entry(mtreq,
12672 + &mptcp_reqsk_htb[inet_synq_hash(raddr, rport, 0,
12673 + MPTCP_HASH_SIZE)],
12674 + collide_tuple) {
12675 + struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq));
12676 + meta_sk = mtreq->mpcb->meta_sk;
12678 + if (ireq->ir_rmt_port == rport &&
12679 + ireq->ir_rmt_addr == raddr &&
12680 + ireq->ir_loc_addr == laddr &&
12681 + rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET &&
12682 + net_eq(net, sock_net(meta_sk)))
12683 + break;
12684 + meta_sk = NULL;
12687 + if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
12688 + meta_sk = NULL;
12689 + spin_unlock(&mptcp_reqsk_hlock);
12691 + return meta_sk;
12694 +/* Create a new IPv4 subflow.
12696 + * We are in user-context and the meta-sock-lock is held.
12697 + */
12698 +int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
12699 + struct mptcp_rem4 *rem)
12701 + struct tcp_sock *tp;
12702 + struct sock *sk;
12703 + struct sockaddr_in loc_in, rem_in;
12704 + struct socket sock;
12705 + int ulid_size = 0, ret;
12707 + /** First, create and prepare the new socket */
12709 + sock.type = meta_sk->sk_socket->type;
12710 + sock.state = SS_UNCONNECTED;
12711 + sock.wq = meta_sk->sk_socket->wq;
12712 + sock.file = meta_sk->sk_socket->file;
12713 + sock.ops = NULL;
12715 + ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
12716 + if (unlikely(ret < 0)) {
12717 + mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret);
12718 + return ret;
12721 + sk = sock.sk;
12722 + tp = tcp_sk(sk);
12724 + /* All subsockets need the MPTCP-lock-class */
12725 + lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
12726 + lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
12728 + if (mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL))
12729 + goto error;
12731 + tp->mptcp->slave_sk = 1;
12732 + tp->mptcp->low_prio = loc->low_prio;
12734 + /* Initializing the timer for an MPTCP subflow */
12735 + setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
12737 + /** Then, connect the socket to the peer */
12739 + ulid_size = sizeof(struct sockaddr_in);
12740 + loc_in.sin_family = AF_INET;
12741 + rem_in.sin_family = AF_INET;
12742 + loc_in.sin_port = 0;
12743 + if (rem->port)
12744 + rem_in.sin_port = rem->port;
12745 + else
12746 + rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
12747 + loc_in.sin_addr = loc->addr;
12748 + rem_in.sin_addr = rem->addr;
12750 + ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, ulid_size);
12751 + if (ret < 0) {
12752 + mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
12753 + __func__, ret);
12754 + goto error;
12757 + mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n",
12758 + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
12759 + tp->mptcp->path_index, &loc_in.sin_addr,
12760 + ntohs(loc_in.sin_port), &rem_in.sin_addr,
12761 + ntohs(rem_in.sin_port));
12763 + ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
12764 + ulid_size, O_NONBLOCK);
12765 + if (ret < 0 && ret != -EINPROGRESS) {
12766 + mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
12767 + __func__, ret);
12768 + goto error;
12771 + sk_set_socket(sk, meta_sk->sk_socket);
12772 + sk->sk_wq = meta_sk->sk_wq;
12774 + return 0;
12776 +error:
12777 + /* May happen if mptcp_add_sock fails first */
12778 + if (!tp->mpc) {
12779 + tcp_close(sk, 0);
12780 + } else {
12781 + local_bh_disable();
12782 + mptcp_sub_force_close(sk);
12783 + local_bh_enable();
12785 + return ret;
12787 +EXPORT_SYMBOL(mptcp_init4_subsockets);
12789 +/* General initialization of IPv4 for MPTCP */
12790 +int mptcp_pm_v4_init(void)
12792 + int ret = 0;
12793 + struct request_sock_ops *ops = &mptcp_request_sock_ops;
12795 + ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
12796 + if (ops->slab_name == NULL) {
12797 + ret = -ENOMEM;
12798 + goto out;
12801 + ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
12802 + SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
12803 + NULL);
12805 + if (ops->slab == NULL) {
12806 + ret = -ENOMEM;
12807 + goto err_reqsk_create;
12810 +out:
12811 + return ret;
12813 +err_reqsk_create:
12814 + kfree(ops->slab_name);
12815 + ops->slab_name = NULL;
12816 + goto out;
12819 +void mptcp_pm_v4_undo(void)
12821 + kmem_cache_destroy(mptcp_request_sock_ops.slab);
12822 + kfree(mptcp_request_sock_ops.slab_name);
12826 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_ipv6.c linux-3.14.45/net/mptcp/mptcp_ipv6.c
12827 --- linux-3.14.45.orig/net/mptcp/mptcp_ipv6.c 1970-01-01 01:00:00.000000000 +0100
12828 +++ linux-3.14.45/net/mptcp/mptcp_ipv6.c 2015-06-24 14:15:48.931862523 +0200
12829 @@ -0,0 +1,822 @@
12831 + * MPTCP implementation - IPv6-specific functions
12833 + * Initial Design & Implementation:
12834 + * Sébastien Barré <sebastien.barre@uclouvain.be>
12836 + * Current Maintainer:
12837 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12839 + * Additional authors:
12840 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12841 + * Gregory Detal <gregory.detal@uclouvain.be>
12842 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
12843 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
12844 + * Lavkesh Lahngir <lavkesh51@gmail.com>
12845 + * Andreas Ripke <ripke@neclab.eu>
12846 + * Vlad Dogaru <vlad.dogaru@intel.com>
12847 + * Octavian Purdila <octavian.purdila@intel.com>
12848 + * John Ronan <jronan@tssg.org>
12849 + * Catalin Nicutar <catalin.nicutar@gmail.com>
12850 + * Brandon Heller <brandonh@stanford.edu>
12853 + * This program is free software; you can redistribute it and/or
12854 + * modify it under the terms of the GNU General Public License
12855 + * as published by the Free Software Foundation; either version
12856 + * 2 of the License, or (at your option) any later version.
12857 + */
12859 +#include <linux/export.h>
12860 +#include <linux/in6.h>
12861 +#include <linux/kernel.h>
12863 +#include <net/addrconf.h>
12864 +#include <net/flow.h>
12865 +#include <net/inet6_connection_sock.h>
12866 +#include <net/inet6_hashtables.h>
12867 +#include <net/inet_common.h>
12868 +#include <net/ipv6.h>
12869 +#include <net/ip6_checksum.h>
12870 +#include <net/ip6_route.h>
12871 +#include <net/mptcp.h>
12872 +#include <net/mptcp_v6.h>
12873 +#include <net/tcp.h>
12874 +#include <net/transp_v6.h>
12876 +static int mptcp_v6v4_send_synack(struct sock *meta_sk, struct request_sock *req,
12877 + u16 queue_mapping);
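+/* Derive the local nonce used in the MP_JOIN handshake for an IPv6 subflow:
+ * the addresses, ports and ISN are mixed with the local mptcp_secret via
+ * md5_transform().
+ */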
12879 +__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
12880 + __be16 sport, __be16 dport, u32 seq)
12882 + u32 secret[MD5_MESSAGE_BYTES / 4];
12883 + u32 hash[MD5_DIGEST_WORDS];
12884 + u32 i;
12886 + memcpy(hash, saddr, 16);
12887 + for (i = 0; i < 4; i++)
12888 + secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
12889 + secret[4] = mptcp_secret[4] +
12890 + (((__force u16)sport << 16) + (__force u16)dport);
12891 + secret[5] = seq;
12892 + for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
12893 + secret[i] = mptcp_secret[i];
12895 + md5_transform(hash, secret);
12897 + return hash[0];
12900 +u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
12901 + __be16 sport, __be16 dport)
12903 + u32 secret[MD5_MESSAGE_BYTES / 4];
12904 + u32 hash[MD5_DIGEST_WORDS];
12905 + u32 i;
12907 + memcpy(hash, saddr, 16);
12908 + for (i = 0; i < 4; i++)
12909 + secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
12910 + secret[4] = mptcp_secret[4] +
12911 + (((__force u16)sport << 16) + (__force u16)dport);
12912 + secret[5] = mptcp_key_seed++;
12913 + for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
12914 + secret[i] = mptcp_secret[i];
12916 + md5_transform(hash, secret);
12918 + return *((u64 *)hash);
12921 +static void mptcp_v6_reqsk_destructor(struct request_sock *req)
12923 + mptcp_reqsk_destructor(req);
12925 + tcp_v6_reqsk_destructor(req);
12928 +/* Similar to tcp_v6_rtx_synack */
12929 +static int mptcp_v6_rtx_synack(struct sock *meta_sk, struct request_sock *req)
12931 + if (meta_sk->sk_family == AF_INET6)
12932 + return tcp_v6_rtx_synack(meta_sk, req);
12934 + TCP_INC_STATS_BH(sock_net(meta_sk), TCP_MIB_RETRANSSEGS);
12935 + return mptcp_v6v4_send_synack(meta_sk, req, 0);
12938 +/* Similar to tcp6_request_sock_ops */
12939 +struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
12940 + .family = AF_INET6,
12941 + .obj_size = sizeof(struct mptcp_request_sock),
12942 + .rtx_syn_ack = mptcp_v6_rtx_synack,
12943 + .send_ack = tcp_v6_reqsk_send_ack,
12944 + .destructor = mptcp_v6_reqsk_destructor,
12945 + .send_reset = tcp_v6_send_reset,
12946 + .syn_ack_timeout = tcp_syn_ack_timeout,
12949 +static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk,
12950 + struct request_sock *req,
12951 + unsigned long timeout)
12953 + const u32 h1 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
12954 + inet_rsk(req)->ir_rmt_port,
12955 + 0, MPTCP_HASH_SIZE);
12956 + /* We cannot call inet6_csk_reqsk_queue_hash_add(), because we do not
12957 + * want to reset the keepalive-timer (responsible for retransmitting
12958 + * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
12959 + * overload the keepalive timer. Also, it's not a big deal, because the
12960 + * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
12961 + * if the third ACK gets lost, the client will handle the retransmission
12962 + * anyways. If our SYN/ACK gets lost, the client will retransmit the
12963 + * SYN.
12964 + */
12965 + struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
12966 + struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
12967 + const u32 h2 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
12968 + inet_rsk(req)->ir_rmt_port,
12969 + lopt->hash_rnd, lopt->nr_table_entries);
12971 + reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
12972 + reqsk_queue_added(&meta_icsk->icsk_accept_queue);
12974 + spin_lock(&mptcp_reqsk_hlock);
12975 + list_add(&mptcp_rsk(req)->collide_tuple, &mptcp_reqsk_htb[h1]);
12976 + spin_unlock(&mptcp_reqsk_hlock);
12979 +/* Similar to tcp_v6_send_synack
12981 + * The meta-socket is IPv4, but a new subsocket is IPv6
12982 + */
12983 +static int mptcp_v6v4_send_synack(struct sock *meta_sk, struct request_sock *req,
12984 + u16 queue_mapping)
12986 + struct inet_request_sock *treq = inet_rsk(req);
12987 + struct sk_buff *skb;
12988 + struct flowi6 fl6;
12989 + struct dst_entry *dst;
12990 + int err = -ENOMEM;
12992 + memset(&fl6, 0, sizeof(fl6));
12993 + fl6.flowi6_proto = IPPROTO_TCP;
12994 + fl6.daddr = treq->ir_v6_rmt_addr;
12995 + fl6.saddr = treq->ir_v6_loc_addr;
12996 + fl6.flowlabel = 0;
12997 + fl6.flowi6_oif = treq->ir_iif;
12998 + fl6.flowi6_mark = meta_sk->sk_mark;
12999 + fl6.fl6_dport = inet_rsk(req)->ir_rmt_port;
13000 + fl6.fl6_sport = htons(inet_rsk(req)->ir_num);
13001 + security_req_classify_flow(req, flowi6_to_flowi(&fl6));
13003 + dst = ip6_dst_lookup_flow(meta_sk, &fl6, NULL);
13004 + if (IS_ERR(dst)) {
13005 + err = PTR_ERR(dst);
13006 + return err;
13008 + skb = tcp_make_synack(meta_sk, dst, req, NULL);
13010 + if (skb) {
13011 + __tcp_v6_send_check(skb, &treq->ir_v6_loc_addr,
13012 + &treq->ir_v6_rmt_addr);
13014 + fl6.daddr = treq->ir_v6_rmt_addr;
13015 + skb_set_queue_mapping(skb, queue_mapping);
13016 + err = ip6_xmit(meta_sk, skb, &fl6, NULL, 0);
13017 + err = net_xmit_eval(err);
13020 + return err;
13023 +/* Similar to tcp_v6_syn_recv_sock
13025 + * The meta-socket is IPv4, but a new subsocket is IPv6
13026 + */
13027 +struct sock *mptcp_v6v4_syn_recv_sock(struct sock *meta_sk, struct sk_buff *skb,
13028 + struct request_sock *req,
13029 + struct dst_entry *dst)
13031 + struct inet_request_sock *treq;
13032 + struct ipv6_pinfo *newnp;
13033 + struct tcp6_sock *newtcp6sk;
13034 + struct inet_sock *newinet;
13035 + struct tcp_sock *newtp;
13036 + struct sock *newsk;
13038 + treq = inet_rsk(req);
13040 + if (sk_acceptq_is_full(meta_sk))
13041 + goto out_overflow;
13043 + if (!dst) {
13044 + /* This code is similar to inet6_csk_route_req, but as we
13045 + * don't have a np-pointer in the meta, we have to do it
13046 + * manually.
13047 + */
13048 + struct flowi6 fl6;
13050 + memset(&fl6, 0, sizeof(fl6));
13051 + fl6.flowi6_proto = IPPROTO_TCP;
13052 + fl6.daddr = treq->ir_v6_rmt_addr;
13053 + fl6.saddr = treq->ir_v6_loc_addr;
13054 + fl6.flowi6_oif = treq->ir_iif;
13055 + fl6.flowi6_mark = meta_sk->sk_mark;
13056 + fl6.fl6_dport = inet_rsk(req)->ir_rmt_port;
13057 + fl6.fl6_sport = htons(inet_rsk(req)->ir_num);
13058 + security_req_classify_flow(req, flowi6_to_flowi(&fl6));
13060 + dst = ip6_dst_lookup_flow(meta_sk, &fl6, NULL);
13061 + if (IS_ERR(dst))
13062 + goto out;
13065 + newsk = tcp_create_openreq_child(meta_sk, req, skb);
13066 + if (newsk == NULL)
13067 + goto out_nonewsk;
13069 + /* Diff to tcp_v6_syn_recv_sock: Must do this prior to __ip6_dst_store,
13070 + * as it tries to access the pinet6-pointer.
13071 + */
13072 + newtcp6sk = (struct tcp6_sock *)newsk;
13073 + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
13075 + /*
13076 + * No need to charge this sock to the relevant IPv6 refcnt debug socks
13077 + * count here, tcp_create_openreq_child now does this for us, see the
13078 + * comment in that function for the gory details. -acme
13079 + */
13081 + newsk->sk_gso_type = SKB_GSO_TCPV6;
13082 + __ip6_dst_store(newsk, dst, NULL, NULL);
13083 + inet6_sk_rx_dst_set(newsk, skb);
13085 + newtp = tcp_sk(newsk);
13086 + newinet = inet_sk(newsk);
13087 + newnp = inet6_sk(newsk);
13089 + newsk->sk_v6_daddr = treq->ir_v6_rmt_addr;
13090 + newnp->saddr = treq->ir_v6_loc_addr;
13091 + newsk->sk_v6_rcv_saddr = treq->ir_v6_loc_addr;
13092 + newsk->sk_bound_dev_if = treq->ir_iif;
13094 + /* Now IPv6 options...
13096 + First: no IPv4 options.
13097 + */
13098 + newinet->inet_opt = NULL;
13099 + newnp->ipv6_ac_list = NULL;
13100 + newnp->ipv6_fl_list = NULL;
13101 + newnp->rxopt.all = 0;
13103 + /* Clone pktoptions received with SYN */
13104 + newnp->pktoptions = NULL;
13105 + if (treq->pktopts != NULL) {
13106 + newnp->pktoptions = skb_clone(treq->pktopts,
13107 + sk_gfp_atomic(meta_sk, GFP_ATOMIC));
13108 + consume_skb(treq->pktopts);
13109 + treq->pktopts = NULL;
13110 + if (newnp->pktoptions)
13111 + skb_set_owner_r(newnp->pktoptions, newsk);
13113 + newnp->opt = NULL;
13114 + newnp->mcast_oif = inet6_iif(skb);
13115 + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
13116 + newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
13118 + /* Initialization copied from inet6_create - normally this should have
13119 + * been handled by the memcpy as in tcp_v6_syn_recv_sock
13120 + */
13121 + newnp->hop_limit = -1;
13122 + newnp->mc_loop = 1;
13123 + newnp->pmtudisc = IPV6_PMTUDISC_WANT;
13124 + (void)xchg(&newnp->rxpmtu, NULL);
13126 + inet_csk(newsk)->icsk_ext_hdr_len = 0;
13128 + tcp_mtup_init(newsk);
13129 + tcp_sync_mss(newsk, dst_mtu(dst));
13130 + newtp->advmss = dst_metric_advmss(dst);
13131 + if (tcp_sk(meta_sk)->rx_opt.user_mss &&
13132 + tcp_sk(meta_sk)->rx_opt.user_mss < newtp->advmss)
13133 + newtp->advmss = tcp_sk(meta_sk)->rx_opt.user_mss;
13135 + tcp_initialize_rcv_mss(newsk);
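+ /* This subflow is IPv6-only; mark the IPv4 address fields with the
+ * LOOPBACK4_IPV6 placeholder, as tcp_v6_syn_recv_sock does.
+ */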
13137 + newinet->inet_daddr = LOOPBACK4_IPV6;
13138 + newinet->inet_saddr = LOOPBACK4_IPV6;
13139 + newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
13141 + if (__inet_inherit_port(meta_sk, newsk) < 0) {
13142 + inet_csk_prepare_forced_close(newsk);
13143 + tcp_done(newsk);
13144 + goto out;
13146 + __inet6_hash(newsk, NULL);
13148 + return newsk;
13150 +out_overflow:
13151 + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_LISTENOVERFLOWS);
13152 +out_nonewsk:
13153 + dst_release(dst);
13154 +out:
13155 + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_LISTENDROPS);
13156 + return NULL;
13159 +/* Similar to tcp_v6_conn_request */
13160 +static void mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
13162 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
13163 + struct tcp_options_received tmp_opt;
13164 + struct mptcp_options_received mopt;
13165 + struct ipv6_pinfo *np = inet6_sk(meta_sk);
13166 + struct request_sock *req;
13167 + struct inet_request_sock *treq;
13168 + struct mptcp_request_sock *mtreq;
13169 + u8 mptcp_hash_mac[20];
13170 + __u32 isn = TCP_SKB_CB(skb)->when;
13171 + struct dst_entry *dst = NULL;
13172 + struct flowi6 fl6;
13173 + int want_cookie = 0;
13174 + union inet_addr addr;
13176 + tcp_clear_options(&tmp_opt);
13177 + mptcp_init_mp_opt(&mopt);
13178 + tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
13179 + tmp_opt.user_mss = tcp_sk(meta_sk)->rx_opt.user_mss;
13180 + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
13182 + req = inet6_reqsk_alloc(&mptcp6_request_sock_ops);
13183 + if (!req)
13184 + return;
13186 +#ifdef CONFIG_TCP_MD5SIG
13187 + tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
13188 +#endif
13190 + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
13191 + tcp_openreq_init(req, &tmp_opt, skb);
13193 + treq = inet_rsk(req);
13194 + treq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
13195 + treq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
13197 + if (!want_cookie || tmp_opt.tstamp_ok)
13198 + TCP_ECN_create_request(req, skb, sock_net(meta_sk));
13200 + treq->ir_iif = meta_sk->sk_bound_dev_if;
13202 + /* So that link locals have meaning */
13203 + if (!meta_sk->sk_bound_dev_if &&
13204 + ipv6_addr_type(&treq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
13205 + treq->ir_iif = inet6_iif(skb);
13207 + if (!isn) {
13208 + if (meta_sk->sk_family == AF_INET6 &&
13209 + (ipv6_opt_accepted(meta_sk, skb) ||
13210 + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
13211 + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)) {
13212 + atomic_inc(&skb->users);
13213 + treq->pktopts = skb;
13216 + /* VJ's idea. We save last timestamp seen
13217 + * from the destination in peer table, when entering
13218 + * state TIME-WAIT, and check against it before
13219 + * accepting new connection request.
13221 + * If "isn" is not zero, this request hit alive
13222 + * timewait bucket, so that all the necessary checks
13223 + * are made in the function processing timewait state.
13224 + */
13225 + if (tmp_opt.saw_tstamp &&
13226 + tcp_death_row.sysctl_tw_recycle &&
13227 + (dst = inet6_csk_route_req(meta_sk, &fl6, req)) != NULL) {
13228 + if (!tcp_peer_is_proven(req, dst, true)) {
13229 + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_PAWSPASSIVEREJECTED);
13230 + goto drop_and_release;
13233 + /* Kill the following clause, if you dislike this way. */
13234 + else if (!sysctl_tcp_syncookies &&
13235 + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(meta_sk) <
13236 + (sysctl_max_syn_backlog >> 2)) &&
13237 + !tcp_peer_is_proven(req, dst, false)) {
13238 + /* Without syncookies last quarter of
13239 + * backlog is filled with destinations,
13240 + * proven to be alive.
13241 + * It means that we continue to communicate
13242 + * to destinations, already remembered
13243 + * to the moment of synflood.
13244 + */
13245 + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n",
13246 + &treq->ir_v6_rmt_addr,
13247 + ntohs(tcp_hdr(skb)->source));
13248 + goto drop_and_release;
13251 + isn = tcp_v6_init_sequence(skb);
13254 + tcp_rsk(req)->snt_isn = isn;
13255 + tcp_rsk(req)->snt_synack = tcp_time_stamp;
13256 + tcp_rsk(req)->listener = NULL;
13258 + mtreq = mptcp_rsk(req);
13259 + mtreq->mpcb = mpcb;
13260 + INIT_LIST_HEAD(&mtreq->collide_tuple);
13261 + mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
13262 + mtreq->mptcp_rem_key = mpcb->mptcp_rem_key;
13263 + mtreq->mptcp_loc_key = mpcb->mptcp_loc_key;
13264 + mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->daddr.s6_addr32,
13265 + ipv6_hdr(skb)->saddr.s6_addr32,
13266 + tcp_hdr(skb)->dest,
13267 + tcp_hdr(skb)->source, isn);
13268 + mptcp_hmac_sha1((u8 *)&mtreq->mptcp_loc_key,
13269 + (u8 *)&mtreq->mptcp_rem_key,
13270 + (u8 *)&mtreq->mptcp_loc_nonce,
13271 + (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac);
13272 + mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
13274 + addr.in6 = treq->ir_v6_loc_addr;
13275 + mtreq->loc_id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(meta_sk));
13276 + if (mtreq->loc_id == -1) /* Address not part of the allowed ones */
13277 + goto drop_and_release;
13278 + mtreq->rem_id = mopt.rem_id;
13279 + mtreq->low_prio = mopt.low_prio;
13280 + tcp_rsk(req)->saw_mpc = 1;
13282 + if (meta_sk->sk_family == AF_INET6) {
13283 + if (tcp_v6_send_synack(meta_sk, dst, &fl6, req,
13284 + skb_get_queue_mapping(skb)))
13285 + goto drop_and_free;
13286 + } else {
13287 + if (mptcp_v6v4_send_synack(meta_sk, req, skb_get_queue_mapping(skb)))
13288 + goto drop_and_free;
13291 + /* Adding to request queue in metasocket */
13292 + mptcp_v6_reqsk_queue_hash_add(meta_sk, req, TCP_TIMEOUT_INIT);
13294 + return;
13296 +drop_and_release:
13297 + dst_release(dst);
13298 +drop_and_free:
13299 + reqsk_free(req);
13300 + return;
13303 +int mptcp_v6_rem_raddress(struct mptcp_cb *mpcb, u8 id)
13305 + int i;
13307 + for (i = 0; i < MPTCP_MAX_ADDR; i++) {
13308 + if (!((1 << i) & mpcb->rem6_bits))
13309 + continue;
13311 + if (mpcb->remaddr6[i].rem6_id == id) {
13312 + /* remove address from bitfield */
13313 + mpcb->rem6_bits &= ~(1 << i);
13315 + return 0;
13319 + return -1;
13322 +/* Returns -1 if there is no more space to store an additional
13323 + * address
13324 + */
13325 +int mptcp_v6_add_raddress(struct mptcp_cb *mpcb, const struct in6_addr *addr,
13326 + __be16 port, u8 id)
13328 + int i;
13329 + struct mptcp_rem6 *rem6;
13331 + mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
13332 + rem6 = &mpcb->remaddr6[i];
13334 + /* Address is already in the list --- continue */
13335 + if (rem6->rem6_id == id &&
13336 + ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port)
13337 + return 0;
13339 + /* This may be the case when the peer is behind a NAT. It is
13340 + * trying to JOIN, and thus sends the JOIN with a certain ID.
13341 + * However, the src_addr of the IP packet has been changed. We
13342 + * update the addr in the list, because this is the address as
13343 + * our box sees it.
13344 + */
13345 + if (rem6->rem6_id == id) {
13346 + /* update the address */
13347 + mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n",
13348 + __func__, &rem6->addr, addr, id);
13349 + rem6->addr = *addr;
13350 + rem6->port = port;
13351 + mpcb->list_rcvd = 1;
13352 + return 0;
13356 + i = mptcp_find_free_index(mpcb->rem6_bits);
13357 + /* Do we already have the maximum number of local/remote addresses? */
13358 + if (i < 0) {
13359 + mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n",
13360 + __func__, MPTCP_MAX_ADDR, addr);
13361 + return -1;
13364 + rem6 = &mpcb->remaddr6[i];
13366 + /* Address is not known yet, store it */
13367 + rem6->addr = *addr;
13368 + rem6->port = port;
13369 + rem6->bitfield = 0;
13370 + rem6->retry_bitfield = 0;
13371 + rem6->rem6_id = id;
13372 + mpcb->list_rcvd = 1;
13373 + mpcb->rem6_bits |= (1 << i);
13375 + return 0;
13378 +/* Sets the bitfield of the remote-address field
13379 + * local address is not set as it will disappear with the global address-list
13380 + */
13381 +void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
13382 + const struct in6_addr *daddr, int index)
13384 + int i;
13385 + mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
13386 + if (ipv6_addr_equal(&mpcb->remaddr6[i].addr, daddr)) {
13387 + mpcb->remaddr6[i].bitfield |= (1 << index);
13388 + return;
13393 +int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
13395 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
13396 + struct sock *child, *rsk = NULL;
13397 + int ret;
13399 + if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
13400 + struct tcphdr *th = tcp_hdr(skb);
13401 + const struct ipv6hdr *ip6h = ipv6_hdr(skb);
13402 + struct sock *sk;
13404 + sk = __inet6_lookup_established(sock_net(meta_sk),
13405 + &tcp_hashinfo,
13406 + &ip6h->saddr, th->source,
13407 + &ip6h->daddr, ntohs(th->dest),
13408 + inet6_iif(skb));
13410 + if (!sk) {
13411 + kfree_skb(skb);
13412 + return 0;
13414 + if (is_meta_sk(sk)) {
13415 + WARN("%s Did not find a sub-sk!\n", __func__);
13416 + kfree_skb(skb);
13417 + sock_put(sk);
13418 + return 0;
13421 + if (sk->sk_state == TCP_TIME_WAIT) {
13422 + inet_twsk_put(inet_twsk(sk));
13423 + kfree_skb(skb);
13424 + return 0;
13427 + ret = tcp_v6_do_rcv(sk, skb);
13428 + sock_put(sk);
13430 + return ret;
13432 + TCP_SKB_CB(skb)->mptcp_flags = 0;
13434 + /* Has been removed from the tk-table. Thus, no new subflows.
13436 + * Check for close-state is necessary, because we may have been closed
13437 + * without passing by mptcp_close().
13439 + * When falling back, no new subflows are allowed either.
13440 + */
13441 + if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
13442 + mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
13443 + goto reset_and_discard;
13445 + child = tcp_v6_hnd_req(meta_sk, skb);
13447 + if (!child)
13448 + goto discard;
13450 + if (child != meta_sk) {
13451 + sock_rps_save_rxhash(child, skb);
13452 + /* We don't call tcp_child_process here, because we already
13453 + * hold the meta-sk-lock and are sure that it is not owned
13454 + * by the user.
13455 + */
13456 + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
13457 + bh_unlock_sock(child);
13458 + sock_put(child);
13459 + if (ret) {
13460 + rsk = child;
13461 + goto reset_and_discard;
13463 + } else {
13464 + if (tcp_hdr(skb)->syn) {
13465 + struct mp_join *join_opt = mptcp_find_join(skb);
13466 + /* Currently we make two calls to mptcp_find_join(). This
13467 + * can probably be optimized. */
13468 + if (mptcp_v6_add_raddress(mpcb,
13469 + (struct in6_addr *)&ipv6_hdr(skb)->saddr,
13470 + 0,
13471 + join_opt->addr_id) < 0)
13472 + goto reset_and_discard;
13473 + mpcb->list_rcvd = 0;
13475 + mptcp_v6_join_request(meta_sk, skb);
13476 + goto discard;
13478 + goto reset_and_discard;
13480 + return 0;
13482 +reset_and_discard:
13483 + tcp_v6_send_reset(rsk, skb);
13484 +discard:
13485 + kfree_skb(skb);
13486 + return 0;
13489 +/* After this, the ref count of the meta_sk associated with the request_sock
13490 + * is incremented. Thus it is the responsibility of the caller
13491 + * to call sock_put() when the reference is not needed anymore.
13492 + */
13493 +struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
13494 + const struct in6_addr *laddr, const struct net *net)
13496 + struct mptcp_request_sock *mtreq;
13497 + struct sock *meta_sk = NULL;
13499 + spin_lock(&mptcp_reqsk_hlock);
13500 + list_for_each_entry(mtreq,
13501 + &mptcp_reqsk_htb[inet6_synq_hash(raddr, rport, 0,
13502 + MPTCP_HASH_SIZE)],
13503 + collide_tuple) {
13504 + struct inet_request_sock *treq = inet_rsk(rev_mptcp_rsk(mtreq));
13505 + meta_sk = mtreq->mpcb->meta_sk;
13507 + if (inet_rsk(rev_mptcp_rsk(mtreq))->ir_rmt_port == rport &&
13508 + rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 &&
13509 + ipv6_addr_equal(&treq->ir_v6_rmt_addr, raddr) &&
13510 + ipv6_addr_equal(&treq->ir_v6_loc_addr, laddr) &&
13511 + net_eq(net, sock_net(meta_sk)))
13512 + break;
13513 + meta_sk = NULL;
13516 + if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
13517 + meta_sk = NULL;
13518 + spin_unlock(&mptcp_reqsk_hlock);
13520 + return meta_sk;
13523 +/* Create a new IPv6 subflow.
13525 + * We are in user-context and the meta-sock-lock is held.
13526 + */
13527 +int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
13528 + struct mptcp_rem6 *rem)
13530 + struct tcp_sock *tp;
13531 + struct sock *sk;
13532 + struct sockaddr_in6 loc_in, rem_in;
13533 + struct socket sock;
13534 + int ulid_size = 0, ret;
13536 + /** First, create and prepare the new socket */
13538 + sock.type = meta_sk->sk_socket->type;
13539 + sock.state = SS_UNCONNECTED;
13540 + sock.wq = meta_sk->sk_socket->wq;
13541 + sock.file = meta_sk->sk_socket->file;
13542 + sock.ops = NULL;
13544 + ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
13545 + if (unlikely(ret < 0)) {
13546 + mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret);
13547 + return ret;
13550 + sk = sock.sk;
13551 + tp = tcp_sk(sk);
13553 + /* All subsockets need the MPTCP-lock-class */
13554 + lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
13555 + lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
13557 + if (mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL))
13558 + goto error;
13560 + tp->mptcp->slave_sk = 1;
13561 + tp->mptcp->low_prio = loc->low_prio;
13563 + /* Initializing the timer for an MPTCP subflow */
13564 + setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
13566 + /** Then, connect the socket to the peer */
13568 + ulid_size = sizeof(struct sockaddr_in6);
13569 + loc_in.sin6_family = AF_INET6;
13570 + rem_in.sin6_family = AF_INET6;
13571 + loc_in.sin6_port = 0;
13572 + if (rem->port)
13573 + rem_in.sin6_port = rem->port;
13574 + else
13575 + rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
13576 + loc_in.sin6_addr = loc->addr;
13577 + rem_in.sin6_addr = rem->addr;
13579 + ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, ulid_size);
13580 + if (ret < 0) {
13581 + mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
13582 + __func__, ret);
13583 + goto error;
13586 + mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n",
13587 + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
13588 + tp->mptcp->path_index, &loc_in.sin6_addr,
13589 + ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
13590 + ntohs(rem_in.sin6_port));
13592 + ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
13593 + ulid_size, O_NONBLOCK);
13594 + if (ret < 0 && ret != -EINPROGRESS) {
13595 + mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
13596 + __func__, ret);
13597 + goto error;
13600 + sk_set_socket(sk, meta_sk->sk_socket);
13601 + sk->sk_wq = meta_sk->sk_wq;
13603 + return 0;
13605 +error:
13606 + /* May happen if mptcp_add_sock fails first */
13607 + if (!tp->mpc) {
13608 + tcp_close(sk, 0);
13609 + } else {
13610 + local_bh_disable();
13611 + mptcp_sub_force_close(sk);
13612 + local_bh_enable();
13614 + return ret;
13616 +EXPORT_SYMBOL(mptcp_init6_subsockets);
13618 +int mptcp_pm_v6_init(void)
13620 + int ret = 0;
13621 + struct request_sock_ops *ops = &mptcp6_request_sock_ops;
13623 + ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
13624 + if (ops->slab_name == NULL) {
13625 + ret = -ENOMEM;
13626 + goto out;
13629 + ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
13630 + SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
13631 + NULL);
13633 + if (ops->slab == NULL) {
13634 + ret = -ENOMEM;
13635 + goto err_reqsk_create;
13638 +out:
13639 + return ret;
13641 +err_reqsk_create:
13642 + kfree(ops->slab_name);
13643 + ops->slab_name = NULL;
13644 + goto out;
13647 +void mptcp_pm_v6_undo(void)
13649 + kmem_cache_destroy(mptcp6_request_sock_ops.slab);
13650 + kfree(mptcp6_request_sock_ops.slab_name);
13652 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_ndiffports.c linux-3.14.45/net/mptcp/mptcp_ndiffports.c
13653 --- linux-3.14.45.orig/net/mptcp/mptcp_ndiffports.c 1970-01-01 01:00:00.000000000 +0100
13654 +++ linux-3.14.45/net/mptcp/mptcp_ndiffports.c 2015-06-24 14:15:48.931862523 +0200
13655 @@ -0,0 +1,171 @@
13656 +#include <linux/module.h>
13658 +#include <net/mptcp.h>
13659 +#include <net/mptcp_v4.h>
13661 +#if IS_ENABLED(CONFIG_IPV6)
13662 +#include <net/mptcp_v6.h>
13663 +#endif
13665 +struct ndiffports_priv {
13666 + /* Worker struct for subflow establishment */
13667 + struct work_struct subflow_work;
13669 + struct mptcp_cb *mpcb;
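+/* Number of subflows the ndiffports path-manager tries to establish over the
+ * same pair of addresses; tunable via the net.mptcp.mptcp_ndiffports sysctl.
+ */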
13672 +static int sysctl_mptcp_ndiffports __read_mostly = 2;
13674 +/**
13675 + * Create all new subflows by calling mptcp_initX_subsockets.
13677 + * This function uses a goto next_subflow to allow releasing the lock between
13678 + * new subflows, giving other processes a chance to do some work on the
13679 + * socket and potentially finish the communication.
13680 + **/
13681 +static void create_subflow_worker(struct work_struct *work)
13683 + struct ndiffports_priv *pm_priv = container_of(work,
13684 + struct ndiffports_priv,
13685 + subflow_work);
13686 + struct mptcp_cb *mpcb = pm_priv->mpcb;
13687 + struct sock *meta_sk = mpcb->meta_sk;
13688 + int iter = 0;
13690 +next_subflow:
13691 + if (iter) {
13692 + release_sock(meta_sk);
13693 + mutex_unlock(&mpcb->mpcb_mutex);
13695 + yield();
13697 + mutex_lock(&mpcb->mpcb_mutex);
13698 + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
13700 + iter++;
13702 + if (sock_flag(meta_sk, SOCK_DEAD))
13703 + goto exit;
13705 + if (mpcb->master_sk &&
13706 + !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
13707 + goto exit;
13709 + if (sysctl_mptcp_ndiffports > iter &&
13710 + sysctl_mptcp_ndiffports > mpcb->cnt_subflows) {
13711 + if (meta_sk->sk_family == AF_INET ||
13712 + mptcp_v6_is_v4_mapped(meta_sk)) {
13713 + struct mptcp_loc4 loc;
13715 + loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
13716 + loc.loc4_id = 0;
13717 + loc.low_prio = 0;
13719 + mptcp_init4_subsockets(meta_sk, &loc, &mpcb->remaddr4[0]);
13720 + } else {
13721 +#if IS_ENABLED(CONFIG_IPV6)
13722 + struct mptcp_loc6 loc;
13724 + loc.addr = inet6_sk(meta_sk)->saddr;
13725 + loc.loc6_id = 0;
13726 + loc.low_prio = 0;
13728 + mptcp_init6_subsockets(meta_sk, &loc, &mpcb->remaddr6[0]);
13729 +#endif
13731 + goto next_subflow;
13734 +exit:
13735 + release_sock(meta_sk);
13736 + mutex_unlock(&mpcb->mpcb_mutex);
13737 + sock_put(meta_sk);
13740 +static void ndiffports_new_session(struct sock *meta_sk, int index)
13742 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
13743 + struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
13745 + /* Initialize workqueue-struct */
13746 + INIT_WORK(&fmp->subflow_work, create_subflow_worker);
13747 + fmp->mpcb = mpcb;
13750 +static void ndiffports_create_subflows(struct sock *meta_sk)
13752 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
13753 + struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
13755 + if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
13756 + mpcb->send_infinite_mapping ||
13757 + mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
13758 + return;
13760 + if (!work_pending(&pm_priv->subflow_work)) {
13761 + sock_hold(meta_sk);
13762 + queue_work(mptcp_wq, &pm_priv->subflow_work);
13766 +static int ndiffports_get_local_index(sa_family_t family, union inet_addr *addr,
13767 + struct net *net)
13769 + return 0;
13772 +static struct mptcp_pm_ops ndiffports __read_mostly = {
13773 + .new_session = ndiffports_new_session,
13774 + .fully_established = ndiffports_create_subflows,
13775 + .get_local_index = ndiffports_get_local_index,
13776 + .get_local_id = ndiffports_get_local_index,
13777 + .name = "ndiffports",
13778 + .owner = THIS_MODULE,
13781 +static struct ctl_table ndiff_table[] = {
13783 + .procname = "mptcp_ndiffports",
13784 + .data = &sysctl_mptcp_ndiffports,
13785 + .maxlen = sizeof(int),
13786 + .mode = 0644,
13787 + .proc_handler = &proc_dointvec
13788 + },
13789 + { }
13792 +struct ctl_table_header *mptcp_sysctl;
13794 +/* General initialization of MPTCP_PM */
13795 +static int __init ndiffports_register(void)
13797 + BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE);
13799 + mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", ndiff_table);
13800 + if (!mptcp_sysctl)
13801 + goto exit;
13803 + if (mptcp_register_path_manager(&ndiffports))
13804 + goto pm_failed;
13806 + return 0;
13808 +pm_failed:
13809 + unregister_net_sysctl_table(mptcp_sysctl);
13810 +exit:
13811 + return -1;
13814 +static void ndiffports_unregister(void)
13816 + mptcp_unregister_path_manager(&ndiffports);
13817 + unregister_net_sysctl_table(mptcp_sysctl);
13820 +module_init(ndiffports_register);
13821 +module_exit(ndiffports_unregister);
13823 +MODULE_AUTHOR("Christoph Paasch");
13824 +MODULE_LICENSE("GPL");
13825 +MODULE_DESCRIPTION("NDIFF-PORTS MPTCP");
13826 +MODULE_VERSION("0.88");
13827 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_ofo_queue.c linux-3.14.45/net/mptcp/mptcp_ofo_queue.c
13828 --- linux-3.14.45.orig/net/mptcp/mptcp_ofo_queue.c 1970-01-01 01:00:00.000000000 +0100
13829 +++ linux-3.14.45/net/mptcp/mptcp_ofo_queue.c 2015-06-24 14:15:48.931862523 +0200
13830 @@ -0,0 +1,278 @@
13832 + * MPTCP implementation - Fast algorithm for MPTCP meta-reordering
13834 + * Initial Design & Implementation:
13835 + * Sébastien Barré <sebastien.barre@uclouvain.be>
13837 + * Current Maintainer & Author:
13838 + * Christoph Paasch <christoph.paasch@uclouvain.be>
13840 + * Additional authors:
13841 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
13842 + * Gregory Detal <gregory.detal@uclouvain.be>
13843 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
13844 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
13845 + * Lavkesh Lahngir <lavkesh51@gmail.com>
13846 + * Andreas Ripke <ripke@neclab.eu>
13847 + * Vlad Dogaru <vlad.dogaru@intel.com>
13848 + * Octavian Purdila <octavian.purdila@intel.com>
13849 + * John Ronan <jronan@tssg.org>
13850 + * Catalin Nicutar <catalin.nicutar@gmail.com>
13851 + * Brandon Heller <brandonh@stanford.edu>
13853 + * This program is free software; you can redistribute it and/or
13854 + * modify it under the terms of the GNU General Public License
13855 + * as published by the Free Software Foundation; either version
13856 + * 2 of the License, or (at your option) any later version.
13857 + */
13859 +#include <linux/skbuff.h>
13860 +#include <linux/slab.h>
13861 +#include <net/tcp.h>
13862 +#include <net/mptcp.h>
13864 +void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
13865 + const struct sk_buff *skb)
13867 + struct tcp_sock *tp;
13869 + mptcp_for_each_tp(mpcb, tp) {
13870 + if (tp->mptcp->shortcut_ofoqueue == skb) {
13871 + tp->mptcp->shortcut_ofoqueue = NULL;
13872 + return;
13877 +/* Does 'skb' fit after 'here' in the queue 'head'?
13878 + * If yes, we queue it and return 1
13879 + */
13880 +static int mptcp_ofo_queue_after(struct sk_buff_head *head,
13881 + struct sk_buff *skb, struct sk_buff *here,
13882 + struct tcp_sock *tp)
13884 + struct sock *meta_sk = tp->meta_sk;
13885 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
13886 + u32 seq = TCP_SKB_CB(skb)->seq;
13887 + u32 end_seq = TCP_SKB_CB(skb)->end_seq;
13889 + /* We want to queue skb after here, thus seq >= end_seq */
13890 + if (before(seq, TCP_SKB_CB(here)->end_seq))
13891 + return 0;
13893 + if (seq == TCP_SKB_CB(here)->end_seq) {
13894 + bool fragstolen = false;
13896 + if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) {
13897 + __skb_queue_after(&meta_tp->out_of_order_queue, here, skb);
13898 + return 1;
13899 + } else {
13900 + kfree_skb_partial(skb, fragstolen);
13901 + return -1;
13905 + /* If here is the last one, we can always queue it */
13906 + if (skb_queue_is_last(head, here)) {
13907 + __skb_queue_after(head, here, skb);
13908 + return 1;
13909 + } else {
13910 + struct sk_buff *skb1 = skb_queue_next(head, here);
13911 + /* It's not the last one, but does it fit between 'here' and
13912 + * the one after 'here'? That is, is end_seq <= after_here->seq
13913 + */
13914 + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) {
13915 + __skb_queue_after(head, here, skb);
13916 + return 1;
13920 + return 0;
13923 +static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb,
13924 + struct sk_buff_head *head, struct tcp_sock *tp)
13926 + struct sock *meta_sk = tp->meta_sk;
13927 + struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk);
13928 + struct mptcp_cb *mpcb = meta_tp->mpcb;
13929 + struct sk_buff *skb1, *best_shortcut = NULL;
13930 + u32 seq = TCP_SKB_CB(skb)->seq;
13931 + u32 end_seq = TCP_SKB_CB(skb)->end_seq;
13932 + u32 distance = 0xffffffff;
13934 + /* First, check the tp's shortcut */
13935 + if (!shortcut) {
13936 + if (skb_queue_empty(head)) {
13937 + __skb_queue_head(head, skb);
13938 + goto end;
13940 + } else {
13941 + int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
13942 + /* Is the tp's shortcut a hit? If yes, we insert. */
13944 + if (ret) {
13945 + skb = (ret > 0) ? skb : NULL;
13946 + goto end;
13950 + /* Check the shortcuts of the other subsockets. */
13951 + mptcp_for_each_tp(mpcb, tp_it) {
13952 + shortcut = tp_it->mptcp->shortcut_ofoqueue;
13953 + /* Can we queue it here? If yes, do so! */
13954 + if (shortcut) {
13955 + int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
13957 + if (ret) {
13958 + skb = (ret > 0) ? skb : NULL;
13959 + goto end;
13963 + /* Could not queue it, check if we are close.
13964 + * We are looking for a shortcut, close enough to seq to
13965 + * set skb1 prematurely and thus improve the subsequent lookup,
13966 + * which tries to find a skb1 so that skb1->seq <= seq.
13968 + * So, here we only consider shortcuts whose shortcut->seq > seq,
13969 + * minimize the distance between shortcut->seq and seq, and
13970 + * set best_shortcut to the one with the minimal distance.
13972 + * That way, the subsequent while-loop is shortest.
13973 + */
13974 + if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) {
13975 + /* Are we closer than the current best shortcut? */
13976 + if ((u32)(TCP_SKB_CB(shortcut)->seq - seq) < distance) {
13977 + distance = (u32)(TCP_SKB_CB(shortcut)->seq - seq);
13978 + best_shortcut = shortcut;
13983 + if (best_shortcut)
13984 + skb1 = best_shortcut;
13985 + else
13986 + skb1 = skb_peek_tail(head);
13988 + if (seq == TCP_SKB_CB(skb1)->end_seq) {
13989 + bool fragstolen = false;
13991 + if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) {
13992 + __skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb);
13993 + } else {
13994 + kfree_skb_partial(skb, fragstolen);
13995 + skb = NULL;
13998 + goto end;
14001 + /* Find the insertion point, starting from best_shortcut if available.
14003 + * Inspired by tcp_data_queue_ofo.
14004 + */
14005 + while (1) {
14006 + /* skb1->seq <= seq */
14007 + if (!after(TCP_SKB_CB(skb1)->seq, seq))
14008 + break;
14009 + if (skb_queue_is_first(head, skb1)) {
14010 + skb1 = NULL;
14011 + break;
14013 + skb1 = skb_queue_prev(head, skb1);
14016 + /* Does skb overlap the previous one? */
14017 + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
14018 + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
14019 + /* All the bits are present. */
14020 + __kfree_skb(skb);
14021 + skb = NULL;
14022 + goto end;
14024 + if (seq == TCP_SKB_CB(skb1)->seq) {
14025 + if (skb_queue_is_first(head, skb1))
14026 + skb1 = NULL;
14027 + else
14028 + skb1 = skb_queue_prev(head, skb1);
14031 + if (!skb1)
14032 + __skb_queue_head(head, skb);
14033 + else
14034 + __skb_queue_after(head, skb1, skb);
14036 + /* And clean segments covered by new one as whole. */
14037 + while (!skb_queue_is_last(head, skb)) {
14038 + skb1 = skb_queue_next(head, skb);
14040 + if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
14041 + break;
14043 + __skb_unlink(skb1, head);
14044 + mptcp_remove_shortcuts(mpcb, skb1);
14045 + __kfree_skb(skb1);
14048 +end:
14049 + if (skb) {
14050 + skb_set_owner_r(skb, meta_sk);
14051 + tp->mptcp->shortcut_ofoqueue = skb;
14054 + return;
14057 +/**
14058 + * @sk: the subflow that received this skb.
14059 + */
14060 +void mptcp_add_meta_ofo_queue(struct sock *meta_sk, struct sk_buff *skb,
14061 + struct sock *sk)
14063 + struct tcp_sock *tp = tcp_sk(sk);
14065 + try_shortcut(tp->mptcp->shortcut_ofoqueue, skb,
14066 + &tcp_sk(meta_sk)->out_of_order_queue, tp);
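+/* Move in-sequence segments from the meta out-of-order queue to the meta
+ * receive queue, dropping segments already fully covered by rcv_nxt and
+ * handling a FIN when one is dequeued.
+ */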
14069 +void mptcp_ofo_queue(struct sock *meta_sk)
14071 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14072 + struct sk_buff *skb;
14074 + while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) {
14075 + u32 old_rcv_nxt = meta_tp->rcv_nxt;
14076 + if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt))
14077 + break;
14079 + if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) {
14080 + __skb_unlink(skb, &meta_tp->out_of_order_queue);
14081 + mptcp_remove_shortcuts(meta_tp->mpcb, skb);
14082 + __kfree_skb(skb);
14083 + continue;
14086 + __skb_unlink(skb, &meta_tp->out_of_order_queue);
14087 + mptcp_remove_shortcuts(meta_tp->mpcb, skb);
14089 + __skb_queue_tail(&meta_sk->sk_receive_queue, skb);
14090 + meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
14091 + mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
14093 + if (tcp_hdr(skb)->fin)
14094 + mptcp_fin(meta_sk);
14098 +void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp)
14100 + struct sk_buff_head *head = &meta_tp->out_of_order_queue;
14101 + struct sk_buff *skb, *tmp;
14103 + skb_queue_walk_safe(head, skb, tmp) {
14104 + __skb_unlink(skb, head);
14105 + mptcp_remove_shortcuts(meta_tp->mpcb, skb);
14106 + kfree_skb(skb);
14109 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_olia.c linux-3.14.45/net/mptcp/mptcp_olia.c
14110 --- linux-3.14.45.orig/net/mptcp/mptcp_olia.c 1970-01-01 01:00:00.000000000 +0100
14111 +++ linux-3.14.45/net/mptcp/mptcp_olia.c 2015-06-24 14:15:48.931862523 +0200
14112 @@ -0,0 +1,314 @@
14114 + * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL:
14116 + * Algorithm design:
14117 + * Ramin Khalili <ramin.khalili@epfl.ch>
14118 + * Nicolas Gast <nicolas.gast@epfl.ch>
14119 + * Jean-Yves Le Boudec <jean-yves.leboudec@epfl.ch>
14121 + * Implementation:
14122 + * Ramin Khalili <ramin.khalili@epfl.ch>
14124 + * Ported to the official MPTCP-kernel:
14125 + * Christoph Paasch <christoph.paasch@uclouvain.be>
14127 + * This program is free software; you can redistribute it and/or
14128 + * modify it under the terms of the GNU General Public License
14129 + * as published by the Free Software Foundation; either version
14130 + * 2 of the License, or (at your option) any later version.
14131 + */
14134 +#include <net/tcp.h>
14135 +#include <net/mptcp.h>
14137 +#include <linux/module.h>
14139 +static int scale = 10;
14141 +struct mptcp_olia {
14142 + u32 mptcp_loss1;
14143 + u32 mptcp_loss2;
14144 + u32 mptcp_loss3;
14145 + int epsilon_num;
14146 + u32 epsilon_den;
14147 + int mptcp_snd_cwnd_cnt;
14150 +static inline int mptcp_olia_sk_can_send(const struct sock *sk)
14152 + return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt;
14155 +static inline u64 mptcp_olia_scale(u64 val, int scale)
14157 + return (u64) val << scale;
14160 +/* Take care of the artificial inflation of cwnd (see RFC 5681)
14161 + * during the fast-retransmit phase
14162 + */
14163 +static u32 mptcp_get_crt_cwnd(struct sock *sk)
14165 + struct inet_connection_sock *icsk = inet_csk(sk);
14167 + if (icsk->icsk_ca_state == TCP_CA_Recovery)
14168 + return tcp_sk(sk)->snd_ssthresh;
14169 + else
14170 + return tcp_sk(sk)->snd_cwnd;
14173 +/* return the denominator of the first term of the increase formula */
14174 +static u64 mptcp_get_rate(struct mptcp_cb *mpcb , u32 path_rtt)
14176 + struct sock *sk;
14177 + u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */
14179 + mptcp_for_each_sk(mpcb, sk) {
14180 + struct tcp_sock *tp = tcp_sk(sk);
14181 + u64 scaled_num;
14182 + u32 tmp_cwnd;
14184 + if (!mptcp_olia_sk_can_send(sk))
14185 + continue;
14187 + tmp_cwnd = mptcp_get_crt_cwnd(sk);
14188 + scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt;
14189 + rate += div_u64(scaled_num , tp->srtt);
14191 + rate *= rate;
14192 + return rate;
14195 +/* find the maximum cwnd, used to find set M */
14196 +static u32 mptcp_get_max_cwnd(struct mptcp_cb *mpcb)
14198 + struct sock *sk;
14199 + u32 best_cwnd = 0;
14201 + mptcp_for_each_sk(mpcb, sk) {
14202 + u32 tmp_cwnd;
14204 + if (!mptcp_olia_sk_can_send(sk))
14205 + continue;
14207 + tmp_cwnd = mptcp_get_crt_cwnd(sk);
14208 + if (tmp_cwnd > best_cwnd)
14209 + best_cwnd = tmp_cwnd;
14211 + return best_cwnd;
14214 +static void mptcp_get_epsilon(struct mptcp_cb *mpcb)
14216 + struct mptcp_olia *ca;
14217 + struct tcp_sock *tp;
14218 + struct sock *sk;
14219 + u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1;
14220 + u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd;
14221 + u8 M = 0, B_not_M = 0;
14223 + /* TODO - integrate this in the following loop - we just want to iterate once */
14225 + max_cwnd = mptcp_get_max_cwnd(mpcb);
14227 + /* find the best path */
14228 + mptcp_for_each_sk(mpcb, sk) {
14229 + tp = tcp_sk(sk);
14230 + ca = inet_csk_ca(sk);
14232 + if (!mptcp_olia_sk_can_send(sk))
14233 + continue;
14235 + tmp_rtt = tp->srtt * tp->srtt;
14236 + /* TODO - check here and rename variables */
14237 + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
14238 + ca->mptcp_loss2 - ca->mptcp_loss1);
14240 + tmp_cwnd = mptcp_get_crt_cwnd(sk);
14241 + if (tmp_int * best_rtt >= best_int * tmp_rtt) {
14242 + best_rtt = tmp_rtt;
14243 + best_int = tmp_int;
14244 + best_cwnd = tmp_cwnd;
14248 + /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */
14249 + /* find the size of M and B_not_M */
14250 + mptcp_for_each_sk(mpcb, sk) {
14251 + tp = tcp_sk(sk);
14252 + ca = inet_csk_ca(sk);
14254 + if (!mptcp_olia_sk_can_send(sk))
14255 + continue;
14257 + tmp_cwnd = mptcp_get_crt_cwnd(sk);
14258 + if (tmp_cwnd == max_cwnd) {
14259 + M++;
14260 + } else {
14261 + tmp_rtt = tp->srtt * tp->srtt;
14262 + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
14263 + ca->mptcp_loss2 - ca->mptcp_loss1);
14265 + if (tmp_int * best_rtt == best_int * tmp_rtt)
14266 + B_not_M++;
14270 + /* check if the path is in M or B_not_M and set the value of epsilon accordingly */
14271 + mptcp_for_each_sk(mpcb, sk) {
14272 + tp = tcp_sk(sk);
14273 + ca = inet_csk_ca(sk);
14275 + if (!mptcp_olia_sk_can_send(sk))
14276 + continue;
14278 + if (B_not_M == 0) {
14279 + ca->epsilon_num = 0;
14280 + ca->epsilon_den = 1;
14281 + } else {
14282 + tmp_rtt = tp->srtt * tp->srtt;
14283 + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
14284 + ca->mptcp_loss2 - ca->mptcp_loss1);
14285 + tmp_cwnd = mptcp_get_crt_cwnd(sk);
14287 + if (tmp_cwnd < max_cwnd &&
14288 + tmp_int * best_rtt == best_int * tmp_rtt){
14289 + ca->epsilon_num = 1;
14290 + ca->epsilon_den = mpcb->cnt_established * B_not_M;
14291 + } else if (tmp_cwnd == max_cwnd) {
14292 + ca->epsilon_num = -1;
14293 + ca->epsilon_den = mpcb->cnt_established * M;
14294 + } else {
14295 + ca->epsilon_num = 0;
14296 + ca->epsilon_den = 1;
14303 +/* setting the initial values */
14304 +static void mptcp_olia_init(struct sock *sk)
14306 + struct tcp_sock *tp = tcp_sk(sk);
14307 + struct mptcp_olia *ca = inet_csk_ca(sk);
14309 + if (tp->mpc) {
14310 + ca->mptcp_loss1 = tp->snd_una;
14311 + ca->mptcp_loss2 = tp->snd_una;
14312 + ca->mptcp_loss3 = tp->snd_una;
14313 + ca->mptcp_snd_cwnd_cnt = 0;
14314 + ca->epsilon_num = 0;
14315 + ca->epsilon_den = 1;
14319 +/* updating inter-loss distance and ssthresh */
14320 +static void mptcp_olia_set_state(struct sock *sk, u8 new_state)
14322 + if (!tcp_sk(sk)->mpc)
14323 + return;
14325 + if (new_state == TCP_CA_Loss ||
14326 + new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) {
14327 + struct mptcp_olia *ca = inet_csk_ca(sk);
14329 + if (ca->mptcp_loss3 != ca->mptcp_loss2 &&
14330 + !inet_csk(sk)->icsk_retransmits) {
14331 + ca->mptcp_loss1 = ca->mptcp_loss2;
14332 + ca->mptcp_loss2 = ca->mptcp_loss3;
14338 +/* main algorithm */
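+/* Per ACK, OLIA aims to grow cwnd_r by roughly
+ * (w_r / rtt_r^2) / (sum_p w_p / rtt_p)^2 + epsilon_r / w_r
+ * (see the OLIA paper by Khalili et al.); the fixed-point code below
+ * accumulates this increase in mptcp_snd_cwnd_cnt before bumping snd_cwnd.
+ */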
14339 +static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
14341 + struct tcp_sock *tp = tcp_sk(sk);
14342 + struct mptcp_olia *ca = inet_csk_ca(sk);
14343 + struct mptcp_cb *mpcb = tp->mpcb;
14345 + u64 inc_num, inc_den, rate, cwnd_scaled;
14347 + if (!tp->mpc) {
14348 + tcp_reno_cong_avoid(sk, ack, acked, in_flight);
14349 + return;
14352 + ca->mptcp_loss3 = tp->snd_una;
14354 + if (!tcp_is_cwnd_limited(sk, in_flight))
14355 + return;
14357 + /* slow start if it is in the safe area */
14358 + if (tp->snd_cwnd <= tp->snd_ssthresh) {
14359 + tcp_slow_start(tp, acked);
14360 + return;
14363 + mptcp_get_epsilon(mpcb);
14364 + rate = mptcp_get_rate(mpcb, tp->srtt);
14365 + cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale);
14366 + inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1;
14368 + /* calculate the increase term; scaling is used to reduce the rounding effect */
14369 + if (ca->epsilon_num == -1) {
14370 + if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
14371 + inc_num = rate - ca->epsilon_den *
14372 + cwnd_scaled * cwnd_scaled;
14373 + ca->mptcp_snd_cwnd_cnt -= div64_u64(
14374 + mptcp_olia_scale(inc_num , scale) , inc_den);
14375 + } else {
14376 + inc_num = ca->epsilon_den *
14377 + cwnd_scaled * cwnd_scaled - rate;
14378 + ca->mptcp_snd_cwnd_cnt += div64_u64(
14379 + mptcp_olia_scale(inc_num , scale) , inc_den);
14381 + } else {
14382 + inc_num = ca->epsilon_num * rate +
14383 + ca->epsilon_den * cwnd_scaled * cwnd_scaled;
14384 + ca->mptcp_snd_cwnd_cnt += div64_u64(
14385 + mptcp_olia_scale(inc_num , scale) , inc_den);
14389 + if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
14390 + if (tp->snd_cwnd < tp->snd_cwnd_clamp)
14391 + tp->snd_cwnd++;
14392 + ca->mptcp_snd_cwnd_cnt = 0;
14393 + } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
14394 + tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1);
14395 + ca->mptcp_snd_cwnd_cnt = 0;
14399 +static struct tcp_congestion_ops mptcp_olia = {
14400 + .init = mptcp_olia_init,
14401 + .ssthresh = tcp_reno_ssthresh,
14402 + .cong_avoid = mptcp_olia_cong_avoid,
14403 + .set_state = mptcp_olia_set_state,
14404 + .min_cwnd = tcp_reno_min_cwnd,
14405 + .owner = THIS_MODULE,
14406 + .name = "olia",
14409 +static int __init mptcp_olia_register(void)
14411 + BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
14412 + return tcp_register_congestion_control(&mptcp_olia);
14415 +static void __exit mptcp_olia_unregister(void)
14417 + tcp_unregister_congestion_control(&mptcp_olia);
14420 +module_init(mptcp_olia_register);
14421 +module_exit(mptcp_olia_unregister);
14423 +MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
14424 +MODULE_LICENSE("GPL");
14425 +MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
14426 +MODULE_VERSION("0.1");
14427 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_output.c linux-3.14.45/net/mptcp/mptcp_output.c
14428 --- linux-3.14.45.orig/net/mptcp/mptcp_output.c 1970-01-01 01:00:00.000000000 +0100
14429 +++ linux-3.14.45/net/mptcp/mptcp_output.c 2015-06-24 14:15:48.931862523 +0200
14430 @@ -0,0 +1,2255 @@
14432 + * MPTCP implementation - Sending side
14434 + * Initial Design & Implementation:
14435 + * Sébastien Barré <sebastien.barre@uclouvain.be>
14437 + * Current Maintainer & Author:
14438 + * Christoph Paasch <christoph.paasch@uclouvain.be>
14440 + * Additional authors:
14441 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
14442 + * Gregory Detal <gregory.detal@uclouvain.be>
14443 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
14444 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
14445 + * Lavkesh Lahngir <lavkesh51@gmail.com>
14446 + * Andreas Ripke <ripke@neclab.eu>
14447 + * Vlad Dogaru <vlad.dogaru@intel.com>
14448 + * Octavian Purdila <octavian.purdila@intel.com>
14449 + * John Ronan <jronan@tssg.org>
14450 + * Catalin Nicutar <catalin.nicutar@gmail.com>
14451 + * Brandon Heller <brandonh@stanford.edu>
14454 + * This program is free software; you can redistribute it and/or
14455 + * modify it under the terms of the GNU General Public License
14456 + * as published by the Free Software Foundation; either version
14457 + * 2 of the License, or (at your option) any later version.
14458 + */
14460 +#include <linux/kconfig.h>
14461 +#include <linux/skbuff.h>
14462 +#include <linux/tcp.h>
14464 +#include <net/mptcp.h>
14465 +#include <net/mptcp_v4.h>
14466 +#include <net/mptcp_v6.h>
14467 +#include <net/sock.h>
14469 +static inline int mptcp_pi_to_flag(int pi)
14471 + return 1 << (pi - 1);
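+/* Length of a REMOVE_ADDR option carrying one address-id per bit set in
+ * 'bitfield'; the loop below is a classic popcount.
+ */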
14474 +static inline int mptcp_sub_len_remove_addr(u16 bitfield)
14476 + unsigned int c;
14477 + for (c = 0; bitfield; c++)
14478 + bitfield &= bitfield - 1;
14479 + return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1;
14482 +int mptcp_sub_len_remove_addr_align(u16 bitfield)
14484 + return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4);
14486 +EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align);
14488 +/* Is the sub-socket sk available to send the skb? */
14489 +static int mptcp_is_available(struct sock *sk, struct sk_buff *skb,
14490 + unsigned int *mss)
14492 + struct tcp_sock *tp = tcp_sk(sk);
14493 + unsigned int mss_now;
14495 + /* Set of states for which we are allowed to send data */
14496 + if (!mptcp_sk_can_send(sk))
14497 + return 0;
14499 + /* We do not send data on this subflow unless it is
14500 + * fully established, i.e. the 4th ack has been received.
14501 + */
14502 + if (tp->mptcp->pre_established)
14503 + return 0;
14505 + if (tp->pf ||
14506 + (tp->mpcb->noneligible & mptcp_pi_to_flag(tp->mptcp->path_index)))
14507 + return 0;
14509 + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
14510 + /* If SACK is disabled, and we got a loss, TCP does not exit
14511 + * the loss-state until something above high_seq has been acked.
14512 + * (see tcp_try_undo_recovery)
14514 + * high_seq is the snd_nxt at the moment of the RTO. As soon
14515 + * as we have an RTO, we won't push data on the subflow.
14516 + * Thus, snd_una can never go beyond high_seq.
14517 + */
14518 + if (!tcp_is_reno(tp))
14519 + return 0;
14520 + else if (tp->snd_una != tp->high_seq)
14521 + return 0;
14524 + if (!tp->mptcp->fully_established) {
14525 + /* Make sure that we send in-order data */
14526 + if (skb && tp->mptcp->second_packet &&
14527 + tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
14528 + return 0;
14531 + if (!tcp_cwnd_test(tp, skb))
14532 + return 0;
14534 + mss_now = tcp_current_mss(sk);
14535 + /* Don't send on this subflow if we bypass the allowed send-window at
14536 + * the per-subflow level. Similar to tcp_snd_wnd_test, but with a manually
14537 + * calculated end_seq (because at this point end_seq is still at
14538 + * the meta-level).
14539 + */
14540 + if (skb && after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
14541 + return 0;
14543 + if (mss)
14544 + *mss = mss_now;
14546 + return 1;
14549 +/* Are we not allowed to reinject this skb on tp? */
14550 +static int mptcp_dont_reinject_skb(struct tcp_sock *tp, struct sk_buff *skb)
14552 + /* If the skb has already been enqueued in this sk, try to find
14553 + * another one.
14554 + */
14555 + return skb &&
14556 + /* Has the skb already been enqueued into this subsocket? */
14557 + mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
14560 +/* This is the scheduler. This function decides on which flow to send
14561 + * a given MSS. If all subflows are found to be busy, NULL is returned.
14562 + * The flow is selected based on the shortest RTT.
14563 + * If all paths have full congestion windows, we simply return NULL.
14565 + * Additionally, this function is aware of the backup-subflows.
14566 + */
14567 +static struct sock *get_available_subflow(struct sock *meta_sk,
14568 + struct sk_buff *skb,
14569 + unsigned int *mss_now)
14571 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
14572 + struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL;
14573 + unsigned int mss = 0, mss_lowprio = 0, mss_backup = 0;
14574 + u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff;
14575 + int cnt_backups = 0;
14577 + /* if there is only one subflow, bypass the scheduling function */
14578 + if (mpcb->cnt_subflows == 1) {
14579 + bestsk = (struct sock *)mpcb->connection_list;
14580 + if (!mptcp_is_available(bestsk, skb, mss_now))
14581 + bestsk = NULL;
14582 + return bestsk;
14585 + /* Answer data_fin on same subflow!!! */
14586 + if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
14587 + skb && mptcp_is_data_fin(skb)) {
14588 + mptcp_for_each_sk(mpcb, sk) {
14589 + if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
14590 + mptcp_is_available(sk, skb, mss_now))
14591 + return sk;
14595 + /* First, find the best subflow */
14596 + mptcp_for_each_sk(mpcb, sk) {
14597 + struct tcp_sock *tp = tcp_sk(sk);
14598 + int this_mss;
14600 + if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio)
14601 + cnt_backups++;
14603 + if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
14604 + tp->srtt < lowprio_min_time_to_peer) {
14606 + if (!mptcp_is_available(sk, skb, &this_mss))
14607 + continue;
14609 + if (mptcp_dont_reinject_skb(tp, skb)) {
14610 + mss_backup = this_mss;
14611 + backupsk = sk;
14612 + continue;
14615 + lowprio_min_time_to_peer = tp->srtt;
14616 + lowpriosk = sk;
14617 + mss_lowprio = this_mss;
14618 + } else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
14619 + tp->srtt < min_time_to_peer) {
14620 + if (!mptcp_is_available(sk, skb, &this_mss))
14621 + continue;
14623 + if (mptcp_dont_reinject_skb(tp, skb)) {
14624 + mss_backup = this_mss;
14625 + backupsk = sk;
14626 + continue;
14629 + min_time_to_peer = tp->srtt;
14630 + bestsk = sk;
14631 + mss = this_mss;
14635 + if (mpcb->cnt_established == cnt_backups && lowpriosk) {
14636 + mss = mss_lowprio;
14637 + sk = lowpriosk;
14638 + } else if (bestsk) {
14639 + sk = bestsk;
14640 + } else if (backupsk){
14641 + /* It has been sent on all subflows once - let's give it a
14642 + * chance again by restarting its pathmask.
14643 + */
14644 + if (skb)
14645 + TCP_SKB_CB(skb)->path_mask = 0;
14646 + mss = mss_backup;
14647 + sk = backupsk;
14650 + if (mss_now)
14651 + *mss_now = mss;
14653 + return sk;
14656 +static struct mp_dss *mptcp_skb_find_dss(const struct sk_buff *skb)
14658 + if (!mptcp_is_data_seq(skb))
14659 + return NULL;
14661 + return (struct mp_dss *)(skb->data - (MPTCP_SUB_LEN_DSS_ALIGN +
14662 + MPTCP_SUB_LEN_ACK_ALIGN +
14663 + MPTCP_SUB_LEN_SEQ_ALIGN));
14666 +/* get the data-seq and end-data-seq and store them again in the
14667 + * tcp_skb_cb
14668 + */
14669 +static int mptcp_reconstruct_mapping(struct sk_buff *skb, struct sk_buff *orig_skb)
14671 + struct mp_dss *mpdss = mptcp_skb_find_dss(orig_skb);
14672 + u32 *p32;
14673 + u16 *p16;
14675 + if (!mpdss || !mpdss->M)
14676 + return 1;
14678 + /* Move the pointer to the data-seq */
14679 + p32 = (u32 *)mpdss;
14680 + p32++;
14681 + if (mpdss->A) {
14682 + p32++;
14683 + if (mpdss->a)
14684 + p32++;
14687 + TCP_SKB_CB(skb)->seq = ntohl(*p32);
14689 + /* Get the data_len to calculate the end_data_seq */
14690 + p32++;
14691 + p32++;
14692 + p16 = (u16 *)p32;
14693 + TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq;
14695 + return 0;
14698 +/* Similar to __pskb_copy and sk_stream_alloc_skb. */
14699 +static struct sk_buff *mptcp_pskb_copy(struct sk_buff *skb)
14701 + struct sk_buff *n;
14702 + /* The TCP header must be at least 32-bit aligned. */
14703 + int size = ALIGN(skb_headlen(skb), 4);
14705 + n = alloc_skb_fclone(size + MAX_TCP_HEADER, GFP_ATOMIC);
14706 + if (!n)
14707 + return NULL;
14709 + /* Set the data pointer */
14710 + skb_reserve(n, MAX_TCP_HEADER);
14711 + /* Set the tail pointer and length */
14712 + skb_put(n, skb_headlen(skb));
14713 + /* Copy the bytes */
14714 + skb_copy_from_linear_data(skb, n->data, n->len);
14716 + n->truesize += skb->data_len;
14717 + n->data_len = skb->data_len;
14718 + n->len = skb->len;
14720 + if (skb_shinfo(skb)->nr_frags) {
14721 + int i;
14723 + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
14724 + if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
14725 + kfree_skb(n);
14726 + n = NULL;
14727 + goto out;
14730 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
14731 + skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
14732 + skb_frag_ref(skb, i);
14734 + skb_shinfo(n)->nr_frags = i;
14737 + if (skb_has_frag_list(skb)) {
14738 + skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
14739 + skb_clone_fraglist(n);
14742 + copy_skb_header(n, skb);
14743 +out:
14744 + return n;
14747 +/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are
14748 + * coming from the meta-retransmit-timer
14749 + */
14750 +static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk,
14751 + struct sock *sk, int clone_it)
14753 + struct sk_buff *skb, *skb1;
14754 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14755 + struct mptcp_cb *mpcb = meta_tp->mpcb;
14756 + u32 seq, end_seq;
14758 + if (clone_it) {
14759 + /* pskb_copy is necessary here, because the TCP/IP-headers
14760 + * will be changed when it's going to be reinjected on another
14761 + * subflow.
14762 + */
14763 + skb = mptcp_pskb_copy(orig_skb);
14764 + } else {
14765 + __skb_unlink(orig_skb, &sk->sk_write_queue);
14766 + sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
14767 + sk->sk_wmem_queued -= orig_skb->truesize;
14768 + sk_mem_uncharge(sk, orig_skb->truesize);
14769 + skb = orig_skb;
14771 + if (unlikely(!skb))
14772 + return;
14774 + if (sk && mptcp_reconstruct_mapping(skb, orig_skb)) {
14775 + __kfree_skb(skb);
14776 + return;
14779 + skb->sk = meta_sk;
14781 + /* If it has already reached the destination, we don't have to reinject it */
14782 + if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
14783 + __kfree_skb(skb);
14784 + return;
14787 + /* Only reinject segments that are fully covered by the mapping */
14788 + if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) !=
14789 + TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
14790 + u32 seq = TCP_SKB_CB(skb)->seq;
14791 + u32 end_seq = TCP_SKB_CB(skb)->end_seq;
14793 + __kfree_skb(skb);
14795 + /* Ok, now we have to look for the full mapping in the meta
14796 + * send-queue :S
14797 + */
14798 + tcp_for_write_queue(skb, meta_sk) {
14799 + /* Not yet at the mapping? */
14800 + if (before(TCP_SKB_CB(skb)->seq, seq))
14801 + continue;
14802 + /* We have passed by the mapping */
14803 + if (after(TCP_SKB_CB(skb)->end_seq, end_seq))
14804 + return;
14806 + __mptcp_reinject_data(skb, meta_sk, NULL, 1);
14808 + return;
14811 + /* If it's empty, just add */
14812 + if (skb_queue_empty(&mpcb->reinject_queue)) {
14813 + skb_queue_head(&mpcb->reinject_queue, skb);
14814 + return;
14817 + /* Find the place to insert skb - or we may even 'drop' it, if the
14818 + * data is already covered by other skbs in the reinject-queue.
14820 + * This is inspired by code from tcp_data_queue.
14821 + */
14823 + skb1 = skb_peek_tail(&mpcb->reinject_queue);
14824 + seq = TCP_SKB_CB(skb)->seq;
14825 + while (1) {
14826 + if (!after(TCP_SKB_CB(skb1)->seq, seq))
14827 + break;
14828 + if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) {
14829 + skb1 = NULL;
14830 + break;
14832 + skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
14835 + /* Does skb overlap the previous one? */
14836 + end_seq = TCP_SKB_CB(skb)->end_seq;
14837 + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
14838 + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
14839 + /* All the bits are present. Don't reinject */
14840 + __kfree_skb(skb);
14841 + return;
14843 + if (seq == TCP_SKB_CB(skb1)->seq) {
14844 + if (skb_queue_is_first(&mpcb->reinject_queue, skb1))
14845 + skb1 = NULL;
14846 + else
14847 + skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
14850 + if (!skb1)
14851 + __skb_queue_head(&mpcb->reinject_queue, skb);
14852 + else
14853 + __skb_queue_after(&mpcb->reinject_queue, skb1, skb);
14855 + /* And clean segments covered by new one as whole. */
14856 + while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) {
14857 + skb1 = skb_queue_next(&mpcb->reinject_queue, skb);
14859 + if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
14860 + break;
14862 + __skb_unlink(skb1, &mpcb->reinject_queue);
14863 + __kfree_skb(skb1);
14865 + return;
14868 +/* Inserts data into the reinject queue */
14869 +void mptcp_reinject_data(struct sock *sk, int clone_it)
14871 + struct sk_buff *skb_it, *tmp;
14872 + struct tcp_sock *tp = tcp_sk(sk);
14873 + struct sock *meta_sk = tp->meta_sk;
14875 + /* It has already been closed - there is really no point in reinjecting */
14876 + if (meta_sk->sk_state == TCP_CLOSE)
14877 + return;
14879 + skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) {
14880 + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it);
14881 + /* Subflow SYNs and FINs are not reinjected,
14883 + * nor are empty subflow-FINs carrying a data-fin.
14884 + * The latter are reinjected below (without the subflow-FIN flag).
14885 + */
14886 + if (tcb->tcp_flags & TCPHDR_SYN ||
14887 + (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) ||
14888 + (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len))
14889 + continue;
14891 + __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it);
14894 + skb_it = tcp_write_queue_tail(meta_sk);
14895 + /* If sk has sent the empty data-fin, we have to reinject it too. */
14896 + if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 &&
14897 + TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) {
14898 + __mptcp_reinject_data(skb_it, meta_sk, NULL, 1);
14901 + mptcp_push_pending_frames(meta_sk);
14903 + tp->pf = 1;
14905 +EXPORT_SYMBOL(mptcp_reinject_data);
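+/* Decide whether the DATA_FIN in skb may be combined with a subflow FIN on
+ * subsk: we combine when in infinite-mapping mode, when no other subflow
+ * still has data queued, or when all data has already been DATA_ACKed.
+ */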
14907 +static void mptcp_combine_dfin(struct sk_buff *skb, struct sock *meta_sk,
14908 + struct sock *subsk)
14910 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14911 + struct mptcp_cb *mpcb = meta_tp->mpcb;
14912 + struct sock *sk_it;
14913 + int all_empty = 1, all_acked;
14915 + /* In infinite mapping we always try to combine */
14916 + if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) {
14917 + subsk->sk_shutdown |= SEND_SHUTDOWN;
14918 + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
14919 + return;
14922 + /* Don't combine if the peer didn't combine - otherwise we end up in
14923 + * TIME_WAIT, even if our app is smart enough to avoid it.
14924 + */
14925 + if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
14926 + if (!mpcb->dfin_combined)
14927 + return;
14930 + /* If no other subflow has data to send, we can combine */
14931 + mptcp_for_each_sk(mpcb, sk_it) {
14932 + if (!mptcp_sk_can_send(sk_it))
14933 + continue;
14935 + if (!tcp_write_queue_empty(sk_it))
14936 + all_empty = 0;
14939 + /* If all data has been DATA_ACKed, we can combine.
14940 + * -1, because the data_fin consumed one byte
14941 + */
14942 + all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1));
14944 + if ((all_empty || all_acked) && tcp_close_state(subsk)) {
14945 + subsk->sk_shutdown |= SEND_SHUTDOWN;
14946 + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
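+/* Copy the meta-level skb, write the DSS mapping into the copy's header space
+ * and queue it on the subflow send-queue. Returns the subflow skb, or NULL if
+ * the copy could not be allocated.
+ */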
14950 +static struct sk_buff *mptcp_skb_entail(struct sock *sk, struct sk_buff *skb,
14951 + int reinject)
14953 + __be32 *ptr;
14954 + __u16 data_len;
14955 + struct mp_dss *mdss;
14956 + struct tcp_sock *tp = tcp_sk(sk);
14957 + struct sock *meta_sk = mptcp_meta_sk(sk);
14958 + struct mptcp_cb *mpcb = tp->mpcb;
14959 + struct tcp_skb_cb *tcb;
14960 + struct sk_buff *subskb = NULL;
14962 + if (!reinject)
14963 + TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ?
14964 + MPTCPHDR_SEQ64_INDEX : 0);
14966 + subskb = mptcp_pskb_copy(skb);
14967 + if (!subskb)
14968 + return NULL;
14970 + TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index);
14972 + if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) &&
14973 + skb->ip_summed == CHECKSUM_PARTIAL) {
14974 + subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0);
14975 + subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE;
14978 + /* The subskb is going into the subflow send-queue. Its path-mask
14979 + * is not needed anymore and MUST be set to 0, as the path-mask
14980 + * is a union with inet_skb_param.
14981 + */
14982 + tcb = TCP_SKB_CB(subskb);
14983 + tcb->path_mask = 0;
14985 + if (mptcp_is_data_fin(subskb))
14986 + mptcp_combine_dfin(subskb, meta_sk, sk);
14988 + if (tp->mpcb->infinite_mapping_snd)
14989 + goto no_data_seq;
14991 + if (tp->mpcb->send_infinite_mapping &&
14992 + !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) {
14993 + tp->mptcp->fully_established = 1;
14994 + tp->mpcb->infinite_mapping_snd = 1;
14995 + tp->mptcp->infinite_cutoff_seq = tp->write_seq;
14996 + tcb->mptcp_flags |= MPTCPHDR_INF;
14997 + data_len = 0;
14998 + } else {
14999 + data_len = tcb->end_seq - tcb->seq;
15002 + /**** Write MPTCP DSS-option to the packet. ****/
15003 + ptr = (__be32 *)(subskb->data - (MPTCP_SUB_LEN_DSS_ALIGN +
15004 + MPTCP_SUB_LEN_ACK_ALIGN +
15005 + MPTCP_SUB_LEN_SEQ_ALIGN));
15007 + /* Then we start writing it from the start */
15008 + mdss = (struct mp_dss *)ptr;
15010 + mdss->kind = TCPOPT_MPTCP;
15011 + mdss->sub = MPTCP_SUB_DSS;
15012 + mdss->rsv1 = 0;
15013 + mdss->rsv2 = 0;
15014 + mdss->F = (mptcp_is_data_fin(subskb) ? 1 : 0);
15015 + mdss->m = 0;
15016 + mdss->M = 1;
15017 + mdss->a = 0;
15018 + mdss->A = 1;
15019 + mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
15021 + ptr++;
15022 + ptr++; /* data_ack will be set in mptcp_options_write */
15023 + *ptr++ = htonl(tcb->seq); /* data_seq */
15025 + /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
15026 + if (mptcp_is_data_fin(subskb) && subskb->len == 0)
15027 + *ptr++ = 0; /* subseq */
15028 + else
15029 + *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */
15031 + if (tp->mpcb->dss_csum && data_len) {
15032 + __be16 *p16 = (__be16 *)ptr;
15033 + __be32 hdseq = mptcp_get_highorder_sndbits(subskb, tp->mpcb);
15034 + __wsum csum;
15035 + *ptr = htonl(((data_len) << 16) |
15036 + (TCPOPT_EOL << 8) |
15037 + (TCPOPT_EOL));
15039 + csum = csum_partial(ptr - 2, 12, subskb->csum);
15040 + p16++;
15041 + *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
15042 + } else {
15043 + *ptr++ = htonl(((data_len) << 16) |
15044 + (TCPOPT_NOP << 8) |
15045 + (TCPOPT_NOP));
15048 +no_data_seq:
15049 + tcb->seq = tp->write_seq;
15050 + tcb->sacked = 0; /* reset the sacked field: from the point of view
15051 + * of this subflow, we are sending a brand new
15052 + * segment */
15053 + /* Take into account seg len */
15054 + tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0);
15055 + tcb->end_seq = tp->write_seq;
15057 + /* If it's a non-payload DATA_FIN (and no subflow-FIN either), the
15058 + * segment is not part of the subflow but exists only at the meta-level.
15059 + */
15060 + if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) {
15061 + tcp_add_write_queue_tail(sk, subskb);
15062 + sk->sk_wmem_queued += subskb->truesize;
15063 + sk_mem_charge(sk, subskb->truesize);
15066 + return subskb;
15069 +static void mptcp_sub_event_new_data_sent(struct sock *sk,
15070 + struct sk_buff *subskb,
15071 + struct sk_buff *skb)
15073 + /* If it's a non-payload DATA_FIN (and no subflow-FIN either), the
15074 + * segment is not part of the subflow but exists only at the meta-level.
15076 + * We free it, because it has not been queued anywhere.
15077 + */
15078 + if (!mptcp_is_data_fin(subskb) ||
15079 + (TCP_SKB_CB(subskb)->end_seq != TCP_SKB_CB(subskb)->seq)) {
15080 + tcp_event_new_data_sent(sk, subskb);
15081 + tcp_sk(sk)->mptcp->second_packet = 1;
15082 + tcp_sk(sk)->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq;
15083 + } else {
15084 + kfree_skb(subskb);
15088 +/* Handle the packets and sockets after a tcp_transmit_skb failed */
15089 +static void mptcp_transmit_skb_failed(struct sock *sk, struct sk_buff *skb,
15090 + struct sk_buff *subskb)
15092 + struct tcp_sock *tp = tcp_sk(sk);
15093 + struct mptcp_cb *mpcb = tp->mpcb;
15095 + /* No work to do if we are in infinite-mapping mode:
15096 + * there is only one subflow left and we cannot send this segment on
15097 + * another subflow.
15098 + */
15099 + if (mpcb->infinite_mapping_snd)
15100 + return;
15102 + TCP_SKB_CB(skb)->path_mask &= ~mptcp_pi_to_flag(tp->mptcp->path_index);
15104 + if (TCP_SKB_CB(subskb)->tcp_flags & TCPHDR_FIN) {
15105 + /* If it is a subflow-fin we must leave it on the
15106 + * subflow-send-queue, so that the probe-timer
15107 + * can retransmit it.
15108 + */
15109 + if (!tp->packets_out && !inet_csk(sk)->icsk_pending)
15110 + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
15111 + inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
15112 + } else if (mptcp_is_data_fin(subskb) &&
15113 + TCP_SKB_CB(subskb)->end_seq == TCP_SKB_CB(subskb)->seq) {
15114 + /* An empty data-fin has not been enqueued on the subflow
15115 + * and thus we free it.
15116 + */
15118 + kfree_skb(subskb);
15119 + } else {
15120 + /* In all other cases we remove it from the sub-queue.
15121 + * Other subflows may send it, or the probe-timer will
15122 + * handle it.
15123 + */
15124 + tcp_advance_send_head(sk, subskb);
15126 + /* tcp_add_write_queue_tail initialized highest_sack. We have
15127 + * to reset it, if necessary.
15128 + */
15129 + if (tp->highest_sack == subskb)
15130 + tp->highest_sack = NULL;
15132 + tcp_unlink_write_queue(subskb, sk);
15133 + tp->write_seq -= subskb->len;
15134 + sk_wmem_free_skb(sk, subskb);
15138 +/* Function to create two new TCP segments. Shrinks the given segment
15139 + * to the specified size and appends a new segment with the rest of the
15140 + * packet to the list. This won't be called frequently, I hope.
15141 + * Remember, these are still headerless SKBs at this point.
15142 + */
15143 +int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
15144 + unsigned int mss_now, int reinject)
15146 + struct tcp_sock *tp = tcp_sk(sk);
15147 + struct sk_buff *buff;
15148 + int nsize, old_factor;
15149 + int nlen;
15150 + u8 flags;
15151 + int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN +
15152 + MPTCP_SUB_LEN_SEQ_ALIGN;
15153 + char dss[MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN +
15154 + MPTCP_SUB_LEN_SEQ_ALIGN];
15156 + if (WARN_ON(len > skb->len))
15157 + return -EINVAL;
15159 + /* DSS-option must be recovered afterwards. */
15160 + if (!is_meta_sk(sk))
15161 + memcpy(dss, skb->data - dsslen, dsslen);
15163 + nsize = skb_headlen(skb) - len;
15164 + if (nsize < 0)
15165 + nsize = 0;
15167 + if (skb_cloned(skb)) {
15168 + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
15169 + return -ENOMEM;
15170 + /* Recover dss-option */
15171 + if (!is_meta_sk(sk))
15172 + memcpy(skb->data - dsslen, dss, dsslen);
15175 + /* Get a new skb... force flag on. */
15176 + buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
15177 + if (buff == NULL)
15178 + return -ENOMEM; /* We'll just try again later. */
15180 + /* See below - if reinject == 1, the buff will be added to the reinject-
15181 + * queue, which is currently not part of the memory-accounting.
15182 + */
15183 + if (reinject != 1) {
15184 + sk->sk_wmem_queued += buff->truesize;
15185 + sk_mem_charge(sk, buff->truesize);
15187 + nlen = skb->len - len - nsize;
15188 + buff->truesize += nlen;
15189 + skb->truesize -= nlen;
15191 + /* Correct the sequence numbers. */
15192 + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
15193 + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
15194 + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
15196 + /* PSH and FIN should only be set in the second packet. */
15197 + flags = TCP_SKB_CB(skb)->tcp_flags;
15198 + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
15199 + TCP_SKB_CB(buff)->tcp_flags = flags;
15200 + TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
15202 + flags = TCP_SKB_CB(skb)->mptcp_flags;
15203 + TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
15204 + TCP_SKB_CB(buff)->mptcp_flags = flags;
15206 + if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
15207 + /* Copy and checksum data tail into the new buffer. */
15208 + buff->csum = csum_partial_copy_nocheck(skb->data + len,
15209 + skb_put(buff, nsize),
15210 + nsize, 0);
15212 + skb_trim(skb, len);
15214 + skb->csum = csum_block_sub(skb->csum, buff->csum, len);
15215 + } else {
15216 + skb->ip_summed = CHECKSUM_PARTIAL;
15217 + skb_split(skb, buff, len);
15220 + /* We lost the dss-option when creating buff - put it back! */
15221 + if (!is_meta_sk(sk))
15222 + memcpy(buff->data - dsslen, dss, dsslen);
15224 + buff->ip_summed = skb->ip_summed;
15226 + /* Looks stupid, but our code really uses when of
15227 + * skbs, which it never sent before. --ANK
15228 + */
15229 + TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
15230 + buff->tstamp = skb->tstamp;
15232 + old_factor = tcp_skb_pcount(skb);
15234 + /* Fix up tso_factor for both original and new SKB. */
15235 + tcp_set_skb_tso_segs(sk, skb, mss_now);
15236 + tcp_set_skb_tso_segs(sk, buff, mss_now);
15238 + /* If this packet has been sent out already, we must
15239 + * adjust the various packet counters.
15240 + */
15241 + if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) {
15242 + int diff = old_factor - tcp_skb_pcount(skb) -
15243 + tcp_skb_pcount(buff);
15245 + if (diff)
15246 + tcp_adjust_pcount(sk, skb, diff);
15249 + /* Link BUFF into the send queue. */
15250 + skb_header_release(buff);
15251 + if (reinject == 1)
15252 + __skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff);
15253 + else
15254 + tcp_insert_write_queue_after(skb, buff, sk);
15256 + return 0;
15259 +int mptso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
15260 + unsigned int mss_now, gfp_t gfp, int reinject)
15262 + struct sk_buff *buff;
15263 + int nlen = skb->len - len, old_factor;
15264 + u8 flags;
15265 + int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN +
15266 + MPTCP_SUB_LEN_SEQ_ALIGN;
15268 + /* All of a TSO frame must be composed of paged data. */
15269 + if (skb->len != skb->data_len)
15270 + return mptcp_fragment(sk, skb, len, mss_now, reinject);
15272 + buff = sk_stream_alloc_skb(sk, 0, gfp);
15273 + if (unlikely(buff == NULL))
15274 + return -ENOMEM;
15276 + /* See below - if reinject == 1, the buff will be added to the reinject-
15277 + * queue, which is currently not part of the memory-accounting.
15278 + */
15279 + if (reinject != 1) {
15280 + sk->sk_wmem_queued += buff->truesize;
15281 + sk_mem_charge(sk, buff->truesize);
15283 + buff->truesize += nlen;
15284 + skb->truesize -= nlen;
15286 + /* Correct the sequence numbers. */
15287 + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
15288 + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
15289 + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
15291 + /* PSH and FIN should only be set in the second packet. */
15292 + flags = TCP_SKB_CB(skb)->tcp_flags;
15293 + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
15294 + TCP_SKB_CB(buff)->tcp_flags = flags;
15296 + flags = TCP_SKB_CB(skb)->mptcp_flags;
15297 + TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
15298 + TCP_SKB_CB(buff)->mptcp_flags = flags;
15300 + /* This packet was never sent out yet, so no SACK bits. */
15301 + TCP_SKB_CB(buff)->sacked = 0;
15303 + buff->ip_summed = CHECKSUM_PARTIAL;
15304 + skb->ip_summed = CHECKSUM_PARTIAL;
15305 + skb_split(skb, buff, len);
15307 + /* We lost the dss-option when creating buff - put it back! */
15308 + if (!is_meta_sk(sk))
15309 + memcpy(buff->data - dsslen, skb->data - dsslen, dsslen);
15311 + old_factor = tcp_skb_pcount(skb);
15313 + /* Fix up tso_factor for both original and new SKB. */
15314 + tcp_set_skb_tso_segs(sk, skb, mss_now);
15315 + tcp_set_skb_tso_segs(sk, buff, mss_now);
15317 + /* If this packet has been sent out already, we must
15318 + * adjust the various packet counters.
15319 + */
15320 + if (!before(tcp_sk(sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) {
15321 + int diff = old_factor - tcp_skb_pcount(skb) -
15322 + tcp_skb_pcount(buff);
15324 + if (diff)
15325 + tcp_adjust_pcount(sk, skb, diff);
15328 + /* Link BUFF into the send queue. */
15329 + skb_header_release(buff);
15330 + if (reinject == 1)
15331 + __skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff);
15332 + else
15333 + tcp_insert_write_queue_after(skb, buff, sk);
15335 + return 0;
15338 +/* Inspired by tcp_write_wakeup */
15339 +int mptcp_write_wakeup(struct sock *meta_sk)
15341 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
15342 + struct sk_buff *skb, *subskb;
15344 + skb = tcp_send_head(meta_sk);
15345 + if (skb &&
15346 + before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) {
15347 + int err;
15348 + unsigned int mss;
15349 + unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq;
15350 + struct sock *subsk = get_available_subflow(meta_sk, skb, &mss);
15351 + if (!subsk)
15352 + return -1;
15354 + if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
15355 + meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
15357 + /* We are probing the opening of a window
15358 + * but the window size is != 0
15359 + * must have been a result of SWS avoidance (sender)
15360 + */
15361 + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
15362 + skb->len > mss) {
15363 + seg_size = min(seg_size, mss);
15364 + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
15365 + if (mptcp_fragment(meta_sk, skb, seg_size, mss, 0))
15366 + return -1;
15367 + } else if (!tcp_skb_pcount(skb)) {
15368 + tcp_set_skb_tso_segs(meta_sk, skb, mss);
15371 + subskb = mptcp_skb_entail(subsk, skb, 0);
15372 + if (!subskb)
15373 + return -1;
15375 + TCP_SKB_CB(subskb)->tcp_flags |= TCPHDR_PSH;
15376 + TCP_SKB_CB(skb)->when = tcp_time_stamp;
15377 + TCP_SKB_CB(subskb)->when = tcp_time_stamp;
15378 + err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC);
15379 + if (unlikely(err)) {
15380 + mptcp_transmit_skb_failed(subsk, skb, subskb);
15381 + return err;
15384 + mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq -
15385 + TCP_SKB_CB(skb)->seq);
15386 + tcp_event_new_data_sent(meta_sk, skb);
15387 + mptcp_sub_event_new_data_sent(subsk, subskb, skb);
15389 + return 0;
15390 + } else {
15391 + struct sock *sk_it;
15392 + int ans = 0;
15394 + if (between(meta_tp->snd_up, meta_tp->snd_una + 1,
15395 + meta_tp->snd_una + 0xFFFF)) {
15396 + mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
15397 + if (mptcp_sk_can_send_ack(sk_it))
15398 + tcp_xmit_probe_skb(sk_it, 1);
15402 + /* At least one of the tcp_xmit_probe_skb's has to succeed */
15403 + mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
15404 + int ret;
15406 + if (!mptcp_sk_can_send_ack(sk_it))
15407 + continue;
15409 + ret = tcp_xmit_probe_skb(sk_it, 0);
15410 + if (unlikely(ret > 0))
15411 + ans = ret;
15413 + return ans;
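+/* Look up the segment with the same data-seq in the meta write-queue and copy
+ * its path_mask, so the scheduler knows on which subflows it was already sent.
+ */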
15417 +static void mptcp_find_and_set_pathmask(struct sock *meta_sk, struct sk_buff *skb)
15419 + struct sk_buff *skb_it;
15421 + skb_it = tcp_write_queue_head(meta_sk);
15423 + tcp_for_write_queue_from(skb_it, meta_sk) {
15424 + if (skb_it == tcp_send_head(meta_sk))
15425 + break;
15427 + if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) {
15428 + TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask;
15429 + break;
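+/* Receive-buffer optimization: if the head of the meta write-queue is stuck on
+ * a slower subflow, optionally penalize (halve the cwnd of) that subflow and
+ * return the head so it can be retransmitted on the current one.
+ */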
15434 +static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
15436 + struct sock *meta_sk;
15437 + struct tcp_sock *tp = tcp_sk(sk), *tp_it;
15438 + struct sk_buff *skb_head;
15440 + if (tp->mpcb->cnt_subflows == 1)
15441 + return NULL;
15443 + meta_sk = mptcp_meta_sk(sk);
15444 + skb_head = tcp_write_queue_head(meta_sk);
15446 + if (!skb_head || skb_head == tcp_send_head(meta_sk))
15447 + return NULL;
15449 + /* If penalization is optional (coming from mptcp_next_segment()) and
15450 + * we are not send-buffer-limited, we do not penalize. The retransmission
15451 + * is just an optimization to fix the idle-time due to the delay before
15452 + * we wake up the application.
15453 + */
15454 + if (!penal && sk_stream_memory_free(meta_sk))
15455 + goto retrans;
15457 + /* Only penalize again after an RTT has elapsed */
15458 + if (tcp_time_stamp - tp->mptcp->last_rbuf_opti < tp->srtt >> 3)
15459 + goto retrans;
15461 + /* Half the cwnd of the slow flow */
15462 + mptcp_for_each_tp(tp->mpcb, tp_it) {
15463 + if (tp_it != tp &&
15464 + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
15465 + if (tp->srtt < tp_it->srtt && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
15466 + tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
15467 + if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH)
15468 + tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
15470 + tp->mptcp->last_rbuf_opti = tcp_time_stamp;
15472 + break;
15476 +retrans:
15478 + /* Segment not yet injected into this path? Take it!!! */
15479 + if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
15480 + bool do_retrans = false;
15481 + mptcp_for_each_tp(tp->mpcb, tp_it) {
15482 + if (tp_it != tp &&
15483 + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
15484 + if (tp_it->snd_cwnd <= 4) {
15485 + do_retrans = true;
15486 + break;
15489 + if (4 * tp->srtt >= tp_it->srtt) {
15490 + do_retrans = false;
15491 + break;
15492 + } else {
15493 + do_retrans = true;
15498 + if (do_retrans)
15499 + return skb_head;
15501 + return NULL;
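+/* Meta-level transmit loop: pick the next segment (regular or reinjected),
+ * ask the scheduler for a subflow, entail a per-subflow copy carrying the DSS
+ * mapping and transmit it, until the window or the scheduler stops us.
+ */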
15504 +int mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle,
15505 + int push_one, gfp_t gfp)
15507 + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp;
15508 + struct sock *subsk;
15509 + struct mptcp_cb *mpcb = meta_tp->mpcb;
15510 + struct sk_buff *skb;
15511 + unsigned int tso_segs, old_factor, sent_pkts;
15512 + int cwnd_quota;
15513 + int result;
15514 + int reinject = 0;
15516 + sent_pkts = 0;
15518 + /* Currently mtu-probing is not done in MPTCP */
15519 + if (!push_one && 0) {
15520 + /* Do MTU probing. */
15521 + result = tcp_mtu_probe(meta_sk);
15522 + if (!result)
15523 + return 0;
15524 + else if (result > 0)
15525 + sent_pkts = 1;
15528 + while ((skb = mptcp_next_segment(meta_sk, &reinject))) {
15529 + unsigned int limit;
15530 + struct sk_buff *subskb = NULL;
15531 + u32 noneligible = mpcb->noneligible;
15533 + if (reinject == 1) {
15534 + if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
15535 + /* Segment already reached the peer, take the next one */
15536 + __skb_unlink(skb, &mpcb->reinject_queue);
15537 + __kfree_skb(skb);
15538 + continue;
15541 + /* Reinjection and it is coming from a subflow? We need
15542 + * to find out the path-mask from the meta-write-queue
15543 + * to properly select a subflow.
15544 + */
15545 + if (!TCP_SKB_CB(skb)->path_mask)
15546 + mptcp_find_and_set_pathmask(meta_sk, skb);
15549 +subflow:
15550 + subsk = get_available_subflow(meta_sk, skb, &mss_now);
15551 + if (!subsk)
15552 + break;
15553 + subtp = tcp_sk(subsk);
15555 + /* Since all subsocks are locked before calling the scheduler,
15556 + * the tcp_send_head should not change.
15557 + */
15558 + BUG_ON(!reinject && tcp_send_head(meta_sk) != skb);
15559 +retry:
15560 + /* If the segment was cloned (e.g. a meta retransmission),
15561 + * the header must be expanded/copied so that there is no
15562 + * corruption of TSO information.
15563 + */
15564 + if (skb_unclone(skb, GFP_ATOMIC))
15565 + break;
15567 + old_factor = tcp_skb_pcount(skb);
15568 + tcp_set_skb_tso_segs(meta_sk, skb, mss_now);
15569 + tso_segs = tcp_skb_pcount(skb);
15571 + if (reinject == -1) {
15572 + /* The packet has already been sent once, so if we
15573 + * change the pcount here we have to adjust packets_out
15574 + * in the meta-sk
15575 + */
15576 + int diff = old_factor - tso_segs;
15578 + if (diff)
15579 + tcp_adjust_pcount(meta_sk, skb, diff);
15582 + cwnd_quota = tcp_cwnd_test(subtp, skb);
15583 + if (!cwnd_quota) {
15584 + /* May happen due to two cases:
15586 + * - if at the first selection we circumvented
15587 + * the test due to a DATA_FIN (and got rejected at
15588 + * tcp_snd_wnd_test), but the reinjected segment is not
15589 + * a DATA_FIN.
15590 + * - if we take a DATA_FIN with data, but
15591 + * tcp_set_skb_tso_segs() increases the number of
15592 + * tso_segs to something > 1. Then, cwnd_test might
15593 + * reject it.
15594 + */
15595 + mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index);
15596 + continue;
15599 + if (!reinject && unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now))) {
15600 + skb = mptcp_rcv_buf_optimization(subsk, 1);
15601 + if (skb) {
15602 + reinject = -1;
15603 + goto retry;
15605 + break;
15608 + if (tso_segs == 1) {
15609 + if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now,
15610 + (tcp_skb_is_last(meta_sk, skb) ?
15611 + nonagle : TCP_NAGLE_PUSH))))
15612 + break;
15613 + } else {
15614 + /* Do not try to defer the transmission of a reinjected
15615 + * segment. Send it directly.
15616 + * If it is not possible to send the TSO segment on the
15617 + * best subflow right now, try to look for another subflow.
15618 + * If there is no subflow available, defer the segment to avoid
15619 + * the call to mptso_fragment.
15620 + */
15621 + if (!push_one && !reinject && tcp_tso_should_defer(subsk, skb)) {
15622 + mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index);
15623 + goto subflow;
15627 + limit = mss_now;
15628 + if (tso_segs > 1 && !tcp_urg_mode(meta_tp))
15629 + limit = tcp_mss_split_point(subsk, skb, mss_now,
15630 + min_t(unsigned int,
15631 + cwnd_quota,
15632 + subsk->sk_gso_max_segs),
15633 + nonagle);
15635 + if (skb->len > limit &&
15636 + unlikely(mptso_fragment(meta_sk, skb, limit, mss_now, gfp, reinject)))
15637 + break;
15639 + subskb = mptcp_skb_entail(subsk, skb, reinject);
15640 + if (!subskb)
15641 + break;
15643 + mpcb->noneligible = noneligible;
15644 + TCP_SKB_CB(skb)->when = tcp_time_stamp;
15645 + TCP_SKB_CB(subskb)->when = tcp_time_stamp;
15646 + if (unlikely(tcp_transmit_skb(subsk, subskb, 1, gfp))) {
15647 + mptcp_transmit_skb_failed(subsk, skb, subskb);
15648 + mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index);
15649 + continue;
15652 + if (!reinject) {
15653 + mptcp_check_sndseq_wrap(meta_tp,
15654 + TCP_SKB_CB(skb)->end_seq -
15655 + TCP_SKB_CB(skb)->seq);
15656 + tcp_event_new_data_sent(meta_sk, skb);
15659 + tcp_minshall_update(meta_tp, mss_now, skb);
15660 + sent_pkts += tcp_skb_pcount(skb);
15661 + tcp_sk(subsk)->mptcp->sent_pkts += tcp_skb_pcount(skb);
15663 + mptcp_sub_event_new_data_sent(subsk, subskb, skb);
15665 + if (reinject > 0) {
15666 + __skb_unlink(skb, &mpcb->reinject_queue);
15667 + kfree_skb(skb);
15670 + if (push_one)
15671 + break;
15674 + mpcb->noneligible = 0;
15676 + if (likely(sent_pkts)) {
15677 + mptcp_for_each_sk(mpcb, subsk) {
15678 + subtp = tcp_sk(subsk);
15679 + if (subtp->mptcp->sent_pkts) {
15680 + if (tcp_in_cwnd_reduction(subsk))
15681 + subtp->prr_out += subtp->mptcp->sent_pkts;
15682 + tcp_cwnd_validate(subsk);
15683 + subtp->mptcp->sent_pkts = 0;
15686 + return 0;
15689 + return !meta_tp->packets_out && tcp_send_head(meta_sk);
15692 +void mptcp_write_space(struct sock *sk)
15694 + mptcp_push_pending_frames(mptcp_meta_sk(sk));
15697 +u32 __mptcp_select_window(struct sock *sk)
15699 + struct inet_connection_sock *icsk = inet_csk(sk);
15700 + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
15701 + int mss, free_space, full_space, window;
15703 + /* MSS for the peer's data. Previous versions used mss_clamp
15704 + * here. I don't know if the value based on our guesses
15705 + * of peer's MSS is better for the performance. It's more correct
15706 + * but may be worse for the performance because of rcv_mss
15707 + * fluctuations. --SAW 1998/11/1
15708 + */
15709 + mss = icsk->icsk_ack.rcv_mss;
15710 + free_space = tcp_space(sk);
15711 + full_space = min_t(int, meta_tp->window_clamp,
15712 + tcp_full_space(sk));
15714 + if (mss > full_space)
15715 + mss = full_space;
15717 + if (free_space < (full_space >> 1)) {
15718 + icsk->icsk_ack.quick = 0;
15720 + if (tcp_memory_pressure)
15721 + /* TODO this has to be adapted when we support different
15722 + * MSS's among the subflows.
15723 + */
15724 + meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh,
15725 + 4U * meta_tp->advmss);
15727 + if (free_space < mss)
15728 + return 0;
15731 + if (free_space > meta_tp->rcv_ssthresh)
15732 + free_space = meta_tp->rcv_ssthresh;
15734 + /* Don't do rounding if we are using window scaling, since the
15735 + * scaled window will not line up with the MSS boundary anyway.
15736 + */
15737 + window = meta_tp->rcv_wnd;
15738 + if (tp->rx_opt.rcv_wscale) {
15739 + window = free_space;
15741 + /* Advertise enough space so that it won't get scaled away.
15742 + * Important case: prevent zero window announcement if
15743 + * 1<<rcv_wscale > mss.
15744 + */
15745 + if (((window >> tp->rx_opt.rcv_wscale) << tp->
15746 + rx_opt.rcv_wscale) != window)
15747 + window = (((window >> tp->rx_opt.rcv_wscale) + 1)
15748 + << tp->rx_opt.rcv_wscale);
15749 + } else {
15750 + /* Get the largest window that is a nice multiple of mss.
15751 + * Window clamp already applied above.
15752 + * If our current window offering is within 1 mss of the
15753 + * free space we just keep it. This prevents the divide
15754 + * and multiply from happening most of the time.
15755 + * We also don't do any window rounding when the free space
15756 + * is too small.
15757 + */
15758 + if (window <= free_space - mss || window > free_space)
15759 + window = (free_space / mss) * mss;
15760 + else if (mss == full_space &&
15761 + free_space > window + (full_space >> 1))
15762 + window = free_space;
15765 + return window;
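+/* MPTCP options for an outgoing SYN: MP_CAPABLE on the master subflow,
+ * MP_JOIN on any additional subflow.
+ */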
15768 +void mptcp_syn_options(struct sock *sk, struct tcp_out_options *opts,
15769 + unsigned *remaining)
15771 + struct tcp_sock *tp = tcp_sk(sk);
15773 + opts->options |= OPTION_MPTCP;
15774 + if (is_master_tp(tp)) {
15775 + opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
15776 + *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
15777 + opts->mp_capable.sender_key = tp->mptcp_loc_key;
15778 + opts->dss_csum = !!sysctl_mptcp_checksum;
15779 + } else {
15780 + struct mptcp_cb *mpcb = tp->mpcb;
15782 + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN;
15783 + *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN;
15784 + opts->mp_join_syns.token = mpcb->mptcp_rem_token;
15785 + opts->addr_id = tp->mptcp->loc_id;
15786 + opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce;
15790 +void mptcp_synack_options(struct request_sock *req,
15791 + struct tcp_out_options *opts, unsigned *remaining)
15793 + struct mptcp_request_sock *mtreq;
15794 + mtreq = mptcp_rsk(req);
15796 + opts->options |= OPTION_MPTCP;
15797 + /* MPCB not yet set - thus it's a new MPTCP-session */
15798 + if (!mtreq->mpcb) {
15799 + opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK;
15800 + opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
15801 + opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum;
15802 + *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
15803 + } else {
15804 + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
15805 + opts->mp_join_syns.sender_truncated_mac =
15806 + mtreq->mptcp_hash_tmac;
15807 + opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce;
15808 + opts->addr_id = mtreq->loc_id;
15809 + *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN;
15813 +void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
15814 + struct tcp_out_options *opts, unsigned *size)
15816 + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
15817 + struct mptcp_cb *mpcb = tp->mpcb;
15818 + struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
15820 + /* In fallback mp_fail-mode, we have to repeat it until the fallback
15821 + * has been done by the sender
15822 + */
15823 + if (unlikely(tp->mptcp->send_mp_fail)) {
15824 + opts->options |= OPTION_MPTCP;
15825 + opts->mptcp_options |= OPTION_MP_FAIL;
15826 + opts->data_ack = (__u32)(mpcb->csum_cutoff_seq >> 32);
15827 + opts->data_seq = (__u32)mpcb->csum_cutoff_seq;
15828 + *size += MPTCP_SUB_LEN_FAIL;
15829 + return;
15832 + if (unlikely(tp->send_mp_fclose)) {
15833 + opts->options |= OPTION_MPTCP;
15834 + opts->mptcp_options |= OPTION_MP_FCLOSE;
15835 + opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
15836 + *size += MPTCP_SUB_LEN_FCLOSE_ALIGN;
15837 + return;
15840 + /* 1. If we are the sender of the infinite-mapping, we need the
15841 + * MPTCPHDR_INF-flag, because a retransmission of the
15842 + * infinite-announcement still needs the mptcp-option.
15844 + * We need infinite_cutoff_seq, because retransmissions from before
15845 + * the infinite-cutoff-moment still need the MPTCP-signalling to stay
15846 + * consistent.
15848 + * 2. If we are the receiver of the infinite-mapping, we always skip
15849 + * mptcp-options, because acknowledgments from before the
15850 + * infinite-mapping point have already been sent out.
15852 + * I know, the whole infinite-mapping stuff is ugly...
15854 + * TODO: Handle wrapped data-sequence numbers
15855 + * (even if it's very unlikely)
15856 + */
15857 + if (unlikely(mpcb->infinite_mapping_snd) &&
15858 + tp->mptcp->fully_established &&
15859 + ((mpcb->send_infinite_mapping && tcb &&
15860 + !(tcb->mptcp_flags & MPTCPHDR_INF) &&
15861 + !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) ||
15862 + !mpcb->send_infinite_mapping))
15863 + return;
15865 + if (unlikely(tp->mptcp->include_mpc)) {
15866 + opts->options |= OPTION_MPTCP;
15867 + opts->mptcp_options |= OPTION_MP_CAPABLE |
15868 + OPTION_TYPE_ACK;
15869 + *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
15870 + opts->mp_capable.sender_key = mpcb->mptcp_loc_key;
15871 + opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
15872 + opts->dss_csum = mpcb->dss_csum;
15874 + if (skb)
15875 + tp->mptcp->include_mpc = 0;
15877 + if (unlikely(tp->mptcp->pre_established)) {
15878 + opts->options |= OPTION_MPTCP;
15879 + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK;
15880 + *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN;
15883 + if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) {
15884 + opts->options |= OPTION_MPTCP;
15885 + opts->mptcp_options |= OPTION_DATA_ACK;
15886 + /* If !skb, we come from tcp_current_mss and thus we always
15887 + * assume that the DSS-option will be set for the data-packet.
15888 + */
15889 + if (skb && !mptcp_is_data_seq(skb)) {
15890 + opts->data_ack = meta_tp->rcv_nxt;
15892 + *size += MPTCP_SUB_LEN_ACK_ALIGN;
15893 + } else {
15894 + opts->data_ack = meta_tp->rcv_nxt;
15896 + /* Doesn't matter whether the csum is included or not. It will be
15897 + * either 10 or 12, and thus aligned = 12
15898 + */
15899 + *size += MPTCP_SUB_LEN_ACK_ALIGN +
15900 + MPTCP_SUB_LEN_SEQ_ALIGN;
15903 + *size += MPTCP_SUB_LEN_DSS_ALIGN;
15906 + if (mpcb->pm_ops->addr_signal)
15907 + mpcb->pm_ops->addr_signal(sk, size, opts, skb);
15909 + if (unlikely(tp->mptcp->send_mp_prio) &&
15910 + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) {
15911 + opts->options |= OPTION_MPTCP;
15912 + opts->mptcp_options |= OPTION_MP_PRIO;
15913 + if (skb)
15914 + tp->mptcp->send_mp_prio = 0;
15915 + *size += MPTCP_SUB_LEN_PRIO_ALIGN;
15918 + return;
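+/* Select the window for this subflow and mirror it at the meta-level, so both
+ * levels advertise the same receive window.
+ */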
15921 +u16 mptcp_select_window(struct sock *sk)
15923 + u16 new_win = tcp_select_window(sk);
15924 + struct tcp_sock *tp = tcp_sk(sk);
15925 + struct tcp_sock *meta_tp = mptcp_meta_tp(tp);
15927 + meta_tp->rcv_wnd = tp->rcv_wnd;
15928 + meta_tp->rcv_wup = meta_tp->rcv_nxt;
15930 + return new_win;
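+/* Write the MPTCP options selected in opts (MP_CAPABLE, MP_JOIN, ADD_ADDR,
+ * REMOVE_ADDR, MP_FAIL, MP_FCLOSE, DSS data_ack and MP_PRIO) into the TCP
+ * option space at ptr.
+ */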
15933 +void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
15934 + struct tcp_out_options *opts,
15935 + struct sk_buff *skb)
15937 + if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) {
15938 + struct mp_capable *mpc = (struct mp_capable *)ptr;
15940 + mpc->kind = TCPOPT_MPTCP;
15942 + if ((OPTION_TYPE_SYN & opts->mptcp_options) ||
15943 + (OPTION_TYPE_SYNACK & opts->mptcp_options)) {
15944 + mpc->sender_key = opts->mp_capable.sender_key;
15945 + mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
15946 + ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
15947 + } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
15948 + mpc->sender_key = opts->mp_capable.sender_key;
15949 + mpc->receiver_key = opts->mp_capable.receiver_key;
15950 + mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK;
15951 + ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2;
15954 + mpc->sub = MPTCP_SUB_CAPABLE;
15955 + mpc->ver = 0;
15956 + mpc->a = opts->dss_csum;
15957 + mpc->b = 0;
15958 + mpc->rsv = 0;
15959 + mpc->h = 1;
15962 + if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) {
15963 + struct mp_join *mpj = (struct mp_join *)ptr;
15965 + mpj->kind = TCPOPT_MPTCP;
15966 + mpj->sub = MPTCP_SUB_JOIN;
15967 + mpj->rsv = 0;
15968 + mpj->addr_id = opts->addr_id;
15970 + if (OPTION_TYPE_SYN & opts->mptcp_options) {
15971 + mpj->len = MPTCP_SUB_LEN_JOIN_SYN;
15972 + mpj->u.syn.token = opts->mp_join_syns.token;
15973 + mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce;
15974 + mpj->b = tp->mptcp->low_prio;
15975 + ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2;
15976 + } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
15977 + mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK;
15978 + mpj->u.synack.mac =
15979 + opts->mp_join_syns.sender_truncated_mac;
15980 + mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce;
15981 + mpj->b = tp->mptcp->low_prio;
15982 + ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2;
15983 + } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
15984 + mpj->len = MPTCP_SUB_LEN_JOIN_ACK;
15985 + memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20);
15986 + ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2;
15989 + if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) {
15990 + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
15992 + mpadd->kind = TCPOPT_MPTCP;
15993 + if (opts->add_addr_v4) {
15994 + mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4;
15995 + mpadd->sub = MPTCP_SUB_ADD_ADDR;
15996 + mpadd->ipver = 4;
15997 + mpadd->addr_id = opts->add_addr4.addr_id;
15998 + mpadd->u.v4.addr = opts->add_addr4.addr;
15999 + ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2;
16000 + } else if (opts->add_addr_v6) {
16001 + mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6;
16002 + mpadd->sub = MPTCP_SUB_ADD_ADDR;
16003 + mpadd->ipver = 6;
16004 + mpadd->addr_id = opts->add_addr6.addr_id;
16005 + memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr,
16006 + sizeof(mpadd->u.v6.addr));
16007 + ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2;
16010 + if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) {
16011 + struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
16012 + u8 *addrs_id;
16013 + int id, len, len_align;
16015 + len = mptcp_sub_len_remove_addr(opts->remove_addrs);
16016 + len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs);
16018 + mprem->kind = TCPOPT_MPTCP;
16019 + mprem->len = len;
16020 + mprem->sub = MPTCP_SUB_REMOVE_ADDR;
16021 + mprem->rsv = 0;
16022 + addrs_id = &mprem->addrs_id;
16024 + mptcp_for_each_bit_set(opts->remove_addrs, id)
16025 + *(addrs_id++) = id;
16027 + /* Fill the rest with NOP's */
16028 + if (len_align > len) {
16029 + int i;
16030 + for (i = 0; i < len_align - len; i++)
16031 + *(addrs_id++) = TCPOPT_NOP;
16034 + ptr += len_align >> 2;
16036 + if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) {
16037 + struct mp_fail *mpfail = (struct mp_fail *)ptr;
16039 + mpfail->kind = TCPOPT_MPTCP;
16040 + mpfail->len = MPTCP_SUB_LEN_FAIL;
16041 + mpfail->sub = MPTCP_SUB_FAIL;
16042 + mpfail->rsv1 = 0;
16043 + mpfail->rsv2 = 0;
16044 + mpfail->data_seq = htonll(((u64)opts->data_ack << 32) | opts->data_seq);
16046 + ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2;
16048 + if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) {
16049 + struct mp_fclose *mpfclose = (struct mp_fclose *)ptr;
16051 + mpfclose->kind = TCPOPT_MPTCP;
16052 + mpfclose->len = MPTCP_SUB_LEN_FCLOSE;
16053 + mpfclose->sub = MPTCP_SUB_FCLOSE;
16054 + mpfclose->rsv1 = 0;
16055 + mpfclose->rsv2 = 0;
16056 + mpfclose->key = opts->mp_capable.receiver_key;
16058 + ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2;
16061 + if (OPTION_DATA_ACK & opts->mptcp_options) {
16062 + if (!mptcp_is_data_seq(skb)) {
16063 + struct mp_dss *mdss = (struct mp_dss *)ptr;
16065 + mdss->kind = TCPOPT_MPTCP;
16066 + mdss->sub = MPTCP_SUB_DSS;
16067 + mdss->rsv1 = 0;
16068 + mdss->rsv2 = 0;
16069 + mdss->F = 0;
16070 + mdss->m = 0;
16071 + mdss->M = 0;
16072 + mdss->a = 0;
16073 + mdss->A = 1;
16074 + mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
16076 + ptr++;
16077 + *ptr++ = htonl(opts->data_ack);
16078 + } else {
16079 + /**** Just update the data_ack ****/
16081 + /* Get pointer to data_ack-field. MPTCP is always at
16082 + * the end of the TCP-options.
16083 + */
16084 + /* TODO if we allow sending 64-bit dseq's we have to change "16" */
16085 + __be32 *dack = (__be32 *)(skb->data + (tcp_hdr(skb)->doff << 2) - 16);
16087 + *dack = htonl(opts->data_ack);
16090 + if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) {
16091 + struct mp_prio *mpprio = (struct mp_prio *)ptr;
16093 + mpprio->kind = TCPOPT_MPTCP;
16094 + mpprio->len = MPTCP_SUB_LEN_PRIO;
16095 + mpprio->sub = MPTCP_SUB_PRIO;
16096 + mpprio->rsv = 0;
16097 + mpprio->b = tp->mptcp->low_prio;
16098 + mpprio->addr_id = TCPOPT_NOP;
16100 + ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2;
16104 +/* Returns the next segment to be sent from the mptcp meta-queue.
16105 + * (chooses the reinject queue if any segment is waiting in it, otherwise,
16106 + * chooses the normal write queue).
16107 + * Sets *@reinject to 1 if the returned segment comes from the
16108 + * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
16109 + * and sets it to -1 if it is a meta-level retransmission to optimize the
16110 + * receive-buffer.
16111 + */
16112 +struct sk_buff *mptcp_next_segment(struct sock *meta_sk, int *reinject)
16114 + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
16115 + struct sk_buff *skb = NULL;
16116 + if (reinject)
16117 + *reinject = 0;
16119 + /* If we are in fallback-mode, just take from the meta-send-queue */
16120 + if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
16121 + return tcp_send_head(meta_sk);
16123 + skb = skb_peek(&mpcb->reinject_queue);
16125 + if (skb) {
16126 + if (reinject)
16127 + *reinject = 1;
16128 + } else {
16129 + skb = tcp_send_head(meta_sk);
16131 + if (!skb && meta_sk->sk_socket &&
16132 + test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
16133 + sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
16134 + struct sock *subsk = get_available_subflow(meta_sk, NULL, NULL);
16135 + if (!subsk)
16136 + return NULL;
16138 + skb = mptcp_rcv_buf_optimization(subsk, 0);
16139 + if (skb && reinject)
16140 + *reinject = -1;
16143 + return skb;
16146 +/* Sends the datafin */
16147 +void mptcp_send_fin(struct sock *meta_sk)
16149 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16150 + struct sk_buff *skb = tcp_write_queue_tail(meta_sk);
16151 + int mss_now;
16153 + if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
16154 + meta_tp->mpcb->passive_close = 1;
16156 + /* Optimization, tack on the FIN if we have a queue of
16157 + * unsent frames. But be careful about outgoing SACKS
16158 + * and IP options.
16159 + */
16160 + mss_now = mptcp_current_mss(meta_sk);
16162 + if (tcp_send_head(meta_sk) != NULL) {
16163 + TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
16164 + TCP_SKB_CB(skb)->end_seq++;
16165 + meta_tp->write_seq++;
16166 + } else {
16167 + /* Socket is locked, keep trying until memory is available. */
16168 + for (;;) {
16169 + skb = alloc_skb_fclone(MAX_TCP_HEADER,
16170 + meta_sk->sk_allocation);
16171 + if (skb)
16172 + break;
16173 + yield();
16175 + /* Reserve space for headers and prepare control bits. */
16176 + skb_reserve(skb, MAX_TCP_HEADER);
16178 + tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK);
16179 + TCP_SKB_CB(skb)->end_seq++;
16180 + TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN | MPTCPHDR_SEQ;
16181 + tcp_queue_skb(meta_sk, skb);
16183 + __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF);
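+/* Actively close the MPTCP-connection: send an MP_FASTCLOSE on one subflow
+ * and a TCP reset on all the others.
+ */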
16186 +void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority)
16188 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16189 + struct mptcp_cb *mpcb = meta_tp->mpcb;
16190 + struct sock *sk = NULL, *sk_it = NULL, *tmpsk;
16192 + if (!mpcb->cnt_subflows)
16193 + return;
16195 + WARN_ON(meta_tp->send_mp_fclose);
16197 + /* First - select a socket */
16198 + sk = mptcp_select_ack_sock(meta_sk, 0);
16200 + /* May happen if no subflow is in an appropriate state */
16201 + if (!sk)
16202 + return;
16204 + /* We are in infinite mode - just send a reset */
16205 + if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) {
16206 + sk->sk_err = ECONNRESET;
16207 + if (tcp_need_reset(sk->sk_state))
16208 + tcp_send_active_reset(sk, priority);
16209 + mptcp_sub_force_close(sk);
16210 + return;
16214 + tcp_sk(sk)->send_mp_fclose = 1;
16215 + /** Reset all other subflows */
16217 + /* tcp_done must be handled with bh disabled */
16218 + if (!in_serving_softirq())
16219 + local_bh_disable();
16221 + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
16222 + if (tcp_sk(sk_it)->send_mp_fclose)
16223 + continue;
16225 + sk_it->sk_err = ECONNRESET;
16226 + if (tcp_need_reset(sk_it->sk_state))
16227 + tcp_send_active_reset(sk_it, GFP_ATOMIC);
16228 + mptcp_sub_force_close(sk_it);
16231 + if (!in_serving_softirq())
16232 + local_bh_enable();
16234 + tcp_send_ack(sk);
16235 + inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto);
16237 + meta_tp->send_mp_fclose = 1;
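+/* Retransmit a pure ACK (used while the subflow is still pre-established)
+ * with exponential backoff; on write-timeout give up and reset the subflow.
+ */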
16240 +static void mptcp_ack_retransmit_timer(struct sock *sk)
16242 + struct sk_buff *skb;
16243 + struct tcp_sock *tp = tcp_sk(sk);
16244 + struct inet_connection_sock *icsk = inet_csk(sk);
16246 + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
16247 + goto out; /* Routing failure or similar */
16249 + if (!tp->retrans_stamp)
16250 + tp->retrans_stamp = tcp_time_stamp ? : 1;
16252 + if (tcp_write_timeout(sk)) {
16253 + tp->mptcp->pre_established = 0;
16254 + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
16255 + tcp_send_active_reset(sk, GFP_ATOMIC);
16256 + goto out;
16259 + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
16260 + if (skb == NULL) {
16261 + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
16262 + jiffies + icsk->icsk_rto);
16263 + return;
16266 + /* Reserve space for headers and prepare control bits */
16267 + skb_reserve(skb, MAX_TCP_HEADER);
16268 + tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK);
16270 + TCP_SKB_CB(skb)->when = tcp_time_stamp;
16271 + if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) {
16272 + /* Retransmission failed because of local congestion,
16273 + * do not backoff.
16274 + */
16275 + if (!icsk->icsk_retransmits)
16276 + icsk->icsk_retransmits = 1;
16277 + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
16278 + jiffies + icsk->icsk_rto);
16279 + return;
16283 + icsk->icsk_retransmits++;
16284 + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
16285 + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
16286 + jiffies + icsk->icsk_rto);
16287 + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) {
16288 + __sk_dst_reset(sk);
16291 +out:;
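+/* Timer callback for the MPTCP ACK-retransmit timer. If the meta-socket is
+ * owned by the user, simply rearm the timer and try again later.
+ */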
16294 +void mptcp_ack_handler(unsigned long data)
16296 + struct sock *sk = (struct sock *)data;
16297 + struct sock *meta_sk = mptcp_meta_sk(sk);
16299 + bh_lock_sock(meta_sk);
16300 + if (sock_owned_by_user(meta_sk)) {
16301 + /* Try again later */
16302 + sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer,
16303 + jiffies + (HZ / 20));
16304 + goto out_unlock;
16307 + if (sk->sk_state == TCP_CLOSE)
16308 + goto out_unlock;
16310 + mptcp_ack_retransmit_timer(sk);
16312 + sk_mem_reclaim(sk);
16314 +out_unlock:
16315 + bh_unlock_sock(meta_sk);
16316 + sock_put(sk);
16319 +/* Similar to tcp_retransmit_skb
16321 + * The diff is that we handle the retransmission-stats (retrans_stamp) at the
16322 + * meta-level.
16323 + */
16324 +int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb)
16326 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16327 + struct sock *subsk;
16328 + struct sk_buff *subskb;
16329 + unsigned int limit, tso_segs, mss_now;
16330 + int err = -1, oldpcount;
16332 + /* Do not send more than we queued. 1/4 is reserved for possible
16333 + * copying overhead: fragmentation, tunneling, mangling etc.
16335 + * This is a meta-retransmission thus we check on the meta-socket.
16336 + */
16337 + if (atomic_read(&meta_sk->sk_wmem_alloc) >
16338 + min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) {
16339 + return -EAGAIN;
16342 + /* We need to make sure that the retransmitted segment can be sent on a
16343 + * subflow right now. If it is too big, it needs to be fragmented.
16344 + */
16345 + subsk = get_available_subflow(meta_sk, skb, &mss_now);
16346 + if (!subsk) {
16347 + /* We want to increase icsk_retransmits, thus return 0, so that
16348 + * mptcp_retransmit_timer enters the desired branch.
16349 + */
16350 + err = 0;
16351 + goto failed;
16354 + /* If the segment was cloned (e.g. a meta retransmission), the header
16355 + * must be expanded/copied so that there is no corruption of TSO
16356 + * information.
16357 + */
16358 + if (skb_unclone(skb, GFP_ATOMIC)) {
16359 + err = ENOMEM;
16360 + goto failed;
16363 + oldpcount = tcp_skb_pcount(skb);
16364 + tcp_set_skb_tso_segs(meta_sk, skb, mss_now);
16365 + tso_segs = tcp_skb_pcount(skb);
16366 + BUG_ON(!tso_segs);
16368 + /* The MSS might have changed and so the number of segments. We
16369 + * need to account for this change.
16370 + */
16371 + if (unlikely(oldpcount != tso_segs))
16372 + tcp_adjust_pcount(meta_sk, skb, oldpcount - tso_segs);
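+ /* Work out how much of this skb the chosen subflow can carry right now
+  * (cwnd and GSO limits); if the skb is larger than that, fragment it at
+  * the meta-level before handing it to the subflow.
+  */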
16374 + limit = mss_now;
16375 + if (tso_segs > 1 && !tcp_urg_mode(meta_tp))
16376 + limit = tcp_mss_split_point(subsk, skb, mss_now,
16377 + min_t(unsigned int,
16378 + tcp_cwnd_test(tcp_sk(subsk), skb),
16379 + subsk->sk_gso_max_segs),
16380 + TCP_NAGLE_OFF);
16382 + if (skb->len > limit &&
16383 + unlikely(mptso_fragment(meta_sk, skb, limit, mss_now,
16384 + GFP_ATOMIC, 0)))
16385 + goto failed;
16387 + subskb = mptcp_skb_entail(subsk, skb, -1);
16388 + if (!subskb)
16389 + goto failed;
16391 + TCP_SKB_CB(skb)->when = tcp_time_stamp;
16392 + TCP_SKB_CB(subskb)->when = tcp_time_stamp;
16393 + err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC);
16394 + if (!err) {
16395 + /* Update global TCP statistics. */
16396 + TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS);
16398 + /* Diff to tcp_retransmit_skb */
16400 + /* Save stamp of the first retransmit. */
16401 + if (!meta_tp->retrans_stamp)
16402 + meta_tp->retrans_stamp = TCP_SKB_CB(subskb)->when;
16403 + mptcp_sub_event_new_data_sent(subsk, subskb, skb);
16404 + } else {
16405 + mptcp_transmit_skb_failed(subsk, skb, subskb);
16408 +failed:
16409 + return err;
16412 +/* Similar to tcp_retransmit_timer
16414 + * The diff is that we have to handle retransmissions of the FAST_CLOSE-message
16415 + * and that we don't have an srtt estimation at the meta-level.
16416 + */
16417 +void mptcp_retransmit_timer(struct sock *meta_sk)
16419 + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16420 + struct mptcp_cb *mpcb = meta_tp->mpcb;
16421 + struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
16422 + int err;
16424 + /* In fallback, retransmission is handled at the subflow-level */
16425 + if (!meta_tp->packets_out || mpcb->infinite_mapping_snd ||
16426 + mpcb->send_infinite_mapping)
16427 + return;
16429 + WARN_ON(tcp_write_queue_empty(meta_sk));
16431 + if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) &&
16432 + !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
16433 + /* Receiver dastardly shrinks window. Our retransmits
16434 + * become zero probes, but we should not timeout this
16435 + * connection. If the socket is an orphan, time it out,
16436 + * we cannot allow such beasts to hang infinitely.
16437 + */
16438 + struct inet_sock *meta_inet = inet_sk(meta_sk);
16439 + if (meta_sk->sk_family == AF_INET) {
16440 + LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
16441 + &meta_inet->inet_daddr,
16442 + ntohs(meta_inet->inet_dport),
16443 + meta_inet->inet_num, meta_tp->snd_una,
16444 + meta_tp->snd_nxt);
16446 +#if IS_ENABLED(CONFIG_IPV6)
16447 + else if (meta_sk->sk_family == AF_INET6) {
16448 + LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
16449 + &meta_sk->sk_v6_daddr,
16450 + ntohs(meta_inet->inet_dport),
16451 + meta_inet->inet_num, meta_tp->snd_una,
16452 + meta_tp->snd_nxt);
16454 +#endif
16455 + if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) {
16456 + tcp_write_err(meta_sk);
16457 + return;
16460 + mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
16461 + goto out_reset_timer;
16464 + if (tcp_write_timeout(meta_sk))
16465 + return;
16467 + if (meta_icsk->icsk_retransmits == 0)
16468 + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS);
16470 + meta_icsk->icsk_ca_state = TCP_CA_Loss;
16472 + err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
16473 + if (err > 0) {
16474 + /* Retransmission failed because of local congestion,
16475 + * do not backoff.
16476 + */
16477 + if (!meta_icsk->icsk_retransmits)
16478 + meta_icsk->icsk_retransmits = 1;
16479 + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
16480 + min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
16481 + TCP_RTO_MAX);
16482 + return;
16485 + /* Increase the timeout each time we retransmit. Note that
16486 + * we do not increase the rtt estimate. rto is initialized
16487 + * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
16488 + * that doubling rto each time is the least we can get away with.
16489 + * In KA9Q, Karn uses this for the first few times, and then
16490 + * goes to quadratic. netBSD doubles, but only goes up to *64,
16491 + * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
16492 + * defined in the protocol as the maximum possible RTT. I guess
16493 + * we'll have to use something other than TCP to talk to the
16494 + * University of Mars.
16496 + * PAWS allows us longer timeouts and large windows, so once
16497 + * implemented ftp to mars will work nicely. We will have to fix
16498 + * the 120 second clamps though!
16499 + */
16500 + meta_icsk->icsk_backoff++;
16501 + meta_icsk->icsk_retransmits++;
16503 +out_reset_timer:
16504 + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
16505 + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
16506 + * might be increased if the stream oscillates between thin and thick,
16507 + * thus the old value might already be too high compared to the value
16508 + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
16509 + * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
16510 + exponential backoff behaviour, to avoid continually hammering
16511 + * linear-timeout retransmissions into a black hole
16512 + */
16513 + if (meta_sk->sk_state == TCP_ESTABLISHED &&
16514 + (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
16515 + tcp_stream_is_thin(meta_tp) &&
16516 + meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
16517 + meta_icsk->icsk_backoff = 0;
16518 + /* We cannot do the same as in tcp_write_timer because the
16519 + * srtt is not set here.
16520 + */
16521 + mptcp_set_rto(meta_sk);
16522 + } else {
16523 + /* Use normal (exponential) backoff */
16524 + meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX);
16526 + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX);
16528 + return;
16531 +/* Modify values to an mptcp-level for the initial window of new subflows */
16532 +void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
16533 + __u32 *window_clamp, int wscale_ok,
16534 + __u8 *rcv_wscale, __u32 init_rcv_wnd,
16535 + const struct sock *sk)
16537 + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
16539 + *window_clamp = mpcb->orig_window_clamp;
16540 + __space = tcp_win_from_space(mpcb->orig_sk_rcvbuf);
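+ /* Both values come from the meta-level buffers saved in the mpcb, so
+  * every new subflow advertises a window consistent with the
+  * connection-level receive buffer rather than its own.
+  */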
16542 + tcp_select_initial_window(__space, mss, rcv_wnd, window_clamp,
16543 + wscale_ok, rcv_wscale, init_rcv_wnd, sk);
16546 +unsigned int mptcp_current_mss(struct sock *meta_sk)
16548 + unsigned int mss = 0;
16549 + struct sock *sk;
16551 + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
16552 + int this_mss;
16554 + if (!mptcp_sk_can_send(sk))
16555 + continue;
16557 + this_mss = tcp_current_mss(sk);
16558 + if (this_mss > mss)
16559 + mss = this_mss;
16562 + /* If no subflow is available, we take a default-mss from the
16563 + * meta-socket.
16564 + */
16565 + return !mss ? tcp_current_mss(meta_sk) : mss;
16568 +int mptcp_select_size(const struct sock *meta_sk, bool sg)
16570 + int mss = 0; /* We look for the largest MSS */
16571 + struct sock *sk;
16573 + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
16574 + int this_mss;
16576 + if (!mptcp_sk_can_send(sk))
16577 + continue;
16579 + this_mss = tcp_sk(sk)->mss_cache;
16580 + if (this_mss > mss)
16581 + mss = this_mss;
16584 + if (sg) {
16585 + if (mptcp_sk_can_gso(meta_sk)) {
16586 + mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
16587 + } else {
16588 + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
16590 + if (mss >= pgbreak &&
16591 + mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
16592 + mss = pgbreak;
16596 + return !mss ? tcp_sk(meta_sk)->mss_cache : mss;
16599 +int mptcp_check_snd_buf(const struct tcp_sock *tp)
16601 + struct sock *sk;
16602 + u32 rtt_max = tp->srtt;
16603 + u64 bw_est;
16605 + if (!tp->srtt)
16606 + return tp->reordering + 1;
16608 + mptcp_for_each_sk(tp->mpcb, sk) {
16609 + if (!mptcp_sk_can_send(sk))
16610 + continue;
16612 + if (rtt_max < tcp_sk(sk)->srtt)
16613 + rtt_max = tcp_sk(sk)->srtt;
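+ /* Rough estimate of how many packets are needed to keep the pipe full:
+  * this socket's rate (snd_cwnd / srtt) multiplied by the largest srtt
+  * among all subflows, kept in 16-bit fixed point until the shift back
+  * below. The send buffer must hold at least that many packets.
+  */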
16616 + bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16,
16617 + (u64)tp->srtt);
16619 + return max_t(unsigned int, (u32)(bw_est >> 16),
16620 + tp->reordering + 1);
16624 +unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, u32 mss_now,
16625 + int large_allowed)
16627 + struct sock *sk;
16628 + u32 xmit_size_goal = 0;
16630 + if (large_allowed && mptcp_sk_can_gso(meta_sk)) {
16631 + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
16632 + int this_size_goal;
16634 + if (!mptcp_sk_can_send(sk))
16635 + continue;
16637 + this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1);
16638 + if (this_size_goal > xmit_size_goal)
16639 + xmit_size_goal = this_size_goal;
16643 + return max(xmit_size_goal, mss_now);
16646 +/* Similar to tcp_trim_head - but we correctly copy the DSS-option */
16647 +int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
16649 + int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN +
16650 + MPTCP_SUB_LEN_SEQ_ALIGN;
16651 + char dss[dsslen];
16653 + /* DSS-option must be recovered afterwards. */
16654 + memcpy(dss, skb->data - dsslen, dsslen);
16656 + if (skb_cloned(skb)) {
16657 + /* pskb_expand_head will delete our DSS-option. We have to copy
16658 + * it back if pskb_expand_head succeeds.
16659 + */
16661 + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
16662 + return -ENOMEM;
16664 + memcpy(skb->data - dsslen, dss, dsslen);
16667 + __pskb_trim_head(skb, len);
16669 + /* Put the DSS-option back in our header */
16670 + memcpy(skb->data - dsslen, dss, dsslen);
16672 + TCP_SKB_CB(skb)->seq += len;
16673 + skb->ip_summed = CHECKSUM_PARTIAL;
16675 + skb->truesize -= len;
16676 + sk->sk_wmem_queued -= len;
16677 + sk_mem_uncharge(sk, len);
16678 + sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
16680 + /* Any change of skb->len requires recalculation of tso factor. */
16681 + if (tcp_skb_pcount(skb) > 1)
16682 + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
16684 + return 0;
16686 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_pm.c linux-3.14.45/net/mptcp/mptcp_pm.c
16687 --- linux-3.14.45.orig/net/mptcp/mptcp_pm.c 1970-01-01 01:00:00.000000000 +0100
16688 +++ linux-3.14.45/net/mptcp/mptcp_pm.c 2015-06-24 14:15:48.931862523 +0200
16689 @@ -0,0 +1,170 @@
16691 + * MPTCP implementation - MPTCP-subflow-management
16693 + * Initial Design & Implementation:
16694 + * Sébastien Barré <sebastien.barre@uclouvain.be>
16696 + * Current Maintainer & Author:
16697 + * Christoph Paasch <christoph.paasch@uclouvain.be>
16699 + * Additional authors:
16700 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
16701 + * Gregory Detal <gregory.detal@uclouvain.be>
16702 + * Fabien Duchêne <fabien.duchene@uclouvain.be>
16703 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
16704 + * Lavkesh Lahngir <lavkesh51@gmail.com>
16705 + * Andreas Ripke <ripke@neclab.eu>
16706 + * Vlad Dogaru <vlad.dogaru@intel.com>
16707 + * Octavian Purdila <octavian.purdila@intel.com>
16708 + * John Ronan <jronan@tssg.org>
16709 + * Catalin Nicutar <catalin.nicutar@gmail.com>
16710 + * Brandon Heller <brandonh@stanford.edu>
16713 + * This program is free software; you can redistribute it and/or
16714 + * modify it under the terms of the GNU General Public License
16715 + * as published by the Free Software Foundation; either version
16716 + * 2 of the License, or (at your option) any later version.
16717 + */
16720 +#include <linux/module.h>
16721 +#include <net/mptcp.h>
16723 +static DEFINE_SPINLOCK(mptcp_pm_list_lock);
16724 +static LIST_HEAD(mptcp_pm_list);
16726 +static int mptcp_default_index(sa_family_t family, union inet_addr *addr,
16727 + struct net *net)
16729 + return 0;
16732 +struct mptcp_pm_ops mptcp_pm_default = {
16733 + .get_local_index = mptcp_default_index,
16734 + .get_local_id = mptcp_default_index, /* We do not care */
16735 + .name = "default",
16736 + .owner = THIS_MODULE,
16739 +static struct mptcp_pm_ops *mptcp_pm_find(const char *name)
16741 + struct mptcp_pm_ops *e;
16743 + list_for_each_entry_rcu(e, &mptcp_pm_list, list) {
16744 + if (strcmp(e->name, name) == 0)
16745 + return e;
16748 + return NULL;
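+ /* A minimal, purely illustrative sketch of a path-manager registration
+  * (the "my_pm" names are hypothetical). Both callbacks must be set or
+  * mptcp_register_path_manager() below rejects it with -EINVAL:
+  *
+  *   static struct mptcp_pm_ops my_pm = {
+  *       .get_local_index = my_index_fn,
+  *       .get_local_id    = my_index_fn,
+  *       .name            = "my_pm",
+  *       .owner           = THIS_MODULE,
+  *   };
+  *
+  * Such a module would call mptcp_register_path_manager(&my_pm) from its
+  * init function and mptcp_unregister_path_manager(&my_pm) on exit.
+  */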
16751 +int mptcp_register_path_manager(struct mptcp_pm_ops *pm)
16753 + int ret = 0;
16755 + if (!pm->get_local_index || !pm->get_local_id)
16756 + return -EINVAL;
16758 + spin_lock(&mptcp_pm_list_lock);
16759 + if (mptcp_pm_find(pm->name)) {
16760 + pr_notice("%s already registered\n", pm->name);
16761 + ret = -EEXIST;
16762 + } else {
16763 + list_add_tail_rcu(&pm->list, &mptcp_pm_list);
16764 + pr_info("%s registered\n", pm->name);
16766 + spin_unlock(&mptcp_pm_list_lock);
16768 + return ret;
16770 +EXPORT_SYMBOL_GPL(mptcp_register_path_manager);
16772 +void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm)
16774 + spin_lock(&mptcp_pm_list_lock);
16775 + list_del_rcu(&pm->list);
16776 + spin_unlock(&mptcp_pm_list_lock);
16778 +EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager);
16780 +void mptcp_get_default_path_manager(char *name)
16782 + struct mptcp_pm_ops *pm;
16784 + BUG_ON(list_empty(&mptcp_pm_list));
16786 + rcu_read_lock();
16787 + pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list);
16788 + strncpy(name, pm->name, MPTCP_PM_NAME_MAX);
16789 + rcu_read_unlock();
16792 +int mptcp_set_default_path_manager(const char *name)
16794 + struct mptcp_pm_ops *pm;
16795 + int ret = -ENOENT;
16797 + spin_lock(&mptcp_pm_list_lock);
16798 + pm = mptcp_pm_find(name);
16799 +#ifdef CONFIG_MODULES
16800 + if (!pm && capable(CAP_NET_ADMIN)) {
16801 + spin_unlock(&mptcp_pm_list_lock);
16803 + request_module("mptcp_%s", name);
16804 + spin_lock(&mptcp_pm_list_lock);
16805 + pm = mptcp_pm_find(name);
16807 +#endif
16809 + if (pm) {
16810 + list_move(&pm->list, &mptcp_pm_list);
16811 + ret = 0;
16812 + } else {
16813 + pr_info("%s is not available\n", name);
16815 + spin_unlock(&mptcp_pm_list_lock);
16817 + return ret;
16820 +void mptcp_init_path_manager(struct mptcp_cb *mpcb)
16822 + struct mptcp_pm_ops *pm;
16824 + rcu_read_lock();
16825 + list_for_each_entry_rcu(pm, &mptcp_pm_list, list) {
16826 + if (try_module_get(pm->owner)) {
16827 + mpcb->pm_ops = pm;
16828 + break;
16831 + rcu_read_unlock();
16834 +/* Manage refcounts on socket close. */
16835 +void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb)
16837 + module_put(mpcb->pm_ops->owner);
16840 +/* Fallback to the default path-manager. */
16841 +void mptcp_fallback_default(struct mptcp_cb *mpcb)
16843 + struct mptcp_pm_ops *pm;
16845 + mptcp_cleanup_path_manager(mpcb);
16846 + pm = mptcp_pm_find("default");
16848 + /* Cannot fail - it's the default module */
16849 + try_module_get(pm->owner);
16850 + mpcb->pm_ops = pm;
16852 +EXPORT_SYMBOL_GPL(mptcp_fallback_default);
16854 +/* Set default value from kernel configuration at bootup */
16855 +static int __init mptcp_path_manager_default(void)
16857 + return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM);
16859 +late_initcall(mptcp_path_manager_default);
16860 diff -Nur linux-3.14.45.orig/net/mptcp/mptcp_wvegas.c linux-3.14.45/net/mptcp/mptcp_wvegas.c
16861 --- linux-3.14.45.orig/net/mptcp/mptcp_wvegas.c 1970-01-01 01:00:00.000000000 +0100
16862 +++ linux-3.14.45/net/mptcp/mptcp_wvegas.c 2015-06-24 14:15:48.931862523 +0200
16863 @@ -0,0 +1,270 @@
16865 + * MPTCP implementation - WEIGHTED VEGAS
16867 + * Algorithm design:
16868 + * Yu Cao <cyAnalyst@126.com>
16869 + * Mingwei Xu <xmw@csnet1.cs.tsinghua.edu.cn>
16870 + * Xiaoming Fu <fu@cs.uni-goettingen.de>
16872 + * Implementation:
16873 + * Yu Cao <cyAnalyst@126.com>
16874 + * Enhuan Dong <deh13@mails.tsinghua.edu.cn>
16876 + * Ported to the official MPTCP-kernel:
16877 + * Christoph Paasch <christoph.paasch@uclouvain.be>
16879 + * This program is free software; you can redistribute it and/or
16880 + * modify it under the terms of the GNU General Public License
16881 + * as published by the Free Software Foundation; either version
16882 + * 2 of the License, or (at your option) any later version.
16883 + */
16885 +#include <linux/skbuff.h>
16886 +#include <net/tcp.h>
16887 +#include <net/mptcp.h>
16888 +#include <linux/module.h>
16889 +#include <linux/tcp.h>
16891 +static int initial_alpha = 2;
16892 +static int total_alpha = 10;
16893 +static int gamma = 1;
16895 +module_param(initial_alpha, int, 0644);
16896 +MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
16897 +module_param(total_alpha, int, 0644);
16898 +MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
16899 +module_param(gamma, int, 0644);
16900 +MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
16902 +#define MPTCP_WVEGAS_SCALE 16
16904 +/* wVegas variables */
16905 +struct wvegas {
16906 + u32 beg_snd_nxt; /* right edge during last RTT */
16907 + u8 doing_wvegas_now;/* if true, do wvegas for this RTT */
16909 + u16 cnt_rtt; /* # of RTTs measured within last RTT */
16910 + u32 sampled_rtt; /* cumulative RTTs measured within last RTT (in usec) */
16911 + u32 base_rtt; /* the min of all wVegas RTT measurements seen (in usec) */
16913 + u64 instant_rate; /* cwnd / srtt_us, unit: pkts/us * 2^16 */
16914 + u64 weight; /* the ratio of subflow's rate to the total rate, * 2^16 */
16915 + int alpha; /* alpha for each subflows */
16917 + u32 queue_delay; /* queue delay*/
16921 +static inline u64 mptcp_wvegas_scale(u32 val, int scale)
16923 + return (u64) val << scale;
16926 +static void wvegas_enable(struct sock *sk)
16928 + const struct tcp_sock *tp = tcp_sk(sk);
16929 + struct wvegas *wvegas = inet_csk_ca(sk);
16931 + wvegas->doing_wvegas_now = 1;
16933 + wvegas->beg_snd_nxt = tp->snd_nxt;
16935 + wvegas->cnt_rtt = 0;
16936 + wvegas->sampled_rtt = 0;
16938 + wvegas->instant_rate = 0;
16939 + wvegas->alpha = initial_alpha;
16940 + wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE);
16942 + wvegas->queue_delay = 0;
16945 +static inline void wvegas_disable(struct sock *sk)
16947 + struct wvegas *wvegas = inet_csk_ca(sk);
16949 + wvegas->doing_wvegas_now = 0;
16952 +static void mptcp_wvegas_init(struct sock *sk)
16954 + struct wvegas *wvegas = inet_csk_ca(sk);
16956 + wvegas->base_rtt = 0x7fffffff;
16957 + wvegas_enable(sk);
16960 +static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
16962 + return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
16965 +static void mptcp_wvegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
16967 + struct wvegas *wvegas = inet_csk_ca(sk);
16968 + u32 vrtt;
16970 + if (rtt_us < 0)
16971 + return;
16973 + vrtt = rtt_us + 1;
16975 + if (vrtt < wvegas->base_rtt)
16976 + wvegas->base_rtt = vrtt;
16978 + wvegas->sampled_rtt += vrtt;
16979 + wvegas->cnt_rtt++;
16982 +static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
16984 + if (ca_state == TCP_CA_Open)
16985 + wvegas_enable(sk);
16986 + else
16987 + wvegas_disable(sk);
16990 +static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
16992 + if (event == CA_EVENT_CWND_RESTART) {
16993 + mptcp_wvegas_init(sk);
16994 + } else if (event == CA_EVENT_LOSS) {
16995 + struct wvegas *wvegas = inet_csk_ca(sk);
16996 + wvegas->instant_rate = 0;
17000 +static inline u32 mptcp_wvegas_ssthresh(struct tcp_sock *tp)
17002 + return min(tp->snd_ssthresh, tp->snd_cwnd - 1);
17005 +static u64 mptcp_wvegas_weight(struct mptcp_cb *mpcb, struct sock *sk)
17007 + u64 total_rate = 0;
17008 + struct sock *sub_sk;
17009 + struct wvegas *wvegas = inet_csk_ca(sk);
17011 + if (!mpcb)
17012 + return wvegas->weight;
17015 + mptcp_for_each_sk(mpcb, sub_sk) {
17016 + struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
17018 + /* sampled_rtt is initialized to 0 */
17019 + if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
17020 + total_rate += sub_wvegas->instant_rate;
17023 + if (total_rate && wvegas->instant_rate)
17024 + return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
17025 + else
17026 + return wvegas->weight;
17029 +static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
17031 + struct tcp_sock *tp = tcp_sk(sk);
17032 + struct wvegas *wvegas = inet_csk_ca(sk);
17034 + if (!wvegas->doing_wvegas_now) {
17035 + tcp_reno_cong_avoid(sk, ack, acked, in_flight);
17036 + return;
17039 + if (after(ack, wvegas->beg_snd_nxt)) {
17040 + wvegas->beg_snd_nxt = tp->snd_nxt;
17042 + if (wvegas->cnt_rtt <= 2) {
17043 + tcp_reno_cong_avoid(sk, ack, acked, in_flight);
17044 + } else {
17045 + u32 rtt, diff, q_delay;
17046 + u64 target_cwnd;
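+ /* Standard Vegas accounting on this subflow: rtt is the average of the
+  * samples from the last RTT, target_cwnd is what the window would be if
+  * only base_rtt were ever seen, and diff estimates how many packets the
+  * subflow currently keeps queued in the network.
+  */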
17048 + rtt = wvegas->sampled_rtt / wvegas->cnt_rtt;
17049 + target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
17051 + diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
17053 + if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
17054 + tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
17055 + tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
17057 + } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
17058 + tcp_slow_start(tp, acked);
17059 + } else {
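+ /* Congestion avoidance: once the backlog reaches alpha, re-derive the
+  * per-subflow alpha from this subflow's share of the total MPTCP rate
+  * (its weight), so faster subflows may keep more packets queued; then
+  * nudge cwnd by one towards that target.
+  */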
17060 + if (diff >= wvegas->alpha) {
17061 + wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
17062 + wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
17063 + wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
17065 + if (diff > wvegas->alpha) {
17066 + tp->snd_cwnd--;
17067 + tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
17068 + } else if (diff < wvegas->alpha) {
17069 + tp->snd_cwnd++;
17072 + /* Try to drain link queue if needed*/
17073 + q_delay = rtt - wvegas->base_rtt;
17074 + if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
17075 + wvegas->queue_delay = q_delay;
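+ /* If the observed queueing delay has at least doubled compared to the
+  * smallest delay seen, back the window off by base_rtt / (2 * rtt)
+  * (16-bit fixed point) to help drain the bottleneck queue.
+  */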
17077 + if (q_delay >= 2 * wvegas->queue_delay) {
17078 + u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
17079 + tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
17080 + wvegas->queue_delay = 0;
17084 + if (tp->snd_cwnd < 2)
17085 + tp->snd_cwnd = 2;
17086 + else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
17087 + tp->snd_cwnd = tp->snd_cwnd_clamp;
17089 + tp->snd_ssthresh = tcp_current_ssthresh(sk);
17092 + wvegas->cnt_rtt = 0;
17093 + wvegas->sampled_rtt = 0;
17095 + /* Use normal slow start */
17096 + else if (tp->snd_cwnd <= tp->snd_ssthresh)
17097 + tcp_slow_start(tp, acked);
17101 +static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
17102 + .flags = TCP_CONG_RTT_STAMP,
17103 + .init = mptcp_wvegas_init,
17104 + .ssthresh = tcp_reno_ssthresh,
17105 + .cong_avoid = mptcp_wvegas_cong_avoid,
17106 + .min_cwnd = tcp_reno_min_cwnd,
17107 + .pkts_acked = mptcp_wvegas_pkts_acked,
17108 + .set_state = mptcp_wvegas_state,
17109 + .cwnd_event = mptcp_wvegas_cwnd_event,
17111 + .owner = THIS_MODULE,
17112 + .name = "wvegas",
17115 +static int __init mptcp_wvegas_register(void)
17117 + BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
17118 + tcp_register_congestion_control(&mptcp_wvegas);
17119 + return 0;
17122 +static void __exit mptcp_wvegas_unregister(void)
17124 + tcp_unregister_congestion_control(&mptcp_wvegas);
17127 +module_init(mptcp_wvegas_register);
17128 +module_exit(mptcp_wvegas_unregister);
17130 +MODULE_AUTHOR("Yu Cao, Enhuan Dong");
17131 +MODULE_LICENSE("GPL");
17132 +MODULE_DESCRIPTION("MPTCP wVegas");
17133 +MODULE_VERSION("0.1");