// SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <[email protected]> * Mark Evans, <[email protected]> * Corey Minyard <[email protected]> * Florian La Roche, <[email protected]> * Charles Hedrick, <[email protected]> * Linus Torvalds, <[email protected]> * Alan Cox, <[email protected]> * Matthew Dillon, <[email protected]> * Arnt Gulbrandsen, <[email protected]> * Jorge Cwik, <[email protected]> */ /* * Changes: * Pedro Roque : Fast Retransmit/Recovery. * Two receive queues. * Retransmit queue handled by TCP. * Better retransmit timer handling. * New congestion avoidance. * Header prediction. * Variable renaming. * * Eric : Fast Retransmit. * Randy Scott : MSS option defines. * Eric Schenk : Fixes to slow start algorithm. * Eric Schenk : Yet another double ACK bug. * Eric Schenk : Delayed ACK bug fixes. * Eric Schenk : Floyd style fast retrans war avoidance. * David S. Miller : Don't allow zero congestion window. * Eric Schenk : Fix retransmitter so that it sends * next packet on ack of previous packet. * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. * Andrey Savochkin: Fix RTT measurements in the presence of * timestamps. * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming * data segments. * Andi Kleen: Make sure we never ack data there is not * enough room for. Also make this condition * a fatal error if it might still happen. * Andi Kleen: Add tcp_measure_rcv_mss to make * connections with MSS<min(MTU,ann. MSS) * work without delayed acks. * Andi Kleen: Process packets with PSH set in the * fast path. * J Hadi Salim: ECN support * Andrei Gurtov, * Pasi Sarolahti, * Panu Kuhlberg: Experimental audit of TCP (re)transmission * engine. Lots of bugs are found. * Pasi Sarolahti: F-RTO for dealing with spurious RTOs */ #define pr_fmt(fmt) … #include <linux/mm.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/kernel.h> #include <linux/prefetch.h> #include <net/dst.h> #include <net/tcp.h> #include <net/proto_memory.h> #include <net/inet_common.h> #include <linux/ipsec.h> #include <linux/unaligned.h> #include <linux/errqueue.h> #include <trace/events/tcp.h> #include <linux/jump_label_ratelimit.h> #include <net/busy_poll.h> #include <net/mptcp.h> int sysctl_tcp_max_orphans __read_mostly = …; #define FLAG_DATA … #define FLAG_WIN_UPDATE … #define FLAG_DATA_ACKED … #define FLAG_RETRANS_DATA_ACKED … #define FLAG_SYN_ACKED … #define FLAG_DATA_SACKED … #define FLAG_ECE … #define FLAG_LOST_RETRANS … #define FLAG_SLOWPATH … #define FLAG_ORIG_SACK_ACKED … #define FLAG_SND_UNA_ADVANCED … #define FLAG_DSACKING_ACK … #define FLAG_SET_XMIT_TIMER … #define FLAG_SACK_RENEGING … #define FLAG_UPDATE_TS_RECENT … #define FLAG_NO_CHALLENGE_ACK … #define FLAG_ACK_MAYBE_DELAYED … #define FLAG_DSACK_TLP … #define FLAG_ACKED … #define FLAG_NOT_DUP … #define FLAG_CA_ALERT … #define FLAG_FORWARD_PROGRESS … #define TCP_REMNANT … #define TCP_HP_BITS … #define REXMIT_NONE … #define REXMIT_LOST … #define REXMIT_NEW … #if IS_ENABLED(CONFIG_TLS_DEVICE) static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); void clean_acked_data_enable(struct inet_connection_sock *icsk, void (*cad)(struct sock *sk, u32 ack_seq)) { … } EXPORT_SYMBOL_GPL(…); void clean_acked_data_disable(struct inet_connection_sock *icsk) { … } EXPORT_SYMBOL_GPL(…); void clean_acked_data_flush(void) { … } EXPORT_SYMBOL_GPL(…); #endif #ifdef CONFIG_CGROUP_BPF static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) { … } static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { … } #else static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) { } static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { } #endif static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff *skb, unsigned int len) { … } /* Adapt the MSS value used to make delayed ack decision to the * real world. */ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) { … } static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) { … } static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { … } /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. */ static bool tcp_in_quickack_mode(struct sock *sk) { … } static void tcp_ecn_queue_cwr(struct tcp_sock *tp) { … } static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) { … } static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) { … } static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) { … } static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) { … } static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { … } static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) { … } static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { … } /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. */ static void tcp_sndbuf_expand(struct sock *sk) { … } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) * * All tcp_full_space() is split to two parts: "network" buffer, allocated * forward and advertised in receiver window (tp->rcv_wnd) and * "application buffer", required to isolate scheduling/application * latencies from network. * window_clamp is maximal advertised window. It can be less than * tcp_full_space(), in this case tcp_full_space() - window_clamp * is reserved for "application" buffer. The less window_clamp is * the smoother our behaviour from viewpoint of network, but the lower * throughput and the higher sensitivity of the connection to losses. 8) * * rcv_ssthresh is more strict window_clamp used at "slow start" * phase to predict further behaviour of this connection. * It is used for two goals: * - to enforce header prediction at sender, even when application * requires some significant "application buffer". It is check #1. * - to prevent pruning of receive queue because of misprediction * of receiver window. Check #2. * * The scheme does not work when sender sends good segments opening * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ /* Slow part of check#2. */ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, unsigned int skbtruesize) { … } /* Even if skb appears to have a bad len/truesize ratio, TCP coalescing * can play nice with us, as sk_buff and skb->head might be either * freed or shared with up to MAX_SKB_FRAGS segments. * Only give a boost to drivers using page frag(s) to hold the frame(s), * and if no payload was pulled in skb->head before reaching us. */ static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) { … } static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, bool adjust) { … } /* 3. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) { … } /* 4. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk) { … } /* Initialize RCV_MSS value. * RCV_MSS is an our guess about MSS used by the peer. * We haven't any direct information about the MSS. * It's better to underestimate the RCV_MSS rather than overestimate. * Overestimations make us ACKing less frequently than needed. * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). */ void tcp_initialize_rcv_mss(struct sock *sk) { … } EXPORT_SYMBOL(…); /* Receiver "autotuning" code. * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. * <https://public.lanl.gov/radiant/pubs.html#DRS> * * More detail on this code can be found at * <http://staff.psc.edu/jheffner/>, * though this reference is out of date. A new paper * is pending. */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { … } static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { … } static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) { … } static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { … } /* * This function should be called every time data is copied to user space. * It calculates the appropriate TCP receive buffer space. */ void tcp_rcv_space_adjust(struct sock *sk) { … } static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb) { … } /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The * problem is that "good" TCP's do slow start at the beginning of data * transmission. The means that until we send the first few ACK's the * sender will sit on his end and only queue most of his data, because * he can only send snd_cwnd unacked packets at any given time. For * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) { … } /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) { … } static void tcp_update_pacing_rate(struct sock *sk) { … } /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ static void tcp_set_rto(struct sock *sk) { … } __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) { … } struct tcp_sacktag_state { … }; /* Take a notice that peer is sending D-SACKs. Skip update of data delivery * and spurious retransmission information if this DSACK is unlikely caused by * sender's action: * - DSACKed sequence range is larger than maximum receiver's window. * - Total no. of DSACKed segments exceed the total no. of retransmitted segs. */ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, u32 end_seq, struct tcp_sacktag_state *state) { … } /* It's reordering when higher sequence was delivered (i.e. sacked) before * some lower never-retransmitted sequence ("low_seq"). The maximum reordering * distance is approximated in full-mss packet distance ("reordering"). */ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, const int ts) { … } /* This must be called before lost_out or retrans_out are updated * on a new loss, because we want to know if all skbs previously * known to be lost have already been retransmitted, indicating * that this newly lost skb is our next skb to retransmit. */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { … } /* Sum the number of packets on the wire we have marked as lost, and * notify the congestion control module that the given skb was marked lost. */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { … } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { … } /* Updates the delivered and delivered_ce counts */ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { … } /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). * Packets in queue with these bits set are counted in variables * sacked_out, retrans_out and lost_out, correspondingly. * * Valid combinations are: * Tag InFlight Description * 0 1 - orig segment is in flight. * S 0 - nothing flies, orig reached receiver. * L 0 - nothing flies, orig lost by net. * R 2 - both orig and retransmit are in flight. * L|R 1 - orig is lost, retransmit is in flight. * S|R 1 - orig reached receiver, retrans is still in flight. * (L|S|R is logically valid, it could occur when L|R is sacked, * but it is equivalent to plain S and code short-circuits it to S. * L|S is logically invalid, it would mean -1 packet in flight 8)) * * These 6 states form finite state machine, controlled by the following events: * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. * * It is pleasant to note, that state diagram turns out to be commutative, * so that we are allowed not to be bothered by order of our actions, * when multiple events arrive simultaneously. (see the function below). * * Reordering detection. * -------------------- * Reordering metric is maximal distance, which a packet can be displaced * in packet stream. With SACKs we can estimate it: * * 1. SACK fills old hole and the corresponding segment was not * ever retransmitted -> reordering. Alas, we cannot use it * when segment was retransmitted. * 2. The last flaw is solved with D-SACK. D-SACK arrives * for retransmitted and already SACKed segment -> reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. * * SACK block validation. * ---------------------- * * SACK block range validation checks that the received SACK block fits to * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. * Note that SND.UNA is not included to the range though being valid because * it means that the receiver is rather inconsistent with itself reporting * SACK reneging when it should advance SND.UNA. Such SACK block this is * perfectly valid, however, in light of RFC2018 which explicitly states * that "SACK block MUST reflect the newest segment. Even if the newest * segment is going to be discarded ...", not that it looks very clever * in case of head skb. Due to potentional receiver driven attacks, we * choose to avoid immediate execution of a walk in write queue due to * reneging and defer head skb's loss recovery to standard loss recovery * procedure that will eventually trigger (nothing forbids us doing this). * * Implements also blockage to start_seq wrap-around. Problem lies in the * fact that though start_seq (s) is before end_seq (i.e., not reversed), * there's no guarantee that it will be before snd_nxt (n). The problem * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt * wrap (s_w): * * <- outs wnd -> <- wrapzone -> * u e n u_w e_w s n_w * | | | | | | | * |<------------+------+----- TCP seqno space --------------+---------->| * ...-- <2^31 ->| |<--------... * ...---- >2^31 ------>| |<--------... * * Current code wouldn't be vulnerable but it's better still to discard such * crazy SACK blocks. Doing this check for start_seq alone closes somewhat * similar case (end_seq after snd_nxt wrap) as earlier reversed check in * snd_nxt wrap -> snd_una region will then become "well defined", i.e., * equal to the ideal case (infinite seqno space without wrap caused issues). * * With D-SACK the lower bound is extended to cover sequence space below * SND.UNA down to undo_marker, which is the last point of interest. Yet * again, D-SACK block must not to go across snd_una (for the same reason as * for the normal SACK blocks, explained above). But there all simplicity * ends, TCP might receive valid D-SACKs below that. As long as they reside * fully below undo_marker they do not affect behavior in anyway and can * therefore be safely ignored. In rare cases (which are more or less * theoretical ones), the D-SACK will nicely cross that boundary due to skb * fragmentation and packet reordering past skb's retransmission. To consider * them correctly, the acceptable range must be extended even more though * the exact amount is rather hard to quantify. However, tp->max_window can * be used as an exaggerated estimate. */ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, u32 start_seq, u32 end_seq) { … } static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una, struct tcp_sacktag_state *state) { … } /* Check if skb is fully within the SACK block. In presence of GSO skbs, * the incoming SACK may not exactly match but we can find smaller MSS * aligned portion of it that matches. Therefore we might need to fragment * which may fail and creates some hassle (caller must handle error case * returns). * * FIXME: this could be merged to shift decision code */ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, u32 start_seq, u32 end_seq) { … } /* Mark the given newly-SACKed range as such, adjusting counters and hints. */ static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, int dup_sack, int pcount, u64 xmit_time) { … } /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, bool dup_sack) { … } /* I wish gso_size would have a bit more sane initialization than * something-or-zero which complicates things */ static int tcp_skb_seglen(const struct sk_buff *skb) { … } /* Shifting pages past head area doesn't work */ static int skb_can_shift(const struct sk_buff *skb) { … } int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen) { … } /* Try collapsing SACK blocks spanning across multiple skbs to a single * skb. */ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, bool dup_sack) { … } static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, bool dup_sack_in) { … } static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq) { … } static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, u32 skip_to_seq) { … } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, struct tcp_sacktag_state *state, u32 skip_to_seq) { … } static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) { … } static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, u32 prior_snd_una, struct tcp_sacktag_state *state) { … } /* Limits sacked_out so that sum with lost_out isn't ever larger than * packets_out. Returns false if sacked_out adjustement wasn't necessary. */ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) { … } /* If we receive more dupacks than we expected counting segments * in assumption of absent reordering, interpret this as reordering. * The only another reason could be bug in receiver TCP. */ static void tcp_check_reno_reordering(struct sock *sk, const int addend) { … } /* Emulate SACKs for SACKless connection: account for a new dupack. */ static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) { … } /* Account for ACK, ACKing some data in Reno Recovery phase. */ static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) { … } static inline void tcp_reset_reno_sack(struct tcp_sock *tp) { … } void tcp_clear_retrans(struct tcp_sock *tp) { … } static inline void tcp_init_undo(struct tcp_sock *tp) { … } static bool tcp_is_rack(const struct sock *sk) { … } /* If we detect SACK reneging, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. */ static void tcp_timeout_mark_lost(struct sock *sk) { … } /* Enter Loss state. */ void tcp_enter_loss(struct sock *sk) { … } /* If ACK arrived pointing to a remembered SACK, it means that our * remembered SACKs do not reflect real state of receiver i.e. * receiver _host_ is heavily congested (or buggy). * * To avoid big spurious retransmission bursts due to transient SACK * scoreboard oddities that look like reneging, we give the receiver a * little time (max(RTT/2, 10ms)) to send us some more ACKs that will * restore sanity to the SACK scoreboard. If the apparent reneging * persists until this RTO then we'll clear the SACK scoreboard. */ static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag) { … } /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs * counter when SACK is enabled (without SACK, sacked_out is used for * that purpose). * * With reordering, holes may still be in flight, so RFC3517 recovery * uses pure sacked_out (total number of SACKed segments) even though * it violates the RFC that uses duplicate ACKs, often these are equal * but when e.g. out-of-window ACKs or packet duplication occurs, * they differ. Since neither occurs due to loss, TCP should really * ignore them. */ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) { … } /* Linux NewReno/SACK/ECN state machine. * -------------------------------------- * * "Open" Normal state, no dubious events, fast path. * "Disorder" In all the respects it is "Open", * but requires a bit more attention. It is entered when * we see some SACKs or dupacks. It is split of "Open" * mainly to move some processing from fast path to slow one. * "CWR" CWND was reduced due to some Congestion Notification event. * It can be ECN, ICMP source quench, local device congestion. * "Recovery" CWND was reduced, we are fast-retransmitting. * "Loss" CWND was reduced due to RTO timeout or SACK reneging. * * tcp_fastretrans_alert() is entered: * - each incoming ACK, if state is not "Open" * - when arrived ACK is unusual, namely: * * SACK * * Duplicate ACK. * * ECN ECE. * * Counting packets in flight is pretty simple. * * in_flight = packets_out - left_out + retrans_out * * packets_out is SND.NXT-SND.UNA counted in packets. * * retrans_out is number of retransmitted segments. * * left_out is number of segments left network, but not ACKed yet. * * left_out = sacked_out + lost_out * * sacked_out: Packets, which arrived to receiver out of order * and hence not ACKed. With SACKs this number is simply * amount of SACKed data. Even without SACKs * it is easy to give pretty reliable estimate of this number, * counting duplicate ACKs. * * lost_out: Packets lost by network. TCP has no explicit * "loss notification" feedback from network (for now). * It means that this number can be only _guessed_. * Actually, it is the heuristics to predict lossage that * distinguishes different algorithms. * * F.e. after RTO, when all the queue is considered as lost, * lost_out = packets_out and in_flight = retrans_out. * * Essentially, we have now a few algorithms detecting * lost packets. * * If the receiver supports SACK: * * RFC6675/3517: It is the conventional algorithm. A packet is * considered lost if the number of higher sequence packets * SACKed is greater than or equal the DUPACK thoreshold * (reordering). This is implemented in tcp_mark_head_lost and * tcp_update_scoreboard. * * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm * (2017-) that checks timing instead of counting DUPACKs. * Essentially a packet is considered lost if it's not S/ACKed * after RTT + reordering_window, where both metrics are * dynamically measured and adjusted. This is implemented in * tcp_rack_mark_lost. * * If the receiver does not support SACK: * * NewReno (RFC6582): in Recovery we assume that one segment * is lost (classic Reno). While we are in Recovery and * a partial ACK arrives, we assume that one more packet * is lost (NewReno). This heuristics are the same in NewReno * and SACK. * * Really tricky (and requiring careful tuning) part of algorithm * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). * The first determines the moment _when_ we should reduce CWND and, * hence, slow down forward transmission. In fact, it determines the moment * when we decide that hole is caused by loss, rather than by a reorder. * * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill * holes, caused by lost packets. * * And the most logically complicated part of algorithm is undo * heuristics. We detect false retransmits due to both too early * fast retransmit (reordering) and underestimated RTO, analyzing * timestamps and D-SACKs. When we detect that some segments were * retransmitted by mistake and CWND reduction was wrong, we undo * window reduction and abort recovery phase. This logic is hidden * inside several functions named tcp_try_undo_<something>. */ /* This function decides, when we should leave Disordered state * and enter Recovery phase, reducing congestion window. * * Main question: may we further continue forward transmission * with the same cwnd? */ static bool tcp_time_to_recover(struct sock *sk, int flag) { … } /* Detect loss in event "A" above by marking head of queue up as lost. * For RFC3517 SACK, a segment is considered lost if it * has at least tp->reordering SACKed seqments above it; "packets" refers to * the maximum SACKed segments to pass before reaching this limit. */ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { … } /* Account newly detected lost packet(s) */ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) { … } static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) { … } /* skb is spurious retransmitted if the returned timestamp echo * reply is prior to the skb transmission time */ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, const struct sk_buff *skb) { … } /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { … } /* Undo procedures. */ /* We can clear retrans_stamp when there are no retransmissions in the * window. It would seem that it is trivially available for us in * tp->retrans_out, however, that kind of assumptions doesn't consider * what will happen if errors occur when sending retransmission for the * second time. ...It could the that such segment has only * TCPCB_EVER_RETRANS set at the present time. It seems that checking * the head skb is enough except for some reneging corner cases that * are not worth the effort. * * Main reason for all this complexity is the fact that connection dying * time now depends on the validity of the retrans_stamp, in particular, * that successive retransmissions of a segment must not advance * retrans_stamp under any conditions. */ static bool tcp_any_retrans_done(const struct sock *sk) { … } /* If loss recovery is finished and there are no retransmits out in the * network, then we clear retrans_stamp so that upon the next loss recovery * retransmits_timed_out() and timestamp-undo are using the correct value. */ static void tcp_retrans_stamp_cleanup(struct sock *sk) { … } static void DBGUNDO(struct sock *sk, const char *msg) { … } static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) { … } static inline bool tcp_may_undo(const struct tcp_sock *tp) { … } static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) { … } /* People celebrate: "We love our President!" */ static bool tcp_try_undo_recovery(struct sock *sk) { … } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ static bool tcp_try_undo_dsack(struct sock *sk) { … } /* Undo during loss recovery after partial ACK or using F-RTO. */ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { … } /* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937. * It computes the number of packets to send (sndcnt) based on packets newly * delivered: * 1) If the packets in flight is larger than ssthresh, PRR spreads the * cwnd reductions across a full RTT. * 2) Otherwise PRR uses packet conservation to send as much as delivered. * But when SND_UNA is acked without further losses, * slow starts cwnd up to ssthresh to speed up the recovery. */ static void tcp_init_cwnd_reduction(struct sock *sk) { … } void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag) { … } static inline void tcp_end_cwnd_reduction(struct sock *sk) { … } /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ void tcp_enter_cwr(struct sock *sk) { … } EXPORT_SYMBOL(…); static void tcp_try_keep_open(struct sock *sk) { … } static void tcp_try_to_open(struct sock *sk, int flag) { … } static void tcp_mtup_probe_failed(struct sock *sk) { … } static void tcp_mtup_probe_success(struct sock *sk) { … } /* Sometimes we deduce that packets have been dropped due to reasons other than * congestion, like path MTU reductions or failed client TFO attempts. In these * cases we call this function to retransmit as many packets as cwnd allows, * without reducing cwnd. Given that retransmits will set retrans_stamp to a * non-zero value (and may do so in a later calling context due to TSQ), we * also enter CA_Loss so that we track when all retransmitted packets are ACKed * and clear retrans_stamp when that happens (to ensure later recurring RTOs * are using the correct retrans_stamp and don't declare ETIMEDOUT * prematurely). */ static void tcp_non_congestion_loss_retransmit(struct sock *sk) { … } /* Do a simple retransmit without using the backoff mechanisms in * tcp_timer. This is used for path mtu discovery. * The socket is already locked here. */ void tcp_simple_retransmit(struct sock *sk) { … } EXPORT_SYMBOL(…); void tcp_enter_recovery(struct sock *sk, bool ece_ack) { … } static void tcp_update_rto_time(struct tcp_sock *tp) { … } /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, int *rexmit) { … } static bool tcp_force_fast_retransmit(struct sock *sk) { … } /* Undo during fast recovery after partial ACK. */ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, bool *do_lost) { … } static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) { … } /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and * packets lost by network. * * Besides that it updates the congestion state when packet loss or ECN * is detected. But it does not reduce the cwnd, it is done by the * congestion control later. * * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, int num_dupack, int *ack_flag, int *rexmit) { … } static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) { … } static bool tcp_ack_update_rtt(struct sock *sk, const int flag, long seq_rtt_us, long sack_rtt_us, long ca_rtt_us, struct rate_sample *rs) { … } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { … } static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { … } /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ void tcp_rearm_rto(struct sock *sk) { … } /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ static void tcp_set_xmit_timer(struct sock *sk) { … } /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { … } static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, const struct sk_buff *ack_skb, u32 prior_snd_una) { … } /* Remove acknowledged frames from the retransmission queue. If our packet * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, u32 prior_fack, u32 prior_snd_una, struct tcp_sacktag_state *sack, bool ece_ack) { … } static void tcp_ack_probe(struct sock *sk) { … } static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) { … } /* Decide wheather to run the increase function of congestion control. */ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { … } /* The "ultimate" congestion control function that aims to replace the rigid * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction). * It's called toward the end of processing an ACK with precise rate * information. All transmission or retransmission are delayed afterwards. */ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, int flag, const struct rate_sample *rs) { … } /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ static inline bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, const u32 ack_seq, const u32 nwin) { … } static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack) { … } /* If we update tp->snd_una, also update tp->bytes_acked */ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) { … } static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq) { … } /* If we update tp->rcv_nxt, also update tp->bytes_received */ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) { … } /* Update our send window. * * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 * and in FreeBSD. NetBSD's one is even worse.) is wrong. */ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, u32 ack_seq) { … } static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, u32 *last_oow_ack_time) { … } /* Return true if we're currently rate-limiting out-of-window ACKs and * thus shouldn't send a dupack right now. We rate-limit dupacks in * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS * attacks that send repeated SYNs or ACKs for the same connection. To * do this, we do not send a duplicate SYNACK or ACK if the remote * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. */ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time) { … } /* RFC 5961 7 [ACK Throttling] */ static void tcp_send_challenge_ack(struct sock *sk) { … } static void tcp_store_ts_recent(struct tcp_sock *tp) { … } static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) { … } /* This routine deals with acks during a TLP episode and ends an episode by * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { … } static inline void tcp_in_ack_event(struct sock *sk, u32 flags) { … } /* Congestion control has updated the cwnd already. So if we're in * loss recovery then now we do any new sends (for FRTO) or * retransmits (for CA_Loss or CA_recovery) that make sense. */ static void tcp_xmit_recovery(struct sock *sk, int rexmit) { … } /* Returns the number of packets newly acked or sacked by the current ACK */ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) { … } /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { … } static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, bool syn, struct tcp_fastopen_cookie *foc, bool exp_opt) { … } static bool smc_parse_options(const struct tcphdr *th, struct tcp_options_received *opt_rx, const unsigned char *ptr, int opsize) { … } /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped * value on success. */ u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) { … } EXPORT_SYMBOL_GPL(…); /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. */ void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc) { … } EXPORT_SYMBOL(…); static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) { … } /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ static bool tcp_fast_parse_options(const struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct tcp_sock *tp) { … } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) /* * Parse Signature options */ int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash) { … } EXPORT_SYMBOL(…); #endif /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM * * It is not fatal. If this ACK does _not_ change critical state (seqs, window) * it can pass through stack. So, the following predicate verifies that * this segment is not used for anything but congestion avoidance or * fast retransmit. Moreover, we even are able to eliminate most of such * second order effects, if we apply some small "replay" window (~RTO) * to timestamp space. * * All these measures still do not guarantee that we reject wrapped ACKs * on networks with high bandwidth, when sequence space is recycled fastly, * but it guarantees that such events will be very rare and do not affect * connection seriously. This doesn't look nice, but alas, PAWS is really * buggy extension. * * [ Later note. Even worse! It is buggy for segments _with_ data. RFC * states that events when retransmit arrives after original data are rare. * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is * the biggest problem on large power networks even with minor reordering. * OK, let's give it small replay window. If peer clock is even 1hz, it is safe * up to bandwidth of 18Gigabit/sec. 8) ] */ /* Estimates max number of increments of remote peer TSval in * a replay window (based on our current RTO estimation). */ static u32 tcp_tsval_replay(const struct sock *sk) { … } static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { … } static inline bool tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { … } /* Check segment sequence number for validity. * * Segment controls are considered valid, if the segment * fits to the window after truncation to the window. Acceptability * of data (and SYN, FIN, of course) is checked separately. * See tcp_data_queue(), for example. * * Also, controls (RST is main one) are accepted using RCV.WUP instead * of RCV.NXT. Peer still did not advance his SND.UNA when we * delayed ACK, so that hisSND.UNA<=ourRCV.WUP. * (borrowed from freebsd) */ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) { … } void tcp_done_with_error(struct sock *sk, int err) { … } EXPORT_SYMBOL(…); /* When we get a reset we do this. */ void tcp_reset(struct sock *sk, struct sk_buff *skb) { … } /* * Process the FIN bit. This now behaves as it is supposed to work * and the FIN takes effect when it is validly part of sequence * space. Not before when we get holes. * * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT * (and thence onto LAST-ACK and finally, CLOSE, we never enter * TIME-WAIT) * * If we are in FINWAIT-1, a received FIN indicates simultaneous * close and we go into CLOSING (and later onto TIME-WAIT) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. */ void tcp_fin(struct sock *sk) { … } static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) { … } static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { … } static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) { … } static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) { … } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) { … } /* These routines update the SACK block as out-of-order packets arrive or * in-order packets close up the sequence space. */ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) { … } void tcp_sack_compress_send_ack(struct sock *sk) { … } /* Reasonable amount of sack blocks included in TCP SACK option * The max is 4, but this becomes 3 if TCP timestamps are there. * Given that SACK packets might be lost, be conservative and use 2. */ #define TCP_SACK_BLOCKS_EXPECTED … static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { … } /* RCV.NXT advances, some SACKs should be eaten. */ static void tcp_sack_remove(struct tcp_sock *tp) { … } /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket * @to: prior buffer * @from: buffer to add in queue * @fragstolen: pointer to boolean * * Before queueing skb @from after @to, try to merge them * to reduce overall memory use and queue lengths, if cost is small. * Packets in ofo or receive queues can stay a long time. * Better try to coalesce them right now to avoid future collapses. * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) { … } static bool tcp_ooo_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) { … } static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { … } /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ static void tcp_ofo_queue(struct sock *sk) { … } static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size) { … } static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { … } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, bool *fragstolen) { … } int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { … } void tcp_data_ready(struct sock *sk) { … } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { … } static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) { … } static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *list, struct rb_root *root) { … } /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) { … } /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * * If tail is NULL, this means until the end of the queue. * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ static void tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { … } /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs * and tcp_collapse() them until all the queue is collapsed. */ static void tcp_collapse_ofo_queue(struct sock *sk) { … } /* * Clean the out-of-order queue to make room. * We drop high sequences packets to : * 1) Let a chance for holes to be filled. * This means we do not drop packets from ooo queue if their sequence * is before incoming packet sequence. * 2) not add too big latencies if thousands of packets sit there. * (But if application shrinks SO_RCVBUF, we could still end up * freeing whole queue here) * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. * * Return true if queue has shrunk. */ static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb) { … } /* Reduce allocated memory if we can, trying to get * the socket within its memory limits again. * * Return less than zero if we should start dropping frames * until the socket owning process reads some of the data * to stabilize the situation. */ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) { … } static bool tcp_should_expand_sndbuf(struct sock *sk) { … } static void tcp_new_space(struct sock *sk) { … } /* Caller made space either from: * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced) * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt) * * We might be able to generate EPOLLOUT to the application if: * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2 * 2) notsent amount (tp->write_seq - tp->snd_nxt) became * small enough that tcp_stream_memory_free() decides it * is time to generate EPOLLOUT. */ void tcp_check_space(struct sock *sk) { … } static inline void tcp_data_snd_check(struct sock *sk) { … } /* * Check if sending an ack is needed. */ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { … } static inline void tcp_ack_snd_check(struct sock *sk) { … } /* * This routine is only called when we have urgent data * signaled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. * For 1003.1g we should support a new option TCP_STDURG to permit * either form (or just set the sysctl tcp_stdurg). */ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) { … } /* This is the 'fast' part of urgent handling. */ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { … } /* Accept RST for rcv_nxt - 1 after a FIN. * When tcp connections are abruptly terminated from Mac OSX (via ^C), a * FIN is sent followed by a RST packet. The RST is sent with the same * sequence number as the FIN, and thus according to RFC 5961 a challenge * ACK should be sent. However, Mac OSX rate limits replies to challenge * ACKs on the closed socket. In addition middleboxes can drop either the * challenge ACK or a subsequent RST. */ static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb) { … } /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { … } /* * TCP receive function for the ESTABLISHED state. * * It is split into a fast path and a slow path. The fast path is * disabled when: * - A zero window was announced from us - zero window probing * is only handled properly in the slow path. * - Out of order segments arrived. * - Urgent data is expected. * - There is no buffer space left * - Unexpected TCP flags/window values/header lengths are received * (detected by checking the TCP header against pred_flags) * - Data is sent in both directions. Fast path only supports pure senders * or pure receivers (this means either the sequence number or the ack * value must stay constant) * - Unexpected TCP option. * * When these conditions are not satisfied it drops into a standard * receive procedure patterned after RFC793 to handle all cases. * The first three cases are guaranteed by proper pred_flags setting, * the rest is checked inline. Fast processing is turned on in * tcp_data_queue when everything is OK. */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) { … } EXPORT_SYMBOL(…); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) { … } void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) { … } static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { … } static void smc_check_reset_syn(struct tcp_sock *tp) { … } static void tcp_try_undo_spurious_syn(struct sock *sk) { … } static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { … } static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { … } /* * This function implements the receiving procedure of RFC 793 for * all states except ESTABLISHED and TIME_WAIT. * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be * address independent. */ enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { … } EXPORT_SYMBOL(…); static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) { … } /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set * * If we receive a SYN packet with these bits set, it means a * network is playing bad games with TOS bits. In order to * avoid possible false congestion notifications, we disable * TCP ECN negotiation. * * Exception: tcp_ca wants ECN. This is required for DCTCP * congestion control: Linux DCTCP asserts ECT on all packets, * including SYN, which is most optimal solution; however, * others, such as FreeBSD do not. * * Exception: At least one of the reserved bits of the TCP header (th->res1) is * set, indicating the use of a future TCP extension (such as AccECN). See * RFC8311 §4.3 which updates RFC3168 to allow the development of such * extensions. */ static void tcp_ecn_create_request(struct request_sock *req, const struct sk_buff *skb, const struct sock *listen_sk, const struct dst_entry *dst) { … } static void tcp_openreq_init(struct request_sock *req, const struct tcp_options_received *rx_opt, struct sk_buff *skb, const struct sock *sk) { … } /* * Return true if a syncookie should be sent */ static bool tcp_syn_flood_action(struct sock *sk, const char *proto) { … } static void tcp_reqsk_record_syn(const struct sock *sk, struct request_sock *req, const struct sk_buff *skb) { … } /* If a SYN cookie is required and supported, returns a clamped MSS value to be * used for SYN cookie generation. */ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th) { … } EXPORT_SYMBOL_GPL(…); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) { … } EXPORT_SYMBOL(…);