diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/include/linux/sysctl.h linux-2.4.26/include/linux/sysctl.h --- linux-2.4.26-orig/include/linux/sysctl.h 2004-04-14 06:05:40.000000000 -0700 +++ linux-2.4.26/include/linux/sysctl.h 2004-04-29 14:56:06.000000000 -0700 @@ -314,6 +314,8 @@ NET_IPV4_IPFRAG_SECRET_INTERVAL=94, NET_TCP_WESTWOOD=95, NET_IPV4_IGMP_MAX_MSF=96, + NET_TCP_ECN_NONCE=97, + NET_TCP_ECN_NONCE_DEBUG=98, }; enum { diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/include/linux/tcp.h linux-2.4.26/include/linux/tcp.h --- linux-2.4.26-orig/include/linux/tcp.h 2001-11-22 11:47:11.000000000 -0800 +++ linux-2.4.26/include/linux/tcp.h 2004-04-29 14:31:37.000000000 -0700 @@ -26,7 +26,8 @@ __u32 seq; __u32 ack_seq; #if defined(__LITTLE_ENDIAN_BITFIELD) - __u16 res1:4, + __u16 ns:1, + res1:3, doff:4, fin:1, syn:1, @@ -38,7 +39,8 @@ cwr:1; #elif defined(__BIG_ENDIAN_BITFIELD) __u16 doff:4, - res1:4, + res1:3, + ns:1, cwr:1, ece:1, urg:1, @@ -102,6 +104,7 @@ #define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) enum { + TCP_FLAG_NS = __constant_htonl(0x01000000), TCP_FLAG_CWR = __constant_htonl(0x00800000), TCP_FLAG_ECE = __constant_htonl(0x00400000), TCP_FLAG_URG = __constant_htonl(0x00200000), diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file 
linux-2.4.26-orig/include/net/inet_ecn.h linux-2.4.26/include/net/inet_ecn.h --- linux-2.4.26-orig/include/net/inet_ecn.h 2001-10-30 15:08:12.000000000 -0800 +++ linux-2.4.26/include/net/inet_ecn.h 2004-04-29 14:36:47.000000000 -0700 @@ -1,30 +1,44 @@ #ifndef _INET_ECN_H_ #define _INET_ECN_H_ +#include + static inline int INET_ECN_is_ce(__u8 dsfield) { - return (dsfield&3) == 3; + return ((dsfield&3) == 3); } static inline int INET_ECN_is_not_ce(__u8 dsfield) { - return (dsfield&3) == 2; + return (((dsfield&3) == 2) || ((dsfield&3) == 1)); } static inline int INET_ECN_is_capable(__u8 dsfield) { - return (dsfield&2); + /* returns nonzero (1,2,3) if capable, 0 if not */ + return (dsfield&3); +} + +static inline int INET_ECN_recover_nonce(__u8 dsfield) +{ + return(dsfield&1); /* ECT(1) if 01, ECT(0) if 10. */ } static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner) { outer &= ~3; - if (INET_ECN_is_capable(inner)) outer |= (inner & 3); return outer; } #define INET_ECN_xmit(sk) do { (sk)->protinfo.af_inet.tos |= 2; } while (0) + +/* bit clearing is necessary to alternate between ECT's. 
*/ +#define INET_ECN_xmit_n0(sk) do { (sk)->protinfo.af_inet.tos |= 2; \ + (sk)->protinfo.af_inet.tos &= ~1; } while (0) +#define INET_ECN_xmit_n1(sk) do { (sk)->protinfo.af_inet.tos |= 1; \ + (sk)->protinfo.af_inet.tos &= ~2; } while (0) + #define INET_ECN_dontxmit(sk) do { (sk)->protinfo.af_inet.tos &= ~3; } while (0) #define IP6_ECN_flow_init(label) do { \ @@ -39,16 +53,24 @@ static inline void IP_ECN_set_ce(struct iphdr *iph) { u32 check = iph->check; + __u8 dsfield = ipv4_get_dsfield(iph); + if ((dsfield&3) == 2) { + /* RFC2481 ECT(0) path */ check += __constant_htons(0xFFFE); - iph->check = check + (check>=0xFFFF); iph->tos |= 1; + } else if((dsfield&3) == 1) { + /* RFC3xxx ECT(1) path */ + check += __constant_htons(0xFFFD); + iph->tos |= 2; + } + iph->check = check + (check>=0xFFFF); } struct ipv6hdr; static inline void IP6_ECN_set_ce(struct ipv6hdr *iph) { - *(u32*)iph |= htonl(1<<20); + *(u32*)iph |= htonl(3<<20); } #define ip6_get_dsfield(iph) ((ntohs(*(u16*)(iph)) >> 4) & 0xFF) diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/include/net/snmp.h linux-2.4.26/include/net/snmp.h --- linux-2.4.26-orig/include/net/snmp.h 2003-11-28 10:26:21.000000000 -0800 +++ linux-2.4.26/include/net/snmp.h 2004-04-29 14:35:06.000000000 -0700 @@ -308,6 +308,8 @@ unsigned long TCPAbortOnLinger; unsigned long TCPAbortFailed; unsigned long TCPMemoryPressures; + unsigned long TCPECNConnections; + unsigned long TCPECNNonceFailures; unsigned long __pad[0]; } ____cacheline_aligned; diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file 
linux-2.4.26-orig/include/net/sock.h linux-2.4.26/include/net/sock.h --- linux-2.4.26-orig/include/net/sock.h 2004-04-14 06:05:40.000000000 -0700 +++ linux-2.4.26/include/net/sock.h 2004-04-29 14:31:37.000000000 -0700 @@ -387,8 +387,25 @@ __u16 advmss; /* Advertised MSS */ __u8 syn_retries; /* num of allowed syn retries */ - __u8 ecn_flags; /* ECN status bits. */ + struct { + __u8 ok:1, /* ECN was negotiated */ + queue_cwr:1, /* will send CWR on next new segment */ + demand_cwr:1, /* will send ECE until next CWR */ + + nonce_synch:1, /* nonce was out of synch last time */ + nonce_sum:1, /* current nonce sum for acknowledgement */ + expected_nonce_sum:1, /* nonce sum expected from last segment sent */ + cwr_ack_pending:1, /* are we waiting for ack-of-cwr? (ecn_cwr_seq is valid) */ + seen_a_nonce:1; /* does the other endpoint support the ecn-nonce? */ + } ecn_flags; /* ECN status bits. */ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ + /* begin ECN-nonce support: */ + __u32 ecn_cwr_seq; /* seq at last cwr; one packet larger than high_seq*/ + struct { + struct ecn_expected_ns_binding *first,*last; + } ecn_ensq; + void (*prev_destruct)(struct sock *sk); + /* end ECN-nonce support */ __u32 lost_out; /* Lost packets */ __u32 sacked_out; /* SACK'd packets */ __u32 fackets_out; /* FACK'd packets */ diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/include/net/tcp.h linux-2.4.26/include/net/tcp.h --- linux-2.4.26-orig/include/net/tcp.h 2004-04-14 06:05:40.000000000 -0700 +++ linux-2.4.26/include/net/tcp.h 2004-04-29 14:36:47.000000000 -0700 @@ -454,6 +454,8 @@ extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; extern int sysctl_tcp_ecn; +extern int sysctl_tcp_ecn_nonce; +extern int sysctl_tcp_ecn_nonce_debug; 
extern int sysctl_tcp_dsack; extern int sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; @@ -986,6 +988,10 @@ */ #define tcp_time_stamp ((__u32)(jiffies)) +/* increased an 8 bit to a 16 bit, then added another 8 bit, + so ns increased by four bytes, I think, but the skbuff + already allocates 48 bytes, so am safe? */ + /* This is what the send packet queueing engine uses to pass * TCP per-packet control information to the transmission * code. We also store the host-order sequence numbers in @@ -1003,8 +1009,11 @@ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ __u32 when; /* used to compute rtt's */ - __u8 flags; /* TCP header flags. */ + __u16 flags; /* TCP header flags. */ + + /* extended the 'flags byte' to a 'flags short' to support using the + four remaining bits between header length and flags */ /* NOTE: These must match up to the flags byte in a * real TCP header. */ @@ -1017,6 +1026,11 @@ #define TCPCB_FLAG_ECE 0x40 #define TCPCB_FLAG_CWR 0x80 +#define TCPCB_FLAG_NS 0x100 +#define TCPCB_FLAG_RSV1 0x200 +#define TCPCB_FLAG_RSV2 0x400 +#define TCPCB_FLAG_RSV3 0x800 + __u8 sacked; /* State flags for SACK/FACK. 
*/ #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/include/net/tcp_ecn.h linux-2.4.26/include/net/tcp_ecn.h --- linux-2.4.26-orig/include/net/tcp_ecn.h 2001-11-02 17:43:26.000000000 -0800 +++ linux-2.4.26/include/net/tcp_ecn.h 2004-04-29 14:36:47.000000000 -0700 @@ -1,21 +1,52 @@ #ifndef _NET_TCP_ECN_H_ #define _NET_TCP_ECN_H_ 1 +#include #include +#include /* for random nonce support */ + +#define ECN_DEBUG 1 +/* #undef ECN_DEBUG */ + +/* DM(tp, dlvl, fmt, args...): printk a debug message when ECN_DEBUG is defined, the debuglevel set + * in /proc/sys/net/ipv4/tcp_ecn_nonce_debug is at least dlvl, and tp->ecn_flags.seen_a_nonce is set. + * No KERN_* level is added here. Wrapped in do/while(0) so that "DM(...);" is a single statement even under an unbraced if/else. */ +#ifdef ECN_DEBUG +#define DM(tp, dlvl, rest...) do { if (sysctl_tcp_ecn_nonce_debug >= (dlvl) && (tp)->ecn_flags.seen_a_nonce) printk(rest); } while (0) +#else +#define DM(tp, dlvl, rest...) do { } while (0) +#endif #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) -#define TCP_ECN_OK 1 -#define TCP_ECN_QUEUE_CWR 2 -#define TCP_ECN_DEMAND_CWR 4 +/* separated for ease of user-mode testing and profiling */ +#include + +/* consistent levels of ecn nonce support: L_REPLY: reply to + * nonces if sent, setting the NS bit appropriately; + * L_GENERATE: create nonces in packets (and + * reply); L_VERIFY: check nonce sums in acknowledgements + * for misbehavior; L_PENALIZE: reduce cwnd and disable ecn when verify + * fails. Each level represents additional processing, and + * may not be necessary in all systems. + * + * if unsure, just reply.
+ */ +#define L_REPLY 1 +#define L_GENERATE 2 +#define L_VERIFY 3 +#define L_PENALIZE 4 static __inline__ void TCP_ECN_queue_cwr(struct tcp_opt *tp) { - if (tp->ecn_flags&TCP_ECN_OK) - tp->ecn_flags |= TCP_ECN_QUEUE_CWR; + if (tp->ecn_flags.ok) { + if (!tp->ecn_flags.queue_cwr) + DM (tp, 4, "posting queue cwr, high_seq=%u\n", + tp->high_seq); + tp->ecn_flags.queue_cwr = 1; + } } - /* Output functions */ @@ -23,45 +54,104 @@ TCP_ECN_send_synack(struct tcp_opt *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; - if (!(tp->ecn_flags&TCP_ECN_OK)) + if (!(tp->ecn_flags.ok)) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; + else if (sysctl_tcp_ecn_nonce >= L_REPLY) + TCP_SKB_CB (skb)->flags |= TCPCB_FLAG_NS; } static __inline__ void TCP_ECN_send_syn(struct tcp_opt *tp, struct sk_buff *skb) { - tp->ecn_flags = 0; + memset (&tp->ecn_flags, 0, sizeof (tp->ecn_flags)); if (sysctl_tcp_ecn) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR; - tp->ecn_flags = TCP_ECN_OK; + tp->ecn_flags.ok = 1; + tp->ecn_flags.nonce_sum = 1; + tp->ecn_flags.expected_nonce_sum = 1; } } static __inline__ void TCP_ECN_make_synack(struct open_request *req, struct tcphdr *th) { - if (req->ecn_ok) + if (req->ecn_ok) { + if (sysctl_tcp_ecn == 0) + printk (KERN_INFO + "sending ecn negotiating synack while sysctl_tcp_ecn == 0\n"); th->ece = 1; + /* set ns if replying is enabled, zero otherwise */ + th->ns = (sysctl_tcp_ecn_nonce >= L_REPLY); + } } static __inline__ void -TCP_ECN_send(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb, int tcp_header_len) +TCP_ECN_send (struct sock *sk, + struct tcp_opt *tp, struct sk_buff *skb, int tcp_header_len) { - if (tp->ecn_flags & TCP_ECN_OK) { - /* Not-retransmitted data segment: set ECT and inject CWR. 
*/ - if (skb->len != tcp_header_len && - !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { - INET_ECN_xmit(sk); - if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) { - tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; + /* even if ECN is disabled due to error, continue to echo + congestion signals */ + skb->h.th->ece = tp->ecn_flags.demand_cwr; + + /* set the nonce sum in the packet header to prove + the lack of ce marks */ + skb->h.th->ns = tp->ecn_flags.nonce_sum; + + /* only do the following if ECN was negotiated, and has not been + disabled. */ + if (tp->ecn_flags.ok) { + /* ... and there's data in this packet */ + if (skb->len != tcp_header_len) { + /* ... and it's new (not a retransmit) */ + if (!before (TCP_SKB_CB (skb)->seq, tp->snd_nxt)) { + if (sysctl_tcp_ecn_nonce >= L_GENERATE) { + if (ecnn_get_random_bit ()) { + INET_ECN_xmit_n1 (sk); + tp->ecn_flags. + expected_nonce_sum ^= 1; + } else { + INET_ECN_xmit_n0 (sk); + } + if (sysctl_tcp_ecn_nonce >= L_VERIFY) { + /* disable fast path for this connection, + so that we get called to verify sums. */ + tp->pred_flags = 0; + ecnn_bind (sk, tp, + TCP_SKB_CB (skb)-> + end_seq, + tp->ecn_flags. 
+ expected_nonce_sum); + } + } else { + /* nonce generation disabled */ + INET_ECN_xmit_n0 (sk); + } + + if (tp->ecn_flags.queue_cwr) { + tp->ecn_flags.queue_cwr = 0; skb->h.th->cwr = 1; + tp->ecn_flags.cwr_ack_pending = 1; + tp->ecn_cwr_seq = + TCP_SKB_CB (skb)->end_seq; + DM (tp, 5, "sending cwr at (%u:)%u\n", + TCP_SKB_CB (skb)->seq, + tp->ecn_cwr_seq); + } + } else { + /* retransmission; verify that we're in queue_cwr or cwr_ack_pending */ + if (!tp->ecn_flags.queue_cwr + && !tp->ecn_flags.cwr_ack_pending) { + DM (tp, 2, + "unhandled non-congestion retransmission; setting cwr_ack_pending."); + tp->ecn_flags.cwr_ack_pending = 1; + tp->ecn_cwr_seq = tp->high_seq; + } + INET_ECN_dontxmit (sk); } } else { - /* ACK or retransmitted segment: clear ECT|CE */ + /* ACK: clear ECT|CE */ INET_ECN_dontxmit(sk); } - if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) - skb->h.th->ece = 1; } } @@ -71,25 +161,44 @@ TCP_ECN_accept_cwr(struct tcp_opt *tp, struct sk_buff *skb) { if (skb->h.th->cwr) - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tp->ecn_flags.demand_cwr = 0; } static __inline__ void TCP_ECN_withdraw_cwr(struct tcp_opt *tp) { - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tp->ecn_flags.demand_cwr = 0; +} + +static __inline__ void +TCP_ECN_update_nonce (struct tcp_opt *tp, struct sk_buff *skb) +{ + /* received a nonce bit in an in-order packet. incorporate + it into the nonce sum */ + /* this function should be called as rcv_nxt is advanced to + skb->end_seq */ + + /* speak up if the packet was capable... 
*/ + if (INET_ECN_is_capable (TCP_SKB_CB (skb)->flags) + && sysctl_tcp_ecn_nonce >= L_REPLY) { + tp->ecn_flags.nonce_sum ^= + INET_ECN_recover_nonce (TCP_SKB_CB (skb)->flags); + } } static __inline__ void TCP_ECN_check_ce(struct tcp_opt *tp, struct sk_buff *skb) { - if (tp->ecn_flags&TCP_ECN_OK) { - if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + if (tp->ecn_flags.ok || tp->ecn_flags.seen_a_nonce) { + if (INET_ECN_is_ce (TCP_SKB_CB (skb)->flags)) /* this segment arrived CE-marked */ + tp->ecn_flags.demand_cwr = 1; /* latch the echo request: only TCP_ECN_accept_cwr / TCP_ECN_withdraw_cwr clear it, so a later unmarked segment must not reset it */ /* Funny extension: if ECT is not set on a segment, * it is surely retransmit. It is not in ECN RFC, * but Linux follows this rule. */ - else if (!INET_ECN_is_capable((TCP_SKB_CB(skb)->flags))) + /* although all retransmits are ecn incapable (now in + * RFC), not all ecn incapable packets are + * retransmits. */ + if (!INET_ECN_is_capable (TCP_SKB_CB (skb)->flags)) tcp_enter_quickack_mode(tp); } } @@ -97,10 +206,18 @@ static __inline__ void TCP_ECN_rcv_synack(struct tcp_opt *tp, struct tcphdr *th) { - if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr)) - tp->ecn_flags &= ~TCP_ECN_OK; + if (tp->ecn_flags.ok) { + if (!th->ece || th->cwr) { + tp->ecn_flags.ok = 0; + } else { + if (sysctl_tcp_ecn == 0) + printk (KERN_INFO + "rcv synack sysctl_tcp_ecn is zero?\n"); + tp->ecn_flags.seen_a_nonce |= th->ns; + NET_INC_STATS_BH (TCPECNConnections); + } + } } - static __inline__ void TCP_ECN_rcv_syn(struct tcp_opt *tp, struct tcphdr *th) { @@ -104,22 +221,139 @@ static __inline__ void TCP_ECN_rcv_syn(struct tcp_opt *tp, struct tcphdr *th) { - if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr)) - tp->ecn_flags &= ~TCP_ECN_OK; + if (tp->ecn_flags.ok) { + if (!th->ece || !th->cwr) + tp->ecn_flags.ok = 0; + else { + if (sysctl_tcp_ecn == 0) { + printk (KERN_INFO + "rcv syn sysctl_tcp_ecn is zero?\n"); + } + NET_INC_STATS_BH (TCPECNConnections); + } + } } - static __inline__ int -TCP_ECN_rcv_ecn_echo(struct tcp_opt *tp, struct tcphdr *th) +TCP_ECN_rcv_ecn_echo
(struct tcp_opt *tp, struct tcphdr *th, struct sock *sk) { + int expected; /* nonce sum recorded for the acked sequence number, from ecnn_find() */ + /* returns 1 when the ack carries ECE or its nonce sum fails verification, 0 otherwise. connection is not capable or not ready to receive + ece */ + if (!tp->ecn_flags.ok || th->syn) + return 0; + /* track when a nonce is seen to determine penalty + for incorrect nonce */ + /* XXX: should be more prominent, or removed given + negotiation */ + tp->ecn_flags.seen_a_nonce |= th->ns; + /* packet shows ece */ + if (th->ece) return 1; + /* kernel configured not to check, or we've received a congestion + * signal, but haven't even sent the cwr */ + if (sysctl_tcp_ecn_nonce < L_VERIFY || tp->ecn_flags.queue_cwr) + return 0; + + /* the next if statement is optional; the '!= + * established' is an otherwise redundant shortcut + * that makes established connections (the common + * case) bypass the rest of the checks. */ + if (sk->state != TCP_ESTABLISHED && + (sk->state == TCP_LAST_ACK || + sk->state == TCP_FIN_WAIT1 || + sk->state == TCP_CLOSING || sk->state == TCP_FIN_WAIT2)) { + /* we don't need to verify the nonce sum, + * the connection's over; this doesn't + * include close_wait, as receiving a fin + * doesn't mean we're not sending data */ + DM (tp, 6, + "connection completed with ECN state 0x%x\n", sk->state); + return 0; + } + expected = ecnn_find (tp, ntohl (th->ack_seq)); + /* should ignore nonces while waiting for cwr ack (or pmtu ack) */ + if (tp->ecn_flags.cwr_ack_pending) { + DM (tp, 11, "cwr_seq %u high_seq %u ack %u\n", + tp->ecn_cwr_seq, tp->high_seq, ntohl (th->ack_seq)); + if (!after (tp->ecn_cwr_seq, ntohl (th->ack_seq))) { /* cwr_seq <= ack in sequence space; a plain <= breaks when sequence numbers wrap */ + /* don't need to check not yet ack + * of cwr, but do need to set + * nonce_synch */ + tp->ecn_flags.cwr_ack_pending = 0; + tp->ecn_flags.nonce_synch = th->ns ^ expected; + DM (tp, 5, + "ECN-Nonce resync: %d = seen %d ^ expected %d at %u\n", + tp->ecn_flags.nonce_synch, th->ns, + expected, ntohl (th->ack_seq)); + } + /* else, we're still waiting; odd that the + * other end stopped sending
ECE, but not + * evil. */ return 0; + } else if ((th->ns ^ tp->ecn_flags.nonce_synch) == expected) { + /* verified the correct nonce sum */ + return 0; + } else { + /* ece is zero, but couldn't be verified. */ + DM (tp, 1, + "ECN-Nonce fail: ack %u expected %u(rightedge-%u) saw %u sync %u\n", + htonl (th->ack_seq), + expected, + tp->ecn_flags.expected_nonce_sum, + th->ns, tp->ecn_flags.nonce_synch); + + NET_INC_STATS_BH (TCPECNNonceFailures); + if (sysctl_tcp_ecn_nonce >= L_PENALIZE) { + DM (tp, 0, + "disabled ecn after incorrect nonce sum; state %s/%d\n", + ((sk->state == 1) ? "ESTABLISHED" : ""), sk->state); + /* mistrust cwnd, so redo from cwnd=1, + without slow-start. */ + tp->ecn_flags.ok = 0; + INET_ECN_dontxmit (sk); /* fix tos now. */ + if (tp->ecn_flags.seen_a_nonce) { + tp->snd_cwnd = 1; + tp->snd_ssthresh = 2; + } + } + return 1; + } +} + +/* small packets in the receive queue may be merged to save + space. this appears to happen in real life, but this code + has not been tested. */ +static __inline__ void +TCP_ECN_nonce_collapse (struct sk_buff *skb, struct sk_buff *skb_next) +{ + int newnonce; + if (sysctl_tcp_ecn_nonce_debug >= 2) + printk (KERN_INFO "collapsing %u with %u at %u to %u\n", + INET_ECN_recover_nonce (TCP_SKB_CB (skb)->flags), + INET_ECN_recover_nonce (TCP_SKB_CB (skb_next)->flags), + TCP_SKB_CB (skb)->seq, TCP_SKB_CB (skb_next)->end_seq); + + newnonce = INET_ECN_recover_nonce (TCP_SKB_CB (skb)->flags) + ^ INET_ECN_recover_nonce (TCP_SKB_CB (skb_next)->flags); + + /* clear the stored nonce field */ + TCP_SKB_CB (skb)->flags &= ~0x3; + /* set the stored nonce field to 01 if 1, 10 if 0 */ + TCP_SKB_CB (skb)->flags |= newnonce ? 0x1 : 0x2; + /* skb->ecn_ns_expected = skb_next->ecn_ns_expected; */ + if (sysctl_tcp_ecn_nonce_debug >= 3) + printk (KERN_INFO "... 
to yield %u\n", + INET_ECN_recover_nonce (TCP_SKB_CB (skb)->flags)); } static __inline__ void TCP_ECN_openreq_child(struct tcp_opt *tp, struct open_request *req) { - tp->ecn_flags = req->ecn_ok ? TCP_ECN_OK : 0; + tp->ecn_flags.ok = req->ecn_ok; + if (sysctl_tcp_ecn_nonce >= L_REPLY) { + tp->ecn_flags.nonce_sum = + tp->ecn_flags.expected_nonce_sum = req->ecn_ok; + } } static __inline__ void @@ -129,4 +363,11 @@ req->ecn_ok = 1; } +static __inline__ void +TCP_ECN_pmtu_disc_response (struct tcp_opt *tp) +{ + /* pretend we're sending cwr */ + tp->ecn_flags.cwr_ack_pending = 1; + tp->ecn_cwr_seq = tp->high_seq; +} #endif diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/include/net/tcp_nonce.h linux-2.4.26/include/net/tcp_nonce.h --- linux-2.4.26-orig/include/net/tcp_nonce.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.26/include/net/tcp_nonce.h 2004-04-29 14:36:47.000000000 -0700 @@ -0,0 +1,128 @@ +#ifndef _NET_TCP_NONCE_H_ +#define _NET_TCP_NONCE_H_ 1 + +#include + +/* inline routines and data structures for managing the + * queue of expected ECN-nonce sums. This is a separate + * file for testing and to separate expected-nonce data + * structures. Neil Spring nspring@cs.washington.edu */ + +/* defined in tcp.c; handles cleaning up dynamically + * allocated expected nonce sum bindings */ +extern void TCP_ECN_destructor (struct sock *sk); + +/* the following structure could be compacted to two 32-bit + * words by using a 16-bit delta for the sequence number. */ +struct ecn_expected_ns_binding { + struct ecn_expected_ns_binding *next; + unsigned int seq; + __u8 expected_ns:1; +}; + +/* bind a sequence number to the nonce sum expected when + that sequence number is ack'd. 
*/ +static __inline__ void +ecnn_bind (struct sock *sk, struct tcp_opt *tp, unsigned int seq, __u8 expected) +{ + struct ecn_expected_ns_binding *p; + if (tp->ecn_ensq.last) { + /* tail of the queue serves as a free list */ + p = tp->ecn_ensq.last->next; + if (p == NULL) { + p = sock_kmalloc (sk, + sizeof (struct + ecn_expected_ns_binding), + GFP_ATOMIC); + if (p == NULL) + goto alloc_failure; + p->next = NULL; + tp->ecn_ensq.last->next = p; + } + tp->ecn_ensq.last = p; + } else { + /* first binding */ + tp->ecn_ensq.first = tp->ecn_ensq.last = p = + sock_kmalloc (sk, sizeof (struct ecn_expected_ns_binding), + GFP_ATOMIC); + if (p == NULL) + goto alloc_failure; + /* setup destructor for this list */ + if (!tp->prev_destruct) { + tp->prev_destruct = sk->destruct; + sk->destruct = TCP_ECN_destructor; + } + p->next = NULL; + } + p->seq = seq; + p->expected_ns = expected; + return; + + /* exception-like section to reduce (inlined) function length */ + alloc_failure: + printk (KERN_ERR + "out of socket optmem allocating a nonce binding;\n"); + tp->ecn_flags.ok = 0; + return; +} + +/* recover the expected nonce sum for an ack'd sequence number */ +static __inline__ __u8 +ecnn_find (struct tcp_opt *tp, unsigned int seq) +{ + struct ecn_expected_ns_binding *p; + if (tp->ecn_ensq.first != NULL) { + for (p = tp->ecn_ensq.first; + before (p->seq, seq) && + tp->ecn_ensq.first != tp->ecn_ensq.last; + p = tp->ecn_ensq.first) { + /* advance first */ + tp->ecn_ensq.first = p->next; + /* insert unused one at tail (a free list) */ + p->next = tp->ecn_ensq.last->next; + tp->ecn_ensq.last->next = p; + } + if (p->seq == seq) { + /* current, keep around for no reason. */ + return (p->expected_ns); + } + } + /* if we get here, it's probably because we haven't + sent any data yet; from the initial handshake. + Otherwise, it's because the receiver decided to + ack a partial segment. 
*/ + return (tp->ecn_flags.expected_nonce_sum); +} + +/* from http://www.ciphersbyritter.com/NEWS4/RANDC.HTM */ +/* written by George Marsaglia, + * as posted to sci.stat.math,sci.math + * cross posted at http://www.tux.org/hypermail/linux-kernel/1999week04/1472.html + * */ +#define ecnn_znew (z=36969*(z&65535)+(z>>16)) +#define ecnn_wnew (w=18000*(w&65535)+(w>>16)) +#define ecnn_MWC ((ecnn_znew<<16)+ecnn_wnew ) +/* end excerpt */ + +static __inline__ __u8 +ecnn_get_random_bit (void) +{ + /* returns one pseudo-random bit (0 or 1). q buffers bits of an MWC word below a sentinel bit, so leading zero bits are handed out too instead of being skipped (the q == 0 test used before skipped them and favoured 1's about 52% of the time). */ + static unsigned int z, w, q; + __u8 retbit; + if (z == 0) { + /* seed the cheap random number generator from the kernel */ + get_random_bytes (&z, sizeof (z)); + get_random_bytes (&w, sizeof (w)); + } + if (q <= 1) { + /* buffer is empty (never filled, or only the sentinel is left): grab a cheap random word and mark its end with bit 31, leaving 31 usable bits */ + q = ecnn_MWC | 0x80000000U; + } + /* pull the random bit of the least significant side */ + retbit = q & 0x1; + q >>= 1; + return (retbit); +} + +#endif diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/net/ipv4/ip_fragment.c linux-2.4.26/net/ipv4/ip_fragment.c --- linux-2.4.26-orig/net/ipv4/ip_fragment.c 2003-06-13 07:51:39.000000000 -0700 +++ linux-2.4.26/net/ipv4/ip_fragment.c 2004-04-29 14:12:50.000000000 -0700 @@ -494,7 +494,7 @@ } else { struct sk_buff *free_it = next; - /* Old fragmnet is completely overridden with + /* Old fragment is completely overridden with * new one drop it.
*/ next = next->next; diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/net/ipv4/proc.c linux-2.4.26/net/ipv4/proc.c --- linux-2.4.26-orig/net/ipv4/proc.c 2003-06-13 07:51:39.000000000 -0700 +++ linux-2.4.26/net/ipv4/proc.c 2004-04-29 14:12:50.000000000 -0700 @@ -200,7 +200,8 @@ " TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv" " TCPAbortOnSyn TCPAbortOnData TCPAbortOnClose" " TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger" - " TCPAbortFailed TCPMemoryPressures\n" + " TCPAbortFailed TCPMemoryPressures" + " TCPECNConnections TCPECNNonceFailures\n" "TcpExt:"); for (i=0; iecn_flags&TCP_ECN_OK) + if (tp->ecn_flags.ok) info.tcpi_options |= TCPI_OPT_ECN; info.tcpi_rto = (1000000*tp->rto)/HZ; @@ -2647,3 +2647,21 @@ (void) tcp_mib_init(); tcpdiag_init(); } + + +#ifdef CONFIG_INET_ECN +/* socket destructor installed by ecnn_bind(): frees every expected-nonce binding reachable from tp->ecn_ensq.first + (the free-list nodes are chained after ecn_ensq.last, so they are on the same chain), then calls the destructor it replaced, if any */ +void TCP_ECN_destructor(struct sock *sk) { + struct ecn_expected_ns_binding *p; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + while((p=tp->ecn_ensq.first)!=NULL) { + tp->ecn_ensq.first=p->next; /* unlink first: p must not be dereferenced once it has been freed */ + sock_kfree_s(sk,p,sizeof(struct ecn_expected_ns_binding)); + } + tp->ecn_ensq.last=NULL; /* first is already NULL here; do not leave last pointing at freed memory */ + if(tp->prev_destruct!=NULL) { + (*(tp->prev_destruct))(sk); + } +} +#endif diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/net/ipv4/tcp_diag.c linux-2.4.26/net/ipv4/tcp_diag.c --- linux-2.4.26-orig/net/ipv4/tcp_diag.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.26/net/ipv4/tcp_diag.c 2004-04-29 14:14:25.000000000 -0700 @@ -156,7 +156,7 @@
info->tcpi_rcv_wscale = 0; } #ifdef CONFIG_INET_ECN - if (tp->ecn_flags&TCP_ECN_OK) + if (tp->ecn_flags.ok) info->tcpi_options |= TCPI_OPT_ECN; #endif diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/net/ipv4/tcp_input.c linux-2.4.26/net/ipv4/tcp_input.c --- linux-2.4.26-orig/net/ipv4/tcp_input.c 2004-04-14 06:05:41.000000000 -0700 +++ linux-2.4.26/net/ipv4/tcp_input.c 2004-04-29 14:12:50.000000000 -0700 @@ -78,9 +78,14 @@ int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; #ifdef CONFIG_INET_ECN int sysctl_tcp_ecn = 1; +int sysctl_tcp_ecn_nonce = 4; +int sysctl_tcp_ecn_nonce_debug = 0; #else int sysctl_tcp_ecn = 0; +int sysctl_tcp_ecn_nonce = 0; +int sysctl_tcp_ecn_nonce_debug = 0; #endif + int sysctl_tcp_dsack = 1; int sysctl_tcp_app_win = 31; int sysctl_tcp_adv_win_scale = 2; @@ -2305,8 +2310,7 @@ if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - - if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) + if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th, sk)) flag |= FLAG_ECE; tcp_westwood_slow_bw(sk, skb); @@ -2885,6 +2889,7 @@ __skb_unlink(skb, skb->list); __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + TCP_ECN_update_nonce( tp, skb ); if(skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); } @@ -2956,6 +2961,7 @@ __skb_queue_tail(&sk->receive_queue, skb); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + TCP_ECN_update_nonce( tp, skb ); if(skb->len) tcp_event_data_recv(sk, tp, skb); if(th->fin) @@ -3608,7 +3614,7 @@ tp->saw_tstamp = 0; - /* pred_flags is 0xS?10 << 16 + snd_wnd + /* pred_flags is (0xS?10 << 16) + snd_wnd * if header_predition is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' 
will be 0 for the fast path, otherwise pred_flags is 0 to @@ -3698,6 +3704,7 @@ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; NET_INC_STATS_BH(TCPHPHitsToUser); eaten = 1; + TCP_ECN_update_nonce(tp,skb); } } if (!eaten) { @@ -3723,6 +3730,7 @@ __skb_queue_tail(&sk->receive_queue, skb); tcp_set_owner_r(skb, sk); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + TCP_ECN_update_nonce(tp,skb); } tcp_event_data_recv(sk, tp, skb); diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/net/ipv4/tcp_ipv4.c linux-2.4.26/net/ipv4/tcp_ipv4.c --- linux-2.4.26-orig/net/ipv4/tcp_ipv4.c 2004-04-14 06:05:41.000000000 -0700 +++ linux-2.4.26/net/ipv4/tcp_ipv4.c 2004-04-29 14:12:50.000000000 -0700 @@ -961,7 +961,11 @@ * discovery. */ tcp_simple_retransmit(sk); + + /* account for loss of ECN-nonce synch */ + TCP_ECN_pmtu_disc_response(tp); } /* else let the usual retransmit timer handle it */ + } /* diff --unified --exclude='stamp-*' --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h --new-file linux-2.4.26-orig/net/netsyms.c linux-2.4.26/net/netsyms.c --- linux-2.4.26-orig/net/netsyms.c 2004-04-14 06:05:41.000000000 -0700 +++ linux-2.4.26/net/netsyms.c 2004-04-29 14:12:50.000000000 -0700 @@ -392,6 +392,8 @@ EXPORT_SYMBOL(sysctl_tcp_rmem); EXPORT_SYMBOL(sysctl_tcp_wmem); EXPORT_SYMBOL(sysctl_tcp_ecn); +EXPORT_SYMBOL(sysctl_tcp_ecn_nonce); +EXPORT_SYMBOL(sysctl_tcp_ecn_nonce_debug); EXPORT_SYMBOL(tcp_cwnd_application_limited); EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(sysctl_tcp_low_latency);