diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/include/linux/sysctl.h linux/include/linux/sysctl.h --- linux-2.4.10/include/linux/sysctl.h Sun Sep 23 10:31:02 2001 +++ linux/include/linux/sysctl.h Fri Oct 5 00:25:47 2001 @@ -285,7 +285,9 @@ NET_TCP_ADV_WIN_SCALE=87, NET_IPV4_NONLOCAL_BIND=88, NET_IPV4_ICMP_RATELIMIT=89, - NET_IPV4_ICMP_RATEMASK=90 + NET_IPV4_ICMP_RATEMASK=90, + NET_TCP_ECN_NONCE=91, + NET_TCP_ECN_NONCE_DEBUG=92, }; enum { diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/include/linux/tcp.h linux/include/linux/tcp.h --- linux-2.4.10/include/linux/tcp.h Sun Sep 23 10:31:11 2001 +++ linux/include/linux/tcp.h Fri Oct 5 00:26:50 2001 @@ -26,7 +26,8 @@ __u32 seq; __u32 ack_seq; #if defined(__LITTLE_ENDIAN_BITFIELD) - __u16 res1:4, + __u16 ns:1, + res1:3, doff:4, fin:1, syn:1, @@ -38,7 +39,8 @@ cwr:1; #elif defined(__BIG_ENDIAN_BITFIELD) __u16 doff:4, - res1:4, + res1:3, + ns:1, cwr:1, ece:1, urg:1, @@ -102,6 +104,7 @@ #define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) enum { + TCP_FLAG_NS = __constant_htonl(0x01000000), TCP_FLAG_CWR = __constant_htonl(0x00800000), TCP_FLAG_ECE = __constant_htonl(0x00400000), TCP_FLAG_URG = __constant_htonl(0x00200000), diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I 
$Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/include/net/inet_ecn.h linux/include/net/inet_ecn.h --- linux-2.4.10/include/net/inet_ecn.h Sun Sep 23 10:31:11 2001 +++ linux/include/net/inet_ecn.h Fri Oct 5 00:27:53 2001 @@ -2,22 +2,29 @@ #define _INET_ECN_H_ #include +#include #ifdef CONFIG_INET_ECN static inline int INET_ECN_is_ce(__u8 dsfield) { - return (dsfield&3) == 3; + return ((dsfield&3) == 3); } static inline int INET_ECN_is_not_ce(__u8 dsfield) { - return (dsfield&3) == 2; + return (((dsfield&3) == 2) || ((dsfield&3) == 1)); } static inline int INET_ECN_is_capable(__u8 dsfield) { - return (dsfield&2); + /* returns nonzero (1,2,3) if capable, 0 if not */ + return (dsfield&3); +} + +static inline int INET_ECN_recover_nonce(__u8 dsfield) +{ + return(dsfield&1); /* ECT(1) if 01, ECT(0) if 10. */ } static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner) @@ -29,6 +36,13 @@ } #define INET_ECN_xmit(sk) do { (sk)->protinfo.af_inet.tos |= 2; } while (0) + +/* bit clearing is necessary to alternate between ECT's. */ +#define INET_ECN_xmit_n0(sk) do { (sk)->protinfo.af_inet.tos |= 2; \ + (sk)->protinfo.af_inet.tos &= ~1; } while (0) +#define INET_ECN_xmit_n1(sk) do { (sk)->protinfo.af_inet.tos |= 1; \ + (sk)->protinfo.af_inet.tos &= ~2; } while (0) + #define INET_ECN_dontxmit(sk) do { (sk)->protinfo.af_inet.tos &= ~3; } while (0) #define IP6_ECN_flow_init(label) do { \ @@ -45,6 +58,7 @@ #define INET_ECN_is_ce(x...) (0) #define INET_ECN_is_not_ce(x...) (0) #define INET_ECN_is_capable(x...) (0) +#define INET_ECN_recover_nonce(x...) (0) #define INET_ECN_encapsulate(x, y) (x) #define IP6_ECN_flow_init(x...) do { } while (0) #define IP6_ECN_flow_xmit(x...) 
do { } while (0) @@ -55,16 +69,24 @@ static inline void IP_ECN_set_ce(struct iphdr *iph) { u32 check = iph->check; + __u8 dsfield = ipv4_get_dsfield(iph); + if ((dsfield&3) == 2) { + /* RFC2481 ECT(0) path */ check += __constant_htons(0xFFFE); - iph->check = check + (check>=0xFFFF); iph->tos |= 1; + } else if((dsfield&3) == 1) { + /* RFC3xxx ECT(1) path */ + check += __constant_htons(0xFFFD); + iph->tos |= 2; + } + iph->check = check + (check>=0xFFFF); } struct ipv6hdr; static inline void IP6_ECN_set_ce(struct ipv6hdr *iph) { - *(u32*)iph |= htonl(1<<20); + *(u32*)iph |= htonl(3<<20); } #define ip6_get_dsfield(iph) ((ntohs(*(u16*)(iph)) >> 4) & 0xFF) diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/include/net/snmp.h linux/include/net/snmp.h --- linux-2.4.10/include/net/snmp.h Sun Sep 23 10:31:11 2001 +++ linux/include/net/snmp.h Fri Oct 5 00:26:50 2001 @@ -256,6 +256,8 @@ unsigned long TCPAbortOnLinger; unsigned long TCPAbortFailed; unsigned long TCPMemoryPressures; + unsigned long TCPECNConnections; + unsigned long TCPECNNonceFailures; unsigned long __pad[0]; } ____cacheline_aligned; diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/include/net/sock.h linux/include/net/sock.h --- linux-2.4.10/include/net/sock.h Sun Sep 23 10:31:33 2001 +++ linux/include/net/sock.h Fri Oct 5 00:26:53 2001 @@ -376,8 +377,25 @@ __u16 advmss; /* Advertised MSS */ __u8 
syn_retries; /* num of allowed syn retries */ - __u8 ecn_flags; /* ECN status bits. */ + struct { + __u8 ok:1, /* ECN was negotiated */ + queue_cwr:1, /* will send CWR on next new segment */ + demand_cwr:1, /* will send ECE until next CWR */ + + nonce_synch:1, /* nonce was out of synch last time */ + nonce_sum:1, /* current nonce sum for acknowledgement */ + expected_nonce_sum:1, /* nonce sum expected from last segment sent */ + cwr_ack_pending:1, /* are we waiting for ack-of-cwr? (ecn_cwr_seq is valid) */ + seen_a_nonce:1; /* does the other endpoint support the ecn-nonce? */ + } ecn_flags; /* ECN status bits. */ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ + /* begin nonce support: */ + __u32 ecn_cwr_seq; /* seq num at last cwr */ + struct { + struct ecn_expected_ns_binding *first,*last; + } ecn_ensq; + void (*prev_destruct)(struct sock *sk); + /* end nonce support */ __u32 lost_out; /* Lost packets */ __u32 sacked_out; /* SACK'd packets */ __u32 fackets_out; /* FACK'd packets */ diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/include/net/tcp.h linux/include/net/tcp.h --- linux-2.4.10/include/net/tcp.h Sun Sep 23 10:31:58 2001 +++ linux/include/net/tcp.h Fri Oct 5 00:27:54 2001 @@ -454,6 +454,8 @@ extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; extern int sysctl_tcp_ecn; +extern int sysctl_tcp_ecn_nonce; +extern int sysctl_tcp_ecn_nonce_debug; extern int sysctl_tcp_dsack; extern int sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; @@ -977,6 +979,10 @@ */ #define tcp_time_stamp ((__u32)(jiffies)) +/* increased an 8 bit to a 16 bit, then added another 8 bit, + so ns increased by four bytes, I think, but the skbuff + already 
allocates 48 bytes, so am safe? */ + /* This is what the send packet queueing engine uses to pass * TCP per-packet control information to the transmission * code. We also store the host-order sequence numbers in @@ -994,8 +1000,11 @@ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ __u32 when; /* used to compute rtt's */ - __u8 flags; /* TCP header flags. */ + __u16 flags; /* TCP header flags. */ + + /* extended the 'flags byte' to a 'flags short' to support using the + four remaining bits between header length and flags */ /* NOTE: These must match up to the flags byte in a * real TCP header. */ @@ -1007,6 +1016,11 @@ #define TCPCB_FLAG_URG 0x20 #define TCPCB_FLAG_ECE 0x40 #define TCPCB_FLAG_CWR 0x80 + +#define TCPCB_FLAG_NS 0x100 +#define TCPCB_FLAG_RSV1 0x200 +#define TCPCB_FLAG_RSV2 0x400 +#define TCPCB_FLAG_RSV3 0x800 __u8 sacked; /* State flags for SACK/FACK. */ #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/include/net/tcp_ecn.h linux/include/net/tcp_ecn.h --- linux-2.4.10/include/net/tcp_ecn.h Sun Sep 23 10:31:17 2001 +++ linux/include/net/tcp_ecn.h Fri Oct 5 00:27:54 2001 @@ -5,21 +7,51 @@ #ifdef CONFIG_INET_ECN +#include #include +#include /* for random nonce support */ + #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)|TCP_FLAG_ECE|TCP_FLAG_CWR) -#define TCP_ECN_OK 1 -#define TCP_ECN_QUEUE_CWR 2 -#define TCP_ECN_DEMAND_CWR 4 +#define ECN_DEBUG 1 +/* #undef ECN_DEBUG */ + +/* fire off a KERN_INFO debug message if the debuglevel set + * in /proc/sys/net/ipv4/tcp_ecn_nonce_debug is high + * enough and ECN_DEBUG is defined. 
*/ +#ifdef ECN_DEBUG +#define DM(tp, dlvl, rest...) do { if (sysctl_tcp_ecn_nonce_debug >= (dlvl) && (tp)->ecn_flags.seen_a_nonce) printk(KERN_INFO rest); } while(0) /* one statement: safe in an unbraced if/else */ +#else +#define DM(...) do { } while(0) +#endif + +/* separated for ease of user-mode testing and profiling */ +#include + +/* consistent levels of ecn nonce support: L_REPLY: reply to + * nonces if sent; L_GENERATE: create nonces in packets (and + * reply); L_VERIFY: check nonce sums in acknowledgements + * for misbehavior; L_PENALIZE: reduce cwnd when verify + * fails. Each level represents additional processing, and + * may not be necessary in all systems. + * + * if unsure, just reply. + */ +#define L_REPLY 1 +#define L_GENERATE 2 +#define L_VERIFY 3 +#define L_PENALIZE 4 static __inline__ void TCP_ECN_queue_cwr(struct tcp_opt *tp) { - if (tp->ecn_flags&TCP_ECN_OK) - tp->ecn_flags |= TCP_ECN_QUEUE_CWR; + if (tp->ecn_flags.ok) { + if(!tp->ecn_flags.queue_cwr) + DM (tp, 4, "posting queue cwr, high_seq=%u\n", tp->high_seq); + tp->ecn_flags.queue_cwr = 1; + } } - /* Output functions */ @@ -27,45 +59,92 @@ TCP_ECN_send_synack(struct tcp_opt *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; - if (!(tp->ecn_flags&TCP_ECN_OK)) + if (!(tp->ecn_flags.ok)) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; + else if (sysctl_tcp_ecn_nonce >= L_REPLY) + TCP_SKB_CB (skb)->flags |= TCPCB_FLAG_NS; } static __inline__ void TCP_ECN_send_syn(struct tcp_opt *tp, struct sk_buff *skb) { - tp->ecn_flags = 0; + memset (&tp->ecn_flags, 0, sizeof (tp->ecn_flags)); if (sysctl_tcp_ecn) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR; - tp->ecn_flags = TCP_ECN_OK; + tp->ecn_flags.ok = 1; + tp->ecn_flags.nonce_sum = 1; + tp->ecn_flags.expected_nonce_sum = 1; } } static __inline__ void TCP_ECN_make_synack(struct open_request *req, struct tcphdr *th) { - if (req->ecn_ok) + if (req->ecn_ok) { th->ece = 1; + /* set ns if replying is enabled, zero otherwise */ + th->ns = (sysctl_tcp_ecn_nonce >= L_REPLY); + } } static 
__inline__ void -TCP_ECN_send(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb, int tcp_header_len) +TCP_ECN_send (struct sock *sk, + struct tcp_opt *tp, struct sk_buff *skb, int tcp_header_len) { - if (tp->ecn_flags & TCP_ECN_OK) { - /* Not-retransmitted data segment: set ECT and inject CWR. */ - if (skb->len != tcp_header_len && - !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { - INET_ECN_xmit(sk); - if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) { - tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; + /* even if ECN is disabled due to error, continue to echo + congestion signals */ + skb->h.th->ece = tp->ecn_flags.demand_cwr; + + /* set the nonce sum in the packet header to prove + the lack of ce marks */ + skb->h.th->ns = tp->ecn_flags.nonce_sum; + + /* only do the following if ECN was negotiated, and has not been + disabled. */ + if (tp->ecn_flags.ok) { + /* ... and there's new data in this packet */ + if( skb->len != tcp_header_len ) { + if( !before (TCP_SKB_CB (skb)->seq, tp->snd_nxt)) { + if (sysctl_tcp_ecn_nonce >= L_GENERATE) { + if (ecnn_get_random_bit ()) { + INET_ECN_xmit_n1 (sk); + tp->ecn_flags.expected_nonce_sum ^= 1; + } else { + INET_ECN_xmit_n0 (sk); + } + if (sysctl_tcp_ecn_nonce >= L_VERIFY) { + /* disable fast path for this connection, + so that we get called to verify bits. 
*/ + tp->pred_flags = 0; + ecnn_bind (sk, tp, + TCP_SKB_CB (skb)->end_seq, + tp->ecn_flags.expected_nonce_sum); + } + } else { + /* nonce generation disabled */ + INET_ECN_xmit_n0 (sk); + } + + if (tp->ecn_flags.queue_cwr) { + tp->ecn_flags.queue_cwr = 0; skb->h.th->cwr = 1; + tp->ecn_flags.cwr_ack_pending = 1; + tp->ecn_cwr_seq = TCP_SKB_CB (skb)->end_seq; + DM (tp, 5, "sending cwr at (%u:)%u\n", TCP_SKB_CB(skb)->seq, tp->ecn_cwr_seq); + } + } else { + /* retransmission; verify that we're in queue_cwr or cwr_ack_pending */ + if(!tp->ecn_flags.queue_cwr && !tp->ecn_flags.cwr_ack_pending) { + DM(tp,2, "unhandled non-congestion retransmission; setting cwr_ack_pending."); + tp->ecn_flags.cwr_ack_pending = 1; + tp->ecn_cwr_seq = tp->high_seq; + } + INET_ECN_dontxmit (sk); } } else { - /* ACK or retransmitted segment: clear ECT|CE */ + /* ACK: clear ECT|CE */ INET_ECN_dontxmit(sk); } - if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) - skb->h.th->ece = 1; } } @@ -75,25 +154,44 @@ TCP_ECN_accept_cwr(struct tcp_opt *tp, struct sk_buff *skb) { if (skb->h.th->cwr) - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tp->ecn_flags.demand_cwr = 0; } static __inline__ void TCP_ECN_withdraw_cwr(struct tcp_opt *tp) { - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tp->ecn_flags.demand_cwr = 0; +} + +static __inline__ void +TCP_ECN_update_nonce (struct tcp_opt *tp, struct sk_buff *skb) +{ + /* received a nonce bit in an in-order packet. incorporate + it into the nonce sum */ + /* this function should be called as rcv_nxt is advanced to + skb->end_seq */ + + /* speak up if the packet was capable... 
*/ + if (INET_ECN_is_capable (TCP_SKB_CB (skb)->flags) + && sysctl_tcp_ecn_nonce >= L_REPLY) { + tp->ecn_flags.nonce_sum ^= + INET_ECN_recover_nonce (TCP_SKB_CB (skb)->flags); + } } static __inline__ void TCP_ECN_check_ce(struct tcp_opt *tp, struct sk_buff *skb) { - if (tp->ecn_flags&TCP_ECN_OK) { - if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + if (tp->ecn_flags.ok || tp->ecn_flags.seen_a_nonce) { + tp->ecn_flags.demand_cwr = + INET_ECN_is_ce (TCP_SKB_CB (skb)->flags); /* Funny extension: if ECT is not set on a segment, * it is surely retransmit. It is not in ECN RFC, * but Linux follows this rule. */ - else if (!INET_ECN_is_capable((TCP_SKB_CB(skb)->flags))) + /* although all retransmits are ecn incapable (now in + * RFC), not all ecn incapable packets are + * retransmits. */ + if (!INET_ECN_is_capable (TCP_SKB_CB (skb)->flags)) tcp_enter_quickack_mode(tp); } } @@ -101,10 +199,16 @@ static __inline__ void TCP_ECN_rcv_synack(struct tcp_opt *tp, struct tcphdr *th) { - if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr)) - tp->ecn_flags &= ~TCP_ECN_OK; + if ((tp->ecn_flags.ok) && (!th->ece || th->cwr)) + tp->ecn_flags.ok = 0; + else { + if (sysctl_tcp_ecn == 0) { + printk(KERN_ERR "rcv synack sysctl_tcp_ecn is zero?\n"); + } + tp->ecn_flags.seen_a_nonce |= th->ns; + NET_INC_STATS_BH(TCPECNConnections); + } } - static __inline__ void TCP_ECN_rcv_syn(struct tcp_opt *tp, struct tcphdr *th) { @@ -108,22 +212,145 @@ static __inline__ void TCP_ECN_rcv_syn(struct tcp_opt *tp, struct tcphdr *th) { - if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr)) - tp->ecn_flags &= ~TCP_ECN_OK; + if (tp->ecn_flags.ok) { + if(!th->ece || !th->cwr) + tp->ecn_flags.ok = 0; + else { + if (sysctl_tcp_ecn == 0) { + printk(KERN_ERR "sysctl_tcp_ecn is zero, but not.\n"); + } + NET_INC_STATS_BH(TCPECNConnections); + } + } } - static __inline__ int -TCP_ECN_rcv_ecn_echo(struct tcp_opt *tp, struct tcphdr *th) +TCP_ECN_rcv_ecn_echo (struct tcp_opt 
*tp, struct tcphdr *th, struct sock *sk) { - if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK)) + int expected; + /* connection is not capable or not ready to receive + ece */ + if (!tp->ecn_flags.ok || th->syn) + return 0; + /* track when a nonce is seen to determine penalty + for incorrect nonce */ + /* XXX: should be more prominent, or removed given + negotiation */ + tp->ecn_flags.seen_a_nonce |= th->ns; + /* packet shows ece */ + if (th->ece) return 1; + /* kernel configured not to check, or we've received a congestion + * signal, but haven't even sent the cwr */ + if (sysctl_tcp_ecn_nonce < L_VERIFY || tp->ecn_flags.queue_cwr) return 0; + + /* the next if statement is optional; the '!= + * established' is an otherwise redundant shortcut + * that makes established connections (the common + * case) bypass the rest of the checks. */ + if (sk->state != TCP_ESTABLISHED && + (sk->state == TCP_LAST_ACK || + sk->state == TCP_FIN_WAIT1 || + sk->state == TCP_CLOSING || sk->state == TCP_FIN_WAIT2)) { + /* we don't need to verify the nonce sum, + * the connection's over; this doesn't + * include close_wait, as receiving a fin + * doesn't mean we're not sending data */ + DM (tp, 6, + "connection completed with ECN state 0x%x\n", sk->state); + return 0; + } + expected = ecnn_find (tp, htonl (th->ack_seq)); + /* should ignore nonces while waiting for cwr ack (or pmtu ack) */ + if (tp->ecn_flags.cwr_ack_pending) { + DM(tp,11, "cwr_seq %u high_seq %u ack %u\n", + tp->ecn_cwr_seq, + tp->high_seq, + htonl(th->ack_seq)); +#if 1 + if (!after (tp->ecn_cwr_seq, htonl (th->ack_seq))) { /* "cwr_seq <= ack", safe across sequence-number wrap */ +#else + /* todo: figure out why the +1 is necessary, if it works (it doesn't) */ + if (tp->high_seq + 1 < htonl (th->ack_seq)) { +#endif + /* don't need to check not yet ack + * of cwr, but do need to set + * nonce_synch */ + tp->ecn_flags.cwr_ack_pending = 0; + tp->ecn_flags.nonce_synch = th->ns ^ expected; + DM (tp, 5, + "ECN-Nonce resync: %d = seen %d ^ expected %d at %u\n", + 
tp->ecn_flags.nonce_synch, th->ns, + expected, htonl (th->ack_seq)); + } + /* else, we're still waiting; odd that the + * other end stopped sending ECE, but not + * evil. */ + return 0; + } else if ((th->ns ^ tp->ecn_flags.nonce_synch) == expected) { + /* verified the correct nonce sum */ + return 0; + } else { + /* ece is zero, but couldn't be verified. */ + DM (tp, 1, + "ECN-Nonce fail: ack %u expected %u(rightedge-%u) saw %u sync %u\n", + htonl (th->ack_seq), + expected, + tp->ecn_flags.expected_nonce_sum, + th->ns, tp->ecn_flags.nonce_synch); + + NET_INC_STATS_BH(TCPECNNonceFailures); + if (sysctl_tcp_ecn_nonce >= L_PENALIZE) { + DM (tp, 0, + "disabled ecn after incorrect nonce sum; state %s/%d\n", + ((sk->state==1) ? "ESTABLISHED" : ""), sk->state); + /* mistrust cwnd, so redo from cwnd=1, + without slow-start. */ + tp->ecn_flags.ok = 0; + INET_ECN_dontxmit (sk); /* fix tos now. */ + if (tp->ecn_flags.seen_a_nonce) { + tp->snd_cwnd = 1; + tp->snd_ssthresh = 2; + } + } + return 1; + } +} + +/* small packets in the receive queue may be merged to save + space. this appears to happen in real life, but this code + has not been tested. */ +static __inline__ void +TCP_ECN_nonce_collapse (struct sk_buff *skb, struct sk_buff *skb_next) +{ + int newnonce; + if (sysctl_tcp_ecn_nonce_debug >= 2) + printk (KERN_INFO "collapsing %u with %u at %u to %u\n", + INET_ECN_recover_nonce (TCP_SKB_CB (skb)->flags), + INET_ECN_recover_nonce (TCP_SKB_CB (skb_next)->flags), + TCP_SKB_CB (skb)->seq, TCP_SKB_CB (skb_next)->end_seq); + + newnonce = INET_ECN_recover_nonce (TCP_SKB_CB (skb)->flags) + ^ INET_ECN_recover_nonce (TCP_SKB_CB (skb_next)->flags); + + /* clear the stored nonce field */ + TCP_SKB_CB (skb)->flags &= ~0x3; + /* set the stored nonce field to 01 if 1, 10 if 0 */ + TCP_SKB_CB (skb)->flags |= newnonce ? 0x1 : 0x2; + /* skb->ecn_ns_expected = skb_next->ecn_ns_expected; */ + if (sysctl_tcp_ecn_nonce_debug >= 3) + printk (KERN_INFO "... 
to yield %u\n", + INET_ECN_recover_nonce (TCP_SKB_CB (skb)->flags)); } static __inline__ void TCP_ECN_openreq_child(struct tcp_opt *tp, struct open_request *req) { - tp->ecn_flags = req->ecn_ok ? TCP_ECN_OK : 0; + tp->ecn_flags.ok = req->ecn_ok; + if(sysctl_tcp_ecn_nonce >= L_REPLY) { + tp->ecn_flags.nonce_sum = + tp->ecn_flags.expected_nonce_sum = req->ecn_ok; + } } static __inline__ void @@ -133,9 +360,15 @@ req->ecn_ok = 1; } - - +static __inline__ void +TCP_ECN_pmtu_disc_response (struct tcp_opt *tp) +{ + /* pretend we're sending cwr */ + tp->ecn_flags.cwr_ack_pending = 1; + tp->ecn_cwr_seq = tp->high_seq; +} #else +/* ECN is disabled in this kernel */ #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) @@ -156,7 +388,9 @@ #define TCP_ECN_create_request(x...) do { } while (0) #define TCP_ECN_withdraw_cwr(x...) do { } while (0) - +#define TCP_ECN_nonce_collapse(x...) do { } while (0) +#define TCP_ECN_update_nonce(x...) do { } while (0) +#define TCP_ECN_pmtu_disc_response(x...) do { } while(0) #endif #endif diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/include/net/tcp_nonce.h linux/include/net/tcp_nonce.h --- linux-2.4.10/include/net/tcp_nonce.h Wed Dec 31 16:00:00 1969 +++ linux/include/net/tcp_nonce.h Fri Oct 5 00:27:49 2001 @@ -0,0 +1,128 @@ +#ifndef _NET_TCP_NONCE_H_ +#define _NET_TCP_NONCE_H_ 1 + +#include + +/* inline routines and data structures for managing the + * queue of expected ECN-nonce sums. This is a separate + * file for testing and to separate expected-nonce data + * structures. 
Neil Spring nspring@cs.washington.edu */ + +/* defined in tcp.c; handles cleaning up dynamically + * allocated expected nonce sum bindings */ +extern void TCP_ECN_destructor (struct sock *sk); + +/* the following structure could be compacted to two 32-bit + * words by using a 16-bit delta for the sequence number. */ +struct ecn_expected_ns_binding { + struct ecn_expected_ns_binding *next; + unsigned int seq; + __u8 expected_ns:1; +}; + +/* bind a sequence number to the nonce sum expected when + that sequence number is ack'd. */ +static __inline__ void +ecnn_bind (struct sock *sk, struct tcp_opt *tp, unsigned int seq, __u8 expected) +{ + struct ecn_expected_ns_binding *p; + if (tp->ecn_ensq.last) { + /* tail of the queue serves as a free list */ + p = tp->ecn_ensq.last->next; + if (p == NULL) { + p = sock_kmalloc (sk, + sizeof (struct + ecn_expected_ns_binding), + GFP_ATOMIC); + if (p == NULL) + goto alloc_failure; + p->next = NULL; + tp->ecn_ensq.last->next = p; + } + tp->ecn_ensq.last = p; + } else { + /* first binding */ + tp->ecn_ensq.first = tp->ecn_ensq.last = p = + sock_kmalloc (sk, sizeof (struct ecn_expected_ns_binding), + GFP_ATOMIC); + if (p == NULL) + goto alloc_failure; + /* setup destructor for this list */ + if (!tp->prev_destruct) { + tp->prev_destruct = sk->destruct; + sk->destruct = TCP_ECN_destructor; + } + p->next = NULL; + } + p->seq = seq; + p->expected_ns = expected; + return; + + /* exception-like section to reduce (inlined) function length */ + alloc_failure: + printk (KERN_ERR + "out of socket optmem allocating a nonce binding;\n"); + tp->ecn_flags.ok = 0; + return; +} + +/* recover the expected nonce sum for an ack'd sequence number */ +static __inline__ __u8 +ecnn_find (struct tcp_opt *tp, unsigned int seq) +{ + struct ecn_expected_ns_binding *p; + if (tp->ecn_ensq.first != NULL) { + for (p = tp->ecn_ensq.first; + before (p->seq, seq) && + tp->ecn_ensq.first != tp->ecn_ensq.last; + p = tp->ecn_ensq.first) { + /* advance first */ + 
tp->ecn_ensq.first = p->next; + /* insert unused one at tail (a free list) */ + p->next = tp->ecn_ensq.last->next; + tp->ecn_ensq.last->next = p; + } + if (p->seq == seq) { + /* current, keep around for no reason. */ + return (p->expected_ns); + } + } + /* if we get here, it's probably because we haven't + sent any data yet; from the initial handshake. + Otherwise, it's because the receiver decided to + ack a partial segment. */ + return (tp->ecn_flags.expected_nonce_sum); +} + +/* from http://www.ciphersbyritter.com/NEWS4/RANDC.HTM */ +/* written by George Marsaglia, + * as posted to sci.stat.math,sci.math + * cross posted at http://www.tux.org/hypermail/linux-kernel/1999week04/1472.html + * */ +#define ecnn_znew (z=36969*(z&65535)+(z>>16)) +#define ecnn_wnew (w=18000*(w&65535)+(w>>16)) +#define ecnn_MWC ((ecnn_znew<<16)+ecnn_wnew ) +/* end excerpt */ + +static __inline__ __u8 +ecnn_get_random_bit (void) +{ + /* this will ultimately bias in favor of 1's; about 52% of the time. */ + static unsigned int z, w, q; + __u8 retbit; + if (z == 0) { + /* seed the cheap random number generator from the kernel */ + get_random_bytes (&z, sizeof (z)); + get_random_bytes (&w, sizeof (w)); + } + if (q == 0) { + /* grab a cheap 32-bit random number */ + q = ecnn_MWC; + } + /* pull the random bit of the least significant side */ + retbit = q & 0x1; + q >>= 1; + return (retbit); +} + +#endif diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/net/ipv4/ip_fragment.c linux/net/ipv4/ip_fragment.c --- linux-2.4.10/net/ipv4/ip_fragment.c Fri Sep 7 11:01:21 2001 +++ linux/net/ipv4/ip_fragment.c Thu Oct 4 23:59:53 2001 @@ -463,7 +463,7 @@ } else { struct sk_buff *free_it = next; - /* 
Old fragmnet is completely overridden with + /* Old fragment is completely overridden with * new one drop it. */ next = next->next; diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/net/ipv4/proc.c linux/net/ipv4/proc.c --- linux-2.4.10/net/ipv4/proc.c Wed May 16 10:21:45 2001 +++ linux/net/ipv4/proc.c Thu Aug 23 00:06:37 2001 @@ -192,7 +192,8 @@ " TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv" " TCPAbortOnSyn TCPAbortOnData TCPAbortOnClose" " TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger" - " TCPAbortFailed TCPMemoryPressures\n" + " TCPAbortFailed TCPMemoryPressures" + " TCPECNConnections TCPECNNonceFailures\n" "TcpExt:"); for (i=0; iecn_flags&TCP_ECN_OK) + if (tp->ecn_flags.ok) info.tcpi_options |= TCPI_OPT_ECN; #endif @@ -2556,3 +2556,26 @@ printk("TCP: Hash tables configured (established %d bind %d)\n", tcp_ehash_size<<1, tcp_bhash_size); } + + +#ifdef CONFIG_INET_ECN +/* overrides the existing socket destructor to clean up + nonce sum bindings. The walk starts at ecn_ensq.first and + follows ->next, so it also releases the recycled bindings + that ecnn_find() parks after ecn_ensq.last. */ +void TCP_ECN_destructor(struct sock *sk) { + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct ecn_expected_ns_binding *p; + /* unlink each binding before freeing it: p->next must not + be read once sock_kfree_s() has released p */ + while((p=tp->ecn_ensq.first)!=NULL) { + tp->ecn_ensq.first=p->next; + sock_kfree_s(sk,p,sizeof(struct ecn_expected_ns_binding)); + } + /* last pointed into the list that was just freed */ + tp->ecn_ensq.last=NULL; + if(tp->prev_destruct!=NULL) { + (*(tp->prev_destruct))(sk); + } +} +#endif diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file 
linux-2.4.10/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c --- linux-2.4.10/net/ipv4/tcp_input.c Thu Sep 20 14:12:56 2001 +++ linux/net/ipv4/tcp_input.c Thu Oct 4 23:59:53 2001 @@ -80,9 +80,14 @@ int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; #ifdef CONFIG_INET_ECN int sysctl_tcp_ecn = 1; +int sysctl_tcp_ecn_nonce = 4; +int sysctl_tcp_ecn_nonce_debug = 0; #else int sysctl_tcp_ecn = 0; +int sysctl_tcp_ecn_nonce = 0; +int sysctl_tcp_ecn_nonce_debug = 0; #endif + int sysctl_tcp_dsack = 1; int sysctl_tcp_app_win = 31; int sysctl_tcp_adv_win_scale = 2; @@ -1937,8 +1942,7 @@ if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - - if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) + if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th, sk)) flag |= FLAG_ECE; } @@ -2512,6 +2516,7 @@ __skb_unlink(skb, skb->list); __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + TCP_ECN_update_nonce( tp, skb ); if(skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); } @@ -2585,6 +2590,7 @@ __skb_queue_tail(&sk->receive_queue, skb); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + TCP_ECN_update_nonce( tp, skb ); if(skb->len) tcp_event_data_recv(sk, tp, skb); if(th->fin) @@ -3246,7 +3252,7 @@ tp->saw_tstamp = 0; - /* pred_flags is 0xS?10 << 16 + snd_wnd + /* pred_flags is (0xS?10 << 16) + snd_wnd * if header_predition is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' 
will be 0 for the fast path, otherwise pred_flags is 0 to @@ -3324,6 +3330,7 @@ __skb_pull(skb,tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + TCP_ECN_update_nonce(tp,skb); } else { if (tcp_checksum_complete_user(sk, skb)) goto csum_error; @@ -3338,6 +3345,7 @@ __skb_queue_tail(&sk->receive_queue, skb); tcp_set_owner_r(skb, sk); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + TCP_ECN_update_nonce(tp,skb); } tcp_event_data_recv(sk, tp, skb); diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/net/ipv4/tcp_ipv4.c linux/net/ipv4/tcp_ipv4.c --- linux-2.4.10/net/ipv4/tcp_ipv4.c Fri Sep 7 11:01:21 2001 +++ linux/net/ipv4/tcp_ipv4.c Thu Oct 4 23:59:53 2001 @@ -834,7 +834,11 @@ * discovery. */ tcp_simple_retransmit(sk); + + /* account for loss of ECN-nonce synch */ + TCP_ECN_pmtu_disc_response(tp); } /* else let the usual retransmit timer handle it */ + } /* diff --unified --exclude=.config --exclude=.config.old --exclude=CVS --exclude=.cvsignore --recursive --ignore-all-space --ignore-blank-lines -I $Id -I $Revision -I $Source: -I $Date -I $Header -I $Log: -x arch -x debian -x drivers -x version.h -x asm-cris -x cyclades.h -x sysctl_net.c -x n_r3964.h -x ppp-comp.h -x openssl -x conf.vars --new-file linux-2.4.10/net/netsyms.c linux/net/netsyms.c --- linux-2.4.10/net/netsyms.c Tue Sep 18 13:39:51 2001 +++ linux/net/netsyms.c Thu Oct 4 23:59:53 2001 @@ -378,6 +378,8 @@ EXPORT_SYMBOL(sysctl_tcp_rmem); EXPORT_SYMBOL(sysctl_tcp_wmem); EXPORT_SYMBOL(sysctl_tcp_ecn); +EXPORT_SYMBOL(sysctl_tcp_ecn_nonce); +EXPORT_SYMBOL(sysctl_tcp_ecn_nonce_debug); EXPORT_SYMBOL(tcp_cwnd_application_limited); EXPORT_SYMBOL(tcp_sendpage);