patch-2.4.0-prerelease linux/net/ipv4/tcp_input.c
- Lines: 227
- Date: Fri Dec 29 14:07:24 2000
- Orig file: v2.4.0-test12/linux/net/ipv4/tcp_input.c
- Orig date: Mon Dec 11 17:59:45 2000
diff -u --recursive --new-file v2.4.0-test12/linux/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.203 2000/11/28 17:04:09 davem Exp $
+ * Version: $Id: tcp_input.c,v 1.205 2000/12/13 18:31:48 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -108,6 +108,7 @@
#define IsReno(tp) ((tp)->sack_ok == 0)
#define IsFack(tp) ((tp)->sack_ok & 2)
+#define IsDSack(tp) ((tp)->sack_ok & 4)
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
@@ -438,14 +439,40 @@
if (tp->srtt != 0) {
m -= (tp->srtt >> 3); /* m is now error in rtt est */
tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
- if (m < 0)
+ if (m < 0) {
m = -m; /* m is now abs(error) */
- m -= (tp->mdev >> 2); /* similar update on mdev */
+ m -= (tp->mdev >> 2); /* similar update on mdev */
+ /* This is similar to one of the Eifel findings.
+ * Eifel blocks mdev updates when rtt decreases.
+ * This solution is a bit different: we use a finer gain
+ * for mdev in this case (alpha*beta).
+ * Like Eifel it also prevents growth of rto,
+ * but it additionally limits the overly fast rto decreases
+ * that happen in pure Eifel.
+ */
+ if (m > 0)
+ m >>= 3;
+ } else {
+ m -= (tp->mdev >> 2); /* similar update on mdev */
+ }
tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
+ if (tp->mdev > tp->mdev_max) {
+ tp->mdev_max = tp->mdev;
+ if (tp->mdev_max > tp->rttvar)
+ tp->rttvar = tp->mdev_max;
+ }
+ if (after(tp->snd_una, tp->rtt_seq)) {
+ if (tp->mdev_max < tp->rttvar)
+ tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2;
+ tp->rtt_seq = tp->snd_una;
+ tp->mdev_max = TCP_RTO_MIN;
+ }
} else {
/* no previous measure. */
tp->srtt = m<<3; /* take the measured time to be rtt */
tp->mdev = m<<2; /* make sure rto = 3*rtt */
+ tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
+ tp->rtt_seq = tp->snd_nxt;
}
}
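Pulled out of the kernel, the estimator added above reads roughly as follows. This is a minimal user-space sketch, assuming HZ=100 (the usual 2.4-era x86 value); plain globals stand in for the tcp_opt fields, all other names are illustrative, and the once-per-RTT rttvar decay is only noted in a comment because it needs the snd_una/rtt_seq state. As in the kernel, srtt holds 8*rtt, while mdev, mdev_max and rttvar hold 4*deviation, so that (srtt>>3)+rttvar later yields SRTT + 4*RTTVAR.

#include <stdint.h>

#define HZ      100             /* assumed; typical 2.4-era x86 value */
#define RTO_MIN (HZ/5)          /* stand-in for TCP_RTO_MIN: 4 * 50msec */

static uint32_t srtt, mdev, mdev_max = RTO_MIN, rttvar = RTO_MIN;

/* Feed one RTT measurement m (in jiffies) into the estimator. */
static void rtt_sample(long m)
{
        if (srtt != 0) {
                m -= (srtt >> 3);       /* m is now the error in the rtt estimate */
                srtt += m;              /* srtt = 7/8 srtt + 1/8 new */
                if (m < 0) {
                        m = -m;
                        m -= (mdev >> 2);
                        if (m > 0)      /* finer gain (alpha*beta) when rtt fell */
                                m >>= 3;
                } else {
                        m -= (mdev >> 2);
                }
                mdev += m;              /* mdev = 3/4 mdev + 1/4 new */
                if (mdev > mdev_max) {
                        mdev_max = mdev;        /* rttvar follows peaks upward at once */
                        if (mdev_max > rttvar)
                                rttvar = mdev_max;
                }
                /* Once per round trip (snd_una passing rtt_seq in the kernel),
                 * rttvar decays by 1/4 of its excess over mdev_max, and
                 * mdev_max restarts from the RTO_MIN floor. */
        } else {
                srtt = m << 3;          /* first measurement */
                mdev = m << 2;
                mdev_max = rttvar = (mdev > RTO_MIN ? mdev : RTO_MIN);
        }
}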
@@ -454,45 +481,34 @@
*/
static __inline__ void tcp_set_rto(struct tcp_opt *tp)
{
- tp->rto = (tp->srtt >> 3) + tp->mdev;
- /* I am not enough educated to understand this magic.
- * However, it smells bad. snd_cwnd>31 is common case.
+ /* The old crap is replaced with a new one. 8)
+ *
+ * More seriously:
+ * 1. If the rtt variance happens to be less than 50msec, it is a hallucination.
+ * It cannot be less, given the utterly erratic ACK generation of
+ * at least Solaris and FreeBSD. These "erratic ACKs" have _nothing_
+ * to do with delayed acks, because at cwnd>2 the true delack timeout
+ * is invisible. Actually, Linux-2.4 also generates erratic
+ * ACKs in some circumstances.
*/
- /* OK, I found comment in 2.0 source tree, it deserves
- * to be reproduced:
- * ====
- * Note: Jacobson's algorithm is fine on BSD which has a 1/2 second
- * granularity clock, but with our 1/100 second granularity clock we
- * become too sensitive to minor changes in the round trip time.
- * We add in two compensating factors. First we multiply by 5/4.
- * For large congestion windows this allows us to tolerate burst
- * traffic delaying up to 1/4 of our packets. We also add in
- * a rtt / cong_window term. For small congestion windows this allows
- * a single packet delay, but has negligible effect
- * on the compensation for large windows.
+ tp->rto = (tp->srtt >> 3) + tp->rttvar;
+
+ /* 2. The fixups made earlier cannot be right.
+ * If we do not estimate RTO correctly without them,
+ * the whole algo is pure shit and should be replaced
+ * with a correct one. That is exactly what we attempt to do.
*/
- tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
}
-/* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
- * on packet lifetime in the internet. We need the HZ/5 lower
- * bound to behave correctly against BSD stacks with a fixed
- * delayed ack.
- * FIXME: It's not entirely clear this lower bound is the best
- * way to avoid the problem. Is it possible to drop the lower
- * bound and still avoid trouble with BSD stacks? Perhaps
- * some modification to the RTO calculation that takes delayed
- * ack bias into account? This needs serious thought. -- erics
+/* NOTE: clamping at TCP_RTO_MIN is not required; the current algo
+ * guarantees that rto is higher.
*/
static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
{
- if (tp->rto < TCP_RTO_MIN)
- tp->rto = TCP_RTO_MIN;
- else if (tp->rto > TCP_RTO_MAX)
+ if (tp->rto > TCP_RTO_MAX)
tp->rto = TCP_RTO_MAX;
}
-
/* Save metrics learned by this TCP session.
This function is called only when TCP finishes successfully,
i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
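Taken together, tcp_set_rto() and tcp_bound_rto() above now compute the plain RFC 2988 value SRTT + 4*RTTVAR, with only the upper clamp left. A sketch under the same assumptions as before (HZ=100, function name illustrative):

#include <stdint.h>

#define HZ          100         /* assumed */
#define TCP_RTO_MAX (120*HZ)    /* upper bound on segment lifetime */

/* srtt is 8*rtt, rttvar is 4*deviation, as in the estimator above. */
static uint32_t compute_rto(uint32_t srtt, uint32_t rttvar)
{
        uint32_t rto = (srtt >> 3) + rttvar;    /* SRTT + 4*RTTVAR */
        if (rto > TCP_RTO_MAX)
                rto = TCP_RTO_MAX;
        /* No lower clamp needed: rttvar never falls below TCP_RTO_MIN,
         * because mdev_max restarts from that floor every round trip. */
        return rto;
}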
@@ -649,8 +665,10 @@
*/
if (dst->rtt > tp->srtt)
tp->srtt = dst->rtt;
- if (dst->rttvar > tp->mdev)
+ if (dst->rttvar > tp->mdev) {
tp->mdev = dst->rttvar;
+ tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
+ }
tcp_set_rto(tp);
tcp_bound_rto(tp);
if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp)
@@ -666,7 +684,7 @@
*/
if (!tp->saw_tstamp && tp->srtt) {
tp->srtt = 0;
- tp->mdev = TCP_TIMEOUT_INIT;
+ tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
tp->rto = TCP_TIMEOUT_INIT;
}
}
@@ -774,11 +792,13 @@
if (before(start_seq, ack)) {
dup_sack = 1;
+ tp->sack_ok |= 4;
NET_INC_STATS_BH(TCPDSACKRecv);
} else if (num_sacks > 1 &&
!after(end_seq, ntohl(sp[1].end_seq)) &&
!before(start_seq, ntohl(sp[1].start_seq))) {
dup_sack = 1;
+ tp->sack_ok |= 4;
NET_INC_STATS_BH(TCPDSACKOfoRecv);
}
@@ -1286,8 +1306,10 @@
{
if (tp->prior_ssthresh) {
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
- if (undo && tp->prior_ssthresh > tp->snd_ssthresh)
+ if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
+ TCP_ECN_withdraw_cwr(tp);
+ }
} else {
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
}
@@ -1615,13 +1637,16 @@
* I.e. Karn's algorithm. (SIGCOMM '87, p5.)
*/
+ if (flag & FLAG_RETRANS_DATA_ACKED)
+ return;
+
tcp_rtt_estimator(tp, seq_rtt);
tcp_set_rto(tp);
if (tp->backoff) {
/* To relax it? We have a valid sample as soon as we are
* here. Why not clear backoff?
*/
- if (!tp->retransmits || !(flag & FLAG_RETRANS_DATA_ACKED))
+ if (!tp->retransmits)
tp->backoff = 0;
else
tp->rto <<= tp->backoff;
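The effect is to make Karn's rule absolute: an ACK covering any retransmitted data contributes no RTT sample and leaves the backoff untouched, and backoff is cleared only by a clean sample taken while no retransmitted data remains outstanding. In outline (a sketch only; the flag value is assumed and the struct is illustrative):

#include <stdint.h>

#define FLAG_RETRANS_DATA_ACKED 0x08    /* assumed value; see tcp_input.c */

struct rtt_state { uint32_t rto; unsigned int backoff, retransmits; };

/* Outline of the post-patch handling of an RTT sample from an ACK. */
static void ack_update_rtt(struct rtt_state *tp, int flag, long seq_rtt)
{
        if (flag & FLAG_RETRANS_DATA_ACKED)
                return;         /* Karn: the sample is ambiguous, use none of it */

        /* ...feed seq_rtt into the estimator and recompute rto here... */

        if (tp->backoff) {
                if (!tp->retransmits)
                        tp->backoff = 0;        /* clean sample, no retransmits left */
                else
                        tp->rto <<= tp->backoff;
        }
}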
@@ -1661,16 +1686,25 @@
}
}
+/* Restart the timer after forward progress on the connection.
+ * RFC2988 recommends (and BSD does) restarting the timer to now+rto,
+ * which is certainly wrong and effectively means that
+ * rto includes one more _full_ rtt.
+ *
+ * For details see:
+ * ftp://ftp.inr.ac.ru:/ip-routing/README.rto
+ */
+
static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
{
if (tp->packets_out==0) {
tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
} else {
struct sk_buff *skb = skb_peek(&sk->write_queue);
- __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
+ __u32 when = tp->rto + tp->rttvar - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
- if ((__s32)when <= 0)
- when = TCP_RTO_MIN;
+ if ((__s32)when < (__s32)tp->rttvar)
+ when = tp->rttvar;
tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, when);
}
}
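So after every ACK that leaves data outstanding, the timer is re-anchored to the send time of the oldest unacknowledged segment rather than to now, with rttvar added as slack and used as the floor. A standalone sketch (all values in jiffies; names illustrative; the signed comparison tolerates timestamp wraparound):

#include <stdint.h>

/* Delay, measured from now, until the retransmit timer should fire. */
static uint32_t rearm_delay(uint32_t now, uint32_t oldest_sent_at,
                            uint32_t rto, uint32_t rttvar)
{
        uint32_t when = rto + rttvar - (now - oldest_sent_at);
        if ((int32_t)when < (int32_t)rttvar)
                when = rttvar;  /* never arm the timer shorter than rttvar */
        return when;
}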
@@ -1841,7 +1875,7 @@
#ifdef TCP_DEBUG
if (before(tp->snd_una + tp->snd_wnd, tp->snd_nxt)) {
- if ((tp->snd_una + tp->snd_wnd)-tp->snd_nxt >= (1<<tp->snd_wscale)
+ if (tp->snd_nxt-(tp->snd_una + tp->snd_wnd) >= (1<<tp->snd_wscale)
&& net_ratelimit())
printk(KERN_DEBUG "TCP: peer %u.%u.%u.%u:%u/%u shrinks window %u:%u:%u. Bad, what else can I say?\n",
NIPQUAD(sk->daddr), htons(sk->dport), sk->num,
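The small-looking fix above matters because the arithmetic is unsigned: inside this branch snd_nxt is already past the right edge of the window, so the old difference underflowed to a value near 2^32 and the message fired on every shrink, however tiny. The new test warns only when the shrink exceeds one window-scale unit and thus cannot be rounding of the scaled window field. A toy demonstration with hypothetical numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t edge = 1000;           /* snd_una + snd_wnd after the shrink */
        uint32_t snd_nxt = 1001;        /* one byte beyond the new right edge */
        unsigned int wscale = 7;        /* peer's window scale */

        /* old test: edge - snd_nxt underflows to ~2^32, so it always fired */
        printf("old: %d\n", (edge - snd_nxt) >= (1u << wscale));
        /* new test: only a shrink larger than one scale unit is reported */
        printf("new: %d\n", (snd_nxt - edge) >= (1u << wscale));
        return 0;
}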