patch-2.4.0-test10 linux/net/core/dev.c

Next file: linux/net/core/dev_mcast.c
Previous file: linux/net/core/Makefile
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.4.0-test9/linux/net/core/dev.c linux/net/core/dev.c
@@ -59,6 +59,8 @@
  *	Paul Rusty Russell	:	SIOCSIFNAME
  *              Pekka Riikonen  :	Netdev boot-time settings code
  *              Andrew Morton   :       Make unregister_netdevice wait indefinitely on dev->refcnt
+ * 		J Hadi Salim	:	- Backlog queue sampling
+ *				        - netif_rx() feedback	
  */
 
 #include <asm/uaccess.h>
@@ -85,6 +87,7 @@
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/if_bridge.h>
+#include <linux/divert.h>
 #include <net/dst.h>
 #include <net/pkt_sched.h>
 #include <net/profile.h>
@@ -97,6 +100,18 @@
 extern int plip_init(void);
 #endif
 
+/* This define, if set, will randomly drop a packet when congestion
+ * is more than moderate.  It helps fairness in the multi-interface
+ * case when one of them is a hog, but it kills performance for the
+ * single interface case so it is off now by default.
+ */
+#undef RAND_LIE
+
+/* Setting this will sample the queue lengths and thus congestion
+ * via a timer instead of as each packet is received.
+ */
+#undef OFFLINE_SAMPLE
+
 NET_PROFILE_DEFINE(dev_queue_xmit)
 NET_PROFILE_DEFINE(softnet_process)
 
@@ -133,6 +148,11 @@
 static struct packet_type *ptype_base[16];		/* 16 way hashed list */
 static struct packet_type *ptype_all = NULL;		/* Taps */
 
+#ifdef OFFLINE_SAMPLE
+static void sample_queue(unsigned long dummy);
+static struct timer_list samp_timer = { function: sample_queue };
+#endif
+
 /*
  *	Our notifier list
  */
@@ -933,12 +953,20 @@
   =======================================================================*/
 
 int netdev_max_backlog = 300;
+/* These numbers are selected based on intuition and some
+ * experimentation; if you have a more scientific way of doing this
+ * please go ahead and fix things.
+ */
+int no_cong_thresh = 10;
+int no_cong = 20;
+int lo_cong = 100;
+int mod_cong = 290;
 
 struct netif_rx_stats netdev_rx_stat[NR_CPUS];
 
 
 #ifdef CONFIG_NET_HW_FLOWCONTROL
-static atomic_t netdev_dropping = ATOMIC_INIT(0);
+atomic_t netdev_dropping = ATOMIC_INIT(0);
 static unsigned long netdev_fc_mask = 1;
 unsigned long netdev_fc_xoff = 0;
 spinlock_t netdev_fc_lock = SPIN_LOCK_UNLOCKED;
@@ -996,6 +1024,56 @@
 }
 #endif
 
+static void get_sample_stats(int cpu)
+{	/* Recompute softnet_data[cpu].avg_blog and derive .cng_level from it. */
+#ifdef RAND_LIE
+	unsigned long rd;
+	int rq;
+#endif
+	int blog = softnet_data[cpu].input_pkt_queue.qlen;
+	int avg_blog = softnet_data[cpu].avg_blog;
+	/* New average = half the previous average plus half the current qlen. */
+	avg_blog = (avg_blog >> 1)+ (blog >> 1);
+
+	if (avg_blog > mod_cong) {
+		/* Above moderate congestion levels. */
+		softnet_data[cpu].cng_level = NET_RX_CN_HIGH;
+#ifdef RAND_LIE
+		rd = net_random();
+		rq = rd % netdev_max_backlog;	/* NOTE(review): assumes netdev_max_backlog != 0 -- confirm */
+		if (rq < avg_blog) /* random draw fell below the average: raise level to NET_RX_DROP */
+			softnet_data[cpu].cng_level = NET_RX_DROP;
+#endif
+	} else if (avg_blog > lo_cong) {
+		softnet_data[cpu].cng_level = NET_RX_CN_MOD;
+#ifdef RAND_LIE
+		rd = net_random();
+		rq = rd % netdev_max_backlog;
+			if (rq < avg_blog) /* random draw fell below the average: raise level to NET_RX_CN_HIGH */
+				softnet_data[cpu].cng_level = NET_RX_CN_HIGH;
+#endif
+	} else if (avg_blog > no_cong) 
+		softnet_data[cpu].cng_level = NET_RX_CN_LOW;
+	else  /* no congestion */
+		softnet_data[cpu].cng_level = NET_RX_SUCCESS;
+	/* Store the new average so the next sample builds on it. */
+	softnet_data[cpu].avg_blog = avg_blog;
+}
+
+#ifdef OFFLINE_SAMPLE
+/* Timer handler: resample the backlog of the CPU it runs on, then re-arm
+ * itself one jiffy ahead (10 ms or 1 ms, the exact period is not critical).
+ */
+static void sample_queue(unsigned long dummy)
+{
+	get_sample_stats(smp_processor_id());
+	/* Compute the expiry as unsigned long; an int temporary would
+	 * truncate jiffies on 64-bit platforms. */
+	mod_timer(&samp_timer, jiffies + 1);
+}
+#endif
+
+
 /**
  *	netif_rx	-	post buffer to the network code
  *	@skb: buffer to post
@@ -1004,9 +1082,18 @@
  *	the upper (protocol) levels to process.  It always succeeds. The buffer
  *	may be dropped during processing for congestion control or by the 
  *	protocol layers.
+ *      
+ *	return values:
+ *	NET_RX_SUCCESS	(no congestion)           
+ *	NET_RX_CN_LOW     (low congestion) 
+ *	NET_RX_CN_MOD     (moderate congestion)
+ *	NET_RX_CN_HIGH    (high congestion) 
+ *	NET_RX_DROP    (packet was dropped)
+ *      
+ *      
  */
 
-void netif_rx(struct sk_buff *skb)
+int netif_rx(struct sk_buff *skb)
 {
 	int this_cpu = smp_processor_id();
 	struct softnet_data *queue;
@@ -1036,7 +1123,10 @@
 			__skb_queue_tail(&queue->input_pkt_queue,skb);
 			__cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
 			local_irq_restore(flags);
-			return;
+#ifndef OFFLINE_SAMPLE
+			get_sample_stats(this_cpu);
+#endif
+			return softnet_data[this_cpu].cng_level;
 		}
 
 		if (queue->throttle) {
@@ -1062,19 +1152,22 @@
 	local_irq_restore(flags);
 
 	kfree_skb(skb);
+	return NET_RX_DROP;
 }
 
 /* Deliver skb to an old protocol, which is not threaded well
    or which do not understand shared skbs.
  */
-static void deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int last)
+static int deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int last)
 {
 	static spinlock_t net_bh_lock = SPIN_LOCK_UNLOCKED;
+	int ret = NET_RX_DROP;
+
 
 	if (!last) {
 		skb = skb_clone(skb, GFP_ATOMIC);
 		if (skb == NULL)
-			return;
+			return ret;
 	}
 
 	/* The assumption (correct one) is that old protocols
@@ -1087,10 +1180,11 @@
 	/* Disable timers and wait for all timers completion */
 	tasklet_disable(bh_task_vec+TIMER_BH);
 
-	pt->func(skb, skb->dev, pt);
+	ret = pt->func(skb, skb->dev, pt);
 
 	tasklet_enable(bh_task_vec+TIMER_BH);
 	spin_unlock(&net_bh_lock);
+	return ret;
 }
 
 /* Reparent skb to master device. This function is called
@@ -1173,20 +1267,33 @@
 void (*br_handle_frame_hook)(struct sk_buff *skb) = NULL;
 #endif
 
-static void __inline__ handle_bridge(struct sk_buff *skb,
+static int __inline__ handle_bridge(struct sk_buff *skb,
 				     struct packet_type *pt_prev)
 {
+	int ret = NET_RX_DROP;	/* stays NET_RX_DROP when pt_prev is NULL */
+	/* Deliver to the pending handler (pt_prev) first, then pass the frame to the bridge hook. */
 	if (pt_prev) {
 		if (!pt_prev->data)
-			deliver_to_old_ones(pt_prev, skb, 0);
+			ret = deliver_to_old_ones(pt_prev, skb, 0);	/* last == 0: the handler gets a clone */
 		else {
 			atomic_inc(&skb->users);
-			pt_prev->func(skb, skb->dev, pt_prev);
+			ret = pt_prev->func(skb, skb->dev, pt_prev);	/* runs with the extra reference taken above */
 		}
 	}
 
 	br_handle_frame_hook(skb);
+	return ret;	/* pt_prev's result; the bridge hook itself returns void */
+}
+
+
+#ifdef CONFIG_NET_DIVERT
+static inline void handle_diverter(struct sk_buff *skb)
+{
+	/* Call divert_frame() only if skb->dev has a non-NULL divert block whose divert field is non-zero. */
+	if (skb->dev->divert && skb->dev->divert->divert)
+		divert_frame(skb);
 }
+#endif   /* CONFIG_NET_DIVERT */
 
 
 static void net_rx_action(struct softirq_action *h)
@@ -1239,6 +1346,12 @@
 				}
 			}
 
+#ifdef CONFIG_NET_DIVERT
+			if (skb->dev->divert && skb->dev->divert->divert)
+				handle_diverter(skb);
+#endif /* CONFIG_NET_DIVERT */
+
+			
 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
 			if (skb->dev->br_port != NULL &&
 			    br_handle_frame_hook != NULL) {
@@ -1275,6 +1388,17 @@
 
 		if (bugdet-- < 0 || jiffies - start_time > 1)
 			goto softnet_break;
+
+#ifdef CONFIG_NET_HW_FLOWCONTROL
+	if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) {
+		if (atomic_dec_and_test(&netdev_dropping)) {
+			queue->throttle = 0;
+			netdev_wakeup();
+			goto softnet_break;
+		}
+	}
+#endif
+
 	}
 	br_read_unlock(BR_NETPROTO_LOCK);
 
@@ -2113,9 +2237,9 @@
 /**
  *	dev_new_index	-	allocate an ifindex
  *
- *	Returns a suitable unique value for a new device interface number.
- *	The caller must hold the rtnl semaphore to be sure it remains 
- *	unique.
+ *	Returns a suitable unique value for a new device interface
+ *	number.  The caller must hold the rtnl semaphore or the
+ *	dev_base_lock to be sure it remains unique.
  */
  
 int dev_new_index(void)
@@ -2140,6 +2264,10 @@
  *	chain. 0 is returned on success. A negative errno code is returned
  *	on a failure to set up the device, or if the name is a duplicate.
  *
+ *	Callers must hold the rtnl semaphore.  See the comment at the
+ *	end of Space.c for details about the locking.  You may want
+ *	register_netdev() instead of this.
+ *
  *	BUGS:
  *	The locking appears insufficient to guarantee two parallel registers
  *	will not get the same name.
@@ -2148,6 +2276,9 @@
 int register_netdevice(struct net_device *dev)
 {
 	struct net_device *d, **dp;
+#ifdef CONFIG_NET_DIVERT
+	int ret;
+#endif
 
 	spin_lock_init(&dev->queue_lock);
 	spin_lock_init(&dev->xmit_lock);
@@ -2182,6 +2313,12 @@
 		dev_hold(dev);
 		write_unlock_bh(&dev_base_lock);
 
+#ifdef CONFIG_NET_DIVERT
+		ret = alloc_divert_blk(dev);
+		if (ret)
+			return ret;
+#endif /* CONFIG_NET_DIVERT */
+		
 		/*
 		 *	Default initial state at registry is that the
 		 *	device is present.
@@ -2231,6 +2368,12 @@
 	dev->deadbeaf = 0;
 	write_unlock_bh(&dev_base_lock);
 
+#ifdef CONFIG_NET_DIVERT
+	ret = alloc_divert_blk(dev);
+	if (ret)
+		return ret;
+#endif /* CONFIG_NET_DIVERT */
+	
 	/* Notify protocols, that a new device appeared. */
 	notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
 
@@ -2272,6 +2415,10 @@
  *	This function shuts down a device interface and removes it
  *	from the kernel tables. On success 0 is returned, on a failure
  *	a negative errno code is returned.
+ *
+ *	Callers must hold the rtnl semaphore.  See the comment at the
+ *	end of Space.c for details about the locking.  You may want
+ *	unregister_netdev() instead of this.
  */
 
 int unregister_netdevice(struct net_device *dev)
@@ -2325,6 +2472,10 @@
 	/* Notifier chain MUST detach us from master device. */
 	BUG_TRAP(dev->master==NULL);
 
+#ifdef CONFIG_NET_DIVERT
+	free_divert_blk(dev);
+#endif
+
 	if (dev->new_style) {
 #ifdef NET_REFCNT_DEBUG
 		if (atomic_read(&dev->refcnt) != 1)
@@ -2397,7 +2548,15 @@
 
 extern void net_device_init(void);
 extern void ip_auto_config(void);
+#ifdef CONFIG_NET_DIVERT
+extern void dv_init(void);
+#endif /* CONFIG_NET_DIVERT */
 
+
+/*
+ *       Callers must hold the rtnl semaphore.  See the comment at the
+ *       end of Space.c for details about the locking.
+ */
 int __init net_dev_init(void)
 {
 	struct net_device *dev, **dp;
@@ -2407,6 +2566,10 @@
 	pktsched_init();
 #endif
 
+#ifdef CONFIG_NET_DIVERT
+	dv_init();
+#endif /* CONFIG_NET_DIVERT */
+	
 	/*
 	 *	Initialise the packet receive queues.
 	 */
@@ -2417,6 +2580,8 @@
 		queue = &softnet_data[i];
 		skb_queue_head_init(&queue->input_pkt_queue);
 		queue->throttle = 0;
+		queue->cng_level = 0;
+		queue->avg_blog = 10; /* arbitrary non-zero */
 		queue->completion_queue = NULL;
 	}
 	
@@ -2425,6 +2590,12 @@
 	NET_PROFILE_REGISTER(dev_queue_xmit);
 	NET_PROFILE_REGISTER(softnet_process);
 #endif
+
+#ifdef OFFLINE_SAMPLE
+	samp_timer.expires = jiffies + (10 * HZ);
+	add_timer(&samp_timer);
+#endif
+
 	/*
 	 *	Add the devices.
 	 *	If the call to dev->init fails, the dev is removed

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)