patch-2.4.0-test2 linux/drivers/block/ll_rw_blk.c


diff -u --recursive --new-file v2.4.0-test1/linux/drivers/block/ll_rw_blk.c linux/drivers/block/ll_rw_blk.c
@@ -4,6 +4,7 @@
  * Copyright (C) 1991, 1992 Linus Torvalds
  * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
  * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
+ * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
  */
 
 /*
@@ -37,10 +38,9 @@
 #endif
 
 /*
- * The request-struct contains all necessary data
- * to load a nr of sectors into memory
+ * For the allocated request tables
  */
-static struct request all_requests[NR_REQUEST];
+static kmem_cache_t *request_cachep;
 
 /*
  * The "disk" task queue is used to start the actual requests
@@ -62,11 +62,6 @@
  */
 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
 
-/*
- * used to wait on when there are no free requests
- */
-DECLARE_WAIT_QUEUE_HEAD(wait_for_request);
-
 /* This specifies how many sectors to read ahead on the disk. */
 
 int read_ahead[MAX_BLKDEV];
@@ -127,29 +122,61 @@
 	return max_sectors[MAJOR(dev)][MINOR(dev)];
 }
 
+static inline request_queue_t *__blk_get_queue(kdev_t dev)
+{
+	struct blk_dev_struct *bdev = blk_dev + MAJOR(dev);
+
+	if (bdev->queue)
+		return bdev->queue(dev);
+	else
+		return &blk_dev[MAJOR(dev)].request_queue;
+}
+
 /*
  * NOTE: the device-specific queue() functions
  * have to be atomic!
  */
-request_queue_t * blk_get_queue (kdev_t dev)
+request_queue_t *blk_get_queue(kdev_t dev)
 {
-	int major = MAJOR(dev);
-	struct blk_dev_struct *bdev = blk_dev + major;
-	unsigned long flags;
 	request_queue_t *ret;
+	unsigned long flags;
 
 	spin_lock_irqsave(&io_request_lock,flags);
-	if (bdev->queue)
-		ret = bdev->queue(dev);
-	else
-		ret = &blk_dev[major].request_queue;
+	ret = __blk_get_queue(dev);
 	spin_unlock_irqrestore(&io_request_lock,flags);
 
 	return ret;
 }
 
+/*
+ * Hopefully the low-level driver has finished any outstanding requests
+ * first...
+ */
 void blk_cleanup_queue(request_queue_t * q)
 {
+	struct list_head *entry;
+	struct request *rq;
+	int i = QUEUE_NR_REQUESTS;
+
+	if (list_empty(&q->request_freelist))
+		return;
+
+	if (q->queue_requests)
+		BUG();
+
+	entry = &q->request_freelist;
+	entry = entry->next;
+	do {
+		rq = list_entry(entry, struct request, table);
+		entry = entry->next;
+		list_del(&rq->table);
+		kmem_cache_free(request_cachep, rq);
+		i--;
+	} while (!list_empty(&q->request_freelist));
+
+	if (i)
+		printk("blk_cleanup_queue: leaked requests (%d)\n", i);
+
 	memset(q, 0, sizeof(*q));
 }
 
@@ -222,7 +249,7 @@
  * This is called with interrupts off and no requests on the queue.
  * (and with the request spinlock aquired)
  */
-static void generic_plug_device (request_queue_t *q, kdev_t dev)
+static void generic_plug_device(request_queue_t *q, kdev_t dev)
 {
 #ifdef CONFIG_BLK_DEV_MD
 	if (MAJOR(dev) == MD_MAJOR) {
@@ -230,25 +257,51 @@
 		BUG();
 	}
 #endif
-	if (!list_empty(&q->queue_head))
+	/*
+	 * no need to replug device
+	 */
+	if (!list_empty(&q->queue_head) || q->plugged)
 		return;
 
 	q->plugged = 1;
 	queue_task(&q->plug_tq, &tq_disk);
 }
 
+static void blk_init_free_list(request_queue_t *q)
+{
+	struct request *rq;
+	int i;
+
+	/*
+	 * Divide requests in half between read and write. This used to
+	 * be a 2/3 advantage for reads, but now reads can steal from
+	 * the write free list.
+	 */
+	for (i = 0; i < QUEUE_NR_REQUESTS; i++) {
+		rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL);
+		rq->rq_status = RQ_INACTIVE;
+		list_add(&rq->table, &q->request_freelist);
+	}
+
+	q->queue_requests = 0;
+	init_waitqueue_head(&q->wait_for_request);
+	spin_lock_init(&q->request_lock);
+}
+
 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
 {
 	INIT_LIST_HEAD(&q->queue_head);
-	elevator_init(&q->elevator);
+	INIT_LIST_HEAD(&q->request_freelist);
+	elevator_init(&q->elevator, ELEVATOR_LINUS);
+	blk_init_free_list(q);
 	q->request_fn     	= rfn;
 	q->back_merge_fn       	= ll_back_merge_fn;
 	q->front_merge_fn      	= ll_front_merge_fn;
 	q->merge_requests_fn	= ll_merge_requests_fn;
 	q->make_request_fn	= NULL;
-	q->plug_tq.sync   	= 0;
+	q->plug_tq.sync		= 0;
 	q->plug_tq.routine	= &generic_unplug_device;
-	q->plug_tq.data   	= q;
+	q->plug_tq.data		= q;
 	q->plugged        	= 0;
 	/*
 	 * These booleans describe the queue properties.  We set the
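
With this change blk_dev_init() (see the final hunks below) no longer pre-initializes a queue for every major, and blk_init_queue() now also builds the per-queue request freelist from request_cachep and selects the default ELEVATOR_LINUS elevator. As a point of reference, here is a minimal sketch of how a driver would set up and tear down its default queue under the new scheme; the driver name, major number and request handler are hypothetical, and BLK_DEFAULT_QUEUE() is assumed to be the usual <linux/blkdev.h> shorthand for &blk_dev[major].request_queue:

#include <linux/init.h>
#include <linux/blkdev.h>

#define MYDRV_MAJOR	240	/* hypothetical major number */

static void mydrv_request_fn(request_queue_t *q)
{
	/* pull requests off q->queue_head and service them */
}

static int __init mydrv_init(void)
{
	/* also allocates the per-queue freelist and the default elevator */
	blk_init_queue(BLK_DEFAULT_QUEUE(MYDRV_MAJOR), mydrv_request_fn);
	return 0;
}

static void __exit mydrv_exit(void)
{
	/* only legal once every request has completed; the freelist is
	 * handed back to request_cachep and any leak is reported */
	blk_cleanup_queue(BLK_DEFAULT_QUEUE(MYDRV_MAJOR));
}

module_init(mydrv_init);
module_exit(mydrv_exit);
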
@@ -263,89 +316,88 @@
 /*
  * remove the plug and let it rip..
  */
-void generic_unplug_device(void * data)
+static inline void __generic_unplug_device(request_queue_t *q)
 {
-	request_queue_t * q = (request_queue_t *) data;
-	unsigned long flags;
-
-	spin_lock_irqsave(&io_request_lock,flags);
 	if (q->plugged) {
 		q->plugged = 0;
 		if (!list_empty(&q->queue_head))
-			(q->request_fn)(q);
+			q->request_fn(q);
 	}
-	spin_unlock_irqrestore(&io_request_lock,flags);
 }
 
+void generic_unplug_device(void *data)
+{
+	request_queue_t *q = (request_queue_t *) data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&io_request_lock, flags);
+	__generic_unplug_device(q);
+	spin_unlock_irqrestore(&io_request_lock, flags);
+}
+
+#define blkdev_free_rq(list) list_entry((list)->next, struct request, table);
 /*
- * look for a free request in the first N entries.
- * NOTE: interrupts must be disabled on the way in (on SMP the request queue
- * spinlock has to be aquired), and will still be disabled on the way out.
+ * Get a free request. io_request_lock must be held and interrupts
+ * disabled on the way in.
  */
-static inline struct request * get_request(int n, kdev_t dev)
+static inline struct request *get_request(request_queue_t *q, int rw)
 {
-	static struct request *prev_found = NULL, *prev_limit = NULL;
-	register struct request *req, *limit;
+	register struct request *rq = NULL;
 
-	if (n <= 0)
-		panic("get_request(%d): impossible!\n", n);
+	if (!list_empty(&q->request_freelist)) {
+		elevator_t *e = &q->elevator;
 
-	limit = all_requests + n;
-	if (limit != prev_limit) {
-		prev_limit = limit;
-		prev_found = all_requests;
-	}
-	req = prev_found;
-	for (;;) {
-		req = ((req > all_requests) ? req : limit) - 1;
-		if (req->rq_status == RQ_INACTIVE)
-			break;
-		if (req == prev_found)
+		if ((q->queue_requests > QUEUE_WRITES_MAX) && (rw == WRITE))
 			return NULL;
+
+		rq = blkdev_free_rq(&q->request_freelist);
+		list_del(&rq->table);
+		rq->rq_status = RQ_ACTIVE;
+		rq->special = NULL;
+		rq->q = q;
+		if (rq->cmd == READ)
+			rq->elevator_sequence = e->read_latency;
+		else
+			rq->elevator_sequence = e->write_latency;
+		q->queue_requests++;
 	}
-	prev_found = req;
-	req->rq_status = RQ_ACTIVE;
-	req->rq_dev = dev;
-	req->special = NULL;
-	return req;
+	return rq;
 }
 
 /*
- * wait until a free request in the first N entries is available.
+ * No requests available for this queue: unplug the device and wait for one.
  */
-static struct request * __get_request_wait(int n, kdev_t dev)
+static struct request *__get_request_wait(request_queue_t *q, int rw)
 {
-	register struct request *req;
+	register struct request *rq;
 	DECLARE_WAITQUEUE(wait, current);
-	unsigned long flags;
 
-	add_wait_queue_exclusive(&wait_for_request, &wait);
+	add_wait_queue_exclusive(&q->wait_for_request, &wait);
 	for (;;) {
-		__set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE);
-		spin_lock_irqsave(&io_request_lock,flags);
-		req = get_request(n, dev);
-		spin_unlock_irqrestore(&io_request_lock,flags);
-		if (req)
+		__set_current_state(TASK_UNINTERRUPTIBLE | TASK_EXCLUSIVE);
+		spin_lock_irq(&io_request_lock);
+		rq = get_request(q, rw);
+		spin_unlock_irq(&io_request_lock);
+		if (rq)
 			break;
-		run_task_queue(&tq_disk);
+		generic_unplug_device(q);
 		schedule();
 	}
-	remove_wait_queue(&wait_for_request, &wait);
+	remove_wait_queue(&q->wait_for_request, &wait);
 	current->state = TASK_RUNNING;
-	return req;
+	return rq;
 }
 
-static inline struct request * get_request_wait(int n, kdev_t dev)
+static inline struct request *get_request_wait(request_queue_t *q, int rw)
 {
-	register struct request *req;
-	unsigned long flags;
+	register struct request *rq;
 
-	spin_lock_irqsave(&io_request_lock,flags);
-	req = get_request(n, dev);
-	spin_unlock_irqrestore(&io_request_lock,flags);
-	if (req)
-		return req;
-	return __get_request_wait(n, dev);
+	spin_lock_irq(&io_request_lock);
+	rq = get_request(q, rw);
+	spin_unlock_irq(&io_request_lock);
+	if (rq)
+		return rq;
+	return __get_request_wait(q, rw);
 }
 
 /* RO fail safe mechanism */
@@ -405,35 +457,45 @@
  */
 
 static inline void add_request(request_queue_t * q, struct request * req,
-			       struct list_head * head, int latency)
+			       struct list_head *head, int lat)
 {
 	int major;
 
 	drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);
-
-	if (list_empty(head)) {
-		req->elevator_sequence = elevator_sequence(&q->elevator, latency);
-		list_add(&req->queue, &q->queue_head);
-		return;
-	}
-	q->elevator.elevator_fn(req, &q->elevator, &q->queue_head, head, latency);
-
+	elevator_account_request(req);
 	/*
+	 * let selected elevator insert the request
+	 */
+	q->elevator.elevator_fn(req, &q->elevator, &q->queue_head, head, lat);
+
+        /*
 	 * FIXME(eric) I don't understand why there is a need for this
 	 * special case code.  It clearly doesn't fit any more with
 	 * the new queueing architecture, and it got added in 2.3.10.
 	 * I am leaving this in here until I hear back from the COMPAQ
 	 * people.
-	 */
+         */
 	major = MAJOR(req->rq_dev);
 	if (major >= COMPAQ_SMART2_MAJOR+0 && major <= COMPAQ_SMART2_MAJOR+7)
-	{
 		(q->request_fn)(q);
-	}
-
 	if (major >= DAC960_MAJOR+0 && major <= DAC960_MAJOR+7)
-	{
 		(q->request_fn)(q);
+}
+
+/*
+ * Must be called with io_request_lock held and interrupts disabled
+ */
+void inline blkdev_release_request(struct request *req)
+{
+	req->rq_status = RQ_INACTIVE;
+
+	/*
+	 * Request may not have originated from ll_rw_blk
+	 */
+	if (req->q) {
+		list_add(&req->table, &req->q->request_freelist);
+		req->q->queue_requests--;
+		wake_up(&req->q->wait_for_request);
 	}
 }
 
@@ -461,13 +523,12 @@
 	if(!(q->merge_requests_fn)(q, req, next, max_segments))
 		return;
 
-	elevator_merge_requests(&q->elevator, req, next);
+	elevator_merge_requests(req, next);
 	req->bhtail->b_reqnext = next->bh;
 	req->bhtail = next->bhtail;
 	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
-	next->rq_status = RQ_INACTIVE;
 	list_del(&next->queue);
-	wake_up (&wait_for_request);
+	blkdev_release_request(next);
 }
 
 static inline void attempt_back_merge(request_queue_t * q,
@@ -495,18 +556,16 @@
 }
 
 static inline void __make_request(request_queue_t * q, int rw,
-			   struct buffer_head * bh)
+				  struct buffer_head * bh)
 {
 	int major = MAJOR(bh->b_rdev);
 	unsigned int sector, count;
 	int max_segments = MAX_SEGMENTS;
-	struct request * req;
-	int rw_ahead, max_req, max_sectors;
-	unsigned long flags;
-
-	int orig_latency, latency, starving, sequence;
-	struct list_head * entry, * head = &q->queue_head;
-	elevator_t * elevator;
+	struct request * req = NULL;
+	int rw_ahead, max_sectors, el_ret;
+	struct list_head *head = &q->queue_head;
+	int latency;
+	elevator_t *elevator = &q->elevator;
 
 	count = bh->b_size >> 9;
 	sector = bh->b_rsector;
@@ -540,7 +599,6 @@
 			if (buffer_uptodate(bh)) /* Hmmph! Already have it */
 				goto end_io;
 			kstat.pgpgin++;
-			max_req = NR_REQUEST;	/* reads take precedence */
 			break;
 		case WRITERAW:
 			rw = WRITE;
@@ -557,7 +615,6 @@
 			 * requests are only for reads.
 			 */
 			kstat.pgpgout++;
-			max_req = (NR_REQUEST * 2) / 3;
 			break;
 		default:
 			BUG();
@@ -582,158 +639,82 @@
 
 /* look for a free request. */
 	/*
-	 * Loop uses two requests, 1 for loop and 1 for the real device.
-	 * Cut max_req in half to avoid running out and deadlocking.
-	 */
-	 if ((major == LOOP_MAJOR) || (major == NBD_MAJOR))
-		max_req >>= 1;
-
-	/*
 	 * Try to coalesce the new request with old requests
 	 */
 	max_sectors = get_max_sectors(bh->b_rdev);
 
-	elevator = &q->elevator;
-	orig_latency = elevator_request_latency(elevator, rw);
+	latency = elevator_request_latency(elevator, rw);
 
 	/*
 	 * Now we acquire the request spinlock, we have to be mega careful
 	 * not to schedule or do something nonatomic
 	 */
-	spin_lock_irqsave(&io_request_lock,flags);
-	elevator_debug(q, bh->b_rdev);
+	spin_lock_irq(&io_request_lock);
+	elevator_default_debug(q, bh->b_rdev);
 
 	if (list_empty(head)) {
 		q->plug_device_fn(q, bh->b_rdev); /* is atomic */
 		goto get_rq;
 	}
 
-	/* avoid write-bombs to not hurt iteractiveness of reads */
-	if (rw != READ && elevator->read_pendings)
-		max_segments = elevator->max_bomb_segments;
-
-	sequence = elevator->sequence;
-	latency = orig_latency - elevator->nr_segments;
-	starving = 0;
-	entry = head;
-
-	/*
-	 * The scsi disk and cdrom drivers completely remove the request
-	 * from the queue when they start processing an entry.  For this
-	 * reason it is safe to continue to add links to the top entry
-	 * for those devices.
-	 *
-	 * All other drivers need to jump over the first entry, as that
-	 * entry may be busy being processed and we thus can't change
-	 * it.
-	 */
-	if (q->head_active && !q->plugged)
-		head = head->next;
-
-	while ((entry = entry->prev) != head && !starving) {
-		req = blkdev_entry_to_request(entry);
-		if (!req->q)
-			break;
-		latency += req->nr_segments;
-		if (elevator_sequence_before(req->elevator_sequence, sequence))
-			starving = 1;
-		if (latency < 0)
-			continue;
+	el_ret = elevator->elevator_merge_fn(q, &req, bh, rw, &max_sectors, &max_segments);
+	switch (el_ret) {
 
-		if (req->sem)
-			continue;
-		if (req->cmd != rw)
-			continue;
-		if (req->nr_sectors + count > max_sectors)
-			continue;
-		if (req->rq_dev != bh->b_rdev)
-			continue;
-		/* Can we add it to the end of this request? */
-		if (req->sector + req->nr_sectors == sector) {
-			if (latency - req->nr_segments < 0)
-				break;
-			/*
-			 * The merge_fn is a more advanced way
-			 * of accomplishing the same task.  Instead
-			 * of applying a fixed limit of some sort
-			 * we instead define a function which can
-			 * determine whether or not it is safe to
-			 * merge the request or not.
-			 *
-			 * See if this queue has rules that
-			 * may suggest that we shouldn't merge
-			 * this 
-			 */
-			if(!(q->back_merge_fn)(q, req, bh, max_segments))
+		case ELEVATOR_BACK_MERGE:
+			if (!q->back_merge_fn(q, req, bh, max_segments))
 				break;
 			req->bhtail->b_reqnext = bh;
 			req->bhtail = bh;
-		    	req->nr_sectors = req->hard_nr_sectors += count;
+			req->nr_sectors = req->hard_nr_sectors += count;
+			req->e = elevator;
 			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
-
-			elevator_merge_after(elevator, req, latency);
-
-			/* Can we now merge this req with the next? */
 			attempt_back_merge(q, req, max_sectors, max_segments);
-		/* or to the beginning? */
-		} else if (req->sector - count == sector) {
-			if (starving)
-				break;
-			/*
-			 * The merge_fn is a more advanced way
-			 * of accomplishing the same task.  Instead
-			 * of applying a fixed limit of some sort
-			 * we instead define a function which can
-			 * determine whether or not it is safe to
-			 * merge the request or not.
-			 *
-			 * See if this queue has rules that
-			 * may suggest that we shouldn't merge
-			 * this 
-			 */
-			if(!(q->front_merge_fn)(q, req, bh, max_segments))
+			goto out;
+
+		case ELEVATOR_FRONT_MERGE:
+			if (!q->front_merge_fn(q, req, bh, max_segments))
 				break;
-		    	bh->b_reqnext = req->bh;
-		    	req->bh = bh;
-		    	req->buffer = bh->b_data;
-		    	req->current_nr_sectors = count;
-		    	req->sector = req->hard_sector = sector;
-		    	req->nr_sectors = req->hard_nr_sectors += count;
+			bh->b_reqnext = req->bh;
+			req->bh = bh;
+			req->buffer = bh->b_data;
+			req->current_nr_sectors = count;
+			req->sector = req->hard_sector = sector;
+			req->nr_sectors = req->hard_nr_sectors += count;
+			req->e = elevator;
 			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
-
-			elevator_merge_before(elevator, req, latency);
-
 			attempt_front_merge(q, head, req, max_sectors, max_segments);
-		} else
-			continue;
-
-		q->elevator.sequence++;
-		spin_unlock_irqrestore(&io_request_lock,flags);
-	    	return;
+			goto out;
+		/*
+		 * elevator says don't/can't merge. get new request
+		 */
+		case ELEVATOR_NO_MERGE:
+			break;
 
+		default:
+			printk("elevator returned crap (%d)\n", el_ret);
+			BUG();
 	}
-
-/* find an unused request. */
-get_rq:
-	req = get_request(max_req, bh->b_rdev);
-
+		
 	/*
-	 * if no request available: if rw_ahead, forget it,
-	 * otherwise try again blocking..
+	 * Grab a free request from the freelist. Reads first try their
+	 * own queue - if that is empty, we steal from the write list.
+	 * Writes must block if the write list is empty, and read-aheads
+	 * are not crucial.
 	 */
-	if (!req) {
-		spin_unlock_irqrestore(&io_request_lock,flags);
+get_rq:
+	if ((req = get_request(q, rw)) == NULL) {
+		spin_unlock_irq(&io_request_lock);
 		if (rw_ahead)
 			goto end_io;
-		req = __get_request_wait(max_req, bh->b_rdev);
-		spin_lock_irqsave(&io_request_lock,flags);
 
-		/* revalidate elevator */
-		head = &q->queue_head;
-		if (q->head_active && !q->plugged)
-			head = head->next;
+		req = __get_request_wait(q, rw);
+		spin_lock_irq(&io_request_lock);
 	}
 
+	head = &q->queue_head;
+	if (q->head_active && !q->plugged)
+		head = head->next;
+
 /* fill up the request-info, and add it to the queue */
 	req->cmd = rw;
 	req->errors = 0;
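
The rewritten __make_request() above no longer walks the queue itself; merge decisions are delegated to the queue's elevator through elevator_merge_fn, which reports ELEVATOR_BACK_MERGE, ELEVATOR_FRONT_MERGE or ELEVATOR_NO_MERGE and, for the merge cases, points req at the request to extend. A sketch of the simplest possible callback, with the prototype inferred from the call site above (the real typedef lives in the elevator headers, not in this file):

/*
 * Inferred from the call in __make_request():
 *   elevator->elevator_merge_fn(q, &req, bh, rw, &max_sectors, &max_segments)
 * Always answering ELEVATOR_NO_MERGE forces every buffer head down the
 * get_rq path, i.e. a merge-nothing elevator.
 */
static int noop_elevator_merge(request_queue_t *q, struct request **req,
			       struct buffer_head *bh, int rw,
			       int *max_sectors, int *max_segments)
{
	return ELEVATOR_NO_MERGE;
}
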
@@ -746,20 +727,18 @@
 	req->sem = NULL;
 	req->bh = bh;
 	req->bhtail = bh;
-	req->q = q;
-	add_request(q, req, head, orig_latency);
-	elevator_account_request(elevator, req);
-
-	spin_unlock_irqrestore(&io_request_lock, flags);
+	req->rq_dev = bh->b_rdev;
+	req->e = elevator;
+	add_request(q, req, head, latency);
+out:
+	spin_unlock_irq(&io_request_lock);
 	return;
-
 end_io:
 	bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
 }
 
 int generic_make_request (request_queue_t *q, int rw, struct buffer_head * bh)
 {
-	unsigned long flags;
 	int ret;
 
 	/*
@@ -767,7 +746,6 @@
 	 * still free to implement/resolve their own stacking
 	 * by explicitly returning 0)
 	 */
-
 	while (q->make_request_fn) {
 		ret = q->make_request_fn(q, rw, bh);
 		if (ret > 0) {
@@ -781,10 +759,10 @@
 	 * the IO request? (normal case)
 	 */
 	__make_request(q, rw, bh);
-	spin_lock_irqsave(&io_request_lock,flags);
+	spin_lock_irq(&io_request_lock);
 	if (q && !q->plugged)
 		(q->request_fn)(q);
-	spin_unlock_irqrestore(&io_request_lock,flags);
+	spin_unlock_irq(&io_request_lock);
 
 	return 0;
 }
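
generic_make_request() keeps looping for as long as the target queue has a make_request_fn installed, so stacking drivers (md, LVM, loop) can remap a buffer head to the device underneath and let the loop resubmit it; as the surviving comment notes, returning 0 means the driver has resolved the stacking itself. A rough sketch of such a callback, registered with the already-exported blk_queue_make_request(); the helpers and the remapping itself are purely hypothetical:

/* Hypothetical mapping helpers standing in for a real driver's tables. */
static kdev_t mystack_lower_device(kdev_t dev);
static unsigned long mystack_sector_offset(kdev_t dev);

static int mystack_make_request(request_queue_t *q, int rw,
				struct buffer_head *bh)
{
	/* redirect the I/O to the underlying device... */
	bh->b_rsector += mystack_sector_offset(bh->b_rdev);
	bh->b_rdev = mystack_lower_device(bh->b_rdev);

	/* ...and let generic_make_request() resubmit the remapped bh */
	return 1;
}
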
@@ -923,28 +901,27 @@
 
 void end_that_request_last(struct request *req)
 {
-	if (req->q)
+	if (req->e) {
+		printk("end_that_request_last called with non-dequeued req\n");
 		BUG();
+	}
 	if (req->sem != NULL)
 		up(req->sem);
-	req->rq_status = RQ_INACTIVE;
-	wake_up(&wait_for_request);
+
+	blkdev_release_request(req);
 }
 
 int __init blk_dev_init(void)
 {
-	struct request * req;
 	struct blk_dev_struct *dev;
 
-	for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;) {
+	request_cachep = kmem_cache_create("blkdev_requests",
+					   sizeof(struct request),
+					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+
+	for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
 		dev->queue = NULL;
-		blk_init_queue(&dev->request_queue, NULL);
-	}
 
-	req = all_requests + NR_REQUEST;
-	while (--req >= all_requests) {
-		req->rq_status = RQ_INACTIVE;
-	}
 	memset(ro_bits,0,sizeof(ro_bits));
 	memset(max_readahead, 0, sizeof(max_readahead));
 	memset(max_sectors, 0, sizeof(max_sectors));
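
Two consequences of the hunks above for drivers: end_that_request_last() now BUGs if the request still carries its elevator pointer (req->e), so the request must be dequeued first, and the final free goes through blkdev_release_request(), which puts the request back on its queue's freelist and wakes one waiter (hence the new export at the end of this patch). A sketch of the completion path this implies; blkdev_dequeue_request() is assumed to be the <linux/blkdev.h> helper that unlinks the request and drops its elevator linkage, and the driver name is hypothetical:

/*
 * Hypothetical driver-side completion, called with io_request_lock held
 * (blkdev_release_request() requires it).
 */
static void mydrv_end_request(struct request *req, int uptodate)
{
	/* finish the attached buffer heads; returns 0 when none remain */
	while (end_that_request_first(req, uptodate, "mydrv"))
		;

	/* unlink from the queue and drop req->e before the final call,
	 * since end_that_request_last() now BUGs on a non-dequeued request */
	blkdev_dequeue_request(req);

	/* ups req->sem if set, then blkdev_release_request() returns the
	 * request to its queue's freelist and wakes a waiter */
	end_that_request_last(req);
}
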
@@ -1070,3 +1047,4 @@
 EXPORT_SYMBOL(blk_queue_pluggable);
 EXPORT_SYMBOL(blk_queue_make_request);
 EXPORT_SYMBOL(generic_make_request);
+EXPORT_SYMBOL(blkdev_release_request);
