patch-2.4.0-test2 linux/drivers/block/raid1.c
- Lines: 1163
- Date: Wed Jun 21 10:10:02 2000
- Orig file: v2.4.0-test1/linux/drivers/block/raid1.c
- Orig date: Fri May 12 14:18:55 2000
diff -u --recursive --new-file v2.4.0-test1/linux/drivers/block/raid1.c linux/drivers/block/raid1.c
@@ -7,6 +7,11 @@
*
* RAID-1 management functions.
*
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
+ *
+ * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
@@ -26,10 +31,7 @@
#define MD_DRIVER
#define MD_PERSONALITY
-#define MAX_LINEAR_SECTORS 128
-
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX_WORK_PER_DISK 128
/*
* The following can be used to debug the driver
@@ -41,48 +43,256 @@
#define inline
#define __inline__
#else
-#define inline
-#define __inline__
#define PRINTK(x...) do { } while (0)
#endif
static mdk_personality_t raid1_personality;
static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
-struct buffer_head *raid1_retry_list = NULL, **raid1_retry_tail;
+struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
-static void * raid1_kmalloc (int size)
+static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
{
- void * ptr;
- /*
- * now we are rather fault tolerant than nice, but
- * there are a couple of places in the RAID code where we
- * simply can not afford to fail an allocation because
- * there is no failure return path (eg. make_request())
- */
- while (!(ptr = kmalloc (size, GFP_KERNEL)))
- printk ("raid1: out of memory, retrying...\n");
+ /* Return a linked list of "cnt" struct buffer_heads.
+ * Don't take any off the free list unless we know we can
+ * get all we need; otherwise we could deadlock.
+ */
+ struct buffer_head *bh=NULL;
+
+ while(cnt) {
+ struct buffer_head *t;
+ md_spin_lock_irq(&conf->device_lock);
+ if (conf->freebh_cnt >= cnt)
+ while (cnt) {
+ t = conf->freebh;
+ conf->freebh = t->b_next;
+ t->b_next = bh;
+ bh = t;
+ t->b_state = 0;
+ conf->freebh_cnt--;
+ cnt--;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+ if (cnt == 0)
+ break;
+ t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_KERNEL);
+ if (t) {
+ memset(t, 0, sizeof(*t));
+ t->b_next = bh;
+ bh = t;
+ cnt--;
+ } else {
+ PRINTK("waiting for %d bh\n", cnt);
+ wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt);
+ }
+ }
+ return bh;
+}
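
The comment at the top of raid1_alloc_bh() states the key invariant: never dequeue from the shared free list unless the whole request can be satisfied, so two allocators can never each grab part of the pool and then block waiting for the rest. Below is a minimal userspace sketch of that all-or-nothing reservation pattern, purely for illustration; it uses pthreads, the names pool_t/pool_take/pool_put are hypothetical, and the kmalloc() fallback of the real function is omitted.

    /* All-or-nothing pool reservation: block until the request can be
     * satisfied in full, never take a partial batch. */
    #include <pthread.h>
    #include <stddef.h>

    struct node { struct node *next; };

    typedef struct {
        pthread_mutex_t lock;
        pthread_cond_t  more;        /* signalled whenever nodes are freed */
        struct node    *free_list;
        int             free_cnt;
    } pool_t;

    /* Take exactly cnt nodes, sleeping until that many are available. */
    static struct node *pool_take(pool_t *p, int cnt)
    {
        struct node *out = NULL;

        pthread_mutex_lock(&p->lock);
        while (p->free_cnt < cnt)                /* all-or-nothing check */
            pthread_cond_wait(&p->more, &p->lock);
        while (cnt--) {
            struct node *t = p->free_list;
            p->free_list = t->next;
            t->next = out;
            out = t;
            p->free_cnt--;
        }
        pthread_mutex_unlock(&p->lock);
        return out;
    }

    /* Return a chain of nodes and wake any waiters. */
    static void pool_put(pool_t *p, struct node *chain)
    {
        pthread_mutex_lock(&p->lock);
        while (chain) {
            struct node *t = chain;
            chain = chain->next;
            t->next = p->free_list;
            p->free_list = t;
            p->free_cnt++;
        }
        pthread_cond_broadcast(&p->more);
        pthread_mutex_unlock(&p->lock);
    }

The real function additionally tries kmalloc() for the shortfall before sleeping, so it only waits when memory is genuinely tight.
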
- memset(ptr, 0, size);
- return ptr;
+static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
+{
+ md_spin_lock_irq(&conf->device_lock);
+ while (bh) {
+ struct buffer_head *t = bh;
+ bh=bh->b_next;
+ if (t->b_pprev == NULL)
+ kfree(t);
+ else {
+ t->b_next= conf->freebh;
+ conf->freebh = t;
+ conf->freebh_cnt++;
+ }
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_buffer);
}
-static struct page * raid1_gfp (void)
+static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
{
- struct page *page;
- /*
- * now we are rather fault tolerant than nice, but
- * there are a couple of places in the RAID code where we
- * simply can not afford to fail an allocation because
- * there is no failure return path (eg. make_request())
- * FIXME: be nicer here.
- */
- while (!(page = (void*)alloc_page(GFP_KERNEL))) {
- printk ("raid1: GFP out of memory, retrying...\n");
- schedule_timeout(2);
+ /* allocate cnt buffer_heads, possibly fewer if kmalloc fails */
+ int i = 0;
+
+ while (i < cnt) {
+ struct buffer_head *bh;
+ bh = kmalloc(sizeof(*bh), GFP_KERNEL);
+ if (!bh) break;
+ memset(bh, 0, sizeof(*bh));
+
+ md_spin_lock_irq(&conf->device_lock);
+ bh->b_pprev = &conf->freebh;
+ bh->b_next = conf->freebh;
+ conf->freebh = bh;
+ conf->freebh_cnt++;
+ md_spin_unlock_irq(&conf->device_lock);
+
+ i++;
+ }
+ return i;
+}
+
+static int raid1_shrink_bh(raid1_conf_t *conf, int cnt)
+{
+ /* discard cnt buffer_heads, if we can find them */
+ int i = 0;
+
+ md_spin_lock_irq(&conf->device_lock);
+ while ((i < cnt) && conf->freebh) {
+ struct buffer_head *bh = conf->freebh;
+ conf->freebh = bh->b_next;
+ kfree(bh);
+ i++;
+ conf->freebh_cnt--;
}
+ md_spin_unlock_irq(&conf->device_lock);
+ return i;
+}
+
- return page;
+static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
+{
+ struct raid1_bh *r1_bh = NULL;
+
+ do {
+ md_spin_lock_irq(&conf->device_lock);
+ if (conf->freer1) {
+ r1_bh = conf->freer1;
+ conf->freer1 = r1_bh->next_r1;
+ r1_bh->next_r1 = NULL;
+ r1_bh->state = 0;
+ r1_bh->bh_req.b_state = 0;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+ if (r1_bh)
+ return r1_bh;
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh),
+ GFP_KERNEL);
+ if (r1_bh) {
+ memset(r1_bh, 0, sizeof(*r1_bh));
+ return r1_bh;
+ }
+ wait_event(conf->wait_buffer, conf->freer1);
+ } while (1);
+}
+
+static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
+{
+ struct buffer_head *bh = r1_bh->mirror_bh_list;
+ raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+
+ r1_bh->mirror_bh_list = NULL;
+
+ if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
+ md_spin_lock_irq(&conf->device_lock);
+ r1_bh->next_r1 = conf->freer1;
+ conf->freer1 = r1_bh;
+ md_spin_unlock_irq(&conf->device_lock);
+ } else {
+ kfree(r1_bh);
+ }
+ raid1_free_bh(conf, bh);
+}
+
+static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
+{
+ int i = 0;
+
+ while (i < cnt) {
+ struct raid1_bh *r1_bh;
+ r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+ if (!r1_bh)
+ break;
+ memset(r1_bh, 0, sizeof(*r1_bh));
+
+ md_spin_lock_irq(&conf->device_lock);
+ set_bit(R1BH_PreAlloc, &r1_bh->state);
+ r1_bh->next_r1 = conf->freer1;
+ conf->freer1 = r1_bh;
+ md_spin_unlock_irq(&conf->device_lock);
+
+ i++;
+ }
+ return i;
+}
+
+static void raid1_shrink_r1bh(raid1_conf_t *conf)
+{
+ md_spin_lock_irq(&conf->device_lock);
+ while (conf->freer1) {
+ struct raid1_bh *r1_bh = conf->freer1;
+ conf->freer1 = r1_bh->next_r1;
+ kfree(r1_bh);
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+}
+
+
+
+static inline void raid1_free_buf(struct raid1_bh *r1_bh)
+{
+ struct buffer_head *bh = r1_bh->mirror_bh_list;
+ raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+ r1_bh->mirror_bh_list = NULL;
+
+ md_spin_lock_irq(&conf->device_lock);
+ r1_bh->next_r1 = conf->freebuf;
+ conf->freebuf = r1_bh;
+ md_spin_unlock_irq(&conf->device_lock);
+ raid1_free_bh(conf, bh);
+}
+
+static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
+{
+ struct raid1_bh *r1_bh;
+
+ md_spin_lock_irq(&conf->device_lock);
+ wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
+ r1_bh = conf->freebuf;
+ conf->freebuf = r1_bh->next_r1;
+ r1_bh->next_r1= NULL;
+ md_spin_unlock_irq(&conf->device_lock);
+
+ return r1_bh;
+}
+
+static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
+{
+ int i = 0;
+
+ md_spin_lock_irq(&conf->device_lock);
+ while (i < cnt) {
+ struct raid1_bh *r1_bh;
+ struct page *page;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ break;
+
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+ if (!r1_bh) {
+ __free_page(page);
+ break;
+ }
+ memset(r1_bh, 0, sizeof(*r1_bh));
+ r1_bh->bh_req.b_page = page;
+ r1_bh->bh_req.b_data = (char *) page_address(page);
+ r1_bh->next_r1 = conf->freebuf;
+ conf->freebuf = r1_bh;
+ i++;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+ return i;
+}
+
+static void raid1_shrink_buffers (raid1_conf_t *conf)
+{
+ md_spin_lock_irq(&conf->device_lock);
+ while (conf->freebuf) {
+ struct raid1_bh *r1_bh = conf->freebuf;
+ conf->freebuf = r1_bh->next_r1;
+ __free_page(r1_bh->bh_req.b_page);
+ kfree(r1_bh);
+ }
+ md_spin_unlock_irq(&conf->device_lock);
}
static int raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size)
@@ -106,19 +316,18 @@
return (-1);
}
-static void raid1_reschedule_retry (struct buffer_head *bh)
+static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
{
unsigned long flags;
- struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
mddev_t *mddev = r1_bh->mddev;
raid1_conf_t *conf = mddev_to_conf(mddev);
md_spin_lock_irqsave(&retry_list_lock, flags);
if (raid1_retry_list == NULL)
raid1_retry_tail = &raid1_retry_list;
- *raid1_retry_tail = bh;
- raid1_retry_tail = &r1_bh->next_retry;
- r1_bh->next_retry = NULL;
+ *raid1_retry_tail = r1_bh;
+ raid1_retry_tail = &r1_bh->next_r1;
+ r1_bh->next_r1 = NULL;
md_spin_unlock_irqrestore(&retry_list_lock, flags);
md_wakeup_thread(conf->thread);
}
@@ -166,7 +375,7 @@
test_bit(R1BH_SyncPhase, &r1_bh->state));
bh->b_end_io(bh, uptodate);
- kfree(r1_bh);
+ raid1_free_r1bh(r1_bh);
}
void raid1_end_request (struct buffer_head *bh, int uptodate)
{
@@ -176,7 +385,7 @@
* this branch is our 'one mirror IO has finished' event handler:
*/
if (!uptodate)
- md_error (bh->b_dev, bh->b_rdev);
+ md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
else
/*
* Set R1BH_Uptodate in our master buffer_head, so that
@@ -208,7 +417,7 @@
*/
printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
partition_name(bh->b_dev), bh->b_blocknr);
- raid1_reschedule_retry(bh);
+ raid1_reschedule_retry(r1_bh);
return;
}
@@ -219,29 +428,128 @@
* already.
*/
- if (atomic_dec_and_test(&r1_bh->remaining)) {
- int i, disks = MD_SB_DISKS;
+ if (atomic_dec_and_test(&r1_bh->remaining))
+ raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
+}
- for ( i = 0; i < disks; i++) {
- struct buffer_head *bh = r1_bh->mirror_bh[i];
- if (bh) {
- // FIXME: make us a regular bcache member
- kfree(bh);
- }
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. It keeps track of the last read position for every disk
+ * in the array, and when a new read request comes in, the disk whose
+ * last position is nearest to the request is chosen.
+ *
+ * TODO: currently, if there are two mirrors on the same two devices,
+ * performance degrades dramatically because the position is tracked
+ * per mirror, not per device. This should be changed to be device
+ * based. Also, atomic sequential reads should somehow be balanced.
+ */
+
+static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
+{
+ int new_disk = conf->last_used;
+ const int sectors = bh->b_size >> 9;
+ const long this_sector = bh->b_blocknr * sectors;
+ int disk = new_disk;
+ unsigned long new_distance;
+ unsigned long current_distance;
+
+ /*
+ * Check if it is sane at all to balance
+ */
+
+ if (conf->resync_mirrors)
+ goto rb_out;
+
+ if (conf->working_disks < 2) {
+ int i = 0;
+
+ while( !conf->mirrors[new_disk].operational &&
+ (i < MD_SB_DISKS) ) {
+ new_disk = conf->mirrors[new_disk].next;
+ i++;
}
+
+ if (i >= MD_SB_DISKS) {
+ /*
+ * This means no working disk was found.
+ * Nothing much to do; let's not change anything
+ * and hope for the best...
+ */
+
+ new_disk = conf->last_used;
+ }
+
+ goto rb_out;
+ }
- raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
+ /*
+ * Don't touch anything for sequential reads.
+ */
+
+ if (this_sector == conf->mirrors[new_disk].head_position)
+ goto rb_out;
+
+ /*
+ * If reads have been done only on a single disk
+ * for a while, let's give another disk a chance.
+ * This is for kicking those idling disks so that
+ * they will find work near some hotspot.
+ */
+
+ if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
+ conf->sect_count = 0;
+
+ while( new_disk != conf->mirrors[new_disk].next ) {
+ if ((conf->mirrors[new_disk].write_only) ||
+ (!conf->mirrors[new_disk].operational) )
+ continue;
+
+ new_disk = conf->mirrors[new_disk].next;
+ break;
+ }
+
+ goto rb_out;
}
+
+ current_distance = abs(this_sector -
+ conf->mirrors[disk].head_position);
+
+ /* Find the disk which is closest */
+
+ while( conf->mirrors[disk].next != conf->last_used ) {
+ disk = conf->mirrors[disk].next;
+
+ if ((conf->mirrors[disk].write_only) ||
+ (!conf->mirrors[disk].operational))
+ continue;
+
+ new_distance = abs(this_sector -
+ conf->mirrors[disk].head_position);
+
+ if (new_distance < current_distance) {
+ conf->sect_count = 0;
+ current_distance = new_distance;
+ new_disk = disk;
+ }
+ }
+
+rb_out:
+ conf->mirrors[new_disk].head_position = this_sector + sectors;
+
+ conf->last_used = new_disk;
+ conf->sect_count += sectors;
+
+ return new_disk;
}
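
Stripped of the sequential-read shortcut and the sect_limit hot-spot kick, the core of the balancer above is "pick the operational, readable mirror whose recorded head position is closest to the request". A simplified standalone illustration follows; the struct layout and function name are stand-ins, not the driver's types.

    #include <stdlib.h>

    struct mirror_pos {
        long head_position;   /* sector this mirror's head was last left at */
        int  operational;
        int  write_only;      /* e.g. still being reconstructed */
    };

    /* Return the index of the readable mirror with the shortest seek,
     * or -1 if none is readable. */
    static int closest_mirror(const struct mirror_pos *m, int n, long sector)
    {
        int best = -1;
        unsigned long best_dist = ~0UL;

        for (int i = 0; i < n; i++) {
            unsigned long dist;

            if (!m[i].operational || m[i].write_only)
                continue;
            dist = labs(sector - m[i].head_position);
            if (dist < best_dist) {
                best_dist = dist;
                best = i;
            }
        }
        return best;
    }

The driver then advances the chosen mirror's head_position past the request (head_position = this_sector + sectors), which is what lets the sequential-read shortcut at the top of the function hit on the next consecutive block.
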
static int raid1_make_request (request_queue_t *q, mddev_t *mddev, int rw,
struct buffer_head * bh)
{
raid1_conf_t *conf = mddev_to_conf(mddev);
- struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req;
+ struct buffer_head *bh_req, *bhl;
struct raid1_bh * r1_bh;
int disks = MD_SB_DISKS;
- int i, sum_bhs = 0, switch_disks = 0, sectors;
+ int i, sum_bhs = 0, sectors;
struct mirror_info *mirror;
DECLARE_WAITQUEUE(wait, current);
@@ -279,8 +587,7 @@
return 0;
}
}
- r1_bh = raid1_kmalloc (sizeof (struct raid1_bh));
-
+ r1_bh = raid1_alloc_r1bh (conf);
spin_lock_irq(&conf->segment_lock);
wait_event_lock_irq(conf->wait_done,
@@ -305,41 +612,20 @@
r1_bh->master_bh = bh;
r1_bh->mddev = mddev;
r1_bh->cmd = rw;
- bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
+ sectors = bh->b_size >> 9;
if (rw == READ) {
- int last_used = conf->last_used;
-
/*
* read balancing logic:
*/
- mirror = conf->mirrors + last_used;
- bh->b_rdev = mirror->dev;
- sectors = bh->b_size >> 9;
-
- switch_disks = 0;
- if (bh->b_blocknr * sectors == conf->next_sect) {
- conf->sect_count += sectors;
- if (conf->sect_count >= mirror->sect_limit)
- switch_disks = 1;
- } else
- switch_disks = 1;
- conf->next_sect = (bh->b_blocknr + 1) * sectors;
- /*
- * Do not switch disks if full resync is in progress ...
- */
- if (switch_disks && !conf->resync_mirrors) {
- conf->sect_count = 0;
- last_used = conf->last_used = mirror->next;
- /*
- * Do not switch to write-only disks ...
- * reconstruction is in progress
- */
- while (conf->mirrors[last_used].write_only)
- conf->last_used = conf->mirrors[last_used].next;
- }
+ mirror = conf->mirrors + raid1_read_balance(conf, bh);
+
bh_req = &r1_bh->bh_req;
memcpy(bh_req, bh, sizeof(*bh));
+ bh_req->b_blocknr = bh->b_rsector * sectors;
+ bh_req->b_dev = mirror->dev;
+ bh_req->b_rdev = mirror->dev;
+ /* bh_req->b_rsector = bh->n_rsector; */
bh_req->b_end_io = raid1_end_request;
bh_req->b_dev_id = r1_bh;
q = blk_get_queue(bh_req->b_rdev);
@@ -351,15 +637,11 @@
* WRITE:
*/
+ bhl = raid1_alloc_bh(conf, conf->raid_disks);
for (i = 0; i < disks; i++) {
-
- if (!conf->mirrors[i].operational) {
- /*
- * the r1_bh->mirror_bh[i] pointer remains NULL
- */
- mirror_bh[i] = NULL;
+ struct buffer_head *mbh;
+ if (!conf->mirrors[i].operational)
continue;
- }
/*
* We should use a private pool (size depending on NR_REQUEST),
@@ -373,30 +655,38 @@
* manner in the write branch. Look how we lock the buffer at the
* beginning of this function to grok the difference ;)
*/
- mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head));
- mirror_bh[i]->b_this_page = (struct buffer_head *)1;
+ mbh = bhl;
+ if (mbh == NULL) {
+ MD_BUG();
+ break;
+ }
+ bhl = mbh->b_next;
+ mbh->b_next = NULL;
+ mbh->b_this_page = (struct buffer_head *)1;
/*
- * prepare mirrored bh (fields ordered for max mem throughput):
+ * prepare mirrored mbh (fields ordered for max mem throughput):
*/
- mirror_bh[i]->b_blocknr = bh->b_blocknr;
- mirror_bh[i]->b_dev = bh->b_dev;
- mirror_bh[i]->b_rdev = conf->mirrors[i].dev;
- mirror_bh[i]->b_rsector = bh->b_rsector;
- mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
+ mbh->b_blocknr = bh->b_rsector * sectors;
+ mbh->b_dev = conf->mirrors[i].dev;
+ mbh->b_rdev = conf->mirrors[i].dev;
+ mbh->b_rsector = bh->b_rsector;
+ mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
(1<<BH_Mapped) | (1<<BH_Lock);
- atomic_set(&mirror_bh[i]->b_count, 1);
- mirror_bh[i]->b_size = bh->b_size;
- mirror_bh[i]->b_data = bh->b_data;
- mirror_bh[i]->b_list = BUF_LOCKED;
- mirror_bh[i]->b_end_io = raid1_end_request;
- mirror_bh[i]->b_dev_id = r1_bh;
-
- r1_bh->mirror_bh[i] = mirror_bh[i];
+ atomic_set(&mbh->b_count, 1);
+ mbh->b_size = bh->b_size;
+ mbh->b_page = bh->b_page;
+ mbh->b_data = bh->b_data;
+ mbh->b_list = BUF_LOCKED;
+ mbh->b_end_io = raid1_end_request;
+ mbh->b_dev_id = r1_bh;
+
+ mbh->b_next = r1_bh->mirror_bh_list;
+ r1_bh->mirror_bh_list = mbh;
sum_bhs++;
}
-
+ if (bhl) raid1_free_bh(conf,bhl);
md_atomic_set(&r1_bh->remaining, sum_bhs);
/*
@@ -410,12 +700,12 @@
* all requests finish until we had a chance to set up the
* semaphore correctly ... lots of races).
*/
- for (i = 0; i < disks; i++) {
- struct buffer_head *mbh = mirror_bh[i];
- if (mbh) {
- q = blk_get_queue(mbh->b_rdev);
- generic_make_request(q, rw, mbh);
- }
+ bh = r1_bh->mirror_bh_list;
+ while(bh) {
+ struct buffer_head *bh2 = bh;
+ bh = bh->b_next;
+ q = blk_get_queue(bh2->b_rdev);
+ generic_make_request(q, rw, bh2);
}
return (0);
}
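
The write path above relies on a fan-out/fan-in completion scheme: one buffer_head per operational mirror is chained onto mirror_bh_list, r1_bh->remaining is set to the number submitted, and whichever mirror completes last (the atomic_dec_and_test() in raid1_end_request()) finishes the master request. Below is a standalone C11 sketch of that counting pattern with made-up names, purely for illustration.

    #include <stdatomic.h>
    #include <stdio.h>

    struct master_req {
        atomic_int remaining;               /* submissions still in flight */
        void (*done)(struct master_req *);  /* completion for the master */
    };

    /* Called once per mirror write completion; the last one to arrive
     * drops the counter to zero and ends the master request. */
    static void mirror_write_done(struct master_req *m)
    {
        if (atomic_fetch_sub(&m->remaining, 1) == 1)
            m->done(m);
    }

    static void master_done(struct master_req *m)
    {
        (void)m;
        printf("master request complete\n");
    }

    int main(void)
    {
        struct master_req m;
        int nmirrors = 3;

        m.done = master_done;
        atomic_init(&m.remaining, nmirrors);   /* like md_atomic_set(&remaining, sum_bhs) */
        for (int i = 0; i < nmirrors; i++)
            mirror_write_done(&m);             /* simulate each mirror completing */
        return 0;
    }

atomic_fetch_sub() returning 1 here plays the role of the kernel's atomic_dec_and_test(): the decrement that takes the count to zero sees the old value 1.
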
@@ -792,6 +1082,7 @@
adisk->write_only = 0;
adisk->spare = 1;
adisk->used_slot = 1;
+ adisk->head_position = 0;
conf->nr_disks++;
break;
@@ -803,6 +1094,10 @@
}
abort:
md_spin_unlock_irq(&conf->device_lock);
+ if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
+ /* should move to "END_REBUILD" when such exists */
+ raid1_shrink_buffers(conf);
+
print_raid1_conf(conf);
return err;
}
@@ -836,19 +1131,19 @@
for (;;) {
md_spin_lock_irqsave(&retry_list_lock, flags);
- bh = raid1_retry_list;
- if (!bh)
+ r1_bh = raid1_retry_list;
+ if (!r1_bh)
break;
- r1_bh = (struct raid1_bh *)(bh->b_dev_id);
- raid1_retry_list = r1_bh->next_retry;
+ raid1_retry_list = r1_bh->next_r1;
md_spin_unlock_irqrestore(&retry_list_lock, flags);
- mddev = kdev_to_mddev(bh->b_dev);
+ mddev = r1_bh->mddev;
if (mddev->sb_dirty) {
printk(KERN_INFO "dirty sb detected, updating.\n");
mddev->sb_dirty = 0;
md_update_sb(mddev);
}
+ bh = &r1_bh->bh_req;
switch(r1_bh->cmd) {
case SPECIAL:
/* have to allocate lots of bh structures and
@@ -857,68 +1152,74 @@
if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
int i, sum_bhs = 0;
int disks = MD_SB_DISKS;
- struct buffer_head *mirror_bh[MD_SB_DISKS];
+ struct buffer_head *bhl, *mbh;
raid1_conf_t *conf;
+ int sectors = bh->b_size >> 9;
conf = mddev_to_conf(mddev);
+ bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
for (i = 0; i < disks ; i++) {
- if (!conf->mirrors[i].operational) {
- mirror_bh[i] = NULL;
+ if (!conf->mirrors[i].operational)
continue;
- }
- if (i==conf->last_used) {
+ if (i==conf->last_used)
/* we read from here, no need to write */
- mirror_bh[i] = NULL;
continue;
- }
if (i < conf->raid_disks
- && !conf->resync_mirrors) {
+ && !conf->resync_mirrors)
/* don't need to write this,
* we are just rebuilding */
- mirror_bh[i] = NULL;
continue;
+ mbh = bhl;
+ if (!mbh) {
+ MD_BUG();
+ break;
}
+ bhl = mbh->b_next;
+ mbh->b_this_page = (struct buffer_head *)1;
- mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head));
- mirror_bh[i]->b_this_page = (struct buffer_head *)1;
/*
* prepare mirrored bh (fields ordered for max mem throughput):
*/
- mirror_bh[i]->b_blocknr = bh->b_blocknr;
- mirror_bh[i]->b_dev = bh->b_dev;
- mirror_bh[i]->b_rdev = conf->mirrors[i].dev;
- mirror_bh[i]->b_rsector = bh->b_rsector;
- mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
+ mbh->b_blocknr = bh->b_blocknr;
+ mbh->b_dev = conf->mirrors[i].dev;
+ mbh->b_rdev = conf->mirrors[i].dev;
+ mbh->b_rsector = bh->b_blocknr * sectors;
+ mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
(1<<BH_Mapped) | (1<<BH_Lock);
+ atomic_set(&mbh->b_count, 1);
+ mbh->b_size = bh->b_size;
+ mbh->b_page = bh->b_page;
+ mbh->b_data = bh->b_data;
+ mbh->b_list = BUF_LOCKED;
+ mbh->b_end_io = end_sync_write;
+ mbh->b_dev_id = r1_bh;
+
+ mbh->b_next = r1_bh->mirror_bh_list;
+ r1_bh->mirror_bh_list = mbh;
- atomic_set(&mirror_bh[i]->b_count, 1);
- mirror_bh[i]->b_size = bh->b_size;
- mirror_bh[i]->b_data = bh->b_data;
- mirror_bh[i]->b_list = BUF_LOCKED;
- mirror_bh[i]->b_end_io = end_sync_write;
- mirror_bh[i]->b_dev_id = r1_bh;
-
- r1_bh->mirror_bh[i] = mirror_bh[i];
sum_bhs++;
}
md_atomic_set(&r1_bh->remaining, sum_bhs);
- for ( i = 0; i < disks ; i++) {
- struct buffer_head *mbh = mirror_bh[i];
- if (mbh) {
- q = blk_get_queue(mbh->b_rdev);
- generic_make_request(q, WRITE, mbh);
- }
+ if (bhl) raid1_free_bh(conf, bhl);
+ mbh = r1_bh->mirror_bh_list;
+ while (mbh) {
+ struct buffer_head *bh1 = mbh;
+ mbh = mbh->b_next;
+ q = blk_get_queue(bh1->b_rdev);
+ generic_make_request(q, WRITE, bh1);
+ drive_stat_acct(bh1->b_rdev, WRITE, -bh1->b_size/512, 0);
}
} else {
- dev = bh->b_rdev;
- raid1_map (mddev, &bh->b_rdev, bh->b_size >> 9);
- if (bh->b_rdev == dev) {
+ dev = bh->b_dev;
+ raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
+ if (bh->b_dev == dev) {
printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
md_done_sync(mddev, bh->b_size>>10, 0);
} else {
printk (REDIRECT_SECTOR,
partition_name(bh->b_dev), bh->b_blocknr);
+ bh->b_rdev = bh->b_dev;
q = blk_get_queue(bh->b_rdev);
generic_make_request (q, READ, bh);
}
@@ -927,15 +1228,16 @@
break;
case READ:
case READA:
- dev = bh->b_rdev;
+ dev = bh->b_dev;
- raid1_map (mddev, &bh->b_rdev, bh->b_size >> 9);
- if (bh->b_rdev == dev) {
+ raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
+ if (bh->b_dev == dev) {
printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
raid1_end_bh_io(r1_bh, 0);
} else {
printk (REDIRECT_SECTOR,
partition_name(bh->b_dev), bh->b_blocknr);
+ bh->b_rdev = bh->b_dev;
q = blk_get_queue(bh->b_rdev);
generic_make_request (q, r1_bh->cmd, bh);
}
@@ -961,15 +1263,37 @@
if (conf->resync_mirrors == 2)
return;
down(&mddev->recovery_sem);
- if (md_do_sync(mddev, NULL)) {
- up(&mddev->recovery_sem);
- return;
+ if (!md_do_sync(mddev, NULL)) {
+ /*
+ * Only if everything went Ok.
+ */
+ conf->resync_mirrors = 0;
}
- /*
- * Only if everything went Ok.
+
+ /* If reconstruction was interrupted, we need to close the "active" and "pending"
+ * holes.
+ * We know that there are no active rebuild requests, so cnt_active == cnt_ready == 0.
*/
- conf->resync_mirrors = 0;
+ /* this is really needed when recovery stops too... */
+ spin_lock_irq(&conf->segment_lock);
+ conf->start_active = conf->start_pending;
+ conf->start_ready = conf->start_pending;
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+ conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
+ conf->start_future = mddev->sb->size+1;
+ conf->cnt_pending = conf->cnt_future;
+ conf->cnt_future = 0;
+ conf->phase = conf->phase ^1;
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+ conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
+ conf->phase = 0;
+ conf->cnt_future = conf->cnt_done;
+ conf->cnt_done = 0;
+ spin_unlock_irq(&conf->segment_lock);
+ wake_up(&conf->wait_done);
+
up(&mddev->recovery_sem);
+ raid1_shrink_buffers(conf);
}
/*
@@ -1032,12 +1356,19 @@
spin_lock_irq(&conf->segment_lock);
if (!block_nr) {
/* initialize ...*/
+ int buffs;
conf->start_active = 0;
conf->start_ready = 0;
conf->start_pending = 0;
conf->start_future = 0;
conf->phase = 0;
- conf->window = 128;
+ /* we want enough buffers to hold twice the window of 128 */
+ buffs = 128 *2 / (PAGE_SIZE>>9);
+ buffs = raid1_grow_buffers(conf, buffs);
+ if (buffs < 2)
+ goto nomem;
+
+ conf->window = buffs*(PAGE_SIZE>>9)/2;
conf->cnt_future += conf->cnt_done+conf->cnt_pending;
conf->cnt_done = conf->cnt_pending = 0;
if (conf->cnt_ready || conf->cnt_active)
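
To make the sizing above concrete: with 4 KiB pages, PAGE_SIZE>>9 is 8 sectors per page, so buffs = 128*2/8 = 32 pages are requested and the window works back out to 32*8/2 = 128 sectors; with 8 KiB pages it is 16 larger pages for the same 128-sector window. Only if fewer than two pages can be allocated does the sync request give up through the nomem path and return -ENOMEM.
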
@@ -1057,7 +1388,7 @@
conf->start_ready = conf->start_pending;
conf->start_pending = conf->start_future;
conf->start_future = conf->start_future+conf->window;
- // Note: falling of the end is not a problem
+ // Note: falling off the end is not a problem
conf->phase = conf->phase ^1;
conf->cnt_active = conf->cnt_ready;
conf->cnt_ready = 0;
@@ -1075,12 +1406,11 @@
*/
mirror = conf->mirrors+conf->last_used;
- r1_bh = raid1_kmalloc (sizeof (struct raid1_bh));
+ r1_bh = raid1_alloc_buf (conf);
r1_bh->master_bh = NULL;
r1_bh->mddev = mddev;
r1_bh->cmd = SPECIAL;
bh = &r1_bh->bh_req;
- memset(bh, 0, sizeof(*bh));
bh->b_blocknr = block_nr;
bsize = 1024;
@@ -1091,11 +1421,15 @@
}
bh->b_size = bsize;
bh->b_list = BUF_LOCKED;
- bh->b_dev = mddev_to_kdev(mddev);
+ bh->b_dev = mirror->dev;
bh->b_rdev = mirror->dev;
bh->b_state = (1<<BH_Req) | (1<<BH_Mapped);
- bh->b_page = raid1_gfp();
- bh->b_data = (char *) page_address(bh->b_page);
+ if (!bh->b_page)
+ BUG();
+ if (!bh->b_data)
+ BUG();
+ if (bh->b_data != (char *) page_address(bh->b_page))
+ BUG();
bh->b_end_io = end_sync_read;
bh->b_dev_id = (void *) r1_bh;
bh->b_rsector = block_nr<<1;
@@ -1106,6 +1440,11 @@
drive_stat_acct(bh->b_rdev, READ, -bh->b_size/512, 0);
return (bsize >> 10);
+
+nomem:
+ raid1_shrink_buffers(conf);
+ spin_unlock_irq(&conf->segment_lock);
+ return -ENOMEM;
}
static void end_sync_read(struct buffer_head *bh, int uptodate)
@@ -1117,10 +1456,10 @@
* We don't do much here, just schedule handling by raid1d
*/
if (!uptodate)
- md_error (bh->b_dev, bh->b_rdev);
+ md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
else
set_bit(R1BH_Uptodate, &r1_bh->state);
- raid1_reschedule_retry(bh);
+ raid1_reschedule_retry(r1_bh);
}
static void end_sync_write(struct buffer_head *bh, int uptodate)
@@ -1128,22 +1467,12 @@
struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
if (!uptodate)
- md_error (bh->b_dev, bh->b_rdev);
+ md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
if (atomic_dec_and_test(&r1_bh->remaining)) {
- int i, disks = MD_SB_DISKS;
mddev_t *mddev = r1_bh->mddev;
- unsigned long sect = bh->b_rsector;
+ unsigned long sect = bh->b_blocknr * (bh->b_size>>9);
int size = bh->b_size;
-
- free_page((unsigned long)bh->b_data);
- for ( i = 0; i < disks; i++) {
- struct buffer_head *bh = r1_bh->mirror_bh[i];
- if (bh) {
- // FIXME: make us a regular bcache member
- kfree(bh);
- }
- }
- kfree(r1_bh);
+ raid1_free_buf(r1_bh);
sync_request_done(sect, mddev_to_conf(mddev));
md_done_sync(mddev,size>>10, uptodate);
}
@@ -1277,12 +1606,13 @@
* should be freed in raid1_stop()]
*/
- conf = raid1_kmalloc(sizeof(raid1_conf_t));
+ conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
mddev->private = conf;
if (!conf) {
printk(MEM_ERROR, mdidx(mddev));
goto out;
}
+ memset(conf, 0, sizeof(*conf));
ITERATE_RDEV(mddev,rdev,tmp) {
if (rdev->faulty) {
@@ -1305,11 +1635,12 @@
disk->number = descriptor->number;
disk->raid_disk = disk_idx;
disk->dev = rdev->dev;
- disk->sect_limit = MAX_LINEAR_SECTORS;
+ disk->sect_limit = MAX_WORK_PER_DISK;
disk->operational = 0;
disk->write_only = 0;
disk->spare = 0;
disk->used_slot = 1;
+ disk->head_position = 0;
continue;
}
if (disk_active(descriptor)) {
@@ -1336,11 +1667,12 @@
disk->number = descriptor->number;
disk->raid_disk = disk_idx;
disk->dev = rdev->dev;
- disk->sect_limit = MAX_LINEAR_SECTORS;
+ disk->sect_limit = MAX_WORK_PER_DISK;
disk->operational = 1;
disk->write_only = 0;
disk->spare = 0;
disk->used_slot = 1;
+ disk->head_position = 0;
conf->working_disks++;
} else {
/*
@@ -1350,27 +1682,43 @@
disk->number = descriptor->number;
disk->raid_disk = disk_idx;
disk->dev = rdev->dev;
- disk->sect_limit = MAX_LINEAR_SECTORS;
+ disk->sect_limit = MAX_WORK_PER_DISK;
disk->operational = 0;
disk->write_only = 0;
disk->spare = 1;
disk->used_slot = 1;
+ disk->head_position = 0;
}
}
- if (!conf->working_disks) {
- printk(NONE_OPERATIONAL, mdidx(mddev));
- goto out_free_conf;
- }
-
conf->raid_disks = sb->raid_disks;
conf->nr_disks = sb->nr_disks;
conf->mddev = mddev;
conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
+ init_waitqueue_head(&conf->wait_buffer);
init_waitqueue_head(&conf->wait_done);
init_waitqueue_head(&conf->wait_ready);
+ if (!conf->working_disks) {
+ printk(NONE_OPERATIONAL, mdidx(mddev));
+ goto out_free_conf;
+ }
+
+
+ /* Pre-allocate some buffer_head structures.
+ * As a minimum, 1 r1bh and raid_disks buffer_heads
+ * would probably get us by in tight memory situations,
+ * but a few more are probably a good idea.
+ * For now, try 16 r1bh and 16*raid_disks buffer_heads.
+ * This will allow at least 16 concurrent reads or writes
+ * even if kmalloc starts failing.
+ */
+ if (raid1_grow_r1bh(conf, 16) < 16 ||
+ raid1_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) {
+ printk(MEM_ERROR, mdidx(mddev));
+ goto out_free_conf;
+ }
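
In concrete terms, for a two-disk mirror this reserves 16 struct raid1_bh plus 16*2 = 32 struct buffer_head at setup time, a small, bounded amount of memory that keeps up to 16 reads or writes moving even when kmalloc() starts failing under memory pressure; the matching raid1_shrink_r1bh()/raid1_shrink_bh() calls in the out_free_conf path and in raid1_stop() release it again.
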
for (i = 0; i < MD_SB_DISKS; i++) {
@@ -1389,6 +1737,7 @@
disk->write_only = 0;
disk->spare = 0;
disk->used_slot = 1;
+ disk->head_position = 0;
}
}
@@ -1476,6 +1825,9 @@
return 0;
out_free_conf:
+ raid1_shrink_r1bh(conf);
+ raid1_shrink_bh(conf, conf->freebh_cnt);
+ raid1_shrink_buffers(conf);
kfree(conf);
mddev->private = NULL;
out:
@@ -1504,29 +1856,6 @@
conf->resync_mirrors = 2;
md_interrupt_thread(conf->resync_thread);
- /* this is really needed when recovery stops too... */
- spin_lock_irq(&conf->segment_lock);
- wait_event_lock_irq(conf->wait_done, !conf->cnt_active, conf->segment_lock);
- conf->start_active = conf->start_ready;
- conf->start_ready = conf->start_pending;
- conf->cnt_active = conf->cnt_ready;
- conf->cnt_ready = 0;
- wait_event_lock_irq(conf->wait_done, !conf->cnt_active, conf->segment_lock);
- conf->start_active = conf->start_ready;
- conf->cnt_ready = 0;
- wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
- conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
- conf->start_future = mddev->sb->size+1;
- conf->cnt_pending = conf->cnt_future;
- conf->cnt_future = 0;
- conf->phase = conf->phase ^1;
- wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
- conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
- conf->phase = 0;
- conf->cnt_done = conf->cnt_future;
- conf->cnt_future = 0;
- wake_up(&conf->wait_done);
-
printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
return 1;
}
@@ -1558,6 +1887,9 @@
md_unregister_thread(conf->thread);
if (conf->resync_thread)
md_unregister_thread(conf->resync_thread);
+ raid1_shrink_r1bh(conf);
+ raid1_shrink_bh(conf, conf->freebh_cnt);
+ raid1_shrink_buffers(conf);
kfree(conf);
mddev->private = NULL;
MOD_DEC_USE_COUNT;
@@ -1566,18 +1898,16 @@
static mdk_personality_t raid1_personality=
{
- "raid1",
- raid1_make_request,
- raid1_end_request,
- raid1_run,
- raid1_stop,
- raid1_status,
- 0,
- raid1_error,
- raid1_diskop,
- raid1_stop_resync,
- raid1_restart_resync,
- raid1_sync_request
+ name: "raid1",
+ make_request: raid1_make_request,
+ run: raid1_run,
+ stop: raid1_stop,
+ status: raid1_status,
+ error_handler: raid1_error,
+ diskop: raid1_diskop,
+ stop_resync: raid1_stop_resync,
+ restart_resync: raid1_restart_resync,
+ sync_request: raid1_sync_request
};
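
The personality table now uses the old GNU "field:" tagged-initializer syntax (the pre-C99 spelling of designated initializers, ".field ="). One practical consequence is that any member not named, such as end_request above, is implicitly zero/NULL instead of needing an explicit 0 placeholder as in the positional form. A small hypothetical example, not driver code:

    #include <stdio.h>

    struct ops {
        const char *name;
        int  (*make_request)(int);
        void (*end_request)(int);    /* deliberately not set below */
    };

    static int do_request(int x) { return x; }

    /* GNU "field:" syntax, as in the patch; C99 would spell it .name = */
    static struct ops example = {
        name:         "example",
        make_request: do_request,
    };

    int main(void)
    {
        printf("end_request is %s\n", example.end_request ? "set" : "NULL");
        return 0;
    }
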
int raid1_init (void)