patch-2.4.0-prerelease linux/include/linux/raid/raid5.h
Next file: linux/include/linux/sched.h
Previous file: linux/include/linux/raid/md_k.h
Back to the patch index
Back to the overall index
- Lines: 239
- Date:
Fri Dec 29 14:07:24 2000
- Orig file:
v2.4.0-test12/linux/include/linux/raid/raid5.h
- Orig date:
Mon Aug 7 23:02:17 2000
diff -u --recursive --new-file v2.4.0-test12/linux/include/linux/raid/raid5.h linux/include/linux/raid/raid5.h
@@ -4,72 +4,167 @@
#include <linux/raid/md.h>
#include <linux/raid/xor.h>
-struct disk_info {
- kdev_t dev;
- int operational;
- int number;
- int raid_disk;
- int write_only;
- int spare;
- int used_slot;
-};
-
+/*
+ *
+ * Each stripe contains one buffer per disc. Each buffer can be in
+ * one of a number of states determined by bh_state. Changes between
+ * these states happen *almost* exclusively under a per-stripe
+ * spinlock. Some very specific changes can happen in b_end_io, and
+ * these are not protected by the spin lock.
+ *
+ * The bh_state bits that are used to represent these states are:
+ * BH_Uptodate, BH_Lock
+ *
+ * State Empty == !Uptodate, !Lock
+ * We have no data, and there is no active request
+ * State Want == !Uptodate, Lock
+ * A read request is being submitted for this block
+ * State Dirty == Uptodate, Lock
+ * Some new data is in this buffer, and it is being written out
+ * State Clean == Uptodate, !Lock
+ * We have valid data which is the same as on disc
+ *
+ * The possible state transitions are:
+ *
+ * Empty -> Want - on read or write to get old data for parity calc
+ * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
+ * Empty -> Clean - on compute_block when computing a block for failed drive
+ * Want -> Empty - on failed read
+ * Want -> Clean - on successful completion of read request
+ * Dirty -> Clean - on successful completion of write request
+ * Dirty -> Clean - on failed write
+ * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
+ *
+ * The Want->Empty, Want->Clean, and Dirty->Clean transitions
+ * all happen in b_end_io at interrupt time.
+ * Each sets the Uptodate bit before releasing the Lock bit.
+ * This leaves one multi-stage transition:
+ * Want->Dirty->Clean
+ * This is safe because thinking that a Clean buffer is actually dirty
+ * will at worst delay some action, and the stripe will be scheduled
+ * for attention after the transition is complete.
+ *
+ * There is one possibility that is not covered by these states. That
+ * is if one drive has failed and there is a spare being rebuilt. We
+ * can't distinguish between a clean block that has been generated
+ * from parity calculations, and a clean block that has been
+ * successfully written to the spare ( or to parity when resyncing).
+ * To distinguish these states we have a stripe bit STRIPE_INSYNC that
+ * is set whenever a write is scheduled to the spare, or to the parity
+ * disc if there is no spare. A sync request clears this bit, and
+ * when we find it set with no buffers locked, we know the sync is
+ * complete.
+ *
+ * Buffers for the md device that arrive via make_request are attached
+ * to the appropriate stripe in one of two lists linked on b_reqnext.
+ * One list for read requests, one for write. There should never be
+ * more than one buffer on the two lists together, but we are not
+ * guaranteed of that so we allow for more.
+ *
+ * If a buffer is on the read list when the associated cache buffer is
+ * Uptodate, the data is copied into the read buffer and its b_end_io
+ * routine is called. This may happen in the end_request routine only
+ * if the buffer has just successfully been read. end_request should
+ * remove the buffers from the list and then set the Uptodate bit on
+ * the buffer. Other threads may do this only if they first check
+ * that the Uptodate bit is set. Once they have checked that they may
+ * take buffers off the read queue.
+ *
+ * When a buffer on the write_list is committed for write, it is
+ * marked clean, copied into the cache buffer, which is then marked
+ * dirty, and moved onto a third list, the written list. Once both
+ * the parity block and the cached buffer are successfully written,
+ * any buffer on a written list can be returned with b_end_io.
+ *
+ * The write_list and read_list lists act as fifos. They are protected by the
+ * device_lock which can be claimed when a stripe_lock is held.
+ * The device_lock is only for list manipulations and will only be held for a very
+ * short time. It can be claimed from interrupts.
+ *
+ *
+ * Stripes in the stripe cache can be on one of two lists (or on
+ * neither). The "inactive_list" contains stripes which are not
+ * currently being used for any request. They can freely be reused
+ * for another stripe. The "handle_list" contains stripes that need
+ * to be handled in some way. Both of these are fifo queues. Each
+ * stripe is also (potentially) linked to a hash bucket in the hash
+ * table so that it can be found by sector number. Stripes that are
+ * not hashed must be on the inactive_list, and will normally be at
+ * the front. All stripes start life this way.
+ *
+ * The inactive_list, handle_list and hash bucket lists are all protected by the
+ * device_lock.
+ * - stripes on the inactive_list never have their stripe_lock held.
+ * - stripes have a reference counter. If count==0, they are on a list.
+ * - If a stripe might need handling, STRIPE_HANDLE is set.
+ * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
+ * handle_list else inactive_list
+ *
+ * This, combined with the fact that STRIPE_HANDLE is only ever
+ * cleared while a stripe has a non-zero count means that if the
+ * refcount is 0 and STRIPE_HANDLE is set, then it is on the
+ * handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then
+ * the stripe is on inactive_list.
+ *
+ * The possible transitions are:
+ * activate an unhashed/inactive stripe (get_active_stripe())
+ * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
+ * activate a hashed, possibly active stripe (get_active_stripe())
+ * lockdev check-hash if(!cnt++)unlink-stripe unlockdev
+ * attach a request to an active stripe (add_stripe_bh())
+ * lockdev attach-buffer unlockdev
+ * handle a stripe (handle_stripe())
+ * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
+ * release an active stripe (release_stripe())
+ * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
+ *
+ * The refcount counts each thread that has activated the stripe,
+ * plus raid5d if it is handling it, plus one for each active request
+ * on a cached buffer.
+ */
struct stripe_head {
- md_spinlock_t stripe_lock;
struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
- struct stripe_head *free_next; /* pool of free sh's */
- struct buffer_head *buffer_pool; /* pool of free buffers */
- struct buffer_head *bh_pool; /* pool of free bh's */
+ struct list_head lru; /* inactive_list or handle_list */
struct raid5_private_data *raid_conf;
- struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
- struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
- struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
- struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
- int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
- int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
+ struct buffer_head *bh_cache[MD_SB_DISKS]; /* buffered copy */
+ struct buffer_head *bh_read[MD_SB_DISKS]; /* read request buffers of the MD device */
+ struct buffer_head *bh_write[MD_SB_DISKS]; /* write request buffers of the MD device */
+ struct buffer_head *bh_written[MD_SB_DISKS]; /* write request buffers of the MD device that have been scheduled for write */
unsigned long sector; /* sector of this row */
int size; /* buffers size */
int pd_idx; /* parity disk index */
- atomic_t nr_pending; /* nr of pending cmds */
unsigned long state; /* state flags */
- int cmd; /* stripe cmd */
- atomic_t count; /* nr of waiters */
- int write_method; /* reconstruct-write / read-modify-write */
- int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
- md_wait_queue_head_t wait; /* processes waiting for this stripe */
-
+ atomic_t count; /* nr of active thread/requests */
+ spinlock_t lock;
int sync_redone;
};
-/*
- * Phase
- */
-#define PHASE_BEGIN 0
-#define PHASE_READ_OLD 1
-#define PHASE_WRITE 2
-#define PHASE_READ 3
-#define PHASE_COMPLETE 4
/*
* Write method
*/
-#define METHOD_NONE 0
#define RECONSTRUCT_WRITE 1
#define READ_MODIFY_WRITE 2
+/* not a write method, but a compute_parity mode */
+#define CHECK_PARITY 3
/*
* Stripe state
*/
-#define STRIPE_LOCKED 0
#define STRIPE_ERROR 1
+#define STRIPE_HANDLE 2
+#define STRIPE_SYNCING 3
+#define STRIPE_INSYNC 4
-/*
- * Stripe commands
- */
-#define STRIPE_NONE 0
-#define STRIPE_WRITE 1
-#define STRIPE_READ 2
-#define STRIPE_SYNC 3
+struct disk_info {
+ kdev_t dev;
+ int operational;
+ int number;
+ int raid_disk;
+ int write_only;
+ int spare;
+ int used_slot;
+};
struct raid5_private_data {
struct stripe_head **stripe_hashtbl;
@@ -80,23 +175,15 @@
int buffer_size;
int chunk_size, level, algorithm;
int raid_disks, working_disks, failed_disks;
- unsigned long next_sector;
- atomic_t nr_handle;
- struct stripe_head *next_free_stripe;
- atomic_t nr_stripes;
int resync_parity;
int max_nr_stripes;
- int clock;
- atomic_t nr_hashed_stripes;
- atomic_t nr_locked_stripes;
- atomic_t nr_pending_stripes;
- atomic_t nr_cached_stripes;
+ struct list_head handle_list; /* stripes needing handling */
/*
* Free stripes pool
*/
- atomic_t nr_free_sh;
- struct stripe_head *free_sh_list;
+ atomic_t active_stripes;
+ struct list_head inactive_list;
md_wait_queue_head_t wait_for_stripe;
md_spinlock_t device_lock;
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)