patch-2.4.0-test6 linux/fs/ext2/inode.c

Next file: linux/fs/ext2/ioctl.c
Previous file: linux/fs/ext2/ialloc.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.4.0-test5/linux/fs/ext2/inode.c linux/fs/ext2/inode.c
@@ -18,6 +18,8 @@
  *        David S. Miller (davem@caip.rutgers.edu), 1995
  *  64-bit file support on 64-bit platforms by Jakub Jelinek
  * 	(jj@sunsite.ms.mff.cuni.cz)
+ *
+ *  Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
  */
 
 #include <linux/fs.h>
@@ -26,8 +28,6 @@
 #include <linux/sched.h>
 #include <linux/highuid.h>
 
-
-
 static int ext2_update_inode(struct inode * inode, int do_sync);
 
 /*
@@ -35,9 +35,7 @@
  */
 void ext2_put_inode (struct inode * inode)
 {
-	lock_kernel();
 	ext2_discard_prealloc (inode);
-	unlock_kernel();
 }
 
 /*
@@ -66,36 +64,20 @@
 	clear_inode(inode);	/* We must guarantee clearing of inode... */
 }
 
-#define inode_bmap(inode, nr) (le32_to_cpu((inode)->u.ext2_i.i_data[(nr)]))
-
-static inline int block_bmap (struct buffer_head * bh, int nr)
-{
-	int tmp;
-
-	if (!bh)
-		return 0;
-	tmp = le32_to_cpu(((u32 *) bh->b_data)[nr]);
-	brelse (bh);
-	return tmp;
-}
-
-/* 
- * ext2_discard_prealloc and ext2_alloc_block are atomic wrt. the
- * superblock in the same manner as are ext2_free_blocks and
- * ext2_new_block.  We just wait on the super rather than locking it
- * here, since ext2_new_block will do the necessary locking and we
- * can't block until then.
- */
 void ext2_discard_prealloc (struct inode * inode)
 {
 #ifdef EXT2_PREALLOCATE
-	unsigned short total;
-
+	lock_kernel();
+	/* Writer: ->i_prealloc* */
 	if (inode->u.ext2_i.i_prealloc_count) {
-		total = inode->u.ext2_i.i_prealloc_count;
+		unsigned short total = inode->u.ext2_i.i_prealloc_count;
+		unsigned long block = inode->u.ext2_i.i_prealloc_block;
 		inode->u.ext2_i.i_prealloc_count = 0;
-		ext2_free_blocks (inode, inode->u.ext2_i.i_prealloc_block, total);
+		inode->u.ext2_i.i_prealloc_block = 0;
+		/* Writer: end */
+		ext2_free_blocks (inode, block, total);
 	}
+	unlock_kernel();
 #endif
 }
 
@@ -106,22 +88,26 @@
 #endif
 	unsigned long result;
 
-	wait_on_super (inode->i_sb);
 
 #ifdef EXT2_PREALLOCATE
+	/* Writer: ->i_prealloc* */
 	if (inode->u.ext2_i.i_prealloc_count &&
 	    (goal == inode->u.ext2_i.i_prealloc_block ||
 	     goal + 1 == inode->u.ext2_i.i_prealloc_block))
 	{		
 		result = inode->u.ext2_i.i_prealloc_block++;
 		inode->u.ext2_i.i_prealloc_count--;
+		/* Writer: end */
+#ifdef EXT2FS_DEBUG
 		ext2_debug ("preallocation hit (%lu/%lu).\n",
 			    ++alloc_hits, ++alloc_attempts);
-
+#endif
 	} else {
 		ext2_discard_prealloc (inode);
+#ifdef EXT2FS_DEBUG
 		ext2_debug ("preallocation miss (%lu/%lu).\n",
 			    alloc_hits, ++alloc_attempts);
+#endif
 		if (S_ISREG(inode->i_mode))
 			result = ext2_new_block (inode, goal, 
 				 &inode->u.ext2_i.i_prealloc_count,
@@ -135,387 +121,486 @@
 	return result;
 }
 
-static inline long ext2_block_map (struct inode * inode, long block)
+typedef struct {
+	u32	*p;
+	u32	key;
+	struct buffer_head *bh;
+} Indirect;
+
+static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
+{
+	p->key = *(p->p = v);
+	p->bh = bh;
+}
+
+static inline int verify_chain(Indirect *from, Indirect *to)
+{
+	while (from <= to && from->key == *from->p)
+		from++;
+	return (from > to);
+}
+
+/**
+ *	ext2_block_to_path - parse the block number into array of offsets
+ *	@inode: inode in question (we are only interested in its superblock)
+ *	@i_block: block number to be parsed
+ *	@offsets: array to store the offsets in
+ *
+ *	To store the locations of file's data ext2 uses a data structure common
+ *	for UNIX filesystems - tree of pointers anchored in the inode, with
+ *	data blocks at leaves and indirect blocks in intermediate nodes.
+ *	This function translates the block number into path in that tree -
+ *	return value is the path length and @offsets[n] is the offset of
+ *	pointer to (n+1)th node in the nth one. If @block is out of range
+ *	(negative or too large) warning is printed and zero returned.
+ *
+ *	Note: function doesn't find node addresses, so no IO is needed. All
+ *	we need to know is the capacity of indirect blocks (taken from the
+ *	inode->i_sb).
+ */
+
+/*
+ * Portability note: the last comparison (check that we fit into triple
+ * indirect block) is spelled differently, because otherwise on an
+ * architecture with 32-bit longs and 8Kb pages we might get into trouble
+ * if our filesystem had 8Kb blocks. We might use long long, but that would
+ * kill us on x86. Oh, well, at least the sign propagation does not matter -
+ * i_block would have to be negative in the very beginning, so we would not
+ * get there at all.
+ */
+
+static int ext2_block_to_path(struct inode *inode, long i_block, int offsets[4])
 {
-	int i, ret;
 	int ptrs = EXT2_ADDR_PER_BLOCK(inode->i_sb);
 	int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS(inode->i_sb);
+	const long direct_blocks = EXT2_NDIR_BLOCKS,
+		indirect_blocks = ptrs,
+		double_blocks = (1 << (ptrs_bits * 2));
+	int n = 0;
 
-	ret = 0;
-	lock_kernel();
-	if (block < 0) {
-		ext2_warning (inode->i_sb, "ext2_block_map", "block < 0");
-		goto out;
-	}
-	if (block >= EXT2_NDIR_BLOCKS + ptrs +
-		(1 << (ptrs_bits * 2)) +
-		((1 << (ptrs_bits * 2)) << ptrs_bits)) {
-		ext2_warning (inode->i_sb, "ext2_block_map", "block > big");
-		goto out;
-	}
-	if (block < EXT2_NDIR_BLOCKS) {
-		ret = inode_bmap (inode, block);
-		goto out;
-	}
-	block -= EXT2_NDIR_BLOCKS;
-	if (block < ptrs) {
-		i = inode_bmap (inode, EXT2_IND_BLOCK);
-		if (!i)
-			goto out;
-		ret = block_bmap (bread (inode->i_dev, i,
-					  inode->i_sb->s_blocksize), block);
-		goto out;
+	if (i_block < 0) {
+		ext2_warning (inode->i_sb, "ext2_block_to_path", "block < 0");
+	} else if (i_block < direct_blocks) {
+		offsets[n++] = i_block;
+	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
+		offsets[n++] = EXT2_IND_BLOCK;
+		offsets[n++] = i_block;
+	} else if ((i_block -= indirect_blocks) < double_blocks) {
+		offsets[n++] = EXT2_DIND_BLOCK;
+		offsets[n++] = i_block >> ptrs_bits;
+		offsets[n++] = i_block & (ptrs - 1);
+	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+		offsets[n++] = EXT2_TIND_BLOCK;
+		offsets[n++] = i_block >> (ptrs_bits * 2);
+		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+		offsets[n++] = i_block & (ptrs - 1);
+	} else {
+		ext2_warning (inode->i_sb, "ext2_block_to_path", "block > big");
 	}
-	block -= ptrs;
-	if (block < (1 << (ptrs_bits * 2))) {
-		i = inode_bmap (inode, EXT2_DIND_BLOCK);
-		if (!i)
-			goto out;
-		i = block_bmap (bread (inode->i_dev, i,
-				       inode->i_sb->s_blocksize),
-				block >> ptrs_bits);
-		if (!i)
-			goto out;
-		ret = block_bmap (bread (inode->i_dev, i,
-					  inode->i_sb->s_blocksize),
-				block & (ptrs - 1));
-		goto out;
+	return n;
+}
+
+/**
+ *	ext2_get_branch - read the chain of indirect blocks leading to data
+ *	@inode: inode in question
+ *	@depth: depth of the chain (1 - direct pointer, etc.)
+ *	@offsets: offsets of pointers in inode/indirect blocks
+ *	@chain: place to store the result
+ *	@err: here we store the error value
+ *
+ *	Function fills the array of triples <key, p, bh> and returns %NULL
+ *	if everything went OK or the pointer to the last filled triple
+ *	(incomplete one) otherwise. Upon the return chain[i].key contains
+ *	the number of (i+1)-th block in the chain (as it is stored in memory,
+ *	i.e. little-endian 32-bit), chain[i].p contains the address of that
+ *	number (it points into struct inode for i==0 and into the bh->b_data
+ *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+ *	block for i>0 and NULL for i==0. In other words, it holds the block
+ *	numbers of the chain, addresses they were taken from (and where we can
+ *	verify that chain did not change) and buffer_heads hosting these
+ *	numbers.
+ *
+ *	Function stops when it stumbles upon zero pointer (absent block)
+ *		(pointer to last triple returned, *@err == 0)
+ *	or when it gets an IO error reading an indirect block
+ *		(ditto, *@err == -EIO)
+ *	or when it notices that chain had been changed while it was reading
+ *		(ditto, *@err == -EAGAIN)
+ *	or when it reads all @depth-1 indirect blocks successfully and finds
+ *	the whole chain, all way to the data (returns %NULL, *err == 0).
+ */
+static inline Indirect *ext2_get_branch(struct inode *inode,
+					int depth,
+					int *offsets,
+					Indirect chain[4],
+					int *err)
+{
+	kdev_t dev = inode->i_dev;
+	int size = inode->i_sb->s_blocksize;
+	Indirect *p = chain;
+	struct buffer_head *bh;
+
+	*err = 0;
+	/* i_data is not going away, no lock needed */
+	add_chain (chain, NULL, inode->u.ext2_i.i_data + *offsets);
+	if (!p->key)
+		goto no_block;
+	/*
+	 * switch below is merely an unrolled loop - body should be
+	 * repeated depth-1 times. Maybe loop would be actually better,
+	 * but that way we get straight execution path in normal cases.
+	 * Easy to change, anyway - all cases in switch are literally
+	 * identical.
+	 */
+	switch (depth) {
+		case 4:
+			bh = bread(dev, le32_to_cpu(p->key), size);
+			if (!bh)
+				goto failure;
+			/* Reader: pointers */
+			if (!verify_chain(chain, p))
+				goto changed;
+			add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
+			/* Reader: end */
+			if (!p->key)
+				goto no_block;
+		case 3:
+			bh = bread(dev, le32_to_cpu(p->key), size);
+			if (!bh)
+				goto failure;
+			/* Reader: pointers */
+			if (!verify_chain(chain, p))
+				goto changed;
+			add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
+			/* Reader: end */
+			if (!p->key)
+				goto no_block;
+		case 2:
+			bh = bread(dev, le32_to_cpu(p->key), size);
+			if (!bh)
+				goto failure;
+			/* Reader: pointers */
+			if (!verify_chain(chain, p))
+				goto changed;
+			add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
+			/* Reader: end */
+			if (!p->key)
+				goto no_block;
 	}
-	block -= (1 << (ptrs_bits * 2));
-	i = inode_bmap (inode, EXT2_TIND_BLOCK);
-	if (!i)
-		goto out;
-	i = block_bmap (bread (inode->i_dev, i, inode->i_sb->s_blocksize),
-			block >> (ptrs_bits * 2));
-	if (!i)
-		goto out;
-	i = block_bmap (bread (inode->i_dev, i, inode->i_sb->s_blocksize),
-			(block >> ptrs_bits) & (ptrs - 1));
-	if (!i)
-		goto out;
-	ret = block_bmap (bread (inode->i_dev, i, inode->i_sb->s_blocksize),
-			   block & (ptrs - 1));
-out:
-	unlock_kernel();
-	return ret;
+	return NULL;
+
+changed:
+	*err = -EAGAIN;
+	goto no_block;
+failure:
+	*err = -EIO;
+no_block:
+	return p;
 }
 
-static struct buffer_head * inode_getblk (struct inode * inode, int nr,
-	int new_block, int * err, int metadata, long *phys, int *new)
+/**
+ *	ext2_find_near - find a place for allocation with sufficient locality
+ *	@inode: owner
+ *	@ind: descriptor of indirect block.
+ *
+ *	This function returns the preferred place for block allocation.
+ *	It is used when heuristic for sequential allocation fails.
+ *	Rules are:
+ *	  + if there is a block to the left of our position - allocate near it.
+ *	  + if pointer will live in indirect block - allocate near that block.
+ *	  + if pointer will live in inode - allocate in the same cylinder group.
+ *	Caller must make sure that @ind is valid and will stay that way.
+ */
+
+static inline unsigned long ext2_find_near(struct inode *inode, Indirect *ind)
 {
-	u32 * p;
-	int tmp, goal = 0;
-	struct buffer_head * result;
-	int blocksize = inode->i_sb->s_blocksize;
+	u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext2_i.i_data;
+	u32 *p;
 
-	p = inode->u.ext2_i.i_data + nr;
-repeat:
-	tmp = le32_to_cpu(*p);
-	if (tmp) {
-		if (metadata) {
-			result = getblk (inode->i_dev, tmp, blocksize);
-			if (tmp == le32_to_cpu(*p))
-				return result;
-			brelse (result);
-			goto repeat;
-		} else {
-			*phys = tmp;
-			return NULL;
-		}
-	}
+	/* Try to find previous block */
+	for (p = ind->p - 1; p >= start; p--)
+		if (*p)
+			return le32_to_cpu(*p);
+
+	/* No such thing, so let's try location of indirect block */
+	if (ind->bh)
+		return ind->bh->b_blocknr;
 
-	if (inode->u.ext2_i.i_next_alloc_block == new_block)
-		goal = inode->u.ext2_i.i_next_alloc_goal;
+	/*
+	 * Is it going to be referred to from the inode itself? OK, just put it
+	 * the same cylinder group then.
+	 */
+	return (inode->u.ext2_i.i_block_group * 
+		EXT2_BLOCKS_PER_GROUP(inode->i_sb)) +
+	       le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_first_data_block);
+}
 
-	ext2_debug ("hint = %d,", goal);
+/**
+ *	ext2_find_goal - find a preferred place for allocation.
+ *	@inode: owner
+ *	@block:  block we want
+ *	@chain:  chain of indirect blocks
+ *	@partial: pointer to the last triple within a chain
+ *	@goal:	place to store the result.
+ *
+ *	Normally this function finds the preferred place for block allocation,
+ *	stores it in *@goal and returns zero. If the branch had been changed
+ *	under us we return -EAGAIN.
+ */
 
-	if (!goal) {
-		for (tmp = nr - 1; tmp >= 0; tmp--) {
-			if (inode->u.ext2_i.i_data[tmp]) {
-				goal = le32_to_cpu(inode->u.ext2_i.i_data[tmp]);
-				break;
-			}
-		}
-		if (!goal)
-			goal = (inode->u.ext2_i.i_block_group * 
-				EXT2_BLOCKS_PER_GROUP(inode->i_sb)) +
-			       le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_first_data_block);
-	}
-
-	ext2_debug ("goal = %d.\n", goal);
-
-	tmp = ext2_alloc_block (inode, goal, err);
-	if (!tmp)
-		return NULL;
-
-	if (metadata) {
-		result = getblk (inode->i_dev, tmp, blocksize);
-		if (!buffer_uptodate(result))
-			wait_on_buffer(result);
-		memset(result->b_data, 0, blocksize);
-		mark_buffer_uptodate(result, 1);
-		mark_buffer_dirty(result, 1);
-		if (*p) {
-			ext2_free_blocks (inode, tmp, 1);
-			bforget (result);
-			goto repeat;
-		}
-	} else {
-		if (*p) {
-			/*
-			 * Nobody is allowed to change block allocation
-			 * state from under us:
-			 */
-			ext2_error (inode->i_sb, "block_getblk",
-				    "data block filled under us");
-			BUG();
-			ext2_free_blocks (inode, tmp, 1);
-			goto repeat;
-		}
-		*phys = tmp;
-		result = NULL;
-		*err = 0;
-		*new = 1;
+static inline int ext2_find_goal(struct inode *inode,
+				 long block,
+				 Indirect chain[4],
+				 Indirect *partial,
+				 unsigned long *goal)
+{
+	/* Writer: ->i_next_alloc* */
+	if (block == inode->u.ext2_i.i_next_alloc_block + 1) {
+		inode->u.ext2_i.i_next_alloc_block++;
+		inode->u.ext2_i.i_next_alloc_goal++;
+	} 
+	/* Writer: end */
+	/* Reader: pointers, ->i_next_alloc* */
+	if (verify_chain(chain, partial)) {
+		/*
+		 * try the heuristic for sequential allocation,
+		 * failing that at least try to get decent locality.
+		 */
+		if (block == inode->u.ext2_i.i_next_alloc_block)
+			*goal = inode->u.ext2_i.i_next_alloc_goal;
+		if (!*goal)
+			*goal = ext2_find_near(inode, partial);
+		return 0;
 	}
-	*p = cpu_to_le32(tmp);
-
-	inode->u.ext2_i.i_next_alloc_block = new_block;
-	inode->u.ext2_i.i_next_alloc_goal = tmp;
-	inode->i_ctime = CURRENT_TIME;
-	inode->i_blocks += blocksize/512;
-	if (IS_SYNC(inode) || inode->u.ext2_i.i_osync)
-		ext2_sync_inode (inode);
-	else
-		mark_inode_dirty(inode);
-	return result;
+	/* Reader: end */
+	return -EAGAIN;
 }
 
-/*
- *   metadata / data
- *   possibly create / access
- *   can fail due to: - not present
- *                    - out of space
+/**
+ *	ext2_alloc_branch - allocate and set up a chain of blocks.
+ *	@inode: owner
+ *	@num: depth of the chain (number of blocks to allocate)
+ *	@offsets: offsets (in the blocks) to store the pointers to next.
+ *	@branch: place to store the chain in.
  *
- *   NULL return in the data case is mandatory.
+ *	This function allocates @num blocks, zeroes out all but the last one,
+ *	links them into chain and (if we are synchronous) writes them to disk.
+ *	In other words, it prepares a branch that can be spliced onto the
+ *	inode. It stores the information about that chain in the branch[], in
+ *	the same format as ext2_get_branch() would do. We are calling it after
+ *	we had read the existing part of chain and partial points to the last
+ *	triple of that (one with zero ->key). Upon the exit we have the same
+ *	picture as after the successful ext2_get_block(), except that in one
+ *	place chain is disconnected - *branch->p is still zero (we did not
+ *	set the last link), but branch->key contains the number that should
+ *	be placed into *branch->p to fill that gap.
+ *
+ *	If allocation fails we free all blocks we've allocated (and forget
+ *	their buffer_heads) and return the error value from the failed
+ *	ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ *	as described above and return 0.
  */
-static struct buffer_head * block_getblk (struct inode * inode,
-	  struct buffer_head * bh, int nr,
-	  int new_block, int * err, int metadata, long *phys, int *new)
-{
-	int tmp, goal = 0;
-	u32 * p;
-	struct buffer_head * result;
+
+static int ext2_alloc_branch(struct inode *inode,
+			     int num,
+			     unsigned long goal,
+			     int *offsets,
+			     Indirect *branch)
+{
 	int blocksize = inode->i_sb->s_blocksize;
+	int n = 0;
+	int err;
+	int i;
+	int parent = ext2_alloc_block(inode, goal, &err);
 
-	result = NULL;	
-	if (!bh)
-		goto out;
-	if (!buffer_uptodate(bh)) {
-		ll_rw_block (READ, 1, &bh);
-		wait_on_buffer (bh);
+	branch[0].key = cpu_to_le32(parent);
+	if (parent) for (n = 1; n < num; n++) {
+		struct buffer_head *bh;
+		/* Allocate the next block */
+		int nr = ext2_alloc_block(inode, parent, &err);
+		if (!nr)
+			break;
+		branch[n].key = cpu_to_le32(nr);
+		/*
+		 * Get buffer_head for parent block, zero it out and set 
+		 * the pointer to new one, then send parent to disk.
+		 */
+		bh = getblk(inode->i_dev, parent, blocksize);
 		if (!buffer_uptodate(bh))
-			goto out;
-	}
-	p = (u32 *) bh->b_data + nr;
-repeat:
-	tmp = le32_to_cpu(*p);
-	if (tmp) {
-		if (metadata) {
-			result = getblk (bh->b_dev, tmp, blocksize);
-			if (tmp == le32_to_cpu(*p))
-				goto out;
-			brelse (result);
-			goto repeat;
-		} else {
-			*phys = tmp;
-			/* result == NULL */
-			goto out;
+			wait_on_buffer(bh);
+		memset(bh->b_data, 0, blocksize);
+		branch[n].bh = bh;
+		branch[n].p = (u32*) bh->b_data + offsets[n];
+		*branch[n].p = branch[n].key;
+		mark_buffer_uptodate(bh, 1);
+		mark_buffer_dirty(bh, 1);
+		if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
+			ll_rw_block (WRITE, 1, &bh);
+			wait_on_buffer (bh);
 		}
+		parent = nr;
 	}
+	if (n == num)
+		return 0;
 
-	if (inode->u.ext2_i.i_next_alloc_block == new_block)
-		goal = inode->u.ext2_i.i_next_alloc_goal;
-	if (!goal) {
-		for (tmp = nr - 1; tmp >= 0; tmp--) {
-			if (le32_to_cpu(((u32 *) bh->b_data)[tmp])) {
-				goal = le32_to_cpu(((u32 *)bh->b_data)[tmp]);
-				break;
-			}
-		}
-		if (!goal)
-			goal = bh->b_blocknr;
-	}
-	tmp = ext2_alloc_block (inode, goal, err);
-	if (!tmp)
-		goto out;
-	if (metadata) {
-		result = getblk (bh->b_dev, tmp, blocksize);
-		if (!buffer_uptodate(result))
-			wait_on_buffer(result);
-		memset(result->b_data, 0, inode->i_sb->s_blocksize);
-		mark_buffer_uptodate(result, 1);
-		mark_buffer_dirty(result, 1);
-		if (*p) {
-			ext2_free_blocks (inode, tmp, 1);
-			bforget (result);
-			goto repeat;
-		}
-	} else {
-		if (*p) {
-			/*
-			 * Nobody is allowed to change block allocation
-			 * state from under us:
-			 */
-			ext2_error (inode->i_sb, "block_getblk",
-				    "data block filled under us");
-			BUG();
-			ext2_free_blocks (inode, tmp, 1);
-			goto repeat;
-		}
-		*phys = tmp;
-		*new = 1;
-	}
-	*p = le32_to_cpu(tmp);
-	mark_buffer_dirty(bh, 1);
-	if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
-		ll_rw_block (WRITE, 1, &bh);
-		wait_on_buffer (bh);
-	}
-	inode->i_ctime = CURRENT_TIME;
-	inode->i_blocks += blocksize/512;
-	mark_inode_dirty(inode);
-	inode->u.ext2_i.i_next_alloc_block = new_block;
-	inode->u.ext2_i.i_next_alloc_goal = tmp;
-	*err = 0;
-out:
-	brelse (bh);
-	return result;
+	/* Allocation failed, free what we already allocated */
+	for (i = 1; i < n; i++)
+		bforget(branch[i].bh);
+	for (i = 0; i < n; i++)
+		ext2_free_blocks(inode, le32_to_cpu(branch[i].key), 1);
+	return err;
 }
 
-static int ext2_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
+/**
+ *	ext2_splice_branch - splice the allocated branch onto inode.
+ *	@inode: owner
+ *	@block: (logical) number of block we are adding
+ *	@chain: chain of indirect blocks (with a missing link - see
+ *		ext2_alloc_branch)
+ *	@where: location of missing link
+ *	@num:   number of blocks we are adding
+ *
+ *	This function verifies that chain (up to the missing link) had not
+ *	changed, fills the missing link and does all housekeeping needed in
+ *	inode (->i_blocks, etc.). In case of success we end up with the full
+ *	chain to new block and return 0. Otherwise (== chain had been changed)
+ *	we free the new blocks (forgetting their buffer_heads, indeed) and
+ *	return -EAGAIN.
+ */
+
+static inline int ext2_splice_branch(struct inode *inode,
+				     long block,
+				     Indirect chain[4],
+				     Indirect *where,
+				     int num)
 {
-	int ret, err, new;
-	struct buffer_head *bh;
-	unsigned long ptr, phys;
-	/*
-	 * block pointers per block
-	 */
-	unsigned long ptrs = EXT2_ADDR_PER_BLOCK(inode->i_sb);
-	int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS(inode->i_sb);
-	const int direct_blocks = EXT2_NDIR_BLOCKS,
-		indirect_blocks = ptrs,
-		double_blocks = (1 << (ptrs_bits * 2)),
-		triple_blocks = (1 << (ptrs_bits * 3));
+	int i;
 
-	if (!create) {
-		/*
-		 * Will clean this up further, ext2_block_map() should use the
-		 * bh instead of an integer block-number interface.
-		 */
-		phys = ext2_block_map(inode, iblock);
-		if (phys) {
-			bh_result->b_dev = inode->i_dev;
-			bh_result->b_blocknr = phys;
-			bh_result->b_state |= (1UL << BH_Mapped);
+	/* Verify that place we are splicing to is still there and vacant */
+
+	/* Writer: pointers, ->i_next_alloc*, ->i_blocks */
+	if (!verify_chain(chain, where-1) || *where->p)
+		/* Writer: end */
+		goto changed;
+
+	/* That's it */
+
+	*where->p = where->key;
+	inode->u.ext2_i.i_next_alloc_block = block;
+	inode->u.ext2_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
+	inode->i_blocks += num * inode->i_sb->s_blocksize/512;
+
+	/* Writer: end */
+
+	/* We are done with atomic stuff, now do the rest of housekeeping */
+
+	inode->i_ctime = CURRENT_TIME;
+
+	/* had we spliced it onto indirect block? */
+	if (where->bh) {
+		mark_buffer_dirty(where->bh, 1);
+		if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
+			ll_rw_block (WRITE, 1, &where->bh);
+			wait_on_buffer(where->bh);
 		}
-		return 0;
 	}
 
-	err = -EIO;
-	new = 0;
-	ret = 0;
-	bh = NULL;
+	if (IS_SYNC(inode) || inode->u.ext2_i.i_osync)
+		ext2_sync_inode (inode);
+	else
+		mark_inode_dirty(inode);
+	return 0;
 
-	lock_kernel();
+changed:
+	for (i = 1; i < num; i++)
+		bforget(where[i].bh);
+	for (i = 0; i < num; i++)
+		ext2_free_blocks(inode, le32_to_cpu(where[i].key), 1);
+	return -EAGAIN;
+}
 
-	if (iblock < 0)
-		goto abort_negative;
-	if (iblock > direct_blocks + indirect_blocks +
-					 double_blocks + triple_blocks)
-		goto abort_too_big;
+/*
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ */
 
-	/*
-	 * If this is a sequential block allocation, set the next_alloc_block
-	 * to this block now so that all the indblock and data block
-	 * allocations use the same goal zone
-	 */
+static int ext2_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
+{
+	int err = -EIO;
+	int offsets[4];
+	Indirect chain[4];
+	Indirect *partial;
+	unsigned long goal;
+	int left;
+	int depth = ext2_block_to_path(inode, iblock, offsets);
 
-	ext2_debug ("block %lu, next %lu, goal %lu.\n", iblock, 
-		    inode->u.ext2_i.i_next_alloc_block,
-		    inode->u.ext2_i.i_next_alloc_goal);
+	if (depth == 0)
+		goto out;
 
-	if (iblock == inode->u.ext2_i.i_next_alloc_block + 1) {
-		inode->u.ext2_i.i_next_alloc_block++;
-		inode->u.ext2_i.i_next_alloc_goal++;
-	}
+	lock_kernel();
+reread:
+	partial = ext2_get_branch(inode, depth, offsets, chain, &err);
 
-	err = 0;
-	ptr = iblock;
+	/* Simplest case - block found, no allocation needed */
+	if (!partial) {
+got_it:
+		bh_result->b_dev = inode->i_dev;
+		bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
+		bh_result->b_state |= (1UL << BH_Mapped);
+		/* Clean up and exit */
+		partial = chain+depth-1; /* the whole chain */
+		goto cleanup;
+	}
+
+	/* Next simple case - plain lookup or failed read of indirect block */
+	if (!create || err == -EIO) {
+cleanup:
+		while (partial > chain) {
+			brelse(partial->bh);
+			partial--;
+		}
+		unlock_kernel();
+out:
+		return err;
+	}
 
 	/*
-	 * ok, these macros clean the logic up a bit and make
-	 * it much more readable:
+	 * Indirect block might be removed by truncate while we were
+	 * reading it. Handling of that case (forget what we've got and
+	 * reread) is taken out of the main path.
 	 */
-#define GET_INODE_DATABLOCK(x) \
-		inode_getblk(inode, x, iblock, &err, 0, &phys, &new)
-#define GET_INODE_PTR(x) \
-		inode_getblk(inode, x, iblock, &err, 1, NULL, NULL)
-#define GET_INDIRECT_DATABLOCK(x) \
-		block_getblk (inode, bh, x, iblock, &err, 0, &phys, &new);
-#define GET_INDIRECT_PTR(x) \
-		block_getblk (inode, bh, x, iblock, &err, 1, NULL, NULL);
+	if (err == -EAGAIN)
+		goto changed;
 
-	if (ptr < direct_blocks) {
-		bh = GET_INODE_DATABLOCK(ptr);
-		goto out;
-	}
-	ptr -= direct_blocks;
-	if (ptr < indirect_blocks) {
-		bh = GET_INODE_PTR(EXT2_IND_BLOCK);
-		goto get_indirect;
-	}
-	ptr -= indirect_blocks;
-	if (ptr < double_blocks) {
-		bh = GET_INODE_PTR(EXT2_DIND_BLOCK);
-		goto get_double;
-	}
-	ptr -= double_blocks;
-	bh = GET_INODE_PTR(EXT2_TIND_BLOCK);
-	bh = GET_INDIRECT_PTR(ptr >> (ptrs_bits * 2));
-get_double:
-	bh = GET_INDIRECT_PTR((ptr >> ptrs_bits) & (ptrs - 1));
-get_indirect:
-	bh = GET_INDIRECT_DATABLOCK(ptr & (ptrs - 1));
-
-#undef GET_INODE_DATABLOCK
-#undef GET_INODE_PTR
-#undef GET_INDIRECT_DATABLOCK
-#undef GET_INDIRECT_PTR
+	if (ext2_find_goal(inode, iblock, chain, partial, &goal) < 0)
+		goto changed;
 
-out:
-	if (bh)
-		BUG();	// temporary debugging check
+	left = (chain + depth) - partial;
+	err = ext2_alloc_branch(inode, left, goal,
+					offsets+(partial-chain), partial);
 	if (err)
-		goto abort;
-	if (!phys)
-		BUG();	// must not happen either
-
-	bh_result->b_dev = inode->i_dev;
-	bh_result->b_blocknr = phys;
-	bh_result->b_state |= (1UL << BH_Mapped); /* safe */
-	if (new)
-		bh_result->b_state |= (1UL << BH_New);
-abort:
-	unlock_kernel();
-	return err;
+		goto cleanup;
+
+	if (ext2_splice_branch(inode, iblock, chain, partial, left) < 0)
+		goto changed;
 
-abort_negative:
-	ext2_warning (inode->i_sb, "ext2_get_block", "block < 0");
-	goto abort;
-
-abort_too_big:
-	ext2_warning (inode->i_sb, "ext2_get_block", "block > big");
-	goto abort;
+	bh_result->b_state |= (1UL << BH_New);
+	goto got_it;
+
+changed:
+	while (partial > chain) {
+		bforget(partial->bh);
+		partial--;
+	}
+	goto reread;
 }
 
 struct buffer_head * ext2_getblk(struct inode * inode, long block, int create, int * err)
@@ -749,7 +834,7 @@
 	inode->i_attr_flags = 0;
 	if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) {
 		inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS;
-		inode->i_flags |= MS_SYNCHRONOUS;
+		inode->i_flags |= S_SYNC;
 	}
 	if (inode->u.ext2_i.i_flags & EXT2_APPEND_FL) {
 		inode->i_attr_flags |= ATTR_FLAG_APPEND;
@@ -761,7 +846,7 @@
 	}
 	if (inode->u.ext2_i.i_flags & EXT2_NOATIME_FL) {
 		inode->i_attr_flags |= ATTR_FLAG_NOATIME;
-		inode->i_flags |= MS_NOATIME;
+		inode->i_flags |= S_NOATIME;
 	}
 	return;
 	
@@ -915,17 +1000,17 @@
 	
 	flags = iattr->ia_attr_flags;
 	if (flags & ATTR_FLAG_SYNCRONOUS) {
-		inode->i_flags |= MS_SYNCHRONOUS;
+		inode->i_flags |= S_SYNC;
 		inode->u.ext2_i.i_flags |= EXT2_SYNC_FL;
 	} else {
-		inode->i_flags &= ~MS_SYNCHRONOUS;
+		inode->i_flags &= ~S_SYNC;
 		inode->u.ext2_i.i_flags &= ~EXT2_SYNC_FL;
 	}
 	if (flags & ATTR_FLAG_NOATIME) {
-		inode->i_flags |= MS_NOATIME;
+		inode->i_flags |= S_NOATIME;
 		inode->u.ext2_i.i_flags |= EXT2_NOATIME_FL;
 	} else {
-		inode->i_flags &= ~MS_NOATIME;
+		inode->i_flags &= ~S_NOATIME;
 		inode->u.ext2_i.i_flags &= ~EXT2_NOATIME_FL;
 	}
 	if (flags & ATTR_FLAG_APPEND) {

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)