* @inode: owner
* @ind: descriptor of indirect block.
*
- * This function returns the prefered place for block allocation.
+ * This function returns the preferred place for block allocation.
* It is used when heuristic for sequential allocation fails.
* Rules are:
* + if there is a block to the left of our position - allocate near it.
}
/**
- * ext3_find_goal - find a prefered place for allocation.
+ * ext3_find_goal - find a preferred place for allocation.
* @inode: owner
* @block: block we want
- * @chain: chain of indirect blocks
* @partial: pointer to the last triple within a chain
- * @goal: place to store the result.
*
- * Normally this function find the prefered place for block allocation,
- * stores it in *@goal and returns zero.
+ * Normally this function find the preferred place for block allocation,
+ * returns it.
*/
static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
- Indirect chain[4], Indirect *partial)
+ Indirect *partial)
{
struct ext3_block_alloc_info *block_i;
if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
ext3_init_block_alloc_info(inode);
- goal = ext3_find_goal(inode, iblock, chain, partial);
+ goal = ext3_find_goal(inode, iblock, partial);
/* the number of blocks need to allocate for [d,t]indirect blocks */
indirect_blks = (chain + depth) - partial - 1;
return err;
}
-#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+/*
+ * Number of credits we need for writing DIO_MAX_BLOCKS:
+ * We need sb + group descriptor + bitmap + inode -> 4
+ * For B blocks with A block pointers per block we need:
+ * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
+ * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
+ */
+#define DIO_CREDITS 25
static int ext3_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
handle_t *handle = ext3_journal_current_handle();
- int ret = 0;
+ int ret = 0, started = 0;
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
- if (!create)
- goto get_block; /* A read */
-
- if (max_blocks == 1)
- goto get_block; /* A single block get */
-
- if (handle->h_transaction->t_state == T_LOCKED) {
- /*
- * Huge direct-io writes can hold off commits for long
- * periods of time. Let this commit run.
- */
- ext3_journal_stop(handle);
- handle = ext3_journal_start(inode, DIO_CREDITS);
- if (IS_ERR(handle))
+ if (create && !handle) { /* Direct IO write... */
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ handle = ext3_journal_start(inode, DIO_CREDITS +
+ 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb));
+ if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto get_block;
- }
-
- if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
- /*
- * Getting low on buffer credits...
- */
- ret = ext3_journal_extend(handle, DIO_CREDITS);
- if (ret > 0) {
- /*
- * Couldn't extend the transaction. Start a new one.
- */
- ret = ext3_journal_restart(handle, DIO_CREDITS);
+ goto out;
}
+ started = 1;
}
-get_block:
- if (ret == 0) {
- ret = ext3_get_blocks_handle(handle, inode, iblock,
+ ret = ext3_get_blocks_handle(handle, inode, iblock,
max_blocks, bh_result, create, 0);
- if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
- ret = 0;
- }
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
}
+ if (started)
+ ext3_journal_stop(handle);
+out:
return ret;
}
}
if (buffer_new(&dummy)) {
J_ASSERT(create != 0);
- J_ASSERT(handle != 0);
+ J_ASSERT(handle != NULL);
/*
* Now that we do not always journal data, we should
return ext3_journal_get_write_access(handle, bh);
}
-static int ext3_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int ext3_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = mapping->host;
int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
handle_t *handle;
int retries = 0;
+ struct page *page;
+ pgoff_t index;
+ unsigned from, to;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
retry:
+ page = __grab_cache_page(mapping, index);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
handle = ext3_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
+ unlock_page(page);
+ page_cache_release(page);
ret = PTR_ERR(handle);
goto out;
}
- if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
- ret = nobh_prepare_write(page, from, to, ext3_get_block);
- else
- ret = block_prepare_write(page, from, to, ext3_get_block);
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ext3_get_block);
if (ret)
- goto prepare_write_failed;
+ goto write_begin_failed;
if (ext3_should_journal_data(inode)) {
ret = walk_page_buffers(handle, page_buffers(page),
from, to, NULL, do_journal_get_write_access);
}
-prepare_write_failed:
- if (ret)
+write_begin_failed:
+ if (ret) {
ext3_journal_stop(handle);
+ unlock_page(page);
+ page_cache_release(page);
+ }
if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
out:
return ret;
}
+
int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
int err = journal_dirty_data(handle, bh);
if (err)
ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
- bh, handle,err);
+ bh, handle, err);
return err;
}
-/* For commit_write() in data=journal mode */
-static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+/* For write_end() in data=journal mode */
+static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
}
/*
+ * Generic write_end handler for ordered and writeback ext3 journal modes.
+ * We can't use generic_write_end, because that unlocks the page and we need to
+ * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
+ * after block_write_end.
+ */
+static int ext3_generic_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = file->f_mapping->host;
+
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+ if (pos+copied > inode->i_size) {
+ i_size_write(inode, pos+copied);
+ mark_inode_dirty(inode);
+ }
+
+ return copied;
+}
+
+/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
* ext3 never places buffers on inode->i_mapping->private_list. metadata
* buffers are managed internally.
*/
-static int ext3_ordered_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int ext3_ordered_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext3_journal_current_handle();
- struct inode *inode = page->mapping->host;
+ struct inode *inode = file->f_mapping->host;
+ unsigned from, to;
int ret = 0, ret2;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
+
ret = walk_page_buffers(handle, page_buffers(page),
from, to, NULL, ext3_journal_dirty_data);
if (ret == 0) {
/*
- * generic_commit_write() will run mark_inode_dirty() if i_size
+ * generic_write_end() will run mark_inode_dirty() if i_size
* changes. So let's piggyback the i_disksize mark_inode_dirty
* into that.
*/
loff_t new_i_size;
- new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ new_i_size = pos + copied;
if (new_i_size > EXT3_I(inode)->i_disksize)
EXT3_I(inode)->i_disksize = new_i_size;
- ret = generic_commit_write(file, page, from, to);
+ copied = ext3_generic_write_end(file, mapping, pos, len, copied,
+ page, fsdata);
+ if (copied < 0)
+ ret = copied;
}
ret2 = ext3_journal_stop(handle);
if (!ret)
ret = ret2;
- return ret;
+ unlock_page(page);
+ page_cache_release(page);
+
+ return ret ? ret : copied;
}
-static int ext3_writeback_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int ext3_writeback_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext3_journal_current_handle();
- struct inode *inode = page->mapping->host;
+ struct inode *inode = file->f_mapping->host;
int ret = 0, ret2;
loff_t new_i_size;
- new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ new_i_size = pos + copied;
if (new_i_size > EXT3_I(inode)->i_disksize)
EXT3_I(inode)->i_disksize = new_i_size;
- if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
- ret = nobh_commit_write(file, page, from, to);
- else
- ret = generic_commit_write(file, page, from, to);
+ copied = ext3_generic_write_end(file, mapping, pos, len, copied,
+ page, fsdata);
+ if (copied < 0)
+ ret = copied;
ret2 = ext3_journal_stop(handle);
if (!ret)
ret = ret2;
- return ret;
+ unlock_page(page);
+ page_cache_release(page);
+
+ return ret ? ret : copied;
}
-static int ext3_journalled_commit_write(struct file *file,
- struct page *page, unsigned from, unsigned to)
+static int ext3_journalled_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext3_journal_current_handle();
- struct inode *inode = page->mapping->host;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
int partial = 0;
- loff_t pos;
+ unsigned from, to;
- /*
- * Here we duplicate the generic_commit_write() functionality
- */
- pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
+
+ if (copied < len) {
+ if (!PageUptodate(page))
+ copied = 0;
+ page_zero_new_buffers(page, from+copied, to);
+ }
ret = walk_page_buffers(handle, page_buffers(page), from,
- to, &partial, commit_write_fn);
+ to, &partial, write_end_fn);
if (!partial)
SetPageUptodate(page);
- if (pos > inode->i_size)
- i_size_write(inode, pos);
+ if (pos+copied > inode->i_size)
+ i_size_write(inode, pos+copied);
EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
if (inode->i_size > EXT3_I(inode)->i_disksize) {
EXT3_I(inode)->i_disksize = inode->i_size;
if (!ret)
ret = ret2;
}
+
ret2 = ext3_journal_stop(handle);
if (!ret)
ret = ret2;
- return ret;
+ unlock_page(page);
+ page_cache_release(page);
+
+ return ret ? ret : copied;
}
/*
PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, commit_write_fn);
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
if (ret == 0)
ret = err;
EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
* if the machine crashes during the write.
*
* If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file.
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
*/
static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct ext3_inode_info *ei = EXT3_I(inode);
- handle_t *handle = NULL;
+ handle_t *handle;
ssize_t ret;
int orphan = 0;
size_t count = iov_length(iov, nr_segs);
if (rw == WRITE) {
loff_t final_size = offset + count;
- handle = ext3_journal_start(inode, DIO_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
if (final_size > inode->i_size) {
+ /* Credits for sb + inode write */
+ handle = ext3_journal_start(inode, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
ret = ext3_orphan_add(handle, inode);
- if (ret)
- goto out_stop;
+ if (ret) {
+ ext3_journal_stop(handle);
+ goto out;
+ }
orphan = 1;
ei->i_disksize = inode->i_size;
+ ext3_journal_stop(handle);
}
}
offset, nr_segs,
ext3_get_block, NULL);
- /*
- * Reacquire the handle: ext3_get_block() can restart the transaction
- */
- handle = ext3_journal_current_handle();
-
-out_stop:
- if (handle) {
+ if (orphan) {
int err;
- if (orphan && inode->i_nlink)
+ /* Credits for sb + inode write */
+ handle = ext3_journal_start(inode, 2);
+ if (IS_ERR(handle)) {
+ /* This is really bad luck. We've written the data
+ * but cannot extend i_size. Bail out and pretend
+ * the write failed... */
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ if (inode->i_nlink)
ext3_orphan_del(handle, inode);
- if (orphan && ret > 0) {
+ if (ret > 0) {
loff_t end = offset + ret;
if (end > inode->i_size) {
ei->i_disksize = end;
.readpages = ext3_readpages,
.writepage = ext3_ordered_writepage,
.sync_page = block_sync_page,
- .prepare_write = ext3_prepare_write,
- .commit_write = ext3_ordered_commit_write,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_ordered_write_end,
.bmap = ext3_bmap,
.invalidatepage = ext3_invalidatepage,
.releasepage = ext3_releasepage,
.readpages = ext3_readpages,
.writepage = ext3_writeback_writepage,
.sync_page = block_sync_page,
- .prepare_write = ext3_prepare_write,
- .commit_write = ext3_writeback_commit_write,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_writeback_write_end,
.bmap = ext3_bmap,
.invalidatepage = ext3_invalidatepage,
.releasepage = ext3_releasepage,
.readpages = ext3_readpages,
.writepage = ext3_journalled_writepage,
.sync_page = block_sync_page,
- .prepare_write = ext3_prepare_write,
- .commit_write = ext3_journalled_commit_write,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_journalled_write_end,
.set_page_dirty = ext3_journalled_set_page_dirty,
.bmap = ext3_bmap,
.invalidatepage = ext3_invalidatepage,
*/
if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
ext3_should_writeback_data(inode) && PageUptodate(page)) {
- zero_user_page(page, offset, length, KM_USER0);
+ zero_user(page, offset, length);
set_page_dirty(page);
goto unlock;
}
goto unlock;
}
- zero_user_page(page, offset, length, KM_USER0);
+ zero_user(page, offset, length);
BUFFER_TRACE(bh, "zeroed end of block");
err = 0;
ei->i_flags |= EXT3_DIRSYNC_FL;
}
-void ext3_read_inode(struct inode * inode)
+struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
{
struct ext3_iloc iloc;
struct ext3_inode *raw_inode;
- struct ext3_inode_info *ei = EXT3_I(inode);
+ struct ext3_inode_info *ei;
struct buffer_head *bh;
+ struct inode *inode;
+ long ret;
int block;
+ inode = iget_locked(sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ ei = EXT3_I(inode);
#ifdef CONFIG_EXT3_FS_POSIX_ACL
ei->i_acl = EXT3_ACL_NOT_CACHED;
ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
ei->i_block_alloc_info = NULL;
- if (__ext3_get_inode_loc(inode, &iloc, 0))
+ ret = __ext3_get_inode_loc(inode, &iloc, 0);
+ if (ret < 0)
goto bad_inode;
bh = iloc.bh;
raw_inode = ext3_raw_inode(&iloc);
!(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
/* this inode is deleted */
brelse (bh);
+ ret = -ESTALE;
goto bad_inode;
}
/* The only unlinked inodes we let through here have
if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
EXT3_INODE_SIZE(inode->i_sb)) {
brelse (bh);
+ ret = -EIO;
goto bad_inode;
}
if (ei->i_extra_isize == 0) {
}
brelse (iloc.bh);
ext3_set_inode_flags(inode);
- return;
+ unlock_new_inode(inode);
+ return inode;
bad_inode:
- make_bad_inode(inode);
- return;
+ iget_failed(inode);
+ return ERR_PTR(ret);
}
/*
return 0;
if (ext3_journal_current_handle()) {
- jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
+ jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
dump_stack();
return -EIO;
}
*/
journal = EXT3_JOURNAL(inode);
- if (is_journal_aborted(journal) || IS_RDONLY(inode))
+ if (is_journal_aborted(journal))
return -EROFS;
journal_lock_updates(journal);