From: Chris Mason From: mason@suse.com From: jeffm@suse.com The current reiserfs allocator pretty much allocates things sequentially from the start of the disk. It works very nicely for desktop loads, but once you've got more than one proc doing io, data files can fragment badly. One obvious solution is something like ext2's bitmap groups, which puts file data into different areas of the disk based on which subdirectory they are in. The problem with bitmap groups is that if you've got a group of subdirectories their contents will be spread out all over the disk, leading to lots of seeks during a sequential read. This allocator patch uses the packing locality to determine which bitmap group to allocate from, but when you create a file it looks in the bitmaps to see how 'full' that packing locality already is. If it hasn't been heavily used yet, the packing locality is inherited from the parent directory putting files in new subdirs close to the parent subdir, otherwise it is the inode number of the parent directory putting new files far away from the parent subdir. The end result is fewer bitmap groups for the same working set. For example, one test data set created by 20 procs running in parallel has 6822 subdirs. And with vanilla reiserfs that would mean 6822 packing localities. This patch turns that into 26 packing localities. This makes sequential reads of big directory trees more efficient, but it also makes the btree more efficient in general. Things end up sorted better because groups of subdirs end up with similar keys in the btree, instead of being spread out all over. The bitmap grouping code tries to use the start of each bitmap group for metadata, and offsets the data slightly. The data and metadata are still close together, but not completely intermixed like they are in the default allocator. The end result is that leaf nodes tend to be close to each other, making metadata readahead more effective. 
The old block allocator had the ability to enforce a minimum allocation size, but did not use it. It now tries to do a pass looking for larger allocation chunks before falling back to the old behaviour of taking any blocks it can find. The patch changes the defaults to: mount -o alloc=skip_busy:dirid_groups:packing_groups You can get back the old behaviour with mount -o alloc=skip_busy mount -o alloc=dirid_groups will turn on the bitmap groups mount -o alloc=packing_groups turns on the packing locality reduction code mount -o alloc=skip_busy:dirid_groups turns on both dirid_groups and skip_busy Finally the patch adds a mount -o alloc=oid_groups, which puts files into bitmap groups based on a hash of their objectid. This would be used for databases or other situations where you have a limited number of very large files. This command will tell you how many packing localities are actually in use: debugreiserfs -d /dev/xxx | grep '^|.*SD' | sed 's/^.....//' | awk '{print $1}' | sort -u | wc -l --- 25-akpm/fs/reiserfs/bitmap.c | 288 +++++++++++++++++++++++++++++------- 25-akpm/fs/reiserfs/file.c | 14 + 25-akpm/fs/reiserfs/inode.c | 3 25-akpm/fs/reiserfs/super.c | 13 - 25-akpm/include/linux/reiserfs_fs.h | 11 + 5 files changed, 262 insertions(+), 67 deletions(-) diff -puN fs/reiserfs/bitmap.c~reiserfs-group-alloc-9 fs/reiserfs/bitmap.c --- 25/fs/reiserfs/bitmap.c~reiserfs-group-alloc-9 Wed May 19 14:27:35 2004 +++ 25-akpm/fs/reiserfs/bitmap.c Wed May 19 14:27:35 2004 @@ -30,6 +30,9 @@ #define _ALLOC_hashed_formatted_nodes 7 #define _ALLOC_old_way 8 #define _ALLOC_hundredth_slices 9 +#define _ALLOC_dirid_groups 10 +#define _ALLOC_oid_groups 11 +#define _ALLOC_packing_groups 12 #define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s)) #define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s)) @@ -150,11 +153,6 @@ static int scan_bitmap_block (struct rei __wait_on_buffer (bi->bh); } - /* If 
we know that first zero bit is only one or first zero bit is - closer to the end of bitmap than our start pointer */ - if (bi->first_zero_hint > *beg || bi->free_count == 1) - *beg = bi->first_zero_hint; - while (1) { cont: if (bi->free_count < min) @@ -204,21 +202,12 @@ static int scan_bitmap_block (struct rei while (--i >= *beg) reiserfs_test_and_clear_le_bit (i, bi->bh->b_data); reiserfs_restore_prepared_buffer (s, bi->bh); - *beg = max(org, (int)bi->first_zero_hint); + *beg = org; /* ... and search again in current block from beginning */ goto cont; } } bi->free_count -= (end - *beg); - - /* if search started from zero_hint bit, and zero hint have not - changed since, then we need to update first_zero_hint */ - if ( bi->first_zero_hint >= *beg) - /* no point in looking for free bit if there is not any */ - bi->first_zero_hint = (bi->free_count > 0 ) ? - reiserfs_find_next_zero_le_bit - ((unsigned long*)(bi->bh->b_data), s->s_blocksize << 3, end) : (s->s_blocksize << 3); - journal_mark_dirty (th, s, bi->bh); /* free block count calculation */ @@ -231,7 +220,52 @@ static int scan_bitmap_block (struct rei *beg = next; } } - } +} + +static int bmap_hash_id(struct super_block *s, u32 id) { + char * hash_in = NULL; + unsigned long hash; + unsigned bm; + + if (id <= 2) { + bm = 1; + } else { + hash_in = (char *)(&id); + hash = keyed_hash(hash_in, 4); + bm = hash % SB_BMAP_NR(s); + if (!bm) + bm = 1; + } + return bm; +} + +/* + * hashes the id and then returns > 0 if the block group for the + * corresponding hash is full + */ +static inline int block_group_used(struct super_block *s, u32 id) { + int bm; + bm = bmap_hash_id(s, id); + if (SB_AP_BITMAP(s)[bm].free_count > ((s->s_blocksize << 3) * 60 / 100) ) { + return 0; + } + return 1; +} + +/* + * the packing is returned in disk byte order + */ +u32 reiserfs_choose_packing(struct inode *dir) { + u32 packing; + if (TEST_OPTION(packing_groups, dir->i_sb)) { + if 
(block_group_used(dir->i_sb,le32_to_cpu(INODE_PKEY(dir)->k_dir_id))) + packing = INODE_PKEY(dir)->k_objectid; + else + packing = INODE_PKEY(dir)->k_dir_id; + } else + packing = INODE_PKEY(dir)->k_objectid; + return packing; +} /* Tries to find contiguous zero bit window (given size) in given region of * bitmap and place new blocks there. Returns number of allocated blocks. */ @@ -255,8 +289,18 @@ static int scan_bitmap (struct reiserfs_ get_bit_address (s, *start, &bm, &off); get_bit_address (s, finish, &end_bm, &end_off); - // With this option set first we try to find a bitmap that is at least 10% - // free, and if that fails, then we fall back to old whole bitmap scanning + /* When the bitmap is more than 10% free, anyone can allocate. + * When it's less than 10% free, only files that already use the + * bitmap are allowed. Once we pass 80% full, this restriction + * is lifted. + * + * We do this so that files that grow later still have space close to + * their original allocation. This improves locality, and presumably + * performance as a result. + * + * This is only an allocation policy and does not make up for getting a + * bad hint. Decent hinting must be implemented for this to work well. 
+ */ if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) { for (;bm < end_bm; bm++, off = 0) { if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 ) @@ -314,9 +358,6 @@ static void _reiserfs_free_block (struct "free_block (%s:%lu)[dev:blocknr]: bit already cleared", reiserfs_bdevname (s), block); } - if (offset < apbi[nr].first_zero_hint) { - apbi[nr].first_zero_hint = offset; - } apbi[nr].free_count ++; journal_mark_dirty (th, s, apbi[nr].bh); @@ -396,6 +437,15 @@ void reiserfs_discard_all_prealloc (stru __discard_prealloc(th, ei); } } + +void reiserfs_init_alloc_options (struct super_block *s) +{ + set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s)); + set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s)); + set_bit (_ALLOC_packing_groups, &SB_ALLOC_OPTS(s)); + reiserfs_warning (s, "allocator defaults = [%08x]\n", SB_ALLOC_OPTS(s)); +} + /* block allocator related options are parsed here */ int reiserfs_parse_alloc_options(struct super_block * s, char * options) { @@ -439,6 +489,18 @@ int reiserfs_parse_alloc_options(struct continue; } + if (!strcmp(this_char, "dirid_groups")) { + SET_OPTION(dirid_groups); + continue; + } + if (!strcmp(this_char, "oid_groups")) { + SET_OPTION(oid_groups); + continue; + } + if (!strcmp(this_char, "packing_groups")) { + SET_OPTION(packing_groups); + continue; + } if (!strcmp(this_char, "hashed_formatted_nodes")) { SET_OPTION(hashed_formatted_nodes); continue; @@ -481,6 +543,7 @@ int reiserfs_parse_alloc_options(struct return 1; } + reiserfs_warning (s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s)); return 0; } @@ -503,17 +566,76 @@ static inline void new_hashed_relocation hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); } -static inline void get_left_neighbor(reiserfs_blocknr_hint_t *hint) +/* + * Relocation based on dirid, hashing them into a given bitmap block + * files. 
Formatted nodes are unaffected, a seperate policy covers them + */ +static void +dirid_groups (reiserfs_blocknr_hint_t *hint) +{ + unsigned long hash; + __u32 dirid = 0; + int bm = 0; + struct super_block *sb = hint->th->t_super; + if (hint->inode) + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + else if (hint->formatted_node) + dirid = hint->key.k_dir_id; + + if (dirid) { + bm = bmap_hash_id(sb, dirid); + hash = bm * (sb->s_blocksize << 3); + /* give a portion of the block group to metadata */ + if (hint->inode) + hash += sb->s_blocksize/2; + hint->search_start = hash; + } +} + +/* + * Relocation based on oid, hashing them into a given bitmap block + * files. Formatted nodes are unaffected, a seperate policy covers them + */ +static void +oid_groups (reiserfs_blocknr_hint_t *hint) +{ + if (hint->inode) { + unsigned long hash; + __u32 oid; + __u32 dirid; + int bm; + + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + + /* keep the root dir and it's first set of subdirs close to + * the start of the disk + */ + if (dirid <= 2) + hash = (hint->inode->i_sb->s_blocksize << 3); + else { + oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid); + bm = bmap_hash_id(hint->inode->i_sb, oid); + hash = bm * (hint->inode->i_sb->s_blocksize << 3); + } + hint->search_start = hash; + } +} + +/* returns 1 if it finds an indirect item and gets valid hint info + * from it, otherwise 0 + */ +static int get_left_neighbor(reiserfs_blocknr_hint_t *hint) { struct path * path; struct buffer_head * bh; struct item_head * ih; int pos_in_item; __u32 * item; + int ret = 0; if (!hint->path) /* reiserfs code can call this function w/o pointer to path * structure supplied; then we rely on supplied search_start */ - return; + return 0; path = hint->path; bh = get_last_bh(path); @@ -534,15 +656,15 @@ static inline void get_left_neighbor(rei int t=get_block_num(item,pos_in_item); if (t) { hint->search_start = t; + ret = 1; break; } pos_in_item --; } - } else { - } + } /* does 
result value fit into specified region? */ - return; + return ret; } /* should be, if formatted node, then try to put on first part of the device @@ -639,12 +761,13 @@ static inline void hundredth_slices (rei } } -static inline void determine_search_start(reiserfs_blocknr_hint_t *hint, +static void determine_search_start(reiserfs_blocknr_hint_t *hint, int amount_needed) { struct super_block *s = hint->th->t_super; hint->beg = 0; hint->end = SB_BLOCK_COUNT(s) - 1; + int unfm_hint; /* This is former border algorithm. Now with tunable border offset */ if (concentrating_formatted_nodes(s)) @@ -673,19 +796,14 @@ static inline void determine_search_star return; } - /* attempt to copy a feature from old block allocator code */ - if (TEST_OPTION(old_hashed_relocation, s) && !hint->formatted_node) { - old_hashed_relocation(hint); - } - /* if none of our special cases is relevant, use the left neighbor in the tree order of the new node we are allocating for */ if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) { - hash_formatted_node(hint); + hash_formatted_node(hint); return; - } + } - get_left_neighbor(hint); + unfm_hint = get_left_neighbor(hint); /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation, new blocks are displaced based on directory ID. 
Also, if suggested search_start @@ -710,10 +828,36 @@ static inline void determine_search_star return; } - if (TEST_OPTION(old_hashed_relocation, s)) + /* old_hashed_relocation only works on unformatted */ + if (!unfm_hint && !hint->formatted_node && + TEST_OPTION(old_hashed_relocation, s)) + { old_hashed_relocation(hint); - if (TEST_OPTION(new_hashed_relocation, s)) + } + /* new_hashed_relocation works with both formatted/unformatted nodes */ + if ((!unfm_hint || hint->formatted_node) && + TEST_OPTION(new_hashed_relocation, s)) + { new_hashed_relocation(hint); + } + /* dirid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s)) + { + dirid_groups(hint); + } + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (hint->formatted_node && TEST_OPTION(dirid_groups,s)) + { + dirid_groups(hint); + } +#endif + + /* oid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s)) + { + oid_groups(hint); + } return; } @@ -738,13 +882,14 @@ static int determine_prealloc_size(reise static inline int allocate_without_wrapping_disk (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, b_blocknr_t start, b_blocknr_t finish, + int min, int amount_needed, int prealloc_size) { int rest = amount_needed; int nr_allocated; while (rest > 0 && start <= finish) { - nr_allocated = scan_bitmap (hint->th, &start, finish, 1, + nr_allocated = scan_bitmap (hint->th, &start, finish, min, rest + prealloc_size, !hint->formatted_node, hint->block); @@ -777,8 +922,9 @@ static inline int blocknrs_and_prealloc_ struct super_block *s = hint->th->t_super; b_blocknr_t start = hint->search_start; b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; - int second_pass = 0; + int passno = 0; int nr_allocated = 0; + int bigalloc = 0; determine_prealloc_size(hint); if (!hint->formatted_node) { @@ -797,32 +943,64 @@ static inline int blocknrs_and_prealloc_ if (quota_ret) 
hint->preallocate=hint->prealloc_size=0; } - } - - while((nr_allocated - += allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish, - amount_needed - nr_allocated, hint->prealloc_size)) - < amount_needed) { - - /* not all blocks were successfully allocated yet*/ - if (second_pass) { /* it was a second pass; we must free all blocks */ + /* for unformatted nodes, force large allocations */ + bigalloc = amount_needed + hint->prealloc_size; + /* try to make things even */ + if (bigalloc & 1 && hint->prealloc_size) + bigalloc--; + } + + do { + /* in bigalloc mode, nr_allocated should stay zero until + * the entire allocation is filled + */ + if (unlikely(bigalloc && nr_allocated)) { + reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n", + bigalloc, nr_allocated); + /* reset things to a sane value */ + bigalloc = amount_needed - nr_allocated; + } + /* + * try pass 0 and pass 1 looking for a nice big + * contiguous allocation. Then reset and look + * for anything you can find. 
+ */ + if (passno == 2 && bigalloc) { + passno = 0; + bigalloc = 0; + } + switch (passno++) { + case 0: /* Search from hint->search_start to end of disk */ + start = hint->search_start; + finish = SB_BLOCK_COUNT(s) - 1; + break; + case 1: /* Search from hint->beg to hint->search_start */ + start = hint->beg; + finish = hint->search_start; + break; + case 2: /* Last chance: Search from 0 to hint->beg */ + start = 0; + finish = hint->beg; + break; + default: /* We've tried searching everywhere, not enough space */ + /* Free the blocks */ if (!hint->formatted_node) { #ifdef REISERQUOTA_DEBUG reiserfs_debug (s, "reiserquota: freeing (nospace) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid); #endif DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); /* Free not allocated blocks */ } - while (nr_allocated --) + while (nr_allocated --) reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node); return NO_DISK_SPACE; - } else { /* refine search parameters for next pass */ - second_pass = 1; - finish = start; - start = 0; - continue; } - } + } while ((nr_allocated += allocate_without_wrapping_disk (hint, + new_blocknrs + nr_allocated, start, finish, + bigalloc ? bigalloc : 1, + amount_needed - nr_allocated, + hint->prealloc_size)) + < amount_needed); if ( !hint->formatted_node && amount_needed + hint->prealloc_size > nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) { diff -puN fs/reiserfs/file.c~reiserfs-group-alloc-9 fs/reiserfs/file.c --- 25/fs/reiserfs/file.c~reiserfs-group-alloc-9 Wed May 19 14:27:35 2004 +++ 25-akpm/fs/reiserfs/file.c Wed May 19 14:27:35 2004 @@ -176,12 +176,13 @@ int reiserfs_allocate_blocks_for_region( hint.formatted_node = 0; // We are allocating blocks for unformatted node. 
/* only preallocate if this is a small write */ - if (blocks_to_allocate < - REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize) + if (REISERFS_I(inode)->i_prealloc_count || + (!(write_bytes & (inode->i_sb->s_blocksize -1)) && + blocks_to_allocate < + REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize)) hint.preallocate = 1; else hint.preallocate = 0; - /* Call block allocator to allocate blocks */ res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate); if ( res != CARRY_ON ) { @@ -467,6 +468,12 @@ retry: // the inode. // pathrelse(&path); + /* + * cleanup prellocation from previous writes + * if this is a partial block write + */ + if (write_bytes & (inode->i_sb->s_blocksize -1)) + reiserfs_discard_prealloc(th, inode); reiserfs_write_unlock(inode->i_sb); // go through all the pages/buffers and map the buffers to newly allocated @@ -1254,6 +1261,7 @@ ssize_t reiserfs_file_write( struct file journal_end(&th, th.t_super, th.t_blocks_allocated); reiserfs_write_unlock(inode->i_sb); } + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA); diff -puN fs/reiserfs/inode.c~reiserfs-group-alloc-9 fs/reiserfs/inode.c --- 25/fs/reiserfs/inode.c~reiserfs-group-alloc-9 Wed May 19 14:27:35 2004 +++ 25-akpm/fs/reiserfs/inode.c Wed May 19 14:27:35 2004 @@ -1660,7 +1660,7 @@ int reiserfs_new_inode (struct reiserfs_ sb = dir->i_sb; /* item head of new item */ - ih.ih_key.k_dir_id = INODE_PKEY (dir)->k_objectid; + ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th)); if (!ih.ih_key.k_objectid) { err = -ENOMEM; @@ -1729,7 +1729,6 @@ int reiserfs_new_inode (struct reiserfs_ err = -EEXIST; goto out_bad_inode; } - if (old_format_only (sb)) { if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) { pathrelse (&path_to_key); diff -puN fs/reiserfs/super.c~reiserfs-group-alloc-9 fs/reiserfs/super.c --- 
25/fs/reiserfs/super.c~reiserfs-group-alloc-9 Wed May 19 14:27:35 2004 +++ 25-akpm/fs/reiserfs/super.c Wed May 19 14:27:35 2004 @@ -492,7 +492,6 @@ static void reiserfs_clear_inode (struct REISERFS_I(inode)->i_acl_default = NULL; } - struct super_operations reiserfs_sops = { .alloc_inode = reiserfs_alloc_inode, @@ -651,7 +650,7 @@ static int reiserfs_getopt ( struct supe reiserfs_warning (s, "head of option \"%s\" is only correct", opt->option_name); return -1; } - + /* move to the argument, or to next option if argument is not required */ p ++; @@ -1345,15 +1344,17 @@ static int reiserfs_fill_super (struct s memset (sbi, 0, sizeof (struct reiserfs_sb_info)); /* Set default values for options: non-aggressive tails */ REISERFS_SB(s)->s_mount_opt = ( 1 << REISERFS_SMALLTAIL ); - /* default block allocator option: skip_busy */ - REISERFS_SB(s)->s_alloc_options.bits = ( 1 << 5); - /* If file grew past 4 blocks, start preallocation blocks for it. */ - REISERFS_SB(s)->s_alloc_options.preallocmin = 4; + /* no preallocation minimum, be smart in + reiserfs_file_write instead */ + REISERFS_SB(s)->s_alloc_options.preallocmin = 0; /* Preallocate by 16 blocks (17-1) at once */ REISERFS_SB(s)->s_alloc_options.preallocsize = 17; /* Initialize the rwsem for xattr dir */ init_rwsem(&REISERFS_SB(s)->xattr_dir_sem); + /* setup default block allocator options */ + reiserfs_init_alloc_options(s); + jdev_name = NULL; if (reiserfs_parse_options (s, (char *) data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age) == 0) { goto error; diff -puN include/linux/reiserfs_fs.h~reiserfs-group-alloc-9 include/linux/reiserfs_fs.h --- 25/include/linux/reiserfs_fs.h~reiserfs-group-alloc-9 Wed May 19 14:27:35 2004 +++ 25-akpm/include/linux/reiserfs_fs.h Wed May 19 14:27:35 2004 @@ -1247,7 +1247,7 @@ struct path { #define pos_in_item(path) ((path)->pos_in_item) #define INITIALIZE_PATH(var) \ -struct path var = {ILLEGAL_PATH_ELEMENT_OFFSET, } +struct path var = {.path_length = 
ILLEGAL_PATH_ELEMENT_OFFSET,} /* Get path element by path and path position. */ #define PATH_OFFSET_PELEMENT(p_s_path,n_offset) ((p_s_path)->path_elements +(n_offset)) @@ -2149,6 +2149,15 @@ struct buffer_head * get_FEB (struct tre typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t; int reiserfs_parse_alloc_options (struct super_block *, char *); +void reiserfs_init_alloc_options (struct super_block *s); + +/* + * given a directory, this will tell you what packing locality + * to use for a new object underneat it. The locality is returned + * in disk byte order (le). + */ +u32 reiserfs_choose_packing(struct inode *dir); + int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value); void reiserfs_free_block (struct reiserfs_transaction_handle *th, struct inode *, b_blocknr_t, int for_unformatted); int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int); _