Btrfs: move data checksumming into a dedicated tree

Btrfs stores checksums for each data block.  Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block.  This means that when we read the inode,
we've probably read in at least some checksums as well.

But, this has a few problems:

* The checksums are indexed by logical offset in the file.  When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data.  It would be faster if we could checksum
the compressed data instead.

* If we implement encryption, we'll be checksumming the plain text and
storing that on disk.  This is significantly less secure.

* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct.  This makes the raid
layer balancing and extent moving much more expensive.

* It makes the front end caching code more complex, as we have to touch
the subvolume and inodes as we cache extents.

* There is potentially one copy of the checksum in each subvolume
referencing an extent.

The solution used here is to store the extent checksums in a dedicated
tree.  This allows us to index the checksums by physical extent
start and length.  It means:

* The checksum is against the data stored on disk, after any compression
or encryption is done.

* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.

This makes compression significantly faster by reducing the amount of
data that needs to be checksummed.  It will also allow much faster
raid management code in general.

The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent.  This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Chris Mason 2008-12-08 17:00:31 -05:00
parent d45ee76e4f
commit d79f499eae
5 changed files with 51 additions and 10 deletions

View File

@ -2172,7 +2172,7 @@ int do_convert(const char *devname, int datacsum, int packing, int noxattr)
{
int i, fd, ret;
u32 blocksize;
u64 blocks[6];
u64 blocks[7];
u64 total_bytes;
u64 super_bytenr;
ext2_filsys ext2_fs;
@ -2195,7 +2195,7 @@ int do_convert(const char *devname, int datacsum, int packing, int noxattr)
fprintf(stderr, "filetype feature is missing\n");
goto fail;
}
for (i = 0; i < 6; i++) {
for (i = 0; i < 7; i++) {
ret = ext2_alloc_block(ext2_fs, 0, blocks + i);
if (ret) {
fprintf(stderr, "not enough free space\n");

16
ctree.h
View File

@ -54,6 +54,9 @@ struct btrfs_trans_handle;
/* directory objectid inside the root tree */
#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL
/* orphan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
@ -66,6 +69,13 @@ struct btrfs_trans_handle;
#define BTRFS_TREE_RELOC_OBJECTID -8ULL
#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
/*
* extent checksums all have this objectid
* this allows them to share the logging tree
* for fsyncs
*/
#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
/* dummy objectid represents multiple objectids */
#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
@ -583,6 +593,7 @@ struct btrfs_fs_info {
struct btrfs_root *tree_root;
struct btrfs_root *chunk_root;
struct btrfs_root *dev_root;
struct btrfs_root *csum_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@ -688,6 +699,11 @@ struct btrfs_root {
* csum items have the checksums for data in the extents
*/
#define BTRFS_CSUM_ITEM_KEY 120
/*
* extent csums are stored in a separate tree and hold csums for
* an entire extent on disk.
*/
#define BTRFS_EXTENT_CSUM_KEY 128
/*
* root items point to tree roots. There are typically in the root

4
mkfs.c
View File

@ -328,7 +328,7 @@ int main(int ac, char **av)
char *first_file;
u64 block_count = 0;
u64 dev_block_count = 0;
u64 blocks[6];
u64 blocks[7];
u64 alloc_start = 0;
u64 metadata_profile = BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP;
u64 data_profile = BTRFS_BLOCK_GROUP_RAID0;
@ -414,7 +414,7 @@ int main(int ac, char **av)
if (block_count == 0)
block_count = dev_block_count;
for (i = 0; i < 6; i++)
for (i = 0; i < 7; i++)
blocks[i] = BTRFS_SUPER_INFO_OFFSET + leafsize * i;
ret = make_btrfs(fd, file, label, blocks, block_count,

View File

@ -279,6 +279,10 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
ci = btrfs_item_ptr(l, i, struct btrfs_csum_item);
printf("\t\tcsum item\n");
break;
case BTRFS_EXTENT_CSUM_KEY:
ci = btrfs_item_ptr(l, i, struct btrfs_csum_item);
printf("\t\textent csum item\n");
break;
case BTRFS_EXTENT_DATA_KEY:
fi = btrfs_item_ptr(l, i,
struct btrfs_file_extent_item);

33
utils.c
View File

@ -46,16 +46,17 @@
static inline int ioctl(int fd, int define, u64 *size) { return 0; }
#endif
static u64 reference_root_table[6] = {
static u64 reference_root_table[] = {
[1] = BTRFS_ROOT_TREE_OBJECTID,
[2] = BTRFS_EXTENT_TREE_OBJECTID,
[3] = BTRFS_CHUNK_TREE_OBJECTID,
[4] = BTRFS_DEV_TREE_OBJECTID,
[5] = BTRFS_FS_TREE_OBJECTID,
[6] = BTRFS_CSUM_TREE_OBJECTID,
};
int make_btrfs(int fd, const char *device, const char *label,
u64 blocks[6], u64 num_bytes, u32 nodesize,
u64 blocks[7], u64 num_bytes, u32 nodesize,
u32 leafsize, u32 sectorsize, u32 stripesize)
{
struct btrfs_super_block super;
@ -96,7 +97,7 @@ int make_btrfs(int fd, const char *device, const char *label,
btrfs_set_super_root(&super, blocks[1]);
btrfs_set_super_chunk_root(&super, blocks[3]);
btrfs_set_super_total_bytes(&super, num_bytes);
btrfs_set_super_bytes_used(&super, 5 * leafsize);
btrfs_set_super_bytes_used(&super, 6 * leafsize);
btrfs_set_super_sectorsize(&super, sectorsize);
btrfs_set_super_leafsize(&super, leafsize);
btrfs_set_super_nodesize(&super, nodesize);
@ -112,7 +113,7 @@ int make_btrfs(int fd, const char *device, const char *label,
memset(buf->data, 0, leafsize);
buf->len = leafsize;
btrfs_set_header_bytenr(buf, blocks[1]);
btrfs_set_header_nritems(buf, 3);
btrfs_set_header_nritems(buf, 4);
btrfs_set_header_generation(buf, 1);
btrfs_set_header_owner(buf, BTRFS_ROOT_TREE_OBJECTID);
write_extent_buffer(buf, super.fsid, (unsigned long)
@ -174,6 +175,18 @@ int make_btrfs(int fd, const char *device, const char *label,
sizeof(root_item));
nritems++;
itemoff = itemoff - sizeof(root_item);
btrfs_set_root_bytenr(&root_item, blocks[6]);
btrfs_set_disk_key_objectid(&disk_key, BTRFS_CSUM_TREE_OBJECTID);
btrfs_set_item_key(buf, &disk_key, nritems);
btrfs_set_item_offset(buf, btrfs_item_nr(buf, nritems), itemoff);
btrfs_set_item_size(buf, btrfs_item_nr(buf, nritems),
sizeof(root_item));
write_extent_buffer(buf, &root_item,
btrfs_item_ptr_offset(buf, nritems),
sizeof(root_item));
nritems++;
csum_tree_block_size(buf, BTRFS_CRC32_SIZE, 0);
ret = pwrite(fd, buf->data, leafsize, blocks[1]);
@ -193,7 +206,7 @@ int make_btrfs(int fd, const char *device, const char *label,
extent_item = btrfs_item_ptr(buf, nritems, struct btrfs_extent_item);
btrfs_set_extent_refs(buf, extent_item, 1);
nritems++;
for (i = 1; i < 6; i++) {
for (i = 1; i < 7; i++) {
BUG_ON(blocks[i] < first_free);
BUG_ON(blocks[i] < blocks[i - 1]);
@ -352,7 +365,7 @@ int make_btrfs(int fd, const char *device, const char *label,
csum_tree_block_size(buf, BTRFS_CRC32_SIZE, 0);
ret = pwrite(fd, buf->data, leafsize, blocks[4]);
/* finally create the FS root */
/* create the FS root */
btrfs_set_header_bytenr(buf, blocks[5]);
btrfs_set_header_owner(buf, BTRFS_FS_TREE_OBJECTID);
btrfs_set_header_nritems(buf, 0);
@ -360,6 +373,14 @@ int make_btrfs(int fd, const char *device, const char *label,
ret = pwrite(fd, buf->data, leafsize, blocks[5]);
BUG_ON(ret != leafsize);
/* finally create the csum root */
btrfs_set_header_bytenr(buf, blocks[6]);
btrfs_set_header_owner(buf, BTRFS_CSUM_TREE_OBJECTID);
btrfs_set_header_nritems(buf, 0);
csum_tree_block_size(buf, BTRFS_CRC32_SIZE, 0);
ret = pwrite(fd, buf->data, leafsize, blocks[6]);
BUG_ON(ret != leafsize);
/* and write out the super block */
BUG_ON(sizeof(super) > sectorsize);
memset(buf->data, 0, sectorsize);