Add basic RAID[56] support

David Woodhouse originally contributed this code, and Chris Mason
changed it around to reflect the current design goals for raid56.

The original code expected all metadata and data writes to be full
stripes.  This meant metadata block size == stripe size, and had a few
other restrictions.

This version allows metadata blocks smaller than the stripe size.  It
implements both raid5 and raid6, although it does not have code to
rebuild from parity if one of the drives is missing or incorrect.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
master
David Woodhouse 2009-07-11 18:12:37 +01:00 committed by Chris Mason
parent 6f082141d0
commit 4d48b96b28
18 changed files with 540 additions and 53 deletions

View File

@ -5,7 +5,7 @@ objects = ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
root-tree.o dir-item.o file-item.o inode-item.o \
inode-map.o crc32c.o rbtree.o extent-cache.o extent_io.o \
volumes.o utils.o btrfs-list.o btrfslabel.o repair.o \
send-stream.o send-utils.o qgroup.o
send-stream.o send-utils.o qgroup.o raid6.o
cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \
cmds-quota.o cmds-qgroup.o

View File

@ -50,7 +50,8 @@ struct extent_buffer *debug_corrupt_block(struct btrfs_root *root, u64 bytenr,
length = blocksize;
while (1) {
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
eb->start, &length, &multi, mirror_num);
eb->start, &length, &multi,
mirror_num, NULL);
BUG_ON(ret);
device = multi->stripes[0].dev;
eb->fd = device->fd;
@ -63,7 +64,7 @@ struct extent_buffer *debug_corrupt_block(struct btrfs_root *root, u64 bytenr,
kfree(multi);
if (!copy || mirror_num == copy) {
ret = read_extent_from_disk(eb);
ret = read_extent_from_disk(eb, 0, eb->len);
printf("corrupting %llu copy %d\n", eb->start,
mirror_num);
memset(eb->data, 0, eb->len);

View File

@ -55,7 +55,8 @@ struct extent_buffer *debug_read_block(struct btrfs_root *root, u64 bytenr,
length = blocksize;
while (1) {
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
eb->start, &length, &multi, mirror_num);
eb->start, &length, &multi,
mirror_num, NULL);
BUG_ON(ret);
device = multi->stripes[0].dev;
eb->fd = device->fd;
@ -68,7 +69,7 @@ struct extent_buffer *debug_read_block(struct btrfs_root *root, u64 bytenr,
kfree(multi);
if (!copy || mirror_num == copy)
ret = read_extent_from_disk(eb);
ret = read_extent_from_disk(eb, 0, eb->len);
num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
eb->start, eb->len);

View File

@ -47,6 +47,10 @@ static int parse_one_profile(const char *profile, u64 *flags)
*flags |= BTRFS_BLOCK_GROUP_RAID1;
} else if (!strcmp(profile, "raid10")) {
*flags |= BTRFS_BLOCK_GROUP_RAID10;
} else if (!strcmp(profile, "raid5")) {
*flags |= BTRFS_BLOCK_GROUP_RAID5;
} else if (!strcmp(profile, "raid6")) {
*flags |= BTRFS_BLOCK_GROUP_RAID6;
} else if (!strcmp(profile, "dup")) {
*flags |= BTRFS_BLOCK_GROUP_DUP;
} else if (!strcmp(profile, "single")) {

View File

@ -148,6 +148,12 @@ static int cmd_df(int argc, char **argv)
} else if (flags & BTRFS_BLOCK_GROUP_RAID10) {
snprintf(description+written, 9, "%s", ", RAID10");
written += 8;
} else if (flags & BTRFS_BLOCK_GROUP_RAID5) {
snprintf(description+written, 9, "%s", ", RAID5");
written += 7;
} else if (flags & BTRFS_BLOCK_GROUP_RAID6) {
snprintf(description+written, 9, "%s", ", RAID6");
written += 7;
}
total_bytes = pretty_sizes(sargs->spaces[i].total_bytes);

View File

@ -2430,7 +2430,7 @@ static int may_rollback(struct btrfs_root *root)
while (1) {
ret = btrfs_map_block(&info->mapping_tree, WRITE, bytenr,
&length, &multi, 0);
&length, &multi, 0, NULL);
if (ret)
goto fail;

View File

@ -437,6 +437,7 @@ struct btrfs_super_block {
* code was pretty buggy. Lets not let them try anymore.
*/
#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
@ -446,6 +447,7 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
BTRFS_FEATURE_INCOMPAT_RAID56 | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
/*
@ -779,6 +781,8 @@ struct btrfs_csum_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
/* used in struct btrfs_balance_args fields */

216
disk-io.c
View File

@ -89,8 +89,8 @@ int csum_tree_block_size(struct extent_buffer *buf, u16 csum_size,
if (verify) {
if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
printk("checksum verify failed on %llu wanted %X "
"found %X\n", (unsigned long long)buf->start,
printk("checksum verify failed on %llu found %X "
"wanted %X\n", (unsigned long long)buf->start,
*((int *)result), *((char *)buf->data));
free(result);
return 1;
@ -141,7 +141,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
length = blocksize;
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
bytenr, &length, &multi, 0);
bytenr, &length, &multi, 0, NULL);
BUG_ON(ret);
device = multi->stripes[0].dev;
device->total_ios++;
@ -182,15 +182,52 @@ out:
}
/*
 * Fill @eb->data with the on-disk contents of the logical range
 * [eb->start, eb->start + eb->len), using mirror @mirror.
 *
 * The range is mapped and read piecewise: each pass asks btrfs_map_block()
 * how many bytes can be read from the current logical position, reads at
 * most that many from the first returned stripe, and advances.
 *
 * Side effects: eb->fd and eb->dev_bytenr are left pointing at the device
 * and physical offset used for the last piece, and the total_ios counter of
 * every device touched is incremented.
 *
 * Returns 0 on success, or -EIO if a piece cannot be mapped, the mapped
 * device has an fd of 0, or the read itself fails.
 */
static int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, int mirror)
{
	unsigned long offset = 0;
	struct btrfs_multi_bio *multi = NULL;
	struct btrfs_device *device;
	int ret = 0;
	u64 read_len;
	unsigned long bytes_left = eb->len;

	while (bytes_left) {
		read_len = bytes_left;
		ret = btrfs_map_block(&info->mapping_tree, READ,
				      eb->start + offset, &read_len, &multi,
				      mirror, NULL);
		if (ret) {
			printk("Couldn't map the block %Lu\n", eb->start + offset);
			return -EIO;
		}
		device = multi->stripes[0].dev;

		if (device->fd == 0) {
			/*
			 * The mapping was handed to us by btrfs_map_block();
			 * release it here too, otherwise this early exit
			 * leaks it.
			 */
			kfree(multi);
			return -EIO;
		}

		eb->fd = device->fd;
		device->total_ios++;
		eb->dev_bytenr = multi->stripes[0].physical;
		kfree(multi);
		/* do not keep a dangling pointer across iterations */
		multi = NULL;

		/* the mapping may cover more than what is left of this buffer */
		if (read_len > bytes_left)
			read_len = bytes_left;

		ret = read_extent_from_disk(eb, offset, read_len);
		if (ret)
			return -EIO;
		offset += read_len;
		bytes_left -= read_len;
	}
	return 0;
}
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
u32 blocksize, u64 parent_transid)
{
int ret;
struct extent_buffer *eb;
u64 length;
u64 best_transid = 0;
struct btrfs_multi_bio *multi = NULL;
struct btrfs_device *device;
int mirror_num = 0;
int good_mirror = 0;
int num_copies;
@ -203,21 +240,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
if (btrfs_buffer_uptodate(eb, parent_transid))
return eb;
length = blocksize;
while (1) {
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
eb->start, &length, &multi, mirror_num);
if (ret) {
printk("Couldn't map the block %Lu\n", bytenr);
break;
}
device = multi->stripes[0].dev;
eb->fd = device->fd;
device->total_ios++;
eb->dev_bytenr = multi->stripes[0].physical;
kfree(multi);
ret = read_extent_from_disk(eb);
ret = read_whole_eb(root->fs_info, eb, mirror_num);
if (ret == 0 && check_tree_block(root, eb) == 0 &&
csum_tree_block(root, eb, 1) == 0 &&
verify_parent_transid(eb->tree, eb, parent_transid, ignore)
@ -253,12 +277,156 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
return NULL;
}
/*
 * Read-modify-write helper for a stripe-sized buffer.
 *
 * @eb is first filled from disk (mirror 0); afterwards whatever part of
 * @orig_eb overlaps the logical range of @eb is copied on top of it.  If
 * the two ranges do not intersect, @eb is left holding just the disk
 * contents.
 *
 * Returns 0 on success or the error reported by read_whole_eb().
 */
static int rmw_eb(struct btrfs_fs_info *info,
		  struct extent_buffer *eb, struct extent_buffer *orig_eb)
{
	unsigned long src_off = 0;	/* offset into orig_eb->data */
	unsigned long dst_off = 0;	/* offset into eb->data */
	unsigned long nbytes = eb->len;
	int overlaps;
	int err = read_whole_eb(info, eb, 0);

	if (err)
		return err;

	overlaps = eb->start + eb->len > orig_eb->start &&
		   eb->start < orig_eb->start + orig_eb->len;
	if (!overlaps)
		return 0;

	/*
	 * Possible layouts of orig_eb relative to the stripe:
	 *
	 * | ----- orig_eb ------- |
	 *         | ----- stripe -------  |
	 *                 | ----- orig_eb ------- |
	 *           | ----- orig_eb ------- |
	 *
	 * Whichever buffer starts later begins at offset 0; the other one
	 * is entered at the distance between the two start addresses.
	 */
	if (eb->start > orig_eb->start)
		src_off = eb->start - orig_eb->start;
	else if (orig_eb->start > eb->start)
		dst_off = orig_eb->start - eb->start;

	/* clamp the copy to what remains in either buffer */
	if (nbytes > orig_eb->len - src_off)
		nbytes = orig_eb->len - src_off;
	if (nbytes > eb->len - dst_off)
		nbytes = eb->len - dst_off;

	memcpy(eb->data + dst_off, orig_eb->data + src_off, nbytes);
	return 0;
}
/*
 * Build one stripe_len-sized extent buffer for every leading data entry of
 * @raid_map and store it in the matching slot of @ebs.
 *
 * Walking stops at the first entry that is >= BTRFS_RAID5_P_STRIPE; slots
 * from there on are not written.  A stripe that lies completely inside
 * @orig_eb is copied straight out of it; any other stripe goes through
 * rmw_eb(), which reads it from disk and overlays the part of @orig_eb
 * that intersects it.
 *
 * Allocation failure and rmw_eb() failure both BUG.
 */
static void split_eb_for_raid56(struct btrfs_fs_info *info,
				struct extent_buffer *orig_eb,
				struct extent_buffer **ebs,
				u64 stripe_len, u64 *raid_map,
				int num_stripes)
{
	const u64 orig_start = orig_eb->start;
	int idx;

	for (idx = 0;
	     idx < num_stripes && raid_map[idx] < BTRFS_RAID5_P_STRIPE;
	     idx++) {
		const u64 stripe_start = raid_map[idx];
		struct extent_buffer *stripe_eb;
		int fully_covered;
		int err;

		stripe_eb = malloc(sizeof(struct extent_buffer) + stripe_len);
		if (!stripe_eb)
			BUG();
		memset(stripe_eb, 0, sizeof(struct extent_buffer) + stripe_len);

		stripe_eb->start = stripe_start;
		stripe_eb->len = stripe_len;
		stripe_eb->refs = 1;
		stripe_eb->flags = 0;
		/* the caller assigns the real device and physical offset */
		stripe_eb->fd = -1;
		stripe_eb->dev_bytenr = (u64)-1;

		fully_covered = orig_start <= stripe_start &&
				orig_start + orig_eb->len >= stripe_start + stripe_len;
		if (fully_covered) {
			memcpy(stripe_eb->data,
			       orig_eb->data + stripe_eb->start - orig_start,
			       stripe_len);
		} else {
			err = rmw_eb(info, stripe_eb, orig_eb);
			BUG_ON(err);
		}
		ebs[idx] = stripe_eb;
	}
}
/*
 * Write @eb to a RAID5/6 chunk: split it into per-stripe buffers, compute
 * the parity stripe(s) and write every stripe to its device.
 *
 * @multi and @raid_map describe the stripes; entries of @raid_map that are
 * below BTRFS_RAID5_P_STRIPE are treated as data stripes, the values
 * BTRFS_RAID5_P_STRIPE / BTRFS_RAID6_Q_STRIPE mark the parity buffers.
 * The code below stores the parity buffers in the last slot(s) of ebs[],
 * so it presumably relies on the caller having sorted parity entries last
 * (see sort_parity_stripes()) -- TODO confirm for every caller.
 *
 * Any failure trips a BUG/BUG_ON; the function always returns 0.
 */
static int write_raid56_with_parity(struct btrfs_fs_info *info,
struct extent_buffer *eb,
struct btrfs_multi_bio *multi,
u64 stripe_len, u64 *raid_map)
{
struct extent_buffer *ebs[multi->num_stripes], *p_eb = NULL, *q_eb = NULL;
int i;
int j;
int ret;
/* parity buffers get room for the larger of eb->len and stripe_len */
int alloc_size = eb->len;
if (stripe_len > alloc_size)
alloc_size = stripe_len;
/* fills the leading data-stripe slots of ebs[] with freshly allocated buffers */
split_eb_for_raid56(info, eb, ebs, stripe_len, raid_map,
multi->num_stripes);
/* attach each stripe to its device; allocate buffers for P and Q */
for (i = 0; i < multi->num_stripes; i++) {
struct extent_buffer *new_eb;
if (raid_map[i] < BTRFS_RAID5_P_STRIPE) {
ebs[i]->dev_bytenr = multi->stripes[i].physical;
ebs[i]->fd = multi->stripes[i].dev->fd;
multi->stripes[i].dev->total_ios++;
BUG_ON(ebs[i]->start != raid_map[i]);
continue;
}
/* NOTE(review): only dev_bytenr, fd and len are initialised on this buffer here */
new_eb = kmalloc(sizeof(*eb) + alloc_size, GFP_NOFS);
BUG_ON(!new_eb);
new_eb->dev_bytenr = multi->stripes[i].physical;
new_eb->fd = multi->stripes[i].dev->fd;
multi->stripes[i].dev->total_ios++;
new_eb->len = stripe_len;
if (raid_map[i] == BTRFS_RAID5_P_STRIPE)
p_eb = new_eb;
else if (raid_map[i] == BTRFS_RAID6_Q_STRIPE)
q_eb = new_eb;
}
if (q_eb) {
/* RAID6: P and Q take the last two slots, then both are generated in one pass */
void *pointers[multi->num_stripes];
ebs[multi->num_stripes - 2] = p_eb;
ebs[multi->num_stripes - 1] = q_eb;
for (i = 0; i < multi->num_stripes; i++)
pointers[i] = ebs[i]->data;
raid6_gen_syndrome(multi->num_stripes, stripe_len, pointers);
} else {
/* RAID5: P is the XOR of all data stripes, seeded with a copy of the first one */
ebs[multi->num_stripes - 1] = p_eb;
memcpy(p_eb->data, ebs[0]->data, stripe_len);
for (j = 1; j < multi->num_stripes - 1; j++) {
/* XOR one unsigned long at a time */
for (i = 0; i < stripe_len; i += sizeof(unsigned long)) {
*(unsigned long *)(p_eb->data + i) ^=
*(unsigned long *)(ebs[j]->data + i);
}
}
}
/* write every stripe (data and parity) and release the temporary buffers */
for (i = 0; i < multi->num_stripes; i++) {
ret = write_extent_to_disk(ebs[i]);
BUG_ON(ret);
/* NOTE(review): nothing in this function stores eb itself in ebs[], so this looks always true */
if (ebs[i] != eb)
kfree(ebs[i]);
}
return 0;
}
int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *eb)
{
int ret;
int dev_nr;
u64 length;
u64 *raid_map = NULL;
struct btrfs_multi_bio *multi = NULL;
if (check_tree_block(root, eb))
@ -272,9 +440,13 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
dev_nr = 0;
length = eb->len;
ret = btrfs_map_block(&root->fs_info->mapping_tree, WRITE,
eb->start, &length, &multi, 0);
eb->start, &length, &multi, 0, &raid_map);
while(dev_nr < multi->num_stripes) {
if (raid_map) {
ret = write_raid56_with_parity(root->fs_info, eb, multi,
length, raid_map);
BUG_ON(ret);
} else while (dev_nr < multi->num_stripes) {
BUG_ON(ret);
eb->fd = multi->stripes[dev_nr].dev->fd;
eb->dev_bytenr = multi->stripes[dev_nr].physical;

View File

@ -82,3 +82,6 @@ int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
int verify);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
#endif
/* raid6.c */
void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs);

View File

@ -1762,6 +1762,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_DUP);
if (extra_flags) {
if (flags & BTRFS_BLOCK_GROUP_DATA)

View File

@ -663,13 +663,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
return eb;
}
int read_extent_from_disk(struct extent_buffer *eb)
int read_extent_from_disk(struct extent_buffer *eb,
unsigned long offset, unsigned long len)
{
int ret;
ret = pread(eb->fd, eb->data, eb->len, eb->dev_bytenr);
ret = pread(eb->fd, eb->data + offset, len, eb->dev_bytenr);
if (ret < 0)
goto out;
if (ret != eb->len) {
if (ret != len) {
ret = -EIO;
goto out;
}

View File

@ -95,7 +95,8 @@ struct extent_buffer *find_first_extent_buffer(struct extent_io_tree *tree,
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
u64 bytenr, u32 blocksize);
void free_extent_buffer(struct extent_buffer *eb);
int read_extent_from_disk(struct extent_buffer *eb);
int read_extent_from_disk(struct extent_buffer *eb,
unsigned long offset, unsigned long len);
int write_extent_to_disk(struct extent_buffer *eb);
int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len);

View File

@ -378,7 +378,8 @@ static int find_root(struct btrfs_root *root)
offset = metadata_offset;
}
err = __btrfs_map_block(&root->fs_info->mapping_tree, READ,
offset, &map_length, &type, &multi, 0);
offset, &map_length, &type,
&multi, 0, NULL);
if (err) {
offset += map_length;
continue;

40
mkfs.c
View File

@ -207,7 +207,8 @@ static int create_raid_groups(struct btrfs_trans_handle *trans,
int metadata_profile_opt, int mixed)
{
u64 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
u64 allowed;
u64 allowed = 0;
u64 devices_for_raid = num_devices;
int ret;
/*
@ -223,13 +224,22 @@ static int create_raid_groups(struct btrfs_trans_handle *trans,
BTRFS_BLOCK_GROUP_RAID0 : 0; /* raid0 or single */
}
if (num_devices == 1)
allowed = BTRFS_BLOCK_GROUP_DUP;
else if (num_devices >= 4) {
allowed = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10;
} else
allowed = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1;
if (devices_for_raid > 4)
devices_for_raid = 4;
switch (devices_for_raid) {
default:
case 4:
allowed |= BTRFS_BLOCK_GROUP_RAID10;
case 3:
allowed |= BTRFS_BLOCK_GROUP_RAID6;
case 2:
allowed |= BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID5;
break;
case 1:
allowed |= BTRFS_BLOCK_GROUP_DUP;
}
if (metadata_profile & ~allowed) {
fprintf(stderr, "unable to create FS with metadata "
@ -336,6 +346,10 @@ static u64 parse_profile(char *s)
return BTRFS_BLOCK_GROUP_RAID0;
} else if (strcmp(s, "raid1") == 0) {
return BTRFS_BLOCK_GROUP_RAID1;
} else if (strcmp(s, "raid5") == 0) {
return BTRFS_BLOCK_GROUP_RAID5;
} else if (strcmp(s, "raid6") == 0) {
return BTRFS_BLOCK_GROUP_RAID6;
} else if (strcmp(s, "raid10") == 0) {
return BTRFS_BLOCK_GROUP_RAID10;
} else if (strcmp(s, "dup") == 0) {
@ -1438,6 +1452,16 @@ raid_groups:
btrfs_set_super_incompat_flags(super, flags);
}
if ((data_profile | metadata_profile) &
(BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
struct btrfs_super_block *super = &root->fs_info->super_copy;
u64 flags = btrfs_super_incompat_flags(super);
flags |= BTRFS_FEATURE_INCOMPAT_RAID56;
btrfs_set_super_incompat_flags(super, flags);
printf("Setting RAID5/6 feature flag\n");
}
printf("fs created label %s on %s\n\tnodesize %u leafsize %u "
"sectorsize %u size %s\n",
label, first_file, nodesize, leafsize, sectorsize,

97
raid6.c 100644
View File

@ -0,0 +1,97 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* raid6int1.c
*
* 1-way unrolled portable integer math RAID-6 instruction set
*
* This file was postprocessed using unroll.pl and then ported to userspace
*/
#include <stdint.h>
#include <unistd.h>
/*
* This is the C data type to use
*/
/* Change this from BITS_PER_LONG if there is something better... */
/*
 * Pick the machine word used by the syndrome loop:
 *   NBYTES(x) replicates the byte x into every byte of the word,
 *   NSIZE     is the word size in bytes,
 *   NSHIFT    is log2(NSIZE).
 *
 * NOTE(review): no #include visible in this file obviously provides
 * BITS_PER_LONG.  An identifier that is not a macro evaluates to 0 inside
 * #if, which would silently select the 32-bit branch -- confirm the build
 * defines it where the 64-bit path is wanted.
 */
#if BITS_PER_LONG == 64
# define NBYTES(x) ((x) * 0x0101010101010101UL)
# define NSIZE 8
# define NSHIFT 3
typedef uint64_t unative_t;
#else
# define NBYTES(x) ((x) * 0x01010101U)
# define NSIZE 4
# define NSHIFT 2
typedef uint32_t unative_t;
#endif
/*
* These sub-operations are separate inlines since they can sometimes be
* specially optimized using architecture-specific hacks.
*/
/*
 * SHLBYTE(): shift every byte of the word left by one bit.  The mask
 * clears the low bit of each byte so nothing carried out of one byte
 * leaks into its neighbour.
 */
static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
{
	return (v << 1) & NBYTES(0xfe);
}
/*
 * MASK(): expand the top bit of every byte into a whole-byte mask, i.e.
 * a byte becomes 0xFF when its high bit is set and 0x00 otherwise.
 */
static inline __attribute_const__ unative_t MASK(unative_t v)
{
	const unative_t high_bits = v & NBYTES(0x80);

	/* Overflow on the top bit is OK */
	return (high_bits << 1) - (high_bits >> 7);
}
void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
uint8_t **dptr = (uint8_t **)ptrs;
uint8_t *p, *q;
int d, z, z0;
unative_t wd0, wq0, wp0, w10, w20;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
q = dptr[z0+2]; /* RS syndrome */
for ( d = 0 ; d < bytes ; d += NSIZE*1 ) {
wq0 = wp0 = *(unative_t *)&dptr[z0][d+0*NSIZE];
for ( z = z0-1 ; z >= 0 ; z-- ) {
wd0 = *(unative_t *)&dptr[z][d+0*NSIZE];
wp0 ^= wd0;
w20 = MASK(wq0);
w10 = SHLBYTE(wq0);
w20 &= NBYTES(0x1d);
w10 ^= w20;
wq0 = w10 ^ wd0;
}
*(unative_t *)&p[d+NSIZE*0] = wp0;
*(unative_t *)&q[d+NSIZE*0] = wq0;
}
}

View File

@ -228,7 +228,7 @@ static int copy_one_extent(struct btrfs_root *root, int fd,
again:
length = size_left;
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
bytenr, &length, &multi, 0);
bytenr, &length, &multi, 0, NULL);
if (ret) {
free(inbuf);
free(outbuf);

182
volumes.c
View File

@ -35,6 +35,23 @@ struct stripe {
u64 physical;
};
/*
 * Number of parity stripes in a chunk of this type: 1 when the RAID5 bit
 * is set, otherwise 2 when the RAID6 bit is set, otherwise 0.
 */
static inline int nr_parity_stripes(struct map_lookup *map)
{
	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		return 1;
	if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		return 2;
	return 0;
}
/* Stripes of the chunk that carry data, i.e. all stripes minus parity. */
static inline int nr_data_stripes(struct map_lookup *map)
{
	const int parity = nr_parity_stripes(map);

	return map->num_stripes - parity;
}
#define is_parity_stripe(x) ( ((x) == BTRFS_RAID5_P_STRIPE) || ((x) == BTRFS_RAID6_Q_STRIPE) )
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
@ -620,11 +637,21 @@ static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
return calc_size;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
return calc_size * (num_stripes / sub_stripes);
else if (type & BTRFS_BLOCK_GROUP_RAID5)
return calc_size * (num_stripes - 1);
else if (type & BTRFS_BLOCK_GROUP_RAID6)
return calc_size * (num_stripes - 2);
else
return calc_size * num_stripes;
}
/*
 * Stripe length to use for a RAID5/6 chunk.  Both arguments are currently
 * ignored and a fixed 64KiB is returned.
 */
static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
{
	/* TODO, add a way to store the preferred stripe size */
	const u32 fixed_stripe_len = 64 * 1024;

	return fixed_stripe_len;
}
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 *start,
u64 *num_bytes, u64 type)
@ -661,6 +688,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
}
if (type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_DUP)) {
if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
@ -700,6 +728,22 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
sub_stripes = 2;
min_stripes = 4;
}
if (type & (BTRFS_BLOCK_GROUP_RAID5)) {
num_stripes = btrfs_super_num_devices(&info->super_copy);
if (num_stripes < 2)
return -ENOSPC;
min_stripes = 2;
stripe_len = find_raid56_stripe_len(num_stripes - 1,
btrfs_super_stripesize(&info->super_copy));
}
if (type & (BTRFS_BLOCK_GROUP_RAID6)) {
num_stripes = btrfs_super_num_devices(&info->super_copy);
if (num_stripes < 3)
return -ENOSPC;
min_stripes = 3;
stripe_len = find_raid56_stripe_len(num_stripes - 2,
btrfs_super_stripesize(&info->super_copy));
}
/* we don't want a chunk larger than 10% of the FS */
percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
@ -976,6 +1020,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
ret = map->sub_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
ret = 3;
else
ret = 1;
return ret;
@ -1015,6 +1063,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 bytenr;
u64 length;
u64 stripe_nr;
u64 rmap_len;
int i, j, nr = 0;
ce = find_first_cache_extent(&map_tree->cache_tree, chunk_start);
@ -1022,10 +1071,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
map = container_of(ce, struct map_lookup, ce);
length = ce->size;
rmap_len = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
length = ce->size / (map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
length = ce->size / map->num_stripes;
else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6)) {
length = ce->size / nr_data_stripes(map);
rmap_len = map->stripe_len * nr_data_stripes(map);
}
buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
@ -1044,8 +1099,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
map->sub_stripes;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = stripe_nr * map->num_stripes + i;
}
bytenr = ce->start + stripe_nr * map->stripe_len;
} /* else if RAID[56], multiply by nr_data_stripes().
* Alternatively, just use rmap_len below instead of
* map->stripe_len */
bytenr = ce->start + stripe_nr * rmap_len;
for (j = 0; j < nr; j++) {
if (buf[j] == bytenr)
break;
@ -1056,28 +1114,60 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
*logical = buf;
*naddrs = nr;
*stripe_len = map->stripe_len;
*stripe_len = rmap_len;
return 0;
}
/*
 * Ordering predicate for sort_parity_stripes(): non-zero when @a is
 * numerically greater than @b, i.e. when the pair is out of order.
 */
static inline int parity_smaller(u64 a, u64 b)
{
	return b < a;
}
/*
 * Bubble-sort the stripes of @bbio together with @raid_map into ascending
 * raid_map order.  The P and Q markers are the two largest u64 values, so
 * the parity/syndrome stripes end up last.
 */
static void sort_parity_stripes(struct btrfs_multi_bio *bbio, u64 *raid_map)
{
	int swapped;

	do {
		int pos;

		swapped = 0;
		for (pos = 0; pos < bbio->num_stripes - 1; pos++) {
			struct btrfs_bio_stripe saved_stripe;
			u64 saved_logical;

			if (!parity_smaller(raid_map[pos], raid_map[pos + 1]))
				continue;

			/* swap neighbours in both arrays to keep them paired */
			saved_stripe = bbio->stripes[pos];
			saved_logical = raid_map[pos];
			bbio->stripes[pos] = bbio->stripes[pos + 1];
			raid_map[pos] = raid_map[pos + 1];
			bbio->stripes[pos + 1] = saved_stripe;
			raid_map[pos + 1] = saved_logical;
			swapped = 1;
		}
	} while (swapped);
}
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
struct btrfs_multi_bio **multi_ret, int mirror_num)
struct btrfs_multi_bio **multi_ret, int mirror_num,
u64 **raid_map_ret)
{
return __btrfs_map_block(map_tree, rw, logical, length, NULL,
multi_ret, mirror_num);
multi_ret, mirror_num, raid_map_ret);
}
int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length, u64 *type,
struct btrfs_multi_bio **multi_ret, int mirror_num)
struct btrfs_multi_bio **multi_ret, int mirror_num,
u64 **raid_map_ret)
{
struct cache_extent *ce;
struct map_lookup *map;
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
u64 *raid_map = NULL;
int stripes_allocated = 8;
int stripes_required = 1;
int stripe_index;
@ -1117,10 +1207,24 @@ again:
stripes_required = map->sub_stripes;
}
}
if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)
&& multi_ret && ((rw & WRITE) || mirror_num > 1) && raid_map_ret) {
/* RAID[56] write or recovery. Return all stripes */
stripes_required = map->num_stripes;
/* Only allocate the map if we've already got a large enough multi_ret */
if (stripes_allocated >= stripes_required) {
raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
if (!raid_map) {
kfree(multi);
return -ENOMEM;
}
}
}
/* if our multi bio struct is too small, back off and try again */
if (multi_ret && rw == WRITE &&
stripes_allocated < stripes_required) {
stripes_allocated = map->num_stripes;
if (multi_ret && stripes_allocated < stripes_required) {
stripes_allocated = stripes_required;
kfree(multi);
goto again;
}
@ -1138,6 +1242,7 @@ again:
stripe_offset = offset - stripe_offset;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_DUP)) {
/* we limit the length of each bio to what fits in a stripe */
@ -1176,6 +1281,59 @@ again:
multi->num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6)) {
if (raid_map) {
int i, rot;
u64 tmp;
u64 raid56_full_stripe_start;
u64 full_stripe_len = nr_data_stripes(map) * map->stripe_len;
/*
* align the start of our data stripe in the logical
* address space
*/
raid56_full_stripe_start = offset / full_stripe_len;
raid56_full_stripe_start *= full_stripe_len;
/* get the data stripe number */
stripe_nr = raid56_full_stripe_start / map->stripe_len;
stripe_nr = stripe_nr / nr_data_stripes(map);
/* Work out the disk rotation on this stripe-set */
rot = stripe_nr % map->num_stripes;
/* Fill in the logical address of each stripe */
tmp = stripe_nr * nr_data_stripes(map);
for (i = 0; i < nr_data_stripes(map); i++)
raid_map[(i+rot) % map->num_stripes] =
ce->start + (tmp + i) * map->stripe_len;
raid_map[(i+rot) % map->num_stripes] = BTRFS_RAID5_P_STRIPE;
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
raid_map[(i+rot+1) % map->num_stripes] = BTRFS_RAID6_Q_STRIPE;
*length = map->stripe_len;
stripe_index = 0;
stripe_offset = 0;
multi->num_stripes = map->num_stripes;
} else {
stripe_index = stripe_nr % nr_data_stripes(map);
stripe_nr = stripe_nr / nr_data_stripes(map);
/*
* Mirror #0 or #1 means the original data block.
* Mirror #2 is RAID5 parity block.
* Mirror #3 is RAID6 Q block.
*/
if (mirror_num > 1)
stripe_index = nr_data_stripes(map) + mirror_num - 2;
/* We distribute the parity blocks across stripes */
stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
}
} else {
/*
* after this do_div call, stripe_nr is the number of stripes
@ -1195,8 +1353,14 @@ again:
stripe_index++;
}
*multi_ret = multi;
if (type)
*type = map->type;
if (raid_map) {
sort_parity_stripes(multi, raid_map);
*raid_map_ret = raid_map;
}
out:
return 0;
}

View File

@ -135,6 +135,10 @@ struct map_lookup {
#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
#define BTRFS_RAID5_P_STRIPE ((u64)-2)
#define BTRFS_RAID6_Q_STRIPE ((u64)-1)
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 chunk_tree, u64 chunk_objectid,
@ -142,10 +146,12 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
u64 num_bytes, u64 *start);
int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length, u64 *type,
struct btrfs_multi_bio **multi_ret, int mirror_num);
struct btrfs_multi_bio **multi_ret, int mirror_num,
u64 **raid_map);
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
struct btrfs_multi_bio **multi_ret, int mirror_num);
struct btrfs_multi_bio **multi_ret, int mirror_num,
u64 **raid_map_ret);
int btrfs_next_metadata(struct btrfs_mapping_tree *map_tree, u64 *logical,
u64 *size);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,