multiple device driver (linux software raid)

Multiple Device Driver (Linux Software RAID)

Ted Baker, Andy Wang

CIS 4930 / COP 5641

The md driver

Provides virtual devices

Created from one or more independent underlying devices

The basic mechanism to support RAIDs

Redundant arrays of inexpensive disks

Common RAID levels

RAID0: Striping

RAID1: Mirroring

RAID4 (≥ 3 disks): Striped array with a parity device

RAID5 (≥ 3 disks): Striped array with distributed parity

RAID6 (≥ 4 disks): Striped array with dual redundancy information

Common RAID levels

RAID1+0: Striped array of mirrored disks

RAID0+1: Mirroring two RAID0s

RAID5+0: Striped array of RAID5s

RAID5+1: Mirroring two RAID5s

md pseudo RAID configurations

Linear: Catenates multiple disks into a single one

Multipath: A set of different interfaces to the same device (e.g., multiple disk controllers)

Faulty: A layer over a single device into which errors can be injected

RAID Creation

> mdadm --create /dev/md0 --level=1 --raid-devices=2 /dev/hd[ac]1

Create /dev/md0 as RAID1

Consisting of /dev/hda1 and /dev/hdc1

RAID Status

To check the status of RAIDs

See /proc/mdstat

Personalities : [raid1]

md0 : active raid1 sda5[0] sdb5[1]

979840 blocks [2/2] [UU]

md1 : active raid1 sda6[2] sdb6[1]

159661888 blocks [2/1] [_U]

[===>.................] recovery = 17.9%

(28697920/159661888) finish=56.4min speed=38656K/sec

unused devices: <none>

md Super Block

Each device in a RAID may have a superblock with various information

Level

UUID: 128-bit identifier that identifies an array

Some RAID Concepts

Personality: RAID level

Chunk size: Power of two ≥ 4KB

A RAID assigns chunks to disks in a round-robin fashion

Stripe: The collection of the i-th chunks on each disk forms a stripe

Parity: A chunk constructed by XORing other chunks

Synchrony

An update may involve both the data block and the parity block

Implications

A RAID may be shut down in an inconsistent state

Resynchronization may be required at startup, in the background

Reduced performance

Recovery

If the md driver detects a write error, it immediately disables that device

Continues operation on the remaining devices

Starts recreating the content if there is a spare drive

Recovery

If the md driver detects a read error

Overwrites the bad block

Reads the block again

If that fails, treats it as a write error

Recovery is a background process

Can be configured via /proc/sys/dev/raid/speed_limit_min and /proc/sys/dev/raid/speed_limit_max

Bitmap Write-Intent Logging

Records which blocks of the array may be out of sync

Speeds up resynchronization

Allows a disk to be temporarily removed and reinserted without causing an enormous recovery cost

Can spin down disks for power savings

Bitmap Write-Intent Logging

Can be stored on a separate device

Write-Behind

Certain devices in the array can be flagged as write-mostly

md will not wait for writes to write-mostly devices to complete before returning to the file system

Restriping (Reshaping)

Change the number of disks

Change the RAID levels

Not robust against failures

faulty.c

/*
 * Module entry point: hands faulty_personality to
 * register_md_personality() and returns that call's result.
 */
static int __init raid_init(void)
{
    return register_md_personality(&faulty_personality);
}

/*
 * Module exit point: withdraws faulty_personality via
 * unregister_md_personality().
 */
static void raid_exit(void)
{
    unregister_md_personality(&faulty_personality);
}

module_init(raid_init);
module_exit(raid_exit);

faulty.c

/*
 * Personality descriptor for the "faulty" level. It is the object passed to
 * register_md_personality() and maps each md callback slot onto the handler
 * implemented in this file.
 */
static struct mdk_personality faulty_personality = {
    .name          = "faulty",
    .level         = LEVEL_FAULTY,
    .owner         = THIS_MODULE,
    .make_request  = make_request,
    .run           = run,
    .stop          = stop,
    .status        = status,
    .check_reshape = reshape,
    .size          = faulty_size,
};

faulty.c

/*
 * Start a "faulty" array: allocate the per-array state, remember the member
 * device, publish the array size and apply the initial layout.
 *
 * Returns 0 on the path shown here (the error paths are elided).
 */
static int run(mddev_t *mddev)
{
    mdk_rdev_t *rdev;
    conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL);

    /* ... error handling + zero out conf ... */

    /*
     * list_for_each_entry() needs a pointer to the list head itself, not to
     * the structure that embeds it. Every pass overwrites conf->rdev, so the
     * last entry on the list is the one that is kept.
     */
    list_for_each_entry(rdev, &mddev->disks, same_set)
        conf->rdev = rdev;

    md_set_array_sectors(mddev, mddev->dev_sectors);
    mddev->private = conf;

    /* Decode mddev->new_layout into the per-mode period/counter settings. */
    reshape(mddev);

    return 0;
}

/* Per-array state of the faulty personality; run() stores it in mddev->private. */
typedef struct faulty_conf {
    int         period[Modes];    /* per-mode period, written by reshape() */
    atomic_t    counters[Modes];  /* per-mode counter, written by reshape() */
    sector_t    faults[MaxFault]; /* presumably the recorded failing sectors -- TODO confirm */
    int         modes[MaxFault];  /* presumably the mode recorded for each fault -- TODO confirm */
    int         nfaults;          /* presumably the number of recorded faults -- TODO confirm */
    mdk_rdev_t *rdev;             /* member device, assigned in run() */
} conf_t;

A field in mdk_rdev_t

list head

faulty.c

/*
 * Apply a new layout word to the array's fault configuration.
 *
 * The bits selected by ModeMask pick the mode; the bits above ModeShift
 * carry the count for that mode. The conditions and branches marked with
 * placeholder comments are elided in this excerpt.
 */
static int reshape(mddev_t *mddev)
{
    int mode = mddev->new_layout & ModeMask;
    int count = mddev->new_layout >> ModeShift;
    conf_t *conf = mddev->private;

    /* ... error checks ... */

    if (mode == /* clear something */) {
        /* clear various counters */
    } else if (mode < Modes) {
        conf->period[mode] = count;
        /* The counter is never set to 0: a zero count is bumped to 1. */
        if (!count)
            count++;
        atomic_set(&conf->counters[mode], count);
    } else ...

    return 0;
}

Total number of failure modes (e.g., transient write failure mode)

faulty.c

/*
 * Tear down a "faulty" array: free the state that run() stored in
 * mddev->private and clear the pointer so it cannot be reused after the
 * free. Always returns 0.
 */
static int stop(mddev_t *mddev)
{
    /* No cast needed here; reshape() reads the same field without one. */
    conf_t *conf = mddev->private;

    kfree(conf);
    mddev->private = NULL;

    return 0;
}

faulty.c

/*
 * Request entry point of the faulty personality.
 *
 * Decides whether the bio should fail. A bio chosen to fail is cloned; the
 * clone is sent to the member device with faulty_fail() as its completion
 * handler and 0 is returned. Otherwise the bio itself is retargeted at the
 * member device and 1 is returned, which leaves resubmission to the caller.
 *
 * Parts marked with "..." are elided in this excerpt.
 */
static int make_request(request_queue_t *q, struct bio *bio)
{
    mddev_t *mddev = q->queuedata;
    conf_t *conf = mddev->private;
    int failit = 0;

    if (bio_data_dir(bio) == WRITE) { /* data direction */
        /* ... misc cases ... */

        /* if a sector failed before, need to stay failed */
        if (check_sector(conf, bio->bi_sector,
                         bio->bi_sector + (bio->bi_size >> 9), WRITE))
            failit = 1;

        /* if the period (some predefined constant) is reached
           for a sector, record the sector and fail it */
        if (check_mode(conf, WritePersistent)) {
            add_sector(conf, bio->bi_sector, WritePersistent);
            failit = 1;
        } ...
    } else { /* failure cases for reads */
        ...
    }

    if (failit) {
        struct bio *b = bio_clone(bio, GFP_NOIO);

        b->bi_bdev = conf->rdev->bdev;
        /* faulty_fail() finds the original bio through bi_private. */
        b->bi_private = bio;
        b->bi_end_io = faulty_fail;
        generic_make_request(b);
        return 0;
    } else {
        /* Point the bio at the member device; the caller resubmits it. */
        bio->bi_bdev = conf->rdev->bdev;
        return 1;
    }
}

To the queue of this device,

initialized in md.c from the disk device inode

Make bio point to the actual device, and let the main block layer submit the IO and resolve the

recursion

faulty.c

/*
 * Completion handler installed by make_request() on the clone of a bio that
 * is meant to fail.
 *
 * bio is the clone and bio->bi_private is the original request. The clone's
 * size and sector are copied back to the original, the clone is dropped with
 * bio_put(), and the original is finished through bio_io_error().
 *
 * Returns 0; the error argument is not used.
 */
static int faulty_fail(struct bio *bio, int error)
{
    struct bio *b = bio->bi_private;

    b->bi_size = bio->bi_size;
    b->bi_sector = bio->bi_sector;

    bio_put(bio);
    bio_io_error(b);

    /* Declared int, so falling off the end without a value is a defect. */
    return 0;
}

blk-core.c

A file system eventually calls __generic_make_request()

/*
 * Excerpt: submit a bio to the queue of the device it currently targets.
 * Parts marked with "..." are elided.
 */
static inline void __generic_make_request(struct bio *bio) {

...

/*
 * Keep dispatching while the queue's make_request_fn returns non-zero.
 * The queue is looked up again from bio->bi_bdev on every pass, so a
 * handler that changes bi_bdev and returns 1 gets the bio sent to the new
 * target's queue on the next pass.
 */
do {

...

q = bdev_get_queue(bio->bi_bdev);

.../* check errors */

ret = q->make_request_fn(q, bio);

} while (ret);

}

linear.c

/*
 * Module entry point: hands linear_personality to
 * register_md_personality() and returns that call's result.
 */
static int __init linear_init(void)
{
    return register_md_personality(&linear_personality);
}

/*
 * Module exit point: withdraws linear_personality via
 * unregister_md_personality().
 */
static void linear_exit(void)
{
    unregister_md_personality(&linear_personality);
}

module_init(linear_init);
module_exit(linear_exit);

linear.c

/*
 * Personality descriptor for the "linear" level. It is the object passed to
 * register_md_personality() and maps each md callback slot onto the handler
 * implemented in this file.
 */
static struct mdk_personality linear_personality = {
    .name         = "linear",
    .level        = LEVEL_LINEAR,
    .owner        = THIS_MODULE,
    .make_request = linear_make_request,
    .run          = linear_run,
    .stop         = linear_stop,
    .status       = linear_status, /* for proc */
    .hot_add_disk = linear_add,
    .size         = linear_size,
};

linear.c

/*
 * Per-array state of the linear personality.
 *
 * disks[] is a zero-length trailing array whose real extent is decided when
 * the structure is allocated, so it has to be the last member: anything
 * declared after it would occupy the same storage as disks[0].
 */
typedef struct linear_private_data {
    sector_t        array_sectors; /* passed to md_set_array_sectors() by linear_run() */
    struct rcu_head rcu;
    dev_info_t      disks[0];      /* one entry per member device; must stay last */
} linear_conf_t;

/*
 * Start a linear array: build the per-array state, publish the array size
 * and install the queue callbacks of the linear personality.
 *
 * Returns 1 when linear_conf() yields no configuration, 0 otherwise.
 * Parts marked with "..." are elided in this excerpt.
 */
static int linear_run(mddev_t *mddev)
{
    linear_conf_t *conf;

    /* initialize conf */
    conf = linear_conf(mddev, mddev->raid_disks);
    if (!conf)
        return 1;
    mddev->private = conf;
    md_set_array_sectors(mddev, conf->array_sectors);

    /* ... */
    /* initialize conf->disks[i].end_sector */
    /* ... */

    /* determines whether two bio can be merged */
    /* overrides the default merge_bvec function */
    blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);

    /* queues are first plugged to build up the queue length, then
     * unplugged to release requests to devices */
    mddev->queue->unplug_fn = linear_unplug;

    /* disable prefetching when the device is congested */
    mddev->queue->backing_dev_info.congested_fn = linear_congested;
    mddev->queue->backing_dev_info.congested_data = mddev;

    md_integrity_register(mddev);

    return 0;
}

linear.c

/*
 * Tear down a linear array: wait for outstanding RCU callbacks and queue
 * activity, then free the per-array state. Always returns 0.
 */
static int linear_stop(mddev_t *mddev)
{
    linear_conf_t *conf = mddev->private;

    /* the unplug fn references 'conf' */
    rcu_barrier();
    blk_sync_queue(mddev->queue);

    kfree(conf);
    /* Clear the pointer after the free, as the faulty personality's stop() does. */
    mddev->private = NULL;

    return 0;
}

linear.c

/*
 * Request entry point of the linear personality.
 *
 * Finds the member device that holds bio->bi_sector. A bio that fits inside
 * that device is retargeted at it, its sector is translated into the
 * device's own numbering, and 1 is returned so the caller resubmits it. A
 * bio that runs past the end of the device is split in two; each half is
 * processed recursively and submitted, and 0 is returned.
 *
 * Parts marked with "..." are elided in this excerpt.
 */
static int linear_make_request(request_queue_t *q, struct bio *bio)
{
    const int rw = bio_data_dir(bio);
    mddev_t *mddev = q->queuedata;
    dev_info_t *tmp_dev;
    sector_t start_sector;

    /* ... check for errors and update statistics ... */

    rcu_read_lock();

    tmp_dev = which_dev(mddev, bio->bi_sector);
    /* First array sector on this device: its end minus its length. */
    start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;

    /* ... more error checks ... */

    if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
                 tmp_dev->end_sector)) {
        /* This bio crosses a device boundary, so we have to
         * split it. */
        struct bio_pair *bp;
        sector_t end_sector = tmp_dev->end_sector;

        rcu_read_unlock();

        /* The first half ends exactly at the device boundary. */
        bp = bio_split(bio, end_sector - bio->bi_sector);

        if (linear_make_request(q, &bp->bio1)) /* recursion!?# */
            generic_make_request(&bp->bio1);
        if (linear_make_request(q, &bp->bio2)) /* recursion#!% */
            generic_make_request(&bp->bio2);

        bio_pair_release(bp); /* remove bio hazard */
        return 0;
    }

    /* Point the bio at the member device instead of the linear device. */
    bio->bi_bdev = tmp_dev->rdev->bdev;
    /* Translate the array sector into a sector on that member device. */
    bio->bi_sector = bio->bi_sector - start_sector +
                     tmp_dev->rdev->data_offset;

    rcu_read_unlock();

    return 1;
}

Again, let the main block layer submit the IO and resolve the recursion

Points to the specific device instead of the

linear device

Translates the virtual sector number to the

physical sector number for the specific device

multiple device driver (linux software raid)

Documents

init raid

disksstriped array

parity blockimplicationsa

raid levelsnot robust

static void raid

kba raid assigns chunks

personality faulty

write errorrecovery