diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 10:01:29 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 10:01:29 -0700 |
commit | 56b59b429b4c26e5e730bc8c3d837de9f7d0a966 (patch) | |
tree | 191bf87e438a3985ccb7e3c5382fab8d31f94edb /drivers/block | |
parent | 9a7259d5c8978bbeb5fdcf64b168f8470d8208a6 (diff) | |
parent | c666601a935b94cc0f3310339411b6940de751ba (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates for 3.4-rc1 from Sage Weil:
"Alex has been busy. There are a range of rbd and libceph cleanups,
especially surrounding device setup and teardown, and a few critical
fixes in that code. There are more cleanups in the messenger code,
virtual xattrs, a fix for CRC calculation/checks, and lots of other
miscellaneous stuff.
There's a patch from Amon Ott to make inos behave a bit better on
32-bit boxes, some decode check fixes from Xi Wang, a network
throttling fix from Jim Schutt, and a couple of RBD fixes from Josh
Durgin.
No new functionality, just a lot of cleanup and bug fixing."
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (65 commits)
rbd: move snap_rwsem to the device, rename to header_rwsem
ceph: fix three bugs, two in ceph_vxattrcb_file_layout()
libceph: isolate kmap() call in write_partial_msg_pages()
libceph: rename "page_shift" variable to something sensible
libceph: get rid of zero_page_address
libceph: only call kernel_sendpage() via helper
libceph: use kernel_sendpage() for sending zeroes
libceph: fix inverted crc option logic
libceph: some simple changes
libceph: small refactor in write_partial_kvec()
libceph: do crc calculations outside loop
libceph: separate CRC calculation from byte swapping
libceph: use "do" in CRC-related Boolean variables
ceph: ensure Boolean options support both senses
libceph: a few small changes
libceph: make ceph_tcp_connect() return int
libceph: encapsulate some messenger cleanup code
libceph: make ceph_msgr_wq private
libceph: encapsulate connection kvec operations
libceph: move prepare_write_banner()
...
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/rbd.c | 730 | ||||
-rw-r--r-- | drivers/block/rbd_types.h | 4 |
2 files changed, 448 insertions, 286 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a6278e7e61a..013c7a549fb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -41,19 +41,35 @@ #include "rbd_types.h" -#define DRV_NAME "rbd" -#define DRV_NAME_LONG "rbd (rados block device)" +/* + * The basic unit of block I/O is a sector. It is interpreted in a + * number of contexts in Linux (blk, bio, genhd), but the default is + * universally 512 bytes. These symbols are just slightly more + * meaningful than the bare numbers they represent. + */ +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1ULL << SECTOR_SHIFT) + +#define RBD_DRV_NAME "rbd" +#define RBD_DRV_NAME_LONG "rbd (rados block device)" #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ -#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX)) +#define RBD_MAX_MD_NAME_LEN (RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX)) #define RBD_MAX_POOL_NAME_LEN 64 #define RBD_MAX_SNAP_NAME_LEN 32 #define RBD_MAX_OPT_LEN 1024 #define RBD_SNAP_HEAD_NAME "-" +/* + * An RBD device name will be "rbd#", where the "rbd" comes from + * RBD_DRV_NAME above, and # is a unique integer identifier. + * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big + * enough to hold all possible device names. + */ #define DEV_NAME_LEN 32 +#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) #define RBD_NOTIFY_TIMEOUT_DEFAULT 10 @@ -66,7 +82,6 @@ struct rbd_image_header { __u8 obj_order; __u8 crypt_type; __u8 comp_type; - struct rw_semaphore snap_rwsem; struct ceph_snap_context *snapc; size_t snap_names_len; u64 snap_seq; @@ -83,7 +98,7 @@ struct rbd_options { }; /* - * an instance of the client. multiple devices may share a client. + * an instance of the client. multiple devices may share an rbd client. 
*/ struct rbd_client { struct ceph_client *client; @@ -92,20 +107,9 @@ struct rbd_client { struct list_head node; }; -struct rbd_req_coll; - /* - * a single io request + * a request completion status */ -struct rbd_request { - struct request *rq; /* blk layer request */ - struct bio *bio; /* cloned bio */ - struct page **pages; /* list of used pages */ - u64 len; - int coll_index; - struct rbd_req_coll *coll; -}; - struct rbd_req_status { int done; int rc; @@ -122,6 +126,18 @@ struct rbd_req_coll { struct rbd_req_status status[0]; }; +/* + * a single io request + */ +struct rbd_request { + struct request *rq; /* blk layer request */ + struct bio *bio; /* cloned bio */ + struct page **pages; /* list of used pages */ + u64 len; + int coll_index; + struct rbd_req_coll *coll; +}; + struct rbd_snap { struct device dev; const char *name; @@ -140,7 +156,6 @@ struct rbd_device { struct gendisk *disk; /* blkdev's gendisk and rq */ struct request_queue *q; - struct ceph_client *client; struct rbd_client *rbd_client; char name[DEV_NAME_LEN]; /* blkdev name, e.g. 
rbd3 */ @@ -157,6 +172,8 @@ struct rbd_device { struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; + /* protects updating the header */ + struct rw_semaphore header_rwsem; char snap_name[RBD_MAX_SNAP_NAME_LEN]; u32 cur_snap; /* index+1 of current snapshot within snap context 0 - for the head */ @@ -171,15 +188,13 @@ struct rbd_device { struct device dev; }; -static struct bus_type rbd_bus_type = { - .name = "rbd", -}; - -static spinlock_t node_lock; /* protects client get/put */ - static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ + static LIST_HEAD(rbd_dev_list); /* devices */ -static LIST_HEAD(rbd_client_list); /* clients */ +static DEFINE_SPINLOCK(rbd_dev_list_lock); + +static LIST_HEAD(rbd_client_list); /* clients */ +static DEFINE_SPINLOCK(rbd_client_list_lock); static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); static void rbd_dev_release(struct device *dev); @@ -190,12 +205,32 @@ static ssize_t rbd_snap_add(struct device *dev, static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, struct rbd_snap *snap); +static ssize_t rbd_add(struct bus_type *bus, const char *buf, + size_t count); +static ssize_t rbd_remove(struct bus_type *bus, const char *buf, + size_t count); -static struct rbd_device *dev_to_rbd(struct device *dev) +static struct bus_attribute rbd_bus_attrs[] = { + __ATTR(add, S_IWUSR, NULL, rbd_add), + __ATTR(remove, S_IWUSR, NULL, rbd_remove), + __ATTR_NULL +}; + +static struct bus_type rbd_bus_type = { + .name = "rbd", + .bus_attrs = rbd_bus_attrs, +}; + +static void rbd_root_dev_release(struct device *dev) { - return container_of(dev, struct rbd_device, dev); } +static struct device rbd_root_dev = { + .init_name = "rbd", + .release = rbd_root_dev_release, +}; + + static struct device *rbd_get_dev(struct rbd_device *rbd_dev) { return get_device(&rbd_dev->dev); @@ -210,8 +245,7 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev); static int rbd_open(struct block_device 
*bdev, fmode_t mode) { - struct gendisk *disk = bdev->bd_disk; - struct rbd_device *rbd_dev = disk->private_data; + struct rbd_device *rbd_dev = bdev->bd_disk->private_data; rbd_get_dev(rbd_dev); @@ -256,9 +290,11 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, kref_init(&rbdc->kref); INIT_LIST_HEAD(&rbdc->node); + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + rbdc->client = ceph_create_client(opt, rbdc, 0, 0); if (IS_ERR(rbdc->client)) - goto out_rbdc; + goto out_mutex; opt = NULL; /* Now rbdc->client is responsible for opt */ ret = ceph_open_session(rbdc->client); @@ -267,16 +303,19 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, rbdc->rbd_opts = rbd_opts; - spin_lock(&node_lock); + spin_lock(&rbd_client_list_lock); list_add_tail(&rbdc->node, &rbd_client_list); - spin_unlock(&node_lock); + spin_unlock(&rbd_client_list_lock); + + mutex_unlock(&ctl_mutex); dout("rbd_client_create created %p\n", rbdc); return rbdc; out_err: ceph_destroy_client(rbdc->client); -out_rbdc: +out_mutex: + mutex_unlock(&ctl_mutex); kfree(rbdc); out_opt: if (opt) @@ -324,7 +363,7 @@ static int parse_rbd_opts_token(char *c, void *private) substring_t argstr[MAX_OPT_ARGS]; int token, intval, ret; - token = match_token((char *)c, rbdopt_tokens, argstr); + token = match_token(c, rbdopt_tokens, argstr); if (token < 0) return -EINVAL; @@ -357,58 +396,54 @@ static int parse_rbd_opts_token(char *c, void *private) * Get a ceph client with specific addr and configuration, if one does * not exist create it. 
*/ -static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, - char *options) +static struct rbd_client *rbd_get_client(const char *mon_addr, + size_t mon_addr_len, + char *options) { struct rbd_client *rbdc; struct ceph_options *opt; - int ret; struct rbd_options *rbd_opts; rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); if (!rbd_opts) - return -ENOMEM; + return ERR_PTR(-ENOMEM); rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; - ret = ceph_parse_options(&opt, options, mon_addr, - mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); - if (ret < 0) - goto done_err; + opt = ceph_parse_options(options, mon_addr, + mon_addr + mon_addr_len, + parse_rbd_opts_token, rbd_opts); + if (IS_ERR(opt)) { + kfree(rbd_opts); + return ERR_CAST(opt); + } - spin_lock(&node_lock); + spin_lock(&rbd_client_list_lock); rbdc = __rbd_client_find(opt); if (rbdc) { + /* using an existing client */ + kref_get(&rbdc->kref); + spin_unlock(&rbd_client_list_lock); + ceph_destroy_options(opt); kfree(rbd_opts); - /* using an existing client */ - kref_get(&rbdc->kref); - rbd_dev->rbd_client = rbdc; - rbd_dev->client = rbdc->client; - spin_unlock(&node_lock); - return 0; + return rbdc; } - spin_unlock(&node_lock); + spin_unlock(&rbd_client_list_lock); rbdc = rbd_client_create(opt, rbd_opts); - if (IS_ERR(rbdc)) { - ret = PTR_ERR(rbdc); - goto done_err; - } - rbd_dev->rbd_client = rbdc; - rbd_dev->client = rbdc->client; - return 0; -done_err: - kfree(rbd_opts); - return ret; + if (IS_ERR(rbdc)) + kfree(rbd_opts); + + return rbdc; } /* * Destroy ceph client * - * Caller must hold node_lock. + * Caller must hold rbd_client_list_lock. 
*/ static void rbd_client_release(struct kref *kref) { @@ -428,11 +463,10 @@ static void rbd_client_release(struct kref *kref) */ static void rbd_put_client(struct rbd_device *rbd_dev) { - spin_lock(&node_lock); + spin_lock(&rbd_client_list_lock); kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); - spin_unlock(&node_lock); + spin_unlock(&rbd_client_list_lock); rbd_dev->rbd_client = NULL; - rbd_dev->client = NULL; } /* @@ -457,21 +491,19 @@ static int rbd_header_from_disk(struct rbd_image_header *header, gfp_t gfp_flags) { int i; - u32 snap_count = le32_to_cpu(ondisk->snap_count); - int ret = -ENOMEM; + u32 snap_count; - if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) { + if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) return -ENXIO; - } - init_rwsem(&header->snap_rwsem); - header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); + snap_count = le32_to_cpu(ondisk->snap_count); header->snapc = kmalloc(sizeof(struct ceph_snap_context) + - snap_count * - sizeof(struct rbd_image_snap_ondisk), + snap_count * sizeof (*ondisk), gfp_flags); if (!header->snapc) return -ENOMEM; + + header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); if (snap_count) { header->snap_names = kmalloc(header->snap_names_len, GFP_KERNEL); @@ -498,8 +530,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snapc->num_snaps = snap_count; header->total_snaps = snap_count; - if (snap_count && - allocated_snaps == snap_count) { + if (snap_count && allocated_snaps == snap_count) { for (i = 0; i < snap_count; i++) { header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); @@ -518,7 +549,7 @@ err_names: kfree(header->snap_names); err_snapc: kfree(header->snapc); - return ret; + return -ENOMEM; } static int snap_index(struct rbd_image_header *header, int snap_num) @@ -542,35 +573,34 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name, int i; char *p = header->snap_names; - for (i = 0; i < 
header->total_snaps; i++, p += strlen(p) + 1) { - if (strcmp(snap_name, p) == 0) - break; - } - if (i == header->total_snaps) - return -ENOENT; - if (seq) - *seq = header->snapc->snaps[i]; + for (i = 0; i < header->total_snaps; i++) { + if (!strcmp(snap_name, p)) { - if (size) - *size = header->snap_sizes[i]; + /* Found it. Pass back its id and/or size */ - return i; + if (seq) + *seq = header->snapc->snaps[i]; + if (size) + *size = header->snap_sizes[i]; + return i; + } + p += strlen(p) + 1; /* Skip ahead to the next name */ + } + return -ENOENT; } -static int rbd_header_set_snap(struct rbd_device *dev, - const char *snap_name, - u64 *size) +static int rbd_header_set_snap(struct rbd_device *dev, u64 *size) { struct rbd_image_header *header = &dev->header; struct ceph_snap_context *snapc = header->snapc; int ret = -ENOENT; - down_write(&header->snap_rwsem); + BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME)); - if (!snap_name || - !*snap_name || - strcmp(snap_name, "-") == 0 || - strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) { + down_write(&dev->header_rwsem); + + if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME, + sizeof (RBD_SNAP_HEAD_NAME))) { if (header->total_snaps) snapc->seq = header->snap_seq; else @@ -580,7 +610,7 @@ static int rbd_header_set_snap(struct rbd_device *dev, if (size) *size = header->image_size; } else { - ret = snap_by_name(header, snap_name, &snapc->seq, size); + ret = snap_by_name(header, dev->snap_name, &snapc->seq, size); if (ret < 0) goto done; @@ -590,7 +620,7 @@ static int rbd_header_set_snap(struct rbd_device *dev, ret = 0; done: - up_write(&header->snap_rwsem); + up_write(&dev->header_rwsem); return ret; } @@ -717,7 +747,7 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, /* split the bio. 
We'll release it either in the next call, or it will have to be released outside */ - bp = bio_split(old_chain, (len - total) / 512ULL); + bp = bio_split(old_chain, (len - total) / SECTOR_SIZE); if (!bp) goto err_out; @@ -857,7 +887,7 @@ static int rbd_do_request(struct request *rq, struct timespec mtime = CURRENT_TIME; struct rbd_request *req_data; struct ceph_osd_request_head *reqhead; - struct rbd_image_header *header = &dev->header; + struct ceph_osd_client *osdc; req_data = kzalloc(sizeof(*req_data), GFP_NOIO); if (!req_data) { @@ -874,15 +904,13 @@ static int rbd_do_request(struct request *rq, dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); - down_read(&header->snap_rwsem); + down_read(&dev->header_rwsem); - req = ceph_osdc_alloc_request(&dev->client->osdc, flags, - snapc, - ops, - false, - GFP_NOIO, pages, bio); + osdc = &dev->rbd_client->client->osdc; + req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, + false, GFP_NOIO, pages, bio); if (!req) { - up_read(&header->snap_rwsem); + up_read(&dev->header_rwsem); ret = -ENOMEM; goto done_pages; } @@ -909,27 +937,27 @@ static int rbd_do_request(struct request *rq, layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_pg_preferred = cpu_to_le32(-1); layout->fl_pg_pool = cpu_to_le32(dev->poolid); - ceph_calc_raw_layout(&dev->client->osdc, layout, snapid, - ofs, &len, &bno, req, ops); + ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, + req, ops); ceph_osdc_build_request(req, ofs, &len, ops, snapc, &mtime, req->r_oid, req->r_oid_len); - up_read(&header->snap_rwsem); + up_read(&dev->header_rwsem); if (linger_req) { - ceph_osdc_set_request_linger(&dev->client->osdc, req); + ceph_osdc_set_request_linger(osdc, req); *linger_req = req; } - ret = ceph_osdc_start_request(&dev->client->osdc, req, false); + ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) goto done_err; if (!rbd_cb) { - ret = ceph_osdc_wait_request(&dev->client->osdc, req); + ret = 
ceph_osdc_wait_request(osdc, req); if (ver) *ver = le64_to_cpu(req->r_reassert_version.version); dout("reassert_ver=%lld\n", @@ -1213,8 +1241,8 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) rc = __rbd_update_snaps(dev); mutex_unlock(&ctl_mutex); if (rc) - pr_warning(DRV_NAME "%d got notification but failed to update" - " snaps: %d\n", dev->major, rc); + pr_warning(RBD_DRV_NAME "%d got notification but failed to " + " update snaps: %d\n", dev->major, rc); rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); } @@ -1227,7 +1255,7 @@ static int rbd_req_sync_watch(struct rbd_device *dev, u64 ver) { struct ceph_osd_req_op *ops; - struct ceph_osd_client *osdc = &dev->client->osdc; + struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc; int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); if (ret < 0) @@ -1314,7 +1342,7 @@ static int rbd_req_sync_notify(struct rbd_device *dev, const char *obj) { struct ceph_osd_req_op *ops; - struct ceph_osd_client *osdc = &dev->client->osdc; + struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc; struct ceph_osd_event *event; struct rbd_notify_info info; int payload_len = sizeof(u32) + sizeof(u32); @@ -1421,9 +1449,7 @@ static void rbd_rq_fn(struct request_queue *q) struct request *rq; struct bio_pair *bp = NULL; - rq = blk_fetch_request(q); - - while (1) { + while ((rq = blk_fetch_request(q))) { struct bio *bio; struct bio *rq_bio, *next_bio = NULL; bool do_write; @@ -1441,32 +1467,32 @@ static void rbd_rq_fn(struct request_queue *q) /* filter out block requests we don't understand */ if ((rq->cmd_type != REQ_TYPE_FS)) { __blk_end_request_all(rq, 0); - goto next; + continue; } /* deduce our operation (read, write) */ do_write = (rq_data_dir(rq) == WRITE); size = blk_rq_bytes(rq); - ofs = blk_rq_pos(rq) * 512ULL; + ofs = blk_rq_pos(rq) * SECTOR_SIZE; rq_bio = rq->bio; if (do_write && rbd_dev->read_only) { __blk_end_request_all(rq, -EROFS); - goto next; + continue; } 
spin_unlock_irq(q->queue_lock); dout("%s 0x%x bytes at 0x%llx\n", do_write ? "write" : "read", - size, blk_rq_pos(rq) * 512ULL); + size, blk_rq_pos(rq) * SECTOR_SIZE); num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); coll = rbd_alloc_coll(num_segs); if (!coll) { spin_lock_irq(q->queue_lock); __blk_end_request_all(rq, -ENOMEM); - goto next; + continue; } do { @@ -1512,8 +1538,6 @@ next_seg: if (bp) bio_pair_release(bp); spin_lock_irq(q->queue_lock); -next: - rq = blk_fetch_request(q); } } @@ -1526,13 +1550,17 @@ static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, struct bio_vec *bvec) { struct rbd_device *rbd_dev = q->queuedata; - unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9); - sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); - unsigned int bio_sectors = bmd->bi_size >> 9; + unsigned int chunk_sectors; + sector_t sector; + unsigned int bio_sectors; int max; + chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); + sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); + bio_sectors = bmd->bi_size >> SECTOR_SHIFT; + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) - + bio_sectors)) << 9; + + bio_sectors)) << SECTOR_SHIFT; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ if (max <= bvec->bv_len && bio_sectors == 0) @@ -1565,15 +1593,16 @@ static int rbd_read_header(struct rbd_device *rbd_dev, ssize_t rc; struct rbd_image_header_ondisk *dh; int snap_count = 0; - u64 snap_names_len = 0; u64 ver; + size_t len; + /* + * First reads the fixed-size header to determine the number + * of snapshots, then re-reads it, along with all snapshot + * records as well as their stored names. 
+ */ + len = sizeof (*dh); while (1) { - int len = sizeof(*dh) + - snap_count * sizeof(struct rbd_image_snap_ondisk) + - snap_names_len; - - rc = -ENOMEM; dh = kmalloc(len, GFP_KERNEL); if (!dh) return -ENOMEM; @@ -1588,21 +1617,22 @@ static int rbd_read_header(struct rbd_device *rbd_dev, rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL); if (rc < 0) { - if (rc == -ENXIO) { + if (rc == -ENXIO) pr_warning("unrecognized header format" " for image %s", rbd_dev->obj); - } goto out_dh; } - if (snap_count != header->total_snaps) { - snap_count = header->total_snaps; - snap_names_len = header->snap_names_len; - rbd_header_free(header); - kfree(dh); - continue; - } - break; + if (snap_count == header->total_snaps) + break; + + snap_count = header->total_snaps; + len = sizeof (*dh) + + snap_count * sizeof(struct rbd_image_snap_ondisk) + + header->snap_names_len; + + rbd_header_free(header); + kfree(dh); } header->obj_version = ver; @@ -1623,13 +1653,14 @@ static int rbd_header_add_snap(struct rbd_device *dev, int ret; void *data, *p, *e; u64 ver; + struct ceph_mon_client *monc; /* we should create a snapshot only if we're pointing at the head */ if (dev->cur_snap) return -EINVAL; - ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid, - &new_snapid); + monc = &dev->rbd_client->client->monc; + ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid); dout("created snapid=%lld\n", new_snapid); if (ret < 0) return ret; @@ -1684,9 +1715,9 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev) return ret; /* resized? 
*/ - set_capacity(rbd_dev->disk, h.image_size / 512ULL); + set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE); - down_write(&rbd_dev->header.snap_rwsem); + down_write(&rbd_dev->header_rwsem); snap_seq = rbd_dev->header.snapc->seq; if (rbd_dev->header.total_snaps && @@ -1711,7 +1742,7 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev) ret = __rbd_init_snaps_header(rbd_dev); - up_write(&rbd_dev->header.snap_rwsem); + up_write(&rbd_dev->header_rwsem); return ret; } @@ -1721,6 +1752,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) struct gendisk *disk; struct request_queue *q; int rc; + u64 segment_size; u64 total_size = 0; /* contact OSD, request size info about the object being mapped */ @@ -1733,7 +1765,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (rc) return rc; - rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size); + rc = rbd_header_set_snap(rbd_dev, &total_size); if (rc) return rc; @@ -1743,7 +1775,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (!disk) goto out; - snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d", + snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", rbd_dev->id); disk->major = rbd_dev->major; disk->first_minor = 0; @@ -1756,11 +1788,15 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (!q) goto out_disk; + /* We use the default size, but let's be explicit about it. 
*/ + blk_queue_physical_block_size(q, SECTOR_SIZE); + /* set io sizes to object size */ - blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL); - blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header)); - blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header)); - blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header)); + segment_size = rbd_obj_bytes(&rbd_dev->header); + blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); + blk_queue_max_segment_size(q, segment_size); + blk_queue_io_min(q, segment_size); + blk_queue_io_opt(q, segment_size); blk_queue_merge_bvec(q, rbd_merge_bvec); disk->queue = q; @@ -1771,7 +1807,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->q = q; /* finally, announce the disk to the world */ - set_capacity(disk, total_size / 512ULL); + set_capacity(disk, total_size / SECTOR_SIZE); add_disk(disk); pr_info("%s: added with size 0x%llx\n", @@ -1788,10 +1824,15 @@ out: sysfs */ +static struct rbd_device *dev_to_rbd_dev(struct device *dev) +{ + return container_of(dev, struct rbd_device, dev); +} + static ssize_t rbd_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size); } @@ -1799,7 +1840,7 @@ static ssize_t rbd_size_show(struct device *dev, static ssize_t rbd_major_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%d\n", rbd_dev->major); } @@ -1807,15 +1848,16 @@ static ssize_t rbd_major_show(struct device *dev, static ssize_t rbd_client_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "client%lld\n", 
ceph_client_id(rbd_dev->client)); + return sprintf(buf, "client%lld\n", + ceph_client_id(rbd_dev->rbd_client->client)); } static ssize_t rbd_pool_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%s\n", rbd_dev->pool_name); } @@ -1823,7 +1865,7 @@ static ssize_t rbd_pool_show(struct device *dev, static ssize_t rbd_name_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%s\n", rbd_dev->obj); } @@ -1832,7 +1874,7 @@ static ssize_t rbd_snap_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%s\n", rbd_dev->snap_name); } @@ -1842,7 +1884,7 @@ static ssize_t rbd_image_refresh(struct device *dev, const char *buf, size_t size) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); int rc; int ret = size; @@ -1907,7 +1949,7 @@ static ssize_t rbd_snap_size_show(struct device *dev, { struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - return sprintf(buf, "%lld\n", (long long)snap->size); + return sprintf(buf, "%zd\n", snap->size); } static ssize_t rbd_snap_id_show(struct device *dev, @@ -1916,7 +1958,7 @@ static ssize_t rbd_snap_id_show(struct device *dev, { struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - return sprintf(buf, "%lld\n", (long long)snap->id); + return sprintf(buf, "%llu\n", (unsigned long long) snap->id); } static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); @@ -2088,19 +2130,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) return 0; } - -static void rbd_root_dev_release(struct device *dev) -{ -} - -static struct device rbd_root_dev = { - 
.init_name = "rbd", - .release = rbd_root_dev_release, -}; - static int rbd_bus_add_dev(struct rbd_device *rbd_dev) { - int ret = -ENOMEM; + int ret; struct device *dev; struct rbd_snap *snap; @@ -2114,7 +2146,7 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) dev_set_name(dev, "%d", rbd_dev->id); ret = device_register(dev); if (ret < 0) - goto done_free; + goto out; list_for_each_entry(snap, &rbd_dev->snaps, node) { ret = rbd_register_snap_dev(rbd_dev, snap, @@ -2122,10 +2154,7 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) if (ret < 0) break; } - - mutex_unlock(&ctl_mutex); - return 0; -done_free: +out: mutex_unlock(&ctl_mutex); return ret; } @@ -2154,104 +2183,250 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) return ret; } +static atomic64_t rbd_id_max = ATOMIC64_INIT(0); + +/* + * Get a unique rbd identifier for the given new rbd_dev, and add + * the rbd_dev to the global list. The minimum rbd id is 1. + */ +static void rbd_id_get(struct rbd_device *rbd_dev) +{ + rbd_dev->id = atomic64_inc_return(&rbd_id_max); + + spin_lock(&rbd_dev_list_lock); + list_add_tail(&rbd_dev->node, &rbd_dev_list); + spin_unlock(&rbd_dev_list_lock); +} + +/* + * Remove an rbd_dev from the global list, and record that its + * identifier is no longer in use. + */ +static void rbd_id_put(struct rbd_device *rbd_dev) +{ + struct list_head *tmp; + int rbd_id = rbd_dev->id; + int max_id; + + BUG_ON(rbd_id < 1); + + spin_lock(&rbd_dev_list_lock); + list_del_init(&rbd_dev->node); + + /* + * If the id being "put" is not the current maximum, there + * is nothing special we need to do. + */ + if (rbd_id != atomic64_read(&rbd_id_max)) { + spin_unlock(&rbd_dev_list_lock); + return; + } + + /* + * We need to update the current maximum id. Search the + * list to find out what it is. We're more likely to find + * the maximum at the end, so search the list backward. 
+ */ + max_id = 0; + list_for_each_prev(tmp, &rbd_dev_list) { + struct rbd_device *rbd_dev; + + rbd_dev = list_entry(tmp, struct rbd_device, node); + if (rbd_id > max_id) + max_id = rbd_id; + } + spin_unlock(&rbd_dev_list_lock); + + /* + * The max id could have been updated by rbd_id_get(), in + * which case it now accurately reflects the new maximum. + * Be careful not to overwrite the maximum value in that + * case. + */ + atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id); +} + +/* + * Skips over white space at *buf, and updates *buf to point to the + * first found non-space character (if any). Returns the length of + * the token (string of non-white space characters) found. Note + * that *buf must be terminated with '\0'. + */ +static inline size_t next_token(const char **buf) +{ + /* + * These are the characters that produce nonzero for + * isspace() in the "C" and "POSIX" locales. + */ + const char *spaces = " \f\n\r\t\v"; + + *buf += strspn(*buf, spaces); /* Find start of token */ + + return strcspn(*buf, spaces); /* Return token length */ +} + +/* + * Finds the next token in *buf, and if the provided token buffer is + * big enough, copies the found token into it. The result, if + * copied, is guaranteed to be terminated with '\0'. Note that *buf + * must be terminated with '\0' on entry. + * + * Returns the length of the token found (not including the '\0'). + * Return value will be 0 if no token is found, and it will be >= + * token_size if the token would not fit. + * + * The *buf pointer will be updated to point beyond the end of the + * found token. Note that this occurs even if the token buffer is + * too small to hold it. 
+ */ +static inline size_t copy_token(const char **buf, + char *token, + size_t token_size) +{ + size_t len; + + len = next_token(buf); + if (len < token_size) { + memcpy(token, *buf, len); + *(token + len) = '\0'; + } + *buf += len; + + return len; +} + +/* + * This fills in the pool_name, obj, obj_len, snap_name, obj_len, + * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based + * on the list of monitor addresses and other options provided via + * /sys/bus/rbd/add. + */ +static int rbd_add_parse_args(struct rbd_device *rbd_dev, + const char *buf, + const char **mon_addrs, + size_t *mon_addrs_size, + char *options, + size_t options_size) +{ + size_t len; + + /* The first four tokens are required */ + + len = next_token(&buf); + if (!len) + return -EINVAL; + *mon_addrs_size = len + 1; + *mon_addrs = buf; + + buf += len; + + len = copy_token(&buf, options, options_size); + if (!len || len >= options_size) + return -EINVAL; + + len = copy_token(&buf, rbd_dev->pool_name, sizeof (rbd_dev->pool_name)); + if (!len || len >= sizeof (rbd_dev->pool_name)) + return -EINVAL; + + len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj)); + if (!len || len >= sizeof (rbd_dev->obj)) + return -EINVAL; + + /* We have the object length in hand, save it. */ + + rbd_dev->obj_len = len; + + BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN + < RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX)); + sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX); + + /* + * The snapshot name is optional, but it's an error if it's + * too long. If no snapshot is supplied, fill in the default. 
+ */ + len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name)); + if (!len) + memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, + sizeof (RBD_SNAP_HEAD_NAME)); + else if (len >= sizeof (rbd_dev->snap_name)) + return -EINVAL; + + return 0; +} + static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) { - struct ceph_osd_client *osdc; struct rbd_device *rbd_dev; - ssize_t rc = -ENOMEM; - int irc, new_id = 0; - struct list_head *tmp; - char *mon_dev_name; - char *options; + const char *mon_addrs = NULL; + size_t mon_addrs_size = 0; + char *options = NULL; + struct ceph_osd_client *osdc; + int rc = -ENOMEM; if (!try_module_get(THIS_MODULE)) return -ENODEV; - mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); - if (!mon_dev_name) - goto err_out_mod; - - options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); - if (!options) - goto err_mon_dev; - - /* new rbd_device object */ rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); if (!rbd_dev) - goto err_out_opt; + goto err_nomem; + options = kmalloc(count, GFP_KERNEL); + if (!options) + goto err_nomem; /* static rbd_device initialization */ spin_lock_init(&rbd_dev->lock); INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->snaps); + init_rwsem(&rbd_dev->header_rwsem); - init_rwsem(&rbd_dev->header.snap_rwsem); + init_rwsem(&rbd_dev->header_rwsem); /* generate unique id: find highest unique id, add one */ - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - list_for_each(tmp, &rbd_dev_list) { - struct rbd_device *rbd_dev; + rbd_id_get(rbd_dev); - rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->id >= new_id) - new_id = rbd_dev->id + 1; - } - - rbd_dev->id = new_id; - - /* add to global list */ - list_add_tail(&rbd_dev->node, &rbd_dev_list); + /* Fill in the device name, now that we have its id. 
*/ + BUILD_BUG_ON(DEV_NAME_LEN + < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); + sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id); /* parse add command */ - if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " - "%" __stringify(RBD_MAX_OPT_LEN) "s " - "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s " - "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s" - "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", - mon_dev_name, options, rbd_dev->pool_name, - rbd_dev->obj, rbd_dev->snap_name) < 4) { - rc = -EINVAL; - goto err_out_slot; - } - - if (rbd_dev->snap_name[0] == 0) - rbd_dev->snap_name[0] = '-'; - - rbd_dev->obj_len = strlen(rbd_dev->obj); - snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", - rbd_dev->obj, RBD_SUFFIX); - - /* initialize rest of new object */ - snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id); - rc = rbd_get_client(rbd_dev, mon_dev_name, options); - if (rc < 0) - goto err_out_slot; + rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, + options, count); + if (rc) + goto err_put_id; - mutex_unlock(&ctl_mutex); + rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1, + options); + if (IS_ERR(rbd_dev->rbd_client)) { + rc = PTR_ERR(rbd_dev->rbd_client); + goto err_put_id; + } /* pick the pool */ - osdc = &rbd_dev->client->osdc; + osdc = &rbd_dev->rbd_client->client->osdc; rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); if (rc < 0) goto err_out_client; rbd_dev->poolid = rc; /* register our block device */ - irc = register_blkdev(0, rbd_dev->name); - if (irc < 0) { - rc = irc; + rc = register_blkdev(0, rbd_dev->name); + if (rc < 0) goto err_out_client; - } - rbd_dev->major = irc; + rbd_dev->major = rc; rc = rbd_bus_add_dev(rbd_dev); if (rc) goto err_out_blkdev; - /* set up and announce blkdev mapping */ + /* + * At this point cleanup in the event of an error is the job + * of the sysfs code (initiated by rbd_bus_del_dev()). + * + * Set up and announce blkdev mapping. 
+ */ rc = rbd_init_disk(rbd_dev); if (rc) goto err_out_bus; @@ -2263,35 +2438,26 @@ static ssize_t rbd_add(struct bus_type *bus, return count; err_out_bus: - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - list_del_init(&rbd_dev->node); - mutex_unlock(&ctl_mutex); - /* this will also clean up rest of rbd_dev stuff */ rbd_bus_del_dev(rbd_dev); kfree(options); - kfree(mon_dev_name); return rc; err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_client: rbd_put_client(rbd_dev); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); -err_out_slot: - list_del_init(&rbd_dev->node); - mutex_unlock(&ctl_mutex); - - kfree(rbd_dev); -err_out_opt: +err_put_id: + rbd_id_put(rbd_dev); +err_nomem: kfree(options); -err_mon_dev: - kfree(mon_dev_name); -err_out_mod: + kfree(rbd_dev); + dout("Error adding device %s\n", buf); module_put(THIS_MODULE); - return rc; + + return (ssize_t) rc; } static struct rbd_device *__rbd_get_dev(unsigned long id) @@ -2299,22 +2465,28 @@ static struct rbd_device *__rbd_get_dev(unsigned long id) struct list_head *tmp; struct rbd_device *rbd_dev; + spin_lock(&rbd_dev_list_lock); list_for_each(tmp, &rbd_dev_list) { rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->id == id) + if (rbd_dev->id == id) { + spin_unlock(&rbd_dev_list_lock); return rbd_dev; + } } + spin_unlock(&rbd_dev_list_lock); return NULL; } static void rbd_dev_release(struct device *dev) { - struct rbd_device *rbd_dev = - container_of(dev, struct rbd_device, dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - if (rbd_dev->watch_request) - ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc, + if (rbd_dev->watch_request) { + struct ceph_client *client = rbd_dev->rbd_client->client; + + ceph_osdc_unregister_linger_request(&client->osdc, rbd_dev->watch_request); + } if (rbd_dev->watch_event) rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name); @@ -2323,6 +2495,9 @@ static void rbd_dev_release(struct device *dev) /* clean up and free 
blkdev */ rbd_free_disk(rbd_dev); unregister_blkdev(rbd_dev->major, rbd_dev->name); + + /* done with the id, and with the rbd_dev */ + rbd_id_put(rbd_dev); kfree(rbd_dev); /* release module ref */ @@ -2355,8 +2530,6 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; } - list_del_init(&rbd_dev->node); - __rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); @@ -2370,7 +2543,7 @@ static ssize_t rbd_snap_add(struct device *dev, const char *buf, size_t count) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); int ret; char *name = kmalloc(count + 1, GFP_KERNEL); if (!name) @@ -2406,12 +2579,6 @@ err_unlock: return ret; } -static struct bus_attribute rbd_bus_attrs[] = { - __ATTR(add, S_IWUSR, NULL, rbd_add), - __ATTR(remove, S_IWUSR, NULL, rbd_remove), - __ATTR_NULL -}; - /* * create control files in sysfs * /sys/bus/rbd/... @@ -2420,21 +2587,21 @@ static int rbd_sysfs_init(void) { int ret; - rbd_bus_type.bus_attrs = rbd_bus_attrs; - - ret = bus_register(&rbd_bus_type); - if (ret < 0) + ret = device_register(&rbd_root_dev); + if (ret < 0) return ret; - ret = device_register(&rbd_root_dev); + ret = bus_register(&rbd_bus_type); + if (ret < 0) + device_unregister(&rbd_root_dev); return ret; } static void rbd_sysfs_cleanup(void) { - device_unregister(&rbd_root_dev); bus_unregister(&rbd_bus_type); + device_unregister(&rbd_root_dev); } int __init rbd_init(void) @@ -2444,8 +2611,7 @@ int __init rbd_init(void) rc = rbd_sysfs_init(); if (rc) return rc; - spin_lock_init(&node_lock); - pr_info("loaded " DRV_NAME_LONG "\n"); + pr_info("loaded " RBD_DRV_NAME_LONG "\n"); return 0; } diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index fc6c678aa2c..950708688f1 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -41,10 +41,6 @@ #define RBD_HEADER_SIGNATURE "RBD" #define RBD_HEADER_VERSION "001.005" -struct rbd_info { - __le64 max_id; -} __attribute__ ((packed)); - struct 
rbd_image_snap_ondisk { __le64 id; __le64 image_size; |