diff options
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/aoe/aoecmd.c | 2 | ||||
-rw-r--r-- | drivers/block/brd.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 28 | ||||
-rw-r--r-- | drivers/block/loop.c | 6 | ||||
-rw-r--r-- | drivers/block/nbd.c | 2 | ||||
-rw-r--r-- | drivers/block/rbd.c | 355 | ||||
-rw-r--r-- | drivers/block/zram/zcomp.c | 300 | ||||
-rw-r--r-- | drivers/block/zram/zcomp.h | 14 | ||||
-rw-r--r-- | drivers/block/zram/zram_drv.c | 104 | ||||
-rw-r--r-- | drivers/block/zram/zram_drv.h | 2 |
10 files changed, 305 insertions, 510 deletions
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 437b3a822f44..d597e432e195 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -861,7 +861,7 @@ rqbiocnt(struct request *r) * discussion. * * We cannot use get_page in the workaround, because it insists on a - * positive page count as a precondition. So we use _count directly. + * positive page count as a precondition. So we use _refcount directly. */ static void bio_pageinc(struct bio *bio) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 51a071e32221..c04bd9bc39fd 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -381,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, #ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, pfn_t *pfn) + void __pmem **kaddr, pfn_t *pfn, long size) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 1fd1dccebb6b..0bac9c8246bc 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -3633,14 +3633,15 @@ static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, goto nla_put_failure; if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) || nla_put_u32(skb, T_current_state, device->state.i) || - nla_put_u64(skb, T_ed_uuid, device->ed_uuid) || - nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) || - nla_put_u64(skb, T_send_cnt, device->send_cnt) || - nla_put_u64(skb, T_recv_cnt, device->recv_cnt) || - nla_put_u64(skb, T_read_cnt, device->read_cnt) || - nla_put_u64(skb, T_writ_cnt, device->writ_cnt) || - nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) || - nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) || + nla_put_u64_0pad(skb, T_ed_uuid, device->ed_uuid) || + nla_put_u64_0pad(skb, T_capacity, + drbd_get_capacity(device->this_bdev)) || + nla_put_u64_0pad(skb, T_send_cnt, device->send_cnt) || + nla_put_u64_0pad(skb, T_recv_cnt, device->recv_cnt) || + nla_put_u64_0pad(skb, T_read_cnt, device->read_cnt) || + nla_put_u64_0pad(skb, T_writ_cnt, device->writ_cnt) || + nla_put_u64_0pad(skb, T_al_writ_cnt, device->al_writ_cnt) || + nla_put_u64_0pad(skb, T_bm_writ_cnt, device->bm_writ_cnt) || nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) || nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) || nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt))) @@ -3657,13 +3658,16 @@ static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, goto nla_put_failure; if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) || - nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) || - nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device))) + nla_put_u64_0pad(skb, T_bits_total, drbd_bm_bits(device)) || + nla_put_u64_0pad(skb, T_bits_oos, + drbd_bm_total_weight(device))) goto nla_put_failure; if (C_SYNC_SOURCE <= device->state.conn && C_PAUSED_SYNC_T >= device->state.conn) { - if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) || - nla_put_u64(skb, T_bits_rs_failed, device->rs_failed)) + if (nla_put_u64_0pad(skb, T_bits_rs_total, + device->rs_total) || + nla_put_u64_0pad(skb, T_bits_rs_failed, + device->rs_failed)) goto nla_put_failure; } } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 7e5e27ac45bb..1fa8cc235977 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -488,6 +488,12 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, bio_segments(bio), blk_rq_bytes(cmd->rq)); + /* + * This bio may be started from the middle of the 'bvec' + * because of bio splitting, so offset from the bvec must + * be passed to iov iterator + */ + iter.iov_offset = bio->bi_iter.bi_bvec_done; cmd->iocb.ki_pos = pos; cmd->iocb.ki_filp = file; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 31e73a7a40f2..6a48ed41963f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -941,7 +941,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd) debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); - debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops); + debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops); return 0; } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 94a1843b0426..81666a56415e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -350,12 +350,12 @@ struct rbd_device { struct rbd_spec *spec; struct rbd_options *opts; - char *header_name; + struct ceph_object_id header_oid; + struct ceph_object_locator header_oloc; struct ceph_file_layout layout; - struct ceph_osd_event *watch_event; - struct rbd_obj_request *watch_request; + struct ceph_osd_linger_request *watch_handle; struct rbd_spec *parent_spec; u64 parent_overlap; @@ -538,7 +538,6 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, u8 *order, u64 *snap_size); static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, u64 *snap_features); -static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name); static int rbd_open(struct block_device *bdev, fmode_t mode) { @@ -1597,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) return __rbd_obj_request_wait(obj_request, 0); } -static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request, - unsigned long timeout) -{ - return __rbd_obj_request_wait(obj_request, timeout); -} - static void rbd_img_request_complete(struct rbd_img_request *img_request) { @@ -1752,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) complete_all(&obj_request->completion); } -static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p\n", __func__, obj_request); - obj_request_done_set(obj_request); -} - static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request = NULL; @@ -1829,13 +1816,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) obj_request_done_set(obj_request); } -static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, - struct ceph_msg *msg) +static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) { struct rbd_obj_request *obj_request = osd_req->r_priv; u16 opcode; - dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); + dout("%s: osd_req %p\n", __func__, osd_req); rbd_assert(osd_req == obj_request->osd_req); if (obj_request_img_data_test(obj_request)) { rbd_assert(obj_request->img_request); @@ -1879,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_CALL: rbd_osd_call_callback(obj_request); break; - case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_WATCH: - rbd_osd_trivial_callback(obj_request); - break; default: rbd_warn(NULL, "%s: unsupported op %hu", obj_request->object_name, (unsigned short) opcode); @@ -1897,27 +1879,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request = obj_request->img_request; struct ceph_osd_request *osd_req = obj_request->osd_req; - u64 snap_id; - - rbd_assert(osd_req != NULL); - snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; - ceph_osdc_build_request(osd_req, obj_request->offset, - NULL, snap_id, NULL); + if (img_request) + osd_req->r_snapid = img_request->snap_id; } static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) { - struct rbd_img_request *img_request = obj_request->img_request; struct ceph_osd_request *osd_req = obj_request->osd_req; - struct ceph_snap_context *snapc; - struct timespec mtime = CURRENT_TIME; - rbd_assert(osd_req != NULL); - - snapc = img_request ? img_request->snapc : NULL; - ceph_osdc_build_request(osd_req, obj_request->offset, - snapc, CEPH_NOSNAP, &mtime); + osd_req->r_mtime = CURRENT_TIME; + osd_req->r_data_offset = obj_request->offset; } /* @@ -1955,7 +1927,7 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); if (!osd_req) - return NULL; /* ENOMEM */ + goto fail; if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; @@ -1966,9 +1938,18 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_priv = obj_request; osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", + obj_request->object_name)) + goto fail; + + if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) + goto fail; return osd_req; + +fail: + ceph_osdc_put_request(osd_req); + return NULL; } /* @@ -2004,16 +1985,25 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, false, GFP_NOIO); if (!osd_req) - return NULL; /* ENOMEM */ + goto fail; osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", + obj_request->object_name)) + goto fail; + + if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) + goto fail; return osd_req; + +fail: + ceph_osdc_put_request(osd_req); + return NULL; } @@ -2974,17 +2964,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) { struct rbd_obj_request *obj_request; struct rbd_obj_request *next_obj_request; + int ret = 0; dout("%s: img %p\n", __func__, img_request); - for_each_obj_request_safe(img_request, obj_request, next_obj_request) { - int ret; + rbd_img_request_get(img_request); + for_each_obj_request_safe(img_request, obj_request, next_obj_request) { ret = rbd_img_obj_request_submit(obj_request); if (ret) - return ret; + goto out_put_ireq; } - return 0; +out_put_ireq: + rbd_img_request_put(img_request); + return ret; } static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) @@ -3091,48 +3084,18 @@ out_err: obj_request_done_set(obj_request); } -static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) -{ - struct rbd_obj_request *obj_request; - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - int ret; - - obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, - OBJ_REQUEST_NODATA); - if (!obj_request) - return -ENOMEM; - - ret = -ENOMEM; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, - obj_request); - if (!obj_request->osd_req) - goto out; - - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, - notify_id, 0, 0); - rbd_osd_req_format_read(obj_request); +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev); +static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev); - ret = rbd_obj_request_submit(osdc, obj_request); - if (ret) - goto out; - ret = rbd_obj_request_wait(obj_request); -out: - rbd_obj_request_put(obj_request); - - return ret; -} - -static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) +static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, + u64 notifier_id, void *data, size_t data_len) { - struct rbd_device *rbd_dev = (struct rbd_device *)data; + struct rbd_device *rbd_dev = arg; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; int ret; - if (!rbd_dev) - return; - - dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, - rbd_dev->header_name, (unsigned long long)notify_id, - (unsigned int)opcode); + dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev, + cookie, notify_id); /* * Until adequate refresh error handling is in place, there is @@ -3144,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) if (ret) rbd_warn(rbd_dev, "refresh failed: %d", ret); - ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); + ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, notify_id, cookie, + NULL, 0); if (ret) rbd_warn(rbd_dev, "notify_ack ret %d", ret); } -/* - * Send a (un)watch request and wait for the ack. Return a request - * with a ref held on success or error. - */ -static struct rbd_obj_request *rbd_obj_watch_request_helper( - struct rbd_device *rbd_dev, - bool watch) +static void rbd_watch_errcb(void *arg, u64 cookie, int err) { - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - struct ceph_options *opts = osdc->client->options; - struct rbd_obj_request *obj_request; + struct rbd_device *rbd_dev = arg; int ret; - obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, - OBJ_REQUEST_NODATA); - if (!obj_request) - return ERR_PTR(-ENOMEM); - - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1, - obj_request); - if (!obj_request->osd_req) { - ret = -ENOMEM; - goto out; - } - - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, 0, watch); - rbd_osd_req_format_write(obj_request); + rbd_warn(rbd_dev, "encountered watch error: %d", err); - if (watch) - ceph_osdc_set_request_linger(osdc, obj_request->osd_req); - - ret = rbd_obj_request_submit(osdc, obj_request); - if (ret) - goto out; - - ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout); - if (ret) - goto out; + __rbd_dev_header_unwatch_sync(rbd_dev); - ret = obj_request->result; + ret = rbd_dev_header_watch_sync(rbd_dev); if (ret) { - if (watch) - rbd_obj_request_end(obj_request); - goto out; + rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); + return; } - return obj_request; - -out: - rbd_obj_request_put(obj_request); - return ERR_PTR(ret); + ret = rbd_dev_refresh(rbd_dev); + if (ret) + rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); } /* @@ -3209,35 +3140,33 @@ out: static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - struct rbd_obj_request *obj_request; - int ret; + struct ceph_osd_linger_request *handle; - rbd_assert(!rbd_dev->watch_event); - rbd_assert(!rbd_dev->watch_request); + rbd_assert(!rbd_dev->watch_handle); - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, - &rbd_dev->watch_event); - if (ret < 0) - return ret; + handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, rbd_watch_cb, + rbd_watch_errcb, rbd_dev); + if (IS_ERR(handle)) + return PTR_ERR(handle); - obj_request = rbd_obj_watch_request_helper(rbd_dev, true); - if (IS_ERR(obj_request)) { - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; - return PTR_ERR(obj_request); - } + rbd_dev->watch_handle = handle; + return 0; +} - /* - * A watch request is set to linger, so the underlying osd - * request won't go away until we unregister it. We retain - * a pointer to the object request during that time (in - * rbd_dev->watch_request), so we'll keep a reference to it. - * We'll drop that reference after we've unregistered it in - * rbd_dev_header_unwatch_sync(). - */ - rbd_dev->watch_request = obj_request; +static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + int ret; - return 0; + if (!rbd_dev->watch_handle) + return; + + ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); + if (ret) + rbd_warn(rbd_dev, "failed to unwatch: %d", ret); + + rbd_dev->watch_handle = NULL; } /* @@ -3245,24 +3174,10 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) */ static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) { - struct rbd_obj_request *obj_request; - - rbd_assert(rbd_dev->watch_event); - rbd_assert(rbd_dev->watch_request); + __rbd_dev_header_unwatch_sync(rbd_dev); - rbd_obj_request_end(rbd_dev->watch_request); - rbd_obj_request_put(rbd_dev->watch_request); - rbd_dev->watch_request = NULL; - - obj_request = rbd_obj_watch_request_helper(rbd_dev, false); - if (!IS_ERR(obj_request)) - rbd_obj_request_put(obj_request); - else - rbd_warn(rbd_dev, "unable to tear down watch request (%ld)", - PTR_ERR(obj_request)); - - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; + dout("%s flushing notifies\n", __func__); + ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); } /* @@ -3592,7 +3507,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) if (!ondisk) return -ENOMEM; - ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name, 0, size, ondisk); if (ret < 0) goto out; @@ -3642,21 +3557,14 @@ static void rbd_exists_validate(struct rbd_device *rbd_dev) static void rbd_dev_update_size(struct rbd_device *rbd_dev) { sector_t size; - bool removing; /* - * Don't hold the lock while doing disk operations, - * or lock ordering will conflict with the bdev mutex via: - * rbd_add() -> blkdev_get() -> rbd_open() - */ - spin_lock_irq(&rbd_dev->lock); - removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); - spin_unlock_irq(&rbd_dev->lock); - /* - * If the device is being removed, rbd_dev->disk has - * been destroyed, so don't try to update its size + * If EXISTS is not set, rbd_dev->disk may be NULL, so don't + * try to update its size. If REMOVING is set, updating size + * is just useless work since the device can't be opened. */ - if (!removing) { + if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && + !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; dout("setting size to %llu sectors", (unsigned long long)size); set_capacity(rbd_dev->disk, size); @@ -4041,6 +3949,8 @@ static void rbd_dev_release(struct device *dev) struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); bool need_put = !!rbd_dev->opts; + ceph_oid_destroy(&rbd_dev->header_oid); + rbd_put_client(rbd_dev->rbd_client); rbd_spec_put(rbd_dev->spec); kfree(rbd_dev->opts); @@ -4071,6 +3981,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, INIT_LIST_HEAD(&rbd_dev->node); init_rwsem(&rbd_dev->header_rwsem); + ceph_oid_init(&rbd_dev->header_oid); + ceph_oloc_init(&rbd_dev->header_oloc); + rbd_dev->dev.bus = &rbd_bus_type; rbd_dev->dev.type = &rbd_device_type; rbd_dev->dev.parent = &rbd_root_dev; @@ -4119,7 +4032,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, __le64 size; } __attribute__ ((packed)) size_buf = { 0 }; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_size", &snapid, sizeof (snapid), &size_buf, sizeof (size_buf)); @@ -4159,7 +4072,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_object_prefix", NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); @@ -4191,10 +4104,10 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, __le64 features; __le64 incompat; } __attribute__ ((packed)) features_buf = { 0 }; - u64 incompat; + u64 unsup; int ret; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_features", &snapid, sizeof (snapid), &features_buf, sizeof (features_buf)); @@ -4204,9 +4117,12 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, if (ret < sizeof (features_buf)) return -ERANGE; - incompat = le64_to_cpu(features_buf.incompat); - if (incompat & ~RBD_FEATURES_SUPPORTED) + unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; + if (unsup) { + rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", + unsup); return -ENXIO; + } *snap_features = le64_to_cpu(features_buf.features); @@ -4253,7 +4169,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) } snapid = cpu_to_le64(rbd_dev->spec->snap_id); - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_parent", &snapid, sizeof (snapid), reply_buf, size); @@ -4356,7 +4272,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) u64 stripe_count; int ret; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_stripe_unit_count", NULL, 0, (char *)&striping_info_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); @@ -4604,7 +4520,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_snapcontext", NULL, 0, reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); @@ -4669,7 +4585,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, return ERR_PTR(-ENOMEM); snapid = cpu_to_le64(snap_id); - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_snapshot_name", &snapid, sizeof (snapid), reply_buf, size); @@ -4980,13 +4896,13 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) again: ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); if (ret == -ENOENT && tries++ < 1) { - ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", - &newest_epoch); + ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", + &newest_epoch); if (ret < 0) return ret; if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { - ceph_monc_request_next_osdmap(&rbdc->client->monc); + ceph_osdc_maybe_request_map(&rbdc->client->osdc); (void) ceph_monc_wait_osdmap(&rbdc->client->monc, newest_epoch, opts->mount_timeout); @@ -5187,6 +5103,10 @@ out_err: return ret; } +/* + * rbd_dev->header_rwsem must be locked for write and will be unlocked + * upon return. + */ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) { int ret; @@ -5195,7 +5115,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) ret = rbd_dev_id_get(rbd_dev); if (ret) - return ret; + goto err_out_unlock; BUILD_BUG_ON(DEV_NAME_LEN < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); @@ -5236,8 +5156,9 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) /* Everything's ready. Announce the disk to the world. */ set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); - add_disk(rbd_dev->disk); + up_write(&rbd_dev->header_rwsem); + add_disk(rbd_dev->disk); pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, (unsigned long long) rbd_dev->mapping.size); @@ -5252,41 +5173,34 @@ err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); +err_out_unlock: + up_write(&rbd_dev->header_rwsem); return ret; } static int rbd_dev_header_name(struct rbd_device *rbd_dev) { struct rbd_spec *spec = rbd_dev->spec; - size_t size; + int ret; /* Record the header object name for this rbd image. */ rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); if (rbd_dev->image_format == 1) - size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); + ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", + spec->image_name, RBD_SUFFIX); else - size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); - - rbd_dev->header_name = kmalloc(size, GFP_KERNEL); - if (!rbd_dev->header_name) - return -ENOMEM; + ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", + RBD_HEADER_PREFIX, spec->image_id); - if (rbd_dev->image_format == 1) - sprintf(rbd_dev->header_name, "%s%s", - spec->image_name, RBD_SUFFIX); - else - sprintf(rbd_dev->header_name, "%s%s", - RBD_HEADER_PREFIX, spec->image_id); - return 0; + return ret; } static void rbd_dev_image_release(struct rbd_device *rbd_dev) { rbd_dev_unprobe(rbd_dev); - kfree(rbd_dev->header_name); - rbd_dev->header_name = NULL; rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; @@ -5325,7 +5239,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) pr_info("image %s/%s does not exist\n", rbd_dev->spec->pool_name, rbd_dev->spec->image_name); - goto out_header_name; + goto err_out_format; } } @@ -5371,7 +5285,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) goto err_out_probe; dout("discovered format %u image, header name is %s\n", - rbd_dev->image_format, rbd_dev->header_name); + rbd_dev->image_format, rbd_dev->header_oid.name); return 0; err_out_probe: @@ -5379,9 +5293,6 @@ err_out_probe: err_out_watch: if (!depth) rbd_dev_header_unwatch_sync(rbd_dev); -out_header_name: - kfree(rbd_dev->header_name); - rbd_dev->header_name = NULL; err_out_format: rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); @@ -5442,6 +5353,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, spec = NULL; /* rbd_dev now owns this */ rbd_opts = NULL; /* rbd_dev now owns this */ + down_write(&rbd_dev->header_rwsem); rc = rbd_dev_image_probe(rbd_dev, 0); if (rc < 0) goto err_out_rbd_dev; @@ -5471,6 +5383,7 @@ out: return rc; err_out_rbd_dev: + up_write(&rbd_dev->header_rwsem); rbd_dev_destroy(rbd_dev); err_out_client: rbd_put_client(rbdc); @@ -5577,12 +5490,6 @@ static ssize_t do_rbd_remove(struct bus_type *bus, return ret; rbd_dev_header_unwatch_sync(rbd_dev); - /* - * flush remaining watch callbacks - these must be complete - * before the osd_client is shutdown - */ - dout("%s: flushing notifies", __func__); - ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); /* * Don't free anything from rbd_dev->disk until after all diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 3ef42e563bb5..b51a816d766b 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/wait.h> #include <linux/sched.h> +#include <linux/cpu.h> #include "zcomp.h" #include "zcomp_lzo.h" @@ -20,29 +21,6 @@ #include "zcomp_lz4.h" #endif -/* - * single zcomp_strm backend - */ -struct zcomp_strm_single { - struct mutex strm_lock; - struct zcomp_strm *zstrm; -}; - -/* - * multi zcomp_strm backend - */ -struct zcomp_strm_multi { - /* protect strm list */ - spinlock_t strm_lock; - /* max possible number of zstrm streams */ - int max_strm; - /* number of available zstrm streams */ - int avail_strm; - /* list of available strms */ - struct list_head idle_strm; - wait_queue_head_t strm_wait; -}; - static struct zcomp_backend *backends[] = { &zcomp_lzo, #ifdef CONFIG_ZRAM_LZ4_COMPRESS @@ -93,188 +71,6 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) return zstrm; } -/* - * get idle zcomp_strm or wait until other process release - * (zcomp_strm_release()) one for us - */ -static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) -{ - struct zcomp_strm_multi *zs = comp->stream; - struct zcomp_strm *zstrm; - - while (1) { - spin_lock(&zs->strm_lock); - if (!list_empty(&zs->idle_strm)) { - zstrm = list_entry(zs->idle_strm.next, - struct zcomp_strm, list); - list_del(&zstrm->list); - spin_unlock(&zs->strm_lock); - return zstrm; - } - /* zstrm streams limit reached, wait for idle stream */ - if (zs->avail_strm >= zs->max_strm) { - spin_unlock(&zs->strm_lock); - wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); - continue; - } - /* allocate new zstrm stream */ - zs->avail_strm++; - spin_unlock(&zs->strm_lock); - /* - * This function can be called in swapout/fs write path - * so we can't use GFP_FS|IO. And it assumes we already - * have at least one stream in zram initialization so we - * don't do best effort to allocate more stream in here. - * A default stream will work well without further multiple - * streams. That's why we use NORETRY | NOWARN. - */ - zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY | - __GFP_NOWARN); - if (!zstrm) { - spin_lock(&zs->strm_lock); - zs->avail_strm--; - spin_unlock(&zs->strm_lock); - wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); - continue; - } - break; - } - return zstrm; -} - -/* add stream back to idle list and wake up waiter or free the stream */ -static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm) -{ - struct zcomp_strm_multi *zs = comp->stream; - - spin_lock(&zs->strm_lock); - if (zs->avail_strm <= zs->max_strm) { - list_add(&zstrm->list, &zs->idle_strm); - spin_unlock(&zs->strm_lock); - wake_up(&zs->strm_wait); - return; - } - - zs->avail_strm--; - spin_unlock(&zs->strm_lock); - zcomp_strm_free(comp, zstrm); -} - -/* change max_strm limit */ -static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) -{ - struct zcomp_strm_multi *zs = comp->stream; - struct zcomp_strm *zstrm; - - spin_lock(&zs->strm_lock); - zs->max_strm = num_strm; - /* - * if user has lowered the limit and there are idle streams, - * immediately free as much streams (and memory) as we can. - */ - while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) { - zstrm = list_entry(zs->idle_strm.next, - struct zcomp_strm, list); - list_del(&zstrm->list); - zcomp_strm_free(comp, zstrm); - zs->avail_strm--; - } - spin_unlock(&zs->strm_lock); - return true; -} - -static void zcomp_strm_multi_destroy(struct zcomp *comp) -{ - struct zcomp_strm_multi *zs = comp->stream; - struct zcomp_strm *zstrm; - - while (!list_empty(&zs->idle_strm)) { - zstrm = list_entry(zs->idle_strm.next, - struct zcomp_strm, list); - list_del(&zstrm->list); - zcomp_strm_free(comp, zstrm); - } - kfree(zs); -} - -static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) -{ - struct zcomp_strm *zstrm; - struct zcomp_strm_multi *zs; - - comp->destroy = zcomp_strm_multi_destroy; - comp->strm_find = zcomp_strm_multi_find; - comp->strm_release = zcomp_strm_multi_release; - comp->set_max_streams = zcomp_strm_multi_set_max_streams; - zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); - if (!zs) - return -ENOMEM; - - comp->stream = zs; - spin_lock_init(&zs->strm_lock); - INIT_LIST_HEAD(&zs->idle_strm); - init_waitqueue_head(&zs->strm_wait); - zs->max_strm = max_strm; - zs->avail_strm = 1; - - zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); - if (!zstrm) { - kfree(zs); - return -ENOMEM; - } - list_add(&zstrm->list, &zs->idle_strm); - return 0; -} - -static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) -{ - struct zcomp_strm_single *zs = comp->stream; - mutex_lock(&zs->strm_lock); - return zs->zstrm; -} - -static void zcomp_strm_single_release(struct zcomp *comp, - struct zcomp_strm *zstrm) -{ - struct zcomp_strm_single *zs = comp->stream; - mutex_unlock(&zs->strm_lock); -} - -static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) -{ - /* zcomp_strm_single support only max_comp_streams == 1 */ - return false; -} - -static void zcomp_strm_single_destroy(struct zcomp *comp) -{ - struct zcomp_strm_single *zs = comp->stream; - zcomp_strm_free(comp, zs->zstrm); - kfree(zs); -} - -static int zcomp_strm_single_create(struct zcomp *comp) -{ - struct zcomp_strm_single *zs; - - comp->destroy = zcomp_strm_single_destroy; - comp->strm_find = zcomp_strm_single_find; - comp->strm_release = zcomp_strm_single_release; - comp->set_max_streams = zcomp_strm_single_set_max_streams; - zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); - if (!zs) - return -ENOMEM; - - comp->stream = zs; - mutex_init(&zs->strm_lock); - zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); - if (!zs->zstrm) { - kfree(zs); - return -ENOMEM; - } - return 0; -} - /* show available compressors */ ssize_t zcomp_available_show(const char *comp, char *buf) { @@ -299,19 +95,14 @@ bool zcomp_available_algorithm(const char *comp) return find_backend(comp) != NULL; } -bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) -{ - return comp->set_max_streams(comp, num_strm); -} - struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { - return comp->strm_find(comp); + return *get_cpu_ptr(comp->stream); } void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) { - comp->strm_release(comp, zstrm); + put_cpu_ptr(comp->stream); } int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, @@ -327,9 +118,83 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src, return comp->backend->decompress(src, src_len, dst); } +static int __zcomp_cpu_notifier(struct zcomp *comp, + unsigned long action, unsigned long cpu) +{ + struct zcomp_strm *zstrm; + + switch (action) { + case CPU_UP_PREPARE: + if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) + break; + zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); + if (IS_ERR_OR_NULL(zstrm)) { + pr_err("Can't allocate a compression stream\n"); + return NOTIFY_BAD; + } + *per_cpu_ptr(comp->stream, cpu) = zstrm; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + zstrm = *per_cpu_ptr(comp->stream, cpu); + if (!IS_ERR_OR_NULL(zstrm)) + zcomp_strm_free(comp, zstrm); + *per_cpu_ptr(comp->stream, cpu) = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} + +static int zcomp_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + struct zcomp *comp = container_of(nb, typeof(*comp), notifier); + + return __zcomp_cpu_notifier(comp, action, cpu); +} + +static int zcomp_init(struct zcomp *comp) +{ + unsigned long cpu; + int ret; + + comp->notifier.notifier_call = zcomp_cpu_notifier; + + comp->stream = alloc_percpu(struct zcomp_strm *); + if (!comp->stream) + return -ENOMEM; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) { + ret = __zcomp_cpu_notifier(comp, CPU_UP_PREPARE, cpu); + if (ret == NOTIFY_BAD) + goto cleanup; + } + __register_cpu_notifier(&comp->notifier); + cpu_notifier_register_done(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu); + cpu_notifier_register_done(); + return -ENOMEM; +} + void zcomp_destroy(struct zcomp *comp) { - comp->destroy(comp); + unsigned long cpu; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu); + __unregister_cpu_notifier(&comp->notifier); + cpu_notifier_register_done(); + + free_percpu(comp->stream); kfree(comp); } @@ -339,9 +204,9 @@ void zcomp_destroy(struct zcomp *comp) * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL) * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in * case of allocation error, or any other error potentially - * returned by functions zcomp_strm_{multi,single}_create. + * returned by zcomp_init(). */ -struct zcomp *zcomp_create(const char *compress, int max_strm) +struct zcomp *zcomp_create(const char *compress) { struct zcomp *comp; struct zcomp_backend *backend; @@ -356,10 +221,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm) return ERR_PTR(-ENOMEM); comp->backend = backend; - if (max_strm > 1) - error = zcomp_strm_multi_create(comp, max_strm); - else - error = zcomp_strm_single_create(comp); + error = zcomp_init(comp); if (error) { kfree(comp); return ERR_PTR(error); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index b7d2a4bcae54..ffd88cb747fe 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -10,8 +10,6 @@ #ifndef _ZCOMP_H_ #define _ZCOMP_H_ -#include <linux/mutex.h> - struct zcomp_strm { /* compression/decompression buffer */ void *buffer; @@ -21,8 +19,6 @@ struct zcomp_strm { * working memory) */ void *private; - /* used in multi stream backend, protected by backend strm_lock */ - struct list_head list; }; /* static compression backend */ @@ -41,19 +37,15 @@ struct zcomp_backend { /* dynamic per-device compression frontend */ struct zcomp { - void *stream; + struct zcomp_strm * __percpu *stream; struct zcomp_backend *backend; - - struct zcomp_strm *(*strm_find)(struct zcomp *comp); - void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); - bool (*set_max_streams)(struct zcomp *comp, int num_strm); - void (*destroy)(struct zcomp *comp); + struct notifier_block notifier; }; ssize_t zcomp_available_show(const char *comp, char *buf); bool zcomp_available_algorithm(const char *comp); -struct zcomp *zcomp_create(const char *comp, int max_strm); +struct zcomp *zcomp_create(const char *comp); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 370c2f76016d..8fcad8b761f1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -304,46 +304,25 @@ static ssize_t mem_used_max_store(struct device *dev, return len; } +/* + * We switched to per-cpu streams and this attr is not needed anymore. + * However, we will keep it around for some time, because: + * a) we may revert per-cpu streams in the future + * b) it's visible to user space and we need to follow our 2 years + * retirement rule; but we already have a number of 'soon to be + * altered' attrs, so max_comp_streams need to wait for the next + * layoff cycle. + */ static ssize_t max_comp_streams_show(struct device *dev, struct device_attribute *attr, char *buf) { - int val; - struct zram *zram = dev_to_zram(dev); - - down_read(&zram->init_lock); - val = zram->max_comp_streams; - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus()); } static ssize_t max_comp_streams_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - int num; - struct zram *zram = dev_to_zram(dev); - int ret; - - ret = kstrtoint(buf, 0, &num); - if (ret < 0) - return ret; - if (num < 1) - return -EINVAL; - - down_write(&zram->init_lock); - if (init_done(zram)) { - if (!zcomp_set_max_streams(zram->comp, num)) { - pr_info("Cannot change max compression streams\n"); - ret = -EINVAL; - goto out; - } - } - - zram->max_comp_streams = num; - ret = len; -out: - up_write(&zram->init_lock); - return ret; + return len; } static ssize_t comp_algorithm_show(struct device *dev, @@ -456,8 +435,26 @@ static ssize_t mm_stat_show(struct device *dev, return ret; } +static ssize_t debug_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int version = 1; + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "version: %d\n%8llu\n", + version, + (u64)atomic64_read(&zram->stats.writestall)); + up_read(&zram->init_lock); + + return ret; +} + static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); +static DEVICE_ATTR_RO(debug_stat); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); ZRAM_ATTR_RO(failed_reads); @@ -514,7 +511,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) goto out_error; } - meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); + meta->mem_pool = zs_create_pool(pool_name); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); goto out_error; @@ -650,7 +647,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, { int ret = 0; size_t clen; - unsigned long handle; + unsigned long handle = 0; struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; @@ -673,9 +670,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - zstrm = zcomp_strm_find(zram->comp); +compress_again: user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) { memcpy(uncmem + offset, user_mem + bvec->bv_offset, bvec->bv_len); @@ -699,6 +695,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } + zstrm = zcomp_strm_find(zram->comp); ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); @@ -710,6 +707,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, pr_err("Compression failed! err=%d\n", ret); goto out; } + src = zstrm->buffer; if (unlikely(clen > max_zpage_size)) { clen = PAGE_SIZE; @@ -717,8 +715,35 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, src = uncmem; } - handle = zs_malloc(meta->mem_pool, clen); + /* + * handle allocation has 2 paths: + * a) fast path is executed with preemption disabled (for + * per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear, + * since we can't sleep; + * b) slow path enables preemption and attempts to allocate + * the page with __GFP_DIRECT_RECLAIM bit set. we have to + * put per-cpu compression stream and, thus, to re-do + * the compression once handle is allocated. + * + * if we have a 'non-null' handle here then we are coming + * from the slow path and handle has already been allocated. + */ + if (!handle) + handle = zs_malloc(meta->mem_pool, clen, + __GFP_KSWAPD_RECLAIM | + __GFP_NOWARN | + __GFP_HIGHMEM); if (!handle) { + zcomp_strm_release(zram->comp, zstrm); + zstrm = NULL; + + atomic64_inc(&zram->stats.writestall); + + handle = zs_malloc(meta->mem_pool, clen, + GFP_NOIO | __GFP_HIGHMEM); + if (handle) + goto compress_again; + pr_err("Error allocating memory for compressed page: %u, size=%zu\n", index, clen); ret = -ENOMEM; @@ -1009,7 +1034,6 @@ static void zram_reset_device(struct zram *zram) /* Reset stats */ memset(&zram->stats, 0, sizeof(zram->stats)); zram->disksize = 0; - zram->max_comp_streams = 1; set_capacity(zram->disk, 0); part_stat_set_all(&zram->disk->part0, 0); @@ -1038,7 +1062,7 @@ static ssize_t disksize_store(struct device *dev, if (!meta) return -ENOMEM; - comp = zcomp_create(zram->compressor, zram->max_comp_streams); + comp = zcomp_create(zram->compressor); if (IS_ERR(comp)) { pr_err("Cannot initialise %s compressing backend\n", zram->compressor); @@ -1177,6 +1201,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_comp_algorithm.attr, &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, + &dev_attr_debug_stat.attr, NULL, }; @@ -1273,7 +1298,6 @@ static int zram_add(void) } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; - zram->max_comp_streams = 1; pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 8e92339686d7..3f5bf66a27e4 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -85,6 +85,7 @@ struct zram_stats { atomic64_t zero_pages; /* no. of zero filled pages */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ + atomic64_t writestall; /* no. of write slow paths */ }; struct zram_meta { @@ -102,7 +103,6 @@ struct zram { * the number of pages zram can consume for storing compressed data */ unsigned long limit_pages; - int max_comp_streams; struct zram_stats stats; atomic_t refcount; /* refcount for zram_meta */ |