From 82dcabad750a36a2b749889bc89c5a3188775b2e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 19 Jan 2016 16:19:06 +0100 Subject: libceph: revamp subs code, switch to SUBSCRIBE2 protocol It is currently hard-coded in the mon_client that mdsmap and monmap subs are continuous, while osdmap sub is always "onetime". To better handle full clusters/pools in the osd_client, we need to be able to issue continuous osdmap subs. Revamp subs code to allow us to specify for each sub whether it should be continuous or not. Although not strictly required for the above, switch to SUBSCRIBE2 protocol while at it, eliminating the ambiguity between a request for "every map since X" and a request for "just the latest" when we don't have a map yet (i.e. have epoch 0). SUBSCRIBE2 feature bit is now required - it's been supported since pre-argonaut (2010). Move "got mdsmap" call to the end of ceph_mdsc_handle_map() - calling in before we validate the epoch and successfully install the new map can mess up mon_client sub state. Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_features.h | 2 ++ include/linux/ceph/ceph_fs.h | 4 ++-- include/linux/ceph/mon_client.h | 28 ++++++++++++++++++++-------- 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 15151f3c4120..ae2f66833762 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -105,6 +105,7 @@ static inline u64 ceph_sanitize_features(u64 features) */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_SUBSCRIBE2 | \ CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ @@ -127,6 +128,7 @@ static inline u64 ceph_sanitize_features(u64 features) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_SUBSCRIBE2 | \ CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index d7d072a25c27..bf74005eedec 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -198,8 +198,8 @@ struct ceph_client_mount { #define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ struct ceph_mon_subscribe_item { - __le64 have_version; __le64 have; - __u8 onetime; + __le64 start; + __u8 flags; } __attribute__ ((packed)); struct ceph_mon_subscribe_ack { diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 81810dc21f06..8b2d2f0b659e 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -68,7 +68,8 @@ struct ceph_mon_client { bool hunting; int cur_mon; /* last monitor i contacted */ - unsigned long sub_sent, sub_renew_after; + unsigned long sub_renew_after; + unsigned long sub_renew_sent; struct ceph_connection con; /* pending generic requests */ @@ -76,10 +77,12 @@ struct ceph_mon_client { int num_generic_requests; u64 last_tid; - /* mds/osd map */ - int want_mdsmap; - int want_next_osdmap; /* 1 = want, 2 = want+asked */ - u32 have_osdmap, have_mdsmap; + /* subs, indexed with CEPH_SUB_* */ + struct { + struct ceph_mon_subscribe_item item; + bool want; + u32 have; /* epoch */ + } subs[3]; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; @@ -93,14 +96,23 @@ extern int ceph_monmap_contains(struct ceph_monmap *m, extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern void ceph_monc_stop(struct ceph_mon_client *monc); +enum { + CEPH_SUB_MDSMAP = 0, + CEPH_SUB_MONMAP, + CEPH_SUB_OSDMAP, +}; + +extern const char *ceph_sub_str[]; + /* * The model here is to indicate that we need a new map of at least - * epoch @want, and also call in when we receive a map. We will + * epoch @epoch, and also call in when we receive a map. We will * periodically rerequest the map from the monitor cluster until we * get what we want. */ -extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); -extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); +bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, + bool continuous); +void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, -- cgit v1.2.3 From 58d81b1294f02262a141687cd62529c1ec8e6484 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 21 Jan 2016 16:33:15 +0100 Subject: libceph: monc ping rate is 10s Split ping interval and ping timeout: ping interval is 10s; keepalive timeout is 30s. Make monc_ping_timeout a constant while at it - it's not actually exported as a mount option (and the rest of tick-related settings won't be either), so it's got no place in ceph_options. Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 5 +++-- net/ceph/ceph_common.c | 1 - net/ceph/mon_client.c | 8 ++------ 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 3e3799cdc6e6..f5466273b9a3 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -47,7 +47,6 @@ struct ceph_options { unsigned long mount_timeout; /* jiffies */ unsigned long osd_idle_ttl; /* jiffies */ unsigned long osd_keepalive_timeout; /* jiffies */ - unsigned long monc_ping_timeout; /* jiffies */ /* * any type that can't be simply compared or doesn't need need @@ -68,7 +67,9 @@ struct ceph_options { #define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) -#define CEPH_MONC_PING_TIMEOUT_DEFAULT msecs_to_jiffies(30 * 1000) + +#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) +#define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 389dbabba17b..dcc18c6f7cf9 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -361,7 +361,6 @@ ceph_parse_options(char *options, const char *dev_name, opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; - opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT; /* get mon ip(s) */ /* ip1[:port1][,ip2[:port2]...] */ diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index accfded53bae..23a270c49baf 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -202,15 +202,12 @@ static bool __sub_expired(struct ceph_mon_client *monc) */ static void __schedule_delayed(struct ceph_mon_client *monc) { - struct ceph_options *opt = monc->client->options; unsigned long delay; if (monc->cur_mon < 0 || __sub_expired(monc)) { delay = 10 * HZ; } else { - delay = 20 * HZ; - if (opt->monc_ping_timeout > 0) - delay = min(delay, opt->monc_ping_timeout / 3); + delay = CEPH_MONC_PING_INTERVAL; } dout("__schedule_delayed after %lu\n", delay); schedule_delayed_work(&monc->delayed_work, @@ -793,10 +790,9 @@ static void delayed_work(struct work_struct *work) __close_session(monc); __open_session(monc); /* continue hunting */ } else { - struct ceph_options *opt = monc->client->options; int is_auth = ceph_auth_is_authenticated(monc->auth); if (ceph_con_keepalive_expired(&monc->con, - opt->monc_ping_timeout)) { + CEPH_MONC_PING_TIMEOUT)) { dout("monc keepalive timeout\n"); is_auth = 0; __close_session(monc); -- cgit v1.2.3 From 168b9090c739c4b5556023a3f08789b349ca7339 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 21 Jan 2016 16:33:19 +0100 Subject: libceph: monc hunt rate is 3s with backoff up to 30s Unless we are in the process of setting up a client (i.e. connecting to the monitor cluster for the first time), apply a backoff: every time we want to reopen a session, increase our timeout by a multiple (currently 2); when we complete the connection, reduce that multipler by 50%. Mirrors ceph.git commit 794c86fd289bd62a35ed14368fa096c46736e9a2. Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 3 +++ include/linux/ceph/mon_client.h | 3 +++ net/ceph/mon_client.c | 25 ++++++++++++++++--------- 3 files changed, 22 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index f5466273b9a3..e7975e4681e1 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -68,8 +68,11 @@ struct ceph_options { #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) +#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) #define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000) +#define CEPH_MONC_HUNT_BACKOFF 2 +#define CEPH_MONC_HUNT_MAX_MULT 10 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 8b2d2f0b659e..e230e7ed60d3 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -72,6 +72,9 @@ struct ceph_mon_client { unsigned long sub_renew_sent; struct ceph_connection con; + bool had_a_connection; + int hunt_mult; /* [1..CEPH_MONC_HUNT_MAX_MULT] */ + /* pending generic requests */ struct rb_root generic_request_tree; int num_generic_requests; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 23a270c49baf..fd1cf408fd89 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -171,6 +171,12 @@ static void __open_session(struct ceph_mon_client *monc) pick_new_mon(monc); + if (monc->had_a_connection) { + monc->hunt_mult *= CEPH_MONC_HUNT_BACKOFF; + if (monc->hunt_mult > CEPH_MONC_HUNT_MAX_MULT) + monc->hunt_mult = CEPH_MONC_HUNT_MAX_MULT; + } + monc->sub_renew_after = jiffies; /* i.e., expired */ monc->sub_renew_sent = 0; @@ -192,11 +198,6 @@ static void __open_session(struct ceph_mon_client *monc) __send_prepared_auth_request(monc, ret); } -static bool __sub_expired(struct ceph_mon_client *monc) -{ - return time_after_eq(jiffies, monc->sub_renew_after); -} - /* * Reschedule delayed work timer. */ @@ -204,11 +205,11 @@ static void __schedule_delayed(struct ceph_mon_client *monc) { unsigned long delay; - if (monc->cur_mon < 0 || __sub_expired(monc)) { - delay = 10 * HZ; - } else { + if (monc->hunting) + delay = CEPH_MONC_HUNT_INTERVAL * monc->hunt_mult; + else delay = CEPH_MONC_PING_INTERVAL; - } + dout("__schedule_delayed after %lu\n", delay); schedule_delayed_work(&monc->delayed_work, round_jiffies_relative(delay)); @@ -902,6 +903,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->hunting = true; monc->sub_renew_after = jiffies; monc->sub_renew_sent = 0; + monc->had_a_connection = false; + monc->hunt_mult = 1; INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); monc->generic_request_tree = RB_ROOT; @@ -959,6 +962,10 @@ static void finish_hunting(struct ceph_mon_client *monc) if (monc->hunting) { dout("%s found mon%d\n", __func__, monc->cur_mon); monc->hunting = false; + monc->had_a_connection = true; + monc->hunt_mult /= 2; /* reduce by 50% */ + if (monc->hunt_mult < 1) + monc->hunt_mult = 1; } } -- cgit v1.2.3 From de2aa102ea464a54dba14b9588e0bc188bd94707 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 8 Feb 2016 13:39:46 +0100 Subject: libceph: rename ceph_osd_req_op::payload_len to indata_len Follow userspace nomenclature on this - the next commit adds outdata_len. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 7506b485bb6d..35c8b006916f 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -77,7 +77,7 @@ struct ceph_osd_data { struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ u32 flags; /* CEPH_OSD_OP_FLAG_* */ - u32 payload_len; + u32 indata_len; /* request */ union { struct ceph_osd_data raw_data_in; struct { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3309112e23d0..84075539135b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -498,7 +498,7 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL) payload_len += length; - op->payload_len = payload_len; + op->indata_len = payload_len; } EXPORT_SYMBOL(osd_req_op_extent_init); @@ -517,7 +517,7 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, BUG_ON(length > previous); op->extent.length = length; - op->payload_len -= previous - length; + op->indata_len -= previous - length; } EXPORT_SYMBOL(osd_req_op_extent_update); @@ -554,7 +554,7 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, op->cls.argc = 0; /* currently unused */ - op->payload_len = payload_len; + op->indata_len = payload_len; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -587,7 +587,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, op->xattr.cmp_mode = cmp_mode; ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); - op->payload_len = payload_len; + op->indata_len = payload_len; return 0; } EXPORT_SYMBOL(osd_req_op_xattr_init); @@ -707,7 +707,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); dst->cls.indata_len = cpu_to_le32(data_length); ceph_osdc_msg_data_add(req->r_request, osd_data); - src->payload_len += data_length; + src->indata_len += data_length; request_data_len += data_length; } osd_data = &src->cls.response_data; @@ -750,7 +750,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->op = cpu_to_le16(src->op); dst->flags = cpu_to_le32(src->flags); - dst->payload_len = cpu_to_le32(src->payload_len); + dst->payload_len = cpu_to_le32(src->indata_len); return request_data_len; } -- cgit v1.2.3 From 7665d85b7307fa0218881bc2009de067c42dc52e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 7 Jan 2016 16:48:57 +0800 Subject: libceph: move r_reply_op_{len,result} into struct ceph_osd_req_op This avoids defining large array of r_reply_op_{len,result} in in struct ceph_osd_request. Signed-off-by: Yan, Zheng Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- include/linux/ceph/osd_client.h | 5 +++-- net/ceph/osd_client.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4a876785b68c..94f31bde73e8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1854,7 +1854,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, * passed to the block layer, which just supports a 32-bit * length field. */ - obj_request->xferred = osd_req->r_reply_op_len[0]; + obj_request->xferred = osd_req->r_ops[0].outdata_len; rbd_assert(obj_request->xferred < (u64)UINT_MAX); opcode = osd_req->r_ops[0].op; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 35c8b006916f..c6d1d603bacf 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -78,6 +78,9 @@ struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ u32 flags; /* CEPH_OSD_OP_FLAG_* */ u32 indata_len; /* request */ + u32 outdata_len; /* reply */ + s32 rval; + union { struct ceph_osd_data raw_data_in; struct { @@ -148,8 +151,6 @@ struct ceph_osd_request { struct ceph_eversion *r_request_reassert_version; int r_result; - int r_reply_op_len[CEPH_OSD_MAX_OP]; - s32 r_reply_op_result[CEPH_OSD_MAX_OP]; int r_got_reply; int r_linger; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 84075539135b..1048edb44343 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1821,7 +1821,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) int len; len = le32_to_cpu(op->payload_len); - req->r_reply_op_len[i] = len; + req->r_ops[i].outdata_len = len; dout(" op %d has %d bytes\n", i, len); payload_len += len; p += sizeof(*op); @@ -1836,7 +1836,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_decode_need(&p, end, 4 + numops * 4, bad_put); retry_attempt = ceph_decode_32(&p); for (i = 0; i < numops; i++) - req->r_reply_op_result[i] = ceph_decode_32(&p); + req->r_ops[i].rval = ceph_decode_32(&p); if (le16_to_cpu(msg->hdr.version) >= 6) { p += 8 + 4; /* skip replay_version */ -- cgit v1.2.3 From 3f1af42ad0fad8a12242233dd0d9fc42f5e83415 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 9 Feb 2016 17:50:15 +0100 Subject: libceph: enable large, variable-sized OSD requests Turn r_ops into a flexible array member to enable large, consisting of up to 16 ops, OSD requests. The use case is scattered writeback in cephfs and, as far as the kernel client is concerned, 16 is just a made up number. r_ops had size 3 for copyup+hint+write, but copyup is really a special case - it can only happen once. ceph_osd_request_cache is therefore stuffed with num_ops=2 requests, anything bigger than that is allocated with kmalloc(). req_mempool is backed by ceph_osd_request_cache, which means either num_ops=1 or num_ops=2 for use_mempool=true - all existing users (ceph_writepages_start(), ceph_osdc_writepages()) are fine with that. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 -- include/linux/ceph/osd_client.h | 6 ++++-- net/ceph/osd_client.c | 43 +++++++++++++++++++++++++++-------------- 3 files changed, 32 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 94f31bde73e8..08018710f363 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1847,8 +1847,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, if (osd_req->r_result < 0) obj_request->result = osd_req->r_result; - rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); - /* * We support a 64-bit length, but ultimately it has to be * passed to the block layer, which just supports a 32-bit diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c6d1d603bacf..aada6a1383a4 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -43,7 +43,8 @@ struct ceph_osd { }; -#define CEPH_OSD_MAX_OP 3 +#define CEPH_OSD_SLAB_OPS 2 +#define CEPH_OSD_MAX_OPS 16 enum ceph_osd_data_type { CEPH_OSD_DATA_TYPE_NONE = 0, @@ -139,7 +140,6 @@ struct ceph_osd_request { /* request osd ops array */ unsigned int r_num_ops; - struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; /* these are updated on each send */ __le32 *r_request_osdmap_epoch; @@ -175,6 +175,8 @@ struct ceph_osd_request { unsigned long r_stamp; /* send OR check time */ struct ceph_snap_context *r_snapc; /* snap context for writes */ + + struct ceph_osd_req_op r_ops[]; }; struct ceph_request_redirect { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f93d0e893f3f..ccd4e031fa3f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -338,9 +338,10 @@ static void ceph_osdc_release_request(struct kref *kref) ceph_put_snap_context(req->r_snapc); if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); - else + else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) kmem_cache_free(ceph_osd_request_cache, req); - + else + kfree(req); } void ceph_osdc_get_request(struct ceph_osd_request *req) @@ -369,18 +370,22 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_msg *msg; size_t msg_size; - BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); - BUG_ON(num_ops > CEPH_OSD_MAX_OP); - if (use_mempool) { + BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); req = mempool_alloc(osdc->req_mempool, gfp_flags); - memset(req, 0, sizeof(*req)); + } else if (num_ops <= CEPH_OSD_SLAB_OPS) { + req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags); } else { - req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); + BUG_ON(num_ops > CEPH_OSD_MAX_OPS); + req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]), + gfp_flags); } - if (req == NULL) + if (unlikely(!req)) return NULL; + /* req only, each op is zeroed in _osd_req_op_init() */ + memset(req, 0, sizeof(*req)); + req->r_osdc = osdc; req->r_mempool = use_mempool; req->r_num_ops = num_ops; @@ -398,12 +403,19 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req->r_base_oloc.pool = -1; req->r_target_oloc.pool = -1; + msg_size = OSD_OPREPLY_FRONT_LEN; + if (num_ops > CEPH_OSD_SLAB_OPS) { + /* ceph_osd_op and rval */ + msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * + (sizeof(struct ceph_osd_op) + 4); + } + /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, gfp_flags, true); + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, + gfp_flags, true); if (!msg) { ceph_osdc_put_request(req); return NULL; @@ -1811,7 +1823,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_decode_need(&p, end, 4, bad_put); numops = ceph_decode_32(&p); - if (numops > CEPH_OSD_MAX_OP) + if (numops > CEPH_OSD_MAX_OPS) goto bad_put; if (numops != req->r_num_ops) goto bad_put; @@ -2784,11 +2796,12 @@ EXPORT_SYMBOL(ceph_osdc_writepages); int ceph_osdc_setup(void) { + size_t size = sizeof(struct ceph_osd_request) + + CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op); + BUG_ON(ceph_osd_request_cache); - ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", - sizeof (struct ceph_osd_request), - __alignof__(struct ceph_osd_request), - 0, NULL); + ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size, + 0, 0, NULL); return ceph_osd_request_cache ? 0 : -ENOMEM; } -- cgit v1.2.3 From 2c63f49a724a10bb71cc0fd34f8e5acce78525d5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 7 Jan 2016 17:32:54 +0800 Subject: libceph: add helper that duplicates last extent operation This helper duplicates last extent operation in OSD request, then adjusts the new extent operation's offset and length. The helper is for scatterd page writeback, which adds nonconsecutive dirty pages to single OSD request. Signed-off-by: Yan, Zheng Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 ++ net/ceph/osd_client.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index aada6a1383a4..4343df806710 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -266,6 +266,8 @@ extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 truncate_size, u32 truncate_seq); extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, unsigned int which, u64 length); +extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, + unsigned int which, u64 offset_inc); extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccd4e031fa3f..32355d9d0103 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -534,6 +534,28 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_update); +void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, + unsigned int which, u64 offset_inc) +{ + struct ceph_osd_req_op *op, *prev_op; + + BUG_ON(which + 1 >= osd_req->r_num_ops); + + prev_op = &osd_req->r_ops[which]; + op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); + /* dup previous one */ + op->indata_len = prev_op->indata_len; + op->outdata_len = prev_op->outdata_len; + op->extent = prev_op->extent; + /* adjust offset */ + op->extent.offset += offset_inc; + op->extent.length -= offset_inc; + + if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL) + op->indata_len -= offset_inc; +} +EXPORT_SYMBOL(osd_req_op_extent_dup_last); + void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method) { -- cgit v1.2.3 From 315f24088048a51eed341c53be66ea477a3c7d16 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 7 Mar 2016 10:34:50 +0800 Subject: ceph: fix security xattr deadlock When security is enabled, security module can call filesystem's getxattr/setxattr callbacks during d_instantiate(). For cephfs, d_instantiate() is usually called by MDS' dispatch thread, while handling MDS reply. If the MDS reply does not include xattrs and corresponding caps, getxattr/setxattr need to send a new request to MDS and waits for the reply. This makes MDS' dispatch sleep, nobody handles later MDS replies. The fix is make sure lookup/atomic_open reply include xattrs and corresponding caps. So getxattr can be handled by cached xattrs. This requires some modification to both MDS and request message. (Client tells MDS what caps it wants; MDS encodes proper caps in the reply) Smack security module may call setxattr during d_instantiate(). Unlike getxattr, we can't force MDS to issue CEPH_CAP_XATTR_EXCL to us. So just make setxattr return error when called by MDS' dispatch thread. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 9 ++++-- fs/ceph/export.c | 13 +++++++++ fs/ceph/file.c | 7 +++++ fs/ceph/inode.c | 18 +++++++++--- fs/ceph/mds_client.c | 2 ++ fs/ceph/super.h | 16 ++++++++++- fs/ceph/xattr.c | 68 ++++++++++++++++++++++++++++++++++++++++++-- include/linux/ceph/ceph_fs.h | 3 +- 8 files changed, 125 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index fd11fb231a2e..b9f50a388aee 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -624,6 +624,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int op; + int mask; int err; dout("lookup %p dentry %p '%pd'\n", @@ -666,8 +667,12 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, return ERR_CAST(req); req->r_dentry = dget(dentry); req->r_num_caps = 2; - /* we only need inode linkage */ - req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); + + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; + if (ceph_security_xattr_wanted(dir)) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.getattr.mask = cpu_to_le32(mask); + req->r_locked_dir = dir; err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_handle_snapdir(req, dentry, err); diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 3b3172357326..6e72c98162d5 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -71,12 +71,18 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) inode = ceph_find_inode(sb, vino); if (!inode) { struct ceph_mds_request *req; + int mask; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, USE_ANY_MDS); if (IS_ERR(req)) return ERR_CAST(req); + mask = CEPH_STAT_CAP_INODE; + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.getattr.mask = cpu_to_le32(mask); + req->r_ino1 = vino; req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, NULL, req); @@ -128,6 +134,7 @@ static struct dentry *__get_parent(struct super_block *sb, struct ceph_mds_request *req; struct inode *inode; struct dentry *dentry; + int mask; int err; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, @@ -144,6 +151,12 @@ static struct dentry *__get_parent(struct super_block *sb, .snap = CEPH_NOSNAP, }; } + + mask = CEPH_STAT_CAP_INODE; + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.getattr.mask = cpu_to_le32(mask); + req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, NULL, req); inode = req->r_target_inode; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 389adacbc719..334a75170a3b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -300,6 +300,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct ceph_mds_request *req; struct dentry *dn; struct ceph_acls_info acls = {}; + int mask; int err; dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", @@ -335,6 +336,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, acls.pagelist = NULL; } } + + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; + if (ceph_security_xattr_wanted(dir)) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.open.mask = cpu_to_le32(mask); + req->r_locked_dir = dir; /* caller holds dir->i_mutex */ err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 66edef12c6f2..8b136dc0bc13 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1389,7 +1389,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct qstr dname; struct dentry *dn; struct inode *in; - int err = 0, ret, i; + int err = 0, skipped = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_dentry_info *di; @@ -1501,7 +1501,17 @@ retry_lookup: } if (d_really_is_negative(dn)) { - struct dentry *realdn = splice_dentry(dn, in); + struct dentry *realdn; + + if (ceph_security_xattr_deadlock(in)) { + dout(" skip splicing dn %p to inode %p" + " (security xattr deadlock)\n", dn, in); + iput(in); + skipped++; + goto next_item; + } + + realdn = splice_dentry(dn, in); if (IS_ERR(realdn)) { err = PTR_ERR(realdn); d_drop(dn); @@ -1518,7 +1528,7 @@ retry_lookup: req->r_session, req->r_request_started); - if (err == 0 && cache_ctl.index >= 0) { + if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { ret = fill_readdir_cache(d_inode(parent), dn, &cache_ctl, req); if (ret < 0) @@ -1529,7 +1539,7 @@ next_item: dput(dn); } out: - if (err == 0) { + if (err == 0 && skipped == 0) { req->r_did_prepopulate = true; req->r_readdir_cache_idx = cache_ctl.index; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index aa43dcb5f9b9..44852c3ae531 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2540,6 +2540,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* insert trace into our cache */ mutex_lock(&req->r_fill_mutex); + current->journal_info = req; err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || @@ -2547,6 +2548,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } + current->journal_info = NULL; mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 57ac43d64322..2d48138da58e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -468,7 +468,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ - +#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, @@ -804,6 +804,20 @@ extern void __init ceph_xattr_init(void); extern void ceph_xattr_exit(void); extern const struct xattr_handler *ceph_xattr_handlers[]; +#ifdef CONFIG_SECURITY +extern bool ceph_security_xattr_deadlock(struct inode *in); +extern bool ceph_security_xattr_wanted(struct inode *in); +#else +static inline bool ceph_security_xattr_deadlock(struct inode *in) +{ + return false; +} +static inline bool ceph_security_xattr_wanted(struct inode *in) +{ + return false; +} +#endif + /* acl.c */ struct ceph_acls_info { void *default_acl; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 139cdef8eb41..9410abdef3ce 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -714,13 +714,31 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) } } +static inline int __get_request_mask(struct inode *in) { + struct ceph_mds_request *req = current->journal_info; + int mask = 0; + if (req && req->r_target_inode == in) { + if (req->r_op == CEPH_MDS_OP_LOOKUP || + req->r_op == CEPH_MDS_OP_LOOKUPINO || + req->r_op == CEPH_MDS_OP_LOOKUPPARENT || + req->r_op == CEPH_MDS_OP_GETATTR) { + mask = le32_to_cpu(req->r_args.getattr.mask); + } else if (req->r_op == CEPH_MDS_OP_OPEN || + req->r_op == CEPH_MDS_OP_CREATE) { + mask = le32_to_cpu(req->r_args.open.mask); + } + } + return mask; +} + ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, size_t size) { struct ceph_inode_info *ci = ceph_inode(inode); - int err; struct ceph_inode_xattr *xattr; struct ceph_vxattr *vxattr = NULL; + int req_mask; + int err; if (!ceph_is_valid_xattr(name)) return -ENODATA; @@ -734,13 +752,24 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, return err; } + req_mask = __get_request_mask(inode); + spin_lock(&ci->i_ceph_lock); dout("getxattr %p ver=%lld index_ver=%lld\n", inode, ci->i_xattrs.version, ci->i_xattrs.index_version); if (ci->i_xattrs.version == 0 || - !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { + !((req_mask & CEPH_CAP_XATTR_SHARED) || + __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); + + /* security module gets xattr while filling trace */ + if (current->journal_info != NULL) { + pr_warn_ratelimited("sync getxattr %p " + "during filling trace\n", inode); + return -EBUSY; + } + /* get xattrs from mds (if we don't already have them) */ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); if (err) @@ -767,6 +796,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, memcpy(value, xattr->val, xattr->val_len); + if (current->journal_info != NULL && + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) + ci->i_ceph_flags |= CEPH_I_SEC_INITED; out: spin_unlock(&ci->i_ceph_lock); return err; @@ -1017,7 +1049,15 @@ do_sync: do_sync_unlocked: if (lock_snap_rwsem) up_read(&mdsc->snap_rwsem); - err = ceph_sync_setxattr(dentry, name, value, size, flags); + + /* security module set xattr while filling trace */ + if (current->journal_info != NULL) { + pr_warn_ratelimited("sync setxattr %p " + "during filling trace\n", inode); + err = -EBUSY; + } else { + err = ceph_sync_setxattr(dentry, name, value, size, flags); + } out: ceph_free_cap_flush(prealloc_cf); kfree(newname); @@ -1166,3 +1206,25 @@ int ceph_removexattr(struct dentry *dentry, const char *name) return __ceph_removexattr(dentry, name); } + +#ifdef CONFIG_SECURITY +bool ceph_security_xattr_wanted(struct inode *in) +{ + return in->i_security != NULL; +} + +bool ceph_security_xattr_deadlock(struct inode *in) +{ + struct ceph_inode_info *ci; + bool ret; + if (in->i_security == NULL) + return false; + ci = ceph_inode(in); + spin_lock(&ci->i_ceph_lock); + ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) && + !(ci->i_xattrs.version > 0 && + __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)); + spin_unlock(&ci->i_ceph_lock); + return ret; +} +#endif diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index bf74005eedec..37f28bf55ce4 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -376,7 +376,8 @@ union ceph_mds_request_args { __le32 stripe_count; /* ... */ __le32 object_size; __le32 file_replication; - __le32 unused; /* used to be preferred osd */ + __le32 mask; /* CEPH_CAP_* */ + __le32 old_size; } __attribute__ ((packed)) open; struct { __le32 flags; -- cgit v1.2.3