From f6ba86363908e3f4e3ef11f768be7ca2745b18cf Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 13 Aug 2014 18:33:55 +0200 Subject: drbd: Move enum write_ordering_e to drbd.h Also change the enum values to all-capital letters. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/drbd/drbd_main.c') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 74d97f4bac34..3ee4a44cb225 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2590,7 +2590,7 @@ struct drbd_resource *drbd_create_resource(const char *name) kref_init(&resource->kref); idr_init(&resource->devices); INIT_LIST_HEAD(&resource->connections); - resource->write_ordering = WO_bdev_flush; + resource->write_ordering = WO_BDEV_FLUSH; list_add_tail_rcu(&resource->resources, &drbd_resources); mutex_init(&resource->conf_update); mutex_init(&resource->adm_mutex); -- cgit v1.2.3 From 28bc3b8c71cda033a4c013131c635d1148889824 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 14 Aug 2014 18:33:30 +0200 Subject: drbd: Fix locking across all resources Instead of using a rwlock for synchronizing state changes across resources, take the request locks of all resources for global state changes. Use resources_mutex to serialize global state changes. This means that taking the request lock of a resource is now enough to prevent changes of that resource. (Previously, a read lock on the global state lock was needed as well.) Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 18 ++------- drivers/block/drbd/drbd_main.c | 24 +++++++++++- drivers/block/drbd/drbd_nl.c | 45 +++++++++++---------- drivers/block/drbd/drbd_state.c | 14 +++---- drivers/block/drbd/drbd_state.h | 6 +-- drivers/block/drbd/drbd_worker.c | 85 ++++++++++++++++++---------------------- 6 files changed, 99 insertions(+), 93 deletions(-) (limited to 'drivers/block/drbd/drbd_main.c') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 47d4b02103b8..2c9ee223d548 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -292,6 +292,9 @@ struct drbd_device_work { extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *); +extern void lock_all_resources(void); +extern void unlock_all_resources(void); + struct drbd_request { struct drbd_work w; struct drbd_device *device; @@ -1418,7 +1421,7 @@ extern struct bio_set *drbd_md_io_bio_set; /* to allocate from that set */ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); -extern rwlock_t global_state_lock; +extern struct mutex resources_mutex; extern int conn_lowest_minor(struct drbd_connection *connection); extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); @@ -1688,19 +1691,6 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r return 0; } -static inline enum drbd_state_rv -_drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) -{ - enum drbd_state_rv rv; - - read_lock(&global_state_lock); - rv = __drbd_set_state(device, ns, flags, done); - read_unlock(&global_state_lock); - - return rv; -} - static inline union drbd_state drbd_read_state(struct drbd_device *device) { struct drbd_resource *resource = device->resource; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 3ee4a44cb225..f66294db3b08 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -117,6 +117,7 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0 */ struct idr drbd_devices; struct list_head drbd_resources; +struct mutex resources_mutex; struct kmem_cache *drbd_request_cache; struct kmem_cache *drbd_ee_cache; /* peer requests */ @@ -2923,7 +2924,7 @@ static int __init drbd_init(void) drbd_proc = NULL; /* play safe for drbd_cleanup */ idr_init(&drbd_devices); - rwlock_init(&global_state_lock); + mutex_init(&resources_mutex); INIT_LIST_HEAD(&drbd_resources); err = drbd_genl_register(); @@ -3746,6 +3747,27 @@ int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i) return 0; } +void lock_all_resources(void) +{ + struct drbd_resource *resource; + int __maybe_unused i = 0; + + mutex_lock(&resources_mutex); + local_irq_disable(); + for_each_resource(resource, &drbd_resources) + spin_lock_nested(&resource->req_lock, i++); +} + +void unlock_all_resources(void) +{ + struct drbd_resource *resource; + + for_each_resource(resource, &drbd_resources) + spin_unlock(&resource->req_lock); + local_irq_enable(); + mutex_unlock(&resources_mutex); +} + #ifdef CONFIG_DRBD_FAULT_INJECTION /* Fault insertion support including random number generator shamelessly * stolen from kernel/rcutorture.c */ diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 94e380ff61ca..d37c509e6a44 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1389,13 +1389,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) goto fail_unlock; } - write_lock_irq(&global_state_lock); + lock_all_resources(); retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); if (retcode == NO_ERROR) { rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); drbd_resync_after_changed(device); } - write_unlock_irq(&global_state_lock); + unlock_all_resources(); if (retcode != NO_ERROR) goto fail_unlock; @@ -1539,18 +1539,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } - write_lock_irq(&global_state_lock); - retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); - if (retcode != NO_ERROR) - goto fail_unlock; - rcu_read_lock(); nc = rcu_dereference(connection->net_conf); if (nc) { if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { rcu_read_unlock(); retcode = ERR_STONITH_AND_PROT_A; - goto fail_unlock; + goto fail; } } rcu_read_unlock(); @@ -1561,7 +1556,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, PTR_ERR(bdev)); retcode = ERR_OPEN_DISK; - goto fail_unlock; + goto fail; } nbc->backing_bdev = bdev; @@ -1581,7 +1576,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, PTR_ERR(bdev)); retcode = ERR_OPEN_MD_DISK; - goto fail_unlock; + goto fail; } nbc->md_bdev = bdev; @@ -1589,7 +1584,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { retcode = ERR_MD_IDX_INVALID; - goto fail_unlock; + goto fail; } resync_lru = lc_create("resync", drbd_bm_ext_cache, @@ -1597,14 +1592,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) offsetof(struct bm_extent, lce)); if (!resync_lru) { retcode = ERR_NOMEM; - goto fail_unlock; + goto fail; } /* Read our meta data super block early. * This also sets other on-disk offsets. */ retcode = drbd_md_read(device, nbc); if (retcode != NO_ERROR) - goto fail_unlock; + goto fail; if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; @@ -1616,7 +1611,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) (unsigned long long) drbd_get_max_capacity(nbc), (unsigned long long) new_disk_conf->disk_size); retcode = ERR_DISK_TOO_SMALL; - goto fail_unlock; + goto fail; } if (new_disk_conf->meta_dev_idx < 0) { @@ -1633,7 +1628,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) drbd_warn(device, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n", (unsigned long long) min_md_device_sectors); - goto fail_unlock; + goto fail; } /* Make sure the new disk is big enough @@ -1641,7 +1636,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) if (drbd_get_max_capacity(nbc) < drbd_get_capacity(device->this_bdev)) { retcode = ERR_DISK_TOO_SMALL; - goto fail_unlock; + goto fail; } nbc->known_size = drbd_get_capacity(nbc->backing_bdev); @@ -1671,7 +1666,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) retcode = rv; /* FIXME: Type mismatch. */ drbd_resume_io(device); if (rv < SS_SUCCESS) - goto fail_unlock; + goto fail; if (!get_ldev_if_state(device, D_ATTACHING)) goto force_diskless; @@ -1706,6 +1701,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto force_diskless_dec; } + lock_all_resources(); + retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); + if (retcode != NO_ERROR) { + unlock_all_resources(); + goto force_diskless_dec; + } + /* Reset the "barriers don't work" bits here, then force meta data to * be written, to ensure we determine if barriers are supported. */ if (new_disk_conf->md_flushes) @@ -1728,6 +1730,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) drbd_resync_after_changed(device); drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH); + unlock_all_resources(); if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) set_bit(CRASHED_PRIMARY, &device->flags); @@ -1850,8 +1853,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) if (rv < SS_SUCCESS) goto force_diskless_dec; - write_unlock(&global_state_lock); - mod_timer(&device->request_timer, jiffies + HZ); if (device->state.role == R_PRIMARY) @@ -1874,8 +1875,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) force_diskless: drbd_force_state(device, NS(disk, D_DISKLESS)); drbd_md_sync(device); - fail_unlock: - write_unlock_irq(&global_state_lock); fail: conn_reconfig_done(connection); if (nbc) { @@ -3453,8 +3452,10 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) } /* not yet safe for genl_family.parallel_ops */ + mutex_lock(&resources_mutex); if (!conn_create(adm_ctx.resource_name, &res_opts)) retcode = ERR_NOMEM; + mutex_unlock(&resources_mutex); out: drbd_adm_finish(&adm_ctx, info, retcode); return 0; @@ -3545,7 +3546,9 @@ static int adm_del_resource(struct drbd_resource *resource) if (!idr_is_empty(&resource->devices)) return ERR_RES_IN_USE; + mutex_lock(&resources_mutex); list_del_rcu(&resource->resources); + mutex_unlock(&resources_mutex); /* Make sure all threads have actually stopped: state handling only * does drbd_thread_stop_nowait(). */ list_for_each_entry(connection, &resource->connections, connections) diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 2d7dd269b6a8..535ae47f84c9 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -937,7 +937,7 @@ void drbd_resume_al(struct drbd_device *device) drbd_info(device, "Resumed AL updates\n"); } -/* helper for __drbd_set_state */ +/* helper for _drbd_set_state */ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) { if (first_peer_device(device)->connection->agreed_pro_version < 90) @@ -965,17 +965,17 @@ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) } /** - * __drbd_set_state() - Set a new DRBD state + * _drbd_set_state() - Set a new DRBD state * @device: DRBD device. * @ns: new state. * @flags: Flags * @done: Optional completion, that will get completed after the after_state_ch() finished * - * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + * Caller needs to hold req_lock. Do not call directly. */ enum drbd_state_rv -__drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) +_drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) { struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; @@ -1444,7 +1444,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, if (os.disk != D_FAILED && ns.disk == D_FAILED) { enum drbd_io_error_p eh = EP_PASS_ON; int was_io_error = 0; - /* corresponding get_ldev was in __drbd_set_state, to serialize + /* corresponding get_ldev was in _drbd_set_state, to serialize * our cleanup here with the transition to D_DISKLESS. * But is is still not save to dreference ldev here, since * we might come from an failed Attach before ldev was set. */ @@ -1759,7 +1759,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; - rv = __drbd_set_state(device, ns, flags, NULL); + rv = _drbd_set_state(device, ns, flags, NULL); if (rv < SS_SUCCESS) BUG(); diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index 7f53c40823cd..bd989536f888 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -122,9 +122,9 @@ extern enum drbd_state_rv _drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, union drbd_state, enum chg_state_flags); -extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, - enum chg_state_flags, - struct completion *done); +extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state, + enum chg_state_flags, + struct completion *done); extern void print_st_err(struct drbd_device *, union drbd_state, union drbd_state, int); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 5578c1477ba6..3b3d980ac8c7 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -55,13 +55,6 @@ static int make_resync_request(struct drbd_device *, int); * */ - -/* About the global_state_lock - Each state transition on an device holds a read lock. In case we have - to evaluate the resync after dependencies, we grab a write lock, because - we need stable states on all devices for that. */ -rwlock_t global_state_lock; - /* used for synchronous meta data and bitmap IO * submitted by drbd_md_sync_page_io() */ @@ -1456,70 +1449,73 @@ static int _drbd_may_sync_now(struct drbd_device *device) } /** - * _drbd_pause_after() - Pause resync on all devices that may not resync now + * drbd_pause_after() - Pause resync on all devices that may not resync now * @device: DRBD device. * * Called from process context only (admin command and after_state_ch). */ -static int _drbd_pause_after(struct drbd_device *device) +static bool drbd_pause_after(struct drbd_device *device) { + bool changed = false; struct drbd_device *odev; - int i, rv = 0; + int i; rcu_read_lock(); idr_for_each_entry(&drbd_devices, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; - if (!_drbd_may_sync_now(odev)) - rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) - != SS_NOTHING_TO_DO); + if (!_drbd_may_sync_now(odev) && + _drbd_set_state(_NS(odev, aftr_isp, 1), + CS_HARD, NULL) != SS_NOTHING_TO_DO) + changed = true; } rcu_read_unlock(); - return rv; + return changed; } /** - * _drbd_resume_next() - Resume resync on all devices that may resync now + * drbd_resume_next() - Resume resync on all devices that may resync now * @device: DRBD device. * * Called from process context only (admin command and worker). */ -static int _drbd_resume_next(struct drbd_device *device) +static bool drbd_resume_next(struct drbd_device *device) { + bool changed = false; struct drbd_device *odev; - int i, rv = 0; + int i; rcu_read_lock(); idr_for_each_entry(&drbd_devices, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; if (odev->state.aftr_isp) { - if (_drbd_may_sync_now(odev)) - rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), - CS_HARD, NULL) - != SS_NOTHING_TO_DO) ; + if (_drbd_may_sync_now(odev) && + _drbd_set_state(_NS(odev, aftr_isp, 0), + CS_HARD, NULL) != SS_NOTHING_TO_DO) + changed = true; } } rcu_read_unlock(); - return rv; + return changed; } void resume_next_sg(struct drbd_device *device) { - write_lock_irq(&global_state_lock); - _drbd_resume_next(device); - write_unlock_irq(&global_state_lock); + lock_all_resources(); + drbd_resume_next(device); + unlock_all_resources(); } void suspend_other_sg(struct drbd_device *device) { - write_lock_irq(&global_state_lock); - _drbd_pause_after(device); - write_unlock_irq(&global_state_lock); + lock_all_resources(); + drbd_pause_after(device); + unlock_all_resources(); } -/* caller must hold global_state_lock */ +/* caller must lock_all_resources() */ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor) { struct drbd_device *odev; @@ -1557,15 +1553,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_min } } -/* caller must hold global_state_lock */ +/* caller must lock_all_resources() */ void drbd_resync_after_changed(struct drbd_device *device) { - int changes; + int changed; do { - changes = _drbd_pause_after(device); - changes |= _drbd_resume_next(device); - } while (changes); + changed = drbd_pause_after(device); + changed |= drbd_resume_next(device); + } while (changed); } void drbd_rs_controller_reset(struct drbd_device *device) @@ -1685,19 +1681,14 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } else { mutex_lock(device->state_mutex); } - clear_bit(B_RS_H_DONE, &device->flags); - /* req_lock: serialize with drbd_send_and_submit() and others - * global_state_lock: for stable sync-after dependencies */ - spin_lock_irq(&device->resource->req_lock); - write_lock(&global_state_lock); + lock_all_resources(); + clear_bit(B_RS_H_DONE, &device->flags); /* Did some connection breakage or IO error race with us? */ if (device->state.conn < C_CONNECTED || !get_ldev_if_state(device, D_NEGOTIATING)) { - write_unlock(&global_state_lock); - spin_unlock_irq(&device->resource->req_lock); - mutex_unlock(device->state_mutex); - return; + unlock_all_resources(); + goto out; } ns = drbd_read_state(device); @@ -1711,7 +1702,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) else /* side == C_SYNC_SOURCE */ ns.pdsk = D_INCONSISTENT; - r = __drbd_set_state(device, ns, CS_VERBOSE, NULL); + r = _drbd_set_state(device, ns, CS_VERBOSE, NULL); ns = drbd_read_state(device); if (ns.conn < C_CONNECTED) @@ -1732,7 +1723,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->rs_mark_left[i] = tw; device->rs_mark_time[i] = now; } - _drbd_pause_after(device); + drbd_pause_after(device); /* Forget potentially stale cached per resync extent bit-counts. * Open coded drbd_rs_cancel_all(device), we already have IRQs * disabled, and know the disk state is ok. */ @@ -1742,8 +1733,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->resync_wenr = LC_FREE; spin_unlock(&device->al_lock); } - write_unlock(&global_state_lock); - spin_unlock_irq(&device->resource->req_lock); + unlock_all_resources(); if (r == SS_SUCCESS) { wake_up(&device->al_wait); /* for lc_reset() above */ @@ -1807,6 +1797,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) drbd_md_sync(device); } put_ldev(device); +out: mutex_unlock(device->state_mutex); } -- cgit v1.2.3 From 1c03e52083c8fa6e70a0b921d25d1916f68320fc Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 16 Mar 2015 15:01:00 +0100 Subject: drbd: Rename asender to ack_receiver This prepares the next patch where the sending on the meta (or control) socket is moved to a dedicated workqueue. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 6 +++--- drivers/block/drbd/drbd_main.c | 10 +++++----- drivers/block/drbd/drbd_receiver.c | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'drivers/block/drbd/drbd_main.c') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 1d00f2e061c5..dee629797d0f 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -754,7 +754,7 @@ struct drbd_connection { unsigned long last_reconnect_jif; struct drbd_thread receiver; struct drbd_thread worker; - struct drbd_thread asender; + struct drbd_thread ack_receiver; /* cached pointers, * so we can look up the oldest pending requests more quickly. @@ -1557,7 +1557,7 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); /* drbd_receiver.c */ extern int drbd_receiver(struct drbd_thread *thi); -extern int drbd_asender(struct drbd_thread *thi); +extern int drbd_ack_receiver(struct drbd_thread *thi); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, bool throttle_if_app_is_waiting); @@ -1971,7 +1971,7 @@ extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); static inline void wake_asender(struct drbd_connection *connection) { if (test_bit(SIGNAL_ASENDER, &connection->flags)) - force_sig(DRBD_SIG, connection->asender.task); + force_sig(DRBD_SIG, connection->ack_receiver.task); } static inline void request_ping(struct drbd_connection *connection) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f66294db3b08..445f2c8bfa1b 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1436,8 +1436,8 @@ static int we_should_drop_the_connection(struct drbd_connection *connection, str /* long elapsed = (long)(jiffies - device->last_received); */ drop_it = connection->meta.socket == sock - || !connection->asender.task - || get_t_state(&connection->asender) != RUNNING + || !connection->ack_receiver.task + || get_t_state(&connection->ack_receiver) != RUNNING || connection->cstate < C_WF_REPORT_PARAMS; if (drop_it) @@ -2564,7 +2564,7 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op cpumask_copy(resource->cpu_mask, new_cpu_mask); for_each_connection_rcu(connection, resource) { connection->receiver.reset_cpu_mask = 1; - connection->asender.reset_cpu_mask = 1; + connection->ack_receiver.reset_cpu_mask = 1; connection->worker.reset_cpu_mask = 1; } } @@ -2653,8 +2653,8 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) connection->receiver.connection = connection; drbd_thread_init(resource, &connection->worker, drbd_worker, "worker"); connection->worker.connection = connection; - drbd_thread_init(resource, &connection->asender, drbd_asender, "asender"); - connection->asender.connection = connection; + drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv"); + connection->ack_receiver.connection = connection; kref_init(&connection->kref); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 61b73c77a690..eed4ae9107b4 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1099,7 +1099,7 @@ randomize: return 0; } - drbd_thread_start(&connection->asender); + drbd_thread_start(&connection->ack_receiver); mutex_lock(&connection->resource->conf_update); /* The discard_my_data flag is a single-shot modifier to the next @@ -4656,7 +4656,7 @@ static void conn_disconnect(struct drbd_connection *connection) conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); /* asender does not clean up anything. it must not interfere, either */ - drbd_thread_stop(&connection->asender); + drbd_thread_stop(&connection->ack_receiver); drbd_free_sock(connection); rcu_read_lock(); @@ -5487,7 +5487,7 @@ static struct asender_cmd asender_tbl[] = { [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, }; -int drbd_asender(struct drbd_thread *thi) +int drbd_ack_receiver(struct drbd_thread *thi) { struct drbd_connection *connection = thi->connection; struct asender_cmd *cmd = NULL; -- cgit v1.2.3 From 668700b40a7c8727bbd2b3fd4fd22e0ce3f1aeb6 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 16 Mar 2015 16:08:29 +0100 Subject: drbd: Create a dedicated workqueue for sending acks on the control connection The intention is to reduce CPU utilization. Recent measurements unveiled that the current performance bottleneck is CPU utilization on the receiving node. The asender thread became CPU limited. One of the main points is to eliminate the idr_for_each_entry() loop from the sending acks code path. One exception in that is sending back ping_acks. These stay in the ack-receiver thread. Otherwise the logic becomes too complicated for no added value. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 27 ++--- drivers/block/drbd/drbd_main.c | 10 +- drivers/block/drbd/drbd_nl.c | 4 +- drivers/block/drbd/drbd_protocol.h | 2 +- drivers/block/drbd/drbd_receiver.c | 203 +++++++++++++++++++++---------------- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/drbd/drbd_worker.c | 8 +- 7 files changed, 141 insertions(+), 115 deletions(-) (limited to 'drivers/block/drbd/drbd_main.c') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index dee629797d0f..3efaf181438c 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -77,13 +77,6 @@ extern int fault_devs; extern char usermode_helper[]; -/* I don't remember why XCPU ... - * This is used to wake the asender, - * and to interrupt sending the sending task - * on disconnect. - */ -#define DRBD_SIG SIGXCPU - /* This is used to stop/restart our threads. * Cannot use SIGTERM nor SIGKILL, since these * are sent out by init on runlevel changes @@ -647,8 +640,7 @@ extern struct fifo_buffer *fifo_alloc(int fifo_size); enum { NET_CONGESTED, /* The data socket is congested */ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */ - SEND_PING, /* whether asender should send a ping asap */ - SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + SEND_PING, GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ CONN_WD_ST_CHG_OKAY, @@ -755,6 +747,7 @@ struct drbd_connection { struct drbd_thread receiver; struct drbd_thread worker; struct drbd_thread ack_receiver; + struct workqueue_struct *ack_sender; /* cached pointers, * so we can look up the oldest pending requests more quickly. @@ -823,6 +816,7 @@ struct drbd_peer_device { struct list_head peer_devices; struct drbd_device *device; struct drbd_connection *connection; + struct work_struct send_acks_work; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_peer_dev; #endif @@ -1558,6 +1552,8 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); /* drbd_receiver.c */ extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_ack_receiver(struct drbd_thread *thi); +extern void drbd_send_ping_wf(struct work_struct *ws); +extern void drbd_send_acks_wf(struct work_struct *ws); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, bool throttle_if_app_is_waiting); @@ -1968,16 +1964,21 @@ drbd_device_post_work(struct drbd_device *device, int work_bit) extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); -static inline void wake_asender(struct drbd_connection *connection) +/* To get the ack_receiver out of the blocking network stack, + * so it can change its sk_rcvtimeo from idle- to ping-timeout, + * and send a ping, we need to send a signal. + * Which signal we send is irrelevant. */ +static inline void wake_ack_receiver(struct drbd_connection *connection) { - if (test_bit(SIGNAL_ASENDER, &connection->flags)) - force_sig(DRBD_SIG, connection->ack_receiver.task); + struct task_struct *task = connection->ack_receiver.task; + if (task && get_t_state(&connection->ack_receiver) == RUNNING) + force_sig(SIGXCPU, task); } static inline void request_ping(struct drbd_connection *connection) { set_bit(SEND_PING, &connection->flags); - wake_asender(connection); + wake_ack_receiver(connection); } extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 445f2c8bfa1b..938bca2df027 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1794,15 +1794,6 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock, drbd_update_congested(connection); } do { - /* STRANGE - * tcp_sendmsg does _not_ use its size parameter at all ? - * - * -EAGAIN on timeout, -EINTR on signal. - */ -/* THINK - * do we need to block DRBD_SIG if sock == &meta.socket ?? - * otherwise wake_asender() might interrupt some send_*Ack ! - */ rv = kernel_sendmsg(sock, &msg, &iov, 1, size); if (rv == -EAGAIN) { if (we_should_drop_the_connection(connection, sock)) @@ -2821,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig goto out_idr_remove_from_resource; } kref_get(&connection->kref); + INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf); } if (init_submitter(device)) { diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 79dc3d4f5aee..f35cefb20e25 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1258,8 +1258,8 @@ static void conn_reconfig_done(struct drbd_connection *connection) connection->cstate == C_STANDALONE; spin_unlock_irq(&connection->resource->req_lock); if (stop_threads) { - /* asender is implicitly stopped by receiver - * in conn_disconnect() */ + /* ack_receiver thread and ack_sender workqueue are implicitly + * stopped by receiver in conn_disconnect() */ drbd_thread_stop(&connection->receiver); drbd_thread_stop(&connection->worker); } diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index 2da9104a3851..ef9245363dcc 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -23,7 +23,7 @@ enum drbd_packet { P_AUTH_RESPONSE = 0x11, P_STATE_CHG_REQ = 0x12, - /* asender (meta socket */ + /* (meta socket) */ P_PING = 0x13, P_PING_ACK = 0x14, P_RECV_ACK = 0x15, /* Used in protocol B */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index eed4ae9107b4..ea54341df3bf 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -215,7 +215,7 @@ static void reclaim_finished_net_peer_reqs(struct drbd_device *device, } } -static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) +static void drbd_reclaim_net_peer_reqs(struct drbd_device *device) { LIST_HEAD(reclaimed); struct drbd_peer_request *peer_req, *t; @@ -223,11 +223,30 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) spin_lock_irq(&device->resource->req_lock); reclaim_finished_net_peer_reqs(device, &reclaimed); spin_unlock_irq(&device->resource->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) drbd_free_net_peer_req(device, peer_req); } +static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (!atomic_read(&device->pp_in_use_by_net)) + continue; + + kref_get(&device->kref); + rcu_read_unlock(); + drbd_reclaim_net_peer_reqs(device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + /** * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) * @device: DRBD device. @@ -265,10 +284,15 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int if (atomic_read(&device->pp_in_use) < mxb) page = __drbd_alloc_pages(device, number); + /* Try to keep the fast path fast, but occasionally we need + * to reclaim the pages we lended to the network stack. */ + if (page && atomic_read(&device->pp_in_use_by_net) > 512) + drbd_reclaim_net_peer_reqs(device); + while (page == NULL) { prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); - drbd_kick_lo_and_reclaim_net(device); + drbd_reclaim_net_peer_reqs(device); if (atomic_read(&device->pp_in_use) < mxb) { page = __drbd_alloc_pages(device, number); @@ -1100,6 +1124,11 @@ randomize: } drbd_thread_start(&connection->ack_receiver); + connection->ack_sender = create_singlethread_workqueue("drbd_ack_sender"); + if (!connection->ack_sender) { + drbd_err(connection, "Failed to create workqueue ack_sender\n"); + return 0; + } mutex_lock(&connection->resource->conf_update); /* The discard_my_data flag is a single-shot modifier to the next @@ -1746,7 +1775,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req } /* - * e_end_resync_block() is called in asender context via + * e_end_resync_block() is called in ack_sender context via * drbd_finish_peer_reqs(). */ static int e_end_resync_block(struct drbd_work *w, int unused) @@ -1920,7 +1949,7 @@ static void restart_conflicting_writes(struct drbd_device *device, } /* - * e_end_block() is called in asender context via drbd_finish_peer_reqs(). + * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs(). */ static int e_end_block(struct drbd_work *w, int cancel) { @@ -2211,7 +2240,7 @@ static int handle_write_conflicts(struct drbd_device *device, peer_req->w.cb = superseded ? e_send_superseded : e_send_retry_write; list_add_tail(&peer_req->w.list, &device->done_ee); - wake_asender(connection); + queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work); err = -ENOENT; goto out; @@ -4050,7 +4079,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info os = ns = drbd_read_state(device); spin_unlock_irq(&device->resource->req_lock); - /* If some other part of the code (asender thread, timeout) + /* If some other part of the code (ack_receiver thread, timeout) * already decided to close the connection again, * we must not "re-establish" it here. */ if (os.conn <= C_TEAR_DOWN) @@ -4655,8 +4684,12 @@ static void conn_disconnect(struct drbd_connection *connection) */ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); - /* asender does not clean up anything. it must not interfere, either */ + /* ack_receiver does not clean up anything. it must not interfere, either */ drbd_thread_stop(&connection->ack_receiver); + if (connection->ack_sender) { + destroy_workqueue(connection->ack_sender); + connection->ack_sender = NULL; + } drbd_free_sock(connection); rcu_read_lock(); @@ -5425,49 +5458,39 @@ static int got_skip(struct drbd_connection *connection, struct packet_info *pi) return 0; } -static int connection_finish_peer_reqs(struct drbd_connection *connection) +struct meta_sock_cmd { + size_t pkt_size; + int (*fn)(struct drbd_connection *connection, struct packet_info *); +}; + +static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout) { - struct drbd_peer_device *peer_device; - int vnr, not_empty = 0; + long t; + struct net_conf *nc; - do { - clear_bit(SIGNAL_ASENDER, &connection->flags); - flush_signals(current); + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + t = ping_timeout ? nc->ping_timeo : nc->ping_int; + rcu_read_unlock(); - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - kref_get(&device->kref); - rcu_read_unlock(); - if (drbd_finish_peer_reqs(device)) { - kref_put(&device->kref, drbd_destroy_device); - return 1; - } - kref_put(&device->kref, drbd_destroy_device); - rcu_read_lock(); - } - set_bit(SIGNAL_ASENDER, &connection->flags); + t *= HZ; + if (ping_timeout) + t /= 10; - spin_lock_irq(&connection->resource->req_lock); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - not_empty = !list_empty(&device->done_ee); - if (not_empty) - break; - } - spin_unlock_irq(&connection->resource->req_lock); - rcu_read_unlock(); - } while (not_empty); + connection->meta.socket->sk->sk_rcvtimeo = t; +} - return 0; +static void set_ping_timeout(struct drbd_connection *connection) +{ + set_rcvtimeo(connection, 1); } -struct asender_cmd { - size_t pkt_size; - int (*fn)(struct drbd_connection *connection, struct packet_info *); -}; +static void set_idle_timeout(struct drbd_connection *connection) +{ + set_rcvtimeo(connection, 0); +} -static struct asender_cmd asender_tbl[] = { +static struct meta_sock_cmd ack_receiver_tbl[] = { [P_PING] = { 0, got_Ping }, [P_PING_ACK] = { 0, got_PingAck }, [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, @@ -5490,61 +5513,37 @@ static struct asender_cmd asender_tbl[] = { int drbd_ack_receiver(struct drbd_thread *thi) { struct drbd_connection *connection = thi->connection; - struct asender_cmd *cmd = NULL; + struct meta_sock_cmd *cmd = NULL; struct packet_info pi; + unsigned long pre_recv_jif; int rv; void *buf = connection->meta.rbuf; int received = 0; unsigned int header_size = drbd_header_size(connection); int expect = header_size; bool ping_timeout_active = false; - struct net_conf *nc; - int ping_timeo, tcp_cork, ping_int; struct sched_param param = { .sched_priority = 2 }; rv = sched_setscheduler(current, SCHED_RR, ¶m); if (rv < 0) - drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv); + drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv); while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); - rcu_read_lock(); - nc = rcu_dereference(connection->net_conf); - ping_timeo = nc->ping_timeo; - tcp_cork = nc->tcp_cork; - ping_int = nc->ping_int; - rcu_read_unlock(); + conn_reclaim_net_peer_reqs(connection); if (test_and_clear_bit(SEND_PING, &connection->flags)) { if (drbd_send_ping(connection)) { drbd_err(connection, "drbd_send_ping has failed\n"); goto reconnect; } - connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; + set_ping_timeout(connection); ping_timeout_active = true; } - /* TODO: conditionally cork; it may hurt latency if we cork without - much to send */ - if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); - if (connection_finish_peer_reqs(connection)) { - drbd_err(connection, "connection_finish_peer_reqs() failed\n"); - goto reconnect; - } - /* but unconditionally uncork unless disabled */ - if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); - - /* short circuit, recv_msg would return EINTR anyways. */ - if (signal_pending(current)) - continue; - + pre_recv_jif = jiffies; rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); - clear_bit(SIGNAL_ASENDER, &connection->flags); - - flush_signals(current); /* Note: * -EINTR (on meta) we got a signal @@ -5556,7 +5555,6 @@ int drbd_ack_receiver(struct drbd_thread *thi) * rv < expected: "woken" by signal during receive * rv == 0 : "connection shut down by peer" */ -received_more: if (likely(rv > 0)) { received += rv; buf += rv; @@ -5578,8 +5576,7 @@ received_more: } else if (rv == -EAGAIN) { /* If the data socket received something meanwhile, * that is good enough: peer is still alive. */ - if (time_after(connection->last_received, - jiffies - connection->meta.socket->sk->sk_rcvtimeo)) + if (time_after(connection->last_received, pre_recv_jif)) continue; if (ping_timeout_active) { drbd_err(connection, "PingAck did not arrive in time.\n"); @@ -5588,6 +5585,10 @@ received_more: set_bit(SEND_PING, &connection->flags); continue; } else if (rv == -EINTR) { + /* maybe drbd_thread_stop(): the while condition will notice. + * maybe woken for send_ping: we'll send a ping above, + * and change the rcvtimeo */ + flush_signals(current); continue; } else { drbd_err(connection, "sock_recvmsg returned %d\n", rv); @@ -5597,8 +5598,8 @@ received_more: if (received == expect && cmd == NULL) { if (decode_header(connection, connection->meta.rbuf, &pi)) goto reconnect; - cmd = &asender_tbl[pi.cmd]; - if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { + cmd = &ack_receiver_tbl[pi.cmd]; + if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) { drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", cmdname(pi.cmd), pi.cmd); goto disconnect; @@ -5621,9 +5622,8 @@ received_more: connection->last_received = jiffies; - if (cmd == &asender_tbl[P_PING_ACK]) { - /* restore idle timeout */ - connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; + if (cmd == &ack_receiver_tbl[P_PING_ACK]) { + set_idle_timeout(connection); ping_timeout_active = false; } @@ -5632,11 +5632,6 @@ received_more: expect = header_size; cmd = NULL; } - if (test_bit(SEND_PING, &connection->flags)) - continue; - rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT); - if (rv > 0) - goto received_more; } if (0) { @@ -5648,9 +5643,41 @@ reconnect: disconnect: conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); } - clear_bit(SIGNAL_ASENDER, &connection->flags); - drbd_info(connection, "asender terminated\n"); + drbd_info(connection, "ack_receiver terminated\n"); return 0; } + +void drbd_send_acks_wf(struct work_struct *ws) +{ + struct drbd_peer_device *peer_device = + container_of(ws, struct drbd_peer_device, send_acks_work); + struct drbd_connection *connection = peer_device->connection; + struct drbd_device *device = peer_device->device; + struct net_conf *nc; + int tcp_cork, err; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + tcp_cork = nc->tcp_cork; + rcu_read_unlock(); + + if (tcp_cork) + drbd_tcp_cork(connection->meta.socket); + + err = drbd_finish_peer_reqs(device); + kref_put(&device->kref, drbd_destroy_device); + /* get is in drbd_endio_write_sec_final(). That is necessary to keep the + struct work_struct send_acks_work alive, which is in the peer_device object */ + + if (err) { + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + return; + } + + if (tcp_cork) + drbd_tcp_uncork(connection->meta.socket); + + return; +} diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3add7c5e97e0..7907fb562388 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -453,7 +453,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, kref_get(&req->kref); /* wait for the DONE */ if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { - /* potentially already completed in the asender thread */ + /* potentially already completed in the ack_receiver thread */ if (!(s & RQ_NET_DONE)) { atomic_add(req->i.size >> 9, &device->ap_in_flight); set_if_null_req_not_net_done(peer_device, req); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 8bbabe37ef0d..2f29bf3e4dba 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -113,6 +113,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l unsigned long flags = 0; struct drbd_peer_device *peer_device = peer_req->peer_device; struct drbd_device *device = peer_device->device; + struct drbd_connection *connection = peer_device->connection; struct drbd_interval i; int do_wake; u64 block_id; @@ -145,6 +146,12 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ if (peer_req->flags & EE_WAS_ERROR) __drbd_chk_io_error(device, DRBD_WRITE_ERROR); + + if (connection->cstate >= C_WF_REPORT_PARAMS) { + kref_get(&device->kref); /* put is in drbd_send_acks_wf() */ + if (!queue_work(connection->ack_sender, &peer_device->send_acks_work)) + kref_put(&device->kref, drbd_destroy_device); + } spin_unlock_irqrestore(&device->resource->req_lock, flags); if (block_id == ID_SYNCER) @@ -156,7 +163,6 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l if (do_al_complete_io) drbd_al_complete_io(device, &i); - wake_asender(peer_device->connection); put_ldev(device); } -- cgit v1.2.3 From 39e91a60c823603d6377cc8fa0b0bf301d1966eb Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 24 Mar 2015 10:40:26 +0100 Subject: drbd: use resource name in workqueue Since kernel 3.3, we can use snprintf-style arguments to create a workqueue. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 4 ++-- drivers/block/drbd/drbd_receiver.c | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers/block/drbd/drbd_main.c') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 938bca2df027..3a9a0f112004 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2694,8 +2694,8 @@ static int init_submitter(struct drbd_device *device) { /* opencoded create_singlethread_workqueue(), * to be able to say "drbd%d", ..., minor */ - device->submit.wq = alloc_workqueue("drbd%u_submit", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor); + device->submit.wq = + alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor); if (!device->submit.wq) return -ENOMEM; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index ea54341df3bf..1957fe8601dc 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1124,7 +1124,10 @@ randomize: } drbd_thread_start(&connection->ack_receiver); - connection->ack_sender = create_singlethread_workqueue("drbd_ack_sender"); + /* opencoded create_singlethread_workqueue(), + * to be able to use format string arguments */ + connection->ack_sender = + alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name); if (!connection->ack_sender) { drbd_err(connection, "Failed to create workqueue ack_sender\n"); return 0; -- cgit v1.2.3 From edb5e5f63d80c368467e4a0628c5b6d30f1673eb Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 24 Mar 2015 21:35:26 +0100 Subject: drbd: fix spurious alert level printk When accessing out meta data area on disk, we double check the plausibility of the requested sector offsets, and are very noisy about it if they look suspicious. During initial read of our "superblock", for "external" meta data, this triggered because the range estimate returned by drbd_md_last_sector() was still wrong. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/block/drbd/drbd_main.c') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 3a9a0f112004..a4aa7eb5507d 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3270,6 +3270,10 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev) * and read it. */ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; bdev->md.md_offset = drbd_md_ss(bdev); + /* Even for (flexible or indexed) external meta data, + * initially restrict us to the 4k superblock for now. + * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */ + bdev->md.md_size_sect = 8; if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is -- cgit v1.2.3 From 63a7c8ad92af5f57d4a2c5be223d6ca424c3670b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 26 Mar 2015 20:53:55 +0100 Subject: drbd: make drbd known to lsblk: use bd_link_disk_holder lsblk should be able to pick up stacking device driver relations involving DRBD conveniently. Even though upstream kernel since 2011 says "DON'T USE THIS UNLESS YOU'RE ALREADY USING IT." a new user has been added since (bcache), which sets the precedences for us to use it as well. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_main.c | 16 +----- drivers/block/drbd/drbd_nl.c | 121 ++++++++++++++++++++++++++++----------- drivers/block/drbd/drbd_worker.c | 2 +- include/linux/drbd.h | 2 +- 5 files changed, 91 insertions(+), 52 deletions(-) (limited to 'drivers/block/drbd/drbd_main.c') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 08f266eab3ee..a26265375e1e 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1126,7 +1126,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int extern int drbd_send_bitmap(struct drbd_device *device); extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); -extern void drbd_free_ldev(struct drbd_backing_dev *ldev); +extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev); extern void drbd_device_cleanup(struct drbd_device *device); void drbd_print_uuids(struct drbd_device *device, const char *text); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a4aa7eb5507d..136fa733a15e 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1992,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device) drbd_bm_cleanup(device); } - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; clear_bit(AL_SUSPENDED, &device->flags); @@ -2171,7 +2171,7 @@ void drbd_destroy_device(struct kref *kref) if (device->this_bdev) bdput(device->this_bdev); - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; drbd_release_all_peer_reqs(device); @@ -2964,18 +2964,6 @@ fail: return err; } -void drbd_free_ldev(struct drbd_backing_dev *ldev) -{ - if (ldev == NULL) - return; - - blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - - kfree(ldev->disk_conf); - kfree(ldev); -} - static void drbd_free_one_sock(struct drbd_socket *ds) { struct socket *s; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 4703f1ad7f92..ee34739ee9ff 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1471,6 +1471,88 @@ success: return 0; } +static struct block_device *open_backing_dev(struct drbd_device *device, + const char *bdev_path, void *claim_ptr, bool do_bd_link) +{ + struct block_device *bdev; + int err = 0; + + bdev = blkdev_get_by_path(bdev_path, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr); + if (IS_ERR(bdev)) { + drbd_err(device, "open(\"%s\") failed with %ld\n", + bdev_path, PTR_ERR(bdev)); + return bdev; + } + + if (!do_bd_link) + return bdev; + + err = bd_link_disk_holder(bdev, device->vdisk); + if (err) { + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n", + bdev_path, err); + bdev = ERR_PTR(err); + } + return bdev; +} + +static int open_backing_devices(struct drbd_device *device, + struct disk_conf *new_disk_conf, + struct drbd_backing_dev *nbc) +{ + struct block_device *bdev; + + bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true); + if (IS_ERR(bdev)) + return ERR_OPEN_DISK; + nbc->backing_bdev = bdev; + + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! (if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = open_backing_dev(device, new_disk_conf->meta_dev, + /* claim ptr: device, if claimed exclusively; shared drbd_m_holder, + * if potentially shared with other drbd minors */ + (new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder, + /* avoid double bd_claim_by_disk() for the same (source,target) tuple, + * as would happen with internal metadata. */ + (new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT && + new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL)); + if (IS_ERR(bdev)) + return ERR_OPEN_MD_DISK; + nbc->md_bdev = bdev; + return NO_ERROR; +} + +static void close_backing_dev(struct drbd_device *device, struct block_device *bdev, + bool do_bd_unlink) +{ + if (!bdev) + return; + if (do_bd_unlink) + bd_unlink_disk_holder(bdev, device->vdisk); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +} + +void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev) +{ + if (ldev == NULL) + return; + + close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev); + close_backing_dev(device, ldev->backing_bdev, true); + + kfree(ldev->disk_conf); + kfree(ldev); +} + int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -1484,7 +1566,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) sector_t min_md_device_sectors; struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ struct disk_conf *new_disk_conf = NULL; - struct block_device *bdev; struct lru_cache *resync_lru = NULL; struct fifo_buffer *new_plan = NULL; union drbd_state ns, os; @@ -1572,35 +1653,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } rcu_read_unlock(); - bdev = blkdev_get_by_path(new_disk_conf->backing_dev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, device); - if (IS_ERR(bdev)) { - drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, - PTR_ERR(bdev)); - retcode = ERR_OPEN_DISK; - goto fail; - } - nbc->backing_bdev = bdev; - - /* - * meta_dev_idx >= 0: external fixed size, possibly multiple - * drbd sharing one meta device. TODO in that case, paranoia - * check that [md_bdev, meta_dev_idx] is not yet used by some - * other drbd minor! (if you use drbd.conf + drbdadm, that - * should check it for you already; but if you don't, or - * someone fooled it, we need to double check here) - */ - bdev = blkdev_get_by_path(new_disk_conf->meta_dev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - (new_disk_conf->meta_dev_idx < 0) ? - (void *)device : (void *)drbd_m_holder); - if (IS_ERR(bdev)) { - drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, - PTR_ERR(bdev)); - retcode = ERR_OPEN_MD_DISK; + retcode = open_backing_devices(device, new_disk_conf, nbc); + if (retcode != NO_ERROR) goto fail; - } - nbc->md_bdev = bdev; if ((nbc->backing_bdev == nbc->md_bdev) != (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || @@ -1900,12 +1955,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) fail: conn_reconfig_done(connection); if (nbc) { - if (nbc->backing_bdev) - blkdev_put(nbc->backing_bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL); - if (nbc->md_bdev) - blkdev_put(nbc->md_bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL); + close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev); + close_backing_dev(device, nbc->backing_bdev, true); kfree(nbc); } kfree(new_disk_conf); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 2f29bf3e4dba..eff716c27b1f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1841,7 +1841,7 @@ static void drbd_ldev_destroy(struct drbd_device *device) device->act_log = NULL; __acquire(local); - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; __release(local); diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 392fc0edb516..d6b3c9943a2c 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -51,7 +51,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.5" +#define REL_VERSION "8.4.6" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 -- cgit v1.2.3 From 5bded4effb601d9f92382db38fd501f98692eb2d Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 16 Apr 2015 16:51:34 +0200 Subject: drbd: don't block forever in disconnect during resync if fencing=r-a-stonith Disconnect should wait for pending bitmap IO. But if that bitmap IO is not happening, because it is waiting for pending application IO, and there is no progress, because the fencing policy suspended application IO because of the disconnect, then we deadlock. The bitmap writeout in this case does not care for concurrent application IO, so there is no point waiting for it. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/block/drbd/drbd_main.c') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 136fa733a15e..5b43dfb79819 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3563,7 +3563,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device, spin_lock_irq(&device->resource->req_lock); set_bit(BITMAP_IO, &device->flags); - if (atomic_read(&device->ap_bio_cnt) == 0) { + /* don't wait for pending application IO if the caller indicates that + * application IO does not conflict anyways. */ + if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) { if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) drbd_queue_work(&first_peer_device(device)->connection->sender_work, &device->bm_io_work.w); -- cgit v1.2.3