From ae1ff3d623905947158fd3394854c23026337810 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 13 Jul 2015 14:31:28 +0300 Subject: iommu: iova: Move iova cache management to the iova library This is necessary to separate intel-iommu from the iova library. Signed-off-by: Sakari Ailus Signed-off-by: David Woodhouse --- include/linux/iova.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iova.h b/include/linux/iova.h index 3920a19d8194..92f7177db2ce 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -68,8 +68,8 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) return iova >> iova_shift(iovad); } -int iommu_iova_cache_init(void); -void iommu_iova_cache_destroy(void); +int iova_cache_get(void); +void iova_cache_put(void); struct iova *alloc_iova_mem(void); void free_iova_mem(struct iova *iova); -- cgit v1.2.3 From 30035e45753b708e7d47a98398500ca005e02b86 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 29 Apr 2015 12:52:04 -0400 Subject: string: provide strscpy() The strscpy() API is intended to be used instead of strlcpy(), and instead of most uses of strncpy(). - Unlike strlcpy(), it doesn't read from memory beyond (src + size). - Unlike strlcpy() or strncpy(), the API provides an easy way to check for destination buffer overflow: an -E2BIG error return value. - The provided implementation is robust in the face of the source buffer being asynchronously changed during the copy, unlike the current implementation of strlcpy(). - Unlike strncpy(), the destination buffer will be NUL-terminated if the string in the source buffer is too long. - Also unlike strncpy(), the destination buffer will not be updated beyond the NUL termination, avoiding strncpy's behavior of zeroing the entire tail end of the destination buffer. (A memset() after the strscpy() can be used if this behavior is desired.) - The implementation should be reasonably performant on all platforms since it uses the asm/word-at-a-time.h API rather than simple byte copy. Kernel-to-kernel string copy is not considered to be performance critical in any case. Signed-off-by: Chris Metcalf --- include/linux/string.h | 3 ++ lib/string.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index a8d90db9c4b0..9ef7795e65e4 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -25,6 +25,9 @@ extern char * strncpy(char *,const char *, __kernel_size_t); #ifndef __HAVE_ARCH_STRLCPY size_t strlcpy(char *, const char *, size_t); #endif +#ifndef __HAVE_ARCH_STRSCPY +ssize_t __must_check strscpy(char *, const char *, size_t); +#endif #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); #endif diff --git a/lib/string.c b/lib/string.c index 13d1e84ddb80..8dbb7b1eab50 100644 --- a/lib/string.c +++ b/lib/string.c @@ -27,6 +27,10 @@ #include #include +#include +#include +#include + #ifndef __HAVE_ARCH_STRNCASECMP /** * strncasecmp - Case insensitive, length-limited string comparison @@ -146,6 +150,90 @@ size_t strlcpy(char *dest, const char *src, size_t size) EXPORT_SYMBOL(strlcpy); #endif +#ifndef __HAVE_ARCH_STRSCPY +/** + * strscpy - Copy a C-string into a sized buffer + * @dest: Where to copy the string to + * @src: Where to copy the string from + * @count: Size of destination buffer + * + * Copy the string, or as much of it as fits, into the dest buffer. 
+ * The routine returns the number of characters copied (not including + * the trailing NUL) or -E2BIG if the destination buffer wasn't big enough. + * The behavior is undefined if the string buffers overlap. + * The destination buffer is always NUL terminated, unless it's zero-sized. + * + * Preferred to strlcpy() since the API doesn't require reading memory + * from the src string beyond the specified "count" bytes, and since + * the return value is easier to error-check than strlcpy()'s. + * In addition, the implementation is robust to the string changing out + * from underneath it, unlike the current strlcpy() implementation. + * + * Preferred to strncpy() since it always returns a valid string, and + * doesn't unnecessarily force the tail of the destination buffer to be + * zeroed. If the zeroing is desired, it's likely cleaner to use strscpy() + * with an overflow test, then just memset() the tail of the dest buffer. + */ +ssize_t strscpy(char *dest, const char *src, size_t count) +{ + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; + size_t max = count; + long res = 0; + + if (count == 0) + return -E2BIG; + +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + /* + * If src is unaligned, don't cross a page boundary, + * since we don't know if the next page is mapped. + */ + if ((long)src & (sizeof(long) - 1)) { + size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1)); + if (limit < max) + max = limit; + } +#else + /* If src or dest is unaligned, don't do word-at-a-time. */ + if (((long) dest | (long) src) & (sizeof(long) - 1)) + max = 0; +#endif + + while (max >= sizeof(unsigned long)) { + unsigned long c, data; + + c = *(unsigned long *)(src+res); + *(unsigned long *)(dest+res) = c; + if (has_zero(c, &data, &constants)) { + data = prep_zero_mask(c, data, &constants); + data = create_zero_mask(data); + return res + find_zero(data); + } + res += sizeof(unsigned long); + count -= sizeof(unsigned long); + max -= sizeof(unsigned long); + } + + while (count) { + char c; + + c = src[res]; + dest[res] = c; + if (!c) + return res; + res++; + count--; + } + + /* Hit buffer length without finding a NUL; force NUL-termination. */ + if (res) + dest[res-1] = '\0'; + + return -E2BIG; +} +EXPORT_SYMBOL(strscpy); +#endif + #ifndef __HAVE_ARCH_STRCAT /** * strcat - Append one %NUL-terminated string to another -- cgit v1.2.3 From cd33dc9ac2977ebe30cecbf39d2992190fbac5b4 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 8 Sep 2015 14:51:12 +0100 Subject: thermal: Fix thermal_zone_of_sensor_register to match documentation thermal_zone_of_sensor_register is documented as returning a pointer to either a valid thermal_zone_device on success, or a corresponding ERR_PTR() value. In contrast, the function returns NULL when THERMAL_OF is configured off. Fix this. 
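For illustration (a hypothetical consumer sketch, not part of the diff below; 'dev', 'data' and 'my_ops' are assumed driver-local names), the stub and the real implementation can now be error-checked the same way:

        tz = thermal_zone_of_sensor_register(dev, 0, data, &my_ops);
        if (IS_ERR(tz))
                return PTR_ERR(tz);     /* -ENODEV when THERMAL_OF is off */
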
Signed-off-by: Punit Agrawal Acked-by: Guenter Roeck Cc: Eduardo Valentin Cc: Zhang Rui Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 17292fee8686..0c5518e13584 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -360,7 +360,7 @@ static inline struct thermal_zone_device * thermal_zone_of_sensor_register(struct device *dev, int id, void *data, const struct thermal_zone_of_device_ops *ops) { - return NULL; + return ERR_PTR(-ENODEV); } static inline -- cgit v1.2.3 From c973c3bcec3752455c4d7545edd42935cd7942d9 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Mon, 14 Sep 2015 14:23:50 +0100 Subject: thermal: Add a function to get the minimum power The thermal core already has a function to get the maximum power of a cooling device: power_actor_get_max_power(). Add a function to get the minimum power of a cooling device. Cc: Zhang Rui Cc: Eduardo Valentin Reviewed-by: Daniel Kurtz Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- drivers/thermal/thermal_core.c | 28 ++++++++++++++++++++++++++++ include/linux/thermal.h | 6 ++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 5e5fc7015c7f..d9e525cc9c1c 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1012,6 +1012,34 @@ int power_actor_get_max_power(struct thermal_cooling_device *cdev, return cdev->ops->state2power(cdev, tz, 0, max_power); } +/** + * power_actor_get_min_power() - get the minimum power that a cdev can consume + * @cdev: pointer to &thermal_cooling_device + * @tz: a valid thermal zone device pointer + * @min_power: pointer in which to store the minimum power + * + * Calculate the minimum power consumption in milliwatts that the + * cooling device can currently consume and store it in @min_power. + * + * Return: 0 on success, -EINVAL if @cdev doesn't support the + * power_actor API or -E* on other error. 
+ */ +int power_actor_get_min_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, u32 *min_power) +{ + unsigned long max_state; + int ret; + + if (!cdev_is_power_actor(cdev)) + return -EINVAL; + + ret = cdev->ops->get_max_state(cdev, &max_state); + if (ret) + return ret; + + return cdev->ops->state2power(cdev, tz, max_state, min_power); +} + /** * power_actor_set_power() - limit the maximum power that a cooling device can consume * @cdev: pointer to &thermal_cooling_device diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 0c5518e13584..157d366e761b 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -380,6 +380,8 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) int power_actor_get_max_power(struct thermal_cooling_device *, struct thermal_zone_device *tz, u32 *max_power); +int power_actor_get_min_power(struct thermal_cooling_device *, + struct thermal_zone_device *tz, u32 *min_power); int power_actor_set_power(struct thermal_cooling_device *, struct thermal_instance *, u32); struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, @@ -415,6 +417,10 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) static inline int power_actor_get_max_power(struct thermal_cooling_device *cdev, struct thermal_zone_device *tz, u32 *max_power) { return 0; } +static inline int power_actor_get_min_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, + u32 *min_power) +{ return -ENODEV; } static inline int power_actor_set_power(struct thermal_cooling_device *cdev, struct thermal_instance *tz, u32 power) { return 0; } -- cgit v1.2.3 From 63cdbc06b357dcb3a7104a421ee4a4550d7fadfd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Sep 2015 17:06:27 +0200 Subject: netfilter: bridge: fix routing of bridge frames with call-iptables=1 We can't re-use the physoutdev storage area. 1. When using NFQUEUE in PREROUTING, we attempt to bump a bogus refcnt since nf_bridge->physoutdev is garbage (ipv4/ipv6 address) 2. for same reason, we crash in physdev match in FORWARD or later if skb is routed instead of bridged. This increases nf_bridge_info to 40 bytes, but we have no other choice. Fixes: 72b1e5e4cac7 ("netfilter: bridge: reduce nf_bridge_info to 32 bytes again") Reported-by: Sander Eikelenboom Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2738d355cdf9..9987af080fa0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -179,6 +179,9 @@ struct nf_bridge_info { u8 bridged_dnat:1; __u16 frag_max_size; struct net_device *physindev; + + /* always valid & non-NULL from FORWARD on, for physdev match */ + struct net_device *physoutdev; union { /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; @@ -189,9 +192,6 @@ struct nf_bridge_info { * skb is out in neigh layer. */ char neigh_header[8]; - - /* always valid & non-NULL from FORWARD on, for physdev match */ - struct net_device *physoutdev; }; }; #endif -- cgit v1.2.3 From 0c986253b939cc14c69d4adbe2b4121bdf4aa220 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Sep 2015 11:51:12 -0400 Subject: Revert "sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem" This reverts commit d59cfc09c32a2ae31f1c3bc2983a0cd79afb3f14. 
d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem") and b5ba75b5fc0e ("cgroup: simplify threadgroup locking") changed how cgroup synchronizes against task fork and exits so that it uses global percpu_rwsem instead of per-process rwsem; unfortunately, the write [un]lock paths of percpu_rwsem always involve synchronize_rcu_expedited() which turned out to be too expensive. Improvements for percpu_rwsem are scheduled to be merged in the coming v4.4-rc1 merge window which alleviates this issue. For now, revert the two commits to restore per-process rwsem. They will be re-applied for the v4.4-rc1 merge window. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com Reported-by: Christian Borntraeger Cc: Oleg Nesterov Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Paolo Bonzini Cc: stable@vger.kernel.org # v4.2+ --- include/linux/cgroup-defs.h | 27 ++-------------- include/linux/init_task.h | 8 +++++ include/linux/sched.h | 12 +++++++ kernel/cgroup.c | 77 +++++++++++++++++++++++++++++++++------------ kernel/fork.c | 4 +++ 5 files changed, 83 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4d8fcf2187dc..8492721b39be 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -473,31 +473,8 @@ struct cgroup_subsys { unsigned int depends_on; }; -extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; - -/** - * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups - * @tsk: target task - * - * Called from threadgroup_change_begin() and allows cgroup operations to - * synchronize against threadgroup changes using a percpu_rw_semaphore. - */ -static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) -{ - percpu_down_read(&cgroup_threadgroup_rwsem); -} - -/** - * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups - * @tsk: target task - * - * Called from threadgroup_change_end(). Counterpart of - * cgroup_threadcgroup_change_begin(). - */ -static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) -{ - percpu_up_read(&cgroup_threadgroup_rwsem); -} +void cgroup_threadgroup_change_begin(struct task_struct *tsk); +void cgroup_threadgroup_change_end(struct task_struct *tsk); #else /* CONFIG_CGROUPS */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index d0b380ee7d67..e38681f4912d 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,6 +25,13 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; +#ifdef CONFIG_CGROUPS +#define INIT_GROUP_RWSEM(sig) \ + .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem), +#else +#define INIT_GROUP_RWSEM(sig) +#endif + #ifdef CONFIG_CPUSETS #define INIT_CPUSET_SEQ(tsk) \ .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), @@ -57,6 +64,7 @@ extern struct fs_struct init_fs; INIT_PREV_CPUTIME(sig) \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ + INIT_GROUP_RWSEM(sig) \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index a4ab9daa387c..b7b9501b41af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -762,6 +762,18 @@ struct signal_struct { unsigned audit_tty_log_passwd; struct tty_audit_buf *tty_audit_buf; #endif +#ifdef CONFIG_CGROUPS + /* + * group_rwsem prevents new tasks from entering the threadgroup and + * member tasks from exiting,a more specifically, setting of + * PF_EXITING. 
fork and exit paths are protected with this rwsem + * using threadgroup_change_begin/end(). Users which require + * threadgroup to remain stable should use threadgroup_[un]lock() + * which also takes care of exec path. Currently, cgroup is the + * only user. + */ + struct rw_semaphore group_rwsem; +#endif oom_flags_t oom_flags; short oom_score_adj; /* OOM kill score adjustment */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 115091efa889..2c9eae6ad970 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -104,8 +103,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(release_agent_path_lock); -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; - #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ @@ -874,6 +871,48 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +void cgroup_threadgroup_change_begin(struct task_struct *tsk) +{ + down_read(&tsk->signal->group_rwsem); +} + +void cgroup_threadgroup_change_end(struct task_struct *tsk) +{ + up_read(&tsk->signal->group_rwsem); +} + +/** + * threadgroup_lock - lock threadgroup + * @tsk: member task of the threadgroup to lock + * + * Lock the threadgroup @tsk belongs to. No new task is allowed to enter + * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or + * change ->group_leader/pid. This is useful for cases where the threadgroup + * needs to stay stable across blockable operations. + * + * fork and exit explicitly call threadgroup_change_{begin|end}() for + * synchronization. While held, no new task will be added to threadgroup + * and no existing live task will have its PF_EXITING set. + * + * de_thread() does threadgroup_change_{begin|end}() when a non-leader + * sub-thread becomes a new leader. + */ +static void threadgroup_lock(struct task_struct *tsk) +{ + down_write(&tsk->signal->group_rwsem); +} + +/** + * threadgroup_unlock - unlock threadgroup + * @tsk: member task of the threadgroup to unlock + * + * Reverse threadgroup_lock(). + */ +static inline void threadgroup_unlock(struct task_struct *tsk) +{ + up_write(&tsk->signal->group_rwsem); +} + static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -2074,9 +2113,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, lockdep_assert_held(&css_set_rwsem); /* - * We are synchronized through cgroup_threadgroup_rwsem against - * PF_EXITING setting such that we can't race against cgroup_exit() - * changing the css_set to init_css_set and dropping the old one. + * We are synchronized through threadgroup_lock() against PF_EXITING + * setting such that we can't race against cgroup_exit() changing the + * css_set to init_css_set and dropping the old one. */ WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); @@ -2133,11 +2172,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * @src_cset and add it to @preloaded_csets, which should later be cleaned * up by cgroup_migrate_finish(). * - * This function may be called without holding cgroup_threadgroup_rwsem - * even if the target is a process. Threads may be created and destroyed - * but as long as cgroup_mutex is not dropped, no new css_set can be put - * into play and the preloaded css_sets are guaranteed to cover all - * migrations. 
+ * This function may be called without holding threadgroup_lock even if the + * target is a process. Threads may be created and destroyed but as long + * as cgroup_mutex is not dropped, no new css_set can be put into play and + * the preloaded css_sets are guaranteed to cover all migrations. */ static void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, @@ -2240,7 +2278,7 @@ err: * @threadgroup: whether @leader points to the whole process or a single task * * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding cgroup_threadgroup_rwsem. The + * process, the caller must be holding threadgroup_lock of @leader. The * caller is also responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). @@ -2368,7 +2406,7 @@ out_release_tset: * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * - * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. + * Call holding cgroup_mutex and threadgroup_lock of @leader. */ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) @@ -2490,7 +2528,7 @@ retry_find_task: get_task_struct(tsk); rcu_read_unlock(); - percpu_down_write(&cgroup_threadgroup_rwsem); + threadgroup_lock(tsk); if (threadgroup) { if (!thread_group_leader(tsk)) { /* @@ -2500,7 +2538,7 @@ retry_find_task: * try again; this is * "double-double-toil-and-trouble-check locking". */ - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(tsk); put_task_struct(tsk); goto retry_find_task; } @@ -2510,7 +2548,7 @@ retry_find_task: if (!ret) ret = cgroup_attach_task(cgrp, tsk, threadgroup); - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(tsk); put_task_struct(tsk); out_unlock_cgroup: @@ -2713,17 +2751,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; last_task = task; - percpu_down_write(&cgroup_threadgroup_rwsem); + threadgroup_lock(task); /* raced against de_thread() from another thread? 
*/ if (!thread_group_leader(task)) { - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(task); put_task_struct(task); continue; } ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(task); put_task_struct(task); if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) @@ -5045,7 +5083,6 @@ int __init cgroup_init(void) unsigned long key; int ssid, err; - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); diff --git a/kernel/fork.c b/kernel/fork.c index 7d5f0f118a63..2845623fb582 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1149,6 +1149,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->group_rwsem); +#endif + sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; -- cgit v1.2.3 From 0243ed44ad4a25dbd2e92ad97e5e12a1a6c72d6c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 15 Sep 2015 04:59:21 -0700 Subject: spi: fix kernel-doc warnings in spi.h Fix the following 'make htmldocs' warnings: .//include/linux/spi/spi.h:71: warning: No description found for parameter 'lock' .//include/linux/spi/spi.h:71: warning: Excess struct/union/enum/typedef member 'clock' description in 'spi_statistics' Signed-off-by: Geliang Tang Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 269e8afd3e2a..6b00f18f5e6b 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -34,7 +34,7 @@ extern struct bus_type spi_bus_type; /** * struct spi_statistics - statistics for spi transfers - * @clock: lock protecting this structure + * @lock: lock protecting this structure * * @messages: number of spi-messages handled * @transfers: number of spi_transfers handled -- cgit v1.2.3 From 0fdea1e8a2853f79d39b8555cc9de16a7e0ab26f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Sep 2015 23:43:17 -0400 Subject: SUNRPC: Ensure that we wait for connections to complete before retrying Commit 718ba5b87343, moved the responsibility for unlocking the socket to xs_tcp_setup_socket, meaning that the socket will be unlocked before we know that it has finished trying to connect. The following patch is based on an initial patch by Russell King to ensure that we delay clearing the XPRT_CONNECTING flag until we either know that we failed to initiate a connection attempt, or the connection attempt itself failed. 
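In outline (a simplified sketch of the change below, not the verbatim transport code), the connect path now marks the transport as connecting before issuing the non-blocking connect, and XPRT_CONNECTING is cleared only from the socket state-change callback once the outcome is known:

        set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);
        ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
        ...
        /* later, in xs_tcp_state_change(), once the connect has either
         * completed (TCP_ESTABLISHED) or failed (TCP_CLOSE) */
        if (test_and_clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state))
                xprt_clear_connecting(xprt);
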
Fixes: 718ba5b87343 ("SUNRPC: Add helpers to prevent socket create from racing") Reported-by: Russell King Reported-by: Russell King Tested-by: Russell King Tested-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtsock.h | 3 +++ net/sunrpc/xprtsock.c | 11 ++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 7591788e9fbf..357e44c1a46b 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -42,6 +42,7 @@ struct sock_xprt { /* * Connection of transports */ + unsigned long sock_state; struct delayed_work connect_worker; struct sockaddr_storage srcaddr; unsigned short srcport; @@ -76,6 +77,8 @@ struct sock_xprt { */ #define TCP_RPC_REPLY (1UL << 6) +#define XPRT_SOCK_CONNECTING 1U + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_XPRTSOCK_H */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d2dfbd043bea..c35038511686 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1438,6 +1438,7 @@ out: static void xs_tcp_state_change(struct sock *sk) { struct rpc_xprt *xprt; + struct sock_xprt *transport; read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) @@ -1449,13 +1450,12 @@ static void xs_tcp_state_change(struct sock *sk) sock_flag(sk, SOCK_ZAPPED), sk->sk_shutdown); + transport = container_of(xprt, struct sock_xprt, xprt); trace_rpc_socket_state_change(xprt, sk->sk_socket); switch (sk->sk_state) { case TCP_ESTABLISHED: spin_lock(&xprt->transport_lock); if (!xprt_test_and_set_connected(xprt)) { - struct sock_xprt *transport = container_of(xprt, - struct sock_xprt, xprt); /* Reset TCP record info */ transport->tcp_offset = 0; @@ -1464,6 +1464,8 @@ static void xs_tcp_state_change(struct sock *sk) transport->tcp_flags = TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; xprt->connect_cookie++; + clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); + xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, -EAGAIN); } @@ -1499,6 +1501,9 @@ static void xs_tcp_state_change(struct sock *sk) smp_mb__after_atomic(); break; case TCP_CLOSE: + if (test_and_clear_bit(XPRT_SOCK_CONNECTING, + &transport->sock_state)) + xprt_clear_connecting(xprt); xs_sock_mark_closed(xprt); } out: @@ -2182,6 +2187,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* Tell the socket layer to start connecting... 
*/ xprt->stat.connect_count++; xprt->stat.connect_start = jiffies; + set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); switch (ret) { case 0: @@ -2243,7 +2249,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EINPROGRESS: case -EALREADY: xprt_unlock_connect(xprt, transport); - xprt_clear_connecting(xprt); return; case -EINVAL: /* Happens, for instance, if the user specified a link -- cgit v1.2.3 From b7f76ea2ef6739ee484a165ffbac98deb855d3d3 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 18 Sep 2015 23:41:23 +0200 Subject: security: fix typo in security_task_prctl Signed-off-by: Jann Horn Reviewed-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- include/linux/security.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 79d85ddf8093..2f4c1f7aa7db 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -946,7 +946,7 @@ static inline int security_task_prctl(int option, unsigned long arg2, unsigned long arg4, unsigned long arg5) { - return cap_task_prctl(option, arg2, arg3, arg3, arg5); + return cap_task_prctl(option, arg2, arg3, arg4, arg5); } static inline void security_task_to_inode(struct task_struct *p, struct inode *inode) -- cgit v1.2.3 From 66e8c57da6bf6b847a48a5a6fda59512f733ed78 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 25 Aug 2015 20:45:18 +0200 Subject: rcu: Change _wait_rcu_gp() to work around GCC bug 67055 Code like this in inline functions confuses some recent versions of gcc: const int n = const-expr; whatever_t array[n]; For more details, see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67055#c13 This compiler bug results in the following failure after 114b7fd4b (rcu: Create rcu_sync infrastructure): In file included from include/linux/rcupdate.h:429:0, from include/linux/rcu_sync.h:5, from kernel/rcu/sync.c:1: include/linux/rcutiny.h: In function 'rcu_barrier_sched': include/linux/rcutiny.h:55:20: internal compiler error: Segmentation fault static inline void rcu_barrier_sched(void) This commit therefore eliminates the constant local variable in favor of direct use of the expression. Reported-and-tested-by: Mark Salter Reported-by: Guenter Roeck Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ff476515f716..581abf848566 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -230,12 +230,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array); #define _wait_rcu_gp(checktiny, ...) \ -do { \ - call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ - const int __n = ARRAY_SIZE(__crcu_array); \ - struct rcu_synchronize __rs_array[__n]; \ - \ - __wait_rcu_gp(checktiny, __n, __crcu_array, __rs_array); \ +do { \ + call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ + struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)]; \ + __wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array), \ + __crcu_array, __rs_array); \ } while (0) #define wait_rcu_gp(...) 
_wait_rcu_gp(false, __VA_ARGS__) -- cgit v1.2.3 From ac5be6b47e8bd25b62bed2c82cda7398999f59e9 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Sep 2015 14:58:49 -0700 Subject: userfaultfd: revert "userfaultfd: waitqueue: add nr wake parameter to __wake_up_locked_key" This reverts commit 51360155eccb907ff8635bd10fc7de876408c2e0 and adapts fs/userfaultfd.c to use the old version of that function. It didn't look robust to call __wake_up_common with "nr == 1" when we absolutely require wakeall semantics, but we've full control of what we insert in the two waitqueue heads of the blocked userfaults. No exclusive waitqueue risks to be inserted into those two waitqueue heads so we can as well stick to "nr == 1" of the old code and we can rely purely on the fact no waitqueue inserted in one of the two waitqueue heads we must enforce as wakeall, has wait->flags WQ_FLAG_EXCLUSIVE set. Signed-off-by: Andrea Arcangeli Cc: Dr. David Alan Gilbert Cc: Michael Ellerman Cc: Shuah Khan Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 8 ++++---- include/linux/wait.h | 5 ++--- kernel/sched/wait.c | 7 +++---- net/sunrpc/sched.c | 2 +- 4 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f9aeb40a7197..50311703135b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -467,8 +467,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * the fault_*wqh. */ spin_lock(&ctx->fault_pending_wqh.lock); - __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range); - __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range); + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range); spin_unlock(&ctx->fault_pending_wqh.lock); wake_up_poll(&ctx->fd_wqh, POLLHUP); @@ -650,10 +650,10 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx, spin_lock(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ if (waitqueue_active(&ctx->fault_pending_wqh)) - __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, range); if (waitqueue_active(&ctx->fault_wqh)) - __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range); spin_unlock(&ctx->fault_pending_wqh.lock); } diff --git a/include/linux/wait.h b/include/linux/wait.h index d3d077228d4c..1e1bf9f963a9 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -147,8 +147,7 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) typedef int wait_bit_action_f(struct wait_bit_key *); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, - void *key); +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); @@ -180,7 +179,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, (void *) (m)) #define wake_up_locked_poll(x, m) \ - __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m)) + __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) #define wake_up_interruptible_poll(x, 
m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) #define wake_up_interruptible_sync_poll(x, m) \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 272d9322bc5d..052e02672d12 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, - void *key) +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { - __wake_up_common(q, mode, nr, 0, key); + __wake_up_common(q, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, 1, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b140c092d226..337ca851a350 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task) clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); ret = atomic_dec_and_test(&task->tk_count); if (waitqueue_active(wq)) - __wake_up_locked_key(wq, TASK_NORMAL, 1, &k); + __wake_up_locked_key(wq, TASK_NORMAL, &k); spin_unlock_irqrestore(&wq->lock, flags); return ret; } -- cgit v1.2.3 From 2d8bff12699abc3a9bf886bb0b79f44d94d81496 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 23 Sep 2015 14:57:58 -0400 Subject: netpoll: Close race condition between poll_one_napi and napi_disable Drivers might call napi_disable while not holding the napi instance poll_lock. In those instances, it's possible for a race condition to exist between poll_one_napi and napi_disable. That is to say, poll_one_napi only tests the NAPI_STATE_SCHED bit to see if there is work to do during a poll, and as such the following may happen: CPU0 CPU1 ndo_tx_timeout napi_poll_dev napi_disable poll_one_napi test_and_set_bit (ret 0) test_bit (ret 1) reset adapter napi_poll_routine If the adapter gets a tx timeout without a napi instance scheduled, it's possible for the adapter to think it has exclusive access to the hardware (as the napi instance is now scheduled via the napi_disable call), while the netpoll code thinks there is simply work to do. The result is parallel hardware access leading to corrupt data structures in the driver, and a crash. Additionally, there is another, more critical race between netpoll and napi_disable. The disabled napi state is actually identical to the scheduled state for a given napi instance. The implication being that, if a napi instance is disabled, a netconsole instance would see the napi state of the device as having been scheduled, and poll it, likely while the driver was doing something requiring exclusive access. In the case above, it's fairly clear that not having the rings in a state ready to be polled will cause any number of crashes. The fix should be pretty easy. netpoll uses its own bit to indicate that the napi instance is in a state of being serviced by netpoll (NAPI_STATE_NPSVC). We can just gate disabling on that bit as well as the sched bit. 
That should prevent netpoll from conducting a napi poll if we convert its set_bit to a test_and_set_bit operation to provide mutual exclusion. Change notes: V2) Remove a trailing whitespace Resubmit with proper subject prefix V3) Clean up spacing nits Signed-off-by: Neil Horman CC: "David S. Miller" CC: jmaxwell@redhat.com Tested-by: jmaxwell@redhat.com Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 2 ++ net/core/netpoll.c | 10 ++++++++-- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 88a00694eda5..2d15e3831440 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -507,6 +507,7 @@ static inline void napi_enable(struct napi_struct *n) BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); + clear_bit(NAPI_STATE_NPSVC, &n->state); } #ifdef CONFIG_SMP diff --git a/net/core/dev.c b/net/core/dev.c index 877c84834d81..6bb6470f5b7b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4713,6 +4713,8 @@ void napi_disable(struct napi_struct *n) while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); + while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) + msleep(1); hrtimer_cancel(&n->timer); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 6aa3db8dfc3b..8bdada242a7d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -142,7 +142,7 @@ static void queue_process(struct work_struct *work) */ static int poll_one_napi(struct napi_struct *napi, int budget) { - int work; + int work = 0; /* net_rx_action's ->poll() invocations and our's are * synchronized by this test which is only made while * @@ -151,7 +151,12 @@ static int poll_one_napi(struct napi_struct *napi, int budget) if (!test_bit(NAPI_STATE_SCHED, &napi->state)) return budget; - set_bit(NAPI_STATE_NPSVC, &napi->state); + /* If we set this bit but see that it has already been set, + * that indicates that napi has been disabled and we need + * to abort this operation + */ + if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) + goto out; work = napi->poll(napi, budget); WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll); @@ -159,6 +164,7 @@ static int poll_one_napi(struct napi_struct *napi, int budget) clear_bit(NAPI_STATE_NPSVC, &napi->state); +out: return budget - work; } -- cgit v1.2.3 From b4fe8ba7a310da6a2b99e3abe67c7815198cde49 Mon Sep 17 00:00:00 2001 From: Qipeng Zha Date: Tue, 15 Sep 2015 00:39:17 +0800 Subject: regmap: Add generic macro to define regmap_irq Add REGMAP_IRQ_REG macro in regmap.h to define regmap_irq structures easily for other driver modules. Signed-off-by: Qipeng Zha Acked-by: Mark Brown Signed-off-by: Lee Jones --- include/linux/regmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 8fc0bfd8edc4..f6226976e158 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -791,6 +791,9 @@ struct regmap_irq { unsigned int mask; }; +#define REGMAP_IRQ_REG(_irq, _off, _mask) \ + [_irq] = { .reg_offset = (_off), .mask = (_mask) } + /** * Description of a generic regmap irq_chip. 
This is not intended to * handle every possible interrupt controller, but it should handle a -- cgit v1.2.3 From 9badce000e2ce68ba74838a3cd356dde58221c2f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Sep 2015 17:07:29 -0400 Subject: cgroup, writeback: don't enable cgroup writeback on traditional hierarchies inode_cgwb_enabled() gates cgroup writeback support. If it returns true, each inode is attached to the corresponding memory domain which gets mapped to io domain. It currently only tests whether the filesystem and bdi support cgroup writeback; however, cgroup writeback support doesn't work on traditional hierarchies and thus it should also test whether memcg and iocg are on the default hierarchy. This caused traditional hierarchy setups to hit the cgroup writeback path inadvertently and ended up creating separate writeback domains for each memcg and mapping them all to the root iocg uncovering a couple issues in the cgroup writeback path. cgroup writeback was never meant to be enabled on traditional hierarchies. Make inode_cgwb_enabled() test whether both memcg and iocg are on the default hierarchy. Signed-off-by: Tejun Heo Reported-by: Artem Bityutskiy Reported-by: Dexuan Cui Link: http://lkml.kernel.org/g/1443012552.19983.209.camel@gmail.com Link: http://lkml.kernel.org/g/f30d4a6aa8a546ff88f73021d026a453@SIXPR30MB031.064d.mgd.msft.net --- include/linux/backing-dev.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5a5d79ee256f..d5eb4ad1c534 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -252,13 +253,19 @@ int inode_congested(struct inode *inode, int cong_bits); * @inode: inode of interest * * cgroup writeback requires support from both the bdi and filesystem. - * Test whether @inode has both. + * Also, both memcg and iocg have to be on the default hierarchy. Test + * whether all conditions are met. + * + * Note that the test result may change dynamically on the same inode + * depending on how memcg and iocg are configured. */ static inline bool inode_cgwb_enabled(struct inode *inode) { struct backing_dev_info *bdi = inode_to_bdi(inode); - return bdi_cap_account_dirty(bdi) && + return cgroup_on_dfl(mem_cgroup_root_css->cgroup) && + cgroup_on_dfl(blkcg_root_css->cgroup) && + bdi_cap_account_dirty(bdi) && (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && (inode->i_sb->s_iflags & SB_I_CGROUPWB); } -- cgit v1.2.3 From 6ae459bdaaeebc632b16e54dcbabb490c6931d61 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 22 Sep 2015 12:57:53 -0700 Subject: skbuff: Fix skb checksum flag on skb pull VXLAN device can receive skb with checksum partial. But the checksum offset could be in outer header which is pulled on receive. This results in negative checksum offset for the skb. Such skb can cause the assert failure in skb_checksum_help(). Following patch fixes the bug by setting checksum-none while pulling outer header. Following is the kernel panic msg from old kernel hitting the bug. ------------[ cut here ]------------ kernel BUG at net/core/dev.c:1906! 
RIP: 0010:[] skb_checksum_help+0x144/0x150 Call Trace: [] queue_userspace_packet+0x408/0x470 [openvswitch] [] ovs_dp_upcall+0x5d/0x60 [openvswitch] [] ovs_dp_process_packet_with_key+0xe6/0x100 [openvswitch] [] ovs_dp_process_received_packet+0x4b/0x80 [openvswitch] [] ovs_vport_receive+0x2a/0x30 [openvswitch] [] vxlan_rcv+0x53/0x60 [openvswitch] [] vxlan_udp_encap_recv+0x8b/0xf0 [openvswitch] [] udp_queue_rcv_skb+0x2dc/0x3b0 [] __udp4_lib_rcv+0x1cf/0x6c0 [] udp_rcv+0x1a/0x20 [] ip_local_deliver_finish+0xdd/0x280 [] ip_local_deliver+0x88/0x90 [] ip_rcv_finish+0x10d/0x370 [] ip_rcv+0x235/0x300 [] __netif_receive_skb+0x55d/0x620 [] netif_receive_skb+0x80/0x90 [] virtnet_poll+0x555/0x6f0 [] net_rx_action+0x134/0x290 [] __do_softirq+0xa8/0x210 [] call_softirq+0x1c/0x30 [] do_softirq+0x65/0xa0 [] irq_exit+0x8e/0xb0 [] do_IRQ+0x63/0xe0 [] common_interrupt+0x6e/0x6e Reported-by: Anupam Chanda Signed-off-by: Pravin B Shelar Acked-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9987af080fa0..2b0a30a6e31c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2707,6 +2707,9 @@ static inline void skb_postpull_rcsum(struct sk_buff *skb, { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0)); + else if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_start_offset(skb) <= len) + skb->ip_summed = CHECKSUM_NONE; } unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); -- cgit v1.2.3 From 3e3aaf649416988ca8be4ad2c52dc24d8be7b46e Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:02 +0100 Subject: phy: fix mdiobus module safety Re-implement the mdiobus module refcounting to ensure that we actually ensure that the mdiobus module code does not go away while we might call into it. The old scheme using bus->dev.driver was buggy, because bus->dev is a class device which never has a struct device_driver associated with it, and hence the associated code trying to obtain a refcount did nothing useful. Instead, take the approach that other subsystems do: pass the module when calling mdiobus_register(), and record that in the mii_bus struct. When we need to increment the module use count in the phy code, use this stored pointer. When the phy is deteched, drop the module refcount, remembering that the phy device might go away at that point. This doesn't stop the mii_bus going away while there are in-use phys - it merely stops the underlying code vanishing. Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 5 +++-- drivers/net/phy/phy_device.c | 32 ++++++++++++++++++-------------- include/linux/phy.h | 5 ++++- 3 files changed, 25 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 67553e13bd36..992406624b7c 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -244,7 +244,7 @@ static inline void of_mdiobus_link_phydev(struct mii_bus *mdio, * * Returns 0 on success or < 0 on error. 
*/ -int mdiobus_register(struct mii_bus *bus) +int __mdiobus_register(struct mii_bus *bus, struct module *owner) { int i, err; @@ -255,6 +255,7 @@ int mdiobus_register(struct mii_bus *bus) BUG_ON(bus->state != MDIOBUS_ALLOCATED && bus->state != MDIOBUS_UNREGISTERED); + bus->owner = owner; bus->dev.parent = bus->parent; bus->dev.class = &mdio_bus_class; bus->dev.groups = NULL; @@ -296,7 +297,7 @@ error: device_del(&bus->dev); return err; } -EXPORT_SYMBOL(mdiobus_register); +EXPORT_SYMBOL(__mdiobus_register); void mdiobus_unregister(struct mii_bus *bus) { diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index c0f211127274..03adf328f49b 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -582,10 +582,15 @@ EXPORT_SYMBOL(phy_init_hw); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface) { + struct mii_bus *bus = phydev->bus; struct device *d = &phydev->dev; - struct module *bus_module; int err; + if (!try_module_get(bus->owner)) { + dev_err(&dev->dev, "failed to get the bus module\n"); + return -EIO; + } + /* Assume that if there is no driver, that it doesn't * exist, and we should use the genphy driver. */ @@ -600,20 +605,13 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, err = device_bind_driver(d); if (err) - return err; + goto error; } if (phydev->attached_dev) { dev_err(&dev->dev, "PHY already attached\n"); - return -EBUSY; - } - - /* Increment the bus module reference count */ - bus_module = phydev->bus->dev.driver ? - phydev->bus->dev.driver->owner : NULL; - if (!try_module_get(bus_module)) { - dev_err(&dev->dev, "failed to get the bus module\n"); - return -EIO; + err = -EBUSY; + goto error; } phydev->attached_dev = dev; @@ -636,6 +634,10 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, phy_resume(phydev); return err; + +error: + module_put(bus->owner); + return err; } EXPORT_SYMBOL(phy_attach_direct); @@ -680,11 +682,9 @@ EXPORT_SYMBOL(phy_attach); */ void phy_detach(struct phy_device *phydev) { + struct mii_bus *bus; int i; - if (phydev->bus->dev.driver) - module_put(phydev->bus->dev.driver->owner); - phydev->attached_dev->phydev = NULL; phydev->attached_dev = NULL; phy_suspend(phydev); @@ -700,6 +700,10 @@ void phy_detach(struct phy_device *phydev) break; } } + + bus = phydev->bus; + + module_put(bus->owner); } EXPORT_SYMBOL(phy_detach); diff --git a/include/linux/phy.h b/include/linux/phy.h index 962387a192f1..11bce44f6d65 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -153,6 +154,7 @@ struct sk_buff; * PHYs should register using this structure */ struct mii_bus { + struct module *owner; const char *name; char id[MII_BUS_ID_SIZE]; void *priv; @@ -198,7 +200,8 @@ static inline struct mii_bus *mdiobus_alloc(void) return mdiobus_alloc_size(0); } -int mdiobus_register(struct mii_bus *bus); +int __mdiobus_register(struct mii_bus *bus, struct module *owner); +#define mdiobus_register(bus) __mdiobus_register(bus, THIS_MODULE) void mdiobus_unregister(struct mii_bus *bus); void mdiobus_free(struct mii_bus *bus); struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv); -- cgit v1.2.3 From 38737e490d4ea91660d3cec83ef88c4e6d360ae4 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:28 +0100 Subject: phy: add phy_device_remove() Add a phy_device_remove() function to complement 
phy_device_register(), which undoes the effects of phy_device_register() by removing the phy device from visibility, but not freeing it. This allows these details to be moved out of the mdio bus code into the phy code where this action belongs. Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/gianfar.c | 5 +++-- drivers/net/phy/mdio_bus.c | 15 ++++++++++----- drivers/net/phy/phy_device.c | 18 ++++++++++++++++++ include/linux/phy.h | 1 + 4 files changed, 32 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index a5cf4332d307..710715fcb23d 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1702,7 +1702,6 @@ static void gfar_configure_serdes(struct net_device *dev) tbiphy = of_phy_find_device(priv->tbi_node); if (!tbiphy) { dev_err(&dev->dev, "error: Could not get TBI device\n"); - put_device(&tbiphy->dev); return; } @@ -1711,8 +1710,10 @@ static void gfar_configure_serdes(struct net_device *dev) * everything for us? Resetting it takes the link down and requires * several seconds for it to come back. */ - if (phy_read(tbiphy, MII_BMSR) & BMSR_LSTATUS) + if (phy_read(tbiphy, MII_BMSR) & BMSR_LSTATUS) { + put_device(&tbiphy->dev); return; + } /* Single clk mode, mii mode off(for serdes communication) */ phy_write(tbiphy, MII_TBICON, TBICON_CLK_SELECT); diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 992406624b7c..c340e412b38f 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -291,8 +291,11 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) error: while (--i >= 0) { - if (bus->phy_map[i]) - device_unregister(&bus->phy_map[i]->dev); + struct phy_device *phydev = bus->phy_map[i]; + if (phydev) { + phy_device_remove(phydev); + phy_device_free(phydev); + } } device_del(&bus->dev); return err; @@ -307,9 +310,11 @@ void mdiobus_unregister(struct mii_bus *bus) bus->state = MDIOBUS_UNREGISTERED; for (i = 0; i < PHY_MAX_ADDR; i++) { - if (bus->phy_map[i]) - device_unregister(&bus->phy_map[i]->dev); - bus->phy_map[i] = NULL; + struct phy_device *phydev = bus->phy_map[i]; + if (phydev) { + phy_device_remove(phydev); + phy_device_free(phydev); + } } device_del(&bus->dev); } diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 97a4f52addac..f761288abe66 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -383,6 +383,24 @@ int phy_device_register(struct phy_device *phydev) } EXPORT_SYMBOL(phy_device_register); +/** + * phy_device_remove - Remove a previously registered phy device from the MDIO bus + * @phydev: phy_device structure to remove + * + * This doesn't free the phy_device itself, it merely reverses the effects + * of phy_device_register(). Use phy_device_free() to free the device + * after calling this function. 
+ */ +void phy_device_remove(struct phy_device *phydev) +{ + struct mii_bus *bus = phydev->bus; + int addr = phydev->addr; + + device_del(&phydev->dev); + bus->phy_map[addr] = NULL; +} +EXPORT_SYMBOL(phy_device_remove); + /** * phy_find_first - finds the first PHY device on the bus * @bus: the target MII bus diff --git a/include/linux/phy.h b/include/linux/phy.h index 11bce44f6d65..4a4e3a092337 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -745,6 +745,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, struct phy_c45_device_ids *c45_ids); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); +void phy_device_remove(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); -- cgit v1.2.3 From c6790aa9f4fdc26b1246ba36da2fd749663beb65 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 24 Sep 2015 10:34:23 +0300 Subject: IB/mlx5: Remove support for IB_DEVICE_LOCAL_DMA_LKEY Commit 96249d70dd70 ("IB/core: Guarantee that a local_dma_lkey is available") allows ULPs that make use of the local dma key to keep working as before by allocating a DMA MR with local permissions and converted these consumers to use the MR associated with the PD rather than device->local_dma_lkey. ConnectIB has some known issues with memory registration using the local_dma_lkey (SEND, RDMA, RECV seem to work ok). Thus don't expose support for it (remove device->local_dma_lkey setting), and take advantage of the above commit such that no regression is introduced to working systems. The local_dma_lkey support will be restored in CX4 depending on FW capability query. 
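For context, a ULP that previously read device->local_dma_lkey can use the lkey reserved by ib_alloc_pd() instead; a minimal hypothetical snippet ('dma_addr' and 'len' are assumed to describe an already DMA-mapped buffer):

        struct ib_sge sge = {
                .addr   = dma_addr,
                .length = len,
                .lkey   = pd->local_dma_lkey,   /* per-PD lkey, not device->local_dma_lkey */
        };
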
Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 10 +--------- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 22 ---------------------- include/linux/mlx5/device.h | 11 ----------- include/linux/mlx5/driver.h | 1 - 4 files changed, 1 insertion(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 41d6911e244e..0ab9625911a1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -245,7 +245,6 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (MLX5_CAP_GEN(mdev, apm)) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; - props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; if (MLX5_CAP_GEN(mdev, xrc)) props->device_cap_flags |= IB_DEVICE_XRC; props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; @@ -1245,18 +1244,10 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; - u32 rsvd_lkey; int ret = 0; dev = container_of(devr, struct mlx5_ib_dev, devr); - ret = mlx5_core_query_special_context(dev->mdev, &rsvd_lkey); - if (ret) { - pr_err("Failed to query special context %d\n", ret); - return ret; - } - dev->ib_dev.local_dma_lkey = rsvd_lkey; - devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->p0)) { ret = PTR_ERR(devr->p0); @@ -1418,6 +1409,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index aa0d5ffe92d8..9335e5ae18cc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -200,25 +200,3 @@ int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev) return err; } - -int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey) -{ - struct mlx5_cmd_query_special_contexts_mbox_in in; - struct mlx5_cmd_query_special_contexts_mbox_out out; - int err; - - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) - return err; - - if (out.hdr.status) - err = mlx5_cmd_status_to_err(&out.hdr); - - *rsvd_lkey = be32_to_cpu(out.resd_lkey); - - return err; -} -EXPORT_SYMBOL(mlx5_core_query_special_context); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 8eb3b19af2a4..250b1ff8b48d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -402,17 +402,6 @@ struct mlx5_cmd_teardown_hca_mbox_out { u8 rsvd[8]; }; -struct mlx5_cmd_query_special_contexts_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_cmd_query_special_contexts_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 dump_fill_mkey; - __be32 resd_lkey; -}; - struct mlx5_cmd_layout { u8 type; u8 rsvd0[3]; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 27b53f9a24ad..8b6d6f2154a4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ 
-845,7 +845,6 @@ void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol); int mlx5_register_interface(struct mlx5_interface *intf); void mlx5_unregister_interface(struct mlx5_interface *intf); int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); -int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey); struct mlx5_profile { u64 mask; -- cgit v1.2.3 From 5ebc76035303016ec41bb752bec156ea9fde7c34 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 17 Sep 2015 14:02:45 +0800 Subject: ACPI, PCI, irq: Do not share PCI IRQ with ISA IRQ Avoid IRQs occupied by ISA IRQs when allocating IRQs for PCI link devices; otherwise it may cause an interrupt storm due to incompatible pin attributes. This issue was triggered on a KVM virtual machine, which 1) uses IRQ9 for SCI in high level mode. 2) defines a PCI interrupt link device (LNKS) with IRQ9 as the only possible IRQ. 3) has a PCI device referring to link device LNKS. So it causes an interrupt storm when enabling the PCI device, because the PCI IRQ works in low level mode. Signed-off-by: Jiang Liu Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- drivers/acpi/pci_irq.c | 1 + drivers/acpi/pci_link.c | 13 +++++++++++++ include/linux/acpi.h | 1 + 3 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index 6da0f9beab19..c9336751e5e3 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -372,6 +372,7 @@ static int acpi_isa_register_gsi(struct pci_dev *dev) /* Interrupt Line values above 0xF are forbidden */ if (dev->irq > 0 && (dev->irq <= 0xF) && + acpi_isa_irq_available(dev->irq) && (acpi_isa_irq_to_gsi(dev->irq, &dev_gsi) == 0)) { dev_warn(&dev->dev, "PCI INT %c: no GSI - using ISA IRQ %d\n", pin_name(dev->pin), dev->irq); diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 3b4ea98e3ea0..246e50d22120 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -553,6 +553,13 @@ static int acpi_pci_link_allocate(struct acpi_pci_link *link) irq = link->irq.possible[i]; } } + if (acpi_irq_penalty[irq] >= PIRQ_PENALTY_ISA_ALWAYS) { + printk(KERN_ERR PREFIX "No IRQ available for %s [%s]. " + "Try pci=noacpi or acpi=off\n", + acpi_device_name(link->device), + acpi_device_bid(link->device)); + return -ENODEV; + } /* Attempt to enable the link device at this IRQ. */ if (acpi_pci_link_set(link, irq)) { @@ -821,6 +828,12 @@ void acpi_penalize_isa_irq(int irq, int active) } } +bool acpi_isa_irq_available(int irq) +{ + return irq >= 0 && (irq >= ARRAY_SIZE(acpi_irq_penalty) || + acpi_irq_penalty[irq] < PIRQ_PENALTY_ISA_ALWAYS); +} + /* * Penalize IRQ used by ACPI SCI. 
If ACPI SCI pin attributes conflict with * PCI IRQ attributes, mark ACPI SCI as ISA_ALWAYS so it won't be use for diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 7235c4851460..43856d19cf4d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -217,6 +217,7 @@ struct pci_dev; int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); +bool acpi_isa_irq_available(int irq); void acpi_penalize_sci_irq(int irq, int trigger, int polarity); void acpi_pci_irq_disable (struct pci_dev *dev); -- cgit v1.2.3 From 4593fdbe7a2f44d5e64c627c715dd0bcec9bdf14 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 27 Sep 2015 02:09:20 +0900 Subject: blk-mq: fix sysfs registration/unregistration race There is a race between CPU hotplug handling and adding/deleting a gendisk for blk-mq, where both are trying to register and unregister the same sysfs entries. null_add_dev --> blk_mq_init_queue --> blk_mq_init_allocated_queue --> add to 'all_q_list' (*) --> add_disk --> blk_register_queue --> blk_mq_register_disk (++) null_del_dev --> del_gendisk --> blk_unregister_queue --> blk_mq_unregister_disk (--) --> blk_cleanup_queue --> blk_mq_free_queue --> del from 'all_q_list' (*) blk_mq_queue_reinit --> blk_mq_sysfs_unregister (-) --> blk_mq_sysfs_register (+) While the request queue is on 'all_q_list' (*), blk_mq_queue_reinit() can be called for the queue at any time by the CPU hotplug callback. But blk_mq_sysfs_unregister (-) and blk_mq_sysfs_register (+) in blk_mq_queue_reinit must not be called before blk_mq_register_disk (++) has run or after blk_mq_unregister_disk (--) has finished, because '/sys/block/*/mq/' does not exist then. There is already a BLK_MQ_F_SYSFS_UP flag in hctx->flags that can be used to track this sysfs state, but it only fixes the issue partially. To fix it completely, we need a per-queue flag instead of a per-hctx flag, with appropriate locking. So this introduces q->mq_sysfs_init_done, which is properly protected by all_q_mutex. Also, we need to ensure that blk_mq_map_swqueue() is called with all_q_mutex held. Since hctx->nr_ctx is temporarily reset and then updated in blk_mq_map_swqueue(), we should avoid blk_mq_register_hctx() seeing the temporary hctx->nr_ctx value during CPU hotplug handling or when adding/deleting a gendisk. 
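A minimal sketch of the locking pattern this relies on, with hypothetical names (the real code uses all_q_mutex, taken via blk_mq_disable_hotplug(), and q->mq_sysfs_init_done, as the diff below shows): the ready flag is only published under the same lock that the hotplug-driven reinit path takes, so reinit can never see half-registered sysfs entries.

#include <linux/mutex.h>

static DEFINE_MUTEX(shared_lock);		/* stands in for all_q_mutex */

struct foo_queue {
	bool sysfs_ready;			/* stands in for mq_sysfs_init_done */
};

static void foo_register_sysfs(struct foo_queue *q)
{
	mutex_lock(&shared_lock);
	/* ... create the sysfs entries ... */
	q->sysfs_ready = true;			/* publish only once fully registered */
	mutex_unlock(&shared_lock);
}

static void foo_reinit(struct foo_queue *q)	/* async path, e.g. CPU hotplug */
{
	mutex_lock(&shared_lock);
	if (q->sysfs_ready) {
		/* safe: the entries are known to exist here */
	}
	mutex_unlock(&shared_lock);
}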
Signed-off-by: Akinobu Mita Reviewed-by: Ming Lei Cc: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 30 ++++++++++++++++++++++-------- block/blk-mq.c | 6 +++--- include/linux/blk-mq.h | 1 - include/linux/blkdev.h | 2 ++ 4 files changed, 27 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 279c5d674edf..189f5ae6522a 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -343,7 +343,7 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) struct blk_mq_ctx *ctx; int i; - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + if (!hctx->nr_ctx) return; hctx_for_each_ctx(hctx, ctx, i) @@ -358,7 +358,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) struct blk_mq_ctx *ctx; int i, ret; - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + if (!hctx->nr_ctx) return 0; ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); @@ -381,6 +381,8 @@ void blk_mq_unregister_disk(struct gendisk *disk) struct blk_mq_ctx *ctx; int i, j; + blk_mq_disable_hotplug(); + queue_for_each_hw_ctx(q, hctx, i) { blk_mq_unregister_hctx(hctx); @@ -395,6 +397,9 @@ void blk_mq_unregister_disk(struct gendisk *disk) kobject_put(&q->mq_kobj); kobject_put(&disk_to_dev(disk)->kobj); + + q->mq_sysfs_init_done = false; + blk_mq_enable_hotplug(); } static void blk_mq_sysfs_init(struct request_queue *q) @@ -425,27 +430,30 @@ int blk_mq_register_disk(struct gendisk *disk) struct blk_mq_hw_ctx *hctx; int ret, i; + blk_mq_disable_hotplug(); + blk_mq_sysfs_init(q); ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) - return ret; + goto out; kobject_uevent(&q->mq_kobj, KOBJ_ADD); queue_for_each_hw_ctx(q, hctx, i) { - hctx->flags |= BLK_MQ_F_SYSFS_UP; ret = blk_mq_register_hctx(hctx); if (ret) break; } - if (ret) { + if (ret) blk_mq_unregister_disk(disk); - return ret; - } + else + q->mq_sysfs_init_done = true; +out: + blk_mq_enable_hotplug(); - return 0; + return ret; } EXPORT_SYMBOL_GPL(blk_mq_register_disk); @@ -454,6 +462,9 @@ void blk_mq_sysfs_unregister(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; + if (!q->mq_sysfs_init_done) + return; + queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); } @@ -463,6 +474,9 @@ int blk_mq_sysfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i, ret = 0; + if (!q->mq_sysfs_init_done) + return ret; + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) diff --git a/block/blk-mq.c b/block/blk-mq.c index 2fd7283ec62b..0262131ac5f2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2035,13 +2035,13 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, goto err_hctxs; mutex_lock(&all_q_mutex); - list_add_tail(&q->all_q_node, &all_q_list); - mutex_unlock(&all_q_mutex); + list_add_tail(&q->all_q_node, &all_q_list); blk_mq_add_queue_tag_set(set, q); - blk_mq_map_swqueue(q); + mutex_unlock(&all_q_mutex); + return q; err_hctxs: diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 37d1602c4f7a..b80ba4572a31 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -145,7 +145,6 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, - BLK_MQ_F_SYSFS_UP = 1 << 3, BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 
99da9ebc7377..19c2e947d4d1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -456,6 +456,8 @@ struct request_queue { struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct bio_set *bio_split; + + bool mq_sysfs_init_done; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ -- cgit v1.2.3 From 31b33dfb0a144469dd805514c9e63f4993729a48 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 28 Sep 2015 17:24:25 -0700 Subject: skbuff: Fix skb checksum partial check. An earlier patch, 6ae459bda, tried to detect a void checksum-partial skb by comparing the pull length to the checksum offset. But it does not work for all cases, since the checksum offset depends on updates to skb->data. The following patch fixes it by validating the checksum start offset after the skb->data pointer is updated. A negative checksum start offset means there is no need to checksum. Fixes: 6ae459bda ("skbuff: Fix skb checksum flag on skb pull") Reported-by: Andrew Vagin Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2b0a30a6e31c..4398411236f1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2708,7 +2708,7 @@ static inline void skb_postpull_rcsum(struct sk_buff *skb, if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0)); else if (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_start_offset(skb) <= len) + skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index dad4dd37e2aa..fab4599ba8b2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2958,11 +2958,12 @@ EXPORT_SYMBOL_GPL(skb_append_pagefrags); */ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) { + unsigned char *data = skb->data; + BUG_ON(len > skb->len); - skb->len -= len; - BUG_ON(skb->len < skb->data_len); - skb_postpull_rcsum(skb, skb->data, len); - return skb->data += len; + __skb_pull(skb, len); + skb_postpull_rcsum(skb, data, len); + return skb->data; } EXPORT_SYMBOL_GPL(skb_pull_rcsum); -- cgit v1.2.3 From f4829a9b7a61e159367350008a608b062c4f6840 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 27 Sep 2015 21:01:50 +0200 Subject: blk-mq: fix racy updates of rq->errors blk_mq_complete_request may be a no-op if the request has already been completed by other means (e.g. a timeout or cancellation), but currently drivers have to set rq->errors before calling blk_mq_complete_request, which might leave us with the wrong error value. Add an error parameter to blk_mq_complete_request so that we can defer setting rq->errors until we know we won the race to complete the request. 
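For illustration, a blk-mq driver's completion path after this interface change might look like the sketch below (the driver name is hypothetical; the real conversions follow in the diff). The error is handed to the block layer, which only assigns rq->errors if this call wins the completion race:

#include <linux/blk-mq.h>

static void mydrv_end_request(struct request *rq, int status)
{
	/* Before this patch: rq->errors = status ? -EIO : 0; blk_mq_complete_request(rq); */
	blk_mq_complete_request(rq, status ? -EIO : 0);
}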
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++------ drivers/block/loop.c | 11 +++++------ drivers/block/null_blk.c | 2 +- drivers/block/nvme-core.c | 16 +++++++--------- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 19 ++++++++++--------- drivers/scsi/scsi_lib.c | 2 +- include/linux/blk-mq.h | 2 +- 8 files changed, 32 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 31c0c6259c4c..2306330530e8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -393,14 +393,16 @@ void __blk_mq_complete_request(struct request *rq) * Ends all I/O on a request. It does not handle partial completions. * The actual completion happens out-of-order, through a IPI handler. **/ -void blk_mq_complete_request(struct request *rq) +void blk_mq_complete_request(struct request *rq, int error) { struct request_queue *q = rq->q; if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) + if (!blk_mark_rq_complete(rq)) { + rq->errors = error; __blk_mq_complete_request(rq); + } } EXPORT_SYMBOL(blk_mq_complete_request); @@ -616,10 +618,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * If a request wasn't started before the queue was * marked dying, kill it here or it'll go unnoticed. */ - if (unlikely(blk_queue_dying(rq->q))) { - rq->errors = -EIO; - blk_mq_complete_request(rq); - } + if (unlikely(blk_queue_dying(rq->q))) + blk_mq_complete_request(rq, -EIO); return; } if (rq->cmd_flags & REQ_NO_TIMEOUT) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f9889b6bc02c..674f800a3b57 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1486,17 +1486,16 @@ static void loop_handle_cmd(struct loop_cmd *cmd) { const bool write = cmd->rq->cmd_flags & REQ_WRITE; struct loop_device *lo = cmd->rq->q->queuedata; - int ret = -EIO; + int ret = 0; - if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) + if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) { + ret = -EIO; goto failed; + } ret = do_req_filebacked(lo, cmd->rq); - failed: - if (ret) - cmd->rq->errors = -EIO; - blk_mq_complete_request(cmd->rq); + blk_mq_complete_request(cmd->rq, ret ? 
-EIO : 0); } static void loop_queue_write_work(struct work_struct *work) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index a295b98c6bae..1c9e4fe5aa44 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -289,7 +289,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd) case NULL_IRQ_SOFTIRQ: switch (queue_mode) { case NULL_Q_MQ: - blk_mq_complete_request(cmd->rq); + blk_mq_complete_request(cmd->rq, cmd->rq->errors); break; case NULL_Q_RQ: blk_complete_request(cmd->rq); diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 30758bdf69ea..6f04771f1019 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -618,16 +618,15 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, spin_unlock_irqrestore(req->q->queue_lock, flags); return; } + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { if (cmd_rq->ctx == CMD_CTX_CANCELLED) - req->errors = -EINTR; - else - req->errors = status; + status = -EINTR; } else { - req->errors = nvme_error_status(status); + status = nvme_error_status(status); } - } else - req->errors = 0; + } + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { u32 result = le32_to_cpup(&cqe->result); req->special = (void *)(uintptr_t)result; @@ -650,7 +649,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, } nvme_free_iod(nvmeq->dev, iod); - blk_mq_complete_request(req); + blk_mq_complete_request(req, status); } /* length is in bytes. gfp flags indicates whether we may sleep. */ @@ -863,8 +862,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (ns && ns->ms && !blk_integrity_rq(req)) { if (!(ns->pi_type && ns->ms == 8) && req->cmd_type != REQ_TYPE_DRV_PRIV) { - req->errors = -EFAULT; - blk_mq_complete_request(req); + blk_mq_complete_request(req, -EFAULT); return BLK_MQ_RQ_QUEUE_OK; } } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index e93899cc6f60..6ca35495a5be 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -144,7 +144,7 @@ static void virtblk_done(struct virtqueue *vq) do { virtqueue_disable_cb(vq); while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { - blk_mq_complete_request(vbr->req); + blk_mq_complete_request(vbr->req, vbr->req->errors); req_done = true; } if (unlikely(virtqueue_is_broken(vq))) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 0823a96902f8..611170896b8c 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1142,6 +1142,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) RING_IDX i, rp; unsigned long flags; struct blkfront_info *info = (struct blkfront_info *)dev_id; + int error; spin_lock_irqsave(&info->io_lock, flags); @@ -1182,37 +1183,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) continue; } - req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; + error = (bret->status == BLKIF_RSP_OKAY) ? 
0 : -EIO; switch (bret->operation) { case BLKIF_OP_DISCARD: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq; printk(KERN_WARNING "blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - req->errors = -EOPNOTSUPP; + error = -EOPNOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; queue_flag_clear(QUEUE_FLAG_DISCARD, rq); queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); } - blk_mq_complete_request(req); + blk_mq_complete_request(req, error); break; case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { printk(KERN_WARNING "blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - req->errors = -EOPNOTSUPP; + error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && info->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - req->errors = -EOPNOTSUPP; + error = -EOPNOTSUPP; } - if (unlikely(req->errors)) { - if (req->errors == -EOPNOTSUPP) - req->errors = 0; + if (unlikely(error)) { + if (error == -EOPNOTSUPP) + error = 0; info->feature_flush = 0; xlvbd_flush(info); } @@ -1223,7 +1224,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " "request: %x\n", bret->status); - blk_mq_complete_request(req); + blk_mq_complete_request(req, error); break; default: BUG(); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index cbfc5990052b..126a48c6431e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1957,7 +1957,7 @@ static int scsi_mq_prep_fn(struct request *req) static void scsi_mq_done(struct scsi_cmnd *cmd) { trace_scsi_dispatch_cmd_done(cmd); - blk_mq_complete_request(cmd->request); + blk_mq_complete_request(cmd->request, cmd->request->errors); } static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b80ba4572a31..c1b5c867ff07 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -214,7 +214,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); void blk_mq_cancel_requeue_work(struct request_queue *q); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_abort_requeue_list(struct request_queue *q); -void blk_mq_complete_request(struct request *rq); +void blk_mq_complete_request(struct request *rq, int error); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); -- cgit v1.2.3 From 0bf6cd5b9531bcc29c0a5e504b6ce2984c6fd8d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 27 Sep 2015 21:01:51 +0200 Subject: blk-mq: factor out a helper to iterate all tags for a request_queue And replace the blk_mq_tag_busy_iter with it - the driver use has been replaced with a new helper a while ago, and internal to the block we only need the new version. 
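As a hedged sketch of how the new internal helper is used (the callback and counter are hypothetical; busy_iter_fn keeps the signature already used by blk_mq_check_expired() in this series), iterating every tag-holding request across all mapped hardware contexts of a queue looks roughly like:

static void count_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
			   void *priv, bool reserved)
{
	unsigned int *nr = priv;

	(*nr)++;
}

static unsigned int queue_inflight_count(struct request_queue *q)
{
	unsigned int nr = 0;

	/* Visits both reserved and normal tags on every mapped hctx. */
	blk_mq_queue_tag_busy_iter(q, count_inflight, &nr);
	return nr;
}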
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 27 ++++++++++++++++++++------- block/blk-mq-tag.h | 2 ++ block/blk-mq.c | 14 +++----------- include/linux/blk-mq.h | 2 -- 4 files changed, 25 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 9115c6d59948..ed96474d75cb 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -471,17 +471,30 @@ void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, } EXPORT_SYMBOL(blk_mq_all_tag_busy_iter); -void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { - struct blk_mq_tags *tags = hctx->tags; + struct blk_mq_hw_ctx *hctx; + int i; + + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + + /* + * If not software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + false); + } - if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); } -EXPORT_SYMBOL(blk_mq_tag_busy_iter); static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) { diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 9eb2cf4f01cb..d468a79f2c4a 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -58,6 +58,8 @@ extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, + void *priv); enum { BLK_MQ_TAG_CACHE_MIN = 1, diff --git a/block/blk-mq.c b/block/blk-mq.c index 2306330530e8..7785ae96267a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -641,24 +641,16 @@ static void blk_mq_rq_timer(unsigned long priv) .next = 0, .next_set = 0, }; - struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - /* - * If not software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; - - blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); - } + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { + struct blk_mq_hw_ctx *hctx; + queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c1b5c867ff07..5e7d43ab61c0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -223,8 +223,6 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, - void *priv); void 
blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); void blk_mq_freeze_queue(struct request_queue *q); -- cgit v1.2.3 From 0610c25daa3e76e38ad5a8fae683a89ff9f71798 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 1 Oct 2015 15:37:02 -0700 Subject: memcg: fix dirty page migration The problem starts with a file backed dirty page which is charged to a memcg. Then page migration is used to move oldpage to newpage. Migration: - copies the oldpage's data to newpage - clears oldpage.PG_dirty - sets newpage.PG_dirty - uncharges oldpage from memcg - charges newpage to memcg Clearing oldpage.PG_dirty decrements the charged memcg's dirty page count. However, because newpage is not yet charged, setting newpage.PG_dirty does not increment the memcg's dirty page count. After migration completes newpage.PG_dirty is eventually cleared, often in account_page_cleaned(). At this time newpage is charged to a memcg so the memcg's dirty page count is decremented which causes underflow because the count was not previously incremented by migration. This underflow causes balance_dirty_pages() to see a very large unsigned number of dirty memcg pages which leads to aggressive throttling of buffered writes by processes in non root memcg. This issue: - can harm performance of non root memcg buffered writes. - can report too small (even negative) values in memory.stat[(total_)dirty] counters of all memcg, including the root. To avoid polluting migrate.c with #ifdef CONFIG_MEMCG checks, introduce page_memcg() and set_page_memcg() helpers. Test: 0) setup and enter limited memcg mkdir /sys/fs/cgroup/test echo 1G > /sys/fs/cgroup/test/memory.limit_in_bytes echo $$ > /sys/fs/cgroup/test/cgroup.procs 1) buffered writes baseline dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k sync grep ^dirty /sys/fs/cgroup/test/memory.stat 2) buffered writes with compaction antagonist to induce migration yes 1 > /proc/sys/vm/compact_memory & rm -rf /data/tmp/foo dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k kill % sync grep ^dirty /sys/fs/cgroup/test/memory.stat 3) buffered writes without antagonist, should match baseline rm -rf /data/tmp/foo dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k sync grep ^dirty /sys/fs/cgroup/test/memory.stat (speed, dirty residue) unpatched patched 1) 841 MB/s 0 dirty pages 886 MB/s 0 dirty pages 2) 611 MB/s -33427456 dirty pages 793 MB/s 0 dirty pages 3) 114 MB/s -33427456 dirty pages 891 MB/s 0 dirty pages Notice that unpatched baseline performance (1) fell after migration (3): 841 -> 114 MB/s. In the patched kernel, post migration performance matches baseline. 
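Condensed from the move_to_new_page() change in the diff below (the helper name here is hypothetical), the ordering the fix relies on is simply:

static void transfer_memcg_dirty(struct page *oldpage, struct page *newpage)
{
	/*
	 * 1. Charge newpage first, so the PG_dirty transfer during
	 *    migrate_page_copy() increments the dirty count of the
	 *    memcg that will later be decremented.
	 */
	set_page_memcg(newpage, page_memcg(oldpage));

	/* ... migrate_page_copy(newpage, oldpage) moves data and PG_dirty ... */

	/* 2. Only afterwards drop the association from oldpage. */
	set_page_memcg(oldpage, NULL);
}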
Fixes: c4843a7593a9 ("memcg: add per cgroup dirty page accounting") Signed-off-by: Greg Thelen Reported-by: Dave Hansen Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 21 +++++++++++++++++++++ mm/migrate.c | 12 +++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 91c08f6f0dc9..80001de019ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -905,6 +905,27 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return page->mem_cgroup; +} + +static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg) +{ + page->mem_cgroup = memcg; +} +#else +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return NULL; +} + +static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg) +{ +} +#endif + /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/mm/migrate.c b/mm/migrate.c index 7452a00bbb50..842ecd7aaf7f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -740,6 +740,15 @@ static int move_to_new_page(struct page *newpage, struct page *page, if (PageSwapBacked(page)) SetPageSwapBacked(newpage); + /* + * Indirectly called below, migrate_page_copy() copies PG_dirty and thus + * needs newpage's memcg set to transfer memcg dirty page accounting. + * So perform memcg migration in two steps: + * 1. set newpage->mem_cgroup (here) + * 2. clear page->mem_cgroup (below) + */ + set_page_memcg(newpage, page_memcg(page)); + mapping = page_mapping(page); if (!mapping) rc = migrate_page(mapping, newpage, page, mode); @@ -756,9 +765,10 @@ static int move_to_new_page(struct page *newpage, struct page *page, rc = fallback_migrate_page(mapping, newpage, page, mode); if (rc != MIGRATEPAGE_SUCCESS) { + set_page_memcg(newpage, NULL); newpage->mapping = NULL; } else { - mem_cgroup_migrate(page, newpage, false); + set_page_memcg(page, NULL); if (page_was_mapped) remove_migration_ptes(page, newpage); page->mapping = NULL; -- cgit v1.2.3 From ef510194cefe0cd369ef73419cd65b0a5bb4fb5b Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 1 Oct 2015 15:37:13 -0700 Subject: memcg: remove pcp_counter_lock Commit 733a572e66d2 ("memcg: make mem_cgroup_read_{stat|event}() iterate possible cpus instead of online") removed the last use of the per memcg pcp_counter_lock but forgot to remove the variable. Kill the vestigial variable. Signed-off-by: Greg Thelen Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 - mm/memcontrol.c | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ad800e62cb7a..6452ff4c463f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -242,7 +242,6 @@ struct mem_cgroup { * percpu counter. 
*/ struct mem_cgroup_stat_cpu __percpu *stat; - spinlock_t pcp_counter_lock; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 03cc0a742ff1..1fedbde68f59 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4185,7 +4185,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg_wb_domain_init(memcg, GFP_KERNEL)) goto out_free_stat; - spin_lock_init(&memcg->pcp_counter_lock); return memcg; out_free_stat: -- cgit v1.2.3 From 7e50711993552800644a4667daa0f569a7665eca Mon Sep 17 00:00:00 2001 From: Grigoryev Denis Date: Fri, 2 Oct 2015 16:14:41 +0000 Subject: mfd: tps6105x: Use i2c regmap to access registers This patch modifies tps6105x and associated function driver to use regmap instead of operating directly on i2c. Signed-off-by: Denis Grigoryev Signed-off-by: Lee Jones --- drivers/mfd/tps6105x.c | 78 ++++++---------------------------- drivers/regulator/tps6105x-regulator.c | 16 +++---- include/linux/mfd/tps6105x.h | 10 ++--- 3 files changed, 24 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/tps6105x.c b/drivers/mfd/tps6105x.c index 5de95c265c1a..4a174cdb50b6 100644 --- a/drivers/mfd/tps6105x.c +++ b/drivers/mfd/tps6105x.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include @@ -25,73 +25,18 @@ #include #include -int tps6105x_set(struct tps6105x *tps6105x, u8 reg, u8 value) -{ - int ret; - - ret = mutex_lock_interruptible(&tps6105x->lock); - if (ret) - return ret; - ret = i2c_smbus_write_byte_data(tps6105x->client, reg, value); - mutex_unlock(&tps6105x->lock); - if (ret < 0) - return ret; - - return 0; -} -EXPORT_SYMBOL(tps6105x_set); - -int tps6105x_get(struct tps6105x *tps6105x, u8 reg, u8 *buf) -{ - int ret; - - ret = mutex_lock_interruptible(&tps6105x->lock); - if (ret) - return ret; - ret = i2c_smbus_read_byte_data(tps6105x->client, reg); - mutex_unlock(&tps6105x->lock); - if (ret < 0) - return ret; - - *buf = ret; - return 0; -} -EXPORT_SYMBOL(tps6105x_get); - -/* - * Masks off the bits in the mask and sets the bits in the bitvalues - * parameter in one atomic operation - */ -int tps6105x_mask_and_set(struct tps6105x *tps6105x, u8 reg, - u8 bitmask, u8 bitvalues) -{ - int ret; - u8 regval; - - ret = mutex_lock_interruptible(&tps6105x->lock); - if (ret) - return ret; - ret = i2c_smbus_read_byte_data(tps6105x->client, reg); - if (ret < 0) - goto fail; - regval = ret; - regval = (~bitmask & regval) | (bitmask & bitvalues); - ret = i2c_smbus_write_byte_data(tps6105x->client, reg, regval); -fail: - mutex_unlock(&tps6105x->lock); - if (ret < 0) - return ret; - - return 0; -} -EXPORT_SYMBOL(tps6105x_mask_and_set); +static struct regmap_config tps6105x_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = TPS6105X_REG_3, +}; static int tps6105x_startup(struct tps6105x *tps6105x) { int ret; - u8 regval; + unsigned int regval; - ret = tps6105x_get(tps6105x, TPS6105X_REG_0, ®val); + ret = regmap_read(tps6105x->regmap, TPS6105X_REG_0, ®val); if (ret) return ret; switch (regval >> TPS6105X_REG0_MODE_SHIFT) { @@ -145,11 +90,14 @@ static int tps6105x_probe(struct i2c_client *client, if (!tps6105x) return -ENOMEM; + tps6105x->regmap = devm_regmap_init_i2c(client, &tps6105x_regmap_config); + if (IS_ERR(tps6105x->regmap)) + return PTR_ERR(tps6105x->regmap); + i2c_set_clientdata(client, tps6105x); tps6105x->client = client; pdata = dev_get_platdata(&client->dev); tps6105x->pdata = pdata; - mutex_init(&tps6105x->lock); ret = 
tps6105x_startup(tps6105x); if (ret) { @@ -198,7 +146,7 @@ static int tps6105x_remove(struct i2c_client *client) mfd_remove_devices(&client->dev); /* Put chip in shutdown mode */ - tps6105x_mask_and_set(tps6105x, TPS6105X_REG_0, + regmap_update_bits(tps6105x->regmap, TPS6105X_REG_0, TPS6105X_REG0_MODE_MASK, TPS6105X_MODE_SHUTDOWN << TPS6105X_REG0_MODE_SHIFT); diff --git a/drivers/regulator/tps6105x-regulator.c b/drivers/regulator/tps6105x-regulator.c index 3510b3e7330a..ddc4f10e268a 100644 --- a/drivers/regulator/tps6105x-regulator.c +++ b/drivers/regulator/tps6105x-regulator.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -33,7 +33,7 @@ static int tps6105x_regulator_enable(struct regulator_dev *rdev) int ret; /* Activate voltage mode */ - ret = tps6105x_mask_and_set(tps6105x, TPS6105X_REG_0, + ret = regmap_update_bits(tps6105x->regmap, TPS6105X_REG_0, TPS6105X_REG0_MODE_MASK, TPS6105X_REG0_MODE_VOLTAGE << TPS6105X_REG0_MODE_SHIFT); if (ret) @@ -48,7 +48,7 @@ static int tps6105x_regulator_disable(struct regulator_dev *rdev) int ret; /* Set into shutdown mode */ - ret = tps6105x_mask_and_set(tps6105x, TPS6105X_REG_0, + ret = regmap_update_bits(tps6105x->regmap, TPS6105X_REG_0, TPS6105X_REG0_MODE_MASK, TPS6105X_REG0_MODE_SHUTDOWN << TPS6105X_REG0_MODE_SHIFT); if (ret) @@ -60,10 +60,10 @@ static int tps6105x_regulator_disable(struct regulator_dev *rdev) static int tps6105x_regulator_is_enabled(struct regulator_dev *rdev) { struct tps6105x *tps6105x = rdev_get_drvdata(rdev); - u8 regval; + unsigned int regval; int ret; - ret = tps6105x_get(tps6105x, TPS6105X_REG_0, ®val); + ret = regmap_read(tps6105x->regmap, TPS6105X_REG_0, ®val); if (ret) return ret; regval &= TPS6105X_REG0_MODE_MASK; @@ -78,10 +78,10 @@ static int tps6105x_regulator_is_enabled(struct regulator_dev *rdev) static int tps6105x_regulator_get_voltage_sel(struct regulator_dev *rdev) { struct tps6105x *tps6105x = rdev_get_drvdata(rdev); - u8 regval; + unsigned int regval; int ret; - ret = tps6105x_get(tps6105x, TPS6105X_REG_0, ®val); + ret = regmap_read(tps6105x->regmap, TPS6105X_REG_0, ®val); if (ret) return ret; @@ -96,7 +96,7 @@ static int tps6105x_regulator_set_voltage_sel(struct regulator_dev *rdev, struct tps6105x *tps6105x = rdev_get_drvdata(rdev); int ret; - ret = tps6105x_mask_and_set(tps6105x, TPS6105X_REG_0, + ret = regmap_update_bits(tps6105x->regmap, TPS6105X_REG_0, TPS6105X_REG0_VOLTAGE_MASK, selector << TPS6105X_REG0_VOLTAGE_SHIFT); if (ret) diff --git a/include/linux/mfd/tps6105x.h b/include/linux/mfd/tps6105x.h index 386743dd931c..8bc51180800a 100644 --- a/include/linux/mfd/tps6105x.h +++ b/include/linux/mfd/tps6105x.h @@ -10,6 +10,7 @@ #define MFD_TPS6105X_H #include +#include #include /* @@ -82,20 +83,15 @@ struct tps6105x_platform_data { /** * struct tps6105x - state holder for the TPS6105x drivers - * @mutex: mutex to serialize I2C accesses * @i2c_client: corresponding I2C client * @regulator: regulator device if used in voltage mode + * @regmap: used for i2c communcation on accessing registers */ struct tps6105x { struct tps6105x_platform_data *pdata; - struct mutex lock; struct i2c_client *client; struct regulator_dev *regulator; + struct regmap *regmap; }; -extern int tps6105x_set(struct tps6105x *tps6105x, u8 reg, u8 value); -extern int tps6105x_get(struct tps6105x *tps6105x, u8 reg, u8 *buf); -extern int tps6105x_mask_and_set(struct tps6105x *tps6105x, u8 reg, - u8 bitmask, u8 bitvalues); - #endif -- cgit v1.2.3 From 1ac710e08a86e4723286873db73edb2a6e99f591 
Mon Sep 17 00:00:00 2001 From: Adam Thomson Date: Wed, 7 Oct 2015 14:54:08 +0100 Subject: mfd: da9150: Add support for Fuel-Gauge Signed-off-by: Adam Thomson Signed-off-by: Lee Jones --- drivers/mfd/da9150-core.c | 156 ++++++++++++++++++++++++++++++++++++++-- include/linux/mfd/da9150/core.h | 19 ++++- 2 files changed, 167 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/da9150-core.c b/drivers/mfd/da9150-core.c index 94b9bbd1a69b..85ca4b5d0678 100644 --- a/drivers/mfd/da9150-core.c +++ b/drivers/mfd/da9150-core.c @@ -23,6 +23,77 @@ #include #include +/* Raw device access, used for QIF */ +static int da9150_i2c_read_device(struct i2c_client *client, u8 addr, int count, + u8 *buf) +{ + struct i2c_msg xfer; + int ret; + + /* + * Read is split into two transfers as device expects STOP/START rather + * than repeated start to carry out this kind of access. + */ + + /* Write address */ + xfer.addr = client->addr; + xfer.flags = 0; + xfer.len = 1; + xfer.buf = &addr; + + ret = i2c_transfer(client->adapter, &xfer, 1); + if (ret != 1) { + if (ret < 0) + return ret; + else + return -EIO; + } + + /* Read data */ + xfer.addr = client->addr; + xfer.flags = I2C_M_RD; + xfer.len = count; + xfer.buf = buf; + + ret = i2c_transfer(client->adapter, &xfer, 1); + if (ret == 1) + return 0; + else if (ret < 0) + return ret; + else + return -EIO; +} + +static int da9150_i2c_write_device(struct i2c_client *client, u8 addr, + int count, const u8 *buf) +{ + struct i2c_msg xfer; + u8 *reg_data; + int ret; + + reg_data = kzalloc(1 + count, GFP_KERNEL); + if (!reg_data) + return -ENOMEM; + + reg_data[0] = addr; + memcpy(®_data[1], buf, count); + + /* Write address & data */ + xfer.addr = client->addr; + xfer.flags = 0; + xfer.len = 1 + count; + xfer.buf = reg_data; + + ret = i2c_transfer(client->adapter, &xfer, 1); + kfree(reg_data); + if (ret == 1) + return 0; + else if (ret < 0) + return ret; + else + return -EIO; +} + static bool da9150_volatile_reg(struct device *dev, unsigned int reg) { switch (reg) { @@ -107,6 +178,28 @@ static const struct regmap_config da9150_regmap_config = { .volatile_reg = da9150_volatile_reg, }; +void da9150_read_qif(struct da9150 *da9150, u8 addr, int count, u8 *buf) +{ + int ret; + + ret = da9150_i2c_read_device(da9150->core_qif, addr, count, buf); + if (ret < 0) + dev_err(da9150->dev, "Failed to read from QIF 0x%x: %d\n", + addr, ret); +} +EXPORT_SYMBOL_GPL(da9150_read_qif); + +void da9150_write_qif(struct da9150 *da9150, u8 addr, int count, const u8 *buf) +{ + int ret; + + ret = da9150_i2c_write_device(da9150->core_qif, addr, count, buf); + if (ret < 0) + dev_err(da9150->dev, "Failed to write to QIF 0x%x: %d\n", + addr, ret); +} +EXPORT_SYMBOL_GPL(da9150_write_qif); + u8 da9150_reg_read(struct da9150 *da9150, u16 reg) { int val, ret; @@ -297,19 +390,35 @@ static struct resource da9150_charger_resources[] = { }, }; +static struct resource da9150_fg_resources[] = { + DEFINE_RES_IRQ_NAMED(DA9150_IRQ_FG, "FG"), +}; + +enum da9150_dev_idx { + DA9150_GPADC_IDX = 0, + DA9150_CHARGER_IDX, + DA9150_FG_IDX, +}; + static struct mfd_cell da9150_devs[] = { - { + [DA9150_GPADC_IDX] = { .name = "da9150-gpadc", .of_compatible = "dlg,da9150-gpadc", .resources = da9150_gpadc_resources, .num_resources = ARRAY_SIZE(da9150_gpadc_resources), }, - { + [DA9150_CHARGER_IDX] = { .name = "da9150-charger", .of_compatible = "dlg,da9150-charger", .resources = da9150_charger_resources, .num_resources = ARRAY_SIZE(da9150_charger_resources), }, + [DA9150_FG_IDX] = { + .name = 
"da9150-fuel-gauge", + .of_compatible = "dlg,da9150-fuel-gauge", + .resources = da9150_fg_resources, + .num_resources = ARRAY_SIZE(da9150_fg_resources), + }, }; static int da9150_probe(struct i2c_client *client, @@ -317,6 +426,7 @@ static int da9150_probe(struct i2c_client *client, { struct da9150 *da9150; struct da9150_pdata *pdata = dev_get_platdata(&client->dev); + int qif_addr; int ret; da9150 = devm_kzalloc(&client->dev, sizeof(*da9150), GFP_KERNEL); @@ -335,16 +445,41 @@ static int da9150_probe(struct i2c_client *client, return ret; } - da9150->irq_base = pdata ? pdata->irq_base : -1; + /* Setup secondary I2C interface for QIF access */ + qif_addr = da9150_reg_read(da9150, DA9150_CORE2WIRE_CTRL_A); + qif_addr = (qif_addr & DA9150_CORE_BASE_ADDR_MASK) >> 1; + qif_addr |= DA9150_QIF_I2C_ADDR_LSB; + da9150->core_qif = i2c_new_dummy(client->adapter, qif_addr); + if (!da9150->core_qif) { + dev_err(da9150->dev, "Failed to attach QIF client\n"); + return -ENODEV; + } + + i2c_set_clientdata(da9150->core_qif, da9150); + + if (pdata) { + da9150->irq_base = pdata->irq_base; + + da9150_devs[DA9150_FG_IDX].platform_data = pdata->fg_pdata; + da9150_devs[DA9150_FG_IDX].pdata_size = + sizeof(struct da9150_fg_pdata); + } else { + da9150->irq_base = -1; + } ret = regmap_add_irq_chip(da9150->regmap, da9150->irq, IRQF_TRIGGER_LOW | IRQF_ONESHOT, da9150->irq_base, &da9150_regmap_irq_chip, &da9150->regmap_irq_data); - if (ret) - return ret; + if (ret) { + dev_err(da9150->dev, "Failed to add regmap irq chip: %d\n", + ret); + goto regmap_irq_fail; + } + da9150->irq_base = regmap_irq_chip_get_base(da9150->regmap_irq_data); + enable_irq_wake(da9150->irq); ret = mfd_add_devices(da9150->dev, -1, da9150_devs, @@ -352,11 +487,17 @@ static int da9150_probe(struct i2c_client *client, da9150->irq_base, NULL); if (ret) { dev_err(da9150->dev, "Failed to add child devices: %d\n", ret); - regmap_del_irq_chip(da9150->irq, da9150->regmap_irq_data); - return ret; + goto mfd_fail; } return 0; + +mfd_fail: + regmap_del_irq_chip(da9150->irq, da9150->regmap_irq_data); +regmap_irq_fail: + i2c_unregister_device(da9150->core_qif); + + return ret; } static int da9150_remove(struct i2c_client *client) @@ -365,6 +506,7 @@ static int da9150_remove(struct i2c_client *client) regmap_del_irq_chip(da9150->irq, da9150->regmap_irq_data); mfd_remove_devices(da9150->dev); + i2c_unregister_device(da9150->core_qif); return 0; } diff --git a/include/linux/mfd/da9150/core.h b/include/linux/mfd/da9150/core.h index 76e668933a77..1bf50caeb9fa 100644 --- a/include/linux/mfd/da9150/core.h +++ b/include/linux/mfd/da9150/core.h @@ -15,6 +15,7 @@ #define __DA9150_CORE_H #include +#include #include #include @@ -46,23 +47,39 @@ #define DA9150_IRQ_GPADC 19 #define DA9150_IRQ_WKUP 20 +/* I2C sub-device address */ +#define DA9150_QIF_I2C_ADDR_LSB 0x5 + +struct da9150_fg_pdata { + u32 update_interval; /* msecs */ + u8 warn_soc_lvl; /* % value */ + u8 crit_soc_lvl; /* % value */ +}; + struct da9150_pdata { int irq_base; + struct da9150_fg_pdata *fg_pdata; }; struct da9150 { struct device *dev; struct regmap *regmap; + struct i2c_client *core_qif; + struct regmap_irq_chip_data *regmap_irq_data; int irq; int irq_base; }; -/* Device I/O */ +/* Device I/O - Query Interface for FG and standard register access */ +void da9150_read_qif(struct da9150 *da9150, u8 addr, int count, u8 *buf); +void da9150_write_qif(struct da9150 *da9150, u8 addr, int count, const u8 *buf); + u8 da9150_reg_read(struct da9150 *da9150, u16 reg); void da9150_reg_write(struct da9150 *da9150, 
u16 reg, u8 val); void da9150_set_bits(struct da9150 *da9150, u16 reg, u8 mask, u8 val); void da9150_bulk_read(struct da9150 *da9150, u16 reg, int count, u8 *buf); void da9150_bulk_write(struct da9150 *da9150, u16 reg, int count, const u8 *buf); + #endif /* __DA9150_CORE_H */ -- cgit v1.2.3 From 98a3be44ffa67b812de7aa7aed9f2331edcfb1a5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 23 Oct 2015 12:16:41 +0300 Subject: mfd: core: redo ACPI matching of the children devices There is at least one board on the market, i.e. Intel Galileo Gen2, that uses _ADR to distinguish the devices under one actual device. Due to this we have to improve the quirk in the MFD core to handle that board. Acked-by: Rafael J. Wysocki Acked-by: Lee Jones Signed-off-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- Documentation/acpi/enumeration.txt | 11 +++++--- drivers/mfd/mfd-core.c | 52 ++++++++++++++++++++++++++------------ include/linux/mfd/core.h | 10 ++++++-- 3 files changed, 52 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/Documentation/acpi/enumeration.txt b/Documentation/acpi/enumeration.txt index b731b292e812..a91ec5af52df 100644 --- a/Documentation/acpi/enumeration.txt +++ b/Documentation/acpi/enumeration.txt @@ -347,13 +347,18 @@ For the first case, the MFD drivers do not need to do anything. The resulting child platform device will have its ACPI_COMPANION() set to point to the parent device. -If the ACPI namespace has a device that we can match using an ACPI id, -the id should be set like: +If the ACPI namespace has a device that we can match using an ACPI id or ACPI +adr, the cell should be set like: + + static struct mfd_cell_acpi_match my_subdevice_cell_acpi_match = { + .pnpid = "XYZ0001", + .adr = 0, + }; static struct mfd_cell my_subdevice_cell = { .name = "my_subdevice", /* set the resources relative to the parent */ - .acpi_pnpid = "XYZ0001", + .acpi_match = &my_subdevice_cell_acpi_match, }; The ACPI id "XYZ0001" is then used to lookup an ACPI device directly under diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c index c17635d3e504..60b60dc63ddd 100644 --- a/drivers/mfd/mfd-core.c +++ b/drivers/mfd/mfd-core.c @@ -82,29 +82,49 @@ static int mfd_platform_add_cell(struct platform_device *pdev, static void mfd_acpi_add_device(const struct mfd_cell *cell, struct platform_device *pdev) { - struct acpi_device *parent_adev; + const struct mfd_cell_acpi_match *match = cell->acpi_match; + struct acpi_device *parent, *child; struct acpi_device *adev; - parent_adev = ACPI_COMPANION(pdev->dev.parent); - if (!parent_adev) + parent = ACPI_COMPANION(pdev->dev.parent); + if (!parent) return; /* - * MFD child device gets its ACPI handle either from the ACPI - * device directly under the parent that matches the acpi_pnpid or - * it will use the parent handle if is no acpi_pnpid is given. + * MFD child device gets its ACPI handle either from the ACPI device + * directly under the parent that matches the either _HID or _CID, or + * _ADR or it will use the parent handle if is no ID is given. + * + * Note that use of _ADR is a grey area in the ACPI specification, + * though Intel Galileo Gen2 is using it to distinguish the children + * devices. 
*/ - adev = parent_adev; - if (cell->acpi_pnpid) { - struct acpi_device_id ids[2] = {}; - struct acpi_device *child_adev; - - strlcpy(ids[0].id, cell->acpi_pnpid, sizeof(ids[0].id)); - list_for_each_entry(child_adev, &parent_adev->children, node) - if (acpi_match_device_ids(child_adev, ids)) { - adev = child_adev; - break; + adev = parent; + if (match) { + if (match->pnpid) { + struct acpi_device_id ids[2] = {}; + + strlcpy(ids[0].id, match->pnpid, sizeof(ids[0].id)); + list_for_each_entry(child, &parent->children, node) { + if (acpi_match_device_ids(child, ids)) { + adev = child; + break; + } + } + } else { + unsigned long long adr; + acpi_status status; + + list_for_each_entry(child, &parent->children, node) { + status = acpi_evaluate_integer(child->handle, + "_ADR", NULL, + &adr); + if (ACPI_SUCCESS(status) && match->adr == adr) { + adev = child; + break; + } } + } } ACPI_COMPANION_SET(&pdev->dev, adev); diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h index a76bc100bf97..27dac3ff18b9 100644 --- a/include/linux/mfd/core.h +++ b/include/linux/mfd/core.h @@ -18,6 +18,12 @@ struct irq_domain; +/* Matches ACPI PNP id, either _HID or _CID, or ACPI _ADR */ +struct mfd_cell_acpi_match { + const char *pnpid; + const unsigned long long adr; +}; + /* * This struct describes the MFD part ("cell"). * After registration the copy of this structure will become the platform data @@ -44,8 +50,8 @@ struct mfd_cell { */ const char *of_compatible; - /* Matches ACPI PNP id, either _HID or _CID */ - const char *acpi_pnpid; + /* Matches ACPI */ + const struct mfd_cell_acpi_match *acpi_match; /* * These resources can be specified relative to the parent device. -- cgit v1.2.3
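To complement the _HID/_CID example added to Documentation/acpi/enumeration.txt above, an _ADR-based match (the Intel Galileo Gen2 case) would plausibly be declared as below; the cell name and address value are purely illustrative:

static struct mfd_cell_acpi_match my_adr_cell_acpi_match = {
	.adr = 1,			/* matched against the child's _ADR object */
};

static struct mfd_cell my_adr_cell = {
	.name = "my_subdevice",
	/* resources are set relative to the parent, as before */
	.acpi_match = &my_adr_cell_acpi_match,
};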