From 607904c357c61adf20b8fd18af765e501d61a385 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 9 Jan 2017 10:26:52 -0500
Subject: locking/spinlocks: Remove the unused spin_lock_bh_nested() API

The spin_lock_bh_nested() API is defined but is not used anywhere
in the kernel. So all spin_lock_bh_nested() and related APIs are
now removed.

Signed-off-by: Waiman Long <longman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1483975612-16447-1-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/spinlock.h         | 8 --------
 include/linux/spinlock_api_smp.h | 2 --
 include/linux/spinlock_api_up.h  | 1 -
 3 files changed, 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 47dd0cebd204..59248dcc6ef3 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -180,8 +180,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define raw_spin_lock_nested(lock, subclass) \
 	_raw_spin_lock_nested(lock, subclass)
-# define raw_spin_lock_bh_nested(lock, subclass) \
-	_raw_spin_lock_bh_nested(lock, subclass)
 
 # define raw_spin_lock_nest_lock(lock, nest_lock)			\
 	 do {								\
@@ -197,7 +195,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
 # define raw_spin_lock_nested(lock, subclass)		\
 	_raw_spin_lock(((void)(subclass), (lock)))
 # define raw_spin_lock_nest_lock(lock, nest_lock)	_raw_spin_lock(lock)
-# define raw_spin_lock_bh_nested(lock, subclass)	_raw_spin_lock_bh(lock)
 #endif
 
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
@@ -317,11 +314,6 @@ do {								\
 	raw_spin_lock_nested(spinlock_check(lock), subclass);	\
 } while (0)
 
-#define spin_lock_bh_nested(lock, subclass)			\
-do {								\
-	raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\
-} while (0)
-
 #define spin_lock_nest_lock(lock, nest_lock)				\
 do {									\
 	raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);	\
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 5344268e6e62..42dfab89e740 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -22,8 +22,6 @@ int in_lock_functions(unsigned long addr);
 void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)		__acquires(lock);
 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
 								__acquires(lock);
-void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
-								__acquires(lock);
 void __lockfunc
 _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map)
 								__acquires(lock);
diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h
index d3afef9d8dbe..d0d188861ad6 100644
--- a/include/linux/spinlock_api_up.h
+++ b/include/linux/spinlock_api_up.h
@@ -57,7 +57,6 @@
 
 #define _raw_spin_lock(lock)			__LOCK(lock)
 #define _raw_spin_lock_nested(lock, subclass)	__LOCK(lock)
-#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock)
 #define _raw_read_lock(lock)			__LOCK(lock)
 #define _raw_write_lock(lock)			__LOCK(lock)
 #define _raw_spin_lock_bh(lock)			__LOCK_BH(lock)
-- 
cgit v1.2.3


From a665ece8b471de45bc19af05d52a1eaa5bc06dca Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 12 Jan 2017 13:23:31 +0200
Subject: x86/platform/intel: Remove PMIC GPIO block support

Moorestown support was removed by commit:

  1a8359e411eb ("x86/mid: Remove Intel Moorestown")

Remove this leftover.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: platform-driver-x86@vger.kernel.org
Link: http://lkml.kernel.org/r/20170112112331.93236-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/intel-mid/sfi.c      |   1 -
 drivers/platform/x86/Kconfig           |   7 -
 drivers/platform/x86/Makefile          |   1 -
 drivers/platform/x86/intel_pmic_gpio.c | 326 ---------------------------------
 include/linux/intel_pmic_gpio.h        |  15 --
 5 files changed, 350 deletions(-)
 delete mode 100644 drivers/platform/x86/intel_pmic_gpio.c
 delete mode 100644 include/linux/intel_pmic_gpio.h

(limited to 'include')

diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index ce1303830231..19b43e3a9f0f 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -15,7 +15,6 @@
 #include <linux/interrupt.h>
 #include <linux/scatterlist.h>
 #include <linux/sfi.h>
-#include <linux/intel_pmic_gpio.h>
 #include <linux/spi/spi.h>
 #include <linux/i2c.h>
 #include <linux/skbuff.h>
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 5fe8be089b8b..fe5a3da3682f 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -816,13 +816,6 @@ config INTEL_SCU_IPC_UTIL
 	  low level access for debug work and updating the firmware. Say
 	  N unless you will be doing this on an Intel MID platform.
 
-config GPIO_INTEL_PMIC
-	bool "Intel PMIC GPIO support"
-	depends on INTEL_SCU_IPC && GPIOLIB
-	---help---
-	  Say Y here to support GPIO via the SCU IPC interface
-	  on Intel MID platforms.
-
 config INTEL_MID_POWER_BUTTON
 	tristate "power button driver for Intel MID platforms"
 	depends on INTEL_SCU_IPC && INPUT
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index d4111f0f8a78..b2f52a7690af 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -50,7 +50,6 @@ obj-$(CONFIG_INTEL_SCU_IPC)	+= intel_scu_ipc.o
 obj-$(CONFIG_INTEL_SCU_IPC_UTIL) += intel_scu_ipcutil.o
 obj-$(CONFIG_INTEL_MFLD_THERMAL) += intel_mid_thermal.o
 obj-$(CONFIG_INTEL_IPS)		+= intel_ips.o
-obj-$(CONFIG_GPIO_INTEL_PMIC)	+= intel_pmic_gpio.o
 obj-$(CONFIG_XO1_RFKILL)	+= xo1-rfkill.o
 obj-$(CONFIG_XO15_EBOOK)	+= xo15-ebook.o
 obj-$(CONFIG_IBM_RTL)		+= ibm_rtl.o
diff --git a/drivers/platform/x86/intel_pmic_gpio.c b/drivers/platform/x86/intel_pmic_gpio.c
deleted file mode 100644
index 91ae58510d92..000000000000
--- a/drivers/platform/x86/intel_pmic_gpio.c
+++ /dev/null
@@ -1,326 +0,0 @@
-/* Moorestown PMIC GPIO (access through IPC) driver
- * Copyright (c) 2008 - 2009, Intel Corporation.
- *
- * Author: Alek Du <alek.du@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* Supports:
- * Moorestown platform PMIC chip
- */
-
-#define pr_fmt(fmt) "%s: " fmt, __func__
-
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/io.h>
-#include <linux/gpio/driver.h>
-#include <asm/intel_scu_ipc.h>
-#include <linux/device.h>
-#include <linux/intel_pmic_gpio.h>
-#include <linux/platform_device.h>
-
-#define DRIVER_NAME "pmic_gpio"
-
-/* register offset that IPC driver should use
- * 8 GPIO + 8 GPOSW (6 controllable) + 8GPO
- */
-enum pmic_gpio_register {
-	GPIO0		= 0xE0,
-	GPIO7		= 0xE7,
-	GPIOINT		= 0xE8,
-	GPOSWCTL0	= 0xEC,
-	GPOSWCTL5	= 0xF1,
-	GPO		= 0xF4,
-};
-
-/* bits definition for GPIO & GPOSW */
-#define GPIO_DRV 0x01
-#define GPIO_DIR 0x02
-#define GPIO_DIN 0x04
-#define GPIO_DOU 0x08
-#define GPIO_INTCTL 0x30
-#define GPIO_DBC 0xc0
-
-#define GPOSW_DRV 0x01
-#define GPOSW_DOU 0x08
-#define GPOSW_RDRV 0x30
-
-#define GPIO_UPDATE_TYPE	0x80000000
-
-#define NUM_GPIO 24
-
-struct pmic_gpio {
-	struct mutex		buslock;
-	struct gpio_chip	chip;
-	void			*gpiointr;
-	int			irq;
-	unsigned		irq_base;
-	unsigned int		update_type;
-	u32			trigger_type;
-};
-
-static void pmic_program_irqtype(int gpio, int type)
-{
-	if (type & IRQ_TYPE_EDGE_RISING)
-		intel_scu_ipc_update_register(GPIO0 + gpio, 0x20, 0x20);
-	else
-		intel_scu_ipc_update_register(GPIO0 + gpio, 0x00, 0x20);
-
-	if (type & IRQ_TYPE_EDGE_FALLING)
-		intel_scu_ipc_update_register(GPIO0 + gpio, 0x10, 0x10);
-	else
-		intel_scu_ipc_update_register(GPIO0 + gpio, 0x00, 0x10);
-};
-
-static int pmic_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
-{
-	if (offset >= 8) {
-		pr_err("only pin 0-7 support input\n");
-		return -1;/* we only have 8 GPIO can use as input */
-	}
-	return intel_scu_ipc_update_register(GPIO0 + offset,
-							GPIO_DIR, GPIO_DIR);
-}
-
-static int pmic_gpio_direction_output(struct gpio_chip *chip,
-			unsigned offset, int value)
-{
-	int rc = 0;
-
-	if (offset < 8)/* it is GPIO */
-		rc = intel_scu_ipc_update_register(GPIO0 + offset,
-				GPIO_DRV | (value ? GPIO_DOU : 0),
-				GPIO_DRV | GPIO_DOU | GPIO_DIR);
-	else if (offset < 16)/* it is GPOSW */
-		rc = intel_scu_ipc_update_register(GPOSWCTL0 + offset - 8,
-				GPOSW_DRV | (value ? GPOSW_DOU : 0),
-				GPOSW_DRV | GPOSW_DOU | GPOSW_RDRV);
-	else if (offset > 15 && offset < 24)/* it is GPO */
-		rc = intel_scu_ipc_update_register(GPO,
-				value ? 1 << (offset - 16) : 0,
-				1 << (offset - 16));
-	else {
-		pr_err("invalid PMIC GPIO pin %d!\n", offset);
-		WARN_ON(1);
-	}
-
-	return rc;
-}
-
-static int pmic_gpio_get(struct gpio_chip *chip, unsigned offset)
-{
-	u8 r;
-	int ret;
-
-	/* we only have 8 GPIO pins we can use as input */
-	if (offset >= 8)
-		return -EOPNOTSUPP;
-	ret = intel_scu_ipc_ioread8(GPIO0 + offset, &r);
-	if (ret < 0)
-		return ret;
-	return r & GPIO_DIN;
-}
-
-static void pmic_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
-{
-	if (offset < 8)/* it is GPIO */
-		intel_scu_ipc_update_register(GPIO0 + offset,
-			GPIO_DRV | (value ? GPIO_DOU : 0),
-			GPIO_DRV | GPIO_DOU);
-	else if (offset < 16)/* it is GPOSW */
-		intel_scu_ipc_update_register(GPOSWCTL0 + offset - 8,
-			GPOSW_DRV | (value ? GPOSW_DOU : 0),
-			GPOSW_DRV | GPOSW_DOU | GPOSW_RDRV);
-	else if (offset > 15 && offset < 24) /* it is GPO */
-		intel_scu_ipc_update_register(GPO,
-			value ? 1 << (offset - 16) : 0,
-			1 << (offset - 16));
-}
-
-/*
- * This is called from genirq with pg->buslock locked and
- * irq_desc->lock held. We can not access the scu bus here, so we
- * store the change and update in the bus_sync_unlock() function below
- */
-static int pmic_irq_type(struct irq_data *data, unsigned type)
-{
-	struct pmic_gpio *pg = irq_data_get_irq_chip_data(data);
-	u32 gpio = data->irq - pg->irq_base;
-
-	if (gpio >= pg->chip.ngpio)
-		return -EINVAL;
-
-	pg->trigger_type = type;
-	pg->update_type = gpio | GPIO_UPDATE_TYPE;
-	return 0;
-}
-
-static int pmic_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
-{
-	struct pmic_gpio *pg = gpiochip_get_data(chip);
-
-	return pg->irq_base + offset;
-}
-
-static void pmic_bus_lock(struct irq_data *data)
-{
-	struct pmic_gpio *pg = irq_data_get_irq_chip_data(data);
-
-	mutex_lock(&pg->buslock);
-}
-
-static void pmic_bus_sync_unlock(struct irq_data *data)
-{
-	struct pmic_gpio *pg = irq_data_get_irq_chip_data(data);
-
-	if (pg->update_type) {
-		unsigned int gpio = pg->update_type & ~GPIO_UPDATE_TYPE;
-
-		pmic_program_irqtype(gpio, pg->trigger_type);
-		pg->update_type = 0;
-	}
-	mutex_unlock(&pg->buslock);
-}
-
-/* the gpiointr register is read-clear, so just do nothing. */
-static void pmic_irq_unmask(struct irq_data *data) { }
-
-static void pmic_irq_mask(struct irq_data *data) { }
-
-static struct irq_chip pmic_irqchip = {
-	.name			= "PMIC-GPIO",
-	.irq_mask		= pmic_irq_mask,
-	.irq_unmask		= pmic_irq_unmask,
-	.irq_set_type		= pmic_irq_type,
-	.irq_bus_lock		= pmic_bus_lock,
-	.irq_bus_sync_unlock	= pmic_bus_sync_unlock,
-};
-
-static irqreturn_t pmic_irq_handler(int irq, void *data)
-{
-	struct pmic_gpio *pg = data;
-	u8 intsts = *((u8 *)pg->gpiointr + 4);
-	int gpio;
-	irqreturn_t ret = IRQ_NONE;
-
-	for (gpio = 0; gpio < 8; gpio++) {
-		if (intsts & (1 << gpio)) {
-			pr_debug("pmic pin %d triggered\n", gpio);
-			generic_handle_irq(pg->irq_base + gpio);
-			ret = IRQ_HANDLED;
-		}
-	}
-	return ret;
-}
-
-static int platform_pmic_gpio_probe(struct platform_device *pdev)
-{
-	struct device *dev = &pdev->dev;
-	int irq = platform_get_irq(pdev, 0);
-	struct intel_pmic_gpio_platform_data *pdata = dev->platform_data;
-
-	struct pmic_gpio *pg;
-	int retval;
-	int i;
-
-	if (irq < 0) {
-		dev_dbg(dev, "no IRQ line\n");
-		return -EINVAL;
-	}
-
-	if (!pdata || !pdata->gpio_base || !pdata->irq_base) {
-		dev_dbg(dev, "incorrect or missing platform data\n");
-		return -EINVAL;
-	}
-
-	pg = kzalloc(sizeof(*pg), GFP_KERNEL);
-	if (!pg)
-		return -ENOMEM;
-
-	dev_set_drvdata(dev, pg);
-
-	pg->irq = irq;
-	/* setting up SRAM mapping for GPIOINT register */
-	pg->gpiointr = ioremap_nocache(pdata->gpiointr, 8);
-	if (!pg->gpiointr) {
-		pr_err("Can not map GPIOINT\n");
-		retval = -EINVAL;
-		goto err2;
-	}
-	pg->irq_base = pdata->irq_base;
-	pg->chip.label = "intel_pmic";
-	pg->chip.direction_input = pmic_gpio_direction_input;
-	pg->chip.direction_output = pmic_gpio_direction_output;
-	pg->chip.get = pmic_gpio_get;
-	pg->chip.set = pmic_gpio_set;
-	pg->chip.to_irq = pmic_gpio_to_irq;
-	pg->chip.base = pdata->gpio_base;
-	pg->chip.ngpio = NUM_GPIO;
-	pg->chip.can_sleep = 1;
-	pg->chip.parent = dev;
-
-	mutex_init(&pg->buslock);
-
-	pg->chip.parent = dev;
-	retval = gpiochip_add_data(&pg->chip, pg);
-	if (retval) {
-		pr_err("Can not add pmic gpio chip\n");
-		goto err;
-	}
-
-	retval = request_irq(pg->irq, pmic_irq_handler, 0, "pmic", pg);
-	if (retval) {
-		pr_warn("Interrupt request failed\n");
-		goto fail_request_irq;
-	}
-
-	for (i = 0; i < 8; i++) {
-		irq_set_chip_and_handler_name(i + pg->irq_base,
-					      &pmic_irqchip,
-					      handle_simple_irq,
-					      "demux");
-		irq_set_chip_data(i + pg->irq_base, pg);
-	}
-	return 0;
-
-fail_request_irq:
-	gpiochip_remove(&pg->chip);
-err:
-	iounmap(pg->gpiointr);
-err2:
-	kfree(pg);
-	return retval;
-}
-
-/* at the same time, register a platform driver
- * this supports the sfi 0.81 fw */
-static struct platform_driver platform_pmic_gpio_driver = {
-	.driver = {
-		.name		= DRIVER_NAME,
-	},
-	.probe		= platform_pmic_gpio_probe,
-};
-
-static int __init platform_pmic_gpio_init(void)
-{
-	return platform_driver_register(&platform_pmic_gpio_driver);
-}
-subsys_initcall(platform_pmic_gpio_init);
diff --git a/include/linux/intel_pmic_gpio.h b/include/linux/intel_pmic_gpio.h
deleted file mode 100644
index 920109a29191..000000000000
--- a/include/linux/intel_pmic_gpio.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef LINUX_INTEL_PMIC_H
-#define LINUX_INTEL_PMIC_H
-
-struct intel_pmic_gpio_platform_data {
-	/* the first IRQ of the chip */
-	unsigned	irq_base;
-	/* number assigned to the first GPIO */
-	unsigned	gpio_base;
-	/* sram address for gpiointr register, the langwell chip will map
-	 * the PMIC spi GPIO expander's GPIOINTR register in sram.
-	 */
-	unsigned	gpiointr;
-};
-
-#endif
-- 
cgit v1.2.3


From 5b485629ba0d5d027880769ff467c587b24b4bde Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Sun, 8 Jan 2017 23:58:09 +0900
Subject: kprobes, extable: Identify kprobes trampolines as kernel text area

Improve __kernel_text_address()/kernel_text_address() to return
true if the given address is on a kprobe's instruction slot
trampoline.

This can help stacktraces to determine the address is on a
text area or not.

To implement this atomically in is_kprobe_*_slot(), also change
the insn_cache page list to an RCU list.

This changes timings a bit (it delays page freeing to the RCU garbage
collection phase), but none of that is in the hot path.

Note: this change can add small overhead to stack unwinders because
it adds 2 additional checks to __kernel_text_address(). However, the
impact should be very small, because kprobe_insn_pages list has 1 entry
per 256 probes(on x86, on arm/arm64 it will be 1024 probes),
and kprobe_optinsn_pages has 1 entry per 32 probes(on x86).
In most use cases, the number of kprobe events may be less
than 20, which means that is_kprobe_*_slot() will check just one entry.

Tested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/148388747896.6869.6354262871751682264.stgit@devbox
[ Improved the changelog and coding style. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kprobes.h | 30 +++++++++++++++++++-
 kernel/extable.c        |  9 +++++-
 kernel/kprobes.c        | 73 ++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 91 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 8f6849084248..16ddfb8b304a 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -278,9 +278,13 @@ struct kprobe_insn_cache {
 	int nr_garbage;
 };
 
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
 extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c);
 extern void __free_insn_slot(struct kprobe_insn_cache *c,
 			     kprobe_opcode_t *slot, int dirty);
+/* sleep-less address checking routine  */
+extern bool __is_insn_slot_addr(struct kprobe_insn_cache *c,
+				unsigned long addr);
 
 #define DEFINE_INSN_CACHE_OPS(__name)					\
 extern struct kprobe_insn_cache kprobe_##__name##_slots;		\
@@ -294,6 +298,18 @@ static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\
 {									\
 	__free_insn_slot(&kprobe_##__name##_slots, slot, dirty);	\
 }									\
+									\
+static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
+{									\
+	return __is_insn_slot_addr(&kprobe_##__name##_slots, addr);	\
+}
+#else /* __ARCH_WANT_KPROBES_INSN_SLOT */
+#define DEFINE_INSN_CACHE_OPS(__name)					\
+static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
+{									\
+	return 0;							\
+}
+#endif
 
 DEFINE_INSN_CACHE_OPS(insn);
 
@@ -330,7 +346,6 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table,
 					     int write, void __user *buffer,
 					     size_t *length, loff_t *ppos);
 #endif
-
 #endif /* CONFIG_OPTPROBES */
 #ifdef CONFIG_KPROBES_ON_FTRACE
 extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
@@ -481,6 +496,19 @@ static inline int enable_jprobe(struct jprobe *jp)
 	return enable_kprobe(&jp->kp);
 }
 
+#ifndef CONFIG_KPROBES
+static inline bool is_kprobe_insn_slot(unsigned long addr)
+{
+	return false;
+}
+#endif
+#ifndef CONFIG_OPTPROBES
+static inline bool is_kprobe_optinsn_slot(unsigned long addr)
+{
+	return false;
+}
+#endif
+
 #ifdef CONFIG_KPROBES
 /*
  * Blacklist ganerating macro. Specify functions which is not probed
diff --git a/kernel/extable.c b/kernel/extable.c
index e3beec4a2339..e1359474baa5 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
+#include <linux/kprobes.h>
 
 #include <asm/sections.h>
 #include <linux/uaccess.h>
@@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr)
 		return 1;
 	if (is_ftrace_trampoline(addr))
 		return 1;
+	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+		return 1;
 	/*
 	 * There might be init symbols in saved stacktraces.
 	 * Give those symbols a chance to be printed in
@@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr)
 		return 1;
 	if (is_module_text_address(addr))
 		return 1;
-	return is_ftrace_trampoline(addr);
+	if (is_ftrace_trampoline(addr))
+		return 1;
+	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+		return 1;
+	return 0;
 }
 
 /*
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 43460104f119..ebb4dadca66b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -149,9 +149,11 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 	struct kprobe_insn_page *kip;
 	kprobe_opcode_t *slot = NULL;
 
+	/* Since the slot array is not protected by rcu, we need a mutex */
 	mutex_lock(&c->mutex);
  retry:
-	list_for_each_entry(kip, &c->pages, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(kip, &c->pages, list) {
 		if (kip->nused < slots_per_page(c)) {
 			int i;
 			for (i = 0; i < slots_per_page(c); i++) {
@@ -159,6 +161,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 					kip->slot_used[i] = SLOT_USED;
 					kip->nused++;
 					slot = kip->insns + (i * c->insn_size);
+					rcu_read_unlock();
 					goto out;
 				}
 			}
@@ -167,6 +170,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 			WARN_ON(1);
 		}
 	}
+	rcu_read_unlock();
 
 	/* If there are any garbage slots, collect it and try again. */
 	if (c->nr_garbage && collect_garbage_slots(c) == 0)
@@ -193,7 +197,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 	kip->nused = 1;
 	kip->ngarbage = 0;
 	kip->cache = c;
-	list_add(&kip->list, &c->pages);
+	list_add_rcu(&kip->list, &c->pages);
 	slot = kip->insns;
 out:
 	mutex_unlock(&c->mutex);
@@ -213,7 +217,8 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
 		 * next time somebody inserts a probe.
 		 */
 		if (!list_is_singular(&kip->list)) {
-			list_del(&kip->list);
+			list_del_rcu(&kip->list);
+			synchronize_rcu();
 			kip->cache->free(kip->insns);
 			kfree(kip);
 		}
@@ -235,8 +240,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
 			continue;
 		kip->ngarbage = 0;	/* we will collect all garbages */
 		for (i = 0; i < slots_per_page(c); i++) {
-			if (kip->slot_used[i] == SLOT_DIRTY &&
-			    collect_one_slot(kip, i))
+			if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i))
 				break;
 		}
 	}
@@ -248,29 +252,60 @@ void __free_insn_slot(struct kprobe_insn_cache *c,
 		      kprobe_opcode_t *slot, int dirty)
 {
 	struct kprobe_insn_page *kip;
+	long idx;
 
 	mutex_lock(&c->mutex);
-	list_for_each_entry(kip, &c->pages, list) {
-		long idx = ((long)slot - (long)kip->insns) /
-				(c->insn_size * sizeof(kprobe_opcode_t));
-		if (idx >= 0 && idx < slots_per_page(c)) {
-			WARN_ON(kip->slot_used[idx] != SLOT_USED);
-			if (dirty) {
-				kip->slot_used[idx] = SLOT_DIRTY;
-				kip->ngarbage++;
-				if (++c->nr_garbage > slots_per_page(c))
-					collect_garbage_slots(c);
-			} else
-				collect_one_slot(kip, idx);
+	rcu_read_lock();
+	list_for_each_entry_rcu(kip, &c->pages, list) {
+		idx = ((long)slot - (long)kip->insns) /
+			(c->insn_size * sizeof(kprobe_opcode_t));
+		if (idx >= 0 && idx < slots_per_page(c))
 			goto out;
-		}
 	}
-	/* Could not free this slot. */
+	/* Could not find this slot. */
 	WARN_ON(1);
+	kip = NULL;
 out:
+	rcu_read_unlock();
+	/* Mark and sweep: this may sleep */
+	if (kip) {
+		/* Check double free */
+		WARN_ON(kip->slot_used[idx] != SLOT_USED);
+		if (dirty) {
+			kip->slot_used[idx] = SLOT_DIRTY;
+			kip->ngarbage++;
+			if (++c->nr_garbage > slots_per_page(c))
+				collect_garbage_slots(c);
+		} else {
+			collect_one_slot(kip, idx);
+		}
+	}
 	mutex_unlock(&c->mutex);
 }
 
+/*
+ * Check given address is on the page of kprobe instruction slots.
+ * This will be used for checking whether the address on a stack
+ * is on a text area or not.
+ */
+bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
+{
+	struct kprobe_insn_page *kip;
+	bool ret = false;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(kip, &c->pages, list) {
+		if (addr >= (unsigned long)kip->insns &&
+		    addr < (unsigned long)kip->insns + PAGE_SIZE) {
+			ret = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 #ifdef CONFIG_OPTPROBES
 /* For optimized_kprobe buffer */
 struct kprobe_insn_cache kprobe_optinsn_slots = {
-- 
cgit v1.2.3


From c31cc6a5187e8b09ccee34f81728a90f80e872e7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 5 Jan 2017 18:11:43 +0100
Subject: sched/cputime: Allow accounting system time using cpustat index

In order to prepare for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y to delay
cputime accounting to the tick, let's provide APIs to account system
time to precise contexts: hardirq, softirq, pure system, ...

Inspired-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1483636310-6557-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kernel_stat.h |  2 ++
 kernel/sched/cputime.c      | 10 +++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 00f776816aa3..14b63bb714d4 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -80,6 +80,8 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
 
 extern void account_user_time(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
+extern void account_system_index_time(struct task_struct *, cputime_t,
+				      enum cpu_usage_stat);
 extern void account_steal_time(cputime_t);
 extern void account_idle_time(cputime_t);
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 7700a9cba335..aad42835938c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -176,8 +176,8 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
  * @cputime: the cpu time spent in kernel space since the last update
  * @index: pointer to cpustat field that has to be updated
  */
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime, int index)
+void account_system_index_time(struct task_struct *p,
+			       cputime_t cputime, enum cpu_usage_stat index)
 {
 	/* Add system time to process. */
 	p->stime += cputime;
@@ -213,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	else
 		index = CPUTIME_SYSTEM;
 
-	__account_system_time(p, cputime, index);
+	account_system_index_time(p, cputime, index);
 }
 
 /*
@@ -400,7 +400,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 		 * So, we have to handle it separately here.
 		 * Also, p->stime needs to be updated for ksoftirqd.
 		 */
-		__account_system_time(p, cputime, CPUTIME_SOFTIRQ);
+		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
 	} else if (user_tick) {
 		account_user_time(p, cputime);
 	} else if (p == rq->idle) {
@@ -408,7 +408,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	} else if (p->flags & PF_VCPU) { /* System time or guest time */
 		account_guest_time(p, cputime);
 	} else {
-		__account_system_time(p, cputime, CPUTIME_SYSTEM);
+		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
 	}
 }
 
-- 
cgit v1.2.3


From 1213699ab426608ff1925ab263dd6925102bb92a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 5 Jan 2017 18:11:44 +0100
Subject: sched/cputime: Export account_guest_time()

In order to prepare for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y to delay
cputime accounting to the tick, let's allow archs to account cputime
directly to gtime.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1483636310-6557-5-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kernel_stat.h | 1 +
 kernel/sched/cputime.c      | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 14b63bb714d4..cfd6c0c6d4e8 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -79,6 +79,7 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
 }
 
 extern void account_user_time(struct task_struct *, cputime_t);
+extern void account_guest_time(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
 extern void account_system_index_time(struct task_struct *, cputime_t,
 				      enum cpu_usage_stat);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index aad42835938c..5813ee4a5168 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -151,7 +151,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in virtual machine since the last update
  */
-static void account_guest_time(struct task_struct *p, cputime_t cputime)
+void account_guest_time(struct task_struct *p, cputime_t cputime)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 
-- 
cgit v1.2.3


From c8d7dabf8f91fadd265e6eb87afb201d14ea299b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 5 Jan 2017 18:11:50 +0100
Subject: sched/cputime: Rename vtime_account_user() to vtime_flush()

CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y used to accumulate user time and
account it on ticks and context switches only through the
vtime_account_user() function.

Now this model has been generalized on the 3 archs for all kind of
cputime (system, irq, ...) and all the cputime flushing happens under
vtime_account_user().

So let's rename this function to better reflect its new role.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1483636310-6557-11-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/kernel/time.c     | 2 +-
 arch/powerpc/kernel/time.c  | 6 ++----
 arch/s390/kernel/vtime.c    | 2 +-
 include/linux/kernel_stat.h | 2 +-
 include/linux/vtime.h       | 7 +++++--
 kernel/sched/cputime.c      | 4 +---
 6 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 37f1b315d9b6..d040f12ea9f9 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -61,7 +61,7 @@ static struct clocksource *itc_clocksource;
 
 extern cputime_t cycle_to_cputime(u64 cyc);
 
-void vtime_account_user(struct task_struct *tsk)
+void vtime_flush(struct task_struct *tsk)
 {
 	struct thread_info *ti = task_thread_info(tsk);
 	cputime_t delta;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 4255e6930ac1..02e97305d22b 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -382,15 +382,13 @@ void vtime_account_idle(struct task_struct *tsk)
 }
 
 /*
- * Transfer the user time accumulated in the paca
- * by the exception entry and exit code to the generic
- * process user time records.
+ * Account the whole cputime accumulated in the paca
  * Must be called with interrupts disabled.
  * Assumes that vtime_account_system/idle() has been called
  * recently (i.e. since the last entry from usermode) so that
  * get_paca()->user_time_scaled is up to date.
  */
-void vtime_account_user(struct task_struct *tsk)
+void vtime_flush(struct task_struct *tsk)
 {
 	struct cpu_accounting_data *acct = get_accounting(tsk);
 
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 1a53e0bdc90a..0a9e5d67547d 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -214,7 +214,7 @@ void vtime_task_switch(struct task_struct *prev)
  * accounting system time in order to correctly compute
  * the stolen time accounting.
  */
-void vtime_account_user(struct task_struct *tsk)
+void vtime_flush(struct task_struct *tsk)
 {
 	if (do_account_vtime(tsk))
 		virt_timer_expire();
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index cfd6c0c6d4e8..c3e38ded2d73 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -89,7 +89,7 @@ extern void account_idle_time(cputime_t);
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 static inline void account_process_tick(struct task_struct *tsk, int user)
 {
-	vtime_account_user(tsk);
+	vtime_flush(tsk);
 }
 #else
 extern void account_process_tick(struct task_struct *, int user);
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index aa9bfea8804a..0681fe25abeb 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -58,27 +58,28 @@ static inline void vtime_task_switch(struct task_struct *prev)
 
 extern void vtime_account_system(struct task_struct *tsk);
 extern void vtime_account_idle(struct task_struct *tsk);
-extern void vtime_account_user(struct task_struct *tsk);
 
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
 static inline void vtime_task_switch(struct task_struct *prev) { }
 static inline void vtime_account_system(struct task_struct *tsk) { }
-static inline void vtime_account_user(struct task_struct *tsk) { }
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void arch_vtime_task_switch(struct task_struct *tsk);
+extern void vtime_account_user(struct task_struct *tsk);
 extern void vtime_user_enter(struct task_struct *tsk);
 
 static inline void vtime_user_exit(struct task_struct *tsk)
 {
 	vtime_account_user(tsk);
 }
+
 extern void vtime_guest_enter(struct task_struct *tsk);
 extern void vtime_guest_exit(struct task_struct *tsk);
 extern void vtime_init_idle(struct task_struct *tsk, int cpu);
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
+static inline void vtime_account_user(struct task_struct *tsk) { }
 static inline void vtime_user_enter(struct task_struct *tsk) { }
 static inline void vtime_user_exit(struct task_struct *tsk) { }
 static inline void vtime_guest_enter(struct task_struct *tsk) { }
@@ -93,9 +94,11 @@ static inline void vtime_account_irq_exit(struct task_struct *tsk)
 	/* On hard|softirq exit we always account to hard|softirq cputime */
 	vtime_account_system(tsk);
 }
+extern void vtime_flush(struct task_struct *tsk);
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
 static inline void vtime_account_irq_exit(struct task_struct *tsk) { }
+static inline void vtime_flush(struct task_struct *tsk) { }
 #endif
 
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5813ee4a5168..f7c14cc71d06 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -437,9 +437,7 @@ void vtime_common_task_switch(struct task_struct *prev)
 	else
 		vtime_account_system(prev);
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	vtime_account_user(prev);
-#endif
+	vtime_flush(prev);
 	arch_vtime_task_switch(prev);
 }
 #endif
-- 
cgit v1.2.3


From 642fa448ae6b3a4e5e8737054a094173405b7643 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 3 Jan 2017 13:43:14 -0800
Subject: sched/core: Remove set_task_state()

This is a nasty interface and setting the state of a foreign task must
not be done. As of the following commit:

  be628be0956 ("bcache: Make gc wakeup sane, remove set_task_state()")

... everyone in the kernel calls set_task_state() with current, allowing
the helper to be removed.

However, as the comment indicates, it is still around for those archs
where computing current is more expensive than using a pointer, at least
in theory. An important arch that is affected is arm64, however this has
been addressed now [1] and performance is up to par making no difference
with either calls.

Of all the callers, if any, it's the locking bits that would care most
about this -- ie: we end up passing a tsk pointer to a lot of the lock
slowpath, and setting ->state on that. The following numbers are based
on two tests: a custom ad-hoc microbenchmark that just measures
latencies (for ~65 million calls) between get_task_state() vs
get_current_state().

Secondly for a higher overview, an unlink microbenchmark was used,
which pounds on a single file with open, close,unlink combos with
increasing thread counts (up to 4x ncpus). While the workload is quite
unrealistic, it does contend a lot on the inode mutex or now rwsem.

[1] https://lkml.kernel.org/r/1483468021-8237-1-git-send-email-mark.rutland@arm.com

== 1. x86-64 ==

Avg runtime set_task_state():    601 msecs
Avg runtime set_current_state(): 552 msecs

                                            vanilla                 dirty
Hmean    unlink1-processes-2      36089.26 (  0.00%)    38977.33 (  8.00%)
Hmean    unlink1-processes-5      28555.01 (  0.00%)    29832.55 (  4.28%)
Hmean    unlink1-processes-8      37323.75 (  0.00%)    44974.57 ( 20.50%)
Hmean    unlink1-processes-12     43571.88 (  0.00%)    44283.01 (  1.63%)
Hmean    unlink1-processes-21     34431.52 (  0.00%)    38284.45 ( 11.19%)
Hmean    unlink1-processes-30     34813.26 (  0.00%)    37975.17 (  9.08%)
Hmean    unlink1-processes-48     37048.90 (  0.00%)    39862.78 (  7.59%)
Hmean    unlink1-processes-79     35630.01 (  0.00%)    36855.30 (  3.44%)
Hmean    unlink1-processes-110    36115.85 (  0.00%)    39843.91 ( 10.32%)
Hmean    unlink1-processes-141    32546.96 (  0.00%)    35418.52 (  8.82%)
Hmean    unlink1-processes-172    34674.79 (  0.00%)    36899.21 (  6.42%)
Hmean    unlink1-processes-203    37303.11 (  0.00%)    36393.04 ( -2.44%)
Hmean    unlink1-processes-224    35712.13 (  0.00%)    36685.96 (  2.73%)

== 2. ppc64le ==

Avg runtime set_task_state():  938 msecs
Avg runtime set_current_state: 940 msecs

                                            vanilla                 dirty
Hmean    unlink1-processes-2      19269.19 (  0.00%)    30704.50 ( 59.35%)
Hmean    unlink1-processes-5      20106.15 (  0.00%)    21804.15 (  8.45%)
Hmean    unlink1-processes-8      17496.97 (  0.00%)    17243.28 ( -1.45%)
Hmean    unlink1-processes-12     14224.15 (  0.00%)    17240.21 ( 21.20%)
Hmean    unlink1-processes-21     14155.66 (  0.00%)    15681.23 ( 10.78%)
Hmean    unlink1-processes-30     14450.70 (  0.00%)    15995.83 ( 10.69%)
Hmean    unlink1-processes-48     16945.57 (  0.00%)    16370.42 ( -3.39%)
Hmean    unlink1-processes-79     15788.39 (  0.00%)    14639.27 ( -7.28%)
Hmean    unlink1-processes-110    14268.48 (  0.00%)    14377.40 (  0.76%)
Hmean    unlink1-processes-141    14023.65 (  0.00%)    16271.69 ( 16.03%)
Hmean    unlink1-processes-172    13417.62 (  0.00%)    16067.55 ( 19.75%)
Hmean    unlink1-processes-203    15293.08 (  0.00%)    15440.40 (  0.96%)
Hmean    unlink1-processes-234    13719.32 (  0.00%)    16190.74 ( 18.01%)
Hmean    unlink1-processes-265    16400.97 (  0.00%)    16115.22 ( -1.74%)
Hmean    unlink1-processes-296    14388.60 (  0.00%)    16216.13 ( 12.70%)
Hmean    unlink1-processes-320    15771.85 (  0.00%)    15905.96 (  0.85%)

x86-64 (known to be fast for get_current()/this_cpu_read_stable() caching)
and ppc64 (with paca) show similar improvements in the unlink microbenches.
The small delta for ppc64 (2ms), does not represent the gains on the unlink
runs. In the case of x86, there was a decent amount of variation in the
latency runs, but always within a 20 to 50ms increase), ppc was more constant.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Cc: mark.rutland@arm.com
Link: http://lkml.kernel.org/r/1483479794-14013-5-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/um/drivers/random.c                           |  2 +-
 drivers/md/dm-bufio.c                              |  2 +-
 drivers/md/dm-crypt.c                              |  4 ++--
 drivers/md/persistent-data/dm-block-manager.c      |  4 ++--
 .../staging/lustre/lnet/libcfs/linux/linux-debug.c |  2 +-
 drivers/tty/tty_ldsem.c                            | 10 ++++----
 include/linux/sched.h                              | 27 +---------------------
 kernel/exit.c                                      |  4 ++--
 kernel/locking/mutex.c                             |  8 +++----
 kernel/locking/rwsem-spinlock.c                    |  8 +++----
 kernel/locking/rwsem-xadd.c                        |  4 ++--
 kernel/locking/semaphore.c                         |  2 +-
 12 files changed, 26 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index 05523f14d7b2..57f03050c850 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -76,7 +76,7 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size,
 			add_sigio_fd(random_fd);
 
 			add_wait_queue(&host_read_wait, &wait);
-			set_task_state(current, TASK_INTERRUPTIBLE);
+			set_current_state(TASK_INTERRUPTIBLE);
 
 			schedule();
 			remove_wait_queue(&host_read_wait, &wait);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 84d2f0e4c754..d36d427a9efb 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -794,7 +794,7 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
 	DECLARE_WAITQUEUE(wait, current);
 
 	add_wait_queue(&c->free_buffer_wait, &wait);
-	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	set_current_state(TASK_UNINTERRUPTIBLE);
 	dm_bufio_unlock(c);
 
 	io_schedule();
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 7c6c57216bf2..96692d13a6e4 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1210,14 +1210,14 @@ continue_locked:
 		spin_unlock_irq(&cc->write_thread_wait.lock);
 
 		if (unlikely(kthread_should_stop())) {
-			set_task_state(current, TASK_RUNNING);
+			set_current_state(TASK_RUNNING);
 			remove_wait_queue(&cc->write_thread_wait, &wait);
 			break;
 		}
 
 		schedule();
 
-		set_task_state(current, TASK_RUNNING);
+		set_current_state(TASK_RUNNING);
 		spin_lock_irq(&cc->write_thread_wait.lock);
 		__remove_wait_queue(&cc->write_thread_wait, &wait);
 		goto continue_locked;
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index a6dde7cab458..758d90cc2733 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -120,7 +120,7 @@ static int __check_holder(struct block_lock *lock)
 static void __wait(struct waiter *w)
 {
 	for (;;) {
-		set_task_state(current, TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 
 		if (!w->task)
 			break;
@@ -128,7 +128,7 @@ static void __wait(struct waiter *w)
 		schedule();
 	}
 
-	set_task_state(current, TASK_RUNNING);
+	set_current_state(TASK_RUNNING);
 }
 
 static void __wake_waiter(struct waiter *w)
diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c
index 39a72e3f0c18..7035356e56b3 100644
--- a/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c
+++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c
@@ -107,7 +107,7 @@ void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
 		libcfs_debug_dumplog();
 	if (libcfs_panic_on_lbug)
 		panic("LBUG");
-	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	set_current_state(TASK_UNINTERRUPTIBLE);
 	while (1)
 		schedule();
 }
diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c
index 3e6722954f29..9229de43e19d 100644
--- a/drivers/tty/tty_ldsem.c
+++ b/drivers/tty/tty_ldsem.c
@@ -231,7 +231,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout)
 
 	/* wait to be given the lock */
 	for (;;) {
-		set_task_state(current, TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 
 		if (!waiter.task)
 			break;
@@ -240,7 +240,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout)
 		timeout = schedule_timeout(timeout);
 	}
 
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
 
 	if (!timeout) {
 		/* lock timed out but check if this task was just
@@ -289,14 +289,14 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout)
 
 	waiter.task = current;
 
-	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	set_current_state(TASK_UNINTERRUPTIBLE);
 	for (;;) {
 		if (!timeout)
 			break;
 		raw_spin_unlock_irq(&sem->wait_lock);
 		timeout = schedule_timeout(timeout);
 		raw_spin_lock_irq(&sem->wait_lock);
-		set_task_state(current, TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 		locked = writer_trylock(sem);
 		if (locked)
 			break;
@@ -307,7 +307,7 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout)
 	list_del(&waiter.list);
 	raw_spin_unlock_irq(&sem->wait_lock);
 
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
 
 	/* lock wait may have timed out */
 	if (!locked)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad3ec9ec61f7..f4f9d32f12b3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -227,7 +227,7 @@ extern void proc_sched_set_task(struct task_struct *p);
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
 
-/* Convenience macros for the sake of set_task_state */
+/* Convenience macros for the sake of set_current_state */
 #define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
 #define TASK_STOPPED		(TASK_WAKEKILL | __TASK_STOPPED)
 #define TASK_TRACED		(TASK_WAKEKILL | __TASK_TRACED)
@@ -254,17 +254,6 @@ extern char ___assert_task_state[1 - 2*!!(
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
-#define __set_task_state(tsk, state_value)			\
-	do {							\
-		(tsk)->task_state_change = _THIS_IP_;		\
-		(tsk)->state = (state_value);			\
-	} while (0)
-#define set_task_state(tsk, state_value)			\
-	do {							\
-		(tsk)->task_state_change = _THIS_IP_;		\
-		smp_store_mb((tsk)->state, (state_value));	\
-	} while (0)
-
 #define __set_current_state(state_value)			\
 	do {							\
 		current->task_state_change = _THIS_IP_;		\
@@ -277,20 +266,6 @@ extern char ___assert_task_state[1 - 2*!!(
 	} while (0)
 
 #else
-
-/*
- * @tsk had better be current, or you get to keep the pieces.
- *
- * The only reason is that computing current can be more expensive than
- * using a pointer that's already available.
- *
- * Therefore, see set_current_state().
- */
-#define __set_task_state(tsk, state_value)		\
-	do { (tsk)->state = (state_value); } while (0)
-#define set_task_state(tsk, state_value)		\
-	smp_store_mb((tsk)->state, (state_value))
-
 /*
  * set_current_state() includes a barrier so that the write of current->state
  * is correctly serialised wrt the caller's subsequent test of whether to
diff --git a/kernel/exit.c b/kernel/exit.c
index 2385d434a46e..27c68653e2fc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -501,12 +501,12 @@ static void exit_mm(void)
 			complete(&core_state->startup);
 
 		for (;;) {
-			set_task_state(current, TASK_UNINTERRUPTIBLE);
+			set_current_state(TASK_UNINTERRUPTIBLE);
 			if (!self.task) /* see coredump_finish() */
 				break;
 			freezable_schedule();
 		}
-		__set_task_state(current, TASK_RUNNING);
+		__set_current_state(TASK_RUNNING);
 		down_read(&mm->mmap_sem);
 	}
 	atomic_inc(&mm->mm_count);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4c7d04362c95..97d142486f93 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -666,7 +666,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 	lock_contended(&lock->dep_map, ip);
 
-	set_task_state(current, state);
+	set_current_state(state);
 	for (;;) {
 		/*
 		 * Once we hold wait_lock, we're serialized against
@@ -701,7 +701,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 			__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
 		}
 
-		set_task_state(current, state);
+		set_current_state(state);
 		/*
 		 * Here we order against unlock; we must either see it change
 		 * state back to RUNNING and fall through the next schedule(),
@@ -715,7 +715,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	}
 	spin_lock_mutex(&lock->wait_lock, flags);
 acquired:
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
 
 	mutex_remove_waiter(lock, &waiter, current);
 	if (likely(list_empty(&lock->wait_list)))
@@ -735,7 +735,7 @@ skip_wait:
 	return 0;
 
 err:
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
 	mutex_remove_waiter(lock, &waiter, current);
 	spin_unlock_mutex(&lock->wait_lock, flags);
 	debug_mutex_free_waiter(&waiter);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 60d15d3bb2a8..5eacab880f67 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -139,7 +139,7 @@ void __sched __down_read(struct rw_semaphore *sem)
 		goto out;
 	}
 
-	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	set_current_state(TASK_UNINTERRUPTIBLE);
 
 	/* set up my own style of waitqueue */
 	waiter.task = current;
@@ -156,10 +156,10 @@ void __sched __down_read(struct rw_semaphore *sem)
 		if (!waiter.task)
 			break;
 		schedule();
-		set_task_state(current, TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 	}
 
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
  out:
 	;
 }
@@ -216,7 +216,7 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
 			ret = -EINTR;
 			goto out;
 		}
-		set_task_state(current, state);
+		set_current_state(state);
 		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 		schedule();
 		raw_spin_lock_irqsave(&sem->wait_lock, flags);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index d3b819c7f07f..a3a381aee24b 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -253,13 +253,13 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 
 	/* wait to be given the lock */
 	while (true) {
-		set_task_state(current, TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (!waiter.task)
 			break;
 		schedule();
 	}
 
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
 	return sem;
 }
 EXPORT_SYMBOL(rwsem_down_read_failed);
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 29aac9f9e5e4..9512e37637dc 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -215,7 +215,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
 			goto interrupted;
 		if (unlikely(timeout <= 0))
 			goto timed_out;
-		__set_task_state(current, state);
+		__set_current_state(state);
 		raw_spin_unlock_irq(&sem->lock);
 		timeout = schedule_timeout(timeout);
 		raw_spin_lock_irq(&sem->lock);
-- 
cgit v1.2.3


From 8f95c90ceb541a38ac16fec48c05142ef1450c25 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 11 Jan 2017 07:22:25 -0800
Subject: sched/wait, RCU: Introduce rcuwait machinery

rcuwait provides support for (single) RCU-safe task wait/wake functionality,
with the caveat that it must not be called after exit_notify(), such that
we avoid racing with rcu delayed_put_task_struct callbacks, task_struct
being rcu unaware in this context -- for which we similarly have
task_rcu_dereference() magic, but with different return semantics, which
can conflict with the wakeup side.

The interfaces are quite straightforward:

  rcuwait_wait_event()
  rcuwait_wake_up()

More details are in the comments, but it's perhaps worth mentioning at least,
that users must provide proper serialization when waiting on a condition, and
avoid corrupting a concurrent waiter. Also care must be taken between the task
and the condition for when calling the wakeup -- we cannot miss wakeups. When
porting users, this is for example, a given when using waitqueues in that
everything is done under the q->lock. As such, it can remove sources of non
preemptable unbounded work for realtime.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Link: http://lkml.kernel.org/r/1484148146-14210-2-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rcuwait.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/exit.c           | 30 +++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 include/linux/rcuwait.h

(limited to 'include')

diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
new file mode 100644
index 000000000000..0e93d56c7ab2
--- /dev/null
+++ b/include/linux/rcuwait.h
@@ -0,0 +1,63 @@
+#ifndef _LINUX_RCUWAIT_H_
+#define _LINUX_RCUWAIT_H_
+
+#include <linux/rcupdate.h>
+
+/*
+ * rcuwait provides a way of blocking and waking up a single
+ * task in an rcu-safe manner; where it is forbidden to use
+ * after exit_notify(). task_struct is not properly rcu protected,
+ * unless dealing with rcu-aware lists, ie: find_task_by_*().
+ *
+ * Alternatively we have task_rcu_dereference(), but the return
+ * semantics have different implications which would break the
+ * wakeup side. The only time @task is non-nil is when a user is
+ * blocked (or checking if it needs to) on a condition, and reset
+ * as soon as we know that the condition has succeeded and are
+ * awoken.
+ */
+struct rcuwait {
+	struct task_struct *task;
+};
+
+#define __RCUWAIT_INITIALIZER(name)		\
+	{ .task = NULL, }
+
+static inline void rcuwait_init(struct rcuwait *w)
+{
+	w->task = NULL;
+}
+
+extern void rcuwait_wake_up(struct rcuwait *w);
+
+/*
+ * The caller is responsible for locking around rcuwait_wait_event(),
+ * such that writes to @task are properly serialized.
+ */
+#define rcuwait_wait_event(w, condition)				\
+({									\
+	/*								\
+	 * Complain if we are called after do_exit()/exit_notify(),     \
+	 * as we cannot rely on the rcu critical region for the		\
+	 * wakeup side.							\
+	 */                                                             \
+	WARN_ON(current->exit_state);                                   \
+									\
+	rcu_assign_pointer((w)->task, current);				\
+	for (;;) {							\
+		/*							\
+		 * Implicit barrier (A) pairs with (B) in		\
+		 * rcuwait_trywake().					\
+		 */							\
+		set_current_state(TASK_UNINTERRUPTIBLE);		\
+		if (condition)						\
+			break;						\
+									\
+		schedule();						\
+	}								\
+									\
+	WRITE_ONCE((w)->task, NULL);					\
+	__set_current_state(TASK_RUNNING);				\
+})
+
+#endif /* _LINUX_RCUWAIT_H_ */
diff --git a/kernel/exit.c b/kernel/exit.c
index 27c68653e2fc..a9441da69e29 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,6 +55,7 @@
 #include <linux/shm.h>
 #include <linux/kcov.h>
 #include <linux/random.h>
+#include <linux/rcuwait.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -282,6 +283,35 @@ retry:
 	return task;
 }
 
+void rcuwait_wake_up(struct rcuwait *w)
+{
+	struct task_struct *task;
+
+	rcu_read_lock();
+
+	/*
+	 * Order condition vs @task, such that everything prior to the load
+	 * of @task is visible. This is the condition as to why the user called
+	 * rcuwait_trywake() in the first place. Pairs with set_current_state()
+	 * barrier (A) in rcuwait_wait_event().
+	 *
+	 *    WAIT                WAKE
+	 *    [S] tsk = current	  [S] cond = true
+	 *        MB (A)	      MB (B)
+	 *    [L] cond		  [L] tsk
+	 */
+	smp_rmb(); /* (B) */
+
+	/*
+	 * Avoid using task_rcu_dereference() magic as long as we are careful,
+	 * see comment in rcuwait_wait_event() regarding ->exit_state.
+	 */
+	task = rcu_dereference(w->task);
+	if (task)
+		wake_up_process(task);
+	rcu_read_unlock();
+}
+
 struct task_struct *try_get_task_struct(struct task_struct **ptask)
 {
 	struct task_struct *task;
-- 
cgit v1.2.3


From 52b94129f274937e4c25dd17b76697664a3c43c9 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 11 Jan 2017 07:22:26 -0800
Subject: locking/percpu-rwsem: Replace waitqueue with rcuwait

The use of any kind of wait queue is an overkill for pcpu-rwsems.
While one option would be to use the less heavy simple (swait)
flavor, this is still too much for what pcpu-rwsems needs. For one,
we do not care about any sort of queuing in that the only (rare) time
writers (and readers, for that matter) are queued is when trying to
acquire the regular contended rw_sem. There cannot be any further
queuing as writers are serialized by the rw_sem in the first place.

Given that percpu_down_write() must not be called after exit_notify(),
we can replace the bulky waitqueue with rcuwait such that a writer
can wait for its turn to take the lock. As such, we can avoid the
queue handling and locking overhead.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Link: http://lkml.kernel.org/r/1484148146-14210-3-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/percpu-rwsem.h  | 8 ++++----
 kernel/locking/percpu-rwsem.c | 7 +++----
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 5b2e6159b744..93664f022ecf 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -4,15 +4,15 @@
 #include <linux/atomic.h>
 #include <linux/rwsem.h>
 #include <linux/percpu.h>
-#include <linux/wait.h>
+#include <linux/rcuwait.h>
 #include <linux/rcu_sync.h>
 #include <linux/lockdep.h>
 
 struct percpu_rw_semaphore {
 	struct rcu_sync		rss;
 	unsigned int __percpu	*read_count;
-	struct rw_semaphore	rw_sem;
-	wait_queue_head_t	writer;
+	struct rw_semaphore	rw_sem; /* slowpath */
+	struct rcuwait          writer; /* blocked writer */
 	int			readers_block;
 };
 
@@ -22,7 +22,7 @@ static struct percpu_rw_semaphore name = {				\
 	.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),	\
 	.read_count = &__percpu_rwsem_rc_##name,			\
 	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
-	.writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),		\
+	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
 }
 
 extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index ce182599cf2e..883cf1b92d90 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,7 +1,6 @@
 #include <linux/atomic.h>
 #include <linux/rwsem.h>
 #include <linux/percpu.h>
-#include <linux/wait.h>
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/rcupdate.h>
@@ -18,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
 	__init_rwsem(&sem->rw_sem, name, rwsem_key);
-	init_waitqueue_head(&sem->writer);
+	rcuwait_init(&sem->writer);
 	sem->readers_block = 0;
 	return 0;
 }
@@ -103,7 +102,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
 	__this_cpu_dec(*sem->read_count);
 
 	/* Prod writer to recheck readers_active */
-	wake_up(&sem->writer);
+	rcuwait_wake_up(&sem->writer);
 }
 EXPORT_SYMBOL_GPL(__percpu_up_read);
 
@@ -160,7 +159,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
 	 */
 
 	/* Wait for all now active readers to complete. */
-	wait_event(sem->writer, readers_active_check(sem));
+	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
 
-- 
cgit v1.2.3


From e274795ea7b7caa0fd74ef651594382a69e2a951 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Jan 2017 14:17:48 +0100
Subject: locking/mutex: Fix mutex handoff

While reviewing the ww_mutex patches, I noticed that it was still
possible to (incorrectly) succeed for (incorrect) code like:

	mutex_lock(&a);
	mutex_lock(&a);

This was possible if the second mutex_lock() would block (as expected)
but then receive a spurious wakeup. At that point it would find itself
at the front of the queue, request a handoff and instantly claim
ownership and continue, since owner would point to itself.

Avoid this scenario and simplify the code by introducing a third low
bit to signal handoff pickup. So once we request handoff, unlock
clears the handoff bit and sets the pickup bit along with the new
owner.

This also removes the need for the .handoff argument to
__mutex_trylock(), since that becomes superfluous with PICKUP.

In order to guarantee enough low bits, ensure task_struct alignment is
at least L1_CACHE_BYTES (which seems a good ideal regardless).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 9d659ae14b54 ("locking/mutex: Add lock handoff to avoid starvation")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h  |   2 +-
 kernel/fork.c          |   6 ++-
 kernel/locking/mutex.c | 108 ++++++++++++++++++++++++-------------------------
 3 files changed, 57 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index b97870f2debd..3e1fccb47f11 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -65,7 +65,7 @@ struct mutex {
 
 static inline struct task_struct *__mutex_owner(struct mutex *lock)
 {
-	return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x03);
+	return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07);
 }
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8ab827c..a90510d0bbf8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -432,11 +432,13 @@ void __init fork_init(void)
 	int i;
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
-#define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
+#define ARCH_MIN_TASKALIGN	0
 #endif
+	int align = min_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep = kmem_cache_create("task_struct",
-			arch_task_struct_size, ARCH_MIN_TASKALIGN,
+			arch_task_struct_size, align,
 			SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
 #endif
 
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 97d142486f93..24284497c425 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -50,16 +50,17 @@ EXPORT_SYMBOL(__mutex_init);
 /*
  * @owner: contains: 'struct task_struct *' to the current lock owner,
  * NULL means not owned. Since task_struct pointers are aligned at
- * ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low
- * bits to store extra state.
+ * at least L1_CACHE_BYTES, we have low bits to store extra state.
  *
  * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
  * Bit1 indicates unlock needs to hand the lock to the top-waiter
+ * Bit2 indicates handoff has been done and we're waiting for pickup.
  */
 #define MUTEX_FLAG_WAITERS	0x01
 #define MUTEX_FLAG_HANDOFF	0x02
+#define MUTEX_FLAG_PICKUP	0x04
 
-#define MUTEX_FLAGS		0x03
+#define MUTEX_FLAGS		0x07
 
 static inline struct task_struct *__owner_task(unsigned long owner)
 {
@@ -72,38 +73,29 @@ static inline unsigned long __owner_flags(unsigned long owner)
 }
 
 /*
- * Actual trylock that will work on any unlocked state.
- *
- * When setting the owner field, we must preserve the low flag bits.
- *
- * Be careful with @handoff, only set that in a wait-loop (where you set
- * HANDOFF) to avoid recursive lock attempts.
+ * Trylock variant that retuns the owning task on failure.
  */
-static inline bool __mutex_trylock(struct mutex *lock, const bool handoff)
+static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
 {
 	unsigned long owner, curr = (unsigned long)current;
 
 	owner = atomic_long_read(&lock->owner);
 	for (;;) { /* must loop, can race against a flag */
 		unsigned long old, flags = __owner_flags(owner);
+		unsigned long task = owner & ~MUTEX_FLAGS;
+
+		if (task) {
+			if (likely(task != curr))
+				break;
+
+			if (likely(!(flags & MUTEX_FLAG_PICKUP)))
+				break;
 
-		if (__owner_task(owner)) {
-			if (handoff && unlikely(__owner_task(owner) == current)) {
-				/*
-				 * Provide ACQUIRE semantics for the lock-handoff.
-				 *
-				 * We cannot easily use load-acquire here, since
-				 * the actual load is a failed cmpxchg, which
-				 * doesn't imply any barriers.
-				 *
-				 * Also, this is a fairly unlikely scenario, and
-				 * this contains the cost.
-				 */
-				smp_mb(); /* ACQUIRE */
-				return true;
-			}
-
-			return false;
+			flags &= ~MUTEX_FLAG_PICKUP;
+		} else {
+#ifdef CONFIG_DEBUG_MUTEXES
+			DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP);
+#endif
 		}
 
 		/*
@@ -111,15 +103,24 @@ static inline bool __mutex_trylock(struct mutex *lock, const bool handoff)
 		 * past the point where we acquire it. This would be possible
 		 * if we (accidentally) set the bit on an unlocked mutex.
 		 */
-		if (handoff)
-			flags &= ~MUTEX_FLAG_HANDOFF;
+		flags &= ~MUTEX_FLAG_HANDOFF;
 
 		old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
 		if (old == owner)
-			return true;
+			return NULL;
 
 		owner = old;
 	}
+
+	return __owner_task(owner);
+}
+
+/*
+ * Actual trylock that will work on any unlocked state.
+ */
+static inline bool __mutex_trylock(struct mutex *lock)
+{
+	return !__mutex_trylock_or_owner(lock);
 }
 
 #ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -171,9 +172,9 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait
 
 /*
  * Give up ownership to a specific task, when @task = NULL, this is equivalent
- * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE
- * semantics like a regular unlock, the __mutex_trylock() provides matching
- * ACQUIRE semantics for the handoff.
+ * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves
+ * WAITERS. Provides RELEASE semantics like a regular unlock, the
+ * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff.
  */
 static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
 {
@@ -184,10 +185,13 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
 
 #ifdef CONFIG_DEBUG_MUTEXES
 		DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+		DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
 #endif
 
 		new = (owner & MUTEX_FLAG_WAITERS);
 		new |= (unsigned long)task;
+		if (task)
+			new |= MUTEX_FLAG_PICKUP;
 
 		old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
 		if (old == owner)
@@ -435,8 +439,6 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 				  struct ww_acquire_ctx *ww_ctx,
 				  const bool use_ww_ctx, const bool waiter)
 {
-	struct task_struct *task = current;
-
 	if (!waiter) {
 		/*
 		 * The purpose of the mutex_can_spin_on_owner() function is
@@ -476,24 +478,17 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 				goto fail_unlock;
 		}
 
+		/* Try to acquire the mutex... */
+		owner = __mutex_trylock_or_owner(lock);
+		if (!owner)
+			break;
+
 		/*
-		 * If there's an owner, wait for it to either
+		 * There's an owner, wait for it to either
 		 * release the lock or go to sleep.
 		 */
-		owner = __mutex_owner(lock);
-		if (owner) {
-			if (waiter && owner == task) {
-				smp_mb(); /* ACQUIRE */
-				break;
-			}
-
-			if (!mutex_spin_on_owner(lock, owner))
-				goto fail_unlock;
-		}
-
-		/* Try to acquire the mutex if it is unlocked. */
-		if (__mutex_trylock(lock, waiter))
-			break;
+		if (!mutex_spin_on_owner(lock, owner))
+			goto fail_unlock;
 
 		/*
 		 * The cpu_relax() call is a compiler barrier which forces
@@ -637,7 +632,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	preempt_disable();
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
 
-	if (__mutex_trylock(lock, false) ||
+	if (__mutex_trylock(lock) ||
 	    mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) {
 		/* got the lock, yay! */
 		lock_acquired(&lock->dep_map, ip);
@@ -651,7 +646,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	/*
 	 * After waiting to acquire the wait_lock, try again.
 	 */
-	if (__mutex_trylock(lock, false))
+	if (__mutex_trylock(lock))
 		goto skip_wait;
 
 	debug_mutex_lock_common(lock, &waiter);
@@ -674,7 +669,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * before testing the error conditions to make sure we pick up
 		 * the handoff.
 		 */
-		if (__mutex_trylock(lock, first))
+		if (__mutex_trylock(lock))
 			goto acquired;
 
 		/*
@@ -707,8 +702,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * state back to RUNNING and fall through the next schedule(),
 		 * or we must see its unlock and acquire.
 		 */
-		if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) ||
-		     __mutex_trylock(lock, first))
+		if (__mutex_trylock(lock) ||
+		    (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)))
 			break;
 
 		spin_lock_mutex(&lock->wait_lock, flags);
@@ -865,6 +860,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 
 #ifdef CONFIG_DEBUG_MUTEXES
 		DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+		DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
 #endif
 
 		if (owner & MUTEX_FLAG_HANDOFF)
@@ -1003,7 +999,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
  */
 int __sched mutex_trylock(struct mutex *lock)
 {
-	bool locked = __mutex_trylock(lock, false);
+	bool locked = __mutex_trylock(lock);
 
 	if (locked)
 		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-- 
cgit v1.2.3


From ea9e0fb8fe1bdfca81bd76052a5cce70bb053430 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Date: Wed, 21 Dec 2016 19:46:32 +0100
Subject: locking/ww_mutex: Set use_ww_ctx even when locking without a context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We will add a new field to struct mutex_waiter.  This field must be
initialized for all waiters if any waiter uses the ww_use_ctx path.

So there is a trade-off: Keep ww_mutex locking without a context on
the faster non-use_ww_ctx path, at the cost of adding the
initialization to all mutex locks (including non-ww_mutexes), or avoid
the additional cost for non-ww_mutex locks, at the cost of adding
additional checks to the use_ww_ctx path.

We take the latter choice.  It may be worth eliminating the users of
ww_mutex_lock(lock, NULL), but there are a lot of them.

Signed-off-by: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <dev@mblankhorst.nl>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dri-devel@lists.freedesktop.org
Link: http://lkml.kernel.org/r/1482346000-9927-5-git-send-email-nhaehnle@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ww_mutex.h | 11 ++---------
 kernel/locking/mutex.c   | 29 +++++++++++++++++------------
 2 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 7b0066814fa0..5f2e8379baff 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -222,11 +222,7 @@ extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock,
  */
 static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
-	if (ctx)
-		return __ww_mutex_lock(lock, ctx);
-
-	mutex_lock(&lock->base);
-	return 0;
+	return __ww_mutex_lock(lock, ctx);
 }
 
 /**
@@ -262,10 +258,7 @@ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ct
 static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
 							   struct ww_acquire_ctx *ctx)
 {
-	if (ctx)
-		return __ww_mutex_lock_interruptible(lock, ctx);
-	else
-		return mutex_lock_interruptible(&lock->base);
+	return __ww_mutex_lock_interruptible(lock, ctx);
 }
 
 /**
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 9ad03b9a5f7f..44a64c0a851a 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -469,7 +469,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 	for (;;) {
 		struct task_struct *owner;
 
-		if (use_ww_ctx && ww_ctx->acquired > 0) {
+		if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) {
 			struct ww_mutex *ww;
 
 			ww = container_of(lock, struct ww_mutex, base);
@@ -629,8 +629,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	struct ww_mutex *ww;
 	int ret;
 
-	if (use_ww_ctx) {
-		ww = container_of(lock, struct ww_mutex, base);
+	ww = container_of(lock, struct ww_mutex, base);
+
+	if (use_ww_ctx && ww_ctx) {
 		if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
 			return -EALREADY;
 	}
@@ -642,7 +643,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	    mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) {
 		/* got the lock, yay! */
 		lock_acquired(&lock->dep_map, ip);
-		if (use_ww_ctx)
+		if (use_ww_ctx && ww_ctx)
 			ww_mutex_set_context_fastpath(ww, ww_ctx);
 		preempt_enable();
 		return 0;
@@ -688,7 +689,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 			goto err;
 		}
 
-		if (use_ww_ctx && ww_ctx->acquired > 0) {
+		if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) {
 			ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
 			if (ret)
 				goto err;
@@ -728,7 +729,7 @@ skip_wait:
 	/* got the lock - cleanup and rejoice! */
 	lock_acquired(&lock->dep_map, ip);
 
-	if (use_ww_ctx)
+	if (use_ww_ctx && ww_ctx)
 		ww_mutex_set_context_slowpath(ww, ww_ctx);
 
 	spin_unlock_mutex(&lock->wait_lock, flags);
@@ -816,8 +817,9 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	might_sleep();
 	ret =  __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
-				   0, &ctx->dep_map, _RET_IP_, ctx, 1);
-	if (!ret && ctx->acquired > 1)
+				   0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
+				   ctx, 1);
+	if (!ret && ctx && ctx->acquired > 1)
 		return ww_mutex_deadlock_injection(lock, ctx);
 
 	return ret;
@@ -831,9 +833,10 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	might_sleep();
 	ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
-				  0, &ctx->dep_map, _RET_IP_, ctx, 1);
+				  0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
+				  ctx, 1);
 
-	if (!ret && ctx->acquired > 1)
+	if (!ret && ctx && ctx->acquired > 1)
 		return ww_mutex_deadlock_injection(lock, ctx);
 
 	return ret;
@@ -1021,7 +1024,8 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 	might_sleep();
 
 	if (__mutex_trylock_fast(&lock->base)) {
-		ww_mutex_set_context_fastpath(lock, ctx);
+		if (ctx)
+			ww_mutex_set_context_fastpath(lock, ctx);
 		return 0;
 	}
 
@@ -1035,7 +1039,8 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 	might_sleep();
 
 	if (__mutex_trylock_fast(&lock->base)) {
-		ww_mutex_set_context_fastpath(lock, ctx);
+		if (ctx)
+			ww_mutex_set_context_fastpath(lock, ctx);
 		return 0;
 	}
 
-- 
cgit v1.2.3


From c5470b22d1833241cae996d23a8a346ff8ec4d58 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Date: Wed, 21 Dec 2016 19:46:33 +0100
Subject: locking/ww_mutex: Remove the __ww_mutex_lock*() inline wrappers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Keep the documentation in the header file since there is no good place
for it in mutex.c: there are two rather different implementations with
different EXPORT_SYMBOLs for each function.

Signed-off-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <Nicolai.Haehnle@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <dev@mblankhorst.nl>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dri-devel@lists.freedesktop.org
Link: http://lkml.kernel.org/r/1482346000-9927-6-git-send-email-nhaehnle@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ww_mutex.h | 18 ++++--------------
 kernel/locking/mutex.c   | 16 ++++++++--------
 2 files changed, 12 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 5f2e8379baff..c07528522bbc 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -186,11 +186,6 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
 #endif
 }
 
-extern int __must_check __ww_mutex_lock(struct ww_mutex *lock,
-					struct ww_acquire_ctx *ctx);
-extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock,
-						      struct ww_acquire_ctx *ctx);
-
 /**
  * ww_mutex_lock - acquire the w/w mutex
  * @lock: the mutex to be acquired
@@ -220,10 +215,8 @@ extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock,
  *
  * A mutex acquired with this function must be released with ww_mutex_unlock.
  */
-static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
-{
-	return __ww_mutex_lock(lock, ctx);
-}
+extern int __must_check ww_mutex_lock(struct ww_mutex *lock,
+				      struct ww_acquire_ctx *ctx);
 
 /**
  * ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible
@@ -255,11 +248,8 @@ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ct
  *
  * A mutex acquired with this function must be released with ww_mutex_unlock.
  */
-static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
-							   struct ww_acquire_ctx *ctx)
-{
-	return __ww_mutex_lock_interruptible(lock, ctx);
-}
+extern int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
+						    struct ww_acquire_ctx *ctx);
 
 /**
  * ww_mutex_lock_slow - slowpath acquiring of the w/w mutex
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 44a64c0a851a..c696614a6b8b 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -811,7 +811,7 @@ ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 }
 
 int __sched
-__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	int ret;
 
@@ -824,10 +824,10 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__ww_mutex_lock);
+EXPORT_SYMBOL_GPL(ww_mutex_lock);
 
 int __sched
-__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	int ret;
 
@@ -841,7 +841,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
+EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
 
 #endif
 
@@ -1019,7 +1019,7 @@ EXPORT_SYMBOL(mutex_trylock);
 
 #ifndef CONFIG_DEBUG_LOCK_ALLOC
 int __sched
-__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	might_sleep();
 
@@ -1031,10 +1031,10 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	return __ww_mutex_lock_slowpath(lock, ctx);
 }
-EXPORT_SYMBOL(__ww_mutex_lock);
+EXPORT_SYMBOL(ww_mutex_lock);
 
 int __sched
-__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	might_sleep();
 
@@ -1046,7 +1046,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	return __ww_mutex_lock_interruptible_slowpath(lock, ctx);
 }
-EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
 
 #endif
 
-- 
cgit v1.2.3


From 6baa5c60a97d7f1a37a4b5ab5fb3a450e56e8b06 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Date: Wed, 21 Dec 2016 19:46:34 +0100
Subject: locking/ww_mutex: Add waiters in stamp order
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add regular waiters in stamp order. Keep adding waiters that have no
context in FIFO order and take care not to starve them.

While adding our task as a waiter, back off if we detect that there is
a waiter with a lower stamp in front of us.

Make sure to call lock_contended even when we back off early.

For w/w mutexes, being first in the wait list is only stable when
taking the lock without a context. Therefore, the purpose of the first
flag is split into two: 'first' remains to indicate whether we want to
spin optimistically, while 'handoff' indicates that we should be
prepared to accept a handoff.

For w/w locking with a context, we always accept handoffs after the
first schedule(), to handle the following sequence of events:

 1. Task #0 unlocks and hands off to Task #2 which is first in line

 2. Task #1 adds itself in front of Task #2

 3. Task #2 wakes up and must accept the handoff even though it is no
    longer first in line

Signed-off-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <Nicolai.Haehnle@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <dev@mblankhorst.nl>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dri-devel@lists.freedesktop.org
Link: http://lkml.kernel.org/r/1482346000-9927-7-git-send-email-nhaehnle@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h  |  3 ++
 kernel/locking/mutex.c | 76 +++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 72 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3e1fccb47f11..e17942ffb3fc 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -20,6 +20,8 @@
 #include <linux/osq_lock.h>
 #include <linux/debug_locks.h>
 
+struct ww_acquire_ctx;
+
 /*
  * Simple, straightforward mutexes with strict semantics:
  *
@@ -75,6 +77,7 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock)
 struct mutex_waiter {
 	struct list_head	list;
 	struct task_struct	*task;
+	struct ww_acquire_ctx	*ww_ctx;
 #ifdef CONFIG_DEBUG_MUTEXES
 	void			*magic;
 #endif
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index c696614a6b8b..d0f7628b5a3d 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -615,6 +615,52 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
 	return 0;
 }
 
+static inline int __sched
+__ww_mutex_add_waiter(struct mutex_waiter *waiter,
+		      struct mutex *lock,
+		      struct ww_acquire_ctx *ww_ctx)
+{
+	struct mutex_waiter *cur;
+	struct list_head *pos;
+
+	if (!ww_ctx) {
+		list_add_tail(&waiter->list, &lock->wait_list);
+		return 0;
+	}
+
+	/*
+	 * Add the waiter before the first waiter with a higher stamp.
+	 * Waiters without a context are skipped to avoid starving
+	 * them.
+	 */
+	pos = &lock->wait_list;
+	list_for_each_entry_reverse(cur, &lock->wait_list, list) {
+		if (!cur->ww_ctx)
+			continue;
+
+		if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
+			/* Back off immediately if necessary. */
+			if (ww_ctx->acquired > 0) {
+#ifdef CONFIG_DEBUG_MUTEXES
+				struct ww_mutex *ww;
+
+				ww = container_of(lock, struct ww_mutex, base);
+				DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+				ww_ctx->contending_lock = ww;
+#endif
+				return -EDEADLK;
+			}
+
+			break;
+		}
+
+		pos = &cur->list;
+	}
+
+	list_add_tail(&waiter->list, pos);
+	return 0;
+}
+
 /*
  * Lock a mutex (possibly interruptible), slowpath:
  */
@@ -659,15 +705,25 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	debug_mutex_lock_common(lock, &waiter);
 	debug_mutex_add_waiter(lock, &waiter, current);
 
-	/* add waiting tasks to the end of the waitqueue (FIFO): */
-	list_add_tail(&waiter.list, &lock->wait_list);
+	lock_contended(&lock->dep_map, ip);
+
+	if (!use_ww_ctx) {
+		/* add waiting tasks to the end of the waitqueue (FIFO): */
+		list_add_tail(&waiter.list, &lock->wait_list);
+	} else {
+		/* Add in stamp order, waking up waiters that must back off. */
+		ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
+		if (ret)
+			goto err_early_backoff;
+
+		waiter.ww_ctx = ww_ctx;
+	}
+
 	waiter.task = current;
 
 	if (__mutex_waiter_is_first(lock, &waiter))
 		__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
 
-	lock_contended(&lock->dep_map, ip);
-
 	set_current_state(state);
 	for (;;) {
 		/*
@@ -698,9 +754,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		spin_unlock_mutex(&lock->wait_lock, flags);
 		schedule_preempt_disabled();
 
-		if (!first && __mutex_waiter_is_first(lock, &waiter)) {
-			first = true;
-			__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
+		/*
+		 * ww_mutex needs to always recheck its position since its waiter
+		 * list is not FIFO ordered.
+		 */
+		if ((use_ww_ctx && ww_ctx) || !first) {
+			first = __mutex_waiter_is_first(lock, &waiter);
+			if (first)
+				__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
 		}
 
 		set_current_state(state);
@@ -739,6 +800,7 @@ skip_wait:
 err:
 	__set_current_state(TASK_RUNNING);
 	mutex_remove_waiter(lock, &waiter, current);
+err_early_backoff:
 	spin_unlock_mutex(&lock->wait_lock, flags);
 	debug_mutex_free_waiter(&waiter);
 	mutex_release(&lock->dep_map, 1, ip);
-- 
cgit v1.2.3


From 977625a693283dbb35b2a3f674bc0237f7347348 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Date: Wed, 21 Dec 2016 19:46:39 +0100
Subject: locking/mutex: Initialize mutex_waiter::ww_ctx with poison when
 debugging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Help catch cases where mutex_lock is used directly on w/w mutexes, which
otherwise result in the w/w tasks reading uninitialized data.

Signed-off-by: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <dev@mblankhorst.nl>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dri-devel@lists.freedesktop.org
Link: http://lkml.kernel.org/r/1482346000-9927-12-git-send-email-nhaehnle@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/poison.h | 1 +
 kernel/locking/mutex.c | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 51334edec506..a39540326417 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -80,6 +80,7 @@
 /********** kernel/mutexes **********/
 #define MUTEX_DEBUG_INIT	0x11
 #define MUTEX_DEBUG_FREE	0x22
+#define MUTEX_POISON_WW_CTX	((void *) 0x500 + POISON_POINTER_DELTA)
 
 /********** lib/flex_array.c **********/
 #define FLEX_ARRAY_FREE	0x6c	/* for use-after-free poisoning */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index cd8598aa0426..935116723a3d 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -785,6 +785,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	if (!use_ww_ctx) {
 		/* add waiting tasks to the end of the waitqueue (FIFO): */
 		list_add_tail(&waiter.list, &lock->wait_list);
+
+#ifdef CONFIG_DEBUG_MUTEXES
+		waiter.ww_ctx = MUTEX_POISON_WW_CTX;
+#endif
 	} else {
 		/* Add in stamp order, waking up waiters that must back off. */
 		ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
-- 
cgit v1.2.3


From 12907fbb1a691807bb0420a27126e15934cb7954 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Dec 2016 11:44:28 +0100
Subject: sched/clock, clocksource: Add optional cs::mark_unstable() method

PeterZ reported that we'd fail to mark the TSC unstable when the
clocksource watchdog finds it unsuitable.

Allow a clocksource to run a custom action when its being marked
unstable and hook up the TSC unstable code.

Reported-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/tsc.c       | 11 +++++++++++
 include/linux/clocksource.h |  3 +++
 kernel/time/clocksource.c   |  4 ++++
 3 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index be3a49ee0356..c8174c815d83 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1106,6 +1106,16 @@ static u64 read_tsc(struct clocksource *cs)
 	return (u64)rdtsc_ordered();
 }
 
+static void tsc_cs_mark_unstable(struct clocksource *cs)
+{
+	if (tsc_unstable)
+		return;
+	tsc_unstable = 1;
+	clear_sched_clock_stable();
+	disable_sched_clock_irqtime();
+	pr_info("Marking TSC unstable due to clocksource watchdog\n");
+}
+
 /*
  * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
  */
@@ -1118,6 +1128,7 @@ static struct clocksource clocksource_tsc = {
 				  CLOCK_SOURCE_MUST_VERIFY,
 	.archdata               = { .vclock_mode = VCLOCK_TSC },
 	.resume			= tsc_resume,
+	.mark_unstable		= tsc_cs_mark_unstable,
 };
 
 void mark_tsc_unstable(char *reason)
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index e315d04a2fd9..cfc75848a35d 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -62,6 +62,8 @@ struct module;
  * @archdata:		arch-specific data
  * @suspend:		suspend function for the clocksource, if necessary
  * @resume:		resume function for the clocksource, if necessary
+ * @mark_unstable:	Optional function to inform the clocksource driver that
+ *			the watchdog marked the clocksource unstable
  * @owner:		module reference, must be set by clocksource in modules
  *
  * Note: This struct is not used in hotpathes of the timekeeping code
@@ -93,6 +95,7 @@ struct clocksource {
 	unsigned long flags;
 	void (*suspend)(struct clocksource *cs);
 	void (*resume)(struct clocksource *cs);
+	void (*mark_unstable)(struct clocksource *cs);
 
 	/* private: */
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 665985b0a89a..93621ae718d3 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -141,6 +141,10 @@ static void __clocksource_unstable(struct clocksource *cs)
 {
 	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
 	cs->flags |= CLOCK_SOURCE_UNSTABLE;
+
+	if (cs->mark_unstable)
+		cs->mark_unstable(cs);
+
 	if (finished_booting)
 		schedule_work(&watchdog_work);
 }
-- 
cgit v1.2.3


From 9881b024b7d7671f6a014091bc96506b89081802 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 15 Dec 2016 13:35:52 +0100
Subject: sched/clock: Delay switching sched_clock to stable

Currently we switch to the stable sched_clock if we guess the TSC is
usable, and then switch back to the unstable path if it turns out TSC
isn't stable during SMP bringup after all.

Delay switching to the stable path until after SMP bringup is
complete. This way we'll avoid switching during the time we detect the
worst of the TSC offences.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  5 +++++
 init/main.c           |  1 -
 kernel/sched/clock.c  | 50 ++++++++++++++++++++++----------------------------
 kernel/sched/core.c   |  4 ++++
 4 files changed, 31 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad3ec9ec61f7..94a48bb58297 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2515,6 +2515,10 @@ extern u64 sched_clock_cpu(int cpu);
 extern void sched_clock_init(void);
 
 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline void sched_clock_init_late(void)
+{
+}
+
 static inline void sched_clock_tick(void)
 {
 }
@@ -2537,6 +2541,7 @@ static inline u64 local_clock(void)
 	return sched_clock();
 }
 #else
+extern void sched_clock_init_late(void);
 /*
  * Architectures can set this to 1 if they have specified
  * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
diff --git a/init/main.c b/init/main.c
index b0c9d6facef9..19228149386c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -625,7 +625,6 @@ asmlinkage __visible void __init start_kernel(void)
 	numa_policy_init();
 	if (late_time_init)
 		late_time_init();
-	sched_clock_init();
 	calibrate_delay();
 	pidmap_init();
 	anon_vma_init();
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 5d6dd38b449c..b3466d4e0cc2 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -77,6 +77,11 @@ EXPORT_SYMBOL_GPL(sched_clock);
 
 __read_mostly int sched_clock_running;
 
+void sched_clock_init(void)
+{
+	sched_clock_running = 1;
+}
+
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
 static int __sched_clock_stable_early;
@@ -96,12 +101,18 @@ void set_sched_clock_stable(void)
 {
 	__sched_clock_stable_early = 1;
 
-	smp_mb(); /* matches sched_clock_init() */
-
-	if (!sched_clock_running)
-		return;
+	smp_mb(); /* matches sched_clock_init_late() */
 
-	__set_sched_clock_stable();
+	/*
+	 * This really should only be called early (before
+	 * sched_clock_init_late()) when guestimating our sched_clock() is
+	 * solid.
+	 *
+	 * After that we test stability and we can negate our guess using
+	 * clear_sched_clock_stable, possibly from a watchdog.
+	 */
+	if (WARN_ON_ONCE(sched_clock_running == 2))
+		__set_sched_clock_stable();
 }
 
 static void __clear_sched_clock_stable(struct work_struct *work)
@@ -117,12 +128,10 @@ void clear_sched_clock_stable(void)
 {
 	__sched_clock_stable_early = 0;
 
-	smp_mb(); /* matches sched_clock_init() */
-
-	if (!sched_clock_running)
-		return;
+	smp_mb(); /* matches sched_clock_init_late() */
 
-	schedule_work(&sched_clock_work);
+	if (sched_clock_running == 2)
+		schedule_work(&sched_clock_work);
 }
 
 struct sched_clock_data {
@@ -143,20 +152,9 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
 	return &per_cpu(sched_clock_data, cpu);
 }
 
-void sched_clock_init(void)
+void sched_clock_init_late(void)
 {
-	u64 ktime_now = ktime_to_ns(ktime_get());
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct sched_clock_data *scd = cpu_sdc(cpu);
-
-		scd->tick_raw = 0;
-		scd->tick_gtod = ktime_now;
-		scd->clock = ktime_now;
-	}
-
-	sched_clock_running = 1;
+	sched_clock_running = 2;
 
 	/*
 	 * Ensure that it is impossible to not do a static_key update.
@@ -362,11 +360,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
-void sched_clock_init(void)
-{
-	sched_clock_running = 1;
-}
-
 u64 sched_clock_cpu(int cpu)
 {
 	if (unlikely(!sched_clock_running))
@@ -374,6 +367,7 @@ u64 sched_clock_cpu(int cpu)
 
 	return sched_clock();
 }
+
 #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a129b34b8206..96a4267e6020 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7498,6 +7498,7 @@ void __init sched_init_smp(void)
 	init_sched_dl_class();
 
 	sched_init_smt();
+	sched_clock_init_late();
 
 	sched_smp_initialized = true;
 }
@@ -7513,6 +7514,7 @@ early_initcall(migration_init);
 void __init sched_init_smp(void)
 {
 	sched_init_granularity();
+	sched_clock_init_late();
 }
 #endif /* CONFIG_SMP */
 
@@ -7556,6 +7558,8 @@ void __init sched_init(void)
 	int i, j;
 	unsigned long alloc_size = 0, ptr;
 
+	sched_clock_init();
+
 	for (i = 0; i < WAIT_TABLE_SIZE; i++)
 		init_waitqueue_head(bit_wait_table + i);
 
-- 
cgit v1.2.3


From 10ab56434f2f633a51e432ee8b7c29e12438e163 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Oct 2016 12:58:10 -0400
Subject: sched/core: Separate out io_schedule_prepare() and
 io_schedule_finish()

Now that IO schedule accounting is done inside __schedule(),
io_schedule() can be split into three steps - prep, schedule, and
finish - where the schedule part doesn't need any special annotation.
This allows marking a sleep as iowait by simply wrapping an existing
blocking function with io_schedule_prepare() and io_schedule_finish().

Because task_struct->in_iowait is single bit, the caller of
io_schedule_prepare() needs to record and the pass its state to
io_schedule_finish() to be safe regarding nesting.  While this isn't
the prettiest, these functions are mostly gonna be used by core
functions and we don't want to use more space for ->in_iowait.

While at it, as it's simple to do now, reimplement io_schedule()
without unnecessarily going through io_schedule_timeout().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adilger.kernel@dilger.ca
Cc: jack@suse.com
Cc: kernel-team@fb.com
Cc: mingbo@fb.com
Cc: tytso@mit.edu
Link: http://lkml.kernel.org/r/1477673892-28940-3-git-send-email-tj@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  8 +++-----
 kernel/sched/core.c   | 33 ++++++++++++++++++++++++++++-----
 2 files changed, 31 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 94a48bb58297..a8daed914eef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -461,12 +461,10 @@ extern signed long schedule_timeout_idle(signed long timeout);
 asmlinkage void schedule(void);
 extern void schedule_preempt_disabled(void);
 
+extern int __must_check io_schedule_prepare(void);
+extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
-
-static inline void io_schedule(void)
-{
-	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
-}
+extern void io_schedule(void);
 
 void __noreturn do_task_dead(void);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9fd37169b302..49ce1cb3d320 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5128,25 +5128,48 @@ out_irq:
 }
 EXPORT_SYMBOL_GPL(yield_to);
 
+int io_schedule_prepare(void)
+{
+	int old_iowait = current->in_iowait;
+
+	current->in_iowait = 1;
+	blk_schedule_flush_plug(current);
+
+	return old_iowait;
+}
+
+void io_schedule_finish(int token)
+{
+	current->in_iowait = token;
+}
+
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
  */
 long __sched io_schedule_timeout(long timeout)
 {
-	int old_iowait = current->in_iowait;
+	int token;
 	long ret;
 
-	current->in_iowait = 1;
-	blk_schedule_flush_plug(current);
-
+	token = io_schedule_prepare();
 	ret = schedule_timeout(timeout);
-	current->in_iowait = old_iowait;
+	io_schedule_finish(token);
 
 	return ret;
 }
 EXPORT_SYMBOL(io_schedule_timeout);
 
+void io_schedule(void)
+{
+	int token;
+
+	token = io_schedule_prepare();
+	schedule();
+	io_schedule_finish(token);
+}
+EXPORT_SYMBOL(io_schedule);
+
 /**
  * sys_sched_get_priority_max - return maximum RT priority.
  * @policy: scheduling class.
-- 
cgit v1.2.3


From 1460cb65a10f6c7a6e3a1c76513338861a0a43b6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Oct 2016 12:58:11 -0400
Subject: locking/mutex, sched/wait: Add mutex_lock_io()

We sometimes end up propagating IO blocking through mutexes; however,
because there currently is no way of annotating mutex sleeps as
iowait, there are cases where iowait and /proc/stat:procs_blocked
report misleading numbers obscuring the actual state of the system.

This patch adds mutex_lock_io() so that mutex sleeps can be marked as
iowait in those cases.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adilger.kernel@dilger.ca
Cc: jack@suse.com
Cc: kernel-team@fb.com
Cc: mingbo@fb.com
Cc: tytso@mit.edu
Link: http://lkml.kernel.org/r/1477673892-28940-4-git-send-email-tj@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h  |  4 ++++
 kernel/locking/mutex.c | 24 ++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index b97870f2debd..980ba16b8749 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -156,10 +156,12 @@ extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
 					unsigned int subclass);
 extern int __must_check mutex_lock_killable_nested(struct mutex *lock,
 					unsigned int subclass);
+extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass);
 
 #define mutex_lock(lock) mutex_lock_nested(lock, 0)
 #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0)
 #define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0)
+#define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0)
 
 #define mutex_lock_nest_lock(lock, nest_lock)				\
 do {									\
@@ -171,11 +173,13 @@ do {									\
 extern void mutex_lock(struct mutex *lock);
 extern int __must_check mutex_lock_interruptible(struct mutex *lock);
 extern int __must_check mutex_lock_killable(struct mutex *lock);
+extern void mutex_lock_io(struct mutex *lock);
 
 # define mutex_lock_nested(lock, subclass) mutex_lock(lock)
 # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
 # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock)
 # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
+# define mutex_lock_nest_io(lock, nest_lock) mutex_io(lock)
 #endif
 
 /*
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 9b349619f431..8464a5cbab97 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -783,6 +783,20 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
 }
 EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
 
+void __sched
+mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+	int token;
+
+	might_sleep();
+
+	token = io_schedule_prepare();
+	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+			    subclass, NULL, _RET_IP_, NULL, 0);
+	io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
 static inline int
 ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
@@ -950,6 +964,16 @@ int __sched mutex_lock_killable(struct mutex *lock)
 }
 EXPORT_SYMBOL(mutex_lock_killable);
 
+void __sched mutex_lock_io(struct mutex *lock)
+{
+	int token;
+
+	token = io_schedule_prepare();
+	mutex_lock(lock);
+	io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io);
+
 static noinline void __sched
 __mutex_lock_slowpath(struct mutex *lock)
 {
-- 
cgit v1.2.3


From 9e3d6223d2093a8903c8f570a06284453ee59944 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Dec 2016 09:30:11 +0100
Subject: math64, timers: Fix 32bit mul_u64_u32_shr() and friends

It turns out that while GCC-4.4 manages to generate 32x32->64 mult
instructions for the 32bit mul_u64_u32_shr() code, any GCC after that
fails horribly.

Fix this by providing an explicit mul_u32_u32() function which can be
architcture provided.

Reported-by: Chris Metcalf <cmetcalf@mellanox.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Chris Metcalf <cmetcalf@mellanox.com> [for tile]
Cc: Christopher S. Hall <christopher.s.hall@intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Laurent Vivier <lvivier@redhat.com>
Cc: Liav Rehana <liavr@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Parit Bhargava <prarit@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20161209083011.GD15765@worktop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/tile/include/asm/Kbuild  |  1 -
 arch/tile/include/asm/div64.h | 14 ++++++++++++++
 arch/x86/include/asm/div64.h  | 11 +++++++++++
 include/linux/math64.h        | 26 ++++++++++++++++++--------
 4 files changed, 43 insertions(+), 9 deletions(-)
 create mode 100644 arch/tile/include/asm/div64.h

(limited to 'include')

diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 2d1f5638974c..20f2ba6d79be 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -5,7 +5,6 @@ generic-y += bug.h
 generic-y += bugs.h
 generic-y += clkdev.h
 generic-y += cputime.h
-generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
diff --git a/arch/tile/include/asm/div64.h b/arch/tile/include/asm/div64.h
new file mode 100644
index 000000000000..bf6161966dfa
--- /dev/null
+++ b/arch/tile/include/asm/div64.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_TILE_DIV64_H
+#define _ASM_TILE_DIV64_H
+
+#ifdef __tilegx__
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	return __insn_mul_lu_lu(a, b);
+}
+#define mul_u32_u32 mul_u32_u32
+#endif
+
+#include <asm-generic/div64.h>
+
+#endif /* _ASM_TILE_DIV64_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index ced283ac79df..af95c47d5c9e 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -59,6 +59,17 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
 }
 #define div_u64_rem	div_u64_rem
 
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	u32 high, low;
+
+	asm ("mull %[b]" : "=a" (low), "=d" (high)
+			 : [a] "a" (a), [b] "rm" (b) );
+
+	return low | ((u64)high) << 32;
+}
+#define mul_u32_u32 mul_u32_u32
+
 #else
 # include <asm-generic/div64.h>
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 6e8b5b270ffe..80690c96c734 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -133,6 +133,16 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
 	return ret;
 }
 
+#ifndef mul_u32_u32
+/*
+ * Many a GCC version messes this up and generates a 64x64 mult :-(
+ */
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	return (u64)a * b;
+}
+#endif
+
 #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
 
 #ifndef mul_u64_u32_shr
@@ -160,9 +170,9 @@ static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
 	al = a;
 	ah = a >> 32;
 
-	ret = ((u64)al * mul) >> shift;
+	ret = mul_u32_u32(al, mul) >> shift;
 	if (ah)
-		ret += ((u64)ah * mul) << (32 - shift);
+		ret += mul_u32_u32(ah, mul) << (32 - shift);
 
 	return ret;
 }
@@ -186,10 +196,10 @@ static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift)
 	a0.ll = a;
 	b0.ll = b;
 
-	rl.ll = (u64)a0.l.low * b0.l.low;
-	rm.ll = (u64)a0.l.low * b0.l.high;
-	rn.ll = (u64)a0.l.high * b0.l.low;
-	rh.ll = (u64)a0.l.high * b0.l.high;
+	rl.ll = mul_u32_u32(a0.l.low, b0.l.low);
+	rm.ll = mul_u32_u32(a0.l.low, b0.l.high);
+	rn.ll = mul_u32_u32(a0.l.high, b0.l.low);
+	rh.ll = mul_u32_u32(a0.l.high, b0.l.high);
 
 	/*
 	 * Each of these lines computes a 64-bit intermediate result into "c",
@@ -229,8 +239,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
 	} u, rl, rh;
 
 	u.ll = a;
-	rl.ll = (u64)u.l.low * mul;
-	rh.ll = (u64)u.l.high * mul + rl.l.high;
+	rl.ll = mul_u32_u32(u.l.low, mul);
+	rh.ll = mul_u32_u32(u.l.high, mul) + rl.l.high;
 
 	/* Bits 32-63 of the result will be in rh.l.low. */
 	rl.l.high = do_div(rh.ll, divisor);
-- 
cgit v1.2.3


From af2e859edd477fa1ea3d1d106f41a595cff3d162 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 1 Dec 2016 11:47:04 +0000
Subject: locking/ww_mutex: Fix compilation of __WW_MUTEX_INITIALIZER

From conflicting macro parameters, passing the wrong name to
__MUTEX_INITIALIZER and a stray '\', #define __WW_MUTEX_INITIALIZER was
very unhappy.

One unnecessary change was to choose to pass &ww_class instead of
implicitly taking the address of the class within the macro.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 1b375dc30710 ("mutex: Move ww_mutex definitions to ww_mutex.h")
Link: http://lkml.kernel.org/r/20161201114711.28697-2-chris@chris-wilson.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ww_mutex.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index c07528522bbc..083950fd5b0b 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -51,10 +51,10 @@ struct ww_mutex {
 };
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class) \
-		, .ww_class = &ww_class
+# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class) \
+		, .ww_class = class
 #else
-# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class)
+# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class)
 #endif
 
 #define __WW_CLASS_INITIALIZER(ww_class) \
@@ -63,7 +63,7 @@ struct ww_mutex {
 		, .mutex_name = #ww_class "_mutex" }
 
 #define __WW_MUTEX_INITIALIZER(lockname, class) \
-		{ .base = { \__MUTEX_INITIALIZER(lockname) } \
+		{ .base =  __MUTEX_INITIALIZER(lockname.base) \
 		__WW_CLASS_MUTEX_INITIALIZER(lockname, class) }
 
 #define DEFINE_WW_CLASS(classname) \
-- 
cgit v1.2.3


From 1e24edca0557dba6486d39d3c24c288475432bcf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 14 Nov 2016 17:12:23 +0100
Subject: locking/atomic, kref: Add KREF_INIT()

Since we need to change the implementation, stop exposing internals.

Provide KREF_INIT() to allow static initialization of struct kref.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/block/drbd/drbd_bitmap.c | 2 +-
 fs/fuse/fuse_i.h                 | 2 +-
 include/linux/kref.h             | 2 ++
 init/version.c                   | 4 +---
 kernel/pid.c                     | 4 +---
 5 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index ab62b81c2ca7..dece26f119d4 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -1070,7 +1070,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
 		.done = 0,
 		.flags = flags,
 		.error = 0,
-		.kref = { ATOMIC_INIT(2) },
+		.kref = KREF_INIT(2),
 	};
 
 	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in drbd_bm_aio_ctx_destroy() */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 91307940c8ac..052f8d3c41cb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -256,7 +256,7 @@ struct fuse_io_priv {
 
 #define FUSE_IO_PRIV_SYNC(f) \
 {					\
-	.refcnt = { ATOMIC_INIT(1) },	\
+	.refcnt = KREF_INIT(1),		\
 	.async = 0,			\
 	.file = f,			\
 }
diff --git a/include/linux/kref.h b/include/linux/kref.h
index e15828fd71f1..9af255ad1e2f 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -24,6 +24,8 @@ struct kref {
 	atomic_t refcount;
 };
 
+#define KREF_INIT(n)	{ .refcount = ATOMIC_INIT(n), }
+
 /**
  * kref_init - initialize object.
  * @kref: object in question.
diff --git a/init/version.c b/init/version.c
index fe41a63efed6..5606341e9efd 100644
--- a/init/version.c
+++ b/init/version.c
@@ -23,9 +23,7 @@ int version_string(LINUX_VERSION_CODE);
 #endif
 
 struct uts_namespace init_uts_ns = {
-	.kref = {
-		.refcount	= ATOMIC_INIT(2),
-	},
+	.kref = KREF_INIT(2),
 	.name = {
 		.sysname	= UTS_SYSNAME,
 		.nodename	= UTS_NODENAME,
diff --git a/kernel/pid.c b/kernel/pid.c
index f66162f2359b..0291804151b5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -68,9 +68,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
  * the scheme scales to up to 4 million PIDs, runtime.
  */
 struct pid_namespace init_pid_ns = {
-	.kref = {
-		.refcount       = ATOMIC_INIT(2),
-	},
+	.kref = KREF_INIT(2),
 	.pidmap = {
 		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
 	},
-- 
cgit v1.2.3


From 2c935bc57221cc2edc787c72ea0e2d30cdcd3d5e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 14 Nov 2016 17:29:48 +0100
Subject: locking/atomic, kref: Add kref_read()

Since we need to change the implementation, stop exposing internals.

Provide kref_read() to read the current reference count; typically
used for debug messages.

Kills two anti-patterns:

	atomic_read(&kref->refcount)
	kref->refcount.counter

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/block/drbd/drbd_req.c                |  2 +-
 drivers/block/rbd.c                          |  8 +++---
 drivers/block/virtio_blk.c                   |  2 +-
 drivers/gpu/drm/drm_gem_cma_helper.c         |  2 +-
 drivers/gpu/drm/drm_info.c                   |  2 +-
 drivers/gpu/drm/drm_mode_object.c            |  4 +--
 drivers/gpu/drm/etnaviv/etnaviv_gem.c        |  2 +-
 drivers/gpu/drm/i915/i915_gem_object.h       |  2 +-
 drivers/gpu/drm/msm/msm_gem.c                |  2 +-
 drivers/gpu/drm/nouveau/nouveau_fence.c      |  2 +-
 drivers/gpu/drm/omapdrm/omap_gem.c           |  2 +-
 drivers/gpu/drm/ttm/ttm_bo.c                 |  4 +--
 drivers/gpu/drm/ttm/ttm_object.c             |  2 +-
 drivers/infiniband/hw/cxgb3/iwch_cm.h        |  6 ++---
 drivers/infiniband/hw/cxgb3/iwch_qp.c        |  2 +-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h       |  6 ++---
 drivers/infiniband/hw/cxgb4/qp.c             |  2 +-
 drivers/infiniband/hw/usnic/usnic_ib_sysfs.c |  6 ++---
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c |  4 +--
 drivers/misc/genwqe/card_dev.c               |  2 +-
 drivers/misc/mei/debugfs.c                   |  2 +-
 drivers/pci/hotplug/pnv_php.c                |  2 +-
 drivers/pci/slot.c                           |  2 +-
 drivers/scsi/bnx2fc/bnx2fc_io.c              |  8 +++---
 drivers/scsi/cxgbi/libcxgbi.h                |  4 +--
 drivers/scsi/lpfc/lpfc_debugfs.c             |  2 +-
 drivers/scsi/lpfc/lpfc_els.c                 |  2 +-
 drivers/scsi/lpfc/lpfc_hbadisc.c             | 40 ++++++++++++++--------------
 drivers/scsi/lpfc/lpfc_init.c                |  3 +--
 drivers/scsi/qla2xxx/tcm_qla2xxx.c           |  4 +--
 drivers/staging/android/ion/ion.c            |  2 +-
 drivers/staging/comedi/comedi_buf.c          |  2 +-
 drivers/target/target_core_pr.c              | 10 +++----
 drivers/target/tcm_fc/tfc_sess.c             |  2 +-
 drivers/usb/gadget/function/f_fs.c           |  2 +-
 fs/exofs/sys.c                               |  2 +-
 fs/ocfs2/cluster/netdebug.c                  |  2 +-
 fs/ocfs2/cluster/tcp.c                       |  2 +-
 fs/ocfs2/dlm/dlmdebug.c                      | 12 ++++-----
 fs/ocfs2/dlm/dlmdomain.c                     |  2 +-
 fs/ocfs2/dlm/dlmmaster.c                     |  8 +++---
 fs/ocfs2/dlm/dlmunlock.c                     |  2 +-
 include/drm/drm_framebuffer.h                |  2 +-
 include/drm/ttm/ttm_bo_driver.h              |  4 +--
 include/linux/kref.h                         |  5 ++++
 include/linux/sunrpc/cache.h                 |  2 +-
 include/net/bluetooth/hci_core.h             |  4 +--
 net/bluetooth/6lowpan.c                      |  2 +-
 net/bluetooth/a2mp.c                         |  4 +--
 net/bluetooth/amp.c                          |  4 +--
 net/bluetooth/l2cap_core.c                   |  4 +--
 net/ceph/messenger.c                         |  4 +--
 net/ceph/osd_client.c                        | 10 +++----
 net/sunrpc/cache.c                           |  2 +-
 net/sunrpc/svc_xprt.c                        |  6 ++---
 net/sunrpc/xprtrdma/svc_rdma_transport.c     |  4 +--
 56 files changed, 121 insertions(+), 117 deletions(-)

(limited to 'include')

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index de279fe4e4fd..74306c054983 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -520,7 +520,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		/* Completion does it's own kref_put.  If we are going to
 		 * kref_sub below, we need req to be still around then. */
 		int at_least = k_put + !!c_put;
-		int refcount = atomic_read(&req->kref.refcount);
+		int refcount = kref_read(&req->kref);
 		if (refcount < at_least)
 			drbd_err(device,
 				"mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 36d2b9f4e836..436baa66f701 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1535,7 +1535,7 @@ static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
 {
 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
-		atomic_read(&obj_request->kref.refcount));
+		kref_read(&obj_request->kref));
 	kref_get(&obj_request->kref);
 }
 
@@ -1544,14 +1544,14 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
 {
 	rbd_assert(obj_request != NULL);
 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
-		atomic_read(&obj_request->kref.refcount));
+		kref_read(&obj_request->kref));
 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
 }
 
 static void rbd_img_request_get(struct rbd_img_request *img_request)
 {
 	dout("%s: img %p (was %d)\n", __func__, img_request,
-	     atomic_read(&img_request->kref.refcount));
+	     kref_read(&img_request->kref));
 	kref_get(&img_request->kref);
 }
 
@@ -1562,7 +1562,7 @@ static void rbd_img_request_put(struct rbd_img_request *img_request)
 {
 	rbd_assert(img_request != NULL);
 	dout("%s: img %p (was %d)\n", __func__, img_request,
-		atomic_read(&img_request->kref.refcount));
+		kref_read(&img_request->kref));
 	if (img_request_child_test(img_request))
 		kref_put(&img_request->kref, rbd_parent_request_destroy);
 	else
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 5545a679abd8..79a346c97446 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -767,7 +767,7 @@ static void virtblk_remove(struct virtio_device *vdev)
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
 
-	refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
+	refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
 	put_disk(vblk->disk);
 	vdev->config->del_vqs(vdev);
 	kfree(vblk->vqs);
diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c
index 1d6c335584ec..33cd51632721 100644
--- a/drivers/gpu/drm/drm_gem_cma_helper.c
+++ b/drivers/gpu/drm/drm_gem_cma_helper.c
@@ -376,7 +376,7 @@ void drm_gem_cma_describe(struct drm_gem_cma_object *cma_obj,
 	off = drm_vma_node_start(&obj->vma_node);
 
 	seq_printf(m, "%2d (%2d) %08llx %pad %p %zu",
-			obj->name, obj->refcount.refcount.counter,
+			obj->name, kref_read(&obj->refcount),
 			off, &cma_obj->paddr, cma_obj->vaddr, obj->size);
 
 	seq_printf(m, "\n");
diff --git a/drivers/gpu/drm/drm_info.c b/drivers/gpu/drm/drm_info.c
index ffb2ab389d1d..6b68e9088436 100644
--- a/drivers/gpu/drm/drm_info.c
+++ b/drivers/gpu/drm/drm_info.c
@@ -118,7 +118,7 @@ static int drm_gem_one_name_info(int id, void *ptr, void *data)
 	seq_printf(m, "%6d %8zd %7d %8d\n",
 		   obj->name, obj->size,
 		   obj->handle_count,
-		   atomic_read(&obj->refcount.refcount));
+		   kref_read(&obj->refcount));
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/drm_mode_object.c b/drivers/gpu/drm/drm_mode_object.c
index 9f17085b1fdd..c6885a4911c0 100644
--- a/drivers/gpu/drm/drm_mode_object.c
+++ b/drivers/gpu/drm/drm_mode_object.c
@@ -159,7 +159,7 @@ EXPORT_SYMBOL(drm_mode_object_find);
 void drm_mode_object_unreference(struct drm_mode_object *obj)
 {
 	if (obj->free_cb) {
-		DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, atomic_read(&obj->refcount.refcount));
+		DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, kref_read(&obj->refcount));
 		kref_put(&obj->refcount, obj->free_cb);
 	}
 }
@@ -176,7 +176,7 @@ EXPORT_SYMBOL(drm_mode_object_unreference);
 void drm_mode_object_reference(struct drm_mode_object *obj)
 {
 	if (obj->free_cb) {
-		DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, atomic_read(&obj->refcount.refcount));
+		DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, kref_read(&obj->refcount));
 		kref_get(&obj->refcount);
 	}
 }
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index 114dddbd297b..aa6e35ddc87f 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -486,7 +486,7 @@ static void etnaviv_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 
 	seq_printf(m, "%08x: %c %2d (%2d) %08lx %p %zd\n",
 			etnaviv_obj->flags, is_active(etnaviv_obj) ? 'A' : 'I',
-			obj->name, obj->refcount.refcount.counter,
+			obj->name, kref_read(&obj->refcount),
 			off, etnaviv_obj->vaddr, obj->size);
 
 	rcu_read_lock();
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index 6a368de9d81e..ecfefb9d42e4 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -256,7 +256,7 @@ extern void drm_gem_object_unreference_unlocked(struct drm_gem_object *);
 static inline bool
 i915_gem_object_is_dead(const struct drm_i915_gem_object *obj)
 {
-	return atomic_read(&obj->base.refcount.refcount) == 0;
+	return kref_read(&obj->base.refcount) == 0;
 }
 
 static inline bool
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index d8bc59c7e261..4d24d9389036 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -640,7 +640,7 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 
 	seq_printf(m, "%08x: %c %2d (%2d) %08llx %p\t",
 			msm_obj->flags, is_active(msm_obj) ? 'A' : 'I',
-			obj->name, obj->refcount.refcount.counter,
+			obj->name, kref_read(&obj->refcount),
 			off, msm_obj->vaddr);
 
 	for (id = 0; id < priv->num_aspaces; id++)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index a6126c93f215..88ee60d1b907 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -527,7 +527,7 @@ static bool nouveau_fence_no_signaling(struct dma_fence *f)
 	 * caller should have a reference on the fence,
 	 * else fence could get freed here
 	 */
-	WARN_ON(atomic_read(&fence->base.refcount.refcount) <= 1);
+	WARN_ON(kref_read(&fence->base.refcount) <= 1);
 
 	/*
 	 * This needs uevents to work correctly, but dma_fence_add_callback relies on
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 4a90c690f09e..74a9968df421 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -1033,7 +1033,7 @@ void omap_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 	off = drm_vma_node_start(&obj->vma_node);
 
 	seq_printf(m, "%08x: %2d (%2d) %08llx %pad (%2d) %p %4d",
-			omap_obj->flags, obj->name, obj->refcount.refcount.counter,
+			omap_obj->flags, obj->name, kref_read(&obj->refcount),
 			off, &omap_obj->paddr, omap_obj->paddr_cnt,
 			omap_obj->vaddr, omap_obj->roll);
 
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index d5063618efa7..30aefcc0c969 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -140,8 +140,8 @@ static void ttm_bo_release_list(struct kref *list_kref)
 	struct ttm_bo_device *bdev = bo->bdev;
 	size_t acc_size = bo->acc_size;
 
-	BUG_ON(atomic_read(&bo->list_kref.refcount));
-	BUG_ON(atomic_read(&bo->kref.refcount));
+	BUG_ON(kref_read(&bo->list_kref));
+	BUG_ON(kref_read(&bo->kref));
 	BUG_ON(atomic_read(&bo->cpu_writers));
 	BUG_ON(bo->mem.mm_node != NULL);
 	BUG_ON(!list_empty(&bo->lru));
diff --git a/drivers/gpu/drm/ttm/ttm_object.c b/drivers/gpu/drm/ttm/ttm_object.c
index 4f5fa8d65fe9..fdb451e3ec01 100644
--- a/drivers/gpu/drm/ttm/ttm_object.c
+++ b/drivers/gpu/drm/ttm/ttm_object.c
@@ -304,7 +304,7 @@ bool ttm_ref_object_exists(struct ttm_object_file *tfile,
 	 * Verify that the ref->obj pointer was actually valid!
 	 */
 	rmb();
-	if (unlikely(atomic_read(&ref->kref.refcount) == 0))
+	if (unlikely(kref_read(&ref->kref) == 0))
 		goto out_false;
 
 	rcu_read_unlock();
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h
index b9efadfffb4f..e66e75921797 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.h
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h
@@ -55,14 +55,14 @@
 
 #define put_ep(ep) { \
 	PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__,  \
-	     ep, atomic_read(&((ep)->kref.refcount))); \
-	WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
+	     ep, kref_read(&((ep)->kref))); \
+	WARN_ON(kref_read(&((ep)->kref)) < 1); \
 	kref_put(&((ep)->kref), __free_ep); \
 }
 
 #define get_ep(ep) { \
 	PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
-	     ep, atomic_read(&((ep)->kref.refcount))); \
+	     ep, kref_read(&((ep)->kref))); \
 	kref_get(&((ep)->kref));  \
 }
 
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
index d939980a708f..a9194db7f9b8 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -961,7 +961,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp,
 	case IWCH_QP_STATE_RTS:
 		switch (attrs->next_state) {
 		case IWCH_QP_STATE_CLOSING:
-			BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+			BUG_ON(kref_read(&qhp->ep->com.kref) < 2);
 			qhp->attr.state = IWCH_QP_STATE_CLOSING;
 			if (!internal) {
 				abort=0;
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 4788e1a46fde..4dc141596e57 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -654,14 +654,14 @@ enum c4iw_mmid_state {
 
 #define c4iw_put_ep(ep) { \
 	PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__,  \
-	     ep, atomic_read(&((ep)->kref.refcount))); \
-	WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
+	     ep, kref_read(&((ep)->kref))); \
+	WARN_ON(kref_read(&((ep)->kref)) < 1); \
 	kref_put(&((ep)->kref), _c4iw_free_ep); \
 }
 
 #define c4iw_get_ep(ep) { \
 	PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
-	     ep, atomic_read(&((ep)->kref.refcount))); \
+	     ep, kref_read(&((ep)->kref))); \
 	kref_get(&((ep)->kref));  \
 }
 void _c4iw_free_ep(struct kref *kref);
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index cda5542e13a2..347b3c93ffd7 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -1503,7 +1503,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
 	case C4IW_QP_STATE_RTS:
 		switch (attrs->next_state) {
 		case C4IW_QP_STATE_CLOSING:
-			BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+			BUG_ON(kref_read(&qhp->ep->com.kref) < 2);
 			t4_set_wq_in_error(&qhp->wq);
 			set_state(qhp, C4IW_QP_STATE_CLOSING);
 			ep = qhp->ep;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
index 80ef3f8998c8..04443242e258 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
@@ -80,7 +80,7 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr,
 	left = PAGE_SIZE;
 
 	mutex_lock(&us_ibdev->usdev_lock);
-	if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) {
+	if (kref_read(&us_ibdev->vf_cnt) > 0) {
 		char *busname;
 
 		/*
@@ -99,7 +99,7 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr,
 			PCI_FUNC(us_ibdev->pdev->devfn),
 			netdev_name(us_ibdev->netdev),
 			us_ibdev->ufdev->mac,
-			atomic_read(&us_ibdev->vf_cnt.refcount));
+			kref_read(&us_ibdev->vf_cnt));
 		UPDATE_PTR_LEFT(n, ptr, left);
 
 		for (res_type = USNIC_VNIC_RES_TYPE_EOL;
@@ -147,7 +147,7 @@ usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr,
 	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
 
 	return scnprintf(buf, PAGE_SIZE, "%u\n",
-			atomic_read(&us_ibdev->vf_cnt.refcount));
+			kref_read(&us_ibdev->vf_cnt));
 }
 
 static ssize_t
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index 74819a7951e2..69df8e353123 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -291,11 +291,11 @@ int usnic_ib_query_device(struct ib_device *ibdev,
 	qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
 			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
 	props->max_qp = qp_per_vf *
-		atomic_read(&us_ibdev->vf_cnt.refcount);
+		kref_read(&us_ibdev->vf_cnt);
 	props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT |
 		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
 	props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] *
-		atomic_read(&us_ibdev->vf_cnt.refcount);
+		kref_read(&us_ibdev->vf_cnt);
 	props->max_pd = USNIC_UIOM_MAX_PD_CNT;
 	props->max_mr = USNIC_UIOM_MAX_MR_CNT;
 	props->local_ca_ack_delay = 0;
diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c
index 7f1b282d7d96..cb290b8ca0c8 100644
--- a/drivers/misc/genwqe/card_dev.c
+++ b/drivers/misc/genwqe/card_dev.c
@@ -1396,7 +1396,7 @@ int genwqe_device_remove(struct genwqe_dev *cd)
 	 * application which will decrease this reference from
 	 * 1/unused to 0/illegal and not from 2/used 1/empty.
 	 */
-	rc = atomic_read(&cd->cdev_genwqe.kobj.kref.refcount);
+	rc = kref_read(&cd->cdev_genwqe.kobj.kref);
 	if (rc != 1) {
 		dev_err(&pci_dev->dev,
 			"[%s] err: cdev_genwqe...refcount=%d\n", __func__, rc);
diff --git a/drivers/misc/mei/debugfs.c b/drivers/misc/mei/debugfs.c
index c6c051b52f55..e32a92341b54 100644
--- a/drivers/misc/mei/debugfs.c
+++ b/drivers/misc/mei/debugfs.c
@@ -67,7 +67,7 @@ static ssize_t mei_dbgfs_read_meclients(struct file *fp, char __user *ubuf,
 				me_cl->props.max_number_of_connections,
 				me_cl->props.max_msg_length,
 				me_cl->props.single_recv_buf,
-				atomic_read(&me_cl->refcnt.refcount));
+				kref_read(&me_cl->refcnt));
 
 			mei_me_cl_put(me_cl);
 		}
diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index 56efaf72d08e..d2961ef39a3a 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -155,7 +155,7 @@ static void pnv_php_detach_device_nodes(struct device_node *parent)
 		pnv_php_detach_device_nodes(dn);
 
 		of_node_put(dn);
-		refcount = atomic_read(&dn->kobj.kref.refcount);
+		refcount = kref_read(&dn->kobj.kref);
 		if (refcount != 1)
 			pr_warn("Invalid refcount %d on <%s>\n",
 				refcount, of_node_full_name(dn));
diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index 429d34c348b9..e42909524dee 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(pci_create_slot);
 void pci_destroy_slot(struct pci_slot *slot)
 {
 	dev_dbg(&slot->bus->dev, "dev %02x, dec refcount to %d\n",
-		slot->number, atomic_read(&slot->kobj.kref.refcount) - 1);
+		slot->number, kref_read(&slot->kobj.kref) - 1);
 
 	mutex_lock(&pci_slot_mutex);
 	kobject_put(&slot->kobj);
diff --git a/drivers/scsi/bnx2fc/bnx2fc_io.c b/drivers/scsi/bnx2fc/bnx2fc_io.c
index f501095f91ac..898461b146cc 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_io.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_io.c
@@ -74,7 +74,7 @@ static void bnx2fc_cmd_timeout(struct work_struct *work)
 				    &io_req->req_flags)) {
 			/* Handle internally generated ABTS timeout */
 			BNX2FC_IO_DBG(io_req, "ABTS timed out refcnt = %d\n",
-					io_req->refcount.refcount.counter);
+					kref_read(&io_req->refcount));
 			if (!(test_and_set_bit(BNX2FC_FLAG_ABTS_DONE,
 					       &io_req->req_flags))) {
 				/*
@@ -1141,7 +1141,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
 		return SUCCESS;
 	}
 	BNX2FC_IO_DBG(io_req, "eh_abort - refcnt = %d\n",
-		      io_req->refcount.refcount.counter);
+		      kref_read(&io_req->refcount));
 
 	/* Hold IO request across abort processing */
 	kref_get(&io_req->refcount);
@@ -1299,7 +1299,7 @@ void bnx2fc_process_cleanup_compl(struct bnx2fc_cmd *io_req,
 {
 	BNX2FC_IO_DBG(io_req, "Entered process_cleanup_compl "
 			      "refcnt = %d, cmd_type = %d\n",
-		   io_req->refcount.refcount.counter, io_req->cmd_type);
+		   kref_read(&io_req->refcount), io_req->cmd_type);
 	bnx2fc_scsi_done(io_req, DID_ERROR);
 	kref_put(&io_req->refcount, bnx2fc_cmd_release);
 	if (io_req->wait_for_comp)
@@ -1318,7 +1318,7 @@ void bnx2fc_process_abts_compl(struct bnx2fc_cmd *io_req,
 	BNX2FC_IO_DBG(io_req, "Entered process_abts_compl xid = 0x%x"
 			      "refcnt = %d, cmd_type = %d\n",
 		   io_req->xid,
-		   io_req->refcount.refcount.counter, io_req->cmd_type);
+		   kref_read(&io_req->refcount), io_req->cmd_type);
 
 	if (test_and_set_bit(BNX2FC_FLAG_ABTS_DONE,
 				       &io_req->req_flags)) {
diff --git a/drivers/scsi/cxgbi/libcxgbi.h b/drivers/scsi/cxgbi/libcxgbi.h
index 95ba99044c3e..18e0ea83d361 100644
--- a/drivers/scsi/cxgbi/libcxgbi.h
+++ b/drivers/scsi/cxgbi/libcxgbi.h
@@ -301,7 +301,7 @@ static inline void __cxgbi_sock_put(const char *fn, struct cxgbi_sock *csk)
 {
 	log_debug(1 << CXGBI_DBG_SOCK,
 		"%s, put csk 0x%p, ref %u-1.\n",
-		fn, csk, atomic_read(&csk->refcnt.refcount));
+		fn, csk, kref_read(&csk->refcnt));
 	kref_put(&csk->refcnt, cxgbi_sock_free);
 }
 #define cxgbi_sock_put(csk)	__cxgbi_sock_put(__func__, csk)
@@ -310,7 +310,7 @@ static inline void __cxgbi_sock_get(const char *fn, struct cxgbi_sock *csk)
 {
 	log_debug(1 << CXGBI_DBG_SOCK,
 		"%s, get csk 0x%p, ref %u+1.\n",
-		fn, csk, atomic_read(&csk->refcnt.refcount));
+		fn, csk, kref_read(&csk->refcnt));
 	kref_get(&csk->refcnt);
 }
 #define cxgbi_sock_get(csk)	__cxgbi_sock_get(__func__, csk)
diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c
index a63542bac153..caa7a7b0ec53 100644
--- a/drivers/scsi/lpfc/lpfc_debugfs.c
+++ b/drivers/scsi/lpfc/lpfc_debugfs.c
@@ -607,7 +607,7 @@ lpfc_debugfs_nodelist_data(struct lpfc_vport *vport, char *buf, int size)
 		len += snprintf(buf+len, size-len, "usgmap:%x ",
 			ndlp->nlp_usg_map);
 		len += snprintf(buf+len, size-len, "refcnt:%x",
-			atomic_read(&ndlp->kref.refcount));
+			kref_read(&ndlp->kref));
 		len +=  snprintf(buf+len, size-len, "\n");
 	}
 	spin_unlock_irq(shost->host_lock);
diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index 236e4e51d161..7a8829546068 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -3688,7 +3688,7 @@ lpfc_mbx_cmpl_dflt_rpi(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 		lpfc_printf_vlog(ndlp->vport, KERN_INFO, LOG_NODE,
 				 "0006 rpi%x DID:%x flg:%x %d map:%x %p\n",
 				 ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
-				 atomic_read(&ndlp->kref.refcount),
+				 kref_read(&ndlp->kref),
 				 ndlp->nlp_usg_map, ndlp);
 		if (NLP_CHK_NODE_ACT(ndlp)) {
 			lpfc_nlp_put(ndlp);
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index ed223937798a..82047070cdc9 100644
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -3440,7 +3440,7 @@ lpfc_mbx_cmpl_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 	lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
 			 "0002 rpi:%x DID:%x flg:%x %d map:%x %p\n",
 			 ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
-			 atomic_read(&ndlp->kref.refcount),
+			 kref_read(&ndlp->kref),
 			 ndlp->nlp_usg_map, ndlp);
 	if (ndlp->nlp_flag & NLP_REG_LOGIN_SEND)
 		ndlp->nlp_flag &= ~NLP_REG_LOGIN_SEND;
@@ -3861,7 +3861,7 @@ out:
 	lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
 			 "0003 rpi:%x DID:%x flg:%x %d map%x %p\n",
 			 ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
-			 atomic_read(&ndlp->kref.refcount),
+			 kref_read(&ndlp->kref),
 			 ndlp->nlp_usg_map, ndlp);
 
 	if (vport->port_state < LPFC_VPORT_READY) {
@@ -4238,7 +4238,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 				"0277 lpfc_enable_node: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		return NULL;
 	}
 	/* The ndlp should not already be in active mode */
@@ -4248,7 +4248,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 				"0278 lpfc_enable_node: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		return NULL;
 	}
 
@@ -4272,7 +4272,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 				 "0008 rpi:%x DID:%x flg:%x refcnt:%d "
 				 "map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
 				 ndlp->nlp_flag,
-				 atomic_read(&ndlp->kref.refcount),
+				 kref_read(&ndlp->kref),
 				 ndlp->nlp_usg_map, ndlp);
 	}
 
@@ -4546,7 +4546,7 @@ lpfc_unreg_rpi(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 				    (bf_get(lpfc_sli_intf_if_type,
 				     &phba->sli4_hba.sli_intf) ==
 				      LPFC_SLI_INTF_IF_TYPE_2) &&
-				    (atomic_read(&ndlp->kref.refcount) > 0)) {
+				    (kref_read(&ndlp->kref) > 0)) {
 					mbox->context1 = lpfc_nlp_get(ndlp);
 					mbox->mbox_cmpl =
 						lpfc_sli4_unreg_rpi_cmpl_clr;
@@ -4695,14 +4695,14 @@ lpfc_cleanup_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 				"0280 lpfc_cleanup_node: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		lpfc_dequeue_node(vport, ndlp);
 	} else {
 		lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
 				"0281 lpfc_cleanup_node: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		lpfc_disable_node(vport, ndlp);
 	}
 
@@ -4791,7 +4791,7 @@ lpfc_nlp_remove(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 		lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE,
 				 "0005 rpi:%x DID:%x flg:%x %d map:%x %p\n",
 				 ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
-				 atomic_read(&ndlp->kref.refcount),
+				 kref_read(&ndlp->kref),
 				 ndlp->nlp_usg_map, ndlp);
 		if ((mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL))
 			!= NULL) {
@@ -5557,7 +5557,7 @@ lpfc_mbx_cmpl_fdmi_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 	lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
 			 "0004 rpi:%x DID:%x flg:%x %d map:%x %p\n",
 			 ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
-			 atomic_read(&ndlp->kref.refcount),
+			 kref_read(&ndlp->kref),
 			 ndlp->nlp_usg_map, ndlp);
 	/*
 	 * Start issuing Fabric-Device Management Interface (FDMI) command to
@@ -5728,7 +5728,7 @@ lpfc_nlp_init(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 				 "0007 rpi:%x DID:%x flg:%x refcnt:%d "
 				 "map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
 				 ndlp->nlp_flag,
-				 atomic_read(&ndlp->kref.refcount),
+				 kref_read(&ndlp->kref),
 				 ndlp->nlp_usg_map, ndlp);
 
 		ndlp->active_rrqs_xri_bitmap =
@@ -5767,7 +5767,7 @@ lpfc_nlp_release(struct kref *kref)
 			"0279 lpfc_nlp_release: ndlp:x%p did %x "
 			"usgmap:x%x refcnt:%d rpi:%x\n",
 			(void *)ndlp, ndlp->nlp_DID, ndlp->nlp_usg_map,
-			atomic_read(&ndlp->kref.refcount), ndlp->nlp_rpi);
+			kref_read(&ndlp->kref), ndlp->nlp_rpi);
 
 	/* remove ndlp from action. */
 	lpfc_nlp_remove(ndlp->vport, ndlp);
@@ -5804,7 +5804,7 @@ lpfc_nlp_get(struct lpfc_nodelist *ndlp)
 		lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
 			"node get:        did:x%x flg:x%x refcnt:x%x",
 			ndlp->nlp_DID, ndlp->nlp_flag,
-			atomic_read(&ndlp->kref.refcount));
+			kref_read(&ndlp->kref));
 		/* The check of ndlp usage to prevent incrementing the
 		 * ndlp reference count that is in the process of being
 		 * released.
@@ -5817,7 +5817,7 @@ lpfc_nlp_get(struct lpfc_nodelist *ndlp)
 				"0276 lpfc_nlp_get: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 			return NULL;
 		} else
 			kref_get(&ndlp->kref);
@@ -5844,7 +5844,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
 	lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
 	"node put:        did:x%x flg:x%x refcnt:x%x",
 		ndlp->nlp_DID, ndlp->nlp_flag,
-		atomic_read(&ndlp->kref.refcount));
+		kref_read(&ndlp->kref));
 	phba = ndlp->phba;
 	spin_lock_irqsave(&phba->ndlp_lock, flags);
 	/* Check the ndlp memory free acknowledge flag to avoid the
@@ -5857,7 +5857,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
 				"0274 lpfc_nlp_put: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		return 1;
 	}
 	/* Check the ndlp inactivate log flag to avoid the possible
@@ -5870,7 +5870,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
 				"0275 lpfc_nlp_put: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		return 1;
 	}
 	/* For last put, mark the ndlp usage flags to make sure no
@@ -5878,7 +5878,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
 	 * in between the process when the final kref_put has been
 	 * invoked on this ndlp.
 	 */
-	if (atomic_read(&ndlp->kref.refcount) == 1) {
+	if (kref_read(&ndlp->kref) == 1) {
 		/* Indicate ndlp is put to inactive state. */
 		NLP_SET_IACT_REQ(ndlp);
 		/* Acknowledge ndlp memory free has been seen. */
@@ -5906,8 +5906,8 @@ lpfc_nlp_not_used(struct lpfc_nodelist *ndlp)
 	lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
 		"node not used:   did:x%x flg:x%x refcnt:x%x",
 		ndlp->nlp_DID, ndlp->nlp_flag,
-		atomic_read(&ndlp->kref.refcount));
-	if (atomic_read(&ndlp->kref.refcount) == 1)
+		kref_read(&ndlp->kref));
+	if (kref_read(&ndlp->kref) == 1)
 		if (lpfc_nlp_put(ndlp))
 			return 1;
 	return 0;
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 4776fd85514f..64717c171b15 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -2660,8 +2660,7 @@ lpfc_cleanup(struct lpfc_vport *vport)
 						"usgmap:x%x refcnt:%d\n",
 						ndlp->nlp_DID, (void *)ndlp,
 						ndlp->nlp_usg_map,
-						atomic_read(
-							&ndlp->kref.refcount));
+						kref_read(&ndlp->kref));
 			}
 			break;
 		}
diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
index 6643f6fc7795..dd28c69b6a92 100644
--- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c
+++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
@@ -371,7 +371,7 @@ static int tcm_qla2xxx_write_pending(struct se_cmd *se_cmd)
 		 */
 		pr_debug("write_pending aborted cmd[%p] refcount %d "
 			"transport_state %x, t_state %x, se_cmd_flags %x\n",
-			cmd,cmd->se_cmd.cmd_kref.refcount.counter,
+			cmd, kref_read(&cmd->se_cmd.cmd_kref),
 			cmd->se_cmd.transport_state,
 			cmd->se_cmd.t_state,
 			cmd->se_cmd.se_cmd_flags);
@@ -584,7 +584,7 @@ static int tcm_qla2xxx_queue_data_in(struct se_cmd *se_cmd)
 		 */
 		pr_debug("queue_data_in aborted cmd[%p] refcount %d "
 			"transport_state %x, t_state %x, se_cmd_flags %x\n",
-			cmd,cmd->se_cmd.cmd_kref.refcount.counter,
+			cmd, kref_read(&cmd->se_cmd.cmd_kref),
 			cmd->se_cmd.transport_state,
 			cmd->se_cmd.t_state,
 			cmd->se_cmd.se_cmd_flags);
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index b653451843c8..937c2d5d7ec3 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -1300,7 +1300,7 @@ static int ion_debug_heap_show(struct seq_file *s, void *unused)
 			seq_printf(s, "%16s %16u %16zu %d %d\n",
 				   buffer->task_comm, buffer->pid,
 				   buffer->size, buffer->kmap_cnt,
-				   atomic_read(&buffer->ref.refcount));
+				   kref_read(&buffer->ref));
 			total_orphaned_size += buffer->size;
 		}
 	}
diff --git a/drivers/staging/comedi/comedi_buf.c b/drivers/staging/comedi/comedi_buf.c
index c7d7682b1412..1e1df89b5018 100644
--- a/drivers/staging/comedi/comedi_buf.c
+++ b/drivers/staging/comedi/comedi_buf.c
@@ -188,7 +188,7 @@ bool comedi_buf_is_mmapped(struct comedi_subdevice *s)
 {
 	struct comedi_buf_map *bm = s->async->buf_map;
 
-	return bm && (atomic_read(&bm->refcount.refcount) > 1);
+	return bm && (kref_read(&bm->refcount) > 1);
 }
 
 int comedi_buf_alloc(struct comedi_device *dev, struct comedi_subdevice *s,
diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c
index d761025144f9..e18051185846 100644
--- a/drivers/target/target_core_pr.c
+++ b/drivers/target/target_core_pr.c
@@ -788,7 +788,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration(
 			 * __core_scsi3_add_registration()
 			 */
 			dest_lun = rcu_dereference_check(deve_tmp->se_lun,
-				atomic_read(&deve_tmp->pr_kref.refcount) != 0);
+				kref_read(&deve_tmp->pr_kref) != 0);
 
 			pr_reg_atp = __core_scsi3_do_alloc_registration(dev,
 						nacl_tmp, dest_lun, deve_tmp,
@@ -1463,7 +1463,7 @@ static int core_scsi3_lunacl_depend_item(struct se_dev_entry *se_deve)
 	 * For nacl->dynamic_node_acl=1
 	 */
 	lun_acl = rcu_dereference_check(se_deve->se_lun_acl,
-				atomic_read(&se_deve->pr_kref.refcount) != 0);
+				kref_read(&se_deve->pr_kref) != 0);
 	if (!lun_acl)
 		return 0;
 
@@ -1478,7 +1478,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve)
 	 * For nacl->dynamic_node_acl=1
 	 */
 	lun_acl = rcu_dereference_check(se_deve->se_lun_acl,
-				atomic_read(&se_deve->pr_kref.refcount) != 0);
+				kref_read(&se_deve->pr_kref) != 0);
 	if (!lun_acl) {
 		kref_put(&se_deve->pr_kref, target_pr_kref_release);
 		return;
@@ -1759,7 +1759,7 @@ core_scsi3_decode_spec_i_port(
 		 * 2nd loop which will never fail.
 		 */
 		dest_lun = rcu_dereference_check(dest_se_deve->se_lun,
-				atomic_read(&dest_se_deve->pr_kref.refcount) != 0);
+				kref_read(&dest_se_deve->pr_kref) != 0);
 
 		dest_pr_reg = __core_scsi3_alloc_registration(cmd->se_dev,
 					dest_node_acl, dest_lun, dest_se_deve,
@@ -3466,7 +3466,7 @@ after_iport_check:
 					iport_ptr);
 	if (!dest_pr_reg) {
 		struct se_lun *dest_lun = rcu_dereference_check(dest_se_deve->se_lun,
-				atomic_read(&dest_se_deve->pr_kref.refcount) != 0);
+				kref_read(&dest_se_deve->pr_kref) != 0);
 
 		spin_unlock(&dev->dev_reservation_lock);
 		if (core_scsi3_alloc_registration(cmd->se_dev, dest_node_acl,
diff --git a/drivers/target/tcm_fc/tfc_sess.c b/drivers/target/tcm_fc/tfc_sess.c
index fd5c3de79470..c91979c1463d 100644
--- a/drivers/target/tcm_fc/tfc_sess.c
+++ b/drivers/target/tcm_fc/tfc_sess.c
@@ -454,7 +454,7 @@ static void ft_sess_free(struct kref *kref)
 
 void ft_sess_put(struct ft_sess *sess)
 {
-	int sess_held = atomic_read(&sess->kref.refcount);
+	int sess_held = kref_read(&sess->kref);
 
 	BUG_ON(!sess_held);
 	kref_put(&sess->kref, ft_sess_free);
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 5e746adc8a2d..365443cae5b1 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -3687,7 +3687,7 @@ static void ffs_closed(struct ffs_data *ffs)
 		goto done;
 
 	if (opts->no_configfs || !opts->func_inst.group.cg_item.ci_parent
-	    || !atomic_read(&opts->func_inst.group.cg_item.ci_kref.refcount))
+	    || !kref_read(&opts->func_inst.group.cg_item.ci_kref))
 		goto done;
 
 	ci = opts->func_inst.group.cg_item.ci_parent->ci_parent;
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
index 5e6a2c0a1f0b..1f7d5e46cdda 100644
--- a/fs/exofs/sys.c
+++ b/fs/exofs/sys.c
@@ -122,7 +122,7 @@ void exofs_sysfs_dbg_print(void)
 	list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
 		printk(KERN_INFO "%s: name %s ref %d\n",
 			__func__, kobject_name(k_name),
-			(int)atomic_read(&k_name->kref.refcount));
+			(int)kref_read(&k_name->kref));
 	}
 #endif
 }
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 27d1242c8383..564c504d6efd 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -349,7 +349,7 @@ static void sc_show_sock_container(struct seq_file *seq,
 		   "  func key:        0x%08x\n"
 		   "  func type:       %u\n",
 		   sc,
-		   atomic_read(&sc->sc_kref.refcount),
+		   kref_read(&sc->sc_kref),
 		   &saddr, inet ? ntohs(sport) : 0,
 		   &daddr, inet ? ntohs(dport) : 0,
 		   sc->sc_node->nd_name,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d4b5c81f0445..ec000575e863 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -97,7 +97,7 @@
 	typeof(sc) __sc = (sc);						\
 	mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p "	\
 	     "pg_off %zu] " fmt, __sc,					\
-	     atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock,	\
+	     kref_read(&__sc->sc_kref), __sc->sc_sock,	\
 	    __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off ,	\
 	    ##args);							\
 } while (0)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index e7b760deefae..9b984cae4c4e 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -81,7 +81,7 @@ static void __dlm_print_lock(struct dlm_lock *lock)
 	       lock->ml.type, lock->ml.convert_type, lock->ml.node,
 	       dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
 	       dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-	       atomic_read(&lock->lock_refs.refcount),
+	       kref_read(&lock->lock_refs),
 	       (list_empty(&lock->ast_list) ? 'y' : 'n'),
 	       (lock->ast_pending ? 'y' : 'n'),
 	       (list_empty(&lock->bast_list) ? 'y' : 'n'),
@@ -106,7 +106,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 	printk("lockres: %s, owner=%u, state=%u\n",
 	       buf, res->owner, res->state);
 	printk("  last used: %lu, refcnt: %u, on purge list: %s\n",
-	       res->last_used, atomic_read(&res->refs.refcount),
+	       res->last_used, kref_read(&res->refs),
 	       list_empty(&res->purge) ? "no" : "yes");
 	printk("  on dirty list: %s, on reco list: %s, "
 	       "migrating pending: %s\n",
@@ -298,7 +298,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 			mle_type, mle->master, mle->new_master,
 			!list_empty(&mle->hb_events),
 			!!mle->inuse,
-			atomic_read(&mle->mle_refs.refcount));
+			kref_read(&mle->mle_refs));
 
 	out += snprintf(buf + out, len - out, "Maybe=");
 	out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
@@ -494,7 +494,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
 		       lock->ast_pending, lock->bast_pending,
 		       lock->convert_pending, lock->lock_pending,
 		       lock->cancel_pending, lock->unlock_pending,
-		       atomic_read(&lock->lock_refs.refcount));
+		       kref_read(&lock->lock_refs));
 	spin_unlock(&lock->spinlock);
 
 	return out;
@@ -521,7 +521,7 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
 			!list_empty(&res->recovering),
 			res->inflight_locks, res->migration_pending,
 			atomic_read(&res->asts_reserved),
-			atomic_read(&res->refs.refcount));
+			kref_read(&res->refs));
 
 	/* refmap */
 	out += snprintf(buf + out, len - out, "RMAP:");
@@ -777,7 +777,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
 	/* Purge Count: xxx  Refs: xxx */
 	out += snprintf(buf + out, len - out,
 			"Purge Count: %d  Refs: %d\n", dlm->purge_count,
-			atomic_read(&dlm->dlm_refs.refcount));
+			kref_read(&dlm->dlm_refs));
 
 	/* Dead Node: xxx */
 	out += snprintf(buf + out, len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 733e4e79c8e2..32fd261ae13d 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -2072,7 +2072,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
 
 	mlog(0, "context init: refcount %u\n",
-		  atomic_read(&dlm->dlm_refs.refcount));
+		  kref_read(&dlm->dlm_refs));
 
 leave:
 	if (ret < 0 && dlm) {
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a464c8088170..7025d8c27999 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -233,7 +233,7 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle)
 
 	assert_spin_locked(&dlm->spinlock);
 	assert_spin_locked(&dlm->master_lock);
-	if (!atomic_read(&mle->mle_refs.refcount)) {
+	if (!kref_read(&mle->mle_refs)) {
 		/* this may or may not crash, but who cares.
 		 * it's a BUG. */
 		mlog(ML_ERROR, "bad mle: %p\n", mle);
@@ -1124,9 +1124,9 @@ recheck:
 		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
 
 		/*
-		if (atomic_read(&mle->mle_refs.refcount) < 2)
+		if (kref_read(&mle->mle_refs) < 2)
 			mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
-			atomic_read(&mle->mle_refs.refcount),
+			kref_read(&mle->mle_refs),
 			res->lockname.len, res->lockname.name);
 		*/
 		atomic_set(&mle->woken, 0);
@@ -1979,7 +1979,7 @@ ok:
 		 * on this mle. */
 		spin_lock(&dlm->master_lock);
 
-		rr = atomic_read(&mle->mle_refs.refcount);
+		rr = kref_read(&mle->mle_refs);
 		if (mle->inuse > 0) {
 			if (extra_ref && rr < 3)
 				err = 1;
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 1082b2c3014b..63d701cd1e2e 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -251,7 +251,7 @@ leave:
 		mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
 		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
 		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-		     atomic_read(&lock->lock_refs.refcount)-1);
+		     kref_read(&lock->lock_refs)-1);
 		dlm_lock_put(lock);
 	}
 	if (actions & DLM_UNLOCK_CALL_AST)
diff --git a/include/drm/drm_framebuffer.h b/include/drm/drm_framebuffer.h
index 1ddfa2928802..a232e7f0c869 100644
--- a/include/drm/drm_framebuffer.h
+++ b/include/drm/drm_framebuffer.h
@@ -247,7 +247,7 @@ static inline void drm_framebuffer_unreference(struct drm_framebuffer *fb)
  */
 static inline uint32_t drm_framebuffer_read_refcount(struct drm_framebuffer *fb)
 {
-	return atomic_read(&fb->base.refcount.refcount);
+	return kref_read(&fb->base.refcount);
 }
 
 /**
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index cdbdb40eb5bd..feecf33a1212 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -878,7 +878,7 @@ static inline int ttm_bo_reserve(struct ttm_buffer_object *bo,
 {
 	int ret;
 
-	WARN_ON(!atomic_read(&bo->kref.refcount));
+	WARN_ON(!kref_read(&bo->kref));
 
 	ret = __ttm_bo_reserve(bo, interruptible, no_wait, ticket);
 	if (likely(ret == 0))
@@ -903,7 +903,7 @@ static inline int ttm_bo_reserve_slowpath(struct ttm_buffer_object *bo,
 {
 	int ret = 0;
 
-	WARN_ON(!atomic_read(&bo->kref.refcount));
+	WARN_ON(!kref_read(&bo->kref));
 
 	if (interruptible)
 		ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock,
diff --git a/include/linux/kref.h b/include/linux/kref.h
index 9af255ad1e2f..7c88d865f82f 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -35,6 +35,11 @@ static inline void kref_init(struct kref *kref)
 	atomic_set(&kref->refcount, 1);
 }
 
+static inline int kref_read(const struct kref *kref)
+{
+	return atomic_read(&kref->refcount);
+}
+
 /**
  * kref_get - increment refcount for object.
  * @kref: object.
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 62a60eeacb0a..8a511c0985aa 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -198,7 +198,7 @@ static inline struct cache_head  *cache_get(struct cache_head *h)
 
 static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
 {
-	if (atomic_read(&h->ref.refcount) <= 2 &&
+	if (kref_read(&h->ref) <= 2 &&
 	    h->expiry_time < cd->nextcheck)
 		cd->nextcheck = h->expiry_time;
 	kref_put(&h->ref, cd->cache_put);
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 554671c81f4a..90708f68cc02 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -987,7 +987,7 @@ static inline void hci_conn_drop(struct hci_conn *conn)
 static inline void hci_dev_put(struct hci_dev *d)
 {
 	BT_DBG("%s orig refcnt %d", d->name,
-	       atomic_read(&d->dev.kobj.kref.refcount));
+	       kref_read(&d->dev.kobj.kref));
 
 	put_device(&d->dev);
 }
@@ -995,7 +995,7 @@ static inline void hci_dev_put(struct hci_dev *d)
 static inline struct hci_dev *hci_dev_hold(struct hci_dev *d)
 {
 	BT_DBG("%s orig refcnt %d", d->name,
-	       atomic_read(&d->dev.kobj.kref.refcount));
+	       kref_read(&d->dev.kobj.kref));
 
 	get_device(&d->dev);
 	return d;
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 1904a93f47d5..d491529332f4 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -920,7 +920,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
 			BT_DBG("dev %p removing %speer %p", dev,
 			       last ? "last " : "1 ", peer);
 			BT_DBG("chan %p orig refcnt %d", chan,
-			       atomic_read(&chan->kref.refcount));
+			       kref_read(&chan->kref));
 
 			l2cap_chan_put(chan);
 			break;
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 5f123c3320a7..f0095fd79818 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -810,7 +810,7 @@ static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked)
 /* AMP Manager functions */
 struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr)
 {
-	BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+	BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
 
 	kref_get(&mgr->kref);
 
@@ -833,7 +833,7 @@ static void amp_mgr_destroy(struct kref *kref)
 
 int amp_mgr_put(struct amp_mgr *mgr)
 {
-	BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+	BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
 
 	return kref_put(&mgr->kref, &amp_mgr_destroy);
 }
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
index e32f34189007..02a4ccc04e1e 100644
--- a/net/bluetooth/amp.c
+++ b/net/bluetooth/amp.c
@@ -24,7 +24,7 @@
 void amp_ctrl_get(struct amp_ctrl *ctrl)
 {
 	BT_DBG("ctrl %p orig refcnt %d", ctrl,
-	       atomic_read(&ctrl->kref.refcount));
+	       kref_read(&ctrl->kref));
 
 	kref_get(&ctrl->kref);
 }
@@ -42,7 +42,7 @@ static void amp_ctrl_destroy(struct kref *kref)
 int amp_ctrl_put(struct amp_ctrl *ctrl)
 {
 	BT_DBG("ctrl %p orig refcnt %d", ctrl,
-	       atomic_read(&ctrl->kref.refcount));
+	       kref_read(&ctrl->kref));
 
 	return kref_put(&ctrl->kref, &amp_ctrl_destroy);
 }
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index ce0b5dd01953..fc7f321a3823 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -481,14 +481,14 @@ static void l2cap_chan_destroy(struct kref *kref)
 
 void l2cap_chan_hold(struct l2cap_chan *c)
 {
-	BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+	BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
 
 	kref_get(&c->kref);
 }
 
 void l2cap_chan_put(struct l2cap_chan *c)
 {
-	BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+	BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
 
 	kref_put(&c->kref, l2cap_chan_destroy);
 }
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 770c52701efa..bad3d4ae43f6 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -3425,7 +3425,7 @@ static void ceph_msg_release(struct kref *kref)
 struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
 {
 	dout("%s %p (was %d)\n", __func__, msg,
-	     atomic_read(&msg->kref.refcount));
+	     kref_read(&msg->kref));
 	kref_get(&msg->kref);
 	return msg;
 }
@@ -3434,7 +3434,7 @@ EXPORT_SYMBOL(ceph_msg_get);
 void ceph_msg_put(struct ceph_msg *msg)
 {
 	dout("%s %p (was %d)\n", __func__, msg,
-	     atomic_read(&msg->kref.refcount));
+	     kref_read(&msg->kref));
 	kref_put(&msg->kref, ceph_msg_release);
 }
 EXPORT_SYMBOL(ceph_msg_put);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 842f049abb86..f3378ba1a828 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -438,7 +438,7 @@ static void ceph_osdc_release_request(struct kref *kref)
 void ceph_osdc_get_request(struct ceph_osd_request *req)
 {
 	dout("%s %p (was %d)\n", __func__, req,
-	     atomic_read(&req->r_kref.refcount));
+	     kref_read(&req->r_kref));
 	kref_get(&req->r_kref);
 }
 EXPORT_SYMBOL(ceph_osdc_get_request);
@@ -447,7 +447,7 @@ void ceph_osdc_put_request(struct ceph_osd_request *req)
 {
 	if (req) {
 		dout("%s %p (was %d)\n", __func__, req,
-		     atomic_read(&req->r_kref.refcount));
+		     kref_read(&req->r_kref));
 		kref_put(&req->r_kref, ceph_osdc_release_request);
 	}
 }
@@ -487,11 +487,11 @@ static void request_reinit(struct ceph_osd_request *req)
 	struct ceph_msg *reply_msg = req->r_reply;
 
 	dout("%s req %p\n", __func__, req);
-	WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+	WARN_ON(kref_read(&req->r_kref) != 1);
 	request_release_checks(req);
 
-	WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
-	WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+	WARN_ON(kref_read(&request_msg->kref) != 1);
+	WARN_ON(kref_read(&reply_msg->kref) != 1);
 	target_destroy(&req->r_t);
 
 	request_init(req);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 8147e8d56eb2..f39e3e11f9aa 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1358,7 +1358,7 @@ static int c_show(struct seq_file *m, void *p)
 	ifdebug(CACHE)
 		seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
 			   convert_to_wallclock(cp->expiry_time),
-			   atomic_read(&cp->ref.refcount), cp->flags);
+			   kref_read(&cp->ref), cp->flags);
 	cache_get(cp);
 	if (cache_check(cd, cp, NULL))
 		/* cache_check does a cache_put on failure */
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 3bc1d61694cb..04e7f8707d71 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
 		svc_xprt_get(xprt);
 
 		dprintk("svc: transport %p dequeued, inuse=%d\n",
-			xprt, atomic_read(&xprt->xpt_ref.refcount));
+			xprt, kref_read(&xprt->xpt_ref));
 	}
 	spin_unlock_bh(&pool->sp_lock);
 out:
@@ -820,7 +820,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
 		/* XPT_DATA|XPT_DEFERRED case: */
 		dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
 			rqstp, rqstp->rq_pool->sp_id, xprt,
-			atomic_read(&xprt->xpt_ref.refcount));
+			kref_read(&xprt->xpt_ref));
 		rqstp->rq_deferred = svc_deferred_dequeue(xprt);
 		if (rqstp->rq_deferred)
 			len = svc_deferred_recv(rqstp);
@@ -978,7 +978,7 @@ static void svc_age_temp_xprts(unsigned long closure)
 		 * through, close it. */
 		if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
 			continue;
-		if (atomic_read(&xprt->xpt_ref.refcount) > 1 ||
+		if (kref_read(&xprt->xpt_ref) > 1 ||
 		    test_bit(XPT_BUSY, &xprt->xpt_flags))
 			continue;
 		list_del_init(le);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index ca2799af05a6..39652d390a9c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1201,9 +1201,9 @@ static void __svc_rdma_free(struct work_struct *work)
 		ib_drain_qp(rdma->sc_qp);
 
 	/* We should only be called from kref_put */
-	if (atomic_read(&xprt->xpt_ref.refcount) != 0)
+	if (kref_read(&xprt->xpt_ref) != 0)
 		pr_err("svcrdma: sc_xprt still in use? (%d)\n",
-		       atomic_read(&xprt->xpt_ref.refcount));
+		       kref_read(&xprt->xpt_ref));
 
 	/*
 	 * Destroy queued, but not processed read completions. Note
-- 
cgit v1.2.3


From bdfafc4ffdd24e491119d81f85ddc4393fa49803 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 14 Nov 2016 17:34:19 +0100
Subject: locking/atomic, kref: Kill kref_sub()

By general sentiment kref_sub() is a bad interface, make it go away.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/block/drbd/drbd_main.c         |  7 ++--
 drivers/block/drbd/drbd_req.c          | 31 ++++++------------
 drivers/gpu/drm/ttm/ttm_bo.c           | 59 +++++++++-------------------------
 drivers/gpu/drm/ttm/ttm_execbuf_util.c |  4 +--
 include/drm/ttm/ttm_bo_api.h           | 15 +--------
 include/linux/kref.h                   | 32 +++---------------
 6 files changed, 36 insertions(+), 112 deletions(-)

(limited to 'include')

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 83482721bc01..c3ff60c30dde 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2948,7 +2948,6 @@ void drbd_delete_device(struct drbd_device *device)
 	struct drbd_resource *resource = device->resource;
 	struct drbd_connection *connection;
 	struct drbd_peer_device *peer_device;
-	int refs = 3;
 
 	/* move to free_peer_device() */
 	for_each_peer_device(peer_device, device)
@@ -2956,13 +2955,15 @@ void drbd_delete_device(struct drbd_device *device)
 	drbd_debugfs_device_cleanup(device);
 	for_each_connection(connection, resource) {
 		idr_remove(&connection->peer_devices, device->vnr);
-		refs++;
+		kref_put(&device->kref, drbd_destroy_device);
 	}
 	idr_remove(&resource->devices, device->vnr);
+	kref_put(&device->kref, drbd_destroy_device);
 	idr_remove(&drbd_devices, device_to_minor(device));
+	kref_put(&device->kref, drbd_destroy_device);
 	del_gendisk(device->vdisk);
 	synchronize_rcu();
-	kref_sub(&device->kref, refs, drbd_destroy_device);
+	kref_put(&device->kref, drbd_destroy_device);
 }
 
 static int __init drbd_init(void)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 74306c054983..b489ac2e9c44 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -421,7 +421,6 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 	struct drbd_peer_device *peer_device = first_peer_device(device);
 	unsigned s = req->rq_state;
 	int c_put = 0;
-	int k_put = 0;
 
 	if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP))
 		set |= RQ_COMPLETION_SUSP;
@@ -437,6 +436,8 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 
 	/* intent: get references */
 
+	kref_get(&req->kref);
+
 	if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
 		atomic_inc(&req->completion_ref);
 
@@ -473,15 +474,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 
 	if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
 		D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING);
-		/* local completion may still come in later,
-		 * we need to keep the req object around. */
-		kref_get(&req->kref);
 		++c_put;
 	}
 
 	if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
 		if (req->rq_state & RQ_LOCAL_ABORTED)
-			++k_put;
+			kref_put(&req->kref, drbd_req_destroy);
 		else
 			++c_put;
 		list_del_init(&req->req_pending_local);
@@ -503,7 +501,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		if (s & RQ_NET_SENT)
 			atomic_sub(req->i.size >> 9, &device->ap_in_flight);
 		if (s & RQ_EXP_BARR_ACK)
-			++k_put;
+			kref_put(&req->kref, drbd_req_destroy);
 		req->net_done_jif = jiffies;
 
 		/* in ahead/behind mode, or just in case,
@@ -516,25 +514,16 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 
 	/* potentially complete and destroy */
 
-	if (k_put || c_put) {
-		/* Completion does it's own kref_put.  If we are going to
-		 * kref_sub below, we need req to be still around then. */
-		int at_least = k_put + !!c_put;
-		int refcount = kref_read(&req->kref);
-		if (refcount < at_least)
-			drbd_err(device,
-				"mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
-				s, req->rq_state, refcount, at_least);
-	}
-
 	/* If we made progress, retry conflicting peer requests, if any. */
 	if (req->i.waiting)
 		wake_up(&device->misc_wait);
 
-	if (c_put)
-		k_put += drbd_req_put_completion_ref(req, m, c_put);
-	if (k_put)
-		kref_sub(&req->kref, k_put, drbd_req_destroy);
+	if (c_put) {
+		if (drbd_req_put_completion_ref(req, m, c_put))
+			kref_put(&req->kref, drbd_req_destroy);
+	} else {
+		kref_put(&req->kref, drbd_req_destroy);
+	}
 }
 
 static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req)
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 30aefcc0c969..ffc6cb55c78c 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -181,61 +181,46 @@ void ttm_bo_add_to_lru(struct ttm_buffer_object *bo)
 }
 EXPORT_SYMBOL(ttm_bo_add_to_lru);
 
-int ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
+static void ttm_bo_ref_bug(struct kref *list_kref)
+{
+	BUG();
+}
+
+void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
-	int put_count = 0;
 
 	if (bdev->driver->lru_removal)
 		bdev->driver->lru_removal(bo);
 
 	if (!list_empty(&bo->swap)) {
 		list_del_init(&bo->swap);
-		++put_count;
+		kref_put(&bo->list_kref, ttm_bo_ref_bug);
 	}
 	if (!list_empty(&bo->lru)) {
 		list_del_init(&bo->lru);
-		++put_count;
+		kref_put(&bo->list_kref, ttm_bo_ref_bug);
 	}
-
-	return put_count;
-}
-
-static void ttm_bo_ref_bug(struct kref *list_kref)
-{
-	BUG();
-}
-
-void ttm_bo_list_ref_sub(struct ttm_buffer_object *bo, int count,
-			 bool never_free)
-{
-	kref_sub(&bo->list_kref, count,
-		 (never_free) ? ttm_bo_ref_bug : ttm_bo_release_list);
 }
 
 void ttm_bo_del_sub_from_lru(struct ttm_buffer_object *bo)
 {
-	int put_count;
-
 	spin_lock(&bo->glob->lru_lock);
-	put_count = ttm_bo_del_from_lru(bo);
+	ttm_bo_del_from_lru(bo);
 	spin_unlock(&bo->glob->lru_lock);
-	ttm_bo_list_ref_sub(bo, put_count, true);
 }
 EXPORT_SYMBOL(ttm_bo_del_sub_from_lru);
 
 void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
-	int put_count = 0;
 
 	lockdep_assert_held(&bo->resv->lock.base);
 
 	if (bdev->driver->lru_removal)
 		bdev->driver->lru_removal(bo);
 
-	put_count = ttm_bo_del_from_lru(bo);
-	ttm_bo_list_ref_sub(bo, put_count, true);
+	ttm_bo_del_from_lru(bo);
 	ttm_bo_add_to_lru(bo);
 }
 EXPORT_SYMBOL(ttm_bo_move_to_lru_tail);
@@ -447,7 +432,6 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_bo_global *glob = bo->glob;
-	int put_count;
 	int ret;
 
 	spin_lock(&glob->lru_lock);
@@ -455,13 +439,10 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 
 	if (!ret) {
 		if (!ttm_bo_wait(bo, false, true)) {
-			put_count = ttm_bo_del_from_lru(bo);
-
+			ttm_bo_del_from_lru(bo);
 			spin_unlock(&glob->lru_lock);
 			ttm_bo_cleanup_memtype_use(bo);
 
-			ttm_bo_list_ref_sub(bo, put_count, true);
-
 			return;
 		} else
 			ttm_bo_flush_all_fences(bo);
@@ -504,7 +485,6 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo,
 					  bool no_wait_gpu)
 {
 	struct ttm_bo_global *glob = bo->glob;
-	int put_count;
 	int ret;
 
 	ret = ttm_bo_wait(bo, false, true);
@@ -554,15 +534,13 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo,
 		return ret;
 	}
 
-	put_count = ttm_bo_del_from_lru(bo);
+	ttm_bo_del_from_lru(bo);
 	list_del_init(&bo->ddestroy);
-	++put_count;
+	kref_put(&bo->list_kref, ttm_bo_ref_bug);
 
 	spin_unlock(&glob->lru_lock);
 	ttm_bo_cleanup_memtype_use(bo);
 
-	ttm_bo_list_ref_sub(bo, put_count, true);
-
 	return 0;
 }
 
@@ -740,7 +718,7 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 	struct ttm_bo_global *glob = bdev->glob;
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
 	struct ttm_buffer_object *bo;
-	int ret = -EBUSY, put_count;
+	int ret = -EBUSY;
 
 	spin_lock(&glob->lru_lock);
 	list_for_each_entry(bo, &man->lru, lru) {
@@ -771,13 +749,11 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 		return ret;
 	}
 
-	put_count = ttm_bo_del_from_lru(bo);
+	ttm_bo_del_from_lru(bo);
 	spin_unlock(&glob->lru_lock);
 
 	BUG_ON(ret != 0);
 
-	ttm_bo_list_ref_sub(bo, put_count, true);
-
 	ret = ttm_bo_evict(bo, interruptible, no_wait_gpu);
 	ttm_bo_unreserve(bo);
 
@@ -1669,7 +1645,6 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
 	    container_of(shrink, struct ttm_bo_global, shrink);
 	struct ttm_buffer_object *bo;
 	int ret = -EBUSY;
-	int put_count;
 	uint32_t swap_placement = (TTM_PL_FLAG_CACHED | TTM_PL_FLAG_SYSTEM);
 
 	spin_lock(&glob->lru_lock);
@@ -1692,11 +1667,9 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
 		return ret;
 	}
 
-	put_count = ttm_bo_del_from_lru(bo);
+	ttm_bo_del_from_lru(bo);
 	spin_unlock(&glob->lru_lock);
 
-	ttm_bo_list_ref_sub(bo, put_count, true);
-
 	/**
 	 * Move to system cached
 	 */
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index d35bc491e8de..5e1bcabffef5 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -48,9 +48,7 @@ static void ttm_eu_del_from_lru_locked(struct list_head *list)
 
 	list_for_each_entry(entry, list, head) {
 		struct ttm_buffer_object *bo = entry->bo;
-		unsigned put_count = ttm_bo_del_from_lru(bo);
-
-		ttm_bo_list_ref_sub(bo, put_count, true);
+		ttm_bo_del_from_lru(bo);
 	}
 }
 
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 652e45be97c8..9a465314572c 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -332,19 +332,6 @@ extern int ttm_bo_validate(struct ttm_buffer_object *bo,
  */
 extern void ttm_bo_unref(struct ttm_buffer_object **bo);
 
-
-/**
- * ttm_bo_list_ref_sub
- *
- * @bo: The buffer object.
- * @count: The number of references with which to decrease @bo::list_kref;
- * @never_free: The refcount should not reach zero with this operation.
- *
- * Release @count lru list references to this buffer object.
- */
-extern void ttm_bo_list_ref_sub(struct ttm_buffer_object *bo, int count,
-				bool never_free);
-
 /**
  * ttm_bo_add_to_lru
  *
@@ -367,7 +354,7 @@ extern void ttm_bo_add_to_lru(struct ttm_buffer_object *bo);
  * and is usually called just immediately after the bo has been reserved to
  * avoid recursive reservation from lru lists.
  */
-extern int ttm_bo_del_from_lru(struct ttm_buffer_object *bo);
+extern void ttm_bo_del_from_lru(struct ttm_buffer_object *bo);
 
 /**
  * ttm_bo_move_to_lru_tail
diff --git a/include/linux/kref.h b/include/linux/kref.h
index 7c88d865f82f..31c49a64cdf9 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -54,9 +54,8 @@ static inline void kref_get(struct kref *kref)
 }
 
 /**
- * kref_sub - subtract a number of refcounts for object.
+ * kref_put - decrement refcount for object.
  * @kref: object.
- * @count: Number of recounts to subtract.
  * @release: pointer to the function that will clean up the object when the
  *	     last reference to the object is released.
  *	     This pointer is required, and it is not acceptable to pass kfree
@@ -65,46 +64,23 @@ static inline void kref_get(struct kref *kref)
  *	     maintainer, and anyone else who happens to notice it.  You have
  *	     been warned.
  *
- * Subtract @count from the refcount, and if 0, call release().
+ * Decrement the refcount, and if 0, call release().
  * Return 1 if the object was removed, otherwise return 0.  Beware, if this
  * function returns 0, you still can not count on the kref from remaining in
  * memory.  Only use the return value if you want to see if the kref is now
  * gone, not present.
  */
-static inline int kref_sub(struct kref *kref, unsigned int count,
-	     void (*release)(struct kref *kref))
+static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
 {
 	WARN_ON(release == NULL);
 
-	if (atomic_sub_and_test((int) count, &kref->refcount)) {
+	if (atomic_dec_and_test(&kref->refcount)) {
 		release(kref);
 		return 1;
 	}
 	return 0;
 }
 
-/**
- * kref_put - decrement refcount for object.
- * @kref: object.
- * @release: pointer to the function that will clean up the object when the
- *	     last reference to the object is released.
- *	     This pointer is required, and it is not acceptable to pass kfree
- *	     in as this function.  If the caller does pass kfree to this
- *	     function, you will be publicly mocked mercilessly by the kref
- *	     maintainer, and anyone else who happens to notice it.  You have
- *	     been warned.
- *
- * Decrement the refcount, and if 0, call release().
- * Return 1 if the object was removed, otherwise return 0.  Beware, if this
- * function returns 0, you still can not count on the kref from remaining in
- * memory.  Only use the return value if you want to see if the kref is now
- * gone, not present.
- */
-static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
-{
-	return kref_sub(kref, 1, release);
-}
-
 static inline int kref_put_mutex(struct kref *kref,
 				 void (*release)(struct kref *kref),
 				 struct mutex *lock)
-- 
cgit v1.2.3


From 23b19ec3377924eb8f303880102e056e9214ba72 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 14 Jan 2017 12:11:59 +0100
Subject: locking/ww_mutex: Turn off __must_check for now
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the ww_mutex inline wrappers gone there's a lot of dormant
anti-patterns emerging in an x86 allyesconfig build:

  kernel/locking/test-ww_mutex.c:80:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  kernel/locking/test-ww_mutex.c:55:3: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  kernel/locking/test-ww_mutex.c:134:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  kernel/locking/test-ww_mutex.c:213:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  kernel/locking/test-ww_mutex.c:177:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  kernel/locking/test-ww_mutex.c:266:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/drm_modeset_lock.c:430:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c:70:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/vgem/vgem_fence.c:193:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/i915/i915_gem_batch_pool.c:125:4: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/i915/i915_gem_execbuffer.c:1302:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/radeon/radeon_prime.c:69:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/nouveau/nouveau_prime.c:70:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]

... but we cannot just litter the kernel build log with such warnings.

These need to be fixed separately - turn off the warning for now.

Cc: Nicolai Hähnle <nicolai.haehnle@amd.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <dev@mblankhorst.nl>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dri-devel@lists.freedesktop.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ww_mutex.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 083950fd5b0b..5dd9a7682227 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -215,8 +215,7 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
  *
  * A mutex acquired with this function must be released with ww_mutex_unlock.
  */
-extern int __must_check ww_mutex_lock(struct ww_mutex *lock,
-				      struct ww_acquire_ctx *ctx);
+extern int /* __must_check */ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx);
 
 /**
  * ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible
-- 
cgit v1.2.3


From f21860bac05b609d71757338361d26209ff0759b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 14 Jan 2017 17:11:36 +0100
Subject: locking/mutex, sched/wait: Fix the mutex_lock_io_nested() define

Mike noticed this bogosity:

 > > +# define mutex_lock_nest_io(lock, nest_lock) mutex_io(lock)
 >                                                 ^^^^^^^^^^^^^^ typo

This new locking API is not used yet, so this didn't trigger in testing.

Fix it.

Reported-by: Mike Galbraith <efault@gmx.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adilger.kernel@dilger.ca
Cc: jack@suse.com
Cc: kernel-team@fb.com
Cc: mingbo@fb.com
Cc: tytso@mit.edu
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 980ba16b8749..7fffbfcd5430 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -179,7 +179,7 @@ extern void mutex_lock_io(struct mutex *lock);
 # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
 # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock)
 # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
-# define mutex_lock_nest_io(lock, nest_lock) mutex_io(lock)
+# define mutex_lock_io_nested(lock, subclass) mutex_lock(lock)
 #endif
 
 /*
-- 
cgit v1.2.3


From 0a13cd1a05e7a549259d4f803d2ec2efda07ed7c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 14 Nov 2016 18:03:11 +0100
Subject: locking/atomic, kref: Implement kref_put_lock()

Because home-rolling your own is _awesome_, stop doing it. Provide
kref_put_lock(), just like kref_put_mutex() but for a spinlock.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kref.h | 22 ++++++++++++++++------
 net/sunrpc/svcauth.c | 15 ++++++++++-----
 2 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/kref.h b/include/linux/kref.h
index 31c49a64cdf9..9db9685fed84 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -19,6 +19,7 @@
 #include <linux/atomic.h>
 #include <linux/kernel.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 
 struct kref {
 	atomic_t refcount;
@@ -86,12 +87,21 @@ static inline int kref_put_mutex(struct kref *kref,
 				 struct mutex *lock)
 {
 	WARN_ON(release == NULL);
-	if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
-		mutex_lock(lock);
-		if (unlikely(!atomic_dec_and_test(&kref->refcount))) {
-			mutex_unlock(lock);
-			return 0;
-		}
+
+	if (atomic_dec_and_mutex_lock(&kref->refcount, lock)) {
+		release(kref);
+		return 1;
+	}
+	return 0;
+}
+
+static inline int kref_put_lock(struct kref *kref,
+				void (*release)(struct kref *kref),
+				spinlock_t *lock)
+{
+	WARN_ON(release == NULL);
+
+	if (atomic_dec_and_lock(&kref->refcount, lock)) {
 		release(kref);
 		return 1;
 	}
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index e112da8005b5..bb8db3cb8032 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -126,13 +126,18 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister);
 static struct hlist_head	auth_domain_table[DN_HASHMAX];
 static DEFINE_SPINLOCK(auth_domain_lock);
 
+static void auth_domain_release(struct kref *kref)
+{
+	struct auth_domain *dom = container_of(kref, struct auth_domain, ref);
+
+	hlist_del(&dom->hash);
+	dom->flavour->domain_release(dom);
+	spin_unlock(&auth_domain_lock);
+}
+
 void auth_domain_put(struct auth_domain *dom)
 {
-	if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) {
-		hlist_del(&dom->hash);
-		dom->flavour->domain_release(dom);
-		spin_unlock(&auth_domain_lock);
-	}
+	kref_put_lock(&dom->ref, auth_domain_release, &auth_domain_lock);
 }
 EXPORT_SYMBOL_GPL(auth_domain_put);
 
-- 
cgit v1.2.3


From 85b36c931ff328297572a3e6136fac573795ad79 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Wed, 18 Jan 2017 09:38:04 -0800
Subject: jump_labels: Move header guard #endif down where it belongs

The ending header guard is misplaced. This has no
functional change, this is just an eye-sore.

Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bp@alien8.de
Cc: bp@suse.de
Cc: catalin.marinas@arm.com
Cc: jbaron@akamai.com
Cc: pbonzini@redhat.com
Cc: tony.luck@intel.com
Link: http://lkml.kernel.org/r/20170118173804.16281-1-mcgrof@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index a0547c571800..b63d6b7b0db0 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -402,6 +402,6 @@ extern bool ____wrong_branch_error(void);
 #define static_branch_enable(x)		static_key_enable(&(x)->key)
 #define static_branch_disable(x)	static_key_disable(&(x)->key)
 
-#endif	/* _LINUX_JUMP_LABEL_H */
-
 #endif /* __ASSEMBLY__ */
+
+#endif	/* _LINUX_JUMP_LABEL_H */
-- 
cgit v1.2.3


From acb04058de49458010c44bb35b849d45113fd668 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 19 Jan 2017 14:36:33 +0100
Subject: sched/clock: Fix hotplug crash

Mike reported that he could trigger the WARN_ON_ONCE() in
set_sched_clock_stable() using hotplug.

This exposed a fundamental problem with the interface, we should never
mark the TSC stable if we ever find it to be unstable. Therefore
set_sched_clock_stable() is a broken interface.

The reason it existed is that not having it is a pain, it means all
relevant architecture code needs to call clear_sched_clock_stable()
where appropriate.

Of the three architectures that select HAVE_UNSTABLE_SCHED_CLOCK ia64
and parisc are trivial in that they never called
set_sched_clock_stable(), so add an unconditional call to
clear_sched_clock_stable() to them.

For x86 the story is a lot more involved, and what this patch tries to
do is ensure we preserve the status quo. So even is Cyrix or Transmeta
have usable TSC they never called set_sched_clock_stable() so they now
get an explicit mark unstable.

Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 9881b024b7d7 ("sched/clock: Delay switching sched_clock to stable")
Link: http://lkml.kernel.org/r/20170119133633.GB6536@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/kernel/setup.c        |  2 ++
 arch/parisc/kernel/setup.c      |  2 ++
 arch/x86/kernel/cpu/amd.c       |  6 ++++--
 arch/x86/kernel/cpu/centaur.c   |  6 ++++--
 arch/x86/kernel/cpu/common.c    |  3 +++
 arch/x86/kernel/cpu/cyrix.c     |  2 ++
 arch/x86/kernel/cpu/intel.c     |  6 ++++--
 arch/x86/kernel/cpu/transmeta.c |  3 +++
 arch/x86/kernel/kvmclock.c      |  2 +-
 include/linux/sched.h           |  1 -
 kernel/sched/clock.c            | 29 ++++++++---------------------
 11 files changed, 33 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 7ec7acc844c2..c483ece3eb84 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -619,6 +619,8 @@ setup_arch (char **cmdline_p)
 	check_sal_cache_flush();
 #endif
 	paging_init();
+
+	clear_sched_clock_stable();
 }
 
 /*
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index 2e66a887788e..068ed3607bac 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -36,6 +36,7 @@
 #undef PCI_DEBUG
 #include <linux/proc_fs.h>
 #include <linux/export.h>
+#include <linux/sched.h>
 
 #include <asm/processor.h>
 #include <asm/sections.h>
@@ -176,6 +177,7 @@ void __init setup_arch(char **cmdline_p)
 	conswitchp = &dummy_con;	/* we use do_take_over_console() later ! */
 #endif
 
+	clear_sched_clock_stable();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 71cae73a5076..1bb253a6ee4d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -548,8 +548,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 	if (c->x86_power & (1 << 8)) {
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
-		if (!check_tsc_unstable())
-			set_sched_clock_stable();
+		if (check_tsc_unstable())
+			clear_sched_clock_stable();
+	} else {
+		clear_sched_clock_stable();
 	}
 
 	/* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 1661d8ec9280..2c234a6d94c4 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,5 +1,5 @@
-#include <linux/bitops.h>
-#include <linux/kernel.h>
+
+#include <linux/sched.h>
 
 #include <asm/cpufeature.h>
 #include <asm/e820.h>
@@ -104,6 +104,8 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_64
 	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
 #endif
+
+	clear_sched_clock_stable();
 }
 
 static void init_centaur(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index dc1697ca5191..3457186275a0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -83,6 +83,7 @@ static void default_init(struct cpuinfo_x86 *c)
 			strcpy(c->x86_model_id, "386");
 	}
 #endif
+	clear_sched_clock_stable();
 }
 
 static const struct cpu_dev default_cpu = {
@@ -1055,6 +1056,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	 */
 	if (this_cpu->c_init)
 		this_cpu->c_init(c);
+	else
+		clear_sched_clock_stable();
 
 	/* Disable the PN if appropriate */
 	squash_the_stupid_serial_number(c);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index bd9dcd6b712d..47416f959a48 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -9,6 +9,7 @@
 #include <asm/pci-direct.h>
 #include <asm/tsc.h>
 #include <asm/cpufeature.h>
+#include <linux/sched.h>
 
 #include "cpu.h"
 
@@ -183,6 +184,7 @@ static void early_init_cyrix(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
 		break;
 	}
+	clear_sched_clock_stable();
 }
 
 static void init_cyrix(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fcd484d2bb03..26eaff4907b2 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -124,8 +124,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 	if (c->x86_power & (1 << 8)) {
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
-		if (!check_tsc_unstable())
-			set_sched_clock_stable();
+		if (check_tsc_unstable())
+			clear_sched_clock_stable();
+	} else {
+		clear_sched_clock_stable();
 	}
 
 	/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 34178564be2a..c1ea5b999839 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,4 +1,5 @@
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
@@ -14,6 +15,8 @@ static void early_init_transmeta(struct cpuinfo_x86 *c)
 		if (xlvl >= 0x80860001)
 			c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001);
 	}
+
+	clear_sched_clock_stable();
 }
 
 static void init_transmeta(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2a5cafdf8808..542710b99f52 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -107,12 +107,12 @@ static inline void kvm_sched_clock_init(bool stable)
 {
 	if (!stable) {
 		pv_time_ops.sched_clock = kvm_clock_read;
+		clear_sched_clock_stable();
 		return;
 	}
 
 	kvm_sched_clock_offset = kvm_clock_read();
 	pv_time_ops.sched_clock = kvm_sched_clock_read;
-	set_sched_clock_stable();
 
 	printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
 			kvm_sched_clock_offset);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a8daed914eef..69e6852fede1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2547,7 +2547,6 @@ extern void sched_clock_init_late(void);
  * is reliable after all:
  */
 extern int sched_clock_stable(void);
-extern void set_sched_clock_stable(void);
 extern void clear_sched_clock_stable(void);
 
 extern void sched_clock_tick(void);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 7713b2b53f61..ad64efe41722 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -83,8 +83,15 @@ void sched_clock_init(void)
 }
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+/*
+ * We must start with !__sched_clock_stable because the unstable -> stable
+ * transition is accurate, while the stable -> unstable transition is not.
+ *
+ * Similarly we start with __sched_clock_stable_early, thereby assuming we
+ * will become stable, such that there's only a single 1 -> 0 transition.
+ */
 static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
-static int __sched_clock_stable_early;
+static int __sched_clock_stable_early = 1;
 
 /*
  * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset
@@ -132,24 +139,6 @@ static void __set_sched_clock_stable(void)
 	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
 }
 
-void set_sched_clock_stable(void)
-{
-	__sched_clock_stable_early = 1;
-
-	smp_mb(); /* matches sched_clock_init_late() */
-
-	/*
-	 * This really should only be called early (before
-	 * sched_clock_init_late()) when guestimating our sched_clock() is
-	 * solid.
-	 *
-	 * After that we test stability and we can negate our guess using
-	 * clear_sched_clock_stable, possibly from a watchdog.
-	 */
-	if (WARN_ON_ONCE(sched_clock_running == 2))
-		__set_sched_clock_stable();
-}
-
 static void __clear_sched_clock_stable(struct work_struct *work)
 {
 	struct sched_clock_data *scd = this_scd();
@@ -199,8 +188,6 @@ void sched_clock_init_late(void)
 
 	if (__sched_clock_stable_early)
 		__set_sched_clock_stable();
-	else
-		__clear_sched_clock_stable(NULL);
 }
 
 /*
-- 
cgit v1.2.3


From 06321dd2d1ae5b5bdc847958ab9e71d22a29a33e Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 19 Jan 2017 09:31:52 -0500
Subject: locking/rwsem: Remove unnecessary atomic_long_t casts

Since sem->count had been changed to a atomic_long_t type, it is no
longer necessary to use the atomic_long_t cast anymore. So remove them.

Signed-off-by: Waiman Long <longman@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Link: http://lkml.kernel.org/r/1484836312-6656-1-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/rwsem.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h
index 5be122e3d326..6c6a2141f271 100644
--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
@@ -33,7 +33,7 @@
  */
 static inline void __down_read(struct rw_semaphore *sem)
 {
-	if (unlikely(atomic_long_inc_return_acquire((atomic_long_t *)&sem->count) <= 0))
+	if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0))
 		rwsem_down_read_failed(sem);
 }
 
@@ -58,7 +58,7 @@ static inline void __down_write(struct rw_semaphore *sem)
 	long tmp;
 
 	tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
-				     (atomic_long_t *)&sem->count);
+					     &sem->count);
 	if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
 		rwsem_down_write_failed(sem);
 }
@@ -68,7 +68,7 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
 	long tmp;
 
 	tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
-				     (atomic_long_t *)&sem->count);
+					     &sem->count);
 	if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
 		if (IS_ERR(rwsem_down_write_failed_killable(sem)))
 			return -EINTR;
@@ -91,7 +91,7 @@ static inline void __up_read(struct rw_semaphore *sem)
 {
 	long tmp;
 
-	tmp = atomic_long_dec_return_release((atomic_long_t *)&sem->count);
+	tmp = atomic_long_dec_return_release(&sem->count);
 	if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
 		rwsem_wake(sem);
 }
@@ -102,7 +102,7 @@ static inline void __up_read(struct rw_semaphore *sem)
 static inline void __up_write(struct rw_semaphore *sem)
 {
 	if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
-				 (atomic_long_t *)&sem->count) < 0))
+						    &sem->count) < 0))
 		rwsem_wake(sem);
 }
 
@@ -120,8 +120,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 	 * read-locked region is ok to be re-ordered into the
 	 * write side. As such, rely on RELEASE semantics.
 	 */
-	tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS,
-				     (atomic_long_t *)&sem->count);
+	tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count);
 	if (tmp < 0)
 		rwsem_downgrade_wake(sem);
 }
-- 
cgit v1.2.3


From 9f8197980d87a28ec3d0b3b986f770e7e7878485 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Thu, 19 Jan 2017 10:59:13 +0000
Subject: delay: Add explanation of udelay() inaccuracy

There seems to be some misunderstanding that udelay() and friends will
always guarantee the specified delay.  This is a false understanding.
When udelay() is based on CPU cycles, it can return early for many
reasons which are detailed by Linus' reply to me in a thread in 2011:

  http://lists.openwall.net/linux-kernel/2011/01/12/372

However, a udelay test module was created in 2014 which allows udelay()
to only be 0.5% fast, which is outside of the CPU-cycles udelay()
results I measured back in 2011, which were deemed to be in the "we
don't care" region.

test_udelay() should be fixed to reflect the real allowable tolerance
on udelay(), rather than 0.5%.

Cc: David Riley <davidriley@chromium.org>
Cc: John Stultz <john.stultz@linaro.org>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/delay.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/delay.h b/include/linux/delay.h
index a6ecb34cf547..2ecb3c46b20a 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -5,6 +5,17 @@
  * Copyright (C) 1993 Linus Torvalds
  *
  * Delay routines, using a pre-computed "loops_per_jiffy" value.
+ *
+ * Please note that ndelay(), udelay() and mdelay() may return early for
+ * several reasons:
+ *  1. computed loops_per_jiffy too low (due to the time taken to
+ *     execute the timer interrupt.)
+ *  2. cache behaviour affecting the time it takes to execute the
+ *     loop function.
+ *  3. CPU clock rate changes.
+ *
+ * Please see this thread:
+ *   http://lists.openwall.net/linux-kernel/2011/01/09/56
  */
 
 #include <linux/kernel.h>
-- 
cgit v1.2.3


From bcc9a76d5ac426bc45c9e863b1830347827ca77a Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Sat, 21 Jan 2017 21:33:35 -0500
Subject: locking/rwsem: Reinit wake_q after use

In __rwsem_down_write_failed_common(), the same wake_q variable name
is defined twice, with the inner wake_q hiding the one in outer scope.
We can either use different names for the two wake_q's.

Even better, we can use the same wake_q twice, if necessary.

To enable the latter change, we need to define a new helper function
wake_q_init() to enable reinitalization of wake_q after use.

Signed-off-by: Waiman Long <longman@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1485052415-9611-1-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h       | 6 ++++++
 kernel/locking/rwsem-xadd.c | 7 +++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f4f9d32f12b3..4e62b378bd65 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1021,6 +1021,12 @@ struct wake_q_head {
 #define DEFINE_WAKE_Q(name)				\
 	struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
 
+static inline void wake_q_init(struct wake_q_head *head)
+{
+	head->first = WAKE_Q_TAIL;
+	head->lastp = &head->first;
+}
+
 extern void wake_q_add(struct wake_q_head *head,
 		       struct task_struct *task);
 extern void wake_up_q(struct wake_q_head *head);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a3a381aee24b..2ad8d8dc3bb1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -502,8 +502,6 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 		 * wake any read locks that were queued ahead of us.
 		 */
 		if (count > RWSEM_WAITING_BIAS) {
-			DEFINE_WAKE_Q(wake_q);
-
 			__rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
 			/*
 			 * The wakeup is normally called _after_ the wait_lock
@@ -513,6 +511,11 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 			 * for attempting rwsem_try_write_lock().
 			 */
 			wake_up_q(&wake_q);
+
+			/*
+			 * Reinitialize wake_q after use.
+			 */
+			wake_q_init(&wake_q);
 		}
 
 	} else
-- 
cgit v1.2.3


From b18b6a9cef7f30e9a8b7738d5fc8d568cf660855 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Sat, 21 Jan 2017 00:09:08 -0500
Subject: timers: Omit POSIX timer stuff from task_struct when disabled

When CONFIG_POSIX_TIMERS is disabled, it is preferable to remove related
structures from struct task_struct and struct signal_struct as they
won't contain anything useful and shouldn't be relied upon by mistake.
Code still referencing those structures is also disabled here.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 fs/proc/base.c            |  4 ++--
 include/linux/init_task.h | 40 +++++++++++++++++++++++++---------------
 include/linux/sched.h     | 13 ++++++++++---
 kernel/fork.c             | 10 +++++++++-
 kernel/sched/rt.c         |  4 ++++
 kernel/sched/stats.h      | 32 ++++++++++++++++++++------------
 6 files changed, 70 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8e7e61b28f31..03deeac1e835 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2179,7 +2179,7 @@ static const struct file_operations proc_map_files_operations = {
 	.llseek		= generic_file_llseek,
 };
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
 struct timers_private {
 	struct pid *pid;
 	struct task_struct *task;
@@ -2936,7 +2936,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
 	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
 #endif
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
 	REG("timers",	  S_IRUGO, proc_timers_operations),
 #endif
 	REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 325f649d77ff..3a85d61f7614 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -42,6 +42,27 @@ extern struct fs_struct init_fs;
 #define INIT_PREV_CPUTIME(x)
 #endif
 
+#ifdef CONFIG_POSIX_TIMERS
+#define INIT_POSIX_TIMERS(s)						\
+	.posix_timers = LIST_HEAD_INIT(s.posix_timers),
+#define INIT_CPU_TIMERS(s)						\
+	.cpu_timers = {							\
+		LIST_HEAD_INIT(s.cpu_timers[0]),			\
+		LIST_HEAD_INIT(s.cpu_timers[1]),			\
+		LIST_HEAD_INIT(s.cpu_timers[2]),								\
+	},
+#define INIT_CPUTIMER(s)						\
+	.cputimer	= { 						\
+		.cputime_atomic	= INIT_CPUTIME_ATOMIC,			\
+		.running	= false,				\
+		.checking_timer = false,				\
+	},
+#else
+#define INIT_POSIX_TIMERS(s)
+#define INIT_CPU_TIMERS(s)
+#define INIT_CPUTIMER(s)
+#endif
+
 #define INIT_SIGNALS(sig) {						\
 	.nr_threads	= 1,						\
 	.thread_head	= LIST_HEAD_INIT(init_task.thread_node),	\
@@ -49,14 +70,10 @@ extern struct fs_struct init_fs;
 	.shared_pending	= { 						\
 		.list = LIST_HEAD_INIT(sig.shared_pending.list),	\
 		.signal =  {{0}}},					\
-	.posix_timers	 = LIST_HEAD_INIT(sig.posix_timers),		\
-	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
+	INIT_POSIX_TIMERS(sig)						\
+	INIT_CPU_TIMERS(sig)						\
 	.rlim		= INIT_RLIMITS,					\
-	.cputimer	= { 						\
-		.cputime_atomic	= INIT_CPUTIME_ATOMIC,			\
-		.running	= false,				\
-		.checking_timer = false,				\
-	},								\
+	INIT_CPUTIMER(sig)						\
 	INIT_PREV_CPUTIME(sig)						\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
@@ -247,7 +264,7 @@ extern struct task_group root_task_group;
 	.blocked	= {{0}},					\
 	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
 	.journal_info	= NULL,						\
-	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
+	INIT_CPU_TIMERS(tsk)						\
 	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),	\
 	.timer_slack_ns = 50000, /* 50 usec default slack */		\
 	.pids = {							\
@@ -274,13 +291,6 @@ extern struct task_group root_task_group;
 }
 
 
-#define INIT_CPU_TIMERS(cpu_timers)					\
-{									\
-	LIST_HEAD_INIT(cpu_timers[0]),					\
-	LIST_HEAD_INIT(cpu_timers[1]),					\
-	LIST_HEAD_INIT(cpu_timers[2]),					\
-}
-
 /* Attach to the init_task data structure for proper alignment */
 #define __init_task_data __attribute__((__section__(".data..init_task")))
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4d1905245c7a..e8f6af59726d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -734,13 +734,14 @@ struct signal_struct {
 	unsigned int		is_child_subreaper:1;
 	unsigned int		has_child_subreaper:1;
 
+#ifdef CONFIG_POSIX_TIMERS
+
 	/* POSIX.1b Interval Timers */
 	int			posix_timer_id;
 	struct list_head	posix_timers;
 
 	/* ITIMER_REAL timer for the process */
 	struct hrtimer real_timer;
-	struct pid *leader_pid;
 	ktime_t it_real_incr;
 
 	/*
@@ -759,12 +760,16 @@ struct signal_struct {
 	/* Earliest-expiration cache. */
 	struct task_cputime cputime_expires;
 
+	struct list_head cpu_timers[3];
+
+#endif
+
+	struct pid *leader_pid;
+
 #ifdef CONFIG_NO_HZ_FULL
 	atomic_t tick_dep_mask;
 #endif
 
-	struct list_head cpu_timers[3];
-
 	struct pid *tty_old_pgrp;
 
 	/* boolean value for session group leader */
@@ -1681,8 +1686,10 @@ struct task_struct {
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt;
 
+#ifdef CONFIG_POSIX_TIMERS
 	struct task_cputime cputime_expires;
 	struct list_head cpu_timers[3];
+#endif
 
 /* process credentials */
 	const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8ab827c..105c6676d93b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1304,6 +1304,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
 	}
 }
 
+#ifdef CONFIG_POSIX_TIMERS
 /*
  * Initialize POSIX timer handling for a thread group.
  */
@@ -1322,6 +1323,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
 	INIT_LIST_HEAD(&sig->cpu_timers[2]);
 }
+#else
+static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
+#endif
 
 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 {
@@ -1346,11 +1350,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	init_waitqueue_head(&sig->wait_chldexit);
 	sig->curr_target = tsk;
 	init_sigpending(&sig->shared_pending);
-	INIT_LIST_HEAD(&sig->posix_timers);
 	seqlock_init(&sig->stats_lock);
 	prev_cputime_init(&sig->prev_cputime);
 
 #ifdef CONFIG_POSIX_TIMERS
+	INIT_LIST_HEAD(&sig->posix_timers);
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	sig->real_timer.function = it_real_fn;
 #endif
@@ -1425,6 +1429,7 @@ static void rt_mutex_init_task(struct task_struct *p)
 #endif
 }
 
+#ifdef CONFIG_POSIX_TIMERS
 /*
  * Initialize POSIX timer handling for a single task.
  */
@@ -1437,6 +1442,9 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
 	INIT_LIST_HEAD(&tsk->cpu_timers[1]);
 	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 }
+#else
+static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
+#endif
 
 static inline void
 init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2516b8df6dbb..a688a8206727 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2246,6 +2246,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 	}
 }
 
+#ifdef CONFIG_POSIX_TIMERS
 static void watchdog(struct rq *rq, struct task_struct *p)
 {
 	unsigned long soft, hard;
@@ -2267,6 +2268,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
 	}
 }
+#else
+static inline void watchdog(struct rq *rq, struct task_struct *p) { }
+#endif
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 34659a853505..c69a9870ab79 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -172,18 +172,19 @@ sched_info_switch(struct rq *rq,
  */
 
 /**
- * cputimer_running - return true if cputimer is running
+ * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running
  *
  * @tsk:	Pointer to target task.
  */
-static inline bool cputimer_running(struct task_struct *tsk)
-
+#ifdef CONFIG_POSIX_TIMERS
+static inline
+struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
 	/* Check if cputimer isn't running. This is accessed without locking. */
 	if (!READ_ONCE(cputimer->running))
-		return false;
+		return NULL;
 
 	/*
 	 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
@@ -200,10 +201,17 @@ static inline bool cputimer_running(struct task_struct *tsk)
 	 * clock delta is behind the expiring timer value.
 	 */
 	if (unlikely(!tsk->sighand))
-		return false;
+		return NULL;
 
-	return true;
+	return cputimer;
+}
+#else
+static inline
+struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
+{
+	return NULL;
 }
+#endif
 
 /**
  * account_group_user_time - Maintain utime for a thread group.
@@ -218,9 +226,9 @@ static inline bool cputimer_running(struct task_struct *tsk)
 static inline void account_group_user_time(struct task_struct *tsk,
 					   cputime_t cputime)
 {
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
 
-	if (!cputimer_running(tsk))
+	if (!cputimer)
 		return;
 
 	atomic64_add(cputime, &cputimer->cputime_atomic.utime);
@@ -239,9 +247,9 @@ static inline void account_group_user_time(struct task_struct *tsk,
 static inline void account_group_system_time(struct task_struct *tsk,
 					     cputime_t cputime)
 {
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
 
-	if (!cputimer_running(tsk))
+	if (!cputimer)
 		return;
 
 	atomic64_add(cputime, &cputimer->cputime_atomic.stime);
@@ -260,9 +268,9 @@ static inline void account_group_system_time(struct task_struct *tsk,
 static inline void account_group_exec_runtime(struct task_struct *tsk,
 					      unsigned long long ns)
 {
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
 
-	if (!cputimer_running(tsk))
+	if (!cputimer)
 		return;
 
 	atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
-- 
cgit v1.2.3


From 058fe1c0440e68a1ba3c2270ae43e9f0298b27d8 Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros <davidcc@google.com>
Date: Wed, 18 Jan 2017 11:24:53 -0800
Subject: perf/core: Make cgroup switch visit only cpuctxs with cgroup events

This patch follows from a conversation in CQM/CMT's last series about
speeding up the context switch for cgroup events:

  https://patchwork.kernel.org/patch/9478617/

This is a low-hanging fruit optimization. It replaces the iteration over
the "pmus" list in cgroup switch by an iteration over a new list that
contains only cpuctxs with at least one cgroup event.

This is necessary because the number of PMUs have increased over the years
e.g modern x86 server systems have well above 50 PMUs.

The iteration over the full PMU list is unneccessary and can be costly in
heavy cache contention scenarios.

Below are some instrumentation measurements with 10, 50 and 90 percentiles
of the total cost of context switch before and after this optimization for
a simple array read/write microbenchark.

  Contention
    Level    Nr events      Before (us)            After (us)       Median
  L2    L3     types      (10%, 50%, 90%)       (10%, 50%, 90%     Speedup
  --------------------------------------------------------------------------
  Low   Low       1       (1.72, 2.42, 5.85)    (1.35, 1.64, 5.46)     29%
  High  Low       1       (2.08, 4.56, 19.8)    (1720, 2.20, 13.7)     51%
  High  High      1       (2.86, 10.4, 12.7)    (2.54, 4.32, 12.1)     58%

  Low   Low       2       (1.98, 3.20, 6.89)    (1.68, 2.41, 8.89)     24%
  High  Low       2       (2.48, 5.28, 22.4)    (2150, 3.69, 14.6)     30%
  High  High      2       (3.32, 8.09, 13.9)    (2.80, 5.15, 13.7)     36%

where:

  1 event type  = cycles
  2 event types = cycles,intel_cqm/llc_occupancy/

   Contention L2 Low: workset  <  L2 cache size.
                 High:  "     >>  L2   "     " .
   Contention L3 Low: workset of task on all sockets  <  L3 cache size.
                 High:   "     "   "   "   "    "    >>  L3   "     " .

   Median Speedup is (50%ile Before - 50%ile After) /  50%ile Before

Unsurprisingly, the benefits of this optimization decrease with the number
of cpuctxs with a cgroup events, yet, is never detrimental.

Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Cc: Vince Weaver <vince@deater.net>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/20170118192454.58008-2-davidcc@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 98 +++++++++++++++++++++-------------------------
 2 files changed, 46 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 78ed8105e64d..dfa725723f28 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -788,6 +788,7 @@ struct perf_cpu_context {
 	struct pmu			*unique_pmu;
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
+	struct list_head		cgrp_cpuctx_entry;
 #endif
 
 	struct list_head		sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5aaa806702d..928a818d912e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -678,6 +678,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	info->timestamp = ctx->timestamp;
 }
 
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
 #define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
 #define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
 
@@ -690,61 +692,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 static void perf_cgroup_switch(struct task_struct *task, int mode)
 {
 	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
+	struct list_head *list;
 	unsigned long flags;
 
 	/*
-	 * disable interrupts to avoid geting nr_cgroup
-	 * changes via __perf_event_disable(). Also
-	 * avoids preemption.
+	 * Disable interrupts and preemption to avoid this CPU's
+	 * cgrp_cpuctx_entry to change under us.
 	 */
 	local_irq_save(flags);
 
-	/*
-	 * we reschedule only in the presence of cgroup
-	 * constrained events.
-	 */
+	list = this_cpu_ptr(&cgrp_cpuctx_list);
+	list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			continue; /* ensure we process each cpuctx once */
+		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+		perf_pmu_disable(cpuctx->ctx.pmu);
 
-		/*
-		 * perf_cgroup_events says at least one
-		 * context on this CPU has cgroup events.
-		 *
-		 * ctx->nr_cgroups reports the number of cgroup
-		 * events for a context.
-		 */
-		if (cpuctx->ctx.nr_cgroups > 0) {
-			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-			perf_pmu_disable(cpuctx->ctx.pmu);
-
-			if (mode & PERF_CGROUP_SWOUT) {
-				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-				/*
-				 * must not be done before ctxswout due
-				 * to event_filter_match() in event_sched_out()
-				 */
-				cpuctx->cgrp = NULL;
-			}
+		if (mode & PERF_CGROUP_SWOUT) {
+			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+			/*
+			 * must not be done before ctxswout due
+			 * to event_filter_match() in event_sched_out()
+			 */
+			cpuctx->cgrp = NULL;
+		}
 
-			if (mode & PERF_CGROUP_SWIN) {
-				WARN_ON_ONCE(cpuctx->cgrp);
-				/*
-				 * set cgrp before ctxsw in to allow
-				 * event_filter_match() to not have to pass
-				 * task around
-				 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-				 * because cgorup events are only per-cpu
-				 */
-				cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
-				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
-			}
-			perf_pmu_enable(cpuctx->ctx.pmu);
-			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		if (mode & PERF_CGROUP_SWIN) {
+			WARN_ON_ONCE(cpuctx->cgrp);
+			/*
+			 * set cgrp before ctxsw in to allow
+			 * event_filter_match() to not have to pass
+			 * task around
+			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
+			 * because cgorup events are only per-cpu
+			 */
+			cpuctx->cgrp = perf_cgroup_from_task(task,
+							     &cpuctx->ctx);
+			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 		}
+		perf_pmu_enable(cpuctx->ctx.pmu);
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
 
 	local_irq_restore(flags);
@@ -889,6 +876,7 @@ list_update_cgroup_event(struct perf_event *event,
 			 struct perf_event_context *ctx, bool add)
 {
 	struct perf_cpu_context *cpuctx;
+	struct list_head *cpuctx_entry;
 
 	if (!is_cgroup_event(event))
 		return;
@@ -902,15 +890,16 @@ list_update_cgroup_event(struct perf_event *event,
 	 * this will always be called from the right CPU.
 	 */
 	cpuctx = __get_cpu_context(ctx);
-
-	/*
-	 * cpuctx->cgrp is NULL until a cgroup event is sched in or
-	 * ctx->nr_cgroup == 0 .
-	 */
-	if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
-		cpuctx->cgrp = event->cgrp;
-	else if (!add)
+	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+	/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
+	if (add) {
+		list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+		if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+			cpuctx->cgrp = event->cgrp;
+	} else {
+		list_del(cpuctx_entry);
 		cpuctx->cgrp = NULL;
+	}
 }
 
 #else /* !CONFIG_CGROUP_PERF */
@@ -10709,6 +10698,9 @@ static void __init perf_event_init_all_cpus(void)
 		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
 		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 
+#ifdef CONFIG_CGROUP_PERF
+		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
+#endif
 		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
-- 
cgit v1.2.3


From 1fd7e416995401ec082fc0fe6090a223969beda5 Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros <davidcc@google.com>
Date: Wed, 18 Jan 2017 11:24:54 -0800
Subject: perf/core: Remove perf_cpu_context::unique_pmu

cpuctx->unique_pmu was originally introduced as a way to identify cpuctxs
with shared pmus in order to avoid visiting the same cpuctx more than once
in a for_each_pmu loop.

cpuctx->unique_pmu == cpuctx->pmu in non-software task contexts since they
have only one pmu per cpuctx. Since perf_pmu_sched_task() is only called in
hw contexts, this patch replaces cpuctx->unique_pmu by cpuctx->pmu in it.

The change above, together with the previous patch in this series, removed
the remaining uses of cpuctx->unique_pmu, so we remove it altogether.

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Cc: Vince Weaver <vince@deater.net>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/20170118192454.58008-3-davidcc@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  1 -
 kernel/events/core.c       | 31 +------------------------------
 2 files changed, 1 insertion(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dfa725723f28..5c58e93c130c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -785,7 +785,6 @@ struct perf_cpu_context {
 	ktime_t				hrtimer_interval;
 	unsigned int			hrtimer_active;
 
-	struct pmu			*unique_pmu;
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
 	struct list_head		cgrp_cpuctx_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 928a818d912e..d92c7ad7715d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2932,7 +2932,7 @@ static void perf_pmu_sched_task(struct task_struct *prev,
 		return;
 
 	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
-		pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
+		pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
 
 		if (WARN_ON_ONCE(!pmu->sched_task))
 			continue;
@@ -8636,37 +8636,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 	return NULL;
 }
 
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct perf_cpu_context *cpuctx;
-
-		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
-		if (cpuctx->unique_pmu == old_pmu)
-			cpuctx->unique_pmu = pmu;
-	}
-}
-
 static void free_pmu_context(struct pmu *pmu)
 {
-	struct pmu *i;
-
 	mutex_lock(&pmus_lock);
-	/*
-	 * Like a real lame refcount.
-	 */
-	list_for_each_entry(i, &pmus, entry) {
-		if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
-			update_pmu_context(i, pmu);
-			goto out;
-		}
-	}
-
 	free_percpu(pmu->pmu_cpu_context);
-out:
 	mutex_unlock(&pmus_lock);
 }
 
@@ -8870,8 +8843,6 @@ skip_type:
 		cpuctx->ctx.pmu = pmu;
 
 		__perf_mux_hrtimer_init(cpuctx, cpu);
-
-		cpuctx->unique_pmu = pmu;
 	}
 
 got_cpu_context:
-- 
cgit v1.2.3