summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/acpi.h36
-rw-r--r--include/linux/acpi_apmt.h19
-rw-r--r--include/linux/arm_ffa.h85
-rw-r--r--include/linux/bcm47xx_nvram.h6
-rw-r--r--include/linux/bitmap.h13
-rw-r--r--include/linux/bitops.h30
-rw-r--r--include/linux/blk-mq.h3
-rw-r--r--include/linux/blkdev.h22
-rw-r--r--include/linux/bma150.h4
-rw-r--r--include/linux/bpf.h50
-rw-r--r--include/linux/buffer_head.h48
-rw-r--r--include/linux/cache.h13
-rw-r--r--include/linux/can/dev.h16
-rw-r--r--include/linux/can/platform/sja1000.h2
-rw-r--r--include/linux/ceph/messenger.h4
-rw-r--r--include/linux/cgroup-defs.h3
-rw-r--r--include/linux/cgroup.h21
-rw-r--r--include/linux/clk-provider.h18
-rw-r--r--include/linux/clk.h2
-rw-r--r--include/linux/clk/at91_pmc.h6
-rw-r--r--include/linux/clk/spear.h14
-rw-r--r--include/linux/compat.h2
-rw-r--r--include/linux/compiler-clang.h23
-rw-r--r--include/linux/compiler-gcc.h6
-rw-r--r--include/linux/compiler_types.h3
-rw-r--r--include/linux/console.h129
-rw-r--r--include/linux/counter.h5
-rw-r--r--include/linux/cpufreq.h28
-rw-r--r--include/linux/cpumask.h113
-rw-r--r--include/linux/damon.h92
-rw-r--r--include/linux/dcache.h3
-rw-r--r--include/linux/delayacct.h16
-rw-r--r--include/linux/devfreq.h7
-rw-r--r--include/linux/device.h8
-rw-r--r--include/linux/dma-iommu.h93
-rw-r--r--include/linux/dsa/tag_qca.h8
-rw-r--r--include/linux/efi.h6
-rw-r--r--include/linux/entry-common.h1
-rw-r--r--include/linux/export-internal.h6
-rw-r--r--include/linux/f2fs_fs.h40
-rw-r--r--include/linux/fault-inject.h7
-rw-r--r--include/linux/fb.h2
-rw-r--r--include/linux/find.h310
-rw-r--r--include/linux/fortify-string.h32
-rw-r--r--include/linux/fs.h44
-rw-r--r--include/linux/fscache.h2
-rw-r--r--include/linux/fscrypt.h4
-rw-r--r--include/linux/ftrace.h88
-rw-r--r--include/linux/gameport.h2
-rw-r--r--include/linux/gfp.h44
-rw-r--r--include/linux/gpio/driver.h2
-rw-r--r--include/linux/highmem.h3
-rw-r--r--include/linux/hisi_acc_qm.h63
-rw-r--r--include/linux/huge_mm.h28
-rw-r--r--include/linux/hugetlb.h71
-rw-r--r--include/linux/hugetlb_cgroup.h19
-rw-r--r--include/linux/hw_random.h3
-rw-r--r--include/linux/hyperv.h4
-rw-r--r--include/linux/init.h2
-rw-r--r--include/linux/input/auo-pixcir-ts.h44
-rw-r--r--include/linux/instrumented.h59
-rw-r--r--include/linux/io-pgtable.h3
-rw-r--r--include/linux/io_uring.h3
-rw-r--r--include/linux/io_uring_types.h5
-rw-r--r--include/linux/iommu.h49
-rw-r--r--include/linux/ioport.h5
-rw-r--r--include/linux/iova.h2
-rw-r--r--include/linux/iova_bitmap.h26
-rw-r--r--include/linux/ipc_namespace.h5
-rw-r--r--include/linux/irqchip.h4
-rw-r--r--include/linux/irqdesc.h1
-rw-r--r--include/linux/irqdomain.h143
-rw-r--r--include/linux/irqdomain_defs.h31
-rw-r--r--include/linux/irqreturn.h8
-rw-r--r--include/linux/iversion.h72
-rw-r--r--include/linux/kasan.h56
-rw-r--r--include/linux/kexec.h2
-rw-r--r--include/linux/khugepaged.h13
-rw-r--r--include/linux/kmsan-checks.h83
-rw-r--r--include/linux/kmsan.h330
-rw-r--r--include/linux/kmsan_string.h21
-rw-r--r--include/linux/kmsan_types.h35
-rw-r--r--include/linux/ksm.h3
-rw-r--r--include/linux/kvm_host.h29
-rw-r--r--include/linux/libnvdimm.h7
-rw-r--r--include/linux/maple_tree.h692
-rw-r--r--include/linux/mdev.h77
-rw-r--r--include/linux/memcontrol.h99
-rw-r--r--include/linux/memory-tiers.h102
-rw-r--r--include/linux/memory_hotplug.h30
-rw-r--r--include/linux/mempolicy.h13
-rw-r--r--include/linux/memregion.h38
-rw-r--r--include/linux/memremap.h1
-rw-r--r--include/linux/migrate.h30
-rw-r--r--include/linux/mlx5/driver.h3
-rw-r--r--include/linux/mm.h190
-rw-r--r--include/linux/mm_inline.h242
-rw-r--r--include/linux/mm_types.h178
-rw-r--r--include/linux/mm_types_task.h12
-rw-r--r--include/linux/mmc/card.h1
-rw-r--r--include/linux/mmc/mmc.h2
-rw-r--r--include/linux/mmzone.h281
-rw-r--r--include/linux/msi.h357
-rw-r--r--include/linux/msi_api.h73
-rw-r--r--include/linux/net.h1
-rw-r--r--include/linux/node.h29
-rw-r--r--include/linux/nodemask.h17
-rw-r--r--include/linux/nsproxy.h1
-rw-r--r--include/linux/of.h4
-rw-r--r--include/linux/of_irq.h6
-rw-r--r--include/linux/oom.h11
-rw-r--r--include/linux/overflow.h38
-rw-r--r--include/linux/page-flags-layout.h16
-rw-r--r--include/linux/page-flags.h4
-rw-r--r--include/linux/page_counter.h26
-rw-r--r--include/linux/page_ext.h24
-rw-r--r--include/linux/page_idle.h34
-rw-r--r--include/linux/pageblock-flags.h4
-rw-r--r--include/linux/pagemap.h14
-rw-r--r--include/linux/pagewalk.h10
-rw-r--r--include/linux/pci.h52
-rw-r--r--include/linux/percpu.h2
-rw-r--r--include/linux/percpu_counter.h32
-rw-r--r--include/linux/perf/arm_pmu.h3
-rw-r--r--include/linux/perf_event.h144
-rw-r--r--include/linux/pgtable.h44
-rw-r--r--include/linux/phylink.h2
-rw-r--r--include/linux/platform_data/adp5588.h171
-rw-r--r--include/linux/platform_data/gpmc-omap.h8
-rw-r--r--include/linux/platform_data/st33zp24.h16
-rw-r--r--include/linux/pm.h37
-rw-r--r--include/linux/pm_domain.h7
-rw-r--r--include/linux/pm_runtime.h20
-rw-r--r--include/linux/prandom.h12
-rw-r--r--include/linux/psi.h12
-rw-r--r--include/linux/psi_types.h35
-rw-r--r--include/linux/pstore_ram.h99
-rw-r--r--include/linux/random.h5
-rw-r--r--include/linux/rcupdate.h14
-rw-r--r--include/linux/rcutiny.h8
-rw-r--r--include/linux/rcutree.h4
-rw-r--r--include/linux/reboot.h8
-rw-r--r--include/linux/resctrl.h6
-rw-r--r--include/linux/ring_buffer.h4
-rw-r--r--include/linux/rmap.h73
-rw-r--r--include/linux/sched.h24
-rw-r--r--include/linux/sched/coredump.h7
-rw-r--r--include/linux/sched/sysctl.h1
-rw-r--r--include/linux/sched/task.h3
-rw-r--r--include/linux/scs.h18
-rw-r--r--include/linux/seccomp.h1
-rw-r--r--include/linux/serial_core.h10
-rw-r--r--include/linux/shmem_fs.h16
-rw-r--r--include/linux/skbuff.h2
-rw-r--r--include/linux/skmsg.h2
-rw-r--r--include/linux/slab.h106
-rw-r--r--include/linux/slab_def.h3
-rw-r--r--include/linux/slub_def.h8
-rw-r--r--include/linux/soc/mediatek/mtk-mmsys.h7
-rw-r--r--include/linux/soc/qcom/llcc-qcom.h12
-rw-r--r--include/linux/spi/spi-mem.h2
-rw-r--r--include/linux/srcu.h72
-rw-r--r--include/linux/srcutree.h5
-rw-r--r--include/linux/stackdepot.h8
-rw-r--r--include/linux/sunrpc/clnt.h1
-rw-r--r--include/linux/sunrpc/sched.h6
-rw-r--r--include/linux/swap.h41
-rw-r--r--include/linux/swap_cgroup.h4
-rw-r--r--include/linux/swapfile.h7
-rw-r--r--include/linux/swapops.h152
-rw-r--r--include/linux/syscalls.h2
-rw-r--r--include/linux/thermal.h11
-rw-r--r--include/linux/time_namespace.h6
-rw-r--r--include/linux/timer.h35
-rw-r--r--include/linux/timerqueue.h2
-rw-r--r--include/linux/trace.h4
-rw-r--r--include/linux/trace_events.h1
-rw-r--r--include/linux/uaccess.h19
-rw-r--r--include/linux/user_events.h15
-rw-r--r--include/linux/userfaultfd_k.h13
-rw-r--r--include/linux/utsname.h1
-rw-r--r--include/linux/vdpa.h1
-rw-r--r--include/linux/vfio.h58
-rw-r--r--include/linux/vfio_pci_core.h149
-rw-r--r--include/linux/virtio_pci_legacy.h2
-rw-r--r--include/linux/vm_event_item.h4
-rw-r--r--include/linux/vmacache.h28
-rw-r--r--include/linux/vmstat.h6
-rw-r--r--include/linux/wireless.h10
-rw-r--r--include/linux/writeback.h8
190 files changed, 5205 insertions, 1939 deletions
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 729cff1ee3f8..5e6a876e17ba 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -498,7 +498,7 @@ bool acpi_dev_resource_address_space(struct acpi_resource *ares,
struct resource_win *win);
bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares,
struct resource_win *win);
-unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable);
+unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable, u8 wake_capable);
unsigned int acpi_dev_get_irq_type(int triggering, int polarity);
bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index,
struct resource *res);
@@ -586,6 +586,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
#define OSC_SB_CPC_FLEXIBLE_ADR_SPACE 0x00004000
#define OSC_SB_NATIVE_USB4_SUPPORT 0x00040000
#define OSC_SB_PRM_SUPPORT 0x00200000
+#define OSC_SB_FFH_OPR_SUPPORT 0x00400000
extern bool osc_sb_apei_support_acked;
extern bool osc_pc_lpi_support_confirmed;
@@ -1136,6 +1137,7 @@ int acpi_subsys_freeze(struct device *dev);
int acpi_subsys_poweroff(struct device *dev);
void acpi_ec_mark_gpe_for_wake(void);
void acpi_ec_set_gpe_wake_mask(u8 action);
+int acpi_subsys_restore_early(struct device *dev);
#else
static inline int acpi_subsys_prepare(struct device *dev) { return 0; }
static inline void acpi_subsys_complete(struct device *dev) {}
@@ -1144,6 +1146,7 @@ static inline int acpi_subsys_suspend_noirq(struct device *dev) { return 0; }
static inline int acpi_subsys_suspend(struct device *dev) { return 0; }
static inline int acpi_subsys_freeze(struct device *dev) { return 0; }
static inline int acpi_subsys_poweroff(struct device *dev) { return 0; }
+static inline int acpi_subsys_restore_early(struct device *dev) { return 0; }
static inline void acpi_ec_mark_gpe_for_wake(void) {}
static inline void acpi_ec_set_gpe_wake_mask(u8 action) {}
#endif
@@ -1211,7 +1214,8 @@ bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
struct acpi_resource_gpio **agpio);
bool acpi_gpio_get_io_resource(struct acpi_resource *ares,
struct acpi_resource_gpio **agpio);
-int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, int index);
+int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, int index,
+ bool *wake_capable);
#else
static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
struct acpi_resource_gpio **agpio)
@@ -1223,16 +1227,28 @@ static inline bool acpi_gpio_get_io_resource(struct acpi_resource *ares,
{
return false;
}
-static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev,
- const char *name, int index)
+static inline int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name,
+ int index, bool *wake_capable)
{
return -ENXIO;
}
#endif
+static inline int acpi_dev_gpio_irq_wake_get(struct acpi_device *adev, int index,
+ bool *wake_capable)
+{
+ return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, wake_capable);
+}
+
+static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name,
+ int index)
+{
+ return acpi_dev_gpio_irq_wake_get_by(adev, name, index, NULL);
+}
+
static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
{
- return acpi_dev_gpio_irq_get_by(adev, NULL, index);
+ return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, NULL);
}
/* Device properties */
@@ -1475,6 +1491,16 @@ void acpi_init_pcc(void);
static inline void acpi_init_pcc(void) { }
#endif
+#ifdef CONFIG_ACPI_FFH
+void acpi_init_ffh(void);
+extern int acpi_ffh_address_space_arch_setup(void *handler_ctxt,
+ void **region_ctxt);
+extern int acpi_ffh_address_space_arch_handler(acpi_integer *value,
+ void *region_context);
+#else
+static inline void acpi_init_ffh(void) { }
+#endif
+
#ifdef CONFIG_ACPI
extern void acpi_device_notify(struct device *dev);
extern void acpi_device_notify_remove(struct device *dev);
diff --git a/include/linux/acpi_apmt.h b/include/linux/acpi_apmt.h
new file mode 100644
index 000000000000..40bd634d082f
--- /dev/null
+++ b/include/linux/acpi_apmt.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * ARM CoreSight PMU driver.
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ */
+
+#ifndef __ACPI_APMT_H__
+#define __ACPI_APMT_H__
+
+#include <linux/acpi.h>
+
+#ifdef CONFIG_ACPI_APMT
+void acpi_apmt_init(void);
+#else
+static inline void acpi_apmt_init(void) { }
+#endif /* CONFIG_ACPI_APMT */
+
+#endif /* __ACPI_APMT_H__ */
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 5f02d2e6b9d9..c87aeecaa9b2 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -11,6 +11,89 @@
#include <linux/types.h>
#include <linux/uuid.h>
+#define FFA_SMC(calling_convention, func_num) \
+ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, (calling_convention), \
+ ARM_SMCCC_OWNER_STANDARD, (func_num))
+
+#define FFA_SMC_32(func_num) FFA_SMC(ARM_SMCCC_SMC_32, (func_num))
+#define FFA_SMC_64(func_num) FFA_SMC(ARM_SMCCC_SMC_64, (func_num))
+
+#define FFA_ERROR FFA_SMC_32(0x60)
+#define FFA_SUCCESS FFA_SMC_32(0x61)
+#define FFA_INTERRUPT FFA_SMC_32(0x62)
+#define FFA_VERSION FFA_SMC_32(0x63)
+#define FFA_FEATURES FFA_SMC_32(0x64)
+#define FFA_RX_RELEASE FFA_SMC_32(0x65)
+#define FFA_RXTX_MAP FFA_SMC_32(0x66)
+#define FFA_FN64_RXTX_MAP FFA_SMC_64(0x66)
+#define FFA_RXTX_UNMAP FFA_SMC_32(0x67)
+#define FFA_PARTITION_INFO_GET FFA_SMC_32(0x68)
+#define FFA_ID_GET FFA_SMC_32(0x69)
+#define FFA_MSG_POLL FFA_SMC_32(0x6A)
+#define FFA_MSG_WAIT FFA_SMC_32(0x6B)
+#define FFA_YIELD FFA_SMC_32(0x6C)
+#define FFA_RUN FFA_SMC_32(0x6D)
+#define FFA_MSG_SEND FFA_SMC_32(0x6E)
+#define FFA_MSG_SEND_DIRECT_REQ FFA_SMC_32(0x6F)
+#define FFA_FN64_MSG_SEND_DIRECT_REQ FFA_SMC_64(0x6F)
+#define FFA_MSG_SEND_DIRECT_RESP FFA_SMC_32(0x70)
+#define FFA_FN64_MSG_SEND_DIRECT_RESP FFA_SMC_64(0x70)
+#define FFA_MEM_DONATE FFA_SMC_32(0x71)
+#define FFA_FN64_MEM_DONATE FFA_SMC_64(0x71)
+#define FFA_MEM_LEND FFA_SMC_32(0x72)
+#define FFA_FN64_MEM_LEND FFA_SMC_64(0x72)
+#define FFA_MEM_SHARE FFA_SMC_32(0x73)
+#define FFA_FN64_MEM_SHARE FFA_SMC_64(0x73)
+#define FFA_MEM_RETRIEVE_REQ FFA_SMC_32(0x74)
+#define FFA_FN64_MEM_RETRIEVE_REQ FFA_SMC_64(0x74)
+#define FFA_MEM_RETRIEVE_RESP FFA_SMC_32(0x75)
+#define FFA_MEM_RELINQUISH FFA_SMC_32(0x76)
+#define FFA_MEM_RECLAIM FFA_SMC_32(0x77)
+#define FFA_MEM_OP_PAUSE FFA_SMC_32(0x78)
+#define FFA_MEM_OP_RESUME FFA_SMC_32(0x79)
+#define FFA_MEM_FRAG_RX FFA_SMC_32(0x7A)
+#define FFA_MEM_FRAG_TX FFA_SMC_32(0x7B)
+#define FFA_NORMAL_WORLD_RESUME FFA_SMC_32(0x7C)
+
+/*
+ * For some calls it is necessary to use SMC64 to pass or return 64-bit values.
+ * For such calls FFA_FN_NATIVE(name) will choose the appropriate
+ * (native-width) function ID.
+ */
+#ifdef CONFIG_64BIT
+#define FFA_FN_NATIVE(name) FFA_FN64_##name
+#else
+#define FFA_FN_NATIVE(name) FFA_##name
+#endif
+
+/* FFA error codes. */
+#define FFA_RET_SUCCESS (0)
+#define FFA_RET_NOT_SUPPORTED (-1)
+#define FFA_RET_INVALID_PARAMETERS (-2)
+#define FFA_RET_NO_MEMORY (-3)
+#define FFA_RET_BUSY (-4)
+#define FFA_RET_INTERRUPTED (-5)
+#define FFA_RET_DENIED (-6)
+#define FFA_RET_RETRY (-7)
+#define FFA_RET_ABORTED (-8)
+
+/* FFA version encoding */
+#define FFA_MAJOR_VERSION_MASK GENMASK(30, 16)
+#define FFA_MINOR_VERSION_MASK GENMASK(15, 0)
+#define FFA_MAJOR_VERSION(x) ((u16)(FIELD_GET(FFA_MAJOR_VERSION_MASK, (x))))
+#define FFA_MINOR_VERSION(x) ((u16)(FIELD_GET(FFA_MINOR_VERSION_MASK, (x))))
+#define FFA_PACK_VERSION_INFO(major, minor) \
+ (FIELD_PREP(FFA_MAJOR_VERSION_MASK, (major)) | \
+ FIELD_PREP(FFA_MINOR_VERSION_MASK, (minor)))
+#define FFA_VERSION_1_0 FFA_PACK_VERSION_INFO(1, 0)
+
+/**
+ * FF-A specification mentions explicitly about '4K pages'. This should
+ * not be confused with the kernel PAGE_SIZE, which is the translation
+ * granule kernel is configured and may be one among 4K, 16K and 64K.
+ */
+#define FFA_PAGE_SIZE SZ_4K
+
/* FFA Bus/Device/Driver related */
struct ffa_device {
int vm_id;
@@ -161,11 +244,11 @@ struct ffa_mem_region_attributes {
*/
#define FFA_MEM_RETRIEVE_SELF_BORROWER BIT(0)
u8 flag;
- u32 composite_off;
/*
* Offset in bytes from the start of the outer `ffa_memory_region` to
* an `struct ffa_mem_region_addr_range`.
*/
+ u32 composite_off;
u64 reserved;
};
diff --git a/include/linux/bcm47xx_nvram.h b/include/linux/bcm47xx_nvram.h
index 53b31f69b74a..7615f8d7b1ed 100644
--- a/include/linux/bcm47xx_nvram.h
+++ b/include/linux/bcm47xx_nvram.h
@@ -11,6 +11,7 @@
#include <linux/vmalloc.h>
#ifdef CONFIG_BCM47XX_NVRAM
+int bcm47xx_nvram_init_from_iomem(void __iomem *nvram_start, size_t res_size);
int bcm47xx_nvram_init_from_mem(u32 base, u32 lim);
int bcm47xx_nvram_getenv(const char *name, char *val, size_t val_len);
int bcm47xx_nvram_gpio_pin(const char *name);
@@ -20,6 +21,11 @@ static inline void bcm47xx_nvram_release_contents(char *nvram)
vfree(nvram);
};
#else
+static inline int bcm47xx_nvram_init_from_iomem(void __iomem *nvram_start,
+ size_t res_size)
+{
+ return -ENOTSUPP;
+}
static inline int bcm47xx_nvram_init_from_mem(u32 base, u32 lim)
{
return -ENOTSUPP;
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index f65410a49fda..7d6d73b78147 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -51,6 +51,7 @@ struct device;
* bitmap_empty(src, nbits) Are all bits zero in *src?
* bitmap_full(src, nbits) Are all bits set in *src?
* bitmap_weight(src, nbits) Hamming Weight: number set bits
+ * bitmap_weight_and(src1, src2, nbits) Hamming Weight of and'ed bitmap
* bitmap_set(dst, pos, nbits) Set specified bit area
* bitmap_clear(dst, pos, nbits) Clear specified bit area
* bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area
@@ -164,6 +165,8 @@ bool __bitmap_intersects(const unsigned long *bitmap1,
bool __bitmap_subset(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
+unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_set(unsigned long *map, unsigned int start, int len);
void __bitmap_clear(unsigned long *map, unsigned int start, int len);
@@ -222,7 +225,6 @@ void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int n
#else
#define bitmap_copy_le bitmap_copy
#endif
-unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
int bitmap_print_to_pagebuf(bool list, char *buf,
const unsigned long *maskp, int nmaskbits);
@@ -439,6 +441,15 @@ unsigned int bitmap_weight(const unsigned long *src, unsigned int nbits)
return __bitmap_weight(src, nbits);
}
+static __always_inline
+unsigned long bitmap_weight_and(const unsigned long *src1,
+ const unsigned long *src2, unsigned int nbits)
+{
+ if (small_const_nbits(nbits))
+ return hweight_long(*src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits));
+ return __bitmap_weight_and(src1, src2, nbits);
+}
+
static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
unsigned int nbits)
{
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 3b89c64bcfd8..2ba557e067fe 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -248,6 +248,25 @@ static inline unsigned long __ffs64(u64 word)
}
/**
+ * fns - find N'th set bit in a word
+ * @word: The word to search
+ * @n: Bit to find
+ */
+static inline unsigned long fns(unsigned long word, unsigned int n)
+{
+ unsigned int bit;
+
+ while (word) {
+ bit = __ffs(word);
+ if (n-- == 0)
+ return bit;
+ __clear_bit(bit, &word);
+ }
+
+ return BITS_PER_LONG;
+}
+
+/**
* assign_bit - Assign value to a bit in memory
* @nr: the bit to set
* @addr: the address to start counting from
@@ -328,10 +347,10 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \
typeof(*(ptr)) old__, new__; \
\
+ old__ = READ_ONCE(*(ptr)); \
do { \
- old__ = READ_ONCE(*(ptr)); \
new__ = (old__ & ~mask__) | bits__; \
- } while (cmpxchg(ptr, old__, new__) != old__); \
+ } while (!try_cmpxchg(ptr, &old__, new__)); \
\
old__; \
})
@@ -343,11 +362,12 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
const typeof(*(ptr)) clear__ = (clear), test__ = (test);\
typeof(*(ptr)) old__, new__; \
\
+ old__ = READ_ONCE(*(ptr)); \
do { \
- old__ = READ_ONCE(*(ptr)); \
+ if (old__ & test__) \
+ break; \
new__ = old__ & ~clear__; \
- } while (!(old__ & test__) && \
- cmpxchg(ptr, old__, new__) != old__); \
+ } while (!try_cmpxchg(ptr, &old__, new__)); \
\
!(old__ & test__); \
})
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ba18e9bdb799..d6119c5d1069 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -853,7 +853,8 @@ static inline bool blk_mq_add_to_batch(struct request *req,
struct io_comp_batch *iob, int ioerror,
void (*complete)(struct io_comp_batch *))
{
- if (!iob || (req->rq_flags & RQF_ELV) || ioerror)
+ if (!iob || (req->rq_flags & RQF_ELV) || ioerror ||
+ (req->end_io && !blk_rq_is_passthrough(req)))
return false;
if (!iob->complete)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3e187a02924f..891f8cbcd043 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -311,6 +311,13 @@ struct queue_limits {
unsigned char discard_misaligned;
unsigned char raid_partial_stripes_expensive;
enum blk_zoned_model zoned;
+
+ /*
+ * Drivers that set dma_alignment to less than 511 must be prepared to
+ * handle individual bvec's that are not a multiple of a SECTOR_SIZE
+ * due to possible offsets.
+ */
+ unsigned int dma_alignment;
};
typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
@@ -456,12 +463,6 @@ struct request_queue {
unsigned long nr_requests; /* Max # of requests */
unsigned int dma_pad_mask;
- /*
- * Drivers that set dma_alignment to less than 511 must be prepared to
- * handle individual bvec's that are not a multiple of a SECTOR_SIZE
- * due to possible offsets.
- */
- unsigned int dma_alignment;
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
struct blk_crypto_profile *crypto_profile;
@@ -580,9 +581,9 @@ struct request_queue {
#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
-#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
- (1 << QUEUE_FLAG_SAME_COMP) | \
- (1 << QUEUE_FLAG_NOWAIT))
+#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \
+ (1UL << QUEUE_FLAG_SAME_COMP) | \
+ (1UL << QUEUE_FLAG_NOWAIT))
void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
@@ -944,7 +945,6 @@ extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
-extern void blk_set_default_limits(struct queue_limits *lim);
extern void blk_set_stacking_limits(struct queue_limits *lim);
extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t offset);
@@ -1324,7 +1324,7 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev)
static inline int queue_dma_alignment(const struct request_queue *q)
{
- return q ? q->dma_alignment : 511;
+ return q ? q->limits.dma_alignment : 511;
}
static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
diff --git a/include/linux/bma150.h b/include/linux/bma150.h
index 31c9e323a391..4d4a62d49341 100644
--- a/include/linux/bma150.h
+++ b/include/linux/bma150.h
@@ -33,8 +33,8 @@ struct bma150_cfg {
unsigned char lg_hyst; /* Low-G hysterisis */
unsigned char lg_dur; /* Low-G duration */
unsigned char lg_thres; /* Low-G threshold */
- unsigned char range; /* one of BMA0150_RANGE_xxx */
- unsigned char bandwidth; /* one of BMA0150_BW_xxx */
+ unsigned char range; /* one of BMA150_RANGE_xxx */
+ unsigned char bandwidth; /* one of BMA150_BW_xxx */
};
struct bma150_platform_data {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9e7d46d16032..c1bd1bd10506 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -27,6 +27,7 @@
#include <linux/bpfptr.h>
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
+#include <linux/static_call.h>
struct bpf_verifier_env;
struct bpf_verifier_log;
@@ -314,7 +315,7 @@ static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, b
u32 next_off = map->off_arr->field_off[i];
memcpy(dst + curr_off, src + curr_off, next_off - curr_off);
- curr_off += map->off_arr->field_sz[i];
+ curr_off = next_off + map->off_arr->field_sz[i];
}
memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off);
}
@@ -343,7 +344,7 @@ static inline void zero_map_value(struct bpf_map *map, void *dst)
u32 next_off = map->off_arr->field_off[i];
memset(dst + curr_off, 0, next_off - curr_off);
- curr_off += map->off_arr->field_sz[i];
+ curr_off = next_off + map->off_arr->field_sz[i];
}
memset(dst + curr_off, 0, map->value_size - curr_off);
}
@@ -953,6 +954,10 @@ struct bpf_dispatcher {
void *rw_image;
u32 image_off;
struct bpf_ksym ksym;
+#ifdef CONFIG_HAVE_STATIC_CALL
+ struct static_call_key *sc_key;
+ void *sc_tramp;
+#endif
};
static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
@@ -970,6 +975,34 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key,
struct bpf_attach_target_info *tgt_info);
void bpf_trampoline_put(struct bpf_trampoline *tr);
int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs);
+
+/*
+ * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn
+ * indirection with a direct call to the bpf program. If the architecture does
+ * not have STATIC_CALL, avoid a double-indirection.
+ */
+#ifdef CONFIG_HAVE_STATIC_CALL
+
+#define __BPF_DISPATCHER_SC_INIT(_name) \
+ .sc_key = &STATIC_CALL_KEY(_name), \
+ .sc_tramp = STATIC_CALL_TRAMP_ADDR(_name),
+
+#define __BPF_DISPATCHER_SC(name) \
+ DEFINE_STATIC_CALL(bpf_dispatcher_##name##_call, bpf_dispatcher_nop_func)
+
+#define __BPF_DISPATCHER_CALL(name) \
+ static_call(bpf_dispatcher_##name##_call)(ctx, insnsi, bpf_func)
+
+#define __BPF_DISPATCHER_UPDATE(_d, _new) \
+ __static_call_update((_d)->sc_key, (_d)->sc_tramp, (_new))
+
+#else
+#define __BPF_DISPATCHER_SC_INIT(name)
+#define __BPF_DISPATCHER_SC(name)
+#define __BPF_DISPATCHER_CALL(name) bpf_func(ctx, insnsi)
+#define __BPF_DISPATCHER_UPDATE(_d, _new)
+#endif
+
#define BPF_DISPATCHER_INIT(_name) { \
.mutex = __MUTEX_INITIALIZER(_name.mutex), \
.func = &_name##_func, \
@@ -981,32 +1014,29 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
.name = #_name, \
.lnode = LIST_HEAD_INIT(_name.ksym.lnode), \
}, \
+ __BPF_DISPATCHER_SC_INIT(_name##_call) \
}
-#ifdef CONFIG_X86_64
-#define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5)))
-#else
-#define BPF_DISPATCHER_ATTRIBUTES
-#endif
-
#define DEFINE_BPF_DISPATCHER(name) \
- notrace BPF_DISPATCHER_ATTRIBUTES \
+ __BPF_DISPATCHER_SC(name); \
noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \
const void *ctx, \
const struct bpf_insn *insnsi, \
bpf_func_t bpf_func) \
{ \
- return bpf_func(ctx, insnsi); \
+ return __BPF_DISPATCHER_CALL(name); \
} \
EXPORT_SYMBOL(bpf_dispatcher_##name##_func); \
struct bpf_dispatcher bpf_dispatcher_##name = \
BPF_DISPATCHER_INIT(bpf_dispatcher_##name);
+
#define DECLARE_BPF_DISPATCHER(name) \
unsigned int bpf_dispatcher_##name##_func( \
const void *ctx, \
const struct bpf_insn *insnsi, \
bpf_func_t bpf_func); \
extern struct bpf_dispatcher bpf_dispatcher_##name;
+
#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func
#define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name)
void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 06089390d81d..33fa5e94aa80 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -225,8 +225,6 @@ struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
-void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size,
- gfp_t gfp);
struct buffer_head *__bread_gfp(struct block_device *,
sector_t block, unsigned size, gfp_t gfp);
void invalidate_bh_lrus(void);
@@ -236,7 +234,6 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
void __lock_buffer(struct buffer_head *bh);
-void ll_rw_block(blk_opf_t, int, struct buffer_head * bh[]);
int sync_dirty_buffer(struct buffer_head *bh);
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
@@ -244,7 +241,9 @@ void submit_bh(blk_opf_t, struct buffer_head *);
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize);
int bh_uptodate_or_lock(struct buffer_head *bh);
-int bh_submit_read(struct buffer_head *bh);
+int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
+void __bh_read_batch(int nr, struct buffer_head *bhs[],
+ blk_opf_t op_flags, bool force_lock);
extern int buffer_heads_over_limit;
@@ -351,12 +350,6 @@ sb_breadahead(struct super_block *sb, sector_t block)
__breadahead(sb->s_bdev, block, sb->s_blocksize);
}
-static inline void
-sb_breadahead_unmovable(struct super_block *sb, sector_t block)
-{
- __breadahead_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
-}
-
static inline struct buffer_head *
sb_getblk(struct super_block *sb, sector_t block)
{
@@ -418,6 +411,41 @@ static inline struct buffer_head *__getblk(struct block_device *bdev,
return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
}
+static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags)
+{
+ if (!buffer_uptodate(bh) && trylock_buffer(bh)) {
+ if (!buffer_uptodate(bh))
+ __bh_read(bh, op_flags, false);
+ else
+ unlock_buffer(bh);
+ }
+}
+
+static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags)
+{
+ if (!bh_uptodate_or_lock(bh))
+ __bh_read(bh, op_flags, false);
+}
+
+/* Returns 1 if buffer uptodated, 0 on success, and -EIO on error. */
+static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags)
+{
+ if (bh_uptodate_or_lock(bh))
+ return 1;
+ return __bh_read(bh, op_flags, true);
+}
+
+static inline void bh_read_batch(int nr, struct buffer_head *bhs[])
+{
+ __bh_read_batch(nr, bhs, 0, true);
+}
+
+static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[],
+ blk_opf_t op_flags)
+{
+ __bh_read_batch(nr, bhs, op_flags, false);
+}
+
/**
* __bread() - reads a specified block and returns the bh
* @bdev: the block_device to read from
diff --git a/include/linux/cache.h b/include/linux/cache.h
index d742c57eaee5..5da1bbd96154 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -85,4 +85,17 @@
#define cache_line_size() L1_CACHE_BYTES
#endif
+/*
+ * Helper to add padding within a struct to ensure data fall into separate
+ * cachelines.
+ */
+#if defined(CONFIG_SMP)
+struct cacheline_padding {
+ char x[0];
+} ____cacheline_internodealigned_in_smp;
+#define CACHELINE_PADDING(name) struct cacheline_padding name
+#else
+#define CACHELINE_PADDING(name)
+#endif
+
#endif /* __LINUX_CACHE_H */
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 58f5431a5559..982ba245eb41 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -152,6 +152,22 @@ static inline bool can_is_canxl_dev_mtu(unsigned int mtu)
return (mtu >= CANXL_MIN_MTU && mtu <= CANXL_MAX_MTU);
}
+/* drop skb if it does not contain a valid CAN frame for sending */
+static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ struct can_priv *priv = netdev_priv(dev);
+
+ if (priv->ctrlmode & CAN_CTRLMODE_LISTENONLY) {
+ netdev_info_once(dev,
+ "interface in listen only mode, dropping skb\n");
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return true;
+ }
+
+ return can_dropped_invalid_skb(dev, skb);
+}
+
void can_setup(struct net_device *dev);
struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
diff --git a/include/linux/can/platform/sja1000.h b/include/linux/can/platform/sja1000.h
index 5755ae5a4712..6a869682c120 100644
--- a/include/linux/can/platform/sja1000.h
+++ b/include/linux/can/platform/sja1000.h
@@ -14,7 +14,7 @@
#define OCR_MODE_TEST 0x01
#define OCR_MODE_NORMAL 0x02
#define OCR_MODE_CLOCK 0x03
-#define OCR_MODE_MASK 0x07
+#define OCR_MODE_MASK 0x03
#define OCR_TX0_INVERT 0x04
#define OCR_TX0_PULLDOWN 0x08
#define OCR_TX0_PULLUP 0x10
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index e7f2fb2fc207..99c1726be6ee 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -207,7 +207,6 @@ struct ceph_msg_data_cursor {
struct ceph_msg_data *data; /* current data item */
size_t resid; /* bytes not yet consumed */
- bool last_piece; /* current is last piece */
bool need_crc; /* crc update needed */
union {
#ifdef CONFIG_BLOCK
@@ -498,8 +497,7 @@ void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq);
void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
struct ceph_msg *msg, size_t length);
struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
- size_t *page_offset, size_t *length,
- bool *last_piece);
+ size_t *page_offset, size_t *length);
void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes);
u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8f481d1b159a..6e01f10f0d88 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -428,6 +428,9 @@ struct cgroup {
struct cgroup_file procs_file; /* handle for "cgroup.procs" */
struct cgroup_file events_file; /* handle for "cgroup.events" */
+ /* handles for "{cpu,memory,io,irq}.pressure" */
+ struct cgroup_file psi_files[NR_PSI_RESOURCES];
+
/*
* The bitmask of subsystems enabled on the child cgroups.
* ->subtree_control is the one configured through
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c8441090ca4c..3410aecffdb4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -68,6 +68,7 @@ struct css_task_iter {
struct list_head iters_node; /* css_set->task_iters */
};
+extern struct file_system_type cgroup_fs_type;
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
@@ -362,6 +363,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
css_put(&cgrp->self);
}
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
@@ -376,7 +389,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
@@ -600,11 +612,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
pr_cont_kernfs_path(cgrp->kn);
}
-static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
-{
- return cgrp->psi;
-}
-
bool cgroup_psi_enabled(void);
static inline void cgroup_init_kthreadd(void)
@@ -636,6 +643,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 2108b5695327..267cd06b54a0 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -42,6 +42,8 @@ struct dentry;
* struct clk_rate_request - Structure encoding the clk constraints that
* a clock user might require.
*
+ * Should be initialized by calling clk_hw_init_rate_request().
+ *
* @rate: Requested clock rate. This field will be adjusted by
* clock drivers according to hardware capabilities.
* @min_rate: Minimum rate imposed by clk users.
@@ -60,6 +62,15 @@ struct clk_rate_request {
struct clk_hw *best_parent_hw;
};
+void clk_hw_init_rate_request(const struct clk_hw *hw,
+ struct clk_rate_request *req,
+ unsigned long rate);
+void clk_hw_forward_rate_request(const struct clk_hw *core,
+ const struct clk_rate_request *old_req,
+ const struct clk_hw *parent,
+ struct clk_rate_request *req,
+ unsigned long parent_rate);
+
/**
* struct clk_duty - Struture encoding the duty cycle ratio of a clock
*
@@ -118,8 +129,9 @@ struct clk_duty {
*
* @recalc_rate Recalculate the rate of this clock, by querying hardware. The
* parent rate is an input parameter. It is up to the caller to
- * ensure that the prepare_mutex is held across this call.
- * Returns the calculated rate. Optional, but recommended - if
+ * ensure that the prepare_mutex is held across this call. If the
+ * driver cannot figure out a rate for this clock, it must return
+ * 0. Returns the calculated rate. Optional, but recommended - if
* this op is not set then clock rate will be initialized to 0.
*
* @round_rate: Given a target rate as input, returns the closest rate actually
@@ -1303,6 +1315,8 @@ int clk_mux_determine_rate_flags(struct clk_hw *hw,
struct clk_rate_request *req,
unsigned long flags);
void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent);
+void clk_hw_get_rate_range(struct clk_hw *hw, unsigned long *min_rate,
+ unsigned long *max_rate);
void clk_hw_set_rate_range(struct clk_hw *hw, unsigned long min_rate,
unsigned long max_rate);
diff --git a/include/linux/clk.h b/include/linux/clk.h
index c13061cabdfc..1ef013324237 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -799,7 +799,7 @@ int clk_set_rate_exclusive(struct clk *clk, unsigned long rate);
*
* Returns true if @parent is a possible parent for @clk, false otherwise.
*/
-bool clk_has_parent(struct clk *clk, struct clk *parent);
+bool clk_has_parent(const struct clk *clk, const struct clk *parent);
/**
* clk_set_rate_range - set a rate range for a clock source
diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h
index 3484309b59bf..7af499bdbecb 100644
--- a/include/linux/clk/at91_pmc.h
+++ b/include/linux/clk/at91_pmc.h
@@ -12,6 +12,8 @@
#ifndef AT91_PMC_H
#define AT91_PMC_H
+#include <linux/bits.h>
+
#define AT91_PMC_V1 (1) /* PMC version 1 */
#define AT91_PMC_V2 (2) /* PMC version 2 [SAM9X60] */
@@ -45,8 +47,8 @@
#define AT91_PMC_PCSR 0x18 /* Peripheral Clock Status Register */
#define AT91_PMC_PLL_ACR 0x18 /* PLL Analog Control Register [for SAM9X60] */
-#define AT91_PMC_PLL_ACR_DEFAULT_UPLL 0x12020010UL /* Default PLL ACR value for UPLL */
-#define AT91_PMC_PLL_ACR_DEFAULT_PLLA 0x00020010UL /* Default PLL ACR value for PLLA */
+#define AT91_PMC_PLL_ACR_DEFAULT_UPLL UL(0x12020010) /* Default PLL ACR value for UPLL */
+#define AT91_PMC_PLL_ACR_DEFAULT_PLLA UL(0x00020010) /* Default PLL ACR value for PLLA */
#define AT91_PMC_PLL_ACR_UTMIVR (1 << 12) /* UPLL Voltage regulator Control */
#define AT91_PMC_PLL_ACR_UTMIBG (1 << 13) /* UPLL Bandgap Control */
diff --git a/include/linux/clk/spear.h b/include/linux/clk/spear.h
index a64d034ceddd..eaf95ca656f8 100644
--- a/include/linux/clk/spear.h
+++ b/include/linux/clk/spear.h
@@ -8,6 +8,20 @@
#ifndef __LINUX_CLK_SPEAR_H
#define __LINUX_CLK_SPEAR_H
+#ifdef CONFIG_ARCH_SPEAR3XX
+void __init spear3xx_clk_init(void __iomem *misc_base,
+ void __iomem *soc_config_base);
+#else
+static inline void __init spear3xx_clk_init(void __iomem *misc_base,
+ void __iomem *soc_config_base) {}
+#endif
+
+#ifdef CONFIG_ARCH_SPEAR6XX
+void __init spear6xx_clk_init(void __iomem *misc_base);
+#else
+static inline void __init spear6xx_clk_init(void __iomem *misc_base) {}
+#endif
+
#ifdef CONFIG_MACH_SPEAR1310
void __init spear1310_clk_init(void __iomem *misc_base, void __iomem *ras_base);
#else
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 594357881b0b..44b1736c95b5 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -126,11 +126,9 @@ struct compat_tms {
#define _COMPAT_NSIG_WORDS (_COMPAT_NSIG / _COMPAT_NSIG_BPW)
-#ifndef compat_sigset_t
typedef struct {
compat_sigset_word sig[_COMPAT_NSIG_WORDS];
} compat_sigset_t;
-#endif
int set_compat_user_sigmask(const compat_sigset_t __user *umask,
size_t sigsetsize);
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 42e55579d649..6cfd6902bd5b 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -51,6 +51,29 @@
#define __no_sanitize_undefined
#endif
+#if __has_feature(memory_sanitizer)
+#define __SANITIZE_MEMORY__
+/*
+ * Unlike other sanitizers, KMSAN still inserts code into functions marked with
+ * no_sanitize("kernel-memory"). Using disable_sanitizer_instrumentation
+ * provides the behavior consistent with other __no_sanitize_ attributes,
+ * guaranteeing that __no_sanitize_memory functions remain uninstrumented.
+ */
+#define __no_sanitize_memory __disable_sanitizer_instrumentation
+
+/*
+ * The __no_kmsan_checks attribute ensures that a function does not produce
+ * false positive reports by:
+ * - initializing all local variables and memory stores in this function;
+ * - skipping all shadow checks;
+ * - passing initialized arguments to this function's callees.
+ */
+#define __no_kmsan_checks __attribute__((no_sanitize("kernel-memory")))
+#else
+#define __no_sanitize_memory
+#define __no_kmsan_checks
+#endif
+
/*
* Support for __has_feature(coverage_sanitizer) was added in Clang 13 together
* with no_sanitize("coverage"). Prior versions of Clang support coverage
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 9b157b71036f..f55a37efdb97 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -115,6 +115,12 @@
#endif
/*
+ * GCC does not support KMSAN.
+ */
+#define __no_sanitize_memory
+#define __no_kmsan_checks
+
+/*
* Turn individual warnings and errors on and off locally, depending
* on version.
*/
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 74e04ecd4c89..eb0466236661 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -233,7 +233,8 @@ struct ftrace_likely_data {
/* Section for code which can't be instrumented at all */
#define noinstr \
noinline notrace __attribute((__section__(".noinstr.text"))) \
- __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage
+ __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \
+ __no_sanitize_memory
#endif /* __KERNEL__ */
diff --git a/include/linux/console.h b/include/linux/console.h
index 8c1686e2c233..9cea254b34b8 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -15,6 +15,7 @@
#define _LINUX_CONSOLE_H_ 1
#include <linux/atomic.h>
+#include <linux/rculist.h>
#include <linux/types.h>
struct vc_data;
@@ -154,14 +155,132 @@ struct console {
u64 seq;
unsigned long dropped;
void *data;
- struct console *next;
+ struct hlist_node node;
};
+#ifdef CONFIG_LOCKDEP
+extern void lockdep_assert_console_list_lock_held(void);
+#else
+static inline void lockdep_assert_console_list_lock_held(void)
+{
+}
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern bool console_srcu_read_lock_is_held(void);
+#else
+static inline bool console_srcu_read_lock_is_held(void)
+{
+ return 1;
+}
+#endif
+
+extern int console_srcu_read_lock(void);
+extern void console_srcu_read_unlock(int cookie);
+
+extern void console_list_lock(void) __acquires(console_mutex);
+extern void console_list_unlock(void) __releases(console_mutex);
+
+extern struct hlist_head console_list;
+
+/**
+ * console_srcu_read_flags - Locklessly read the console flags
+ * @con: struct console pointer of console to read flags from
+ *
+ * This function provides the necessary READ_ONCE() and data_race()
+ * notation for locklessly reading the console flags. The READ_ONCE()
+ * in this function matches the WRITE_ONCE() when @flags are modified
+ * for registered consoles with console_srcu_write_flags().
+ *
+ * Only use this function to read console flags when locklessly
+ * iterating the console list via srcu.
+ *
+ * Context: Any context.
+ */
+static inline short console_srcu_read_flags(const struct console *con)
+{
+ WARN_ON_ONCE(!console_srcu_read_lock_is_held());
+
+ /*
+ * Locklessly reading console->flags provides a consistent
+ * read value because there is at most one CPU modifying
+ * console->flags and that CPU is using only read-modify-write
+ * operations to do so.
+ */
+ return data_race(READ_ONCE(con->flags));
+}
+
+/**
+ * console_srcu_write_flags - Write flags for a registered console
+ * @con: struct console pointer of console to write flags to
+ * @flags: new flags value to write
+ *
+ * Only use this function to write flags for registered consoles. It
+ * requires holding the console_list_lock.
+ *
+ * Context: Any context.
+ */
+static inline void console_srcu_write_flags(struct console *con, short flags)
+{
+ lockdep_assert_console_list_lock_held();
+
+ /* This matches the READ_ONCE() in console_srcu_read_flags(). */
+ WRITE_ONCE(con->flags, flags);
+}
+
+/* Variant of console_is_registered() when the console_list_lock is held. */
+static inline bool console_is_registered_locked(const struct console *con)
+{
+ lockdep_assert_console_list_lock_held();
+ return !hlist_unhashed(&con->node);
+}
+
/*
- * for_each_console() allows you to iterate on each console
+ * console_is_registered - Check if the console is registered
+ * @con: struct console pointer of console to check
+ *
+ * Context: Process context. May sleep while acquiring console list lock.
+ * Return: true if the console is in the console list, otherwise false.
+ *
+ * If false is returned for a console that was previously registered, it
+ * can be assumed that the console's unregistration is fully completed,
+ * including the exit() callback after console list removal.
+ */
+static inline bool console_is_registered(const struct console *con)
+{
+ bool ret;
+
+ console_list_lock();
+ ret = console_is_registered_locked(con);
+ console_list_unlock();
+ return ret;
+}
+
+/**
+ * for_each_console_srcu() - Iterator over registered consoles
+ * @con: struct console pointer used as loop cursor
+ *
+ * Although SRCU guarantees the console list will be consistent, the
+ * struct console fields may be updated by other CPUs while iterating.
+ *
+ * Requires console_srcu_read_lock to be held. Can be invoked from
+ * any context.
+ */
+#define for_each_console_srcu(con) \
+ hlist_for_each_entry_srcu(con, &console_list, node, \
+ console_srcu_read_lock_is_held())
+
+/**
+ * for_each_console() - Iterator over registered consoles
+ * @con: struct console pointer used as loop cursor
+ *
+ * The console list and the console->flags are immutable while iterating.
+ *
+ * Requires console_list_lock to be held.
*/
-#define for_each_console(con) \
- for (con = console_drivers; con != NULL; con = con->next)
+#define for_each_console(con) \
+ lockdep_assert_console_list_lock_held(); \
+ hlist_for_each_entry(con, &console_list, node)
extern int console_set_on_cmdline;
extern struct console *early_console;
@@ -172,9 +291,9 @@ enum con_flush_mode {
};
extern int add_preferred_console(char *name, int idx, char *options);
+extern void console_force_preferred_locked(struct console *con);
extern void register_console(struct console *);
extern int unregister_console(struct console *);
-extern struct console *console_drivers;
extern void console_lock(void);
extern int console_trylock(void);
extern void console_unlock(void);
diff --git a/include/linux/counter.h b/include/linux/counter.h
index c41fa602ed28..b63746637de2 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -542,11 +542,10 @@ struct counter_array {
#define DEFINE_COUNTER_ARRAY_CAPTURE(_name, _length) \
DEFINE_COUNTER_ARRAY_U64(_name, _length)
-#define DEFINE_COUNTER_ARRAY_POLARITY(_name, _enums, _length) \
- DEFINE_COUNTER_AVAILABLE(_name##_available, _enums); \
+#define DEFINE_COUNTER_ARRAY_POLARITY(_name, _available, _length) \
struct counter_array _name = { \
.type = COUNTER_COMP_SIGNAL_POLARITY, \
- .avail = &(_name##_available), \
+ .avail = &(_available), \
.length = (_length), \
}
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index d5595d57f4e5..6a94a6eaad27 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1110,10 +1110,10 @@ cpufreq_table_set_inefficient(struct cpufreq_policy *policy,
}
static inline int parse_perf_domain(int cpu, const char *list_name,
- const char *cell_name)
+ const char *cell_name,
+ struct of_phandle_args *args)
{
struct device_node *cpu_np;
- struct of_phandle_args args;
int ret;
cpu_np = of_cpu_device_node_get(cpu);
@@ -1121,41 +1121,44 @@ static inline int parse_perf_domain(int cpu, const char *list_name,
return -ENODEV;
ret = of_parse_phandle_with_args(cpu_np, list_name, cell_name, 0,
- &args);
+ args);
if (ret < 0)
return ret;
of_node_put(cpu_np);
- return args.args[0];
+ return 0;
}
static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name,
- const char *cell_name, struct cpumask *cpumask)
+ const char *cell_name, struct cpumask *cpumask,
+ struct of_phandle_args *pargs)
{
- int target_idx;
int cpu, ret;
+ struct of_phandle_args args;
- ret = parse_perf_domain(pcpu, list_name, cell_name);
+ ret = parse_perf_domain(pcpu, list_name, cell_name, pargs);
if (ret < 0)
return ret;
- target_idx = ret;
cpumask_set_cpu(pcpu, cpumask);
for_each_possible_cpu(cpu) {
if (cpu == pcpu)
continue;
- ret = parse_perf_domain(cpu, list_name, cell_name);
+ ret = parse_perf_domain(cpu, list_name, cell_name, &args);
if (ret < 0)
continue;
- if (target_idx == ret)
+ if (pargs->np == args.np && pargs->args_count == args.args_count &&
+ !memcmp(pargs->args, args.args, sizeof(args.args[0]) * args.args_count))
cpumask_set_cpu(cpu, cpumask);
+
+ of_node_put(args.np);
}
- return target_idx;
+ return 0;
}
#else
static inline int cpufreq_boost_trigger_state(int state)
@@ -1185,7 +1188,8 @@ cpufreq_table_set_inefficient(struct cpufreq_policy *policy,
}
static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name,
- const char *cell_name, struct cpumask *cpumask)
+ const char *cell_name, struct cpumask *cpumask,
+ struct of_phandle_args *pargs)
{
return -EOPNOTSUPP;
}
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index e8ad12b5b9d2..c2aa0aa26b45 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -35,19 +35,23 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
*/
#define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp)
-#if NR_CPUS == 1
-#define nr_cpu_ids 1U
+#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
+#define nr_cpu_ids ((unsigned int)NR_CPUS)
#else
extern unsigned int nr_cpu_ids;
#endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
-/* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also,
- * not all bits may be allocated. */
-#define nr_cpumask_bits nr_cpu_ids
+static inline void set_nr_cpu_ids(unsigned int nr)
+{
+#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
+ WARN_ON(nr != nr_cpu_ids);
#else
-#define nr_cpumask_bits ((unsigned int)NR_CPUS)
+ nr_cpu_ids = nr;
#endif
+}
+
+/* Deprecated. Always use nr_cpu_ids. */
+#define nr_cpumask_bits nr_cpu_ids
/*
* The following particular system cpumasks and operations manage
@@ -67,10 +71,6 @@ extern unsigned int nr_cpu_ids;
* cpu_online_mask is the dynamic subset of cpu_present_mask,
* indicating those CPUs available for scheduling.
*
- * If HOTPLUG is enabled, then cpu_possible_mask is forced to have
- * all NR_CPUS bits set, otherwise it is just the set of CPUs that
- * ACPI reports present at boot.
- *
* If HOTPLUG is enabled, then cpu_present_mask varies dynamically,
* depending on what ACPI reports as currently plugged in, otherwise
* cpu_present_mask is just a copy of cpu_possible_mask.
@@ -246,9 +246,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu(cpu, mask) \
- for ((cpu) = -1; \
- (cpu) = cpumask_next((cpu), (mask)), \
- (cpu) < nr_cpu_ids;)
+ for_each_set_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
/**
* for_each_cpu_not - iterate over every cpu in a complemented mask
@@ -258,9 +256,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu_not(cpu, mask) \
- for ((cpu) = -1; \
- (cpu) = cpumask_next_zero((cpu), (mask)), \
- (cpu) < nr_cpu_ids;)
+ for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
#if NR_CPUS == 1
static inline
@@ -293,10 +289,8 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
*
* After the loop, cpu is >= nr_cpu_ids.
*/
-#define for_each_cpu_wrap(cpu, mask, start) \
- for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \
- (cpu) < nr_cpumask_bits; \
- (cpu) = cpumask_next_wrap((cpu), (mask), (start), true))
+#define for_each_cpu_wrap(cpu, mask, start) \
+ for_each_set_bit_wrap(cpu, cpumask_bits(mask), nr_cpumask_bits, start)
/**
* for_each_cpu_and - iterate over every cpu in both masks
@@ -313,9 +307,25 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu_and(cpu, mask1, mask2) \
- for ((cpu) = -1; \
- (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \
- (cpu) < nr_cpu_ids;)
+ for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits)
+
+/**
+ * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding
+ * those present in another.
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask1: the first cpumask pointer
+ * @mask2: the second cpumask pointer
+ *
+ * This saves a temporary CPU mask in many places. It is equivalent to:
+ * struct cpumask tmp;
+ * cpumask_andnot(&tmp, &mask1, &mask2);
+ * for_each_cpu(cpu, &tmp)
+ * ...
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_andnot(cpu, mask1, mask2) \
+ for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits)
/**
* cpumask_any_but - return a "random" in a cpumask, but not this one.
@@ -337,6 +347,50 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
return i;
}
+/**
+ * cpumask_nth - get the first cpu in a cpumask
+ * @srcp: the cpumask pointer
+ * @cpu: the N'th cpu to find, starting from 0
+ *
+ * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ */
+static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
+{
+ return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu));
+}
+
+/**
+ * cpumask_nth_and - get the first cpu in 2 cpumasks
+ * @srcp1: the cpumask pointer
+ * @srcp2: the cpumask pointer
+ * @cpu: the N'th cpu to find, starting from 0
+ *
+ * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ */
+static inline
+unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
+ const struct cpumask *srcp2)
+{
+ return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
+ nr_cpumask_bits, cpumask_check(cpu));
+}
+
+/**
+ * cpumask_nth_andnot - get the first cpu set in 1st cpumask, and clear in 2nd.
+ * @srcp1: the cpumask pointer
+ * @srcp2: the cpumask pointer
+ * @cpu: the N'th cpu to find, starting from 0
+ *
+ * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ */
+static inline
+unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
+ const struct cpumask *srcp2)
+{
+ return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
+ nr_cpumask_bits, cpumask_check(cpu));
+}
+
#define CPU_BITS_NONE \
{ \
[0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \
@@ -587,6 +641,17 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp)
}
/**
+ * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2)
+ * @srcp1: the cpumask to count bits (< nr_cpu_ids) in.
+ * @srcp2: the cpumask to count bits (< nr_cpu_ids) in.
+ */
+static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
+ const struct cpumask *srcp2)
+{
+ return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits);
+}
+
+/**
* cpumask_shift_right - *dstp = *srcp >> n
* @dstp: the cpumask result
* @srcp: the input to shift
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 7b1f4a488230..620ada094c3b 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -216,13 +216,26 @@ struct damos_stat {
};
/**
- * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
+ * struct damos_access_pattern - Target access pattern of the given scheme.
* @min_sz_region: Minimum size of target regions.
* @max_sz_region: Maximum size of target regions.
* @min_nr_accesses: Minimum ``->nr_accesses`` of target regions.
* @max_nr_accesses: Maximum ``->nr_accesses`` of target regions.
* @min_age_region: Minimum age of target regions.
* @max_age_region: Maximum age of target regions.
+ */
+struct damos_access_pattern {
+ unsigned long min_sz_region;
+ unsigned long max_sz_region;
+ unsigned int min_nr_accesses;
+ unsigned int max_nr_accesses;
+ unsigned int min_age_region;
+ unsigned int max_age_region;
+};
+
+/**
+ * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
+ * @pattern: Access pattern of target regions.
* @action: &damo_action to be applied to the target regions.
* @quota: Control the aggressiveness of this scheme.
* @wmarks: Watermarks for automated (in)activation of this scheme.
@@ -230,10 +243,8 @@ struct damos_stat {
* @list: List head for siblings.
*
* For each aggregation interval, DAMON finds regions which fit in the
- * condition (&min_sz_region, &max_sz_region, &min_nr_accesses,
- * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to
- * those. To avoid consuming too much CPU time or IO resources for the
- * &action, &quota is used.
+ * &pattern and applies &action to those. To avoid consuming too much
+ * CPU time or IO resources for the &action, &quota is used.
*
* To do the work only when needed, schemes can be activated for specific
* system situations using &wmarks. If all schemes that registered to the
@@ -248,12 +259,7 @@ struct damos_stat {
* &action is applied.
*/
struct damos {
- unsigned long min_sz_region;
- unsigned long max_sz_region;
- unsigned int min_nr_accesses;
- unsigned int max_nr_accesses;
- unsigned int min_age_region;
- unsigned int max_age_region;
+ struct damos_access_pattern pattern;
enum damos_action action;
struct damos_quota quota;
struct damos_watermarks wmarks;
@@ -340,7 +346,7 @@ struct damon_operations {
unsigned long (*apply_scheme)(struct damon_ctx *context,
struct damon_target *t, struct damon_region *r,
struct damos *scheme);
- bool (*target_valid)(void *target);
+ bool (*target_valid)(struct damon_target *t);
void (*cleanup)(struct damon_ctx *context);
};
@@ -383,13 +389,15 @@ struct damon_callback {
};
/**
- * struct damon_ctx - Represents a context for each monitoring. This is the
- * main interface that allows users to set the attributes and get the results
- * of the monitoring.
+ * struct damon_attrs - Monitoring attributes for accuracy/overhead control.
*
* @sample_interval: The time between access samplings.
* @aggr_interval: The time between monitor results aggregations.
* @ops_update_interval: The time between monitoring operations updates.
+ * @min_nr_regions: The minimum number of adaptive monitoring
+ * regions.
+ * @max_nr_regions: The maximum number of adaptive monitoring
+ * regions.
*
* For each @sample_interval, DAMON checks whether each region is accessed or
* not. It aggregates and keeps the access information (number of accesses to
@@ -399,7 +407,21 @@ struct damon_callback {
* @ops_update_interval. All time intervals are in micro-seconds.
* Please refer to &struct damon_operations and &struct damon_callback for more
* detail.
+ */
+struct damon_attrs {
+ unsigned long sample_interval;
+ unsigned long aggr_interval;
+ unsigned long ops_update_interval;
+ unsigned long min_nr_regions;
+ unsigned long max_nr_regions;
+};
+
+/**
+ * struct damon_ctx - Represents a context for each monitoring. This is the
+ * main interface that allows users to set the attributes and get the results
+ * of the monitoring.
*
+ * @attrs: Monitoring attributes for accuracy/overhead control.
* @kdamond: Kernel thread who does the monitoring.
* @kdamond_lock: Mutex for the synchronizations with @kdamond.
*
@@ -421,15 +443,11 @@ struct damon_callback {
* @ops: Set of monitoring operations for given use cases.
* @callback: Set of callbacks for monitoring events notifications.
*
- * @min_nr_regions: The minimum number of adaptive monitoring regions.
- * @max_nr_regions: The maximum number of adaptive monitoring regions.
* @adaptive_targets: Head of monitoring targets (&damon_target) list.
* @schemes: Head of schemes (&damos) list.
*/
struct damon_ctx {
- unsigned long sample_interval;
- unsigned long aggr_interval;
- unsigned long ops_update_interval;
+ struct damon_attrs attrs;
/* private: internal use only */
struct timespec64 last_aggregation;
@@ -442,8 +460,6 @@ struct damon_ctx {
struct damon_operations ops;
struct damon_callback callback;
- unsigned long min_nr_regions;
- unsigned long max_nr_regions;
struct list_head adaptive_targets;
struct list_head schemes;
};
@@ -463,9 +479,23 @@ static inline struct damon_region *damon_last_region(struct damon_target *t)
return list_last_entry(&t->regions_list, struct damon_region, list);
}
+static inline struct damon_region *damon_first_region(struct damon_target *t)
+{
+ return list_first_entry(&t->regions_list, struct damon_region, list);
+}
+
+static inline unsigned long damon_sz_region(struct damon_region *r)
+{
+ return r->ar.end - r->ar.start;
+}
+
+
#define damon_for_each_region(r, t) \
list_for_each_entry(r, &t->regions_list, list)
+#define damon_for_each_region_from(r, t) \
+ list_for_each_entry_from(r, &t->regions_list, list)
+
#define damon_for_each_region_safe(r, next, t) \
list_for_each_entry_safe(r, next, &t->regions_list, list)
@@ -501,12 +531,9 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t);
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
unsigned int nr_ranges);
-struct damos *damon_new_scheme(
- unsigned long min_sz_region, unsigned long max_sz_region,
- unsigned int min_nr_accesses, unsigned int max_nr_accesses,
- unsigned int min_age_region, unsigned int max_age_region,
- enum damos_action action, struct damos_quota *quota,
- struct damos_watermarks *wmarks);
+struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
+ enum damos_action action, struct damos_quota *quota,
+ struct damos_watermarks *wmarks);
void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
void damon_destroy_scheme(struct damos *s);
@@ -519,10 +546,8 @@ unsigned int damon_nr_regions(struct damon_target *t);
struct damon_ctx *damon_new_ctx(void);
void damon_destroy_ctx(struct damon_ctx *ctx);
-int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
- unsigned long aggr_int, unsigned long ops_upd_int,
- unsigned long min_nr_reg, unsigned long max_nr_reg);
-int damon_set_schemes(struct damon_ctx *ctx,
+int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs);
+void damon_set_schemes(struct damon_ctx *ctx,
struct damos **schemes, ssize_t nr_schemes);
int damon_nr_running_ctxs(void);
bool damon_is_registered_ops(enum damon_ops_id id);
@@ -538,6 +563,9 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx)
int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
+int damon_set_region_biggest_system_ram_default(struct damon_target *t,
+ unsigned long *start, unsigned long *end);
+
#endif /* CONFIG_DAMON */
#endif /* _DAMON_H */
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 54d46518c481..6b351e009f59 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -16,6 +16,7 @@
#include <linux/wait.h>
struct path;
+struct file;
struct vfsmount;
/*
@@ -250,7 +251,7 @@ extern struct dentry * d_make_root(struct inode *);
/* <clickety>-<click> the ramfs-type tree */
extern void d_genocide(struct dentry *);
-extern void d_tmpfile(struct dentry *, struct inode *);
+extern void d_tmpfile(struct file *, struct inode *);
extern struct dentry *d_find_alias(struct inode *);
extern void d_prune_aliases(struct inode *);
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 58aea2d7385c..0da97dba9ef8 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -73,8 +73,8 @@ extern int delayacct_add_tsk(struct taskstats *, struct task_struct *);
extern __u64 __delayacct_blkio_ticks(struct task_struct *);
extern void __delayacct_freepages_start(void);
extern void __delayacct_freepages_end(void);
-extern void __delayacct_thrashing_start(void);
-extern void __delayacct_thrashing_end(void);
+extern void __delayacct_thrashing_start(bool *in_thrashing);
+extern void __delayacct_thrashing_end(bool *in_thrashing);
extern void __delayacct_swapin_start(void);
extern void __delayacct_swapin_end(void);
extern void __delayacct_compact_start(void);
@@ -143,22 +143,22 @@ static inline void delayacct_freepages_end(void)
__delayacct_freepages_end();
}
-static inline void delayacct_thrashing_start(void)
+static inline void delayacct_thrashing_start(bool *in_thrashing)
{
if (!static_branch_unlikely(&delayacct_key))
return;
if (current->delays)
- __delayacct_thrashing_start();
+ __delayacct_thrashing_start(in_thrashing);
}
-static inline void delayacct_thrashing_end(void)
+static inline void delayacct_thrashing_end(bool *in_thrashing)
{
if (!static_branch_unlikely(&delayacct_key))
return;
if (current->delays)
- __delayacct_thrashing_end();
+ __delayacct_thrashing_end(in_thrashing);
}
static inline void delayacct_swapin_start(void)
@@ -237,9 +237,9 @@ static inline void delayacct_freepages_start(void)
{}
static inline void delayacct_freepages_end(void)
{}
-static inline void delayacct_thrashing_start(void)
+static inline void delayacct_thrashing_start(bool *in_thrashing)
{}
-static inline void delayacct_thrashing_end(void)
+static inline void delayacct_thrashing_end(bool *in_thrashing)
{}
static inline void delayacct_swapin_start(void)
{}
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index 34aab4dd336c..4dc7cda4fd46 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -152,8 +152,8 @@ struct devfreq_stats {
* @max_state: count of entry present in the frequency table.
* @previous_freq: previously configured frequency value.
* @last_status: devfreq user device info, performance statistics
- * @data: Private data of the governor. The devfreq framework does not
- * touch this.
+ * @data: devfreq driver pass to governors, governor should not change it.
+ * @governor_data: private data for governors, devfreq core doesn't touch it.
* @user_min_freq_req: PM QoS minimum frequency request from user (via sysfs)
* @user_max_freq_req: PM QoS maximum frequency request from user (via sysfs)
* @scaling_min_freq: Limit minimum frequency requested by OPP interface
@@ -193,7 +193,8 @@ struct devfreq {
unsigned long previous_freq;
struct devfreq_dev_status last_status;
- void *data; /* private data for governors */
+ void *data;
+ void *governor_data;
struct dev_pm_qos_request user_min_freq_req;
struct dev_pm_qos_request user_max_freq_req;
diff --git a/include/linux/device.h b/include/linux/device.h
index 424b55df0272..c90a444be1c4 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -378,10 +378,8 @@ struct dev_links_info {
* @data: Pointer to MSI device data
*/
struct dev_msi_info {
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
- struct irq_domain *domain;
-#endif
#ifdef CONFIG_GENERIC_MSI_IRQ
+ struct irq_domain *domain;
struct msi_device_data *data;
#endif
};
@@ -742,7 +740,7 @@ static inline void set_dev_node(struct device *dev, int node)
static inline struct irq_domain *dev_get_msi_domain(const struct device *dev)
{
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+#ifdef CONFIG_GENERIC_MSI_IRQ
return dev->msi.domain;
#else
return NULL;
@@ -751,7 +749,7 @@ static inline struct irq_domain *dev_get_msi_domain(const struct device *dev)
static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d)
{
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+#ifdef CONFIG_GENERIC_MSI_IRQ
dev->msi.domain = d;
#endif
}
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
deleted file mode 100644
index 24607dc3c2ac..000000000000
--- a/include/linux/dma-iommu.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2014-2015 ARM Ltd.
- */
-#ifndef __DMA_IOMMU_H
-#define __DMA_IOMMU_H
-
-#include <linux/errno.h>
-#include <linux/types.h>
-
-#ifdef CONFIG_IOMMU_DMA
-#include <linux/dma-mapping.h>
-#include <linux/iommu.h>
-#include <linux/msi.h>
-
-/* Domain management interface for IOMMU drivers */
-int iommu_get_dma_cookie(struct iommu_domain *domain);
-int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
-void iommu_put_dma_cookie(struct iommu_domain *domain);
-
-/* Setup call for arch DMA mapping code */
-void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit);
-int iommu_dma_init_fq(struct iommu_domain *domain);
-
-/* The DMA API isn't _quite_ the whole story, though... */
-/*
- * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU device
- *
- * The MSI page will be stored in @desc.
- *
- * Return: 0 on success otherwise an error describing the failure.
- */
-int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr);
-
-/* Update the MSI message if required. */
-void iommu_dma_compose_msi_msg(struct msi_desc *desc,
- struct msi_msg *msg);
-
-void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list);
-
-void iommu_dma_free_cpu_cached_iovas(unsigned int cpu,
- struct iommu_domain *domain);
-
-extern bool iommu_dma_forcedac;
-
-#else /* CONFIG_IOMMU_DMA */
-
-struct iommu_domain;
-struct msi_desc;
-struct msi_msg;
-struct device;
-
-static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base,
- u64 dma_limit)
-{
-}
-
-static inline int iommu_dma_init_fq(struct iommu_domain *domain)
-{
- return -EINVAL;
-}
-
-static inline int iommu_get_dma_cookie(struct iommu_domain *domain)
-{
- return -ENODEV;
-}
-
-static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
-{
- return -ENODEV;
-}
-
-static inline void iommu_put_dma_cookie(struct iommu_domain *domain)
-{
-}
-
-static inline int iommu_dma_prepare_msi(struct msi_desc *desc,
- phys_addr_t msi_addr)
-{
- return 0;
-}
-
-static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc,
- struct msi_msg *msg)
-{
-}
-
-static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
-{
-}
-
-#endif /* CONFIG_IOMMU_DMA */
-#endif /* __DMA_IOMMU_H */
diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h
index 50be7cbd93a5..b1b5720d89a5 100644
--- a/include/linux/dsa/tag_qca.h
+++ b/include/linux/dsa/tag_qca.h
@@ -61,9 +61,9 @@ struct sk_buff;
/* Special struct emulating a Ethernet header */
struct qca_mgmt_ethhdr {
- u32 command; /* command bit 31:0 */
- u32 seq; /* seq 63:32 */
- u32 mdio_data; /* first 4byte mdio */
+ __le32 command; /* command bit 31:0 */
+ __le32 seq; /* seq 63:32 */
+ __le32 mdio_data; /* first 4byte mdio */
__be16 hdr; /* qca hdr */
} __packed;
@@ -73,7 +73,7 @@ enum mdio_cmd {
};
struct mib_ethhdr {
- u32 data[3]; /* first 3 mib counter */
+ __le32 data[3]; /* first 3 mib counter */
__be16 hdr; /* qca hdr */
} __packed;
diff --git a/include/linux/efi.h b/include/linux/efi.h
index da3974bf05d3..7603fc58c47c 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -389,6 +389,7 @@ void efi_native_runtime_setup(void);
#define EFI_LOAD_FILE2_PROTOCOL_GUID EFI_GUID(0x4006c0c1, 0xfcb3, 0x403e, 0x99, 0x6d, 0x4a, 0x6c, 0x87, 0x24, 0xe0, 0x6d)
#define EFI_RT_PROPERTIES_TABLE_GUID EFI_GUID(0xeb66918a, 0x7eef, 0x402a, 0x84, 0x2e, 0x93, 0x1d, 0x21, 0xc3, 0x8a, 0xe9)
#define EFI_DXE_SERVICES_TABLE_GUID EFI_GUID(0x05ad34ba, 0x6f02, 0x4214, 0x95, 0x2e, 0x4d, 0xa0, 0x39, 0x8e, 0x2b, 0xb9)
+#define EFI_SMBIOS_PROTOCOL_GUID EFI_GUID(0x03583ff6, 0xcb36, 0x4940, 0x94, 0x7e, 0xb9, 0xb3, 0x9f, 0x4a, 0xfa, 0xf7)
#define EFI_IMAGE_SECURITY_DATABASE_GUID EFI_GUID(0xd719b2cb, 0x3d3a, 0x4596, 0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f)
#define EFI_SHIM_LOCK_GUID EFI_GUID(0x605dab50, 0xe046, 0x4300, 0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23)
@@ -1085,9 +1086,6 @@ efi_status_t efivar_set_variable_locked(efi_char16_t *name, efi_guid_t *vendor,
efi_status_t efivar_set_variable(efi_char16_t *name, efi_guid_t *vendor,
u32 attr, unsigned long data_size, void *data);
-efi_status_t check_var_size(u32 attributes, unsigned long size);
-efi_status_t check_var_size_nonblocking(u32 attributes, unsigned long size);
-
#if IS_ENABLED(CONFIG_EFI_CAPSULE_LOADER)
extern bool efi_capsule_pending(int *reset_type);
@@ -1225,7 +1223,7 @@ efi_status_t efi_random_get_seed(void);
arch_efi_call_virt_teardown(); \
})
-#define EFI_RANDOM_SEED_SIZE 64U
+#define EFI_RANDOM_SEED_SIZE 32U // BLAKE2S_HASH_SIZE
struct linux_efi_random_seed {
u32 size;
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 84a466b176cf..d95ab85f96ba 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -253,7 +253,6 @@ static __always_inline void arch_exit_to_user_mode(void) { }
/**
* arch_do_signal_or_restart - Architecture specific signal delivery function
* @regs: Pointer to currents pt_regs
- * @has_signal: actual signal to handle
*
* Invoked from exit_to_user_mode_loop().
*/
diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h
index c2b1d4fd5987..fe7e6ba918f1 100644
--- a/include/linux/export-internal.h
+++ b/include/linux/export-internal.h
@@ -10,8 +10,10 @@
#include <linux/compiler.h>
#include <linux/types.h>
-/* __used is needed to keep __crc_* for LTO */
#define SYMBOL_CRC(sym, crc, sec) \
- u32 __section("___kcrctab" sec "+" #sym) __used __crc_##sym = crc
+ asm(".section \"___kcrctab" sec "+" #sym "\",\"a\"" "\n" \
+ "__crc_" #sym ":" "\n" \
+ ".long " #crc "\n" \
+ ".previous" "\n")
#endif /* __LINUX_EXPORT_INTERNAL_H__ */
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index d445150c5350..ee0d75d9a302 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -73,6 +73,42 @@ struct f2fs_device {
__le32 total_segments;
} __packed;
+/* reason of stop_checkpoint */
+enum stop_cp_reason {
+ STOP_CP_REASON_SHUTDOWN,
+ STOP_CP_REASON_FAULT_INJECT,
+ STOP_CP_REASON_META_PAGE,
+ STOP_CP_REASON_WRITE_FAIL,
+ STOP_CP_REASON_CORRUPTED_SUMMARY,
+ STOP_CP_REASON_UPDATE_INODE,
+ STOP_CP_REASON_FLUSH_FAIL,
+ STOP_CP_REASON_MAX,
+};
+
+#define MAX_STOP_REASON 32
+
+/* detail reason for EFSCORRUPTED */
+enum f2fs_error {
+ ERROR_CORRUPTED_CLUSTER,
+ ERROR_FAIL_DECOMPRESSION,
+ ERROR_INVALID_BLKADDR,
+ ERROR_CORRUPTED_DIRENT,
+ ERROR_CORRUPTED_INODE,
+ ERROR_INCONSISTENT_SUMMARY,
+ ERROR_INCONSISTENT_FOOTER,
+ ERROR_INCONSISTENT_SUM_TYPE,
+ ERROR_CORRUPTED_JOURNAL,
+ ERROR_INCONSISTENT_NODE_COUNT,
+ ERROR_INCONSISTENT_BLOCK_COUNT,
+ ERROR_INVALID_CURSEG,
+ ERROR_INCONSISTENT_SIT,
+ ERROR_CORRUPTED_VERITY_XATTR,
+ ERROR_CORRUPTED_XATTR,
+ ERROR_MAX,
+};
+
+#define MAX_F2FS_ERRORS 16
+
struct f2fs_super_block {
__le32 magic; /* Magic Number */
__le16 major_ver; /* Major Version */
@@ -116,7 +152,9 @@ struct f2fs_super_block {
__u8 hot_ext_count; /* # of hot file extension */
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
- __u8 reserved[306]; /* valid reserved region */
+ __u8 s_stop_reason[MAX_STOP_REASON]; /* stop checkpoint reason */
+ __u8 s_errors[MAX_F2FS_ERRORS]; /* reason of image corrupts */
+ __u8 reserved[258]; /* valid reserved region */
__le32 crc; /* checksum of superblock */
} __packed;
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 9f6e25467844..444236dadcf0 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -20,7 +20,6 @@ struct fault_attr {
atomic_t space;
unsigned long verbose;
bool task_filter;
- bool no_warn;
unsigned long stacktrace_depth;
unsigned long require_start;
unsigned long require_end;
@@ -32,6 +31,10 @@ struct fault_attr {
struct dentry *dname;
};
+enum fault_flags {
+ FAULT_NOWARN = 1 << 0,
+};
+
#define FAULT_ATTR_INITIALIZER { \
.interval = 1, \
.times = ATOMIC_INIT(1), \
@@ -40,11 +43,11 @@ struct fault_attr {
.ratelimit_state = RATELIMIT_STATE_INIT_DISABLED, \
.verbose = 2, \
.dname = NULL, \
- .no_warn = false, \
}
#define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER
int setup_fault_attr(struct fault_attr *attr, char *str);
+bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags);
bool should_fail(struct fault_attr *attr, ssize_t size);
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 0aff76bcbb00..bcb8658f5b64 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -555,7 +555,7 @@ static inline struct apertures_struct *alloc_apertures(unsigned int max_num) {
#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || \
defined(__hppa__) || defined(__sh__) || defined(__powerpc__) || \
- defined(__arm__) || defined(__aarch64__)
+ defined(__arm__) || defined(__aarch64__) || defined(__mips__)
#define fb_readb __raw_readb
#define fb_readw __raw_readw
diff --git a/include/linux/find.h b/include/linux/find.h
index 424ef67d4a42..ccaf61a0f5fd 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -8,15 +8,33 @@
#include <linux/bitops.h>
-extern unsigned long _find_next_bit(const unsigned long *addr1,
- const unsigned long *addr2, unsigned long nbits,
- unsigned long start, unsigned long invert, unsigned long le);
+unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits,
+ unsigned long start);
+unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long nbits, unsigned long start);
+unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long nbits, unsigned long start);
+unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
+ unsigned long start);
extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
+unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n);
+unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long size, unsigned long n);
+unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long size, unsigned long n);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size);
extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
+#ifdef __BIG_ENDIAN
+unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size);
+unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned
+ long size, unsigned long offset);
+unsigned long _find_next_bit_le(const unsigned long *addr, unsigned
+ long size, unsigned long offset);
+#endif
+
#ifndef find_next_bit
/**
* find_next_bit - find the next set bit in a memory region
@@ -41,7 +59,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
return val ? __ffs(val) : size;
}
- return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+ return _find_next_bit(addr, size, offset);
}
#endif
@@ -71,7 +89,38 @@ unsigned long find_next_and_bit(const unsigned long *addr1,
return val ? __ffs(val) : size;
}
- return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+ return _find_next_and_bit(addr1, addr2, size, offset);
+}
+#endif
+
+#ifndef find_next_andnot_bit
+/**
+ * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits
+ * in *addr2
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_next_andnot_bit(const unsigned long *addr1,
+ const unsigned long *addr2, unsigned long size,
+ unsigned long offset)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val;
+
+ if (unlikely(offset >= size))
+ return size;
+
+ val = *addr1 & ~*addr2 & GENMASK(size - 1, offset);
+ return val ? __ffs(val) : size;
+ }
+
+ return _find_next_andnot_bit(addr1, addr2, size, offset);
}
#endif
@@ -99,7 +148,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
return val == ~0UL ? size : ffz(val);
}
- return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+ return _find_next_zero_bit(addr, size, offset);
}
#endif
@@ -125,6 +174,87 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
}
#endif
+/**
+ * find_nth_bit - find N'th set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum number of bits to search
+ * @n: The number of set bit, which position is needed, counting from 0
+ *
+ * The following is semantically equivalent:
+ * idx = find_nth_bit(addr, size, 0);
+ * idx = find_first_bit(addr, size);
+ *
+ * Returns the bit number of the N'th set bit.
+ * If no such, returns @size.
+ */
+static inline
+unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n)
+{
+ if (n >= size)
+ return size;
+
+ if (small_const_nbits(size)) {
+ unsigned long val = *addr & GENMASK(size - 1, 0);
+
+ return val ? fns(val, n) : size;
+ }
+
+ return __find_nth_bit(addr, size, n);
+}
+
+/**
+ * find_nth_and_bit - find N'th set bit in 2 memory regions
+ * @addr1: The 1st address to start the search at
+ * @addr2: The 2nd address to start the search at
+ * @size: The maximum number of bits to search
+ * @n: The number of set bit, which position is needed, counting from 0
+ *
+ * Returns the bit number of the N'th set bit.
+ * If no such, returns @size.
+ */
+static inline
+unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long size, unsigned long n)
+{
+ if (n >= size)
+ return size;
+
+ if (small_const_nbits(size)) {
+ unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0);
+
+ return val ? fns(val, n) : size;
+ }
+
+ return __find_nth_and_bit(addr1, addr2, size, n);
+}
+
+/**
+ * find_nth_andnot_bit - find N'th set bit in 2 memory regions,
+ * flipping bits in 2nd region
+ * @addr1: The 1st address to start the search at
+ * @addr2: The 2nd address to start the search at
+ * @size: The maximum number of bits to search
+ * @n: The number of set bit, which position is needed, counting from 0
+ *
+ * Returns the bit number of the N'th set bit.
+ * If no such, returns @size.
+ */
+static inline
+unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long size, unsigned long n)
+{
+ if (n >= size)
+ return size;
+
+ if (small_const_nbits(size)) {
+ unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0);
+
+ return val ? fns(val, n) : size;
+ }
+
+ return __find_nth_andnot_bit(addr1, addr2, size, n);
+}
+
#ifndef find_first_and_bit
/**
* find_first_and_bit - find the first set bit in both memory regions
@@ -194,6 +324,78 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
#endif
/**
+ * find_next_and_bit_wrap - find the next set bit in both memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number for the next set bit, or first set bit up to @offset
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_next_and_bit_wrap(const unsigned long *addr1,
+ const unsigned long *addr2,
+ unsigned long size, unsigned long offset)
+{
+ unsigned long bit = find_next_and_bit(addr1, addr2, size, offset);
+
+ if (bit < size)
+ return bit;
+
+ bit = find_first_and_bit(addr1, addr2, offset);
+ return bit < offset ? bit : size;
+}
+
+/**
+ * find_next_bit_wrap - find the next set bit in both memory regions
+ * @addr: The first address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number for the next set bit, or first set bit up to @offset
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_next_bit_wrap(const unsigned long *addr,
+ unsigned long size, unsigned long offset)
+{
+ unsigned long bit = find_next_bit(addr, size, offset);
+
+ if (bit < size)
+ return bit;
+
+ bit = find_first_bit(addr, offset);
+ return bit < offset ? bit : size;
+}
+
+/*
+ * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing
+ * before using it alone.
+ */
+static inline
+unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size,
+ unsigned long start, unsigned long n)
+{
+ unsigned long bit;
+
+ /* If not wrapped around */
+ if (n > start) {
+ /* and have a bit, just return it. */
+ bit = find_next_bit(bitmap, size, n);
+ if (bit < size)
+ return bit;
+
+ /* Otherwise, wrap around and ... */
+ n = 0;
+ }
+
+ /* Search the other part. */
+ bit = find_next_bit(bitmap, start, n);
+ return bit < start ? bit : size;
+}
+
+/**
* find_next_clump8 - find next 8-bit clump with set bits in a memory region
* @clump: location to store copy of found clump
* @addr: address to base the search on
@@ -247,7 +449,21 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned
return val == ~0UL ? size : ffz(val);
}
- return _find_next_bit(addr, NULL, size, offset, ~0UL, 1);
+ return _find_next_zero_bit_le(addr, size, offset);
+}
+#endif
+
+#ifndef find_first_zero_bit_le
+static inline
+unsigned long find_first_zero_bit_le(const void *addr, unsigned long size)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0);
+
+ return val == ~0UL ? size : ffz(val);
+ }
+
+ return _find_first_zero_bit_le(addr, size);
}
#endif
@@ -266,40 +482,39 @@ unsigned long find_next_bit_le(const void *addr, unsigned
return val ? __ffs(val) : size;
}
- return _find_next_bit(addr, NULL, size, offset, 0UL, 1);
+ return _find_next_bit_le(addr, size, offset);
}
#endif
-#ifndef find_first_zero_bit_le
-#define find_first_zero_bit_le(addr, size) \
- find_next_zero_bit_le((addr), (size), 0)
-#endif
-
#else
#error "Please fix <asm/byteorder.h>"
#endif
#define for_each_set_bit(bit, addr, size) \
- for ((bit) = find_next_bit((addr), (size), 0); \
- (bit) < (size); \
- (bit) = find_next_bit((addr), (size), (bit) + 1))
+ for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
+
+#define for_each_and_bit(bit, addr1, addr2, size) \
+ for ((bit) = 0; \
+ (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
+ (bit)++)
+
+#define for_each_andnot_bit(bit, addr1, addr2, size) \
+ for ((bit) = 0; \
+ (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
+ (bit)++)
/* same as for_each_set_bit() but use bit as value to start with */
#define for_each_set_bit_from(bit, addr, size) \
- for ((bit) = find_next_bit((addr), (size), (bit)); \
- (bit) < (size); \
- (bit) = find_next_bit((addr), (size), (bit) + 1))
+ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
#define for_each_clear_bit(bit, addr, size) \
- for ((bit) = find_next_zero_bit((addr), (size), 0); \
- (bit) < (size); \
- (bit) = find_next_zero_bit((addr), (size), (bit) + 1))
+ for ((bit) = 0; \
+ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \
+ (bit)++)
/* same as for_each_clear_bit() but use bit as value to start with */
#define for_each_clear_bit_from(bit, addr, size) \
- for ((bit) = find_next_zero_bit((addr), (size), (bit)); \
- (bit) < (size); \
- (bit) = find_next_zero_bit((addr), (size), (bit) + 1))
+ for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
/**
* for_each_set_bitrange - iterate over all set bit ranges [b; e)
@@ -309,11 +524,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned
* @size: bitmap size in number of bits
*/
#define for_each_set_bitrange(b, e, addr, size) \
- for ((b) = find_next_bit((addr), (size), 0), \
- (e) = find_next_zero_bit((addr), (size), (b) + 1); \
+ for ((b) = 0; \
+ (b) = find_next_bit((addr), (size), b), \
+ (e) = find_next_zero_bit((addr), (size), (b) + 1), \
(b) < (size); \
- (b) = find_next_bit((addr), (size), (e) + 1), \
- (e) = find_next_zero_bit((addr), (size), (b) + 1))
+ (b) = (e) + 1)
/**
* for_each_set_bitrange_from - iterate over all set bit ranges [b; e)
@@ -323,11 +538,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned
* @size: bitmap size in number of bits
*/
#define for_each_set_bitrange_from(b, e, addr, size) \
- for ((b) = find_next_bit((addr), (size), (b)), \
- (e) = find_next_zero_bit((addr), (size), (b) + 1); \
+ for (; \
+ (b) = find_next_bit((addr), (size), (b)), \
+ (e) = find_next_zero_bit((addr), (size), (b) + 1), \
(b) < (size); \
- (b) = find_next_bit((addr), (size), (e) + 1), \
- (e) = find_next_zero_bit((addr), (size), (b) + 1))
+ (b) = (e) + 1)
/**
* for_each_clear_bitrange - iterate over all unset bit ranges [b; e)
@@ -337,11 +552,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned
* @size: bitmap size in number of bits
*/
#define for_each_clear_bitrange(b, e, addr, size) \
- for ((b) = find_next_zero_bit((addr), (size), 0), \
- (e) = find_next_bit((addr), (size), (b) + 1); \
+ for ((b) = 0; \
+ (b) = find_next_zero_bit((addr), (size), (b)), \
+ (e) = find_next_bit((addr), (size), (b) + 1), \
(b) < (size); \
- (b) = find_next_zero_bit((addr), (size), (e) + 1), \
- (e) = find_next_bit((addr), (size), (b) + 1))
+ (b) = (e) + 1)
/**
* for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e)
@@ -351,11 +566,24 @@ unsigned long find_next_bit_le(const void *addr, unsigned
* @size: bitmap size in number of bits
*/
#define for_each_clear_bitrange_from(b, e, addr, size) \
- for ((b) = find_next_zero_bit((addr), (size), (b)), \
- (e) = find_next_bit((addr), (size), (b) + 1); \
+ for (; \
+ (b) = find_next_zero_bit((addr), (size), (b)), \
+ (e) = find_next_bit((addr), (size), (b) + 1), \
(b) < (size); \
- (b) = find_next_zero_bit((addr), (size), (e) + 1), \
- (e) = find_next_bit((addr), (size), (b) + 1))
+ (b) = (e) + 1)
+
+/**
+ * for_each_set_bit_wrap - iterate over all set bits starting from @start, and
+ * wrapping around the end of bitmap.
+ * @bit: offset for current iteration
+ * @addr: bitmap address to base the search on
+ * @size: bitmap size in number of bits
+ * @start: Starting bit for bitmap traversing, wrapping around the bitmap end
+ */
+#define for_each_set_bit_wrap(bit, addr, size, start) \
+ for ((bit) = find_next_bit_wrap((addr), (size), (start)); \
+ (bit) < (size); \
+ (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))
/**
* for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index b62c90cfafaf..1067a8450826 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -43,11 +43,24 @@ extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);
#else
-#define __underlying_memchr __builtin_memchr
-#define __underlying_memcmp __builtin_memcmp
+
+#if defined(__SANITIZE_MEMORY__)
+/*
+ * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the
+ * corresponding __msan_XXX functions.
+ */
+#include <linux/kmsan_string.h>
+#define __underlying_memcpy __msan_memcpy
+#define __underlying_memmove __msan_memmove
+#define __underlying_memset __msan_memset
+#else
#define __underlying_memcpy __builtin_memcpy
#define __underlying_memmove __builtin_memmove
#define __underlying_memset __builtin_memset
+#endif
+
+#define __underlying_memchr __builtin_memchr
+#define __underlying_memcmp __builtin_memcmp
#define __underlying_strcat __builtin_strcat
#define __underlying_strcpy __builtin_strcpy
#define __underlying_strlen __builtin_strlen
@@ -328,8 +341,10 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size,
* __struct_size() vs __member_size() must be captured here to avoid
* evaluating argument side-effects further into the macro layers.
*/
+#ifndef CONFIG_KMSAN
#define memset(p, c, s) __fortify_memset_chk(p, c, s, \
__struct_size(p), __member_size(p))
+#endif
/*
* To make sure the compiler can enforce protection against buffer overflows,
@@ -439,13 +454,18 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
#define __fortify_memcpy_chk(p, q, size, p_size, q_size, \
p_size_field, q_size_field, op) ({ \
- size_t __fortify_size = (size_t)(size); \
- WARN_ONCE(fortify_memcpy_chk(__fortify_size, p_size, q_size, \
- p_size_field, q_size_field, #op), \
+ const size_t __fortify_size = (size_t)(size); \
+ const size_t __p_size = (p_size); \
+ const size_t __q_size = (q_size); \
+ const size_t __p_size_field = (p_size_field); \
+ const size_t __q_size_field = (q_size_field); \
+ WARN_ONCE(fortify_memcpy_chk(__fortify_size, __p_size, \
+ __q_size, __p_size_field, \
+ __q_size_field, #op), \
#op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \
__fortify_size, \
"field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \
- p_size_field); \
+ __p_size_field); \
__underlying_##op(p, q, __fortify_size); \
})
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 619d683eb5fd..71b5c24b55ce 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1170,6 +1170,7 @@ extern int locks_delete_block(struct file_lock *);
extern int vfs_test_lock(struct file *, struct file_lock *);
extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
+bool vfs_inode_has_locks(struct inode *inode);
extern int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);
extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
extern void lease_get_mtime(struct inode *, struct timespec64 *time);
@@ -1186,6 +1187,13 @@ extern void show_fd_locks(struct seq_file *f,
struct file *filp, struct files_struct *files);
extern bool locks_owner_has_blockers(struct file_lock_context *flctx,
fl_owner_t owner);
+
+static inline struct file_lock_context *
+locks_inode_context(const struct inode *inode)
+{
+ return smp_load_acquire(&inode->i_flctx);
+}
+
#else /* !CONFIG_FILE_LOCKING */
static inline int fcntl_getlk(struct file *file, unsigned int cmd,
struct flock __user *user)
@@ -1284,6 +1292,11 @@ static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
return 0;
}
+static inline bool vfs_inode_has_locks(struct inode *inode)
+{
+ return false;
+}
+
static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
return -ENOLCK;
@@ -1326,6 +1339,13 @@ static inline bool locks_owner_has_blockers(struct file_lock_context *flctx,
{
return false;
}
+
+static inline struct file_lock_context *
+locks_inode_context(const struct inode *inode)
+{
+ return NULL;
+}
+
#endif /* !CONFIG_FILE_LOCKING */
static inline struct inode *file_inode(const struct file *f)
@@ -2004,8 +2024,9 @@ static inline int vfs_whiteout(struct user_namespace *mnt_userns,
WHITEOUT_DEV);
}
-struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
- struct dentry *dentry, umode_t mode, int open_flag);
+struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns,
+ const struct path *parentpath,
+ umode_t mode, int open_flag, const struct cred *cred);
int vfs_mkobj(struct dentry *, umode_t,
int (*f)(struct dentry *, umode_t, void *),
@@ -2088,6 +2109,14 @@ struct dir_context {
*/
#define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN)
+/*
+ * These flags control the behavior of vfs_copy_file_range().
+ * They are not available to the user via syscall.
+ *
+ * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops
+ */
+#define COPY_FILE_SPLICE (1 << 0)
+
struct iov_iter;
struct io_uring_cmd;
@@ -2170,7 +2199,7 @@ struct inode_operations {
struct file *, unsigned open_flag,
umode_t create_mode);
int (*tmpfile) (struct user_namespace *, struct inode *,
- struct dentry *, umode_t);
+ struct file *, umode_t);
int (*set_acl)(struct user_namespace *, struct inode *,
struct posix_acl *, int);
int (*fileattr_set)(struct user_namespace *mnt_userns,
@@ -2783,6 +2812,15 @@ extern int finish_open(struct file *file, struct dentry *dentry,
int (*open)(struct inode *, struct file *));
extern int finish_no_open(struct file *file, struct dentry *dentry);
+/* Helper for the simple case when original dentry is used */
+static inline int finish_open_simple(struct file *file, int error)
+{
+ if (error)
+ return error;
+
+ return finish_open(file, file->f_path.dentry, NULL);
+}
+
/* fs/dcache.c */
extern void __init vfs_caches_init_early(void);
extern void __init vfs_caches_init(void);
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 36e5dd84cf59..8e312c8323a8 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -75,7 +75,7 @@ struct fscache_volume {
atomic_t n_accesses; /* Number of cache accesses in progress */
unsigned int debug_id;
unsigned int key_hash; /* Hash of key string */
- char *key; /* Volume ID, eg. "afs@example.com@1234" */
+ u8 *key; /* Volume ID, eg. "afs@example.com@1234" */
struct list_head proc_link; /* Link in /proc/fs/fscache/volumes */
struct hlist_bl_node hash_link; /* Link in hash table */
struct work_struct work;
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index cad78b569c7e..4f5f8a651213 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -307,7 +307,7 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
}
/* keyring.c */
-void fscrypt_sb_delete(struct super_block *sb);
+void fscrypt_destroy_keyring(struct super_block *sb);
int fscrypt_ioctl_add_key(struct file *filp, void __user *arg);
int fscrypt_add_test_dummy_key(struct super_block *sb,
const struct fscrypt_dummy_policy *dummy_policy);
@@ -521,7 +521,7 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
}
/* keyring.c */
-static inline void fscrypt_sb_delete(struct super_block *sb)
+static inline void fscrypt_destroy_keyring(struct super_block *sb)
{
}
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 0b61371e287b..99f1146614c0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -37,9 +37,10 @@ extern void ftrace_boot_snapshot(void);
static inline void ftrace_boot_snapshot(void) { }
#endif
-#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops;
struct ftrace_regs;
+
+#ifdef CONFIG_FUNCTION_TRACER
/*
* If the arch's mcount caller does not support all of ftrace's
* features, then it must call an indirect function that
@@ -110,12 +111,11 @@ struct ftrace_regs {
#define arch_ftrace_get_regs(fregs) (&(fregs)->regs)
/*
- * ftrace_instruction_pointer_set() is to be defined by the architecture
- * if to allow setting of the instruction pointer from the ftrace_regs
- * when HAVE_DYNAMIC_FTRACE_WITH_ARGS is set and it supports
- * live kernel patching.
+ * ftrace_regs_set_instruction_pointer() is to be defined by the architecture
+ * if to allow setting of the instruction pointer from the ftrace_regs when
+ * HAVE_DYNAMIC_FTRACE_WITH_ARGS is set and it supports live kernel patching.
*/
-#define ftrace_instruction_pointer_set(fregs, ip) do { } while (0)
+#define ftrace_regs_set_instruction_pointer(fregs, ip) do { } while (0)
#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs)
@@ -126,6 +126,35 @@ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs
return arch_ftrace_get_regs(fregs);
}
+/*
+ * When true, the ftrace_regs_{get,set}_*() functions may be used on fregs.
+ * Note: this can be true even when ftrace_get_regs() cannot provide a pt_regs.
+ */
+static __always_inline bool ftrace_regs_has_args(struct ftrace_regs *fregs)
+{
+ if (IS_ENABLED(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS))
+ return true;
+
+ return ftrace_get_regs(fregs) != NULL;
+}
+
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+#define ftrace_regs_get_instruction_pointer(fregs) \
+ instruction_pointer(ftrace_get_regs(fregs))
+#define ftrace_regs_get_argument(fregs, n) \
+ regs_get_kernel_argument(ftrace_get_regs(fregs), n)
+#define ftrace_regs_get_stack_pointer(fregs) \
+ kernel_stack_pointer(ftrace_get_regs(fregs))
+#define ftrace_regs_return_value(fregs) \
+ regs_return_value(ftrace_get_regs(fregs))
+#define ftrace_regs_set_return_value(fregs, ret) \
+ regs_set_return_value(ftrace_get_regs(fregs), ret)
+#define ftrace_override_function_with_return(fregs) \
+ override_function_with_return(ftrace_get_regs(fregs))
+#define ftrace_regs_query_register_offset(name) \
+ regs_query_register_offset(name)
+#endif
+
typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
@@ -427,9 +456,7 @@ static inline int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsi
{
return -ENODEV;
}
-#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
-#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
/*
* This must be implemented by the architecture.
* It is the way the ftrace direct_ops helper, when called
@@ -443,9 +470,9 @@ static inline int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsi
* the return from the trampoline jump to the direct caller
* instead of going back to the function it just traced.
*/
-static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs,
+static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs,
unsigned long addr) { }
-#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
#ifdef CONFIG_STACK_TRACER
@@ -1122,47 +1149,6 @@ static inline void unpause_graph_tracing(void) { }
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
#ifdef CONFIG_TRACING
-
-/* flags for current->trace */
-enum {
- TSK_TRACE_FL_TRACE_BIT = 0,
- TSK_TRACE_FL_GRAPH_BIT = 1,
-};
-enum {
- TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT,
- TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT,
-};
-
-static inline void set_tsk_trace_trace(struct task_struct *tsk)
-{
- set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
-}
-
-static inline void clear_tsk_trace_trace(struct task_struct *tsk)
-{
- clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
-}
-
-static inline int test_tsk_trace_trace(struct task_struct *tsk)
-{
- return tsk->trace & TSK_TRACE_FL_TRACE;
-}
-
-static inline void set_tsk_trace_graph(struct task_struct *tsk)
-{
- set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
-}
-
-static inline void clear_tsk_trace_graph(struct task_struct *tsk)
-{
- clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
-}
-
-static inline int test_tsk_trace_graph(struct task_struct *tsk)
-{
- return tsk->trace & TSK_TRACE_FL_GRAPH;
-}
-
enum ftrace_dump_mode;
extern enum ftrace_dump_mode ftrace_dump_on_oops;
diff --git a/include/linux/gameport.h b/include/linux/gameport.h
index 69081d899492..8c2f00018e89 100644
--- a/include/linux/gameport.h
+++ b/include/linux/gameport.h
@@ -110,7 +110,7 @@ static inline void gameport_free_port(struct gameport *gameport)
static inline void gameport_set_name(struct gameport *gameport, const char *name)
{
- strlcpy(gameport->name, name, sizeof(gameport->name));
+ strscpy(gameport->name, name, sizeof(gameport->name));
}
/*
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f314be58fa77..65a78773dcca 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -18,6 +18,9 @@ static inline int gfp_migratetype(const gfp_t gfp_flags)
VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
+ BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE);
+ BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >>
+ GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC);
if (unlikely(page_group_by_mobility_disabled))
return MIGRATE_UNMOVABLE;
@@ -33,29 +36,6 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
}
-/**
- * gfpflags_normal_context - is gfp_flags a normal sleepable context?
- * @gfp_flags: gfp_flags to test
- *
- * Test whether @gfp_flags indicates that the allocation is from the
- * %current context and allowed to sleep.
- *
- * An allocation being allowed to block doesn't mean it owns the %current
- * context. When direct reclaim path tries to allocate memory, the
- * allocation context is nested inside whatever %current was doing at the
- * time of the original allocation. The nested allocation may be allowed
- * to block but modifying anything %current owns can corrupt the outer
- * context's expectations.
- *
- * %true result from this function indicates that the allocation context
- * can sleep and use anything that's associated with %current.
- */
-static inline bool gfpflags_normal_context(const gfp_t gfp_flags)
-{
- return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) ==
- __GFP_DIRECT_RECLAIM;
-}
-
#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
@@ -230,6 +210,20 @@ alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct p
return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array);
}
+static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
+{
+ gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN);
+
+ if (warn_gfp != (__GFP_THISNODE|__GFP_NOWARN))
+ return;
+
+ if (node_online(this_node))
+ return;
+
+ pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node);
+ dump_stack();
+}
+
/*
* Allocate pages, preferring the node given as nid. The node must be valid and
* online. For more general interface, see alloc_pages_node().
@@ -238,7 +232,7 @@ static inline struct page *
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
- VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid));
+ warn_if_node_offline(nid, gfp_mask);
return __alloc_pages(gfp_mask, order, nid, NULL);
}
@@ -247,7 +241,7 @@ static inline
struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid)
{
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
- VM_WARN_ON((gfp & __GFP_THISNODE) && !node_online(nid));
+ warn_if_node_offline(nid, gfp);
return __folio_alloc(gfp, order, nid, NULL);
}
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 6aeea1071b1b..88ae4513abb5 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -27,7 +27,7 @@ struct gpio_chip;
union gpio_irq_fwspec {
struct irq_fwspec fwspec;
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+#ifdef CONFIG_GENERIC_MSI_IRQ
msi_alloc_info_t msiinfo;
#endif
};
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 25679035ca28..e9912da5441b 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -6,6 +6,7 @@
#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/cacheflush.h>
+#include <linux/kmsan.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
@@ -311,6 +312,7 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
vfrom = kmap_local_page(from);
vto = kmap_local_page(to);
copy_user_page(vto, vfrom, vaddr, to);
+ kmsan_unpoison_memory(page_address(to), PAGE_SIZE);
kunmap_local(vto);
kunmap_local(vfrom);
}
@@ -326,6 +328,7 @@ static inline void copy_highpage(struct page *to, struct page *from)
vfrom = kmap_local_page(from);
vto = kmap_local_page(to);
copy_page(vto, vfrom);
+ kmsan_copy_page_meta(to, from);
kunmap_local(vto);
kunmap_local(vfrom);
}
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 116e8bd68c99..e230c7c46110 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -87,29 +87,6 @@
#define PEH_AXUSER_CFG 0x401001
#define PEH_AXUSER_CFG_ENABLE 0xffffffff
-#define QM_AXI_RRESP BIT(0)
-#define QM_AXI_BRESP BIT(1)
-#define QM_ECC_MBIT BIT(2)
-#define QM_ECC_1BIT BIT(3)
-#define QM_ACC_GET_TASK_TIMEOUT BIT(4)
-#define QM_ACC_DO_TASK_TIMEOUT BIT(5)
-#define QM_ACC_WB_NOT_READY_TIMEOUT BIT(6)
-#define QM_SQ_CQ_VF_INVALID BIT(7)
-#define QM_CQ_VF_INVALID BIT(8)
-#define QM_SQ_VF_INVALID BIT(9)
-#define QM_DB_TIMEOUT BIT(10)
-#define QM_OF_FIFO_OF BIT(11)
-#define QM_DB_RANDOM_INVALID BIT(12)
-#define QM_MAILBOX_TIMEOUT BIT(13)
-#define QM_FLR_TIMEOUT BIT(14)
-
-#define QM_BASE_NFE (QM_AXI_RRESP | QM_AXI_BRESP | QM_ECC_MBIT | \
- QM_ACC_GET_TASK_TIMEOUT | QM_DB_TIMEOUT | \
- QM_OF_FIFO_OF | QM_DB_RANDOM_INVALID | \
- QM_MAILBOX_TIMEOUT | QM_FLR_TIMEOUT)
-#define QM_BASE_CE QM_ECC_1BIT
-
-#define QM_Q_DEPTH 1024
#define QM_MIN_QNUM 2
#define HISI_ACC_SGL_SGE_NR_MAX 255
#define QM_SHAPER_CFG 0x100164
@@ -168,6 +145,15 @@ enum qm_vf_state {
QM_NOT_READY,
};
+enum qm_cap_bits {
+ QM_SUPPORT_DB_ISOLATION = 0x0,
+ QM_SUPPORT_FUNC_QOS,
+ QM_SUPPORT_STOP_QP,
+ QM_SUPPORT_MB_COMMAND,
+ QM_SUPPORT_SVA_PREFETCH,
+ QM_SUPPORT_RPM,
+};
+
struct dfx_diff_registers {
u32 *regs;
u32 reg_offset;
@@ -232,7 +218,10 @@ struct hisi_qm_err_info {
char *acpi_rst;
u32 msi_wr_port;
u32 ecc_2bits_mask;
- u32 dev_ce_mask;
+ u32 qm_shutdown_mask;
+ u32 dev_shutdown_mask;
+ u32 qm_reset_mask;
+ u32 dev_reset_mask;
u32 ce;
u32 nfe;
u32 fe;
@@ -258,6 +247,18 @@ struct hisi_qm_err_ini {
void (*err_info_init)(struct hisi_qm *qm);
};
+struct hisi_qm_cap_info {
+ u32 type;
+ /* Register offset */
+ u32 offset;
+ /* Bit offset in register */
+ u32 shift;
+ u32 mask;
+ u32 v1_val;
+ u32 v2_val;
+ u32 v3_val;
+};
+
struct hisi_qm_list {
struct mutex lock;
struct list_head list;
@@ -278,6 +279,9 @@ struct hisi_qm {
struct pci_dev *pdev;
void __iomem *io_base;
void __iomem *db_io_base;
+
+ /* Capbility version, 0: not supports */
+ u32 cap_ver;
u32 sqe_size;
u32 qp_base;
u32 qp_num;
@@ -286,6 +290,8 @@ struct hisi_qm {
u32 max_qp_num;
u32 vfs_num;
u32 db_interval;
+ u16 eq_depth;
+ u16 aeq_depth;
struct list_head list;
struct hisi_qm_list *qm_list;
@@ -304,6 +310,8 @@ struct hisi_qm {
struct hisi_qm_err_info err_info;
struct hisi_qm_err_status err_status;
unsigned long misc_ctl; /* driver removing and reset sched */
+ /* Device capability bit */
+ unsigned long caps;
struct rw_semaphore qps_lock;
struct idr qp_idr;
@@ -326,8 +334,6 @@ struct hisi_qm {
bool use_sva;
bool is_frozen;
- /* doorbell isolation enable */
- bool use_db_isolation;
resource_size_t phys_base;
resource_size_t db_phys_base;
struct uacce_device *uacce;
@@ -351,6 +357,8 @@ struct hisi_qp_ops {
struct hisi_qp {
u32 qp_id;
+ u16 sq_depth;
+ u16 cq_depth;
u8 alg_type;
u8 req_type;
@@ -501,6 +509,9 @@ void hisi_qm_pm_init(struct hisi_qm *qm);
int hisi_qm_get_dfx_access(struct hisi_qm *qm);
void hisi_qm_put_dfx_access(struct hisi_qm *qm);
void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset);
+u32 hisi_qm_get_hw_info(struct hisi_qm *qm,
+ const struct hisi_qm_cap_info *info_table,
+ u32 index, bool is_read);
/* Used by VFIO ACC live migration driver */
struct pci_driver *hisi_sec_get_pf_driver(void);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 768e5261fdae..a1341fdcf666 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
!inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
}
-bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags,
- bool smaps, bool in_pf);
+bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
+ bool smaps, bool in_pf, bool enforce_sysfs);
#define transparent_hugepage_use_zero_page() \
(transparent_hugepage_flags & \
@@ -219,6 +218,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
int advice);
+int madvise_collapse(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end);
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
unsigned long end, long adjust_next);
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
@@ -321,8 +323,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
}
static inline bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags,
- bool smaps, bool in_pf)
+ unsigned long vm_flags, bool smaps,
+ bool in_pf, bool enforce_sysfs)
{
return false;
}
@@ -362,9 +364,16 @@ static inline void split_huge_pmd_address(struct vm_area_struct *vma,
static inline int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
- BUG();
- return 0;
+ return -EINVAL;
+}
+
+static inline int madvise_collapse(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ return -EINVAL;
}
+
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
@@ -435,6 +444,11 @@ static inline int split_folio_to_list(struct folio *folio,
return split_huge_page_to_list(&folio->page, list);
}
+static inline int split_folio(struct folio *folio)
+{
+ return split_folio_to_list(folio, NULL);
+}
+
/*
* archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to
* limitations in the implementation like arm64 MTE can override this to
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1ec1535be04f..8b4f93e84868 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -16,6 +16,7 @@
struct ctl_table;
struct user_struct;
struct mmu_gather;
+struct node;
#ifndef CONFIG_ARCH_HAS_HUGEPD
typedef struct { unsigned long pd; } hugepd_t;
@@ -114,6 +115,12 @@ struct file_region {
#endif
};
+struct hugetlb_vma_lock {
+ struct kref refs;
+ struct rw_semaphore rw_sema;
+ struct vm_area_struct *vma;
+};
+
extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);
@@ -126,7 +133,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
long min_hpages);
void hugepage_put_subpool(struct hugepage_subpool *spool);
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+void hugetlb_dup_vma_private(struct vm_area_struct *vma);
void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
@@ -207,13 +214,21 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
struct page *follow_huge_pd(struct vm_area_struct *vma,
unsigned long address, hugepd_t hpd,
int flags, int pdshift);
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int flags);
+struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address,
+ int flags);
struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int flags);
struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
pgd_t *pgd, int flags);
+void hugetlb_vma_lock_read(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
+void hugetlb_vma_lock_write(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
+void hugetlb_vma_lock_release(struct kref *kref);
+
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pud);
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -225,7 +240,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
#else /* !CONFIG_HUGETLB_PAGE */
-static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
}
@@ -312,8 +327,8 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
return NULL;
}
-static inline struct page *follow_huge_pmd(struct mm_struct *mm,
- unsigned long address, pmd_t *pmd, int flags)
+static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma,
+ unsigned long address, int flags)
{
return NULL;
}
@@ -336,6 +351,31 @@ static inline int prepare_hugepage_range(struct file *file,
return -EINVAL;
}
+static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ return 1;
+}
+
+static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
static inline int pmd_huge(pmd_t pmd)
{
return 0;
@@ -665,7 +705,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask);
struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address);
-int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx);
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
unsigned long address, struct page *page);
@@ -935,6 +975,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
}
#endif
+#ifdef CONFIG_NUMA
+void hugetlb_register_node(struct node *node);
+void hugetlb_unregister_node(struct node *node);
+#endif
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
@@ -1109,6 +1154,14 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
}
+
+static inline void hugetlb_register_node(struct node *node)
+{
+}
+
+static inline void hugetlb_unregister_node(struct node *node)
+{
+}
#endif /* CONFIG_HUGETLB_PAGE */
static inline spinlock_t *huge_pte_lock(struct hstate *h,
@@ -1123,14 +1176,10 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
extern void __init hugetlb_cma_reserve(int order);
-extern void __init hugetlb_cma_check(void);
#else
static inline __init void hugetlb_cma_reserve(int order)
{
}
-static inline __init void hugetlb_cma_check(void)
-{
-}
#endif
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 379344828e78..630cd255d0cf 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -90,32 +90,31 @@ hugetlb_cgroup_from_page_rsvd(struct page *page)
return __hugetlb_cgroup_from_page(page, true);
}
-static inline int __set_hugetlb_cgroup(struct page *page,
+static inline void __set_hugetlb_cgroup(struct page *page,
struct hugetlb_cgroup *h_cg, bool rsvd)
{
VM_BUG_ON_PAGE(!PageHuge(page), page);
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
- return -1;
+ return;
if (rsvd)
set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD,
(unsigned long)h_cg);
else
set_page_private(page + SUBPAGE_INDEX_CGROUP,
(unsigned long)h_cg);
- return 0;
}
-static inline int set_hugetlb_cgroup(struct page *page,
+static inline void set_hugetlb_cgroup(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return __set_hugetlb_cgroup(page, h_cg, false);
+ __set_hugetlb_cgroup(page, h_cg, false);
}
-static inline int set_hugetlb_cgroup_rsvd(struct page *page,
+static inline void set_hugetlb_cgroup_rsvd(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return __set_hugetlb_cgroup(page, h_cg, true);
+ __set_hugetlb_cgroup(page, h_cg, true);
}
static inline bool hugetlb_cgroup_disabled(void)
@@ -199,16 +198,14 @@ hugetlb_cgroup_from_page_rsvd(struct page *page)
return NULL;
}
-static inline int set_hugetlb_cgroup(struct page *page,
+static inline void set_hugetlb_cgroup(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return 0;
}
-static inline int set_hugetlb_cgroup_rsvd(struct page *page,
+static inline void set_hugetlb_cgroup_rsvd(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return 0;
}
static inline bool hugetlb_cgroup_disabled(void)
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index aa1d4da03538..77c2885c4c13 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -50,6 +50,7 @@ struct hwrng {
struct list_head list;
struct kref ref;
struct completion cleanup_done;
+ struct completion dying;
};
struct device;
@@ -61,4 +62,6 @@ extern int devm_hwrng_register(struct device *dev, struct hwrng *rng);
extern void hwrng_unregister(struct hwrng *rng);
extern void devm_hwrng_unregister(struct device *dve, struct hwrng *rng);
+extern long hwrng_msleep(struct hwrng *rng, unsigned int msecs);
+
#endif /* LINUX_HWRANDOM_H_ */
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 3b42264333ef..85f7c5a63aa6 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -969,7 +969,7 @@ struct vmbus_channel {
* mechanism improves throughput by:
*
* A) Making the host more efficient - each time it wakes up,
- * potentially it will process morev number of packets. The
+ * potentially it will process more number of packets. The
* monitor latency allows a batch to build up.
* B) By deferring the hypercall to signal, we will also minimize
* the interrupts.
@@ -1341,6 +1341,8 @@ struct hv_ring_buffer_debug_info {
int hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info,
struct hv_ring_buffer_debug_info *debug_info);
+bool hv_ringbuffer_spinlock_busy(struct vmbus_channel *channel);
+
/* Vmbus interface */
#define vmbus_driver_register(driver) \
__vmbus_driver_register(driver, THIS_MODULE, KBUILD_MODNAME)
diff --git a/include/linux/init.h b/include/linux/init.h
index a0a90cd73ebe..077d7f93b402 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -134,7 +134,7 @@ static inline initcall_t initcall_from_entry(initcall_entry_t *entry)
extern initcall_entry_t __con_initcall_start[], __con_initcall_end[];
-/* Used for contructor calls. */
+/* Used for constructor calls. */
typedef void (*ctor_fn_t)(void);
struct file_system_type;
diff --git a/include/linux/input/auo-pixcir-ts.h b/include/linux/input/auo-pixcir-ts.h
deleted file mode 100644
index ed0776997a7a..000000000000
--- a/include/linux/input/auo-pixcir-ts.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Driver for AUO in-cell touchscreens
- *
- * Copyright (c) 2011 Heiko Stuebner <heiko@sntech.de>
- *
- * based on auo_touch.h from Dell Streak kernel
- *
- * Copyright (c) 2008 QUALCOMM Incorporated.
- * Copyright (c) 2008 QUALCOMM USA, INC.
- */
-
-#ifndef __AUO_PIXCIR_TS_H__
-#define __AUO_PIXCIR_TS_H__
-
-/*
- * Interrupt modes:
- * periodical: interrupt is asserted periodicaly
- * compare coordinates: interrupt is asserted when coordinates change
- * indicate touch: interrupt is asserted during touch
- */
-#define AUO_PIXCIR_INT_PERIODICAL 0x00
-#define AUO_PIXCIR_INT_COMP_COORD 0x01
-#define AUO_PIXCIR_INT_TOUCH_IND 0x02
-
-/*
- * @gpio_int interrupt gpio
- * @int_setting one of AUO_PIXCIR_INT_*
- * @init_hw hardwarespecific init
- * @exit_hw hardwarespecific shutdown
- * @x_max x-resolution
- * @y_max y-resolution
- */
-struct auo_pixcir_ts_platdata {
- int gpio_int;
- int gpio_rst;
-
- int int_setting;
-
- unsigned int x_max;
- unsigned int y_max;
-};
-
-#endif
diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h
index 42faebbaa202..501fa8486749 100644
--- a/include/linux/instrumented.h
+++ b/include/linux/instrumented.h
@@ -2,7 +2,7 @@
/*
* This header provides generic wrappers for memory access instrumentation that
- * the compiler cannot emit for: KASAN, KCSAN.
+ * the compiler cannot emit for: KASAN, KCSAN, KMSAN.
*/
#ifndef _LINUX_INSTRUMENTED_H
#define _LINUX_INSTRUMENTED_H
@@ -10,6 +10,7 @@
#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>
+#include <linux/kmsan-checks.h>
#include <linux/types.h>
/**
@@ -117,10 +118,11 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n)
{
kasan_check_read(from, n);
kcsan_check_read(from, n);
+ kmsan_copy_to_user(to, from, n, 0);
}
/**
- * instrument_copy_from_user - instrument writes of copy_from_user
+ * instrument_copy_from_user_before - add instrumentation before copy_from_user
*
* Instrument writes to kernel memory, that are due to copy_from_user (and
* variants). The instrumentation should be inserted before the accesses.
@@ -130,10 +132,61 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n)
* @n number of bytes to copy
*/
static __always_inline void
-instrument_copy_from_user(const void *to, const void __user *from, unsigned long n)
+instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n)
{
kasan_check_write(to, n);
kcsan_check_write(to, n);
}
+/**
+ * instrument_copy_from_user_after - add instrumentation after copy_from_user
+ *
+ * Instrument writes to kernel memory, that are due to copy_from_user (and
+ * variants). The instrumentation should be inserted after the accesses.
+ *
+ * @to destination address
+ * @from source address
+ * @n number of bytes to copy
+ * @left number of bytes not copied (as returned by copy_from_user)
+ */
+static __always_inline void
+instrument_copy_from_user_after(const void *to, const void __user *from,
+ unsigned long n, unsigned long left)
+{
+ kmsan_unpoison_memory(to, n - left);
+}
+
+/**
+ * instrument_get_user() - add instrumentation to get_user()-like macros
+ *
+ * get_user() and friends are fragile, so it may depend on the implementation
+ * whether the instrumentation happens before or after the data is copied from
+ * the userspace.
+ *
+ * @to destination variable, may not be address-taken
+ */
+#define instrument_get_user(to) \
+({ \
+ u64 __tmp = (u64)(to); \
+ kmsan_unpoison_memory(&__tmp, sizeof(__tmp)); \
+ to = __tmp; \
+})
+
+
+/**
+ * instrument_put_user() - add instrumentation to put_user()-like macros
+ *
+ * put_user() and friends are fragile, so it may depend on the implementation
+ * whether the instrumentation happens before or after the data is copied from
+ * the userspace.
+ *
+ * @from source address
+ * @ptr userspace pointer to copy to
+ * @size number of bytes to copy
+ */
+#define instrument_put_user(from, ptr, size) \
+({ \
+ kmsan_copy_to_user(ptr, &from, sizeof(from), 0); \
+})
+
#endif /* _LINUX_INSTRUMENTED_H */
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index ca98aeadcc80..1f068dfdb140 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -16,7 +16,9 @@ enum io_pgtable_fmt {
ARM_V7S,
ARM_MALI_LPAE,
AMD_IOMMU_V1,
+ AMD_IOMMU_V2,
APPLE_DART,
+ APPLE_DART2,
IO_PGTABLE_NUM_FMTS,
};
@@ -260,6 +262,7 @@ extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns;
extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns;
extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns;
extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns;
#endif /* __IO_PGTABLE_H */
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 43bc8a2edccf..0ded9e271523 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -16,6 +16,9 @@ enum io_uring_cmd_flags {
IO_URING_F_SQE128 = 4,
IO_URING_F_CQE32 = 8,
IO_URING_F_IOPOLL = 16,
+
+ /* the request is executed from poll, it should not be freed */
+ IO_URING_F_MULTISHOT = 32,
};
struct io_uring_cmd {
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index aa4d90a53866..f5b687a787a3 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -34,9 +34,6 @@ struct io_file_table {
unsigned int alloc_hint;
};
-struct io_notif;
-struct io_notif_slot;
-
struct io_hash_bucket {
spinlock_t lock;
struct hlist_head list;
@@ -242,8 +239,6 @@ struct io_ring_ctx {
unsigned nr_user_files;
unsigned nr_user_bufs;
struct io_mapped_ubuf **user_bufs;
- struct io_notif_slot *notif_slots;
- unsigned nr_notif_slots;
struct io_submit_state submit_state;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ea30f00dc145..3c9da1f8979e 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -212,7 +212,7 @@ struct iommu_iotlb_gather {
* @of_xlate: add OF master IDs to iommu grouping
* @is_attach_deferred: Check if domain attach should be deferred from iommu
* driver init to device driver init (default no)
- * @dev_has/enable/disable_feat: per device entries to check/enable/disable
+ * @dev_enable/disable_feat: per device entries to enable/disable
* iommu specific features.
* @sva_bind: Bind process address space to device
* @sva_unbind: Unbind process address space from device
@@ -227,7 +227,7 @@ struct iommu_iotlb_gather {
* @owner: Driver module providing these ops
*/
struct iommu_ops {
- bool (*capable)(enum iommu_cap);
+ bool (*capable)(struct device *dev, enum iommu_cap);
/* Domain allocation and freeing by the iommu driver */
struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
@@ -416,11 +416,9 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
return dev->iommu->iommu_dev->ops;
}
-extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops);
extern int bus_iommu_probe(struct bus_type *bus);
extern bool iommu_present(struct bus_type *bus);
extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap);
-extern bool iommu_capable(struct bus_type *bus, enum iommu_cap cap);
extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus);
extern struct iommu_group *iommu_group_get_by_id(int id);
extern void iommu_domain_free(struct iommu_domain *domain);
@@ -457,7 +455,7 @@ extern void iommu_set_default_translated(bool cmd_line);
extern bool iommu_default_passthrough(void);
extern struct iommu_resv_region *
iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot,
- enum iommu_resv_type type);
+ enum iommu_resv_type type, gfp_t gfp);
extern int iommu_get_group_resv_regions(struct iommu_group *group,
struct list_head *head);
@@ -697,11 +695,6 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
return false;
}
-static inline bool iommu_capable(struct bus_type *bus, enum iommu_cap cap)
-{
- return false;
-}
-
static inline struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
{
return NULL;
@@ -1070,4 +1063,40 @@ void iommu_debugfs_setup(void);
static inline void iommu_debugfs_setup(void) {}
#endif
+#ifdef CONFIG_IOMMU_DMA
+#include <linux/msi.h>
+
+/* Setup call for arch DMA mapping code */
+void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit);
+
+int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
+
+int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr);
+void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg);
+
+#else /* CONFIG_IOMMU_DMA */
+
+struct msi_desc;
+struct msi_msg;
+
+static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
+{
+}
+
+static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
+{
+ return -ENODEV;
+}
+
+static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
+{
+ return 0;
+}
+
+static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+}
+
+#endif /* CONFIG_IOMMU_DMA */
+
#endif /* __LINUX_IOMMU_H */
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 8a76dca9deee..4ae3c541ea6f 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -79,7 +79,8 @@ struct resource {
#define IORESOURCE_IRQ_HIGHLEVEL (1<<2)
#define IORESOURCE_IRQ_LOWLEVEL (1<<3)
#define IORESOURCE_IRQ_SHAREABLE (1<<4)
-#define IORESOURCE_IRQ_OPTIONAL (1<<5)
+#define IORESOURCE_IRQ_OPTIONAL (1<<5)
+#define IORESOURCE_IRQ_WAKECAPABLE (1<<6)
/* PnP DMA specific bits (IORESOURCE_BITS) */
#define IORESOURCE_DMA_TYPE_MASK (3<<0)
@@ -317,6 +318,8 @@ extern void __devm_release_region(struct device *dev, struct resource *parent,
resource_size_t start, resource_size_t n);
extern int iomem_map_sanity_check(resource_size_t addr, unsigned long size);
extern bool iomem_is_exclusive(u64 addr);
+extern bool resource_is_exclusive(struct resource *resource, u64 addr,
+ resource_size_t size);
extern int
walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
diff --git a/include/linux/iova.h b/include/linux/iova.h
index c6ba6d95d79c..83c00fac2acb 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -75,7 +75,7 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova)
return iova >> iova_shift(iovad);
}
-#if IS_ENABLED(CONFIG_IOMMU_IOVA)
+#if IS_REACHABLE(CONFIG_IOMMU_IOVA)
int iova_cache_get(void);
void iova_cache_put(void);
diff --git a/include/linux/iova_bitmap.h b/include/linux/iova_bitmap.h
new file mode 100644
index 000000000000..c006cf0a25f3
--- /dev/null
+++ b/include/linux/iova_bitmap.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates.
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+#ifndef _IOVA_BITMAP_H_
+#define _IOVA_BITMAP_H_
+
+#include <linux/types.h>
+
+struct iova_bitmap;
+
+typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap,
+ unsigned long iova, size_t length,
+ void *opaque);
+
+struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
+ unsigned long page_size,
+ u64 __user *data);
+void iova_bitmap_free(struct iova_bitmap *bitmap);
+int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
+ iova_bitmap_fn_t fn);
+void iova_bitmap_set(struct iova_bitmap *bitmap,
+ unsigned long iova, size_t length);
+
+#endif
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index e3e8c8662b49..e8240cf2611a 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -11,6 +11,7 @@
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
#include <linux/sysctl.h>
+#include <linux/percpu_counter.h>
struct user_namespace;
@@ -36,8 +37,8 @@ struct ipc_namespace {
unsigned int msg_ctlmax;
unsigned int msg_ctlmnb;
unsigned int msg_ctlmni;
- atomic_t msg_bytes;
- atomic_t msg_hdrs;
+ struct percpu_counter percpu_msg_bytes;
+ struct percpu_counter percpu_msg_hdrs;
size_t shm_ctlmax;
size_t shm_ctlall;
diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h
index 3a091d0710ae..d5e6024cb2a8 100644
--- a/include/linux/irqchip.h
+++ b/include/linux/irqchip.h
@@ -44,7 +44,8 @@ static const struct of_device_id drv_name##_irqchip_match_table[] = {
#define IRQCHIP_MATCH(compat, fn) { .compatible = compat, \
.data = typecheck_irq_init_cb(fn), },
-#define IRQCHIP_PLATFORM_DRIVER_END(drv_name) \
+
+#define IRQCHIP_PLATFORM_DRIVER_END(drv_name, ...) \
{}, \
}; \
MODULE_DEVICE_TABLE(of, drv_name##_irqchip_match_table); \
@@ -56,6 +57,7 @@ static struct platform_driver drv_name##_driver = { \
.owner = THIS_MODULE, \
.of_match_table = drv_name##_irqchip_match_table, \
.suppress_bind_attrs = true, \
+ __VA_ARGS__ \
}, \
}; \
builtin_platform_driver(drv_name##_driver)
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 1cd4e36890fb..844a8e30e6de 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int irq);
* conversion failed.
*/
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq);
+int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq);
int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq);
#endif
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 00d577f90883..a372086750ca 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -31,6 +31,7 @@
#define _LINUX_IRQDOMAIN_H
#include <linux/types.h>
+#include <linux/irqdomain_defs.h>
#include <linux/irqhandler.h>
#include <linux/of.h>
#include <linux/mutex.h>
@@ -45,6 +46,7 @@ struct irq_desc;
struct cpumask;
struct seq_file;
struct irq_affinity_desc;
+struct msi_parent_ops;
#define IRQ_DOMAIN_IRQ_SPEC_PARAMS 16
@@ -68,27 +70,6 @@ struct irq_fwspec {
void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
unsigned int count, struct irq_fwspec *fwspec);
-/*
- * Should several domains have the same device node, but serve
- * different purposes (for example one domain is for PCI/MSI, and the
- * other for wired IRQs), they can be distinguished using a
- * bus-specific token. Most domains are expected to only carry
- * DOMAIN_BUS_ANY.
- */
-enum irq_domain_bus_token {
- DOMAIN_BUS_ANY = 0,
- DOMAIN_BUS_WIRED,
- DOMAIN_BUS_GENERIC_MSI,
- DOMAIN_BUS_PCI_MSI,
- DOMAIN_BUS_PLATFORM_MSI,
- DOMAIN_BUS_NEXUS,
- DOMAIN_BUS_IPI,
- DOMAIN_BUS_FSL_MC_MSI,
- DOMAIN_BUS_TI_SCI_INTA_MSI,
- DOMAIN_BUS_WAKEUP,
- DOMAIN_BUS_VMD_MSI,
-};
-
/**
* struct irq_domain_ops - Methods for irq_domain objects
* @match: Match an interrupt controller device node to a host, returns
@@ -137,53 +118,61 @@ struct irq_domain_chip_generic;
/**
* struct irq_domain - Hardware interrupt number translation object
- * @link: Element in global irq_domain list.
- * @name: Name of interrupt domain
- * @ops: pointer to irq_domain methods
- * @host_data: private data pointer for use by owner. Not touched by irq_domain
- * core code.
- * @flags: host per irq_domain flags
- * @mapcount: The number of mapped interrupts
+ * @link: Element in global irq_domain list.
+ * @name: Name of interrupt domain
+ * @ops: Pointer to irq_domain methods
+ * @host_data: Private data pointer for use by owner. Not touched by irq_domain
+ * core code.
+ * @flags: Per irq_domain flags
+ * @mapcount: The number of mapped interrupts
*
- * Optional elements
- * @fwnode: Pointer to firmware node associated with the irq_domain. Pretty easy
- * to swap it for the of_node via the irq_domain_get_of_node accessor
- * @gc: Pointer to a list of generic chips. There is a helper function for
- * setting up one or more generic chips for interrupt controllers
- * drivers using the generic chip library which uses this pointer.
- * @dev: Pointer to a device that the domain represent, and that will be
- * used for power management purposes.
- * @parent: Pointer to parent irq_domain to support hierarchy irq_domains
+ * Optional elements:
+ * @fwnode: Pointer to firmware node associated with the irq_domain. Pretty easy
+ * to swap it for the of_node via the irq_domain_get_of_node accessor
+ * @gc: Pointer to a list of generic chips. There is a helper function for
+ * setting up one or more generic chips for interrupt controllers
+ * drivers using the generic chip library which uses this pointer.
+ * @dev: Pointer to the device which instantiated the irqdomain
+ * With per device irq domains this is not necessarily the same
+ * as @pm_dev.
+ * @pm_dev: Pointer to a device that can be utilized for power management
+ * purposes related to the irq domain.
+ * @parent: Pointer to parent irq_domain to support hierarchy irq_domains
+ * @msi_parent_ops: Pointer to MSI parent domain methods for per device domain init
*
- * Revmap data, used internally by irq_domain
- * @revmap_size: Size of the linear map table @revmap[]
- * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map
- * @revmap_mutex: Lock for the revmap
- * @revmap: Linear table of irq_data pointers
+ * Revmap data, used internally by the irq domain code:
+ * @revmap_size: Size of the linear map table @revmap[]
+ * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map
+ * @revmap_mutex: Lock for the revmap
+ * @revmap: Linear table of irq_data pointers
*/
struct irq_domain {
- struct list_head link;
- const char *name;
- const struct irq_domain_ops *ops;
- void *host_data;
- unsigned int flags;
- unsigned int mapcount;
+ struct list_head link;
+ const char *name;
+ const struct irq_domain_ops *ops;
+ void *host_data;
+ unsigned int flags;
+ unsigned int mapcount;
/* Optional data */
- struct fwnode_handle *fwnode;
- enum irq_domain_bus_token bus_token;
- struct irq_domain_chip_generic *gc;
- struct device *dev;
+ struct fwnode_handle *fwnode;
+ enum irq_domain_bus_token bus_token;
+ struct irq_domain_chip_generic *gc;
+ struct device *dev;
+ struct device *pm_dev;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- struct irq_domain *parent;
+ struct irq_domain *parent;
+#endif
+#ifdef CONFIG_GENERIC_MSI_IRQ
+ const struct msi_parent_ops *msi_parent_ops;
#endif
/* reverse map data. The linear map gets appended to the irq_domain */
- irq_hw_number_t hwirq_max;
- unsigned int revmap_size;
- struct radix_tree_root revmap_tree;
- struct mutex revmap_mutex;
- struct irq_data __rcu *revmap[];
+ irq_hw_number_t hwirq_max;
+ unsigned int revmap_size;
+ struct radix_tree_root revmap_tree;
+ struct mutex revmap_mutex;
+ struct irq_data __rcu *revmap[];
};
/* Irq domain flags */
@@ -206,15 +195,14 @@ enum {
/* Irq domain implements MSI remapping */
IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5),
- /*
- * Quirk to handle MSI implementations which do not provide
- * masking. Currently known to affect x86, but partially
- * handled in core code.
- */
- IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6),
-
/* Irq domain doesn't translate anything */
- IRQ_DOMAIN_FLAG_NO_MAP = (1 << 7),
+ IRQ_DOMAIN_FLAG_NO_MAP = (1 << 6),
+
+ /* Irq domain is a MSI parent domain */
+ IRQ_DOMAIN_FLAG_MSI_PARENT = (1 << 8),
+
+ /* Irq domain is a MSI device domain */
+ IRQ_DOMAIN_FLAG_MSI_DEVICE = (1 << 9),
/*
* Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
@@ -233,7 +221,7 @@ static inline void irq_domain_set_pm_device(struct irq_domain *d,
struct device *dev)
{
if (d)
- d->dev = dev;
+ d->pm_dev = dev;
}
#ifdef CONFIG_IRQ_DOMAIN
@@ -578,6 +566,16 @@ static inline bool irq_domain_is_msi_remap(struct irq_domain *domain)
extern bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain);
+static inline bool irq_domain_is_msi_parent(struct irq_domain *domain)
+{
+ return domain->flags & IRQ_DOMAIN_FLAG_MSI_PARENT;
+}
+
+static inline bool irq_domain_is_msi_device(struct irq_domain *domain)
+{
+ return domain->flags & IRQ_DOMAIN_FLAG_MSI_DEVICE;
+}
+
#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
static inline int irq_domain_alloc_irqs(struct irq_domain *domain,
unsigned int nr_irqs, int node, void *arg)
@@ -623,6 +621,17 @@ irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain)
{
return false;
}
+
+static inline bool irq_domain_is_msi_parent(struct irq_domain *domain)
+{
+ return false;
+}
+
+static inline bool irq_domain_is_msi_device(struct irq_domain *domain)
+{
+ return false;
+}
+
#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
#else /* CONFIG_IRQ_DOMAIN */
diff --git a/include/linux/irqdomain_defs.h b/include/linux/irqdomain_defs.h
new file mode 100644
index 000000000000..c29921fd8cd1
--- /dev/null
+++ b/include/linux/irqdomain_defs.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IRQDOMAIN_DEFS_H
+#define _LINUX_IRQDOMAIN_DEFS_H
+
+/*
+ * Should several domains have the same device node, but serve
+ * different purposes (for example one domain is for PCI/MSI, and the
+ * other for wired IRQs), they can be distinguished using a
+ * bus-specific token. Most domains are expected to only carry
+ * DOMAIN_BUS_ANY.
+ */
+enum irq_domain_bus_token {
+ DOMAIN_BUS_ANY = 0,
+ DOMAIN_BUS_WIRED,
+ DOMAIN_BUS_GENERIC_MSI,
+ DOMAIN_BUS_PCI_MSI,
+ DOMAIN_BUS_PLATFORM_MSI,
+ DOMAIN_BUS_NEXUS,
+ DOMAIN_BUS_IPI,
+ DOMAIN_BUS_FSL_MC_MSI,
+ DOMAIN_BUS_TI_SCI_INTA_MSI,
+ DOMAIN_BUS_WAKEUP,
+ DOMAIN_BUS_VMD_MSI,
+ DOMAIN_BUS_PCI_DEVICE_MSI,
+ DOMAIN_BUS_PCI_DEVICE_MSIX,
+ DOMAIN_BUS_DMAR,
+ DOMAIN_BUS_AMDVI,
+ DOMAIN_BUS_PCI_DEVICE_IMS,
+};
+
+#endif /* _LINUX_IRQDOMAIN_DEFS_H */
diff --git a/include/linux/irqreturn.h b/include/linux/irqreturn.h
index bd4c066ad39b..d426c7ad92bf 100644
--- a/include/linux/irqreturn.h
+++ b/include/linux/irqreturn.h
@@ -3,10 +3,10 @@
#define _LINUX_IRQRETURN_H
/**
- * enum irqreturn
- * @IRQ_NONE interrupt was not from this device or was not handled
- * @IRQ_HANDLED interrupt was handled by this device
- * @IRQ_WAKE_THREAD handler requests to wake the handler thread
+ * enum irqreturn - irqreturn type values
+ * @IRQ_NONE: interrupt was not from this device or was not handled
+ * @IRQ_HANDLED: interrupt was handled by this device
+ * @IRQ_WAKE_THREAD: handler requests to wake the handler thread
*/
enum irqreturn {
IRQ_NONE = (0 << 0),
diff --git a/include/linux/iversion.h b/include/linux/iversion.h
index 3bfebde5a1a6..e27bd4f55d84 100644
--- a/include/linux/iversion.h
+++ b/include/linux/iversion.h
@@ -123,17 +123,12 @@ inode_peek_iversion_raw(const struct inode *inode)
static inline void
inode_set_max_iversion_raw(struct inode *inode, u64 val)
{
- u64 cur, old;
+ u64 cur = inode_peek_iversion_raw(inode);
- cur = inode_peek_iversion_raw(inode);
- for (;;) {
+ do {
if (cur > val)
break;
- old = atomic64_cmpxchg(&inode->i_version, cur, val);
- if (likely(old == cur))
- break;
- cur = old;
- }
+ } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, val));
}
/**
@@ -177,56 +172,7 @@ inode_set_iversion_queried(struct inode *inode, u64 val)
I_VERSION_QUERIED);
}
-/**
- * inode_maybe_inc_iversion - increments i_version
- * @inode: inode with the i_version that should be updated
- * @force: increment the counter even if it's not necessary?
- *
- * Every time the inode is modified, the i_version field must be seen to have
- * changed by any observer.
- *
- * If "force" is set or the QUERIED flag is set, then ensure that we increment
- * the value, and clear the queried flag.
- *
- * In the common case where neither is set, then we can return "false" without
- * updating i_version.
- *
- * If this function returns false, and no other metadata has changed, then we
- * can avoid logging the metadata.
- */
-static inline bool
-inode_maybe_inc_iversion(struct inode *inode, bool force)
-{
- u64 cur, old, new;
-
- /*
- * The i_version field is not strictly ordered with any other inode
- * information, but the legacy inode_inc_iversion code used a spinlock
- * to serialize increments.
- *
- * Here, we add full memory barriers to ensure that any de-facto
- * ordering with other info is preserved.
- *
- * This barrier pairs with the barrier in inode_query_iversion()
- */
- smp_mb();
- cur = inode_peek_iversion_raw(inode);
- for (;;) {
- /* If flag is clear then we needn't do anything */
- if (!force && !(cur & I_VERSION_QUERIED))
- return false;
-
- /* Since lowest bit is flag, add 2 to avoid it */
- new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
-
- old = atomic64_cmpxchg(&inode->i_version, cur, new);
- if (likely(old == cur))
- break;
- cur = old;
- }
- return true;
-}
-
+bool inode_maybe_inc_iversion(struct inode *inode, bool force);
/**
* inode_inc_iversion - forcibly increment i_version
@@ -304,10 +250,10 @@ inode_peek_iversion(const struct inode *inode)
static inline u64
inode_query_iversion(struct inode *inode)
{
- u64 cur, old, new;
+ u64 cur, new;
cur = inode_peek_iversion_raw(inode);
- for (;;) {
+ do {
/* If flag is already set, then no need to swap */
if (cur & I_VERSION_QUERIED) {
/*
@@ -320,11 +266,7 @@ inode_query_iversion(struct inode *inode)
}
new = cur | I_VERSION_QUERIED;
- old = atomic64_cmpxchg(&inode->i_version, cur, new);
- if (likely(old == cur))
- break;
- cur = old;
- }
+ } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
return cur >> I_VERSION_QUERIED_SHIFT;
}
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index b092277bf48d..96c9d56e5510 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -98,19 +98,13 @@ static inline bool kasan_has_integrated_init(void)
#ifdef CONFIG_KASAN
struct kasan_cache {
+#ifdef CONFIG_KASAN_GENERIC
int alloc_meta_offset;
int free_meta_offset;
+#endif
bool is_kmalloc;
};
-slab_flags_t __kasan_never_merge(void);
-static __always_inline slab_flags_t kasan_never_merge(void)
-{
- if (kasan_enabled())
- return __kasan_never_merge();
- return 0;
-}
-
void __kasan_unpoison_range(const void *addr, size_t size);
static __always_inline void kasan_unpoison_range(const void *addr, size_t size)
{
@@ -134,15 +128,6 @@ static __always_inline void kasan_unpoison_pages(struct page *page,
__kasan_unpoison_pages(page, order, init);
}
-void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
- slab_flags_t *flags);
-static __always_inline void kasan_cache_create(struct kmem_cache *cache,
- unsigned int *size, slab_flags_t *flags)
-{
- if (kasan_enabled())
- __kasan_cache_create(cache, size, flags);
-}
-
void __kasan_cache_create_kmalloc(struct kmem_cache *cache);
static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache)
{
@@ -150,14 +135,6 @@ static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache)
__kasan_cache_create_kmalloc(cache);
}
-size_t __kasan_metadata_size(struct kmem_cache *cache);
-static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache)
-{
- if (kasan_enabled())
- return __kasan_metadata_size(cache);
- return 0;
-}
-
void __kasan_poison_slab(struct slab *slab);
static __always_inline void kasan_poison_slab(struct slab *slab)
{
@@ -269,20 +246,12 @@ static __always_inline bool kasan_check_byte(const void *addr)
#else /* CONFIG_KASAN */
-static inline slab_flags_t kasan_never_merge(void)
-{
- return 0;
-}
static inline void kasan_unpoison_range(const void *address, size_t size) {}
static inline void kasan_poison_pages(struct page *page, unsigned int order,
bool init) {}
static inline void kasan_unpoison_pages(struct page *page, unsigned int order,
bool init) {}
-static inline void kasan_cache_create(struct kmem_cache *cache,
- unsigned int *size,
- slab_flags_t *flags) {}
static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {}
-static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
static inline void kasan_poison_slab(struct slab *slab) {}
static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
void *object) {}
@@ -333,6 +302,11 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
#ifdef CONFIG_KASAN_GENERIC
+size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object);
+slab_flags_t kasan_never_merge(void);
+void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
+ slab_flags_t *flags);
+
void kasan_cache_shrink(struct kmem_cache *cache);
void kasan_cache_shutdown(struct kmem_cache *cache);
void kasan_record_aux_stack(void *ptr);
@@ -340,6 +314,22 @@ void kasan_record_aux_stack_noalloc(void *ptr);
#else /* CONFIG_KASAN_GENERIC */
+/* Tag-based KASAN modes do not use per-object metadata. */
+static inline size_t kasan_metadata_size(struct kmem_cache *cache,
+ bool in_object)
+{
+ return 0;
+}
+/* And thus nothing prevents cache merging. */
+static inline slab_flags_t kasan_never_merge(void)
+{
+ return 0;
+}
+/* And no cache-related metadata initialization is required. */
+static inline void kasan_cache_create(struct kmem_cache *cache,
+ unsigned int *size,
+ slab_flags_t *flags) {}
+
static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
static inline void kasan_record_aux_stack(void *ptr) {}
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 13e6c4b58f07..41a686996aaa 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -427,7 +427,7 @@ extern int kexec_load_disabled;
extern bool kexec_in_progress;
int crash_shrink_memory(unsigned long new_size);
-size_t crash_get_memory_size(void);
+ssize_t crash_get_memory_size(void);
#ifndef arch_kexec_protect_crashkres
/*
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 384f034ae947..70162d707caf 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -16,11 +16,13 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma,
unsigned long vm_flags);
extern void khugepaged_min_free_kbytes_update(void);
#ifdef CONFIG_SHMEM
-extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
+extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
+ bool install_pmd);
#else
-static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
+static inline int collapse_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr, bool install_pmd)
{
+ return 0;
}
#endif
@@ -46,9 +48,10 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
unsigned long vm_flags)
{
}
-static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
+static inline int collapse_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr, bool install_pmd)
{
+ return 0;
}
static inline void khugepaged_min_free_kbytes_update(void)
diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h
new file mode 100644
index 000000000000..c4cae333deec
--- /dev/null
+++ b/include/linux/kmsan-checks.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KMSAN checks to be used for one-off annotations in subsystems.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#ifndef _LINUX_KMSAN_CHECKS_H
+#define _LINUX_KMSAN_CHECKS_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_KMSAN
+
+/**
+ * kmsan_poison_memory() - Mark the memory range as uninitialized.
+ * @address: address to start with.
+ * @size: size of buffer to poison.
+ * @flags: GFP flags for allocations done by this function.
+ *
+ * Until other data is written to this range, KMSAN will treat it as
+ * uninitialized. Error reports for this memory will reference the call site of
+ * kmsan_poison_memory() as origin.
+ */
+void kmsan_poison_memory(const void *address, size_t size, gfp_t flags);
+
+/**
+ * kmsan_unpoison_memory() - Mark the memory range as initialized.
+ * @address: address to start with.
+ * @size: size of buffer to unpoison.
+ *
+ * Until other data is written to this range, KMSAN will treat it as
+ * initialized.
+ */
+void kmsan_unpoison_memory(const void *address, size_t size);
+
+/**
+ * kmsan_check_memory() - Check the memory range for being initialized.
+ * @address: address to start with.
+ * @size: size of buffer to check.
+ *
+ * If any piece of the given range is marked as uninitialized, KMSAN will report
+ * an error.
+ */
+void kmsan_check_memory(const void *address, size_t size);
+
+/**
+ * kmsan_copy_to_user() - Notify KMSAN about a data transfer to userspace.
+ * @to: destination address in the userspace.
+ * @from: source address in the kernel.
+ * @to_copy: number of bytes to copy.
+ * @left: number of bytes not copied.
+ *
+ * If this is a real userspace data transfer, KMSAN checks the bytes that were
+ * actually copied to ensure there was no information leak. If @to belongs to
+ * the kernel space (which is possible for compat syscalls), KMSAN just copies
+ * the metadata.
+ */
+void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
+ size_t left);
+
+#else
+
+static inline void kmsan_poison_memory(const void *address, size_t size,
+ gfp_t flags)
+{
+}
+static inline void kmsan_unpoison_memory(const void *address, size_t size)
+{
+}
+static inline void kmsan_check_memory(const void *address, size_t size)
+{
+}
+static inline void kmsan_copy_to_user(void __user *to, const void *from,
+ size_t to_copy, size_t left)
+{
+}
+
+#endif
+
+#endif /* _LINUX_KMSAN_CHECKS_H */
diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h
new file mode 100644
index 000000000000..e38ae3c34618
--- /dev/null
+++ b/include/linux/kmsan.h
@@ -0,0 +1,330 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KMSAN API for subsystems.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+#ifndef _LINUX_KMSAN_H
+#define _LINUX_KMSAN_H
+
+#include <linux/dma-direction.h>
+#include <linux/gfp.h>
+#include <linux/kmsan-checks.h>
+#include <linux/types.h>
+
+struct page;
+struct kmem_cache;
+struct task_struct;
+struct scatterlist;
+struct urb;
+
+#ifdef CONFIG_KMSAN
+
+/**
+ * kmsan_task_create() - Initialize KMSAN state for the task.
+ * @task: task to initialize.
+ */
+void kmsan_task_create(struct task_struct *task);
+
+/**
+ * kmsan_task_exit() - Notify KMSAN that a task has exited.
+ * @task: task about to finish.
+ */
+void kmsan_task_exit(struct task_struct *task);
+
+/**
+ * kmsan_init_shadow() - Initialize KMSAN shadow at boot time.
+ *
+ * Allocate and initialize KMSAN metadata for early allocations.
+ */
+void __init kmsan_init_shadow(void);
+
+/**
+ * kmsan_init_runtime() - Initialize KMSAN state and enable KMSAN.
+ */
+void __init kmsan_init_runtime(void);
+
+/**
+ * kmsan_memblock_free_pages() - handle freeing of memblock pages.
+ * @page: struct page to free.
+ * @order: order of @page.
+ *
+ * Freed pages are either returned to buddy allocator or held back to be used
+ * as metadata pages.
+ */
+bool __init kmsan_memblock_free_pages(struct page *page, unsigned int order);
+
+/**
+ * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call.
+ * @page: struct page pointer returned by alloc_pages().
+ * @order: order of allocated struct page.
+ * @flags: GFP flags used by alloc_pages()
+ *
+ * KMSAN marks 1<<@order pages starting at @page as uninitialized, unless
+ * @flags contain __GFP_ZERO.
+ */
+void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags);
+
+/**
+ * kmsan_free_page() - Notify KMSAN about a free_pages() call.
+ * @page: struct page pointer passed to free_pages().
+ * @order: order of deallocated struct page.
+ *
+ * KMSAN marks freed memory as uninitialized.
+ */
+void kmsan_free_page(struct page *page, unsigned int order);
+
+/**
+ * kmsan_copy_page_meta() - Copy KMSAN metadata between two pages.
+ * @dst: destination page.
+ * @src: source page.
+ *
+ * KMSAN copies the contents of metadata pages for @src into the metadata pages
+ * for @dst. If @dst has no associated metadata pages, nothing happens.
+ * If @src has no associated metadata pages, @dst metadata pages are unpoisoned.
+ */
+void kmsan_copy_page_meta(struct page *dst, struct page *src);
+
+/**
+ * kmsan_slab_alloc() - Notify KMSAN about a slab allocation.
+ * @s: slab cache the object belongs to.
+ * @object: object pointer.
+ * @flags: GFP flags passed to the allocator.
+ *
+ * Depending on cache flags and GFP flags, KMSAN sets up the metadata of the
+ * newly created object, marking it as initialized or uninitialized.
+ */
+void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags);
+
+/**
+ * kmsan_slab_free() - Notify KMSAN about a slab deallocation.
+ * @s: slab cache the object belongs to.
+ * @object: object pointer.
+ *
+ * KMSAN marks the freed object as uninitialized.
+ */
+void kmsan_slab_free(struct kmem_cache *s, void *object);
+
+/**
+ * kmsan_kmalloc_large() - Notify KMSAN about a large slab allocation.
+ * @ptr: object pointer.
+ * @size: object size.
+ * @flags: GFP flags passed to the allocator.
+ *
+ * Similar to kmsan_slab_alloc(), but for large allocations.
+ */
+void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags);
+
+/**
+ * kmsan_kfree_large() - Notify KMSAN about a large slab deallocation.
+ * @ptr: object pointer.
+ *
+ * Similar to kmsan_slab_free(), but for large allocations.
+ */
+void kmsan_kfree_large(const void *ptr);
+
+/**
+ * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap.
+ * @start: start of vmapped range.
+ * @end: end of vmapped range.
+ * @prot: page protection flags used for vmap.
+ * @pages: array of pages.
+ * @page_shift: page_shift passed to vmap_range_noflush().
+ *
+ * KMSAN maps shadow and origin pages of @pages into contiguous ranges in
+ * vmalloc metadata address range.
+ */
+void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
+ pgprot_t prot, struct page **pages,
+ unsigned int page_shift);
+
+/**
+ * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap.
+ * @start: start of vunmapped range.
+ * @end: end of vunmapped range.
+ *
+ * KMSAN unmaps the contiguous metadata ranges created by
+ * kmsan_map_kernel_range_noflush().
+ */
+void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end);
+
+/**
+ * kmsan_ioremap_page_range() - Notify KMSAN about a ioremap_page_range() call.
+ * @addr: range start.
+ * @end: range end.
+ * @phys_addr: physical range start.
+ * @prot: page protection flags used for ioremap_page_range().
+ * @page_shift: page_shift argument passed to vmap_range_noflush().
+ *
+ * KMSAN creates new metadata pages for the physical pages mapped into the
+ * virtual memory.
+ */
+void kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int page_shift);
+
+/**
+ * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call.
+ * @start: range start.
+ * @end: range end.
+ *
+ * KMSAN unmaps the metadata pages for the given range and, unlike for
+ * vunmap_page_range(), also deallocates them.
+ */
+void kmsan_iounmap_page_range(unsigned long start, unsigned long end);
+
+/**
+ * kmsan_handle_dma() - Handle a DMA data transfer.
+ * @page: first page of the buffer.
+ * @offset: offset of the buffer within the first page.
+ * @size: buffer size.
+ * @dir: one of possible dma_data_direction values.
+ *
+ * Depending on @direction, KMSAN:
+ * * checks the buffer, if it is copied to device;
+ * * initializes the buffer, if it is copied from device;
+ * * does both, if this is a DMA_BIDIRECTIONAL transfer.
+ */
+void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
+ enum dma_data_direction dir);
+
+/**
+ * kmsan_handle_dma_sg() - Handle a DMA transfer using scatterlist.
+ * @sg: scatterlist holding DMA buffers.
+ * @nents: number of scatterlist entries.
+ * @dir: one of possible dma_data_direction values.
+ *
+ * Depending on @direction, KMSAN:
+ * * checks the buffers in the scatterlist, if they are copied to device;
+ * * initializes the buffers, if they are copied from device;
+ * * does both, if this is a DMA_BIDIRECTIONAL transfer.
+ */
+void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
+ enum dma_data_direction dir);
+
+/**
+ * kmsan_handle_urb() - Handle a USB data transfer.
+ * @urb: struct urb pointer.
+ * @is_out: data transfer direction (true means output to hardware).
+ *
+ * If @is_out is true, KMSAN checks the transfer buffer of @urb. Otherwise,
+ * KMSAN initializes the transfer buffer.
+ */
+void kmsan_handle_urb(const struct urb *urb, bool is_out);
+
+/**
+ * kmsan_unpoison_entry_regs() - Handle pt_regs in low-level entry code.
+ * @regs: struct pt_regs pointer received from assembly code.
+ *
+ * KMSAN unpoisons the contents of the passed pt_regs, preventing potential
+ * false positive reports. Unlike kmsan_unpoison_memory(),
+ * kmsan_unpoison_entry_regs() can be called from the regions where
+ * kmsan_in_runtime() returns true, which is the case in early entry code.
+ */
+void kmsan_unpoison_entry_regs(const struct pt_regs *regs);
+
+#else
+
+static inline void kmsan_init_shadow(void)
+{
+}
+
+static inline void kmsan_init_runtime(void)
+{
+}
+
+static inline bool kmsan_memblock_free_pages(struct page *page,
+ unsigned int order)
+{
+ return true;
+}
+
+static inline void kmsan_task_create(struct task_struct *task)
+{
+}
+
+static inline void kmsan_task_exit(struct task_struct *task)
+{
+}
+
+static inline int kmsan_alloc_page(struct page *page, unsigned int order,
+ gfp_t flags)
+{
+ return 0;
+}
+
+static inline void kmsan_free_page(struct page *page, unsigned int order)
+{
+}
+
+static inline void kmsan_copy_page_meta(struct page *dst, struct page *src)
+{
+}
+
+static inline void kmsan_slab_alloc(struct kmem_cache *s, void *object,
+ gfp_t flags)
+{
+}
+
+static inline void kmsan_slab_free(struct kmem_cache *s, void *object)
+{
+}
+
+static inline void kmsan_kmalloc_large(const void *ptr, size_t size,
+ gfp_t flags)
+{
+}
+
+static inline void kmsan_kfree_large(const void *ptr)
+{
+}
+
+static inline void kmsan_vmap_pages_range_noflush(unsigned long start,
+ unsigned long end,
+ pgprot_t prot,
+ struct page **pages,
+ unsigned int page_shift)
+{
+}
+
+static inline void kmsan_vunmap_range_noflush(unsigned long start,
+ unsigned long end)
+{
+}
+
+static inline void kmsan_ioremap_page_range(unsigned long start,
+ unsigned long end,
+ phys_addr_t phys_addr,
+ pgprot_t prot,
+ unsigned int page_shift)
+{
+}
+
+static inline void kmsan_iounmap_page_range(unsigned long start,
+ unsigned long end)
+{
+}
+
+static inline void kmsan_handle_dma(struct page *page, size_t offset,
+ size_t size, enum dma_data_direction dir)
+{
+}
+
+static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
+ enum dma_data_direction dir)
+{
+}
+
+static inline void kmsan_handle_urb(const struct urb *urb, bool is_out)
+{
+}
+
+static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs)
+{
+}
+
+#endif
+
+#endif /* _LINUX_KMSAN_H */
diff --git a/include/linux/kmsan_string.h b/include/linux/kmsan_string.h
new file mode 100644
index 000000000000..7287da6f52ef
--- /dev/null
+++ b/include/linux/kmsan_string.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KMSAN string functions API used in other headers.
+ *
+ * Copyright (C) 2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+#ifndef _LINUX_KMSAN_STRING_H
+#define _LINUX_KMSAN_STRING_H
+
+/*
+ * KMSAN overrides the default memcpy/memset/memmove implementations in the
+ * kernel, which requires having __msan_XXX function prototypes in several other
+ * headers. Keep them in one place instead of open-coding.
+ */
+void *__msan_memcpy(void *dst, const void *src, size_t size);
+void *__msan_memset(void *s, int c, size_t n);
+void *__msan_memmove(void *dest, const void *src, size_t len);
+
+#endif /* _LINUX_KMSAN_STRING_H */
diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h
new file mode 100644
index 000000000000..8bfa6c98176d
--- /dev/null
+++ b/include/linux/kmsan_types.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A minimal header declaring types added by KMSAN to existing kernel structs.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+#ifndef _LINUX_KMSAN_TYPES_H
+#define _LINUX_KMSAN_TYPES_H
+
+/* These constants are defined in the MSan LLVM instrumentation pass. */
+#define KMSAN_RETVAL_SIZE 800
+#define KMSAN_PARAM_SIZE 800
+
+struct kmsan_context_state {
+ char param_tls[KMSAN_PARAM_SIZE];
+ char retval_tls[KMSAN_RETVAL_SIZE];
+ char va_arg_tls[KMSAN_PARAM_SIZE];
+ char va_arg_origin_tls[KMSAN_PARAM_SIZE];
+ u64 va_arg_overflow_size_tls;
+ char param_origin_tls[KMSAN_PARAM_SIZE];
+ u32 retval_origin_tls;
+};
+
+#undef KMSAN_PARAM_SIZE
+#undef KMSAN_RETVAL_SIZE
+
+struct kmsan_ctx {
+ struct kmsan_context_state cstate;
+ int kmsan_in_runtime;
+ bool allow_reporting;
+};
+
+#endif /* _LINUX_KMSAN_TYPES_H */
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 0b4f17418f64..7e232ba59b86 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -15,9 +15,6 @@
#include <linux/sched.h>
#include <linux/sched/coredump.h>
-struct stable_node;
-struct mem_cgroup;
-
#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 32f259fa5801..915142abdf76 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -416,7 +416,7 @@ static __always_inline void guest_context_enter_irqoff(void)
*/
if (!context_tracking_guest_enter()) {
instrumentation_begin();
- rcu_virt_note_context_switch(smp_processor_id());
+ rcu_virt_note_context_switch();
instrumentation_end();
}
}
@@ -776,6 +776,7 @@ struct kvm {
struct srcu_struct srcu;
struct srcu_struct irq_srcu;
pid_t userspace_pid;
+ bool override_halt_poll_ns;
unsigned int max_halt_poll_ns;
u32 dirty_ring_size;
bool vm_bugged;
@@ -1240,8 +1241,18 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
/**
- * kvm_gfn_to_pfn_cache_init - prepare a cached kernel mapping and HPA for a
- * given guest physical address.
+ * kvm_gpc_init - initialize gfn_to_pfn_cache.
+ *
+ * @gpc: struct gfn_to_pfn_cache object.
+ *
+ * This sets up a gfn_to_pfn_cache by initializing locks. Note, the cache must
+ * be zero-allocated (or zeroed by the caller before init).
+ */
+void kvm_gpc_init(struct gfn_to_pfn_cache *gpc);
+
+/**
+ * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
+ * physical address.
*
* @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
@@ -1265,9 +1276,9 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
* kvm_gfn_to_pfn_cache_check() to ensure that the cache is valid before
* accessing the target page.
*/
-int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
- gpa_t gpa, unsigned long len);
+int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+ gpa_t gpa, unsigned long len);
/**
* kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache.
@@ -1324,7 +1335,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
/**
- * kvm_gfn_to_pfn_cache_destroy - destroy and unlink a gfn_to_pfn_cache.
+ * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
*
* @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
@@ -1332,7 +1343,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
* This removes a cache from the @kvm's list to be processed on MMU notifier
* invocation.
*/
-void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
void kvm_sigset_activate(struct kvm_vcpu *vcpu);
void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
@@ -1390,6 +1401,8 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap);
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
+long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg);
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index c74acfa1a3fe..af38252ad704 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -35,6 +35,11 @@ enum {
NDD_WORK_PENDING = 4,
/* dimm supports namespace labels */
NDD_LABELING = 6,
+ /*
+ * dimm contents have changed requiring invalidation of CPU caches prior
+ * to activation of a region that includes this device
+ */
+ NDD_INCOHERENT = 7,
/* need to set a limit somewhere, but yes, this is likely overkill */
ND_IOCTL_MAX_BUFLEN = SZ_4M,
@@ -183,6 +188,8 @@ struct nvdimm_security_ops {
int (*overwrite)(struct nvdimm *nvdimm,
const struct nvdimm_key_data *key_data);
int (*query_overwrite)(struct nvdimm *nvdimm);
+ int (*disable_master)(struct nvdimm *nvdimm,
+ const struct nvdimm_key_data *key_data);
};
enum nvdimm_fwa_state {
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
new file mode 100644
index 000000000000..e594db58a0f1
--- /dev/null
+++ b/include/linux/maple_tree.h
@@ -0,0 +1,692 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef _LINUX_MAPLE_TREE_H
+#define _LINUX_MAPLE_TREE_H
+/*
+ * Maple Tree - An RCU-safe adaptive tree for storing ranges
+ * Copyright (c) 2018-2022 Oracle
+ * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com>
+ * Matthew Wilcox <willy@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+/* #define CONFIG_MAPLE_RCU_DISABLED */
+/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */
+
+/*
+ * Allocated nodes are mutable until they have been inserted into the tree,
+ * at which time they cannot change their type until they have been removed
+ * from the tree and an RCU grace period has passed.
+ *
+ * Removed nodes have their ->parent set to point to themselves. RCU readers
+ * check ->parent before relying on the value that they loaded from the
+ * slots array. This lets us reuse the slots array for the RCU head.
+ *
+ * Nodes in the tree point to their parent unless bit 0 is set.
+ */
+#if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64)
+/* 64bit sizes */
+#define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */
+#define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */
+#define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */
+#define MAPLE_ARANGE64_META_MAX 15 /* Out of range for metadata */
+#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1)
+#else
+/* 32bit sizes */
+#define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */
+#define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */
+#define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */
+#define MAPLE_ARANGE64_META_MAX 31 /* Out of range for metadata */
+#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2)
+#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */
+
+#define MAPLE_NODE_MASK 255UL
+
+/*
+ * The node->parent of the root node has bit 0 set and the rest of the pointer
+ * is a pointer to the tree itself. No more bits are available in this pointer
+ * (on m68k, the data structure may only be 2-byte aligned).
+ *
+ * Internal non-root nodes can only have maple_range_* nodes as parents. The
+ * parent pointer is 256B aligned like all other tree nodes. When storing a 32
+ * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an
+ * extra bit to store the offset. This extra bit comes from a reuse of the last
+ * bit in the node type. This is possible by using bit 1 to indicate if bit 2
+ * is part of the type or the slot.
+ *
+ * Once the type is decided, the decision of an allocation range type or a range
+ * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE
+ * flag.
+ *
+ * Node types:
+ * 0x??1 = Root
+ * 0x?00 = 16 bit nodes
+ * 0x010 = 32 bit nodes
+ * 0x110 = 64 bit nodes
+ *
+ * Slot size and location in the parent pointer:
+ * type : slot location
+ * 0x??1 : Root
+ * 0x?00 : 16 bit values, type in 0-1, slot in 2-6
+ * 0x010 : 32 bit values, type in 0-2, slot in 3-6
+ * 0x110 : 64 bit values, type in 0-2, slot in 3-6
+ */
+
+/*
+ * This metadata is used to optimize the gap updating code and in reverse
+ * searching for gaps or any other code that needs to find the end of the data.
+ */
+struct maple_metadata {
+ unsigned char end;
+ unsigned char gap;
+};
+
+/*
+ * Leaf nodes do not store pointers to nodes, they store user data. Users may
+ * store almost any bit pattern. As noted above, the optimisation of storing an
+ * entry at 0 in the root pointer cannot be done for data which have the bottom
+ * two bits set to '10'. We also reserve values with the bottom two bits set to
+ * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs
+ * return errnos as a negative errno shifted right by two bits and the bottom
+ * two bits set to '10', and while choosing to store these values in the array
+ * is not an error, it may lead to confusion if you're testing for an error with
+ * mas_is_err().
+ *
+ * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits
+ * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now.
+ *
+ * In regular B-Tree terms, pivots are called keys. The term pivot is used to
+ * indicate that the tree is specifying ranges, Pivots may appear in the
+ * subtree with an entry attached to the value whereas keys are unique to a
+ * specific position of a B-tree. Pivot values are inclusive of the slot with
+ * the same index.
+ */
+
+struct maple_range_64 {
+ struct maple_pnode *parent;
+ unsigned long pivot[MAPLE_RANGE64_SLOTS - 1];
+ union {
+ void __rcu *slot[MAPLE_RANGE64_SLOTS];
+ struct {
+ void __rcu *pad[MAPLE_RANGE64_SLOTS - 1];
+ struct maple_metadata meta;
+ };
+ };
+};
+
+/*
+ * At tree creation time, the user can specify that they're willing to trade off
+ * storing fewer entries in a tree in return for storing more information in
+ * each node.
+ *
+ * The maple tree supports recording the largest range of NULL entries available
+ * in this node, also called gaps. This optimises the tree for allocating a
+ * range.
+ */
+struct maple_arange_64 {
+ struct maple_pnode *parent;
+ unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1];
+ void __rcu *slot[MAPLE_ARANGE64_SLOTS];
+ unsigned long gap[MAPLE_ARANGE64_SLOTS];
+ struct maple_metadata meta;
+};
+
+struct maple_alloc {
+ unsigned long total;
+ unsigned char node_count;
+ unsigned int request_count;
+ struct maple_alloc *slot[MAPLE_ALLOC_SLOTS];
+};
+
+struct maple_topiary {
+ struct maple_pnode *parent;
+ struct maple_enode *next; /* Overlaps the pivot */
+};
+
+enum maple_type {
+ maple_dense,
+ maple_leaf_64,
+ maple_range_64,
+ maple_arange_64,
+};
+
+
+/**
+ * DOC: Maple tree flags
+ *
+ * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree
+ * * MT_FLAGS_USE_RCU - Operate in RCU mode
+ * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags
+ * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value
+ * * MT_FLAGS_LOCK_MASK - How the mt_lock is used
+ * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe
+ * * MT_FLAGS_LOCK_BH - Acquired bh-safe
+ * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used
+ *
+ * MAPLE_HEIGHT_MAX The largest height that can be stored
+ */
+#define MT_FLAGS_ALLOC_RANGE 0x01
+#define MT_FLAGS_USE_RCU 0x02
+#define MT_FLAGS_HEIGHT_OFFSET 0x02
+#define MT_FLAGS_HEIGHT_MASK 0x7C
+#define MT_FLAGS_LOCK_MASK 0x300
+#define MT_FLAGS_LOCK_IRQ 0x100
+#define MT_FLAGS_LOCK_BH 0x200
+#define MT_FLAGS_LOCK_EXTERN 0x300
+
+#define MAPLE_HEIGHT_MAX 31
+
+
+#define MAPLE_NODE_TYPE_MASK 0x0F
+#define MAPLE_NODE_TYPE_SHIFT 0x03
+
+#define MAPLE_RESERVED_RANGE 4096
+
+#ifdef CONFIG_LOCKDEP
+typedef struct lockdep_map *lockdep_map_p;
+#define mt_lock_is_held(mt) lock_is_held(mt->ma_external_lock)
+#define mt_set_external_lock(mt, lock) \
+ (mt)->ma_external_lock = &(lock)->dep_map
+#else
+typedef struct { /* nothing */ } lockdep_map_p;
+#define mt_lock_is_held(mt) 1
+#define mt_set_external_lock(mt, lock) do { } while (0)
+#endif
+
+/*
+ * If the tree contains a single entry at index 0, it is usually stored in
+ * tree->ma_root. To optimise for the page cache, an entry which ends in '00',
+ * '01' or '11' is stored in the root, but an entry which ends in '10' will be
+ * stored in a node. Bits 3-6 are used to store enum maple_type.
+ *
+ * The flags are used both to store some immutable information about this tree
+ * (set at tree creation time) and dynamic information set under the spinlock.
+ *
+ * Another use of flags are to indicate global states of the tree. This is the
+ * case with the MAPLE_USE_RCU flag, which indicates the tree is currently in
+ * RCU mode. This mode was added to allow the tree to reuse nodes instead of
+ * re-allocating and RCU freeing nodes when there is a single user.
+ */
+struct maple_tree {
+ union {
+ spinlock_t ma_lock;
+ lockdep_map_p ma_external_lock;
+ };
+ void __rcu *ma_root;
+ unsigned int ma_flags;
+};
+
+/**
+ * MTREE_INIT() - Initialize a maple tree
+ * @name: The maple tree name
+ * @__flags: The maple tree flags
+ *
+ */
+#define MTREE_INIT(name, __flags) { \
+ .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \
+ .ma_flags = __flags, \
+ .ma_root = NULL, \
+}
+
+/**
+ * MTREE_INIT_EXT() - Initialize a maple tree with an external lock.
+ * @name: The tree name
+ * @__flags: The maple tree flags
+ * @__lock: The external lock
+ */
+#ifdef CONFIG_LOCKDEP
+#define MTREE_INIT_EXT(name, __flags, __lock) { \
+ .ma_external_lock = &(__lock).dep_map, \
+ .ma_flags = (__flags), \
+ .ma_root = NULL, \
+}
+#else
+#define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags)
+#endif
+
+#define DEFINE_MTREE(name) \
+ struct maple_tree name = MTREE_INIT(name, 0)
+
+#define mtree_lock(mt) spin_lock((&(mt)->ma_lock))
+#define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock))
+
+/*
+ * The Maple Tree squeezes various bits in at various points which aren't
+ * necessarily obvious. Usually, this is done by observing that pointers are
+ * N-byte aligned and thus the bottom log_2(N) bits are available for use. We
+ * don't use the high bits of pointers to store additional information because
+ * we don't know what bits are unused on any given architecture.
+ *
+ * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8
+ * low bits for our own purposes. Nodes are currently of 4 types:
+ * 1. Single pointer (Range is 0-0)
+ * 2. Non-leaf Allocation Range nodes
+ * 3. Non-leaf Range nodes
+ * 4. Leaf Range nodes All nodes consist of a number of node slots,
+ * pivots, and a parent pointer.
+ */
+
+struct maple_node {
+ union {
+ struct {
+ struct maple_pnode *parent;
+ void __rcu *slot[MAPLE_NODE_SLOTS];
+ };
+ struct {
+ void *pad;
+ struct rcu_head rcu;
+ struct maple_enode *piv_parent;
+ unsigned char parent_slot;
+ enum maple_type type;
+ unsigned char slot_len;
+ unsigned int ma_flags;
+ };
+ struct maple_range_64 mr64;
+ struct maple_arange_64 ma64;
+ struct maple_alloc alloc;
+ };
+};
+
+/*
+ * More complicated stores can cause two nodes to become one or three and
+ * potentially alter the height of the tree. Either half of the tree may need
+ * to be rebalanced against the other. The ma_topiary struct is used to track
+ * which nodes have been 'cut' from the tree so that the change can be done
+ * safely at a later date. This is done to support RCU.
+ */
+struct ma_topiary {
+ struct maple_enode *head;
+ struct maple_enode *tail;
+ struct maple_tree *mtree;
+};
+
+void *mtree_load(struct maple_tree *mt, unsigned long index);
+
+int mtree_insert(struct maple_tree *mt, unsigned long index,
+ void *entry, gfp_t gfp);
+int mtree_insert_range(struct maple_tree *mt, unsigned long first,
+ unsigned long last, void *entry, gfp_t gfp);
+int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
+ void *entry, unsigned long size, unsigned long min,
+ unsigned long max, gfp_t gfp);
+int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
+ void *entry, unsigned long size, unsigned long min,
+ unsigned long max, gfp_t gfp);
+
+int mtree_store_range(struct maple_tree *mt, unsigned long first,
+ unsigned long last, void *entry, gfp_t gfp);
+int mtree_store(struct maple_tree *mt, unsigned long index,
+ void *entry, gfp_t gfp);
+void *mtree_erase(struct maple_tree *mt, unsigned long index);
+
+void mtree_destroy(struct maple_tree *mt);
+void __mt_destroy(struct maple_tree *mt);
+
+/**
+ * mtree_empty() - Determine if a tree has any present entries.
+ * @mt: Maple Tree.
+ *
+ * Context: Any context.
+ * Return: %true if the tree contains only NULL pointers.
+ */
+static inline bool mtree_empty(const struct maple_tree *mt)
+{
+ return mt->ma_root == NULL;
+}
+
+/* Advanced API */
+
+/*
+ * The maple state is defined in the struct ma_state and is used to keep track
+ * of information during operations, and even between operations when using the
+ * advanced API.
+ *
+ * If state->node has bit 0 set then it references a tree location which is not
+ * a node (eg the root). If bit 1 is set, the rest of the bits are a negative
+ * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the
+ * node type.
+ *
+ * state->alloc either has a request number of nodes or an allocated node. If
+ * stat->alloc has a requested number of nodes, the first bit will be set (0x1)
+ * and the remaining bits are the value. If state->alloc is a node, then the
+ * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for
+ * storing more allocated nodes, a total number of nodes allocated, and the
+ * node_count in this node. node_count is the number of allocated nodes in this
+ * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further
+ * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc
+ * by removing a node from the state->alloc node until state->alloc->node_count
+ * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted
+ * to state->alloc. Nodes are pushed onto state->alloc by putting the current
+ * state->alloc into the pushed node's slot[0].
+ *
+ * The state also contains the implied min/max of the state->node, the depth of
+ * this search, and the offset. The implied min/max are either from the parent
+ * node or are 0-oo for the root node. The depth is incremented or decremented
+ * every time a node is walked down or up. The offset is the slot/pivot of
+ * interest in the node - either for reading or writing.
+ *
+ * When returning a value the maple state index and last respectively contain
+ * the start and end of the range for the entry. Ranges are inclusive in the
+ * Maple Tree.
+ */
+struct ma_state {
+ struct maple_tree *tree; /* The tree we're operating in */
+ unsigned long index; /* The index we're operating on - range start */
+ unsigned long last; /* The last index we're operating on - range end */
+ struct maple_enode *node; /* The node containing this entry */
+ unsigned long min; /* The minimum index of this node - implied pivot min */
+ unsigned long max; /* The maximum index of this node - implied pivot max */
+ struct maple_alloc *alloc; /* Allocated nodes for this operation */
+ unsigned char depth; /* depth of tree descent during write */
+ unsigned char offset;
+ unsigned char mas_flags;
+};
+
+struct ma_wr_state {
+ struct ma_state *mas;
+ struct maple_node *node; /* Decoded mas->node */
+ unsigned long r_min; /* range min */
+ unsigned long r_max; /* range max */
+ enum maple_type type; /* mas->node type */
+ unsigned char offset_end; /* The offset where the write ends */
+ unsigned char node_end; /* mas->node end */
+ unsigned long *pivots; /* mas->node->pivots pointer */
+ unsigned long end_piv; /* The pivot at the offset end */
+ void __rcu **slots; /* mas->node->slots pointer */
+ void *entry; /* The entry to write */
+ void *content; /* The existing entry that is being overwritten */
+};
+
+#define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock))
+#define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock))
+
+
+/*
+ * Special values for ma_state.node.
+ * MAS_START means we have not searched the tree.
+ * MAS_ROOT means we have searched the tree and the entry we found lives in
+ * the root of the tree (ie it has index 0, length 1 and is the only entry in
+ * the tree).
+ * MAS_NONE means we have searched the tree and there is no node in the
+ * tree for this entry. For example, we searched for index 1 in an empty
+ * tree. Or we have a tree which points to a full leaf node and we
+ * searched for an entry which is larger than can be contained in that
+ * leaf node.
+ * MA_ERROR represents an errno. After dropping the lock and attempting
+ * to resolve the error, the walk would have to be restarted from the
+ * top of the tree as the tree may have been modified.
+ */
+#define MAS_START ((struct maple_enode *)1UL)
+#define MAS_ROOT ((struct maple_enode *)5UL)
+#define MAS_NONE ((struct maple_enode *)9UL)
+#define MAS_PAUSE ((struct maple_enode *)17UL)
+#define MA_ERROR(err) \
+ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL))
+
+#define MA_STATE(name, mt, first, end) \
+ struct ma_state name = { \
+ .tree = mt, \
+ .index = first, \
+ .last = end, \
+ .node = MAS_START, \
+ .min = 0, \
+ .max = ULONG_MAX, \
+ .alloc = NULL, \
+ }
+
+#define MA_WR_STATE(name, ma_state, wr_entry) \
+ struct ma_wr_state name = { \
+ .mas = ma_state, \
+ .content = NULL, \
+ .entry = wr_entry, \
+ }
+
+#define MA_TOPIARY(name, tree) \
+ struct ma_topiary name = { \
+ .head = NULL, \
+ .tail = NULL, \
+ .mtree = tree, \
+ }
+
+void *mas_walk(struct ma_state *mas);
+void *mas_store(struct ma_state *mas, void *entry);
+void *mas_erase(struct ma_state *mas);
+int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp);
+void mas_store_prealloc(struct ma_state *mas, void *entry);
+void *mas_find(struct ma_state *mas, unsigned long max);
+void *mas_find_rev(struct ma_state *mas, unsigned long min);
+int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
+bool mas_is_err(struct ma_state *mas);
+
+bool mas_nomem(struct ma_state *mas, gfp_t gfp);
+void mas_pause(struct ma_state *mas);
+void maple_tree_init(void);
+void mas_destroy(struct ma_state *mas);
+int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries);
+
+void *mas_prev(struct ma_state *mas, unsigned long min);
+void *mas_next(struct ma_state *mas, unsigned long max);
+
+int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max,
+ unsigned long size);
+
+/* Checks if a mas has not found anything */
+static inline bool mas_is_none(struct ma_state *mas)
+{
+ return mas->node == MAS_NONE;
+}
+
+/* Checks if a mas has been paused */
+static inline bool mas_is_paused(struct ma_state *mas)
+{
+ return mas->node == MAS_PAUSE;
+}
+
+void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas);
+void mas_dup_store(struct ma_state *mas, void *entry);
+
+/*
+ * This finds an empty area from the highest address to the lowest.
+ * AKA "Topdown" version,
+ */
+int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
+ unsigned long max, unsigned long size);
+/**
+ * mas_reset() - Reset a Maple Tree operation state.
+ * @mas: Maple Tree operation state.
+ *
+ * Resets the error or walk state of the @mas so future walks of the
+ * array will start from the root. Use this if you have dropped the
+ * lock and want to reuse the ma_state.
+ *
+ * Context: Any context.
+ */
+static inline void mas_reset(struct ma_state *mas)
+{
+ mas->node = MAS_START;
+}
+
+/**
+ * mas_for_each() - Iterate over a range of the maple tree.
+ * @__mas: Maple Tree operation state (maple_state)
+ * @__entry: Entry retrieved from the tree
+ * @__max: maximum index to retrieve from the tree
+ *
+ * When returned, mas->index and mas->last will hold the entire range for the
+ * entry.
+ *
+ * Note: may return the zero entry.
+ *
+ */
+#define mas_for_each(__mas, __entry, __max) \
+ while (((__entry) = mas_find((__mas), (__max))) != NULL)
+
+
+/**
+ * mas_set_range() - Set up Maple Tree operation state for a different index.
+ * @mas: Maple Tree operation state.
+ * @start: New start of range in the Maple Tree.
+ * @last: New end of range in the Maple Tree.
+ *
+ * Move the operation state to refer to a different range. This will
+ * have the effect of starting a walk from the top; see mas_next()
+ * to move to an adjacent index.
+ */
+static inline
+void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last)
+{
+ mas->index = start;
+ mas->last = last;
+ mas->node = MAS_START;
+}
+
+/**
+ * mas_set() - Set up Maple Tree operation state for a different index.
+ * @mas: Maple Tree operation state.
+ * @index: New index into the Maple Tree.
+ *
+ * Move the operation state to refer to a different index. This will
+ * have the effect of starting a walk from the top; see mas_next()
+ * to move to an adjacent index.
+ */
+static inline void mas_set(struct ma_state *mas, unsigned long index)
+{
+
+ mas_set_range(mas, index, index);
+}
+
+static inline bool mt_external_lock(const struct maple_tree *mt)
+{
+ return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN;
+}
+
+/**
+ * mt_init_flags() - Initialise an empty maple tree with flags.
+ * @mt: Maple Tree
+ * @flags: maple tree flags.
+ *
+ * If you need to initialise a Maple Tree with special flags (eg, an
+ * allocation tree), use this function.
+ *
+ * Context: Any context.
+ */
+static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags)
+{
+ mt->ma_flags = flags;
+ if (!mt_external_lock(mt))
+ spin_lock_init(&mt->ma_lock);
+ rcu_assign_pointer(mt->ma_root, NULL);
+}
+
+/**
+ * mt_init() - Initialise an empty maple tree.
+ * @mt: Maple Tree
+ *
+ * An empty Maple Tree.
+ *
+ * Context: Any context.
+ */
+static inline void mt_init(struct maple_tree *mt)
+{
+ mt_init_flags(mt, 0);
+}
+
+static inline bool mt_in_rcu(struct maple_tree *mt)
+{
+#ifdef CONFIG_MAPLE_RCU_DISABLED
+ return false;
+#endif
+ return mt->ma_flags & MT_FLAGS_USE_RCU;
+}
+
+/**
+ * mt_clear_in_rcu() - Switch the tree to non-RCU mode.
+ * @mt: The Maple Tree
+ */
+static inline void mt_clear_in_rcu(struct maple_tree *mt)
+{
+ if (!mt_in_rcu(mt))
+ return;
+
+ if (mt_external_lock(mt)) {
+ BUG_ON(!mt_lock_is_held(mt));
+ mt->ma_flags &= ~MT_FLAGS_USE_RCU;
+ } else {
+ mtree_lock(mt);
+ mt->ma_flags &= ~MT_FLAGS_USE_RCU;
+ mtree_unlock(mt);
+ }
+}
+
+/**
+ * mt_set_in_rcu() - Switch the tree to RCU safe mode.
+ * @mt: The Maple Tree
+ */
+static inline void mt_set_in_rcu(struct maple_tree *mt)
+{
+ if (mt_in_rcu(mt))
+ return;
+
+ if (mt_external_lock(mt)) {
+ BUG_ON(!mt_lock_is_held(mt));
+ mt->ma_flags |= MT_FLAGS_USE_RCU;
+ } else {
+ mtree_lock(mt);
+ mt->ma_flags |= MT_FLAGS_USE_RCU;
+ mtree_unlock(mt);
+ }
+}
+
+static inline unsigned int mt_height(const struct maple_tree *mt)
+
+{
+ return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET;
+}
+
+void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max);
+void *mt_find_after(struct maple_tree *mt, unsigned long *index,
+ unsigned long max);
+void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min);
+void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);
+
+/**
+ * mt_for_each - Iterate over each entry starting at index until max.
+ * @__tree: The Maple Tree
+ * @__entry: The current entry
+ * @__index: The index to update to track the location in the tree
+ * @__max: The maximum limit for @index
+ *
+ * Note: Will not return the zero entry.
+ */
+#define mt_for_each(__tree, __entry, __index, __max) \
+ for (__entry = mt_find(__tree, &(__index), __max); \
+ __entry; __entry = mt_find_after(__tree, &(__index), __max))
+
+
+#ifdef CONFIG_DEBUG_MAPLE_TREE
+extern atomic_t maple_tree_tests_run;
+extern atomic_t maple_tree_tests_passed;
+
+void mt_dump(const struct maple_tree *mt);
+void mt_validate(struct maple_tree *mt);
+void mt_cache_shrink(void);
+#define MT_BUG_ON(__tree, __x) do { \
+ atomic_inc(&maple_tree_tests_run); \
+ if (__x) { \
+ pr_info("BUG at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mt_dump(__tree); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+} while (0)
+#else
+#define MT_BUG_ON(__tree, __x) BUG_ON(__x)
+#endif /* CONFIG_DEBUG_MAPLE_TREE */
+
+#endif /*_LINUX_MAPLE_TREE_H */
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 47ad3b104d9e..139d05b26f82 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -10,6 +10,9 @@
#ifndef MDEV_H
#define MDEV_H
+#include <linux/device.h>
+#include <linux/uuid.h>
+
struct mdev_type;
struct mdev_device {
@@ -20,67 +23,67 @@ struct mdev_device {
bool active;
};
-static inline struct mdev_device *to_mdev_device(struct device *dev)
-{
- return container_of(dev, struct mdev_device, dev);
-}
+struct mdev_type {
+ /* set by the driver before calling mdev_register parent: */
+ const char *sysfs_name;
+ const char *pretty_name;
-unsigned int mdev_get_type_group_id(struct mdev_device *mdev);
-unsigned int mtype_get_type_group_id(struct mdev_type *mtype);
-struct device *mtype_get_parent_dev(struct mdev_type *mtype);
+ /* set by the core, can be used drivers */
+ struct mdev_parent *parent;
-/* interface for exporting mdev supported type attributes */
-struct mdev_type_attribute {
- struct attribute attr;
- ssize_t (*show)(struct mdev_type *mtype,
- struct mdev_type_attribute *attr, char *buf);
- ssize_t (*store)(struct mdev_type *mtype,
- struct mdev_type_attribute *attr, const char *buf,
- size_t count);
+ /* internal only */
+ struct kobject kobj;
+ struct kobject *devices_kobj;
};
-#define MDEV_TYPE_ATTR(_name, _mode, _show, _store) \
-struct mdev_type_attribute mdev_type_attr_##_name = \
- __ATTR(_name, _mode, _show, _store)
-#define MDEV_TYPE_ATTR_RW(_name) \
- struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RW(_name)
-#define MDEV_TYPE_ATTR_RO(_name) \
- struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name)
-#define MDEV_TYPE_ATTR_WO(_name) \
- struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_WO(_name)
+/* embedded into the struct device that the mdev devices hang off */
+struct mdev_parent {
+ struct device *dev;
+ struct mdev_driver *mdev_driver;
+ struct kset *mdev_types_kset;
+ /* Synchronize device creation/removal with parent unregistration */
+ struct rw_semaphore unreg_sem;
+ struct mdev_type **types;
+ unsigned int nr_types;
+ atomic_t available_instances;
+};
+
+static inline struct mdev_device *to_mdev_device(struct device *dev)
+{
+ return container_of(dev, struct mdev_device, dev);
+}
/**
* struct mdev_driver - Mediated device driver
+ * @device_api: string to return for the device_api sysfs
+ * @max_instances: maximum number of instances supported (optional)
* @probe: called when new device created
* @remove: called when device removed
- * @supported_type_groups: Attributes to define supported types. It is mandatory
- * to provide supported types.
+ * @get_available: Return the max number of instances that can be created
+ * @show_description: Print a description of the mtype
* @driver: device driver structure
- *
**/
struct mdev_driver {
+ const char *device_api;
+ unsigned int max_instances;
int (*probe)(struct mdev_device *dev);
void (*remove)(struct mdev_device *dev);
- struct attribute_group **supported_type_groups;
+ unsigned int (*get_available)(struct mdev_type *mtype);
+ ssize_t (*show_description)(struct mdev_type *mtype, char *buf);
struct device_driver driver;
};
-extern struct bus_type mdev_bus_type;
-
-int mdev_register_device(struct device *dev, struct mdev_driver *mdev_driver);
-void mdev_unregister_device(struct device *dev);
+int mdev_register_parent(struct mdev_parent *parent, struct device *dev,
+ struct mdev_driver *mdev_driver, struct mdev_type **types,
+ unsigned int nr_types);
+void mdev_unregister_parent(struct mdev_parent *parent);
int mdev_register_driver(struct mdev_driver *drv);
void mdev_unregister_driver(struct mdev_driver *drv);
-struct device *mdev_parent_dev(struct mdev_device *mdev);
static inline struct device *mdev_dev(struct mdev_device *mdev)
{
return &mdev->dev;
}
-static inline struct mdev_device *mdev_from_dev(struct device *dev)
-{
- return dev->bus == &mdev_bus_type ? to_mdev_device(dev) : NULL;
-}
#endif /* MDEV_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 567f12323f55..e1644a24009c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -80,29 +80,8 @@ enum mem_cgroup_events_target {
MEM_CGROUP_NTARGETS,
};
-struct memcg_vmstats_percpu {
- /* Local (CPU and cgroup) page state & events */
- long state[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
-
- /* Delta calculation for lockless upward propagation */
- long state_prev[MEMCG_NR_STAT];
- unsigned long events_prev[NR_VM_EVENT_ITEMS];
-
- /* Cgroup1: threshold notifications & softlimit tree updates */
- unsigned long nr_page_events;
- unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct memcg_vmstats {
- /* Aggregated (CPU and subtree) page state & events */
- long state[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
-
- /* Pending child counts during tree propagation */
- long state_pending[MEMCG_NR_STAT];
- unsigned long events_pending[NR_VM_EVENT_ITEMS];
-};
+struct memcg_vmstats_percpu;
+struct memcg_vmstats;
struct mem_cgroup_reclaim_iter {
struct mem_cgroup *position;
@@ -185,15 +164,6 @@ struct mem_cgroup_thresholds {
struct mem_cgroup_threshold_ary *spare;
};
-#if defined(CONFIG_SMP)
-struct memcg_padding {
- char x[0];
-} ____cacheline_internodealigned_in_smp;
-#define MEMCG_PADDING(name) struct memcg_padding name
-#else
-#define MEMCG_PADDING(name)
-#endif
-
/*
* Remember four most recent foreign writebacks with dirty pages in this
* cgroup. Inode sharing is expected to be uncommon and, even if we miss
@@ -304,10 +274,10 @@ struct mem_cgroup {
spinlock_t move_lock;
unsigned long move_lock_flags;
- MEMCG_PADDING(_pad1_);
+ CACHELINE_PADDING(_pad1_);
/* memory.stat */
- struct memcg_vmstats vmstats;
+ struct memcg_vmstats *vmstats;
/* memory.events */
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
@@ -326,7 +296,7 @@ struct mem_cgroup {
struct list_head objcg_list;
#endif
- MEMCG_PADDING(_pad2_);
+ CACHELINE_PADDING(_pad2_);
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
@@ -350,14 +320,20 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
#endif
+#ifdef CONFIG_LRU_GEN
+ /* per-memcg mm_struct list */
+ struct lru_gen_mm_list mm_list;
+#endif
+
struct mem_cgroup_per_node *nodeinfo[];
};
/*
- * size of first charge trial. "32" comes from vmscan.c's magic value.
- * TODO: maybe necessary to use big numbers in big irons.
+ * size of first charge trial.
+ * TODO: maybe necessary to use big numbers in big irons or dynamic based of the
+ * workload.
*/
-#define MEMCG_CHARGE_BATCH 32U
+#define MEMCG_CHARGE_BATCH 64U
extern struct mem_cgroup *root_mem_cgroup;
@@ -444,6 +420,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem folio a caller should hold an rcu read lock to protect memcg
* associated with a kmem folio from being released.
@@ -505,6 +482,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem page a caller should hold an rcu read lock to protect memcg
* associated with a kmem page from being released.
@@ -689,7 +667,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
return __mem_cgroup_charge(folio, mm, gfp);
}
-int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry);
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -959,6 +937,23 @@ void unlock_page_memcg(struct page *page);
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
+/* try to stablize folio_memcg() for all the pages in a memcg */
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ rcu_read_lock();
+
+ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
+ return true;
+
+ rcu_read_unlock();
+ return false;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
@@ -985,15 +980,7 @@ static inline void mod_memcg_page_state(struct page *page,
rcu_read_unlock();
}
-static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
-{
- long x = READ_ONCE(memcg->vmstats.state[idx]);
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
-}
+unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
@@ -1238,7 +1225,7 @@ static inline int mem_cgroup_charge(struct folio *folio,
return 0;
}
-static inline int mem_cgroup_swapin_charge_page(struct page *page,
+static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
{
return 0;
@@ -1433,6 +1420,18 @@ static inline void folio_memcg_unlock(struct folio *folio)
{
}
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ /* to match folio_memcg_rcu() */
+ rcu_read_lock();
+ return true;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
static inline void mem_cgroup_handle_over_high(void)
{
}
@@ -1779,7 +1778,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
{
struct mem_cgroup *memcg;
- if (mem_cgroup_kmem_disabled())
+ if (!memcg_kmem_enabled())
return;
rcu_read_lock();
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
new file mode 100644
index 000000000000..965009aa01d7
--- /dev/null
+++ b/include/linux/memory-tiers.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMORY_TIERS_H
+#define _LINUX_MEMORY_TIERS_H
+
+#include <linux/types.h>
+#include <linux/nodemask.h>
+#include <linux/kref.h>
+#include <linux/mmzone.h>
+/*
+ * Each tier cover a abstrace distance chunk size of 128
+ */
+#define MEMTIER_CHUNK_BITS 7
+#define MEMTIER_CHUNK_SIZE (1 << MEMTIER_CHUNK_BITS)
+/*
+ * Smaller abstract distance values imply faster (higher) memory tiers. Offset
+ * the DRAM adistance so that we can accommodate devices with a slightly lower
+ * adistance value (slightly faster) than default DRAM adistance to be part of
+ * the same memory tier.
+ */
+#define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))
+#define MEMTIER_HOTPLUG_PRIO 100
+
+struct memory_tier;
+struct memory_dev_type {
+ /* list of memory types that are part of same tier as this type */
+ struct list_head tier_sibiling;
+ /* abstract distance for this specific memory type */
+ int adistance;
+ /* Nodes of same abstract distance */
+ nodemask_t nodes;
+ struct kref kref;
+};
+
+#ifdef CONFIG_NUMA
+extern bool numa_demotion_enabled;
+struct memory_dev_type *alloc_memory_type(int adistance);
+void destroy_memory_type(struct memory_dev_type *memtype);
+void init_node_memory_type(int node, struct memory_dev_type *default_type);
+void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+#ifdef CONFIG_MIGRATION
+int next_demotion_node(int node);
+void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
+bool node_is_toptier(int node);
+#else
+static inline int next_demotion_node(int node)
+{
+ return NUMA_NO_NODE;
+}
+
+static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+ *targets = NODE_MASK_NONE;
+}
+
+static inline bool node_is_toptier(int node)
+{
+ return true;
+}
+#endif
+
+#else
+
+#define numa_demotion_enabled false
+/*
+ * CONFIG_NUMA implementation returns non NULL error.
+ */
+static inline struct memory_dev_type *alloc_memory_type(int adistance)
+{
+ return NULL;
+}
+
+static inline void destroy_memory_type(struct memory_dev_type *memtype)
+{
+
+}
+
+static inline void init_node_memory_type(int node, struct memory_dev_type *default_type)
+{
+
+}
+
+static inline void clear_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+
+}
+
+static inline int next_demotion_node(int node)
+{
+ return NUMA_NO_NODE;
+}
+
+static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+ *targets = NODE_MASK_NONE;
+}
+
+static inline bool node_is_toptier(int node)
+{
+ return true;
+}
+#endif /* CONFIG_NUMA */
+#endif /* _LINUX_MEMORY_TIERS_H */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e0b2209ab71c..9fcbf5706595 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -11,7 +11,6 @@ struct page;
struct zone;
struct pglist_data;
struct mem_section;
-struct memory_block;
struct memory_group;
struct resource;
struct vmem_altmap;
@@ -44,11 +43,6 @@ extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
({ \
memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \
})
-/*
- * This definition is just for error path in node hotadd.
- * For node hotremove, we have to replace this.
- */
-#define generic_free_nodedata(pgdat) kfree(pgdat)
extern pg_data_t *node_data[];
static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
@@ -64,9 +58,6 @@ static inline pg_data_t *generic_alloc_nodedata(int nid)
BUG();
return NULL;
}
-static inline void generic_free_nodedata(pg_data_t *pgdat)
-{
-}
static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
{
}
@@ -216,6 +207,22 @@ void put_online_mems(void);
void mem_hotplug_begin(void);
void mem_hotplug_done(void);
+/* See kswapd_is_running() */
+static inline void pgdat_kswapd_lock(pg_data_t *pgdat)
+{
+ mutex_lock(&pgdat->kswapd_lock);
+}
+
+static inline void pgdat_kswapd_unlock(pg_data_t *pgdat)
+{
+ mutex_unlock(&pgdat->kswapd_lock);
+}
+
+static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat)
+{
+ mutex_init(&pgdat->kswapd_lock);
+}
+
#else /* ! CONFIG_MEMORY_HOTPLUG */
#define pfn_to_online_page(pfn) \
({ \
@@ -252,6 +259,10 @@ static inline bool movable_node_is_enabled(void)
{
return false;
}
+
+static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {}
+static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {}
+static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {}
#endif /* ! CONFIG_MEMORY_HOTPLUG */
/*
@@ -333,7 +344,6 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
extern void remove_pfn_range_from_zone(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages);
-extern bool is_memblock_offlined(struct memory_block *mem);
extern int sparse_add_section(int nid, unsigned long pfn,
unsigned long nr_pages, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 668389b4b53d..d232de7cdc56 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -151,13 +151,6 @@ extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask);
extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
-static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
-{
- struct mempolicy *mpol = get_task_policy(current);
-
- return policy_nodemask(gfp, mpol);
-}
-
extern unsigned int mempolicy_slab_node(void);
extern enum zone_type policy_zone;
@@ -189,6 +182,7 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
return (pol->mode == MPOL_PREFERRED_MANY);
}
+extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
#else
@@ -294,11 +288,6 @@ static inline void mpol_put_task_policy(struct task_struct *task)
{
}
-static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
-{
- return NULL;
-}
-
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
return false;
diff --git a/include/linux/memregion.h b/include/linux/memregion.h
index c04c4fd2e209..bf83363807ac 100644
--- a/include/linux/memregion.h
+++ b/include/linux/memregion.h
@@ -3,6 +3,7 @@
#define _MEMREGION_H_
#include <linux/types.h>
#include <linux/errno.h>
+#include <linux/bug.h>
struct memregion_info {
int target_node;
@@ -20,4 +21,41 @@ static inline void memregion_free(int id)
{
}
#endif
+
+/**
+ * cpu_cache_invalidate_memregion - drop any CPU cached data for
+ * memregions described by @res_desc
+ * @res_desc: one of the IORES_DESC_* types
+ *
+ * Perform cache maintenance after a memory event / operation that
+ * changes the contents of physical memory in a cache-incoherent manner.
+ * For example, device memory technologies like NVDIMM and CXL have
+ * device secure erase, and dynamic region provision that can replace
+ * the memory mapped to a given physical address.
+ *
+ * Limit the functionality to architectures that have an efficient way
+ * to writeback and invalidate potentially terabytes of address space at
+ * once. Note that this routine may or may not write back any dirty
+ * contents while performing the invalidation. It is only exported for
+ * the explicit usage of the NVDIMM and CXL modules in the 'DEVMEM'
+ * symbol namespace on bare platforms.
+ *
+ * Returns 0 on success or negative error code on a failure to perform
+ * the cache maintenance.
+ */
+#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
+int cpu_cache_invalidate_memregion(int res_desc);
+bool cpu_cache_has_invalidate_memregion(void);
+#else
+static inline bool cpu_cache_has_invalidate_memregion(void)
+{
+ return false;
+}
+
+static inline int cpu_cache_invalidate_memregion(int res_desc)
+{
+ WARN_ON_ONCE("CPU cache invalidation required");
+ return -ENXIO;
+}
+#endif
#endif /* _MEMREGION_H_ */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c3b4cc84877b..7fcaf3180a5b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -187,6 +187,7 @@ static inline bool folio_is_device_coherent(const struct folio *folio)
}
#ifdef CONFIG_ZONE_DEVICE
+void zone_device_page_init(struct page *page);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 22c0a0cf5e0c..3ef77f52a4f0 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES];
#ifdef CONFIG_MIGRATION
extern void putback_movable_pages(struct list_head *l);
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode, int extra_count);
int migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, enum migrate_mode mode);
extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
@@ -100,21 +102,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
#endif /* CONFIG_MIGRATION */
-#if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA)
-extern void set_migration_target_nodes(void);
-extern void migrate_on_reclaim_init(void);
-extern bool numa_demotion_enabled;
-extern int next_demotion_node(int node);
-#else
-static inline void set_migration_target_nodes(void) {}
-static inline void migrate_on_reclaim_init(void) {}
-static inline int next_demotion_node(int node)
-{
- return NUMA_NO_NODE;
-}
-#define numa_demotion_enabled false
-#endif
-
#ifdef CONFIG_COMPACTION
bool PageMovable(struct page *page);
void __SetPageMovable(struct page *page, const struct movable_operations *ops);
@@ -212,11 +199,24 @@ struct migrate_vma {
*/
void *pgmap_owner;
unsigned long flags;
+
+ /*
+ * Set to vmf->page if this is being called to migrate a page as part of
+ * a migrate_to_ram() callback.
+ */
+ struct page *fault_page;
};
int migrate_vma_setup(struct migrate_vma *args);
void migrate_vma_pages(struct migrate_vma *migrate);
void migrate_vma_finalize(struct migrate_vma *migrate);
+int migrate_device_range(unsigned long *src_pfns, unsigned long start,
+ unsigned long npages);
+void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
+ unsigned long npages);
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages);
+
#endif /* CONFIG_MIGRATION */
#endif /* _LINUX_MIGRATE_H */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a12929bc31b2..06cbad166225 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -970,7 +970,7 @@ void mlx5_cmd_allowed_opcode(struct mlx5_core_dev *dev, u16 opcode);
struct mlx5_async_ctx {
struct mlx5_core_dev *dev;
atomic_t num_inflight;
- struct wait_queue_head wait;
+ struct completion inflight_done;
};
struct mlx5_async_work;
@@ -981,6 +981,7 @@ struct mlx5_async_work {
struct mlx5_async_ctx *ctx;
mlx5_async_cbk_t user_callback;
u16 opcode; /* cmd opcode */
+ u16 op_mod; /* cmd op_mod */
void *out; /* pointer to the cmd output buffer */
};
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21f8b27bd9fd..974ccca609d2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -661,6 +661,38 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
return vma->vm_flags & VM_ACCESS_FLAGS;
}
+static inline
+struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
+{
+ return mas_find(&vmi->mas, max);
+}
+
+static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
+{
+ /*
+ * Uses vma_find() to get the first VMA when the iterator starts.
+ * Calling mas_next() could skip the first entry.
+ */
+ return vma_find(vmi, ULONG_MAX);
+}
+
+static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
+{
+ return mas_prev(&vmi->mas, 0);
+}
+
+static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
+{
+ return vmi->mas.index;
+}
+
+#define for_each_vma(__vmi, __vma) \
+ while (((__vma) = vma_next(&(__vmi))) != NULL)
+
+/* The MM code likes to work with exclusive end addresses */
+#define for_each_vma_range(__vmi, __vma, __end) \
+ while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL)
+
#ifdef CONFIG_SHMEM
/*
* The vma_is_shmem is not inline because it is used only by slow
@@ -697,7 +729,9 @@ static inline unsigned int compound_order(struct page *page)
*/
static inline unsigned int folio_order(struct folio *folio)
{
- return compound_order(&folio->page);
+ if (!folio_test_large(folio))
+ return 0;
+ return folio->_folio_order;
}
#include <linux/huge_mm.h>
@@ -1255,6 +1289,18 @@ static inline int folio_nid(const struct folio *folio)
}
#ifdef CONFIG_NUMA_BALANCING
+/* page access time bits needs to hold at least 4 seconds */
+#define PAGE_ACCESS_TIME_MIN_BITS 12
+#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS
+#define PAGE_ACCESS_TIME_BUCKETS \
+ (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT)
+#else
+#define PAGE_ACCESS_TIME_BUCKETS 0
+#endif
+
+#define PAGE_ACCESS_TIME_MASK \
+ (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS)
+
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
@@ -1318,12 +1364,25 @@ static inline void page_cpupid_reset_last(struct page *page)
page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
+
+static inline int xchg_page_access_time(struct page *page, int time)
+{
+ int last_time;
+
+ last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
+ return last_time << PAGE_ACCESS_TIME_BUCKETS;
+}
#else /* !CONFIG_NUMA_BALANCING */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
return page_to_nid(page); /* XXX */
}
+static inline int xchg_page_access_time(struct page *page, int time)
+{
+ return 0;
+}
+
static inline int page_cpupid_last(struct page *page)
{
return page_to_nid(page); /* XXX */
@@ -1465,6 +1524,11 @@ static inline unsigned long folio_pfn(struct folio *folio)
return page_to_pfn(&folio->page);
}
+static inline struct folio *pfn_folio(unsigned long pfn)
+{
+ return page_folio(pfn_to_page(pfn));
+}
+
static inline atomic_t *folio_pincount_ptr(struct folio *folio)
{
return &folio_page(folio, 1)->compound_pincount;
@@ -1597,7 +1661,13 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
*/
static inline long folio_nr_pages(struct folio *folio)
{
- return compound_nr(&folio->page);
+ if (!folio_test_large(folio))
+ return 1;
+#ifdef CONFIG_64BIT
+ return folio->_folio_nr_pages;
+#else
+ return 1L << folio->_folio_order;
+#endif
}
/**
@@ -1776,7 +1846,30 @@ extern void pagefault_out_of_memory(void);
*/
#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
-extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);
+extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
+static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask)
+{
+ __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
+}
+
+/*
+ * Parameter block passed down to zap_pte_range in exceptional cases.
+ */
+struct zap_details {
+ struct folio *single_folio; /* Locked folio to be unmapped */
+ bool even_cows; /* Zap COWed private pages too? */
+ zap_flags_t zap_flags; /* Extra flags for zapping */
+};
+
+/*
+ * Whether to drop the pte markers, for example, the uffd-wp information for
+ * file-backed memory. This should only be specified when we will completely
+ * drop the page in the mm, either by truncation or unmapping of the vma. By
+ * default, the flag is not set.
+ */
+#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
+/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
+#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
@@ -1795,8 +1888,11 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
-void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
- unsigned long start, unsigned long end);
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details);
+void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+ struct vm_area_struct *start_vma, unsigned long start,
+ unsigned long end);
struct mmu_notifier_range;
@@ -2495,7 +2591,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn);
-extern unsigned long find_min_pfn_with_active_regions(void);
#ifndef CONFIG_NUMA
static inline int early_pfn_to_nid(unsigned long pfn)
@@ -2516,7 +2611,12 @@ extern void calculate_min_free_kbytes(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
extern void __init mmap_init(void);
-extern void show_mem(unsigned int flags, nodemask_t *nodemask);
+
+extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
+static inline void show_mem(unsigned int flags, nodemask_t *nodemask)
+{
+ __show_mem(flags, nodemask, MAX_NR_ZONES - 1);
+}
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
@@ -2593,14 +2693,15 @@ extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
extern int split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
- struct rb_node **, struct rb_node *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
unsigned long addr, unsigned long len, pgoff_t pgoff,
bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas);
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas);
+
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
unsigned long start,
@@ -2648,8 +2749,9 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr,
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
unsigned long pgoff, unsigned long *populate, struct list_head *uf);
-extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
- struct list_head *uf, bool downgrade);
+extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
+ unsigned long start, size_t len, struct list_head *uf,
+ bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
@@ -2716,26 +2818,12 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
struct vm_area_struct **pprev);
-/**
- * find_vma_intersection() - Look up the first VMA which intersects the interval
- * @mm: The process address space.
- * @start_addr: The inclusive start user address.
- * @end_addr: The exclusive end user address.
- *
- * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
- * start_addr < end_addr.
+/*
+ * Look up the first VMA which intersects the interval [start_addr, end_addr)
+ * NULL if none. Assume start_addr < end_addr.
*/
-static inline
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
- unsigned long start_addr,
- unsigned long end_addr)
-{
- struct vm_area_struct *vma = find_vma(mm, start_addr);
-
- if (vma && end_addr <= vma->vm_start)
- vma = NULL;
- return vma;
-}
+ unsigned long start_addr, unsigned long end_addr);
/**
* vma_lookup() - Find a VMA at a specific address
@@ -2747,12 +2835,7 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
static inline
struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = find_vma(mm, addr);
-
- if (vma && addr < vma->vm_start)
- vma = NULL;
-
- return vma;
+ return mtree_load(&mm->mm_mt, addr);
}
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
@@ -2788,7 +2871,7 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
unsigned long vm_start, unsigned long vm_end)
{
- struct vm_area_struct *vma = find_vma(mm, vm_start);
+ struct vm_area_struct *vma = vma_lookup(mm, vm_start);
if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
vma = NULL;
@@ -2888,7 +2971,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
* and return without waiting upon it */
#define FOLL_NOFAULT 0x80 /* do not fault in pages */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
-#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
@@ -2975,8 +3057,8 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
* PageAnonExclusive() has to protect against concurrent GUP:
* * Ordinary GUP: Using the PT lock
* * GUP-fast and fork(): mm->write_protect_seq
- * * GUP-fast and KSM or temporary unmapping (swap, migration):
- * clear/invalidate+flush of the page table entry
+ * * GUP-fast and KSM or temporary unmapping (swap, migration): see
+ * page_try_share_anon_rmap()
*
* Must be called with the (sub)page that's actually referenced via the
* page table entry, which might not necessarily be the head page for a
@@ -2997,6 +3079,11 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page)
*/
if (!PageAnon(page))
return false;
+
+ /* Paired with a memory barrier in page_try_share_anon_rmap(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_rmb();
+
/*
* Note that PageKsm() pages cannot be exclusive, and consequently,
* cannot get pinned.
@@ -3004,6 +3091,21 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page)
return !PageAnonExclusive(page);
}
+/*
+ * Indicates whether GUP can follow a PROT_NONE mapped page, or whether
+ * a (NUMA hinting) fault is required.
+ */
+static inline bool gup_can_follow_protnone(unsigned int flags)
+{
+ /*
+ * FOLL_FORCE has to be able to make progress even if the VMA is
+ * inaccessible. Further, FOLL_FORCE access usually does not represent
+ * application behaviour and we should avoid triggering NUMA hinting
+ * faults.
+ */
+ return flags & FOLL_FORCE;
+}
+
typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data);
@@ -3011,7 +3113,7 @@ extern int apply_to_existing_page_range(struct mm_struct *mm,
unsigned long address, unsigned long size,
pte_fn_t fn, void *data);
-extern void init_mem_debugging_and_hardening(void);
+extern void __init init_mem_debugging_and_hardening(void);
#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
@@ -3386,12 +3488,4 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
}
#endif
-/*
- * Whether to drop the pte markers, for example, the uffd-wp information for
- * file-backed memory. This should only be specified when we will completely
- * drop the page in the mm, either by truncation or unmapping of the vma. By
- * default, the flag is not set.
- */
-#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
-
#endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 7b25b53c474a..e8ed225d8f7c 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -34,15 +34,25 @@ static inline int page_is_file_lru(struct page *page)
return folio_is_file_lru(page_folio(page));
}
-static __always_inline void update_lru_size(struct lruvec *lruvec,
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
enum lru_list lru, enum zone_type zid,
long nr_pages)
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ lockdep_assert_held(&lruvec->lru_lock);
+ WARN_ON_ONCE(nr_pages != (int)nr_pages);
+
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+ enum lru_list lru, enum zone_type zid,
+ long nr_pages)
+{
+ __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
@@ -66,11 +76,6 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio)
__folio_clear_unevictable(folio);
}
-static __always_inline void __clear_page_lru_flags(struct page *page)
-{
- __folio_clear_lru_flags(page_folio(page));
-}
-
/**
* folio_lru_list - Which LRU list should a folio be on?
* @folio: The folio to test.
@@ -94,11 +99,224 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio)
return lru;
}
+#ifdef CONFIG_LRU_GEN
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+static inline bool lru_gen_enabled(void)
+{
+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+}
+#else
+static inline bool lru_gen_enabled(void)
+{
+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
+}
+#endif
+
+static inline bool lru_gen_in_fault(void)
+{
+ return current->in_lru_fault;
+}
+
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+ return seq % MAX_NR_GENS;
+}
+
+static inline int lru_hist_from_seq(unsigned long seq)
+{
+ return seq % NR_HIST_GENS;
+}
+
+static inline int lru_tier_from_refs(int refs)
+{
+ VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
+
+ /* see the comment in folio_lru_refs() */
+ return order_base_2(refs + 1);
+}
+
+static inline int folio_lru_refs(struct folio *folio)
+{
+ unsigned long flags = READ_ONCE(folio->flags);
+ bool workingset = flags & BIT(PG_workingset);
+
+ /*
+ * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
+ * total number of accesses is N>1, since N=0,1 both map to the first
+ * tier. lru_tier_from_refs() will account for this off-by-one. Also see
+ * the comment on MAX_NR_TIERS.
+ */
+ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
+}
+
+static inline int folio_lru_gen(struct folio *folio)
+{
+ unsigned long flags = READ_ONCE(folio->flags);
+
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+ unsigned long max_seq = lruvec->lrugen.max_seq;
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+
+ /* see the comment on MIN_NR_GENS */
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
+ int old_gen, int new_gen)
+{
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ int delta = folio_nr_pages(folio);
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
+
+ if (old_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
+ lrugen->nr_pages[old_gen][type][zone] - delta);
+ if (new_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
+ lrugen->nr_pages[new_gen][type][zone] + delta);
+
+ /* addition */
+ if (old_gen < 0) {
+ if (lru_gen_is_active(lruvec, new_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, delta);
+ return;
+ }
+
+ /* deletion */
+ if (new_gen < 0) {
+ if (lru_gen_is_active(lruvec, old_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, -delta);
+ return;
+ }
+
+ /* promotion */
+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
+ __update_lru_size(lruvec, lru, zone, -delta);
+ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
+ }
+
+ /* demotion requires isolation, e.g., lru_deactivate_fn() */
+ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
+}
+
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ unsigned long seq;
+ unsigned long flags;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
+
+ if (folio_test_unevictable(folio) || !lrugen->enabled)
+ return false;
+ /*
+ * There are three common cases for this page:
+ * 1. If it's hot, e.g., freshly faulted in or previously hot and
+ * migrated, add it to the youngest generation.
+ * 2. If it's cold but can't be evicted immediately, i.e., an anon page
+ * not in swapcache or a dirty page pending writeback, add it to the
+ * second oldest generation.
+ * 3. Everything else (clean, cold) is added to the oldest generation.
+ */
+ if (folio_test_active(folio))
+ seq = lrugen->max_seq;
+ else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
+ (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio))))
+ seq = lrugen->min_seq[type] + 1;
+ else
+ seq = lrugen->min_seq[type];
+
+ gen = lru_gen_from_seq(seq);
+ flags = (gen + 1UL) << LRU_GEN_PGOFF;
+ /* see the comment on MIN_NR_GENS about PG_active */
+ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+
+ lru_gen_update_size(lruvec, folio, -1, gen);
+ /* for folio_rotate_reclaimable() */
+ if (reclaiming)
+ list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+ else
+ list_add(&folio->lru, &lrugen->lists[gen][type][zone]);
+
+ return true;
+}
+
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ unsigned long flags;
+ int gen = folio_lru_gen(folio);
+
+ if (gen < 0)
+ return false;
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+
+ /* for folio_migrate_flags() */
+ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
+ flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
+ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+ lru_gen_update_size(lruvec, folio, gen, -1);
+ list_del(&folio->lru);
+
+ return true;
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline bool lru_gen_enabled(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_in_fault(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ return false;
+}
+
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_add_folio(lruvec, folio, false))
+ return;
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
if (lru != LRU_UNEVICTABLE)
@@ -116,23 +334,23 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_add_folio(lruvec, folio, true))
+ return;
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
/* This is not expected to be used on LRU_UNEVICTABLE */
list_add_tail(&folio->lru, &lruvec->lists[lru]);
}
-static __always_inline void add_page_to_lru_list_tail(struct page *page,
- struct lruvec *lruvec)
-{
- lruvec_add_folio_tail(lruvec, page_folio(page));
-}
-
static __always_inline
void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_del_folio(lruvec, folio, false))
+ return;
+
if (lru != LRU_UNEVICTABLE)
list_del(&folio->lru);
update_lru_size(lruvec, lru, folio_zonenum(folio),
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index cf97f3884fda..500e536796ca 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -9,6 +9,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
+#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
@@ -223,6 +224,18 @@ struct page {
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
+#ifdef CONFIG_KMSAN
+ /*
+ * KMSAN metadata for this page:
+ * - shadow page: every bit indicates whether the corresponding
+ * bit of the original page is initialized (0) or not (1);
+ * - origin page: every 4 bytes contain an id of the stack trace
+ * where the uninitialized value was created.
+ */
+ struct page *kmsan_shadow;
+ struct page *kmsan_origin;
+#endif
+
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
@@ -244,6 +257,13 @@ struct page {
* @_refcount: Do not access this member directly. Use folio_ref_count()
* to find how many references there are to this folio.
* @memcg_data: Memory Control Group data.
+ * @_flags_1: For large folios, additional page flags.
+ * @__head: Points to the folio. Do not use.
+ * @_folio_dtor: Which destructor to use for this folio.
+ * @_folio_order: Do not use directly, call folio_order().
+ * @_total_mapcount: Do not use directly, call folio_entire_mapcount().
+ * @_pincount: Do not use directly, call folio_maybe_dma_pinned().
+ * @_folio_nr_pages: Do not use directly, call folio_nr_pages().
*
* A folio is a physically, virtually and logically contiguous set
* of bytes. It is a power-of-two in size, and it is aligned to that
@@ -282,9 +302,17 @@ struct folio {
};
struct page page;
};
+ unsigned long _flags_1;
+ unsigned long __head;
+ unsigned char _folio_dtor;
+ unsigned char _folio_order;
+ atomic_t _total_mapcount;
+ atomic_t _pincount;
+#ifdef CONFIG_64BIT
+ unsigned int _folio_nr_pages;
+#endif
};
-static_assert(sizeof(struct page) == sizeof(struct folio));
#define FOLIO_MATCH(pg, fl) \
static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl))
FOLIO_MATCH(flags, flags);
@@ -299,6 +327,19 @@ FOLIO_MATCH(_refcount, _refcount);
FOLIO_MATCH(memcg_data, memcg_data);
#endif
#undef FOLIO_MATCH
+#define FOLIO_MATCH(pg, fl) \
+ static_assert(offsetof(struct folio, fl) == \
+ offsetof(struct page, pg) + sizeof(struct page))
+FOLIO_MATCH(flags, _flags_1);
+FOLIO_MATCH(compound_head, __head);
+FOLIO_MATCH(compound_dtor, _folio_dtor);
+FOLIO_MATCH(compound_order, _folio_order);
+FOLIO_MATCH(compound_mapcount, _total_mapcount);
+FOLIO_MATCH(compound_pincount, _pincount);
+#ifdef CONFIG_64BIT
+FOLIO_MATCH(compound_nr, _folio_nr_pages);
+#endif
+#undef FOLIO_MATCH
static inline atomic_t *folio_mapcount_ptr(struct folio *folio)
{
@@ -407,21 +448,6 @@ struct vm_area_struct {
unsigned long vm_end; /* The first byte after our end address
within vm_mm. */
- /* linked list of VM areas per task, sorted by address */
- struct vm_area_struct *vm_next, *vm_prev;
-
- struct rb_node vm_rb;
-
- /*
- * Largest free memory gap in bytes to the left of this VMA.
- * Either between this VMA and vma->vm_prev, or between one of the
- * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
- * get_unmapped_area find a free area of the right size.
- */
- unsigned long rb_subtree_gap;
-
- /* Second cache line starts here. */
-
struct mm_struct *vm_mm; /* The address space we belong to. */
/*
@@ -485,9 +511,7 @@ struct vm_area_struct {
struct kioctx_table;
struct mm_struct {
struct {
- struct vm_area_struct *mmap; /* list of VMAs */
- struct rb_root mm_rb;
- u64 vmacache_seqnum; /* per-thread vmacache */
+ struct maple_tree mm_mt;
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
@@ -501,7 +525,6 @@ struct mm_struct {
unsigned long mmap_compat_legacy_base;
#endif
unsigned long task_size; /* size of task vm space */
- unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd;
#ifdef CONFIG_MEMBARRIER
@@ -631,22 +654,22 @@ struct mm_struct {
#endif
#ifdef CONFIG_NUMA_BALANCING
/*
- * numa_next_scan is the next time that the PTEs will be marked
- * pte_numa. NUMA hinting faults will gather statistics and
- * migrate pages to new nodes if necessary.
+ * numa_next_scan is the next time that PTEs will be remapped
+ * PROT_NONE to trigger NUMA hinting faults; such faults gather
+ * statistics and migrate pages to new nodes if necessary.
*/
unsigned long numa_next_scan;
- /* Restart point for scanning and setting pte_numa */
+ /* Restart point for scanning and remapping PTEs. */
unsigned long numa_scan_offset;
- /* numa_scan_seq prevents two threads setting pte_numa */
+ /* numa_scan_seq prevents two threads remapping PTEs. */
int numa_scan_seq;
#endif
/*
* An operation with batched TLB flushing is going on. Anything
* that can move process memory needs to flush the TLB when
- * moving a PROT_NONE or PROT_NUMA mapped page.
+ * moving a PROT_NONE mapped page.
*/
atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
@@ -671,7 +694,28 @@ struct mm_struct {
* merging.
*/
unsigned long ksm_merging_pages;
+ /*
+ * Represent how many pages are checked for ksm merging
+ * including merged and not merged.
+ */
+ unsigned long ksm_rmap_items;
+#endif
+#ifdef CONFIG_LRU_GEN
+ struct {
+ /* this mm_struct is on lru_gen_mm_list */
+ struct list_head list;
+ /*
+ * Set when switching to this mm_struct, as a hint of
+ * whether it has been used since the last time per-node
+ * page table walkers cleared the corresponding bits.
+ */
+ unsigned long bitmap;
+#ifdef CONFIG_MEMCG
+ /* points to the memcg of "owner" above */
+ struct mem_cgroup *memcg;
#endif
+ } lru_gen;
+#endif /* CONFIG_LRU_GEN */
} __randomize_layout;
/*
@@ -681,6 +725,7 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};
+#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
extern struct mm_struct init_mm;
/* Pointer magic because the dynamic array size confuses some compilers. */
@@ -698,6 +743,87 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
return (struct cpumask *)&mm->cpu_bitmap;
}
+#ifdef CONFIG_LRU_GEN
+
+struct lru_gen_mm_list {
+ /* mm_struct list for page table walkers */
+ struct list_head fifo;
+ /* protects the list above */
+ spinlock_t lock;
+};
+
+void lru_gen_add_mm(struct mm_struct *mm);
+void lru_gen_del_mm(struct mm_struct *mm);
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm);
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+ INIT_LIST_HEAD(&mm->lru_gen.list);
+ mm->lru_gen.bitmap = 0;
+#ifdef CONFIG_MEMCG
+ mm->lru_gen.memcg = NULL;
+#endif
+}
+
+static inline void lru_gen_use_mm(struct mm_struct *mm)
+{
+ /*
+ * When the bitmap is set, page reclaim knows this mm_struct has been
+ * used since the last time it cleared the bitmap. So it might be worth
+ * walking the page tables of this mm_struct to clear the accessed bit.
+ */
+ WRITE_ONCE(mm->lru_gen.bitmap, -1);
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_add_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_del_mm(struct mm_struct *mm)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+}
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_use_mm(struct mm_struct *mm)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
+struct vma_iterator {
+ struct ma_state mas;
+};
+
+#define VMA_ITERATOR(name, __mm, __addr) \
+ struct vma_iterator name = { \
+ .mas = { \
+ .tree = &(__mm)->mm_mt, \
+ .index = __addr, \
+ .node = MAS_START, \
+ }, \
+ }
+
+static inline void vma_iter_init(struct vma_iterator *vmi,
+ struct mm_struct *mm, unsigned long addr)
+{
+ vmi->mas.tree = &mm->mm_mt;
+ vmi->mas.index = addr;
+ vmi->mas.node = MAS_START;
+}
+
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index c1bc6731125c..0bb4b6da9993 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -25,18 +25,6 @@
#define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8)
/*
- * The per task VMA cache array:
- */
-#define VMACACHE_BITS 2
-#define VMACACHE_SIZE (1U << VMACACHE_BITS)
-#define VMACACHE_MASK (VMACACHE_SIZE - 1)
-
-struct vmacache {
- u64 seqnum;
- struct vm_area_struct *vmas[VMACACHE_SIZE];
-};
-
-/*
* When updating this, please also update struct resident_page_types[] in
* kernel/fork.c
*/
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 8a30de08e913..c726ea781255 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -293,6 +293,7 @@ struct mmc_card {
#define MMC_QUIRK_BROKEN_IRQ_POLLING (1<<11) /* Polling SDIO_CCCR_INTx could create a fake interrupt */
#define MMC_QUIRK_TRIM_BROKEN (1<<12) /* Skip trim */
#define MMC_QUIRK_BROKEN_HPI (1<<13) /* Disable broken HPI support */
+#define MMC_QUIRK_BROKEN_SD_DISCARD (1<<14) /* Disable broken SD discard support */
bool reenable_cmdq; /* Re-enable Command Queue */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 9c50bc40f8ff..6f7993803ee7 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -451,7 +451,7 @@ static inline bool mmc_ready_for_data(u32 status)
#define MMC_SECURE_TRIM1_ARG 0x80000001
#define MMC_SECURE_TRIM2_ARG 0x80008000
#define MMC_SECURE_ARGS 0x80000000
-#define MMC_TRIM_ARGS 0x00008001
+#define MMC_TRIM_OR_DISCARD_ARGS 0x00008003
#define mmc_driver_type_mask(n) (1 << (n))
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 355d842d2731..5f74891556f3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -24,10 +24,10 @@
#include <asm/page.h>
/* Free memory management - zoned buddy allocator. */
-#ifndef CONFIG_FORCE_MAX_ZONEORDER
+#ifndef CONFIG_ARCH_FORCE_MAX_ORDER
#define MAX_ORDER 11
#else
-#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
+#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
#endif
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
@@ -121,20 +121,6 @@ static inline bool free_area_empty(struct free_area *area, int migratetype)
struct pglist_data;
-/*
- * Add a wild amount of padding here to ensure data fall into separate
- * cachelines. There are very few zone structures in the machine, so space
- * consumption is not a concern here.
- */
-#if defined(CONFIG_SMP)
-struct zone_padding {
- char x[0];
-} ____cacheline_internodealigned_in_smp;
-#define ZONE_PADDING(name) struct zone_padding name;
-#else
-#define ZONE_PADDING(name)
-#endif
-
#ifdef CONFIG_NUMA
enum numa_stat_item {
NUMA_HIT, /* allocated in intended node */
@@ -222,6 +208,7 @@ enum node_stat_item {
#endif
#ifdef CONFIG_NUMA_BALANCING
PGPROMOTE_SUCCESS, /* promote successfully */
+ PGPROMOTE_CANDIDATE, /* candidate pages to promote */
#endif
NR_VM_NODE_STAT_ITEMS
};
@@ -307,6 +294,8 @@ static inline bool is_active_lru(enum lru_list lru)
return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
}
+#define WORKINGSET_ANON 0
+#define WORKINGSET_FILE 1
#define ANON_AND_FILE 2
enum lruvec_flags {
@@ -315,6 +304,207 @@ enum lruvec_flags {
*/
};
+#endif /* !__GENERATING_BOUNDS_H */
+
+/*
+ * Evictable pages are divided into multiple generations. The youngest and the
+ * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
+ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
+ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
+ * corresponding generation. The gen counter in folio->flags stores gen+1 while
+ * a page is on one of lrugen->lists[]. Otherwise it stores 0.
+ *
+ * A page is added to the youngest generation on faulting. The aging needs to
+ * check the accessed bit at least twice before handing this page over to the
+ * eviction. The first check takes care of the accessed bit set on the initial
+ * fault; the second check makes sure this page hasn't been used since then.
+ * This process, AKA second chance, requires a minimum of two generations,
+ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
+ * LRU, e.g., /proc/vmstat, these two generations are considered active; the
+ * rest of generations, if they exist, are considered inactive. See
+ * lru_gen_is_active().
+ *
+ * PG_active is always cleared while a page is on one of lrugen->lists[] so that
+ * the aging needs not to worry about it. And it's set again when a page
+ * considered active is isolated for non-reclaiming purposes, e.g., migration.
+ * See lru_gen_add_folio() and lru_gen_del_folio().
+ *
+ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
+ * number of categories of the active/inactive LRU when keeping track of
+ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
+ * in folio->flags.
+ */
+#define MIN_NR_GENS 2U
+#define MAX_NR_GENS 4U
+
+/*
+ * Each generation is divided into multiple tiers. A page accessed N times
+ * through file descriptors is in tier order_base_2(N). A page in the first tier
+ * (N=0,1) is marked by PG_referenced unless it was faulted in through page
+ * tables or read ahead. A page in any other tier (N>1) is marked by
+ * PG_referenced and PG_workingset. This implies a minimum of two tiers is
+ * supported without using additional bits in folio->flags.
+ *
+ * In contrast to moving across generations which requires the LRU lock, moving
+ * across tiers only involves atomic operations on folio->flags and therefore
+ * has a negligible cost in the buffered access path. In the eviction path,
+ * comparisons of refaulted/(evicted+protected) from the first tier and the
+ * rest infer whether pages accessed multiple times through file descriptors
+ * are statistically hot and thus worth protecting.
+ *
+ * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
+ * number of categories of the active/inactive LRU when keeping track of
+ * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
+ * folio->flags.
+ */
+#define MAX_NR_TIERS 4U
+
+#ifndef __GENERATING_BOUNDS_H
+
+struct lruvec;
+struct page_vma_mapped_walk;
+
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+
+#ifdef CONFIG_LRU_GEN
+
+enum {
+ LRU_GEN_ANON,
+ LRU_GEN_FILE,
+};
+
+enum {
+ LRU_GEN_CORE,
+ LRU_GEN_MM_WALK,
+ LRU_GEN_NONLEAF_YOUNG,
+ NR_LRU_GEN_CAPS
+};
+
+#define MIN_LRU_BATCH BITS_PER_LONG
+#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
+
+/* whether to keep historical stats from evicted generations */
+#ifdef CONFIG_LRU_GEN_STATS
+#define NR_HIST_GENS MAX_NR_GENS
+#else
+#define NR_HIST_GENS 1U
+#endif
+
+/*
+ * The youngest generation number is stored in max_seq for both anon and file
+ * types as they are aged on an equal footing. The oldest generation numbers are
+ * stored in min_seq[] separately for anon and file types as clean file pages
+ * can be evicted regardless of swap constraints.
+ *
+ * Normally anon and file min_seq are in sync. But if swapping is constrained,
+ * e.g., out of swap space, file min_seq is allowed to advance and leave anon
+ * min_seq behind.
+ *
+ * The number of pages in each generation is eventually consistent and therefore
+ * can be transiently negative when reset_batch_size() is pending.
+ */
+struct lru_gen_struct {
+ /* the aging increments the youngest generation number */
+ unsigned long max_seq;
+ /* the eviction increments the oldest generation numbers */
+ unsigned long min_seq[ANON_AND_FILE];
+ /* the birth time of each generation in jiffies */
+ unsigned long timestamps[MAX_NR_GENS];
+ /* the multi-gen LRU lists, lazily sorted on eviction */
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the multi-gen LRU sizes, eventually consistent */
+ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the exponential moving average of refaulted */
+ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
+ /* the exponential moving average of evicted+protected */
+ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
+ /* the first tier doesn't need protection, hence the minus one */
+ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
+ /* can be modified without holding the LRU lock */
+ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multi-gen LRU is enabled */
+ bool enabled;
+};
+
+enum {
+ MM_LEAF_TOTAL, /* total leaf entries */
+ MM_LEAF_OLD, /* old leaf entries */
+ MM_LEAF_YOUNG, /* young leaf entries */
+ MM_NONLEAF_TOTAL, /* total non-leaf entries */
+ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
+ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
+ NR_MM_STATS
+};
+
+/* double-buffering Bloom filters */
+#define NR_BLOOM_FILTERS 2
+
+struct lru_gen_mm_state {
+ /* set to max_seq after each iteration */
+ unsigned long seq;
+ /* where the current iteration continues (inclusive) */
+ struct list_head *head;
+ /* where the last iteration ended (exclusive) */
+ struct list_head *tail;
+ /* to wait for the last page table walker to finish */
+ struct wait_queue_head wait;
+ /* Bloom filters flip after each iteration */
+ unsigned long *filters[NR_BLOOM_FILTERS];
+ /* the mm stats for debugging */
+ unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
+ /* the number of concurrent page table walkers */
+ int nr_walkers;
+};
+
+struct lru_gen_mm_walk {
+ /* the lruvec under reclaim */
+ struct lruvec *lruvec;
+ /* unstable max_seq from lru_gen_struct */
+ unsigned long max_seq;
+ /* the next address within an mm to scan */
+ unsigned long next_addr;
+ /* to batch promoted pages */
+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* to batch the mm stats */
+ int mm_stats[NR_MM_STATS];
+ /* total batched items */
+ int batched;
+ bool can_swap;
+ bool force_scan;
+};
+
+void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
+void lru_gen_exit_memcg(struct mem_cgroup *memcg);
+#endif
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+}
+
+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+}
+#endif
+
+#endif /* CONFIG_LRU_GEN */
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
@@ -332,6 +522,12 @@ struct lruvec {
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
+#ifdef CONFIG_LRU_GEN
+ /* evictable pages divided into generations */
+ struct lru_gen_struct lrugen;
+ /* to concurrently iterate lru_gen_mm_list */
+ struct lru_gen_mm_state mm_state;
+#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
@@ -369,13 +565,6 @@ enum zone_watermarks {
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
-/*
- * Shift to encode migratetype and order in the same integer, with order
- * in the least significant bits.
- */
-#define NR_PCP_ORDER_WIDTH 8
-#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1)
-
#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
@@ -628,7 +817,7 @@ struct zone {
int initialized;
/* Write-intensive fields used from the page allocator */
- ZONE_PADDING(_pad1_)
+ CACHELINE_PADDING(_pad1_);
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER];
@@ -640,7 +829,7 @@ struct zone {
spinlock_t lock;
/* Write-intensive fields used by compaction and vmstats. */
- ZONE_PADDING(_pad2_)
+ CACHELINE_PADDING(_pad2_);
/*
* When free pages are below this point, additional steps are taken
@@ -677,7 +866,7 @@ struct zone {
bool contiguous;
- ZONE_PADDING(_pad3_)
+ CACHELINE_PADDING(_pad3_);
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
@@ -747,6 +936,8 @@ static inline bool zone_is_empty(struct zone *zone)
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
/*
* Define the bit shifts to access each section. For non-existent
@@ -954,8 +1145,10 @@ typedef struct pglist_data {
atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
unsigned long nr_reclaim_start; /* nr pages written while throttled
* when throttling started. */
- struct task_struct *kswapd; /* Protected by
- mem_hotplug_begin/done() */
+#ifdef CONFIG_MEMORY_HOTPLUG
+ struct mutex kswapd_lock;
+#endif
+ struct task_struct *kswapd; /* Protected by kswapd_lock */
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;
@@ -983,7 +1176,7 @@ typedef struct pglist_data {
#endif /* CONFIG_NUMA */
/* Write-intensive fields used by page reclaim */
- ZONE_PADDING(_pad1_)
+ CACHELINE_PADDING(_pad1_);
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
@@ -997,6 +1190,21 @@ typedef struct pglist_data {
struct deferred_split deferred_split_queue;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ /* start time in ms of current promote rate limit period */
+ unsigned int nbp_rl_start;
+ /* number of promote candidate pages at start time of current rate limit period */
+ unsigned long nbp_rl_nr_cand;
+ /* promote threshold in ms */
+ unsigned int nbp_threshold;
+ /* start time in ms of current promote threshold adjustment period */
+ unsigned int nbp_th_start;
+ /*
+ * number of promote candidate pages at stat time of current promote
+ * threshold adjustment period
+ */
+ unsigned long nbp_th_nr_cand;
+#endif
/* Fields commonly accessed by the page reclaim scanner */
/*
@@ -1008,11 +1216,19 @@ typedef struct pglist_data {
unsigned long flags;
- ZONE_PADDING(_pad2_)
+#ifdef CONFIG_LRU_GEN
+ /* kswap mm walk data */
+ struct lru_gen_mm_walk mm_walk;
+#endif
+
+ CACHELINE_PADDING(_pad2_);
/* Per-node vmstats */
struct per_cpu_nodestat __percpu *per_cpu_nodestats;
atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
+#ifdef CONFIG_NUMA
+ struct memory_tier __rcu *memtier;
+#endif
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -1026,11 +1242,6 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
return pgdat->node_start_pfn + pgdat->node_spanned_pages;
}
-static inline bool pgdat_is_empty(pg_data_t *pgdat)
-{
- return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
-}
-
#include <linux/memory_hotplug.h>
void build_all_zonelists(pg_data_t *pgdat);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index fc918a658d48..a112b913fff9 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -13,13 +13,20 @@
*
* Regular device drivers have no business with any of these functions and
* especially storing MSI descriptor pointers in random code is considered
- * abuse. The only function which is relevant for drivers is msi_get_virq().
+ * abuse.
+ *
+ * Device driver relevant functions are available in <linux/msi_api.h>
*/
+#include <linux/irqdomain_defs.h>
#include <linux/cpumask.h>
+#include <linux/msi_api.h>
#include <linux/xarray.h>
#include <linux/mutex.h>
#include <linux/list.h>
+#include <linux/irq.h>
+#include <linux/bits.h>
+
#include <asm/msi.h>
/* Dummy shadow structures if an architecture does not define them */
@@ -68,19 +75,18 @@ struct msi_msg {
extern int pci_msi_ignore_mask;
/* Helper functions */
-struct irq_data;
struct msi_desc;
struct pci_dev;
struct platform_msi_priv_data;
struct device_attribute;
+struct irq_domain;
+struct irq_affinity_desc;
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
#ifdef CONFIG_GENERIC_MSI_IRQ
void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg);
#else
-static inline void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
-{
-}
+static inline void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) { }
#endif
typedef void (*irq_write_msi_msg_t)(struct msi_desc *desc,
@@ -120,6 +126,38 @@ struct pci_msi_desc {
};
};
+/**
+ * union msi_domain_cookie - Opaque MSI domain specific data
+ * @value: u64 value store
+ * @ptr: Pointer to domain specific data
+ * @iobase: Domain specific IOmem pointer
+ *
+ * The content of this data is implementation defined and used by the MSI
+ * domain to store domain specific information which is requried for
+ * interrupt chip callbacks.
+ */
+union msi_domain_cookie {
+ u64 value;
+ void *ptr;
+ void __iomem *iobase;
+};
+
+/**
+ * struct msi_desc_data - Generic MSI descriptor data
+ * @dcookie: Cookie for MSI domain specific data which is required
+ * for irq_chip callbacks
+ * @icookie: Cookie for the MSI interrupt instance provided by
+ * the usage site to the allocation function
+ *
+ * The content of this data is implementation defined, e.g. PCI/IMS
+ * implementations define the meaning of the data. The MSI core ignores
+ * this data completely.
+ */
+struct msi_desc_data {
+ union msi_domain_cookie dcookie;
+ union msi_instance_cookie icookie;
+};
+
#define MSI_MAX_INDEX ((unsigned int)USHRT_MAX)
/**
@@ -137,6 +175,7 @@ struct pci_msi_desc {
*
* @msi_index: Index of the msi descriptor
* @pci: PCI specific msi descriptor data
+ * @data: Generic MSI descriptor data
*/
struct msi_desc {
/* Shared device/bus type independent data */
@@ -156,7 +195,10 @@ struct msi_desc {
void *write_msi_msg_data;
u16 msi_index;
- struct pci_msi_desc pci;
+ union {
+ struct pci_msi_desc pci;
+ struct msi_desc_data data;
+ };
};
/*
@@ -171,33 +213,80 @@ enum msi_desc_filter {
MSI_DESC_ASSOCIATED,
};
+
+/**
+ * struct msi_dev_domain - The internals of MSI domain info per device
+ * @store: Xarray for storing MSI descriptor pointers
+ * @irqdomain: Pointer to a per device interrupt domain
+ */
+struct msi_dev_domain {
+ struct xarray store;
+ struct irq_domain *domain;
+};
+
/**
* msi_device_data - MSI per device data
* @properties: MSI properties which are interesting to drivers
* @platform_data: Platform-MSI specific data
* @mutex: Mutex protecting the MSI descriptor store
- * @__store: Xarray for storing MSI descriptor pointers
+ * @__domains: Internal data for per device MSI domains
* @__iter_idx: Index to search the next entry for iterators
*/
struct msi_device_data {
unsigned long properties;
struct platform_msi_priv_data *platform_data;
struct mutex mutex;
- struct xarray __store;
+ struct msi_dev_domain __domains[MSI_MAX_DEVICE_IRQDOMAINS];
unsigned long __iter_idx;
};
int msi_setup_device_data(struct device *dev);
-unsigned int msi_get_virq(struct device *dev, unsigned int index);
void msi_lock_descs(struct device *dev);
void msi_unlock_descs(struct device *dev);
-struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter);
-struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter);
+struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter);
/**
- * msi_for_each_desc - Iterate the MSI descriptors
+ * msi_first_desc - Get the first MSI descriptor of the default irqdomain
+ * @dev: Device to operate on
+ * @filter: Descriptor state filter
+ *
+ * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
+ * must be invoked before the call.
+ *
+ * Return: Pointer to the first MSI descriptor matching the search
+ * criteria, NULL if none found.
+ */
+static inline struct msi_desc *msi_first_desc(struct device *dev,
+ enum msi_desc_filter filter)
+{
+ return msi_domain_first_desc(dev, MSI_DEFAULT_DOMAIN, filter);
+}
+
+struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter);
+
+/**
+ * msi_domain_for_each_desc - Iterate the MSI descriptors in a specific domain
+ *
+ * @desc: struct msi_desc pointer used as iterator
+ * @dev: struct device pointer - device to iterate
+ * @domid: The id of the interrupt domain which should be walked.
+ * @filter: Filter for descriptor selection
+ *
+ * Notes:
+ * - The loop must be protected with a msi_lock_descs()/msi_unlock_descs()
+ * pair.
+ * - It is safe to remove a retrieved MSI descriptor in the loop.
+ */
+#define msi_domain_for_each_desc(desc, dev, domid, filter) \
+ for ((desc) = msi_domain_first_desc((dev), (domid), (filter)); (desc); \
+ (desc) = msi_next_desc((dev), (domid), (filter)))
+
+/**
+ * msi_for_each_desc - Iterate the MSI descriptors in the default irqdomain
*
* @desc: struct msi_desc pointer used as iterator
* @dev: struct device pointer - device to iterate
@@ -208,9 +297,8 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter);
* pair.
* - It is safe to remove a retrieved MSI descriptor in the loop.
*/
-#define msi_for_each_desc(desc, dev, filter) \
- for ((desc) = msi_first_desc((dev), (filter)); (desc); \
- (desc) = msi_next_desc((dev), (filter)))
+#define msi_for_each_desc(desc, dev, filter) \
+ msi_domain_for_each_desc((desc), (dev), MSI_DEFAULT_DOMAIN, (filter))
#define msi_desc_to_dev(desc) ((desc)->dev)
@@ -237,34 +325,47 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc,
}
#endif
-#ifdef CONFIG_PCI_MSI
-struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc);
-void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg);
-#else /* CONFIG_PCI_MSI */
-static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
+int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
+ struct msi_desc *init_desc);
+/**
+ * msi_insert_msi_desc - Allocate and initialize a MSI descriptor in the
+ * default irqdomain and insert it at @init_desc->msi_index
+ * @dev: Pointer to the device for which the descriptor is allocated
+ * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+static inline int msi_insert_msi_desc(struct device *dev, struct msi_desc *init_desc)
{
+ return msi_domain_insert_msi_desc(dev, MSI_DEFAULT_DOMAIN, init_desc);
}
-#endif /* CONFIG_PCI_MSI */
-int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc);
-void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
- unsigned int first_index, unsigned int last_index);
+void msi_domain_free_msi_descs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last);
+
+/**
+ * msi_free_msi_descs_range - Free a range of MSI descriptors of a device
+ * in the default irqdomain
+ *
+ * @dev: Device for which to free the descriptors
+ * @first: Index to start freeing from (inclusive)
+ * @last: Last index to be freed (inclusive)
+ */
+static inline void msi_free_msi_descs_range(struct device *dev, unsigned int first,
+ unsigned int last)
+{
+ msi_domain_free_msi_descs_range(dev, MSI_DEFAULT_DOMAIN, first, last);
+}
/**
- * msi_free_msi_descs - Free MSI descriptors of a device
+ * msi_free_msi_descs - Free all MSI descriptors of a device in the default irqdomain
* @dev: Device to free the descriptors
*/
static inline void msi_free_msi_descs(struct device *dev)
{
- msi_free_msi_descs_range(dev, MSI_DESC_ALL, 0, MSI_MAX_INDEX);
+ msi_free_msi_descs_range(dev, 0, MSI_MAX_INDEX);
}
-void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
-void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
-
-void pci_msi_mask_irq(struct irq_data *data);
-void pci_msi_unmask_irq(struct irq_data *data);
-
/*
* The arch hooks to setup up msi irqs. Default functions are implemented
* as weak symbols so that they /can/ be overriden by architecture specific
@@ -293,7 +394,7 @@ static inline void msi_device_destroy_sysfs(struct device *dev) { }
*/
bool arch_restore_msi_irqs(struct pci_dev *dev);
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+#ifdef CONFIG_GENERIC_MSI_IRQ
#include <linux/irqhandler.h>
@@ -309,19 +410,22 @@ struct msi_domain_info;
* @get_hwirq: Retrieve the resulting hw irq number
* @msi_init: Domain specific init function for MSI interrupts
* @msi_free: Domain specific function to free a MSI interrupts
- * @msi_check: Callback for verification of the domain/info/dev data
* @msi_prepare: Prepare the allocation of the interrupts in the domain
+ * @prepare_desc: Optional function to prepare the allocated MSI descriptor
+ * in the domain
* @set_desc: Set the msi descriptor for an interrupt
* @domain_alloc_irqs: Optional function to override the default allocation
* function.
* @domain_free_irqs: Optional function to override the default free
* function.
+ * @msi_post_free: Optional function which is invoked after freeing
+ * all interrupts.
*
* @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying
* irqdomain.
*
- * @msi_check, @msi_prepare and @set_desc are callbacks used by
- * msi_domain_alloc/free_irqs().
+ * @msi_check, @msi_prepare, @prepare_desc and @set_desc are callbacks used by the
+ * msi_domain_alloc/free_irqs*() variants.
*
* @domain_alloc_irqs, @domain_free_irqs can be used to override the
* default allocation/free functions (__msi_domain_alloc/free_irqs). This
@@ -329,15 +433,6 @@ struct msi_domain_info;
* be wrapped into the regular irq domains concepts by mere mortals. This
* allows to universally use msi_domain_alloc/free_irqs without having to
* special case XEN all over the place.
- *
- * Contrary to other operations @domain_alloc_irqs and @domain_free_irqs
- * are set to the default implementation if NULL and even when
- * MSI_FLAG_USE_DEF_DOM_OPS is not set to avoid breaking existing users and
- * because these callbacks are obviously mandatory.
- *
- * This is NOT meant to be abused, but it can be useful to build wrappers
- * for specialized MSI irq domains which need extra work before and after
- * calling __msi_domain_alloc_irqs()/__msi_domain_free_irqs().
*/
struct msi_domain_ops {
irq_hw_number_t (*get_hwirq)(struct msi_domain_info *info,
@@ -349,23 +444,29 @@ struct msi_domain_ops {
void (*msi_free)(struct irq_domain *domain,
struct msi_domain_info *info,
unsigned int virq);
- int (*msi_check)(struct irq_domain *domain,
- struct msi_domain_info *info,
- struct device *dev);
int (*msi_prepare)(struct irq_domain *domain,
struct device *dev, int nvec,
msi_alloc_info_t *arg);
+ void (*prepare_desc)(struct irq_domain *domain, msi_alloc_info_t *arg,
+ struct msi_desc *desc);
void (*set_desc)(msi_alloc_info_t *arg,
struct msi_desc *desc);
int (*domain_alloc_irqs)(struct irq_domain *domain,
struct device *dev, int nvec);
void (*domain_free_irqs)(struct irq_domain *domain,
struct device *dev);
+ void (*msi_post_free)(struct irq_domain *domain,
+ struct device *dev);
};
/**
* struct msi_domain_info - MSI interrupt domain data
* @flags: Flags to decribe features and capabilities
+ * @bus_token: The domain bus token
+ * @hwsize: The hardware table size or the software index limit.
+ * If 0 then the size is considered unlimited and
+ * gets initialized to the maximum software index limit
+ * by the domain creation code.
* @ops: The callback data structure
* @chip: Optional: associated interrupt chip
* @chip_data: Optional: associated interrupt chip data
@@ -375,17 +476,42 @@ struct msi_domain_ops {
* @data: Optional: domain specific data
*/
struct msi_domain_info {
- u32 flags;
- struct msi_domain_ops *ops;
- struct irq_chip *chip;
- void *chip_data;
- irq_flow_handler_t handler;
- void *handler_data;
- const char *handler_name;
- void *data;
+ u32 flags;
+ enum irq_domain_bus_token bus_token;
+ unsigned int hwsize;
+ struct msi_domain_ops *ops;
+ struct irq_chip *chip;
+ void *chip_data;
+ irq_flow_handler_t handler;
+ void *handler_data;
+ const char *handler_name;
+ void *data;
};
-/* Flags for msi_domain_info */
+/**
+ * struct msi_domain_template - Template for MSI device domains
+ * @name: Storage for the resulting name. Filled in by the core.
+ * @chip: Interrupt chip for this domain
+ * @ops: MSI domain ops
+ * @info: MSI domain info data
+ */
+struct msi_domain_template {
+ char name[48];
+ struct irq_chip chip;
+ struct msi_domain_ops ops;
+ struct msi_domain_info info;
+};
+
+/*
+ * Flags for msi_domain_info
+ *
+ * Bit 0-15: Generic MSI functionality which is not subject to restriction
+ * by parent domains
+ *
+ * Bit 16-31: Functionality which depends on the underlying parent domain and
+ * can be masked out by msi_parent_ops::init_dev_msi_info() when
+ * a device MSI domain is initialized.
+ */
enum {
/*
* Init non implemented ops callbacks with default MSI domain
@@ -397,44 +523,100 @@ enum {
* callbacks.
*/
MSI_FLAG_USE_DEF_CHIP_OPS = (1 << 1),
- /* Support multiple PCI MSI interrupts */
- MSI_FLAG_MULTI_PCI_MSI = (1 << 2),
- /* Support PCI MSIX interrupts */
- MSI_FLAG_PCI_MSIX = (1 << 3),
/* Needs early activate, required for PCI */
- MSI_FLAG_ACTIVATE_EARLY = (1 << 4),
+ MSI_FLAG_ACTIVATE_EARLY = (1 << 2),
/*
* Must reactivate when irq is started even when
* MSI_FLAG_ACTIVATE_EARLY has been set.
*/
- MSI_FLAG_MUST_REACTIVATE = (1 << 5),
- /* Is level-triggered capable, using two messages */
- MSI_FLAG_LEVEL_CAPABLE = (1 << 6),
+ MSI_FLAG_MUST_REACTIVATE = (1 << 3),
/* Populate sysfs on alloc() and destroy it on free() */
- MSI_FLAG_DEV_SYSFS = (1 << 7),
- /* MSI-X entries must be contiguous */
- MSI_FLAG_MSIX_CONTIGUOUS = (1 << 8),
+ MSI_FLAG_DEV_SYSFS = (1 << 4),
/* Allocate simple MSI descriptors */
- MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS = (1 << 9),
+ MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS = (1 << 5),
/* Free MSI descriptors */
- MSI_FLAG_FREE_MSI_DESCS = (1 << 10),
+ MSI_FLAG_FREE_MSI_DESCS = (1 << 6),
+ /*
+ * Quirk to handle MSI implementations which do not provide
+ * masking. Currently known to affect x86, but has to be partially
+ * handled in the core MSI code.
+ */
+ MSI_FLAG_NOMASK_QUIRK = (1 << 7),
+
+ /* Mask for the generic functionality */
+ MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0),
+
+ /* Mask for the domain specific functionality */
+ MSI_DOMAIN_FLAGS_MASK = GENMASK(31, 16),
+
+ /* Support multiple PCI MSI interrupts */
+ MSI_FLAG_MULTI_PCI_MSI = (1 << 16),
+ /* Support PCI MSIX interrupts */
+ MSI_FLAG_PCI_MSIX = (1 << 17),
+ /* Is level-triggered capable, using two messages */
+ MSI_FLAG_LEVEL_CAPABLE = (1 << 18),
+ /* MSI-X entries must be contiguous */
+ MSI_FLAG_MSIX_CONTIGUOUS = (1 << 19),
+ /* PCI/MSI-X vectors can be dynamically allocated/freed post MSI-X enable */
+ MSI_FLAG_PCI_MSIX_ALLOC_DYN = (1 << 20),
+ /* Support for PCI/IMS */
+ MSI_FLAG_PCI_IMS = (1 << 21),
};
+/**
+ * struct msi_parent_ops - MSI parent domain callbacks and configuration info
+ *
+ * @supported_flags: Required: The supported MSI flags of the parent domain
+ * @prefix: Optional: Prefix for the domain and chip name
+ * @init_dev_msi_info: Required: Callback for MSI parent domains to setup parent
+ * domain specific domain flags, domain ops and interrupt chip
+ * callbacks when a per device domain is created.
+ */
+struct msi_parent_ops {
+ u32 supported_flags;
+ const char *prefix;
+ bool (*init_dev_msi_info)(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *msi_parent_domain,
+ struct msi_domain_info *msi_child_info);
+};
+
+bool msi_parent_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *msi_parent_domain,
+ struct msi_domain_info *msi_child_info);
+
int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force);
struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent);
-int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec);
-int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev,
- int nvec);
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec);
-void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev);
-void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev);
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev);
+
+bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
+ const struct msi_domain_template *template,
+ unsigned int hwsize, void *domain_data,
+ void *chip_data);
+void msi_remove_device_irq_domain(struct device *dev, unsigned int domid);
+
+bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
+ enum irq_domain_bus_token bus_token);
+
+int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last);
+int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last);
+int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs);
+
+struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *cookie);
+
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last);
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last);
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid);
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid);
+
struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain);
struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode,
@@ -467,20 +649,27 @@ int platform_msi_device_domain_alloc(struct irq_domain *domain, unsigned int vir
void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int virq,
unsigned int nvec);
void *platform_msi_get_host_data(struct irq_domain *domain);
-#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
+#endif /* CONFIG_GENERIC_MSI_IRQ */
-#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
+/* PCI specific interfaces */
+#ifdef CONFIG_PCI_MSI
+struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc);
+void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg);
+void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
+void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
+void pci_msi_mask_irq(struct irq_data *data);
+void pci_msi_unmask_irq(struct irq_data *data);
struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent);
u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev);
struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev);
-bool pci_dev_has_special_msi_domain(struct pci_dev *pdev);
-#else
+#else /* CONFIG_PCI_MSI */
static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev)
{
return NULL;
}
-#endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */
+static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) { }
+#endif /* !CONFIG_PCI_MSI */
#endif /* LINUX_MSI_H */
diff --git a/include/linux/msi_api.h b/include/linux/msi_api.h
new file mode 100644
index 000000000000..391087ad99b1
--- /dev/null
+++ b/include/linux/msi_api.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_MSI_API_H
+#define LINUX_MSI_API_H
+
+/*
+ * APIs which are relevant for device driver code for allocating and
+ * freeing MSI interrupts and querying the associations between
+ * hardware/software MSI indices and the Linux interrupt number.
+ */
+
+struct device;
+
+/*
+ * Per device interrupt domain related constants.
+ */
+enum msi_domain_ids {
+ MSI_DEFAULT_DOMAIN,
+ MSI_SECONDARY_DOMAIN,
+ MSI_MAX_DEVICE_IRQDOMAINS,
+};
+
+/**
+ * union msi_instance_cookie - MSI instance cookie
+ * @value: u64 value store
+ * @ptr: Pointer to usage site specific data
+ *
+ * This cookie is handed to the IMS allocation function and stored in the
+ * MSI descriptor for the interrupt chip callbacks.
+ *
+ * The content of this cookie is MSI domain implementation defined. For
+ * PCI/IMS implementations this could be a PASID or a pointer to queue
+ * memory.
+ */
+union msi_instance_cookie {
+ u64 value;
+ void *ptr;
+};
+
+/**
+ * msi_map - Mapping between MSI index and Linux interrupt number
+ * @index: The MSI index, e.g. slot in the MSI-X table or
+ * a software managed index if >= 0. If negative
+ * the allocation function failed and it contains
+ * the error code.
+ * @virq: The associated Linux interrupt number
+ */
+struct msi_map {
+ int index;
+ int virq;
+};
+
+/*
+ * Constant to be used for dynamic allocations when the allocation is any
+ * free MSI index, which is either an entry in a hardware table or a
+ * software managed index.
+ */
+#define MSI_ANY_INDEX UINT_MAX
+
+unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index);
+
+/**
+ * msi_get_virq - Lookup the Linux interrupt number for a MSI index on the default interrupt domain
+ * @dev: Device for which the lookup happens
+ * @index: The MSI index to lookup
+ *
+ * Return: The Linux interrupt number on success (> 0), 0 if not found
+ */
+static inline unsigned int msi_get_virq(struct device *dev, unsigned int index)
+{
+ return msi_domain_get_virq(dev, MSI_DEFAULT_DOMAIN, index);
+}
+
+#endif
diff --git a/include/linux/net.h b/include/linux/net.h
index 711c3593c3b8..18d942bbdf6e 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -41,6 +41,7 @@ struct net;
#define SOCK_NOSPACE 2
#define SOCK_PASSCRED 3
#define SOCK_PASSSEC 4
+#define SOCK_SUPPORT_ZC 5
#ifndef ARCH_HAS_SOCKET_TYPES
/**
diff --git a/include/linux/node.h b/include/linux/node.h
index 40d641a8bfb0..427a5975cf40 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -2,15 +2,15 @@
/*
* include/linux/node.h - generic node definition
*
- * This is mainly for topological representation. We define the
- * basic 'struct node' here, which can be embedded in per-arch
+ * This is mainly for topological representation. We define the
+ * basic 'struct node' here, which can be embedded in per-arch
* definitions of processors.
*
* Basic handling of the devices is done in drivers/base/node.c
- * and system devices are handled in drivers/base/sys.c.
+ * and system devices are handled in drivers/base/sys.c.
*
* Nodes are exported via driverfs in the class/node/devices/
- * directory.
+ * directory.
*/
#ifndef _LINUX_NODE_H_
#define _LINUX_NODE_H_
@@ -18,7 +18,6 @@
#include <linux/device.h>
#include <linux/cpumask.h>
#include <linux/list.h>
-#include <linux/workqueue.h>
/**
* struct node_hmem_attrs - heterogeneous memory performance attributes
@@ -84,10 +83,6 @@ static inline void node_set_perf_attrs(unsigned int nid,
struct node {
struct device dev;
struct list_head access_list;
-
-#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS)
- struct work_struct node_work;
-#endif
#ifdef CONFIG_HMEM_REPORTING
struct list_head cache_attrs;
struct device *cache_dev;
@@ -96,7 +91,6 @@ struct node {
struct memory_block;
extern struct node *node_devices[];
-typedef void (*node_registration_func_t)(struct node *);
#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
void register_memory_blocks_under_node(int nid, unsigned long start_pfn,
@@ -144,11 +138,6 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
extern int register_memory_node_under_compute_node(unsigned int mem_nid,
unsigned int cpu_nid,
unsigned access);
-
-#ifdef CONFIG_HUGETLBFS
-extern void register_hugetlbfs_with_node(node_registration_func_t doregister,
- node_registration_func_t unregister);
-#endif
#else
static inline void node_dev_init(void)
{
@@ -176,18 +165,8 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
{
}
-
-static inline void register_hugetlbfs_with_node(node_registration_func_t reg,
- node_registration_func_t unreg)
-{
-}
#endif
#define to_node(device) container_of(device, struct node, dev)
-static inline bool node_is_toptier(int node)
-{
- return node_state(node, N_CPU);
-}
-
#endif /* _LINUX_NODE_H_ */
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 4b71a96190a8..efef68c9352a 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -493,6 +493,7 @@ static inline int num_node_state(enum node_states state)
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
+#define next_memory_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1U
#define nr_online_nodes 1U
@@ -504,12 +505,20 @@ static inline int num_node_state(enum node_states state)
static inline int node_random(const nodemask_t *maskp)
{
#if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
- int w, bit = NUMA_NO_NODE;
+ int w, bit;
w = nodes_weight(*maskp);
- if (w)
- bit = bitmap_ord_to_pos(maskp->bits,
- get_random_int() % w, MAX_NUMNODES);
+ switch (w) {
+ case 0:
+ bit = NUMA_NO_NODE;
+ break;
+ case 1:
+ bit = first_node(*maskp);
+ break;
+ default:
+ bit = find_nth_bit(maskp->bits, MAX_NUMNODES, prandom_u32_max(w));
+ break;
+ }
return bit;
#else
return 0;
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index cdb171efc7cb..fee881cded01 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -94,6 +94,7 @@ static inline struct cred *nsset_cred(struct nsset *set)
int copy_namespaces(unsigned long flags, struct task_struct *tsk);
void exit_task_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
+int exec_task_namespaces(void);
void free_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
struct cred *, struct fs_struct *);
diff --git a/include/linux/of.h b/include/linux/of.h
index 766d002bddb9..6b79ef9a6541 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -342,7 +342,7 @@ extern int of_property_read_string_helper(const struct device_node *np,
const char **out_strs, size_t sz, int index);
extern int of_device_is_compatible(const struct device_node *device,
const char *);
-extern int of_device_compatible_match(struct device_node *device,
+extern int of_device_compatible_match(const struct device_node *device,
const char *const *compat);
extern bool of_device_is_available(const struct device_node *device);
extern bool of_device_is_big_endian(const struct device_node *device);
@@ -562,7 +562,7 @@ static inline int of_device_is_compatible(const struct device_node *device,
return 0;
}
-static inline int of_device_compatible_match(struct device_node *device,
+static inline int of_device_compatible_match(const struct device_node *device,
const char *const *compat)
{
return 0;
diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index 83fccd0c9bba..d6d3eae2f145 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -37,9 +37,8 @@ extern unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data);
extern int of_irq_to_resource(struct device_node *dev, int index,
struct resource *r);
-extern void of_irq_init(const struct of_device_id *matches);
-
#ifdef CONFIG_OF_IRQ
+extern void of_irq_init(const struct of_device_id *matches);
extern int of_irq_parse_one(struct device_node *device, int index,
struct of_phandle_args *out_irq);
extern int of_irq_count(struct device_node *dev);
@@ -57,6 +56,9 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev,
extern void of_msi_configure(struct device *dev, struct device_node *np);
u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in);
#else
+static inline void of_irq_init(const struct of_device_id *matches)
+{
+}
static inline int of_irq_parse_one(struct device_node *device, int index,
struct of_phandle_args *out_irq)
{
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 02d1e7bbd8cd..7d0c9c48a0c5 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -78,15 +78,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
}
/*
- * Use this helper if tsk->mm != mm and the victim mm needs a special
- * handling. This is guaranteed to stay true after once set.
- */
-static inline bool mm_is_oom_victim(struct mm_struct *mm)
-{
- return test_bit(MMF_OOM_VICTIM, &mm->flags);
-}
-
-/*
* Checks whether a page fault on the given mm is still reliable.
* This is no longer true if the oom reaper started to reap the
* address space which is reflected by MMF_UNSTABLE flag set in
@@ -106,8 +97,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
return 0;
}
-bool __oom_reap_task_mm(struct mm_struct *mm);
-
long oom_badness(struct task_struct *p,
unsigned long totalpages);
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index 19dfdd74835e..1d3be1a2204c 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -51,8 +51,8 @@ static inline bool __must_check __must_check_overflow(bool overflow)
return unlikely(overflow);
}
-/** check_add_overflow() - Calculate addition with overflow checking
- *
+/**
+ * check_add_overflow() - Calculate addition with overflow checking
* @a: first addend
* @b: second addend
* @d: pointer to store sum
@@ -66,8 +66,8 @@ static inline bool __must_check __must_check_overflow(bool overflow)
#define check_add_overflow(a, b, d) \
__must_check_overflow(__builtin_add_overflow(a, b, d))
-/** check_sub_overflow() - Calculate subtraction with overflow checking
- *
+/**
+ * check_sub_overflow() - Calculate subtraction with overflow checking
* @a: minuend; value to subtract from
* @b: subtrahend; value to subtract from @a
* @d: pointer to store difference
@@ -81,8 +81,8 @@ static inline bool __must_check __must_check_overflow(bool overflow)
#define check_sub_overflow(a, b, d) \
__must_check_overflow(__builtin_sub_overflow(a, b, d))
-/** check_mul_overflow() - Calculate multiplication with overflow checking
- *
+/**
+ * check_mul_overflow() - Calculate multiplication with overflow checking
* @a: first factor
* @b: second factor
* @d: pointer to store product
@@ -96,23 +96,24 @@ static inline bool __must_check __must_check_overflow(bool overflow)
#define check_mul_overflow(a, b, d) \
__must_check_overflow(__builtin_mul_overflow(a, b, d))
-/** check_shl_overflow() - Calculate a left-shifted value and check overflow
- *
+/**
+ * check_shl_overflow() - Calculate a left-shifted value and check overflow
* @a: Value to be shifted
* @s: How many bits left to shift
* @d: Pointer to where to store the result
*
* Computes *@d = (@a << @s)
*
- * Returns true if '*d' cannot hold the result or when 'a << s' doesn't
+ * Returns true if '*@d' cannot hold the result or when '@a << @s' doesn't
* make sense. Example conditions:
- * - 'a << s' causes bits to be lost when stored in *d.
- * - 's' is garbage (e.g. negative) or so large that the result of
- * 'a << s' is guaranteed to be 0.
- * - 'a' is negative.
- * - 'a << s' sets the sign bit, if any, in '*d'.
*
- * '*d' will hold the results of the attempted shift, but is not
+ * - '@a << @s' causes bits to be lost when stored in *@d.
+ * - '@s' is garbage (e.g. negative) or so large that the result of
+ * '@a << @s' is guaranteed to be 0.
+ * - '@a' is negative.
+ * - '@a << @s' sets the sign bit, if any, in '*@d'.
+ *
+ * '*@d' will hold the results of the attempted shift, but is not
* considered "safe for use" if true is returned.
*/
#define check_shl_overflow(a, s, d) __must_check_overflow(({ \
@@ -129,7 +130,6 @@ static inline bool __must_check __must_check_overflow(bool overflow)
/**
* size_mul() - Calculate size_t multiplication with saturation at SIZE_MAX
- *
* @factor1: first factor
* @factor2: second factor
*
@@ -149,7 +149,6 @@ static inline size_t __must_check size_mul(size_t factor1, size_t factor2)
/**
* size_add() - Calculate size_t addition with saturation at SIZE_MAX
- *
* @addend1: first addend
* @addend2: second addend
*
@@ -169,7 +168,6 @@ static inline size_t __must_check size_add(size_t addend1, size_t addend2)
/**
* size_sub() - Calculate size_t subtraction with saturation at SIZE_MAX
- *
* @minuend: value to subtract from
* @subtrahend: value to subtract from @minuend
*
@@ -192,7 +190,6 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
/**
* array_size() - Calculate size of 2-dimensional array.
- *
* @a: dimension one
* @b: dimension two
*
@@ -205,7 +202,6 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
/**
* array3_size() - Calculate size of 3-dimensional array.
- *
* @a: dimension one
* @b: dimension two
* @c: dimension three
@@ -220,7 +216,6 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
/**
* flex_array_size() - Calculate size of a flexible array member
* within an enclosing structure.
- *
* @p: Pointer to the structure.
* @member: Name of the flexible array member.
* @count: Number of elements in the array.
@@ -237,7 +232,6 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
/**
* struct_size() - Calculate size of structure with trailing flexible array.
- *
* @p: Pointer to the structure.
* @member: Name of the array member.
* @count: Number of elements in the array.
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index ef1e3e736e14..7d79818dc065 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -55,7 +55,8 @@
#define SECTIONS_WIDTH 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
+ <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#error "Vmemmap: No space for nodes field in page flags"
@@ -89,8 +90,8 @@
#define LAST_CPUPID_SHIFT 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
- <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
#define LAST_CPUPID_WIDTH 0
@@ -100,10 +101,15 @@
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
- > BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error "Not enough bits in page flags"
#endif
+/* see the comment on MAX_NR_TIERS */
+#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \
+ ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \
+ NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH)
+
#endif
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 465ff35a8c00..0b0ae5084e60 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1058,7 +1058,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
1UL << PG_slab | 1UL << PG_active | \
- 1UL << PG_unevictable | __PG_MLOCKED)
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
/*
* Flags checked when a page is prepped for return by the page allocator.
@@ -1069,7 +1069,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
* alloc-free cycle to prevent from reusing the page.
*/
#define PAGE_FLAGS_CHECK_AT_PREP \
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 679591301994..c141ea9a95ef 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -3,15 +3,17 @@
#define _LINUX_PAGE_COUNTER_H
#include <linux/atomic.h>
+#include <linux/cache.h>
#include <linux/kernel.h>
#include <asm/page.h>
struct page_counter {
+ /*
+ * Make sure 'usage' does not share cacheline with any other field. The
+ * memcg->memory.usage is a hot member of struct mem_cgroup.
+ */
atomic_long_t usage;
- unsigned long min;
- unsigned long low;
- unsigned long high;
- unsigned long max;
+ CACHELINE_PADDING(_pad1_);
/* effective memory.min and memory.min usage tracking */
unsigned long emin;
@@ -23,18 +25,18 @@ struct page_counter {
atomic_long_t low_usage;
atomic_long_t children_low_usage;
- /* legacy */
unsigned long watermark;
unsigned long failcnt;
- /*
- * 'parent' is placed here to be far from 'usage' to reduce
- * cache false sharing, as 'usage' is written mostly while
- * parent is frequently read for cgroup's hierarchical
- * counting nature.
- */
+ /* Keep all the read most fields in a separete cacheline. */
+ CACHELINE_PADDING(_pad2_);
+
+ unsigned long min;
+ unsigned long low;
+ unsigned long high;
+ unsigned long max;
struct page_counter *parent;
-};
+} ____cacheline_internodealigned_in_smp;
#if BITS_PER_LONG == 32
#define PAGE_COUNTER_MAX LONG_MAX
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index fabb2e1e087f..22be4582faae 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -36,9 +36,15 @@ struct page_ext {
unsigned long flags;
};
+extern bool early_page_ext;
extern unsigned long page_ext_size;
extern void pgdat_page_ext_init(struct pglist_data *pgdat);
+static inline bool early_page_ext_enabled(void)
+{
+ return early_page_ext;
+}
+
#ifdef CONFIG_SPARSEMEM
static inline void page_ext_init_flatmem(void)
{
@@ -55,7 +61,8 @@ static inline void page_ext_init(void)
}
#endif
-struct page_ext *lookup_page_ext(const struct page *page);
+extern struct page_ext *page_ext_get(struct page *page);
+extern void page_ext_put(struct page_ext *page_ext);
static inline struct page_ext *page_ext_next(struct page_ext *curr)
{
@@ -67,13 +74,13 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr)
#else /* !CONFIG_PAGE_EXTENSION */
struct page_ext;
-static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
+static inline bool early_page_ext_enabled(void)
{
+ return false;
}
-static inline struct page_ext *lookup_page_ext(const struct page *page)
+static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
{
- return NULL;
}
static inline void page_ext_init(void)
@@ -87,5 +94,14 @@ static inline void page_ext_init_flatmem_late(void)
static inline void page_ext_init_flatmem(void)
{
}
+
+static inline struct page_ext *page_ext_get(struct page *page)
+{
+ return NULL;
+}
+
+static inline void page_ext_put(struct page_ext *page_ext)
+{
+}
#endif /* CONFIG_PAGE_EXTENSION */
#endif /* __LINUX_PAGE_EXT_H */
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 4663dfed1293..5cb7bd2078ec 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -13,65 +13,79 @@
* If there is not enough space to store Idle and Young bits in page flags, use
* page ext flags instead.
*/
-
static inline bool folio_test_young(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
+ bool page_young;
if (unlikely(!page_ext))
return false;
- return test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_young = test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_ext_put(page_ext);
+
+ return page_young;
}
static inline void folio_set_young(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
if (unlikely(!page_ext))
return;
set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_ext_put(page_ext);
}
static inline bool folio_test_clear_young(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
+ bool page_young;
if (unlikely(!page_ext))
return false;
- return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_young = test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_ext_put(page_ext);
+
+ return page_young;
}
static inline bool folio_test_idle(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
+ bool page_idle;
if (unlikely(!page_ext))
return false;
- return test_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_ext_put(page_ext);
+
+ return page_idle;
}
static inline void folio_set_idle(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
if (unlikely(!page_ext))
return;
set_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_ext_put(page_ext);
}
static inline void folio_clear_idle(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
if (unlikely(!page_ext))
return;
clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_ext_put(page_ext);
}
#endif /* !CONFIG_64BIT */
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 83c7248053a1..5f1ae07d724b 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -53,6 +53,10 @@ extern unsigned int pageblock_order;
#endif /* CONFIG_HUGETLB_PAGE */
#define pageblock_nr_pages (1UL << pageblock_order)
+#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages)
+#define pageblock_aligned(pfn) IS_ALIGNED((pfn), pageblock_nr_pages)
+#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages)
+#define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages)
/* Forward declaration */
struct page;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 201dc7281640..bbccb4044222 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -718,8 +718,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch);
-unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
- unsigned int nr_pages, struct page **pages);
+unsigned filemap_get_folios_contig(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
struct page **pages);
@@ -989,19 +989,16 @@ static inline int lock_page_killable(struct page *page)
}
/*
- * lock_page_or_retry - Lock the page, unless this would block and the
+ * folio_lock_or_retry - Lock the folio, unless this would block and the
* caller indicated that it can handle a retry.
*
* Return value and mmap_lock implications depend on flags; see
* __folio_lock_or_retry().
*/
-static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm,
- unsigned int flags)
+static inline bool folio_lock_or_retry(struct folio *folio,
+ struct mm_struct *mm, unsigned int flags)
{
- struct folio *folio;
might_sleep();
-
- folio = page_folio(page);
return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags);
}
@@ -1042,7 +1039,6 @@ static inline int wait_on_page_locked_killable(struct page *page)
return folio_wait_locked_killable(page_folio(page));
}
-int folio_put_wait_locked(struct folio *folio, int state);
void wait_on_page_writeback(struct page *page);
void folio_wait_writeback(struct folio *folio);
int folio_wait_writeback_killable(struct folio *folio);
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index ac7b38ad5903..f3fafb731ffd 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -15,12 +15,12 @@ struct mm_walk;
* this handler is required to be able to handle
* pmd_trans_huge() pmds. They may simply choose to
* split_huge_page() instead of handling it explicitly.
- * @pte_entry: if set, called for each non-empty PTE (lowest-level)
- * entry
+ * @pte_entry: if set, called for each PTE (lowest-level) entry,
+ * including empty ones
* @pte_hole: if set, called for each hole at all levels,
- * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD
- * 4:PTE. Any folded depths (where PTRS_PER_P?D is equal
- * to 1) are skipped.
+ * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
+ * Any folded depths (where PTRS_PER_P?D is equal to 1)
+ * are skipped.
* @hugetlb_entry: if set, called for each hugetlb entry
* @test_walk: caller specific callback function to determine whether
* we walk over the current vma or not. Returning 0 means
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 5da0846aa3c1..c0d939f3169c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -38,6 +38,7 @@
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/resource_ext.h>
+#include <linux/msi_api.h>
#include <uapi/linux/pci.h>
#include <linux/pci_ids.h>
@@ -409,6 +410,7 @@ struct pci_dev {
*/
unsigned int irq;
struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */
+ struct resource driver_exclusive_resource; /* driver exclusive resource ranges */
bool match_driver; /* Skip attaching driver */
@@ -475,6 +477,7 @@ struct pci_dev {
unsigned int broken_cmd_compl:1; /* No compl for some cmds */
#endif
#ifdef CONFIG_PCIE_PTM
+ u16 ptm_cap; /* PTM Capability */
unsigned int ptm_root:1;
unsigned int ptm_enabled:1;
u8 ptm_granularity;
@@ -842,6 +845,9 @@ struct pci_error_handlers {
/* Device driver may resume normal operations */
void (*resume)(struct pci_dev *dev);
+
+ /* Allow device driver to record more details of a correctable error */
+ void (*cor_error_detected)(struct pci_dev *dev);
};
@@ -1406,6 +1412,21 @@ int pci_request_selected_regions(struct pci_dev *, int, const char *);
int pci_request_selected_regions_exclusive(struct pci_dev *, int, const char *);
void pci_release_selected_regions(struct pci_dev *, int);
+static inline __must_check struct resource *
+pci_request_config_region_exclusive(struct pci_dev *pdev, unsigned int offset,
+ unsigned int len, const char *name)
+{
+ return __request_region(&pdev->driver_exclusive_resource, offset, len,
+ name, IORESOURCE_EXCLUSIVE);
+}
+
+static inline void pci_release_config_region(struct pci_dev *pdev,
+ unsigned int offset,
+ unsigned int len)
+{
+ __release_region(&pdev->driver_exclusive_resource, offset, len);
+}
+
/* drivers/pci/bus.c */
void pci_add_resource(struct list_head *resources, struct resource *res);
void pci_add_resource_offset(struct list_head *resources, struct resource *res,
@@ -1552,10 +1573,17 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev,
return rc;
return 0;
}
+int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
+ unsigned int max_vecs, unsigned int flags);
int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
unsigned int max_vecs, unsigned int flags,
struct irq_affinity *affd);
+bool pci_msix_can_alloc_dyn(struct pci_dev *dev);
+struct msi_map pci_msix_alloc_irq_at(struct pci_dev *dev, unsigned int index,
+ const struct irq_affinity_desc *affdesc);
+void pci_msix_free_irq(struct pci_dev *pdev, struct msi_map map);
+
void pci_free_irq_vectors(struct pci_dev *dev);
int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec);
@@ -1585,6 +1613,13 @@ pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
return 1;
return -ENOSPC;
}
+static inline int
+pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
+ unsigned int max_vecs, unsigned int flags)
+{
+ return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs,
+ flags, NULL);
+}
static inline void pci_free_irq_vectors(struct pci_dev *dev)
{
@@ -1677,10 +1712,12 @@ bool pci_ats_disabled(void);
#ifdef CONFIG_PCIE_PTM
int pci_enable_ptm(struct pci_dev *dev, u8 *granularity);
+void pci_disable_ptm(struct pci_dev *dev);
bool pcie_ptm_enabled(struct pci_dev *dev);
#else
static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity)
{ return -EINVAL; }
+static inline void pci_disable_ptm(struct pci_dev *dev) { }
static inline bool pcie_ptm_enabled(struct pci_dev *dev)
{ return false; }
#endif
@@ -1895,15 +1932,13 @@ pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
{
return -ENOSPC;
}
-#endif /* CONFIG_PCI */
-
static inline int
pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
unsigned int max_vecs, unsigned int flags)
{
- return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags,
- NULL);
+ return -ENOSPC;
}
+#endif /* CONFIG_PCI */
/* Include architecture-dependent settings and functions */
@@ -2471,6 +2506,14 @@ static inline bool pci_is_thunderbolt_attached(struct pci_dev *pdev)
void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type);
#endif
+struct msi_domain_template;
+
+bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
+ unsigned int hwsize, void *data);
+struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev, union msi_instance_cookie *icookie,
+ const struct irq_affinity_desc *affdesc);
+void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map);
+
#include <linux/dma-mapping.h>
#define pci_printk(level, pdev, fmt, arg...) \
@@ -2481,6 +2524,7 @@ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type);
#define pci_crit(pdev, fmt, arg...) dev_crit(&(pdev)->dev, fmt, ##arg)
#define pci_err(pdev, fmt, arg...) dev_err(&(pdev)->dev, fmt, ##arg)
#define pci_warn(pdev, fmt, arg...) dev_warn(&(pdev)->dev, fmt, ##arg)
+#define pci_warn_once(pdev, fmt, arg...) dev_warn_once(&(pdev)->dev, fmt, ##arg)
#define pci_notice(pdev, fmt, arg...) dev_notice(&(pdev)->dev, fmt, ##arg)
#define pci_info(pdev, fmt, arg...) dev_info(&(pdev)->dev, fmt, ##arg)
#define pci_dbg(pdev, fmt, arg...) dev_dbg(&(pdev)->dev, fmt, ##arg)
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index f1ec5ad1351c..3dbb6fb70658 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -42,7 +42,7 @@
* larger than PERCPU_DYNAMIC_EARLY_SIZE.
*/
#define PERCPU_DYNAMIC_EARLY_SLOTS 128
-#define PERCPU_DYNAMIC_EARLY_SIZE (12 << 10)
+#define PERCPU_DYNAMIC_EARLY_SIZE (20 << 10)
/*
* PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 01861eebed79..8ed5fba6d156 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -15,6 +15,9 @@
#include <linux/types.h>
#include <linux/gfp.h>
+/* percpu_counter batch for local add or sub */
+#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX
+
#ifdef CONFIG_SMP
struct percpu_counter {
@@ -56,6 +59,22 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
}
+/*
+ * With percpu_counter_add_local() and percpu_counter_sub_local(), counts
+ * are accumulated in local per cpu counter and not in fbc->count until
+ * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter
+ * write efficient.
+ * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be
+ * used to add up the counts from each CPU to account for all the local
+ * counts. So percpu_counter_add_local() and percpu_counter_sub_local()
+ * should be used when a counter is updated frequently and read rarely.
+ */
+static inline void
+percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
+{
+ percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH);
+}
+
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
s64 ret = __percpu_counter_sum(fbc);
@@ -138,6 +157,13 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
preempt_enable();
}
+/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
+static inline void
+percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
+{
+ percpu_counter_add(fbc, amount);
+}
+
static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
@@ -193,4 +219,10 @@ static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
percpu_counter_add(fbc, -amount);
}
+static inline void
+percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
+{
+ percpu_counter_add_local(fbc, -amount);
+}
+
#endif /* _LINUX_PERCPU_COUNTER_H */
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 0356cb6a215d..ef914a600087 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -100,7 +100,7 @@ struct arm_pmu {
void (*stop)(struct arm_pmu *);
void (*reset)(void *);
int (*map_event)(struct perf_event *event);
- int (*filter_match)(struct perf_event *event);
+ bool (*filter)(struct pmu *pmu, int cpu);
int num_events;
bool secure_access; /* 32-bit ARM only */
#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40
@@ -174,7 +174,6 @@ void kvm_host_pmu_init(struct arm_pmu *pmu);
/* Internal functions only for core arm_pmu code */
struct arm_pmu *armpmu_alloc(void);
-struct arm_pmu *armpmu_alloc_atomic(void);
void armpmu_free(struct arm_pmu *pmu);
int armpmu_register(struct arm_pmu *pmu);
int armpmu_request_irq(int irq, int cpu);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 853f64b6c8c2..c6a3bac76966 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -266,6 +266,7 @@ struct hw_perf_event {
};
struct perf_event;
+struct perf_event_pmu_context;
/*
* Common implementation detail of pmu::{start,commit,cancel}_txn
@@ -308,7 +309,7 @@ struct pmu {
int capabilities;
int __percpu *pmu_disable_count;
- struct perf_cpu_context __percpu *pmu_cpu_context;
+ struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
int task_ctx_nr;
int hrtimer_interval_ms;
@@ -443,7 +444,7 @@ struct pmu {
/*
* context-switches callback
*/
- void (*sched_task) (struct perf_event_context *ctx,
+ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx,
bool sched_in);
/*
@@ -457,8 +458,8 @@ struct pmu {
* implementation and Perf core context switch handling callbacks for usage
* examples.
*/
- void (*swap_task_ctx) (struct perf_event_context *prev,
- struct perf_event_context *next);
+ void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc);
/* optional */
/*
@@ -522,9 +523,10 @@ struct pmu {
/* optional */
/*
- * Filter events for PMU-specific reasons.
+ * Skip programming this PMU on the given CPU. Typically needed for
+ * big.LITTLE things.
*/
- int (*filter_match) (struct perf_event *event); /* optional */
+ bool (*filter) (struct pmu *pmu, int cpu); /* optional */
/*
* Check period value for PERF_EVENT_IOC_PERIOD ioctl.
@@ -695,6 +697,11 @@ struct perf_event {
int group_caps;
struct perf_event *group_leader;
+ /*
+ * event->pmu will always point to pmu in which this event belongs.
+ * Whereas event->pmu_ctx->pmu may point to other pmu when group of
+ * different pmu events is created.
+ */
struct pmu *pmu;
void *pmu_private;
@@ -720,6 +727,12 @@ struct perf_event {
struct hw_perf_event hw;
struct perf_event_context *ctx;
+ /*
+ * event->pmu_ctx points to perf_event_pmu_context in which the event
+ * is added. This pmu_ctx can be of other pmu for sw event when that
+ * sw event is part of a group which also contains non-sw events.
+ */
+ struct perf_event_pmu_context *pmu_ctx;
atomic_long_t refcount;
/*
@@ -756,11 +769,14 @@ struct perf_event {
struct fasync_struct *fasync;
/* delayed work for NMIs and such */
- int pending_wakeup;
- int pending_kill;
- int pending_disable;
+ unsigned int pending_wakeup;
+ unsigned int pending_kill;
+ unsigned int pending_disable;
+ unsigned int pending_sigtrap;
unsigned long pending_addr; /* SIGTRAP */
- struct irq_work pending;
+ struct irq_work pending_irq;
+ struct callback_head pending_task;
+ unsigned int pending_work;
atomic_t event_limit;
@@ -809,19 +825,69 @@ struct perf_event {
#endif /* CONFIG_PERF_EVENTS */
};
+/*
+ * ,-----------------------[1:n]----------------------.
+ * V V
+ * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
+ * ^ ^ | |
+ * `--------[1:n]---------' `-[n:1]-> pmu <-[1:n]-'
+ *
+ *
+ * struct perf_event_pmu_context lifetime is refcount based and RCU freed
+ * (similar to perf_event_context). Locking is as if it were a member of
+ * perf_event_context; specifically:
+ *
+ * modification, both: ctx->mutex && ctx->lock
+ * reading, either: ctx->mutex || ctx->lock
+ *
+ * There is one exception to this; namely put_pmu_ctx() isn't always called
+ * with ctx->mutex held; this means that as long as we can guarantee the epc
+ * has events the above rules hold.
+ *
+ * Specificially, sys_perf_event_open()'s group_leader case depends on
+ * ctx->mutex pinning the configuration. Since we hold a reference on
+ * group_leader (through the filedesc) it can't go away, therefore it's
+ * associated pmu_ctx must exist and cannot change due to ctx->mutex.
+ */
+struct perf_event_pmu_context {
+ struct pmu *pmu;
+ struct perf_event_context *ctx;
+
+ struct list_head pmu_ctx_entry;
+
+ struct list_head pinned_active;
+ struct list_head flexible_active;
+
+ /* Used to avoid freeing per-cpu perf_event_pmu_context */
+ unsigned int embedded : 1;
+
+ unsigned int nr_events;
+
+ atomic_t refcount; /* event <-> epc */
+ struct rcu_head rcu_head;
+
+ void *task_ctx_data; /* pmu specific data */
+ /*
+ * Set when one or more (plausibly active) event can't be scheduled
+ * due to pmu overcommit or pmu constraints, except tolerant to
+ * events not necessary to be active due to scheduling constraints,
+ * such as cgroups.
+ */
+ int rotate_necessary;
+};
struct perf_event_groups {
struct rb_root tree;
u64 index;
};
+
/**
* struct perf_event_context - event context structure
*
* Used as a container for task events and CPU events as well:
*/
struct perf_event_context {
- struct pmu *pmu;
/*
* Protect the states of the events in the list,
* nr_active, and the list:
@@ -834,27 +900,21 @@ struct perf_event_context {
*/
struct mutex mutex;
- struct list_head active_ctx_list;
+ struct list_head pmu_ctx_list;
struct perf_event_groups pinned_groups;
struct perf_event_groups flexible_groups;
struct list_head event_list;
- struct list_head pinned_active;
- struct list_head flexible_active;
-
int nr_events;
- int nr_active;
int nr_user;
int is_active;
+
+ int nr_task_data;
int nr_stat;
int nr_freq;
int rotate_disable;
- /*
- * Set when nr_events != nr_active, except tolerant to events not
- * necessary to be active due to scheduling constraints, such as cgroups.
- */
- int rotate_necessary;
- refcount_t refcount;
+
+ refcount_t refcount; /* event <-> ctx */
struct task_struct *task;
/*
@@ -875,8 +935,15 @@ struct perf_event_context {
#ifdef CONFIG_CGROUP_PERF
int nr_cgroups; /* cgroup evts */
#endif
- void *task_ctx_data; /* pmu specific data */
struct rcu_head rcu_head;
+
+ /*
+ * Sum (event->pending_sigtrap + event->pending_work)
+ *
+ * The SIGTRAP is targeted at ctx->task, as such it won't do changing
+ * that until the signal is delivered.
+ */
+ local_t nr_pending;
};
/*
@@ -885,12 +952,13 @@ struct perf_event_context {
*/
#define PERF_NR_CONTEXTS 4
-/**
- * struct perf_cpu_context - per cpu event context structure
- */
-struct perf_cpu_context {
- struct perf_event_context ctx;
- struct perf_event_context *task_ctx;
+struct perf_cpu_pmu_context {
+ struct perf_event_pmu_context epc;
+ struct perf_event_pmu_context *task_epc;
+
+ struct list_head sched_cb_entry;
+ int sched_cb_usage;
+
int active_oncpu;
int exclusive;
@@ -898,16 +966,20 @@ struct perf_cpu_context {
struct hrtimer hrtimer;
ktime_t hrtimer_interval;
unsigned int hrtimer_active;
+};
+
+/**
+ * struct perf_event_cpu_context - per cpu event context structure
+ */
+struct perf_cpu_context {
+ struct perf_event_context ctx;
+ struct perf_event_context *task_ctx;
+ int online;
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
- struct list_head cgrp_cpuctx_entry;
#endif
- struct list_head sched_cb_entry;
- int sched_cb_usage;
-
- int online;
/*
* Per-CPU storage for iterators used in visit_groups_merge. The default
* storage is of size 2 to hold the CPU and any CPU event iterators.
@@ -971,6 +1043,8 @@ perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
#ifdef CONFIG_PERF_EVENTS
+extern struct perf_event_context *perf_cpu_task_ctx(void);
+
extern void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
@@ -1176,7 +1250,7 @@ static inline int is_software_event(struct perf_event *event)
*/
static inline int in_software_context(struct perf_event *event)
{
- return event->ctx->pmu->task_ctx_nr == perf_sw_context;
+ return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
}
static inline int is_exclusive_pmu(struct pmu *pmu)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 014ee8f0fbaa..5f0d7d0b9471 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -165,6 +165,13 @@ static inline pte_t *virt_to_kpte(unsigned long vaddr)
return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
}
+#ifndef pmd_young
+static inline int pmd_young(pmd_t pmd)
+{
+ return 0;
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
@@ -213,7 +220,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
#endif
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
@@ -234,7 +241,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
BUILD_BUG();
return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
@@ -260,6 +267,30 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
+#ifndef arch_has_hw_nonleaf_pmd_young
+/*
+ * Return whether the accessed bit in non-leaf PMD entries is supported on the
+ * local CPU.
+ */
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+ return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
+}
+#endif
+
+#ifndef arch_has_hw_pte_young
+/*
+ * Return whether the accessed bit is supported on the local CPU.
+ *
+ * This stub assumes accessing through an old PTE triggers a page fault.
+ * Architectures that automatically set the access bit should overwrite it.
+ */
+static inline bool arch_has_hw_pte_young(void)
+{
+ return false;
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
@@ -1276,8 +1307,7 @@ static inline int pgd_devmap(pgd_t pgd)
#endif
#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
- (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
- !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
+ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline int pud_trans_huge(pud_t pud)
{
return 0;
@@ -1598,11 +1628,7 @@ typedef unsigned int pgtbl_mod_mask;
#endif
#ifndef has_transparent_hugepage
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define has_transparent_hugepage() 1
-#else
-#define has_transparent_hugepage() 0
-#endif
+#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
#endif
/*
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 664dd409feb9..3f01ac8017e0 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -122,6 +122,7 @@ enum phylink_op_type {
* (See commit 7cceb599d15d ("net: phylink: avoid mac_config calls")
* @poll_fixed_state: if true, starts link_poll,
* if MAC link is at %MLO_AN_FIXED mode.
+ * @mac_managed_pm: if true, indicate the MAC driver is responsible for PHY PM.
* @ovr_an_inband: if true, override PCS to MLO_AN_INBAND
* @get_fixed_state: callback to execute to determine the fixed link state,
* if MAC link is at %MLO_AN_FIXED mode.
@@ -134,6 +135,7 @@ struct phylink_config {
enum phylink_op_type type;
bool legacy_pre_march2020;
bool poll_fixed_state;
+ bool mac_managed_pm;
bool ovr_an_inband;
void (*get_fixed_state)(struct phylink_config *config,
struct phylink_link_state *state);
diff --git a/include/linux/platform_data/adp5588.h b/include/linux/platform_data/adp5588.h
deleted file mode 100644
index 6d3f7d911a92..000000000000
--- a/include/linux/platform_data/adp5588.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Analog Devices ADP5588 I/O Expander and QWERTY Keypad Controller
- *
- * Copyright 2009-2010 Analog Devices Inc.
- */
-
-#ifndef _ADP5588_H
-#define _ADP5588_H
-
-#define DEV_ID 0x00 /* Device ID */
-#define CFG 0x01 /* Configuration Register1 */
-#define INT_STAT 0x02 /* Interrupt Status Register */
-#define KEY_LCK_EC_STAT 0x03 /* Key Lock and Event Counter Register */
-#define Key_EVENTA 0x04 /* Key Event Register A */
-#define Key_EVENTB 0x05 /* Key Event Register B */
-#define Key_EVENTC 0x06 /* Key Event Register C */
-#define Key_EVENTD 0x07 /* Key Event Register D */
-#define Key_EVENTE 0x08 /* Key Event Register E */
-#define Key_EVENTF 0x09 /* Key Event Register F */
-#define Key_EVENTG 0x0A /* Key Event Register G */
-#define Key_EVENTH 0x0B /* Key Event Register H */
-#define Key_EVENTI 0x0C /* Key Event Register I */
-#define Key_EVENTJ 0x0D /* Key Event Register J */
-#define KP_LCK_TMR 0x0E /* Keypad Lock1 to Lock2 Timer */
-#define UNLOCK1 0x0F /* Unlock Key1 */
-#define UNLOCK2 0x10 /* Unlock Key2 */
-#define GPIO_INT_STAT1 0x11 /* GPIO Interrupt Status */
-#define GPIO_INT_STAT2 0x12 /* GPIO Interrupt Status */
-#define GPIO_INT_STAT3 0x13 /* GPIO Interrupt Status */
-#define GPIO_DAT_STAT1 0x14 /* GPIO Data Status, Read twice to clear */
-#define GPIO_DAT_STAT2 0x15 /* GPIO Data Status, Read twice to clear */
-#define GPIO_DAT_STAT3 0x16 /* GPIO Data Status, Read twice to clear */
-#define GPIO_DAT_OUT1 0x17 /* GPIO DATA OUT */
-#define GPIO_DAT_OUT2 0x18 /* GPIO DATA OUT */
-#define GPIO_DAT_OUT3 0x19 /* GPIO DATA OUT */
-#define GPIO_INT_EN1 0x1A /* GPIO Interrupt Enable */
-#define GPIO_INT_EN2 0x1B /* GPIO Interrupt Enable */
-#define GPIO_INT_EN3 0x1C /* GPIO Interrupt Enable */
-#define KP_GPIO1 0x1D /* Keypad or GPIO Selection */
-#define KP_GPIO2 0x1E /* Keypad or GPIO Selection */
-#define KP_GPIO3 0x1F /* Keypad or GPIO Selection */
-#define GPI_EM1 0x20 /* GPI Event Mode 1 */
-#define GPI_EM2 0x21 /* GPI Event Mode 2 */
-#define GPI_EM3 0x22 /* GPI Event Mode 3 */
-#define GPIO_DIR1 0x23 /* GPIO Data Direction */
-#define GPIO_DIR2 0x24 /* GPIO Data Direction */
-#define GPIO_DIR3 0x25 /* GPIO Data Direction */
-#define GPIO_INT_LVL1 0x26 /* GPIO Edge/Level Detect */
-#define GPIO_INT_LVL2 0x27 /* GPIO Edge/Level Detect */
-#define GPIO_INT_LVL3 0x28 /* GPIO Edge/Level Detect */
-#define Debounce_DIS1 0x29 /* Debounce Disable */
-#define Debounce_DIS2 0x2A /* Debounce Disable */
-#define Debounce_DIS3 0x2B /* Debounce Disable */
-#define GPIO_PULL1 0x2C /* GPIO Pull Disable */
-#define GPIO_PULL2 0x2D /* GPIO Pull Disable */
-#define GPIO_PULL3 0x2E /* GPIO Pull Disable */
-#define CMP_CFG_STAT 0x30 /* Comparator Configuration and Status Register */
-#define CMP_CONFG_SENS1 0x31 /* Sensor1 Comparator Configuration Register */
-#define CMP_CONFG_SENS2 0x32 /* L2 Light Sensor Reference Level, Output Falling for Sensor 1 */
-#define CMP1_LVL2_TRIP 0x33 /* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 1 */
-#define CMP1_LVL2_HYS 0x34 /* L3 Light Sensor Reference Level, Output Falling For Sensor 1 */
-#define CMP1_LVL3_TRIP 0x35 /* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 1 */
-#define CMP1_LVL3_HYS 0x36 /* Sensor 2 Comparator Configuration Register */
-#define CMP2_LVL2_TRIP 0x37 /* L2 Light Sensor Reference Level, Output Falling for Sensor 2 */
-#define CMP2_LVL2_HYS 0x38 /* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 2 */
-#define CMP2_LVL3_TRIP 0x39 /* L3 Light Sensor Reference Level, Output Falling For Sensor 2 */
-#define CMP2_LVL3_HYS 0x3A /* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 2 */
-#define CMP1_ADC_DAT_R1 0x3B /* Comparator 1 ADC data Register1 */
-#define CMP1_ADC_DAT_R2 0x3C /* Comparator 1 ADC data Register2 */
-#define CMP2_ADC_DAT_R1 0x3D /* Comparator 2 ADC data Register1 */
-#define CMP2_ADC_DAT_R2 0x3E /* Comparator 2 ADC data Register2 */
-
-#define ADP5588_DEVICE_ID_MASK 0xF
-
- /* Configuration Register1 */
-#define ADP5588_AUTO_INC (1 << 7)
-#define ADP5588_GPIEM_CFG (1 << 6)
-#define ADP5588_OVR_FLOW_M (1 << 5)
-#define ADP5588_INT_CFG (1 << 4)
-#define ADP5588_OVR_FLOW_IEN (1 << 3)
-#define ADP5588_K_LCK_IM (1 << 2)
-#define ADP5588_GPI_IEN (1 << 1)
-#define ADP5588_KE_IEN (1 << 0)
-
-/* Interrupt Status Register */
-#define ADP5588_CMP2_INT (1 << 5)
-#define ADP5588_CMP1_INT (1 << 4)
-#define ADP5588_OVR_FLOW_INT (1 << 3)
-#define ADP5588_K_LCK_INT (1 << 2)
-#define ADP5588_GPI_INT (1 << 1)
-#define ADP5588_KE_INT (1 << 0)
-
-/* Key Lock and Event Counter Register */
-#define ADP5588_K_LCK_EN (1 << 6)
-#define ADP5588_LCK21 0x30
-#define ADP5588_KEC 0xF
-
-#define ADP5588_MAXGPIO 18
-#define ADP5588_BANK(offs) ((offs) >> 3)
-#define ADP5588_BIT(offs) (1u << ((offs) & 0x7))
-
-/* Put one of these structures in i2c_board_info platform_data */
-
-#define ADP5588_KEYMAPSIZE 80
-
-#define GPI_PIN_ROW0 97
-#define GPI_PIN_ROW1 98
-#define GPI_PIN_ROW2 99
-#define GPI_PIN_ROW3 100
-#define GPI_PIN_ROW4 101
-#define GPI_PIN_ROW5 102
-#define GPI_PIN_ROW6 103
-#define GPI_PIN_ROW7 104
-#define GPI_PIN_COL0 105
-#define GPI_PIN_COL1 106
-#define GPI_PIN_COL2 107
-#define GPI_PIN_COL3 108
-#define GPI_PIN_COL4 109
-#define GPI_PIN_COL5 110
-#define GPI_PIN_COL6 111
-#define GPI_PIN_COL7 112
-#define GPI_PIN_COL8 113
-#define GPI_PIN_COL9 114
-
-#define GPI_PIN_ROW_BASE GPI_PIN_ROW0
-#define GPI_PIN_ROW_END GPI_PIN_ROW7
-#define GPI_PIN_COL_BASE GPI_PIN_COL0
-#define GPI_PIN_COL_END GPI_PIN_COL9
-
-#define GPI_PIN_BASE GPI_PIN_ROW_BASE
-#define GPI_PIN_END GPI_PIN_COL_END
-
-#define ADP5588_GPIMAPSIZE_MAX (GPI_PIN_END - GPI_PIN_BASE + 1)
-
-struct adp5588_gpi_map {
- unsigned short pin;
- unsigned short sw_evt;
-};
-
-struct adp5588_kpad_platform_data {
- int rows; /* Number of rows */
- int cols; /* Number of columns */
- const unsigned short *keymap; /* Pointer to keymap */
- unsigned short keymapsize; /* Keymap size */
- unsigned repeat:1; /* Enable key repeat */
- unsigned en_keylock:1; /* Enable Key Lock feature */
- unsigned short unlock_key1; /* Unlock Key 1 */
- unsigned short unlock_key2; /* Unlock Key 2 */
- const struct adp5588_gpi_map *gpimap;
- unsigned short gpimapsize;
- const struct adp5588_gpio_platform_data *gpio_data;
-};
-
-struct i2c_client; /* forward declaration */
-
-struct adp5588_gpio_platform_data {
- int gpio_start; /* GPIO Chip base # */
- const char *const *names;
- unsigned irq_base; /* interrupt base # */
- unsigned pullup_dis_mask; /* Pull-Up Disable Mask */
- int (*setup)(struct i2c_client *client,
- unsigned gpio, unsigned ngpio,
- void *context);
- int (*teardown)(struct i2c_client *client,
- unsigned gpio, unsigned ngpio,
- void *context);
- void *context;
-};
-
-#endif
diff --git a/include/linux/platform_data/gpmc-omap.h b/include/linux/platform_data/gpmc-omap.h
index c9cc4e32435d..dcca6c5e23bb 100644
--- a/include/linux/platform_data/gpmc-omap.h
+++ b/include/linux/platform_data/gpmc-omap.h
@@ -136,6 +136,13 @@ struct gpmc_device_timings {
#define GPMC_MUX_AAD 1 /* Addr-Addr-Data multiplex */
#define GPMC_MUX_AD 2 /* Addr-Data multiplex */
+/* Wait pin polarity values */
+#define GPMC_WAITPINPOLARITY_INVALID UINT_MAX
+#define GPMC_WAITPINPOLARITY_ACTIVE_LOW 0
+#define GPMC_WAITPINPOLARITY_ACTIVE_HIGH 1
+
+#define GPMC_WAITPIN_INVALID UINT_MAX
+
struct gpmc_settings {
bool burst_wrap; /* enables wrap bursting */
bool burst_read; /* enables read page/burst mode */
@@ -149,6 +156,7 @@ struct gpmc_settings {
u32 device_width; /* device bus width (8 or 16 bit) */
u32 mux_add_data; /* multiplex address & data */
u32 wait_pin; /* wait-pin to be used */
+ u32 wait_pin_polarity;
};
/* Data for each chip select */
diff --git a/include/linux/platform_data/st33zp24.h b/include/linux/platform_data/st33zp24.h
deleted file mode 100644
index 61db674f36cc..000000000000
--- a/include/linux/platform_data/st33zp24.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * STMicroelectronics TPM Linux driver for TPM 1.2 ST33ZP24
- * Copyright (C) 2009 - 2016 STMicroelectronics
- */
-#ifndef __ST33ZP24_H__
-#define __ST33ZP24_H__
-
-#define TPM_ST33_I2C "st33zp24-i2c"
-#define TPM_ST33_SPI "st33zp24-spi"
-
-struct st33zp24_platform_data {
- int io_lpcpd;
-};
-
-#endif /* __ST33ZP24_H__ */
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 871c9c49ec9d..93cd34f00822 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -375,19 +375,20 @@ const struct dev_pm_ops name = { \
}
#ifdef CONFIG_PM
-#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \
- runtime_resume_fn, idle_fn, sec, ns) \
- _DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \
- runtime_resume_fn, idle_fn); \
- __EXPORT_SYMBOL(name, sec, ns)
+#define _EXPORT_DEV_PM_OPS(name, sec, ns) \
+ const struct dev_pm_ops name; \
+ __EXPORT_SYMBOL(name, sec, ns); \
+ const struct dev_pm_ops name
#else
-#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \
- runtime_resume_fn, idle_fn, sec, ns) \
-static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \
- resume_fn, runtime_suspend_fn, \
- runtime_resume_fn, idle_fn)
+#define _EXPORT_DEV_PM_OPS(name, sec, ns) \
+ static __maybe_unused const struct dev_pm_ops __static_##name
#endif
+#define EXPORT_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "", "")
+#define EXPORT_GPL_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "_gpl", "")
+#define EXPORT_NS_DEV_PM_OPS(name, ns) _EXPORT_DEV_PM_OPS(name, "", #ns)
+#define EXPORT_NS_GPL_DEV_PM_OPS(name, ns) _EXPORT_DEV_PM_OPS(name, "_gpl", #ns)
+
/*
* Use this if you want to use the same suspend and resume callbacks for suspend
* to RAM and hibernation.
@@ -399,13 +400,21 @@ static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \
_DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL)
#define EXPORT_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
- _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", "")
+ EXPORT_DEV_PM_OPS(name) = { \
+ SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+ }
#define EXPORT_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
- _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", "")
+ EXPORT_GPL_DEV_PM_OPS(name) = { \
+ SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+ }
#define EXPORT_NS_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \
- _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", #ns)
+ EXPORT_NS_DEV_PM_OPS(name, ns) = { \
+ SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+ }
#define EXPORT_NS_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \
- _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", #ns)
+ EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \
+ SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+ }
/* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. */
#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index ebc351698090..1cd41bdf73cf 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -17,6 +17,7 @@
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
+#include <linux/time64.h>
/*
* Flags to control the behaviour of a genpd.
@@ -95,6 +96,7 @@ struct genpd_governor_data {
s64 max_off_time_ns;
bool max_off_time_changed;
ktime_t next_wakeup;
+ ktime_t next_hrtimer;
bool cached_power_down_ok;
bool cached_power_down_state_idx;
};
@@ -232,6 +234,7 @@ int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state);
int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb);
int dev_pm_genpd_remove_notifier(struct device *dev);
void dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next);
+ktime_t dev_pm_genpd_get_next_hrtimer(struct device *dev);
extern struct dev_power_governor simple_qos_governor;
extern struct dev_power_governor pm_domain_always_on_gov;
@@ -293,6 +296,10 @@ static inline int dev_pm_genpd_remove_notifier(struct device *dev)
static inline void dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next)
{ }
+static inline ktime_t dev_pm_genpd_get_next_hrtimer(struct device *dev)
+{
+ return KTIME_MAX;
+}
#define simple_qos_governor (*(struct dev_power_governor *)(NULL))
#define pm_domain_always_on_gov (*(struct dev_power_governor *)(NULL))
#endif
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 0a41b2dcccad..9a8151a2bdea 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -40,17 +40,21 @@
resume_fn, idle_fn)
#define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
- _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \
- suspend_fn, resume_fn, idle_fn, "", "")
+ EXPORT_DEV_PM_OPS(name) = { \
+ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
+ }
#define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
- _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \
- suspend_fn, resume_fn, idle_fn, "_gpl", "")
+ EXPORT_GPL_DEV_PM_OPS(name) = { \
+ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
+ }
#define EXPORT_NS_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \
- _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \
- suspend_fn, resume_fn, idle_fn, "", #ns)
+ EXPORT_NS_DEV_PM_OPS(name, ns) = { \
+ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
+ }
#define EXPORT_NS_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \
- _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \
- suspend_fn, resume_fn, idle_fn, "_gpl", #ns)
+ EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \
+ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
+ }
#ifdef CONFIG_PM
extern struct workqueue_struct *pm_wq;
diff --git a/include/linux/prandom.h b/include/linux/prandom.h
index 78db003bc290..e0a0759dd09c 100644
--- a/include/linux/prandom.h
+++ b/include/linux/prandom.h
@@ -12,18 +12,6 @@
#include <linux/percpu.h>
#include <linux/random.h>
-/* Deprecated: use get_random_u32 instead. */
-static inline u32 prandom_u32(void)
-{
- return get_random_u32();
-}
-
-/* Deprecated: use get_random_bytes instead. */
-static inline void prandom_bytes(void *buf, size_t nbytes)
-{
- return get_random_bytes(buf, nbytes);
-}
-
struct rnd_state {
__u32 s1, s2, s3, s4;
};
diff --git a/include/linux/psi.h b/include/linux/psi.h
index dd74411ac21d..b029a847def1 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/poll.h>
#include <linux/cgroup-defs.h>
+#include <linux/cgroup.h>
struct seq_file;
struct css_set;
@@ -18,10 +19,6 @@ extern struct psi_group psi_system;
void psi_init(void);
-void psi_task_change(struct task_struct *task, int clear, int set);
-void psi_task_switch(struct task_struct *prev, struct task_struct *next,
- bool sleep);
-
void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);
@@ -34,9 +31,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
poll_table *wait);
#ifdef CONFIG_CGROUPS
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+ return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+}
+
int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);
+void psi_cgroup_restart(struct psi_group *group);
#endif
#else /* CONFIG_PSI */
@@ -58,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
{
rcu_assign_pointer(p->cgroups, to);
}
+static inline void psi_cgroup_restart(struct psi_group *group) {}
#endif
#endif /* CONFIG_PSI */
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index c7fe7c089718..1e0a0d7ace3a 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -16,13 +16,6 @@ enum psi_task_count {
NR_MEMSTALL,
NR_RUNNING,
/*
- * This can't have values other than 0 or 1 and could be
- * implemented as a bit flag. But for now we still have room
- * in the first cacheline of psi_group_cpu, and this way we
- * don't have to special case any state tracking for it.
- */
- NR_ONCPU,
- /*
* For IO and CPU stalls the presence of running/oncpu tasks
* in the domain means a partial rather than a full stall.
* For memory it's not so simple because of page reclaimers:
@@ -32,22 +25,27 @@ enum psi_task_count {
* threads and memstall ones.
*/
NR_MEMSTALL_RUNNING,
- NR_PSI_TASK_COUNTS = 5,
+ NR_PSI_TASK_COUNTS = 4,
};
/* Task state bitmasks */
#define TSK_IOWAIT (1 << NR_IOWAIT)
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
#define TSK_RUNNING (1 << NR_RUNNING)
-#define TSK_ONCPU (1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)
+/* Only one task can be scheduled, no corresponding task count */
+#define TSK_ONCPU (1 << NR_PSI_TASK_COUNTS)
+
/* Resources that workloads could be stalled on */
enum psi_res {
PSI_IO,
PSI_MEM,
PSI_CPU,
- NR_PSI_RESOURCES = 3,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ PSI_IRQ,
+#endif
+ NR_PSI_RESOURCES,
};
/*
@@ -63,11 +61,20 @@ enum psi_states {
PSI_MEM_FULL,
PSI_CPU_SOME,
PSI_CPU_FULL,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ PSI_IRQ_FULL,
+#endif
/* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE,
- NR_PSI_STATES = 7,
+ NR_PSI_STATES,
};
+/* Use one bit in the state mask to track TSK_ONCPU */
+#define PSI_ONCPU (1 << NR_PSI_STATES)
+
+/* Flag whether to re-arm avgs_work, see details in get_recent_times() */
+#define PSI_STATE_RESCHEDULE (1 << (NR_PSI_STATES + 1))
+
enum psi_aggregators {
PSI_AVGS = 0,
PSI_POLL,
@@ -147,6 +154,9 @@ struct psi_trigger {
};
struct psi_group {
+ struct psi_group *parent;
+ bool enabled;
+
/* Protects data used by the aggregator */
struct mutex avgs_lock;
@@ -170,6 +180,7 @@ struct psi_group {
struct timer_list poll_timer;
wait_queue_head_t poll_wait;
atomic_t poll_wakeup;
+ atomic_t poll_scheduled;
/* Protects data used by the monitor */
struct mutex trigger_lock;
@@ -188,6 +199,8 @@ struct psi_group {
#else /* CONFIG_PSI */
+#define NR_PSI_RESOURCES 0
+
struct psi_group { };
#endif /* CONFIG_PSI */
diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
index 9f16afec7290..9d65ff94e216 100644
--- a/include/linux/pstore_ram.h
+++ b/include/linux/pstore_ram.h
@@ -8,28 +8,7 @@
#ifndef __LINUX_PSTORE_RAM_H__
#define __LINUX_PSTORE_RAM_H__
-#include <linux/compiler.h>
-#include <linux/device.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
#include <linux/pstore.h>
-#include <linux/types.h>
-
-/*
- * Choose whether access to the RAM zone requires locking or not. If a zone
- * can be written to from different CPUs like with ftrace for example, then
- * PRZ_FLAG_NO_LOCK is used. For all other cases, locking is required.
- */
-#define PRZ_FLAG_NO_LOCK BIT(0)
-/*
- * If a PRZ should only have a single-boot lifetime, this marks it as
- * getting wiped after its contents get copied out after boot.
- */
-#define PRZ_FLAG_ZAP_OLD BIT(1)
-
-struct persistent_ram_buffer;
-struct rs_control;
struct persistent_ram_ecc_info {
int block_size;
@@ -39,84 +18,6 @@ struct persistent_ram_ecc_info {
uint16_t *par;
};
-/**
- * struct persistent_ram_zone - Details of a persistent RAM zone (PRZ)
- * used as a pstore backend
- *
- * @paddr: physical address of the mapped RAM area
- * @size: size of mapping
- * @label: unique name of this PRZ
- * @type: frontend type for this PRZ
- * @flags: holds PRZ_FLAGS_* bits
- *
- * @buffer_lock:
- * locks access to @buffer "size" bytes and "start" offset
- * @buffer:
- * pointer to actual RAM area managed by this PRZ
- * @buffer_size:
- * bytes in @buffer->data (not including any trailing ECC bytes)
- *
- * @par_buffer:
- * pointer into @buffer->data containing ECC bytes for @buffer->data
- * @par_header:
- * pointer into @buffer->data containing ECC bytes for @buffer header
- * (i.e. all fields up to @data)
- * @rs_decoder:
- * RSLIB instance for doing ECC calculations
- * @corrected_bytes:
- * ECC corrected bytes accounting since boot
- * @bad_blocks:
- * ECC uncorrectable bytes accounting since boot
- * @ecc_info:
- * ECC configuration details
- *
- * @old_log:
- * saved copy of @buffer->data prior to most recent wipe
- * @old_log_size:
- * bytes contained in @old_log
- *
- */
-struct persistent_ram_zone {
- phys_addr_t paddr;
- size_t size;
- void *vaddr;
- char *label;
- enum pstore_type_id type;
- u32 flags;
-
- raw_spinlock_t buffer_lock;
- struct persistent_ram_buffer *buffer;
- size_t buffer_size;
-
- char *par_buffer;
- char *par_header;
- struct rs_control *rs_decoder;
- int corrected_bytes;
- int bad_blocks;
- struct persistent_ram_ecc_info ecc_info;
-
- char *old_log;
- size_t old_log_size;
-};
-
-struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
- u32 sig, struct persistent_ram_ecc_info *ecc_info,
- unsigned int memtype, u32 flags, char *label);
-void persistent_ram_free(struct persistent_ram_zone *prz);
-void persistent_ram_zap(struct persistent_ram_zone *prz);
-
-int persistent_ram_write(struct persistent_ram_zone *prz, const void *s,
- unsigned int count);
-int persistent_ram_write_user(struct persistent_ram_zone *prz,
- const void __user *s, unsigned int count);
-
-void persistent_ram_save_old(struct persistent_ram_zone *prz);
-size_t persistent_ram_old_size(struct persistent_ram_zone *prz);
-void *persistent_ram_old(struct persistent_ram_zone *prz);
-void persistent_ram_free_old(struct persistent_ram_zone *prz);
-ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
- char *str, size_t len);
-
/*
* Ramoops platform data
* @mem_size memory size for ramoops
diff --git a/include/linux/random.h b/include/linux/random.h
index 08322f700cdc..147a5e0d0b8e 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -42,10 +42,6 @@ u8 get_random_u8(void);
u16 get_random_u16(void);
u32 get_random_u32(void);
u64 get_random_u64(void);
-static inline unsigned int get_random_int(void)
-{
- return get_random_u32();
-}
static inline unsigned long get_random_long(void)
{
#if BITS_PER_LONG == 64
@@ -100,7 +96,6 @@ declare_get_random_var_wait(u8, u8)
declare_get_random_var_wait(u16, u16)
declare_get_random_var_wait(u32, u32)
declare_get_random_var_wait(u64, u32)
-declare_get_random_var_wait(int, unsigned int)
declare_get_random_var_wait(long, unsigned long)
#undef declare_get_random_var
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 08605ce7379d..4da98ca6273e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -108,6 +108,15 @@ static inline int rcu_preempt_depth(void)
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_RCU_LAZY
+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func);
+#else
+static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
+{
+ call_rcu(head, func);
+}
+#endif
+
/* Internal to kernel */
void rcu_init(void);
extern int rcu_scheduler_active;
@@ -340,6 +349,11 @@ static inline int rcu_read_lock_any_held(void)
return !preemptible();
}
+static inline int debug_lockdep_rcu_enabled(void)
+{
+ return 0;
+}
+
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
#ifdef CONFIG_PROVE_RCU
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 768196a5f39d..68f9070aa111 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -142,23 +142,17 @@ static inline int rcu_needs_cpu(void)
* Take advantage of the fact that there is only one CPU, which
* allows us to ignore virtualization-based context switches.
*/
-static inline void rcu_virt_note_context_switch(int cpu) { }
+static inline void rcu_virt_note_context_switch(void) { }
static inline void rcu_cpu_stall_reset(void) { }
static inline int rcu_jiffies_till_stall_check(void) { return 21 * HZ; }
static inline void rcu_irq_exit_check_preempt(void) { }
-#define rcu_is_idle_cpu(cpu) \
- (is_idle_task(current) && !in_nmi() && !in_hardirq() && !in_serving_softirq())
static inline void exit_rcu(void) { }
static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return false;
}
static inline void rcu_preempt_deferred_qs(struct task_struct *t) { }
-#ifdef CONFIG_SRCU
void rcu_scheduler_starting(void);
-#else /* #ifndef CONFIG_SRCU */
-static inline void rcu_scheduler_starting(void) { }
-#endif /* #else #ifndef CONFIG_SRCU */
static inline void rcu_end_inkernel_boot(void) { }
static inline bool rcu_inkernel_boot_has_ended(void) { return true; }
static inline bool rcu_is_watching(void) { return true; }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 5efb51486e8a..4003bf6cfa1c 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -27,7 +27,7 @@ void rcu_cpu_stall_reset(void);
* wrapper around rcu_note_context_switch(), which allows TINY_RCU
* to save a few bytes. The caller must have disabled interrupts.
*/
-static inline void rcu_virt_note_context_switch(int cpu)
+static inline void rcu_virt_note_context_switch(void)
{
rcu_note_context_switch(false);
}
@@ -87,8 +87,6 @@ bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
void cond_synchronize_rcu(unsigned long oldstate);
void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
-bool rcu_is_idle_cpu(int cpu);
-
#ifdef CONFIG_PROVE_RCU
void rcu_irq_exit_check_preempt(void);
#else
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index e5d9ef886179..2b6bb593be5b 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -106,6 +106,14 @@ enum sys_off_mode {
SYS_OFF_MODE_POWER_OFF,
/**
+ * @SYS_OFF_MODE_RESTART_PREPARE:
+ *
+ * Handlers prepare system to be restarted. Handlers are
+ * allowed to sleep.
+ */
+ SYS_OFF_MODE_RESTART_PREPARE,
+
+ /**
* @SYS_OFF_MODE_RESTART:
*
* Handlers restart system. Handlers are disallowed to sleep.
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 0cf5b20c6ddf..0cee154abc9f 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -89,11 +89,12 @@ struct rdt_domain {
/**
* struct resctrl_cache - Cache allocation related data
* @cbm_len: Length of the cache bit mask
- * @min_cbm_bits: Minimum number of consecutive bits to be set
+ * @min_cbm_bits: Minimum number of consecutive bits to be set.
+ * The value 0 means the architecture can support
+ * zero CBM.
* @shareable_bits: Bitmask of shareable resource with other
* executing entities
* @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid.
- * @arch_has_empty_bitmaps: True if the '0' bitmap is valid.
* @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache
* level has CPU scope.
*/
@@ -102,7 +103,6 @@ struct resctrl_cache {
unsigned int min_cbm_bits;
unsigned int shareable_bits;
bool arch_has_sparse_bitmaps;
- bool arch_has_empty_bitmaps;
bool arch_has_per_cpu_cfg;
};
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index dac53fd3afea..3c7d295746f6 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -100,8 +100,8 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full);
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
- struct file *filp, poll_table *poll_table);
-
+ struct file *filp, poll_table *poll_table, int full);
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu);
#define RING_BUFFER_ALL_CPUS -1
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b89b4b86951f..bd3504d11b15 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -166,7 +166,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
unlink_anon_vmas(next);
}
-struct anon_vma *page_get_anon_vma(struct page *page);
+struct anon_vma *folio_get_anon_vma(struct folio *folio);
/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;
@@ -270,7 +270,7 @@ dup:
* @page: the exclusive anonymous page to try marking possibly shared
*
* The caller needs to hold the PT lock and has to have the page table entry
- * cleared/invalidated+flushed, to properly sync against GUP-fast.
+ * cleared/invalidated.
*
* This is similar to page_try_dup_anon_rmap(), however, not used during fork()
* to duplicate a mapping, but instead to prepare for KSM or temporarily
@@ -286,12 +286,68 @@ static inline int page_try_share_anon_rmap(struct page *page)
{
VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
- /* See page_try_dup_anon_rmap(). */
- if (likely(!is_device_private_page(page) &&
- unlikely(page_maybe_dma_pinned(page))))
- return -EBUSY;
+ /* device private pages cannot get pinned via GUP. */
+ if (unlikely(is_device_private_page(page))) {
+ ClearPageAnonExclusive(page);
+ return 0;
+ }
+
+ /*
+ * We have to make sure that when we clear PageAnonExclusive, that
+ * the page is not pinned and that concurrent GUP-fast won't succeed in
+ * concurrently pinning the page.
+ *
+ * Conceptually, PageAnonExclusive clearing consists of:
+ * (A1) Clear PTE
+ * (A2) Check if the page is pinned; back off if so.
+ * (A3) Clear PageAnonExclusive
+ * (A4) Restore PTE (optional, but certainly not writable)
+ *
+ * When clearing PageAnonExclusive, we cannot possibly map the page
+ * writable again, because anon pages that may be shared must never
+ * be writable. So in any case, if the PTE was writable it cannot
+ * be writable anymore afterwards and there would be a PTE change. Only
+ * if the PTE wasn't writable, there might not be a PTE change.
+ *
+ * Conceptually, GUP-fast pinning of an anon page consists of:
+ * (B1) Read the PTE
+ * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
+ * (B3) Pin the mapped page
+ * (B4) Check if the PTE changed by re-reading it; back off if so.
+ * (B5) If the original PTE is not writable, check if
+ * PageAnonExclusive is not set; back off if so.
+ *
+ * If the PTE was writable, we only have to make sure that GUP-fast
+ * observes a PTE change and properly backs off.
+ *
+ * If the PTE was not writable, we have to make sure that GUP-fast either
+ * detects a (temporary) PTE change or that PageAnonExclusive is cleared
+ * and properly backs off.
+ *
+ * Consequently, when clearing PageAnonExclusive(), we have to make
+ * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
+ * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
+ * and (B5) happen in the right memory order.
+ *
+ * We assume that there might not be a memory barrier after
+ * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
+ * so we use explicit ones here.
+ */
+ /* Paired with the memory barrier in try_grab_folio(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb();
+
+ if (unlikely(page_maybe_dma_pinned(page)))
+ return -EBUSY;
ClearPageAnonExclusive(page);
+
+ /*
+ * This is conceptually a smp_wmb() paired with the smp_rmb() in
+ * gup_must_unshare().
+ */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb__after_atomic();
return 0;
}
@@ -405,13 +461,8 @@ struct rmap_walk_control {
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
-
-/*
- * Called by memory-failure.c to kill processes.
- */
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
struct rmap_walk_control *rwc);
-void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
#else /* !CONFIG_MMU */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68ec44d6b962..5affff14993d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -14,6 +14,7 @@
#include <linux/pid.h>
#include <linux/sem.h>
#include <linux/shm.h>
+#include <linux/kmsan_types.h>
#include <linux/mutex.h>
#include <linux/plist.h>
#include <linux/hrtimer.h>
@@ -869,9 +870,6 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
- /* Per-thread vma caching: */
- struct vmacache vmacache;
-
#ifdef SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
#endif
@@ -890,9 +888,6 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
-#ifdef CONFIG_PSI
- unsigned sched_psi_wake_requeue:1;
-#endif
/* Force alignment to the next boundary: */
unsigned :0;
@@ -923,6 +918,10 @@ struct task_struct {
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
#endif
+#ifdef CONFIG_LRU_GEN
+ /* whether the LRU algorithm may apply to this access */
+ unsigned in_lru_fault:1;
+#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
@@ -953,6 +952,10 @@ struct task_struct {
#ifdef CONFIG_CPU_SUP_INTEL
unsigned reported_split_lock:1;
#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+ /* delay due to memory thrashing */
+ unsigned in_thrashing:1;
+#endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
@@ -1237,7 +1240,7 @@ struct task_struct {
unsigned int futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
- struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
+ struct perf_event_context *perf_event_ctxp;
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
@@ -1364,6 +1367,10 @@ struct task_struct {
#endif
#endif
+#ifdef CONFIG_KMSAN
+ struct kmsan_ctx kmsan_ctx;
+#endif
+
#if IS_ENABLED(CONFIG_KUNIT)
struct kunit *kunit_test;
#endif
@@ -1390,9 +1397,6 @@ struct task_struct {
#endif
#ifdef CONFIG_TRACING
- /* State flags for use by tracers: */
- unsigned long trace;
-
/* Bitmask and counter of trace recursion: */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 4d0a5be28b70..8270ad7ae14c 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -71,9 +71,8 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
-#define MMF_OOM_VICTIM 25 /* mm is the oom victim */
-#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */
-#define MMF_MULTIPROCESS 27 /* mm is shared between processes */
+#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
+#define MMF_MULTIPROCESS 26 /* mm is shared between processes */
/*
* MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either
* replaced in the future by mm.pinned_vm when it becomes stable, or grow into
@@ -81,7 +80,7 @@ static inline int get_dumpable(struct mm_struct *mm)
* pinned pages were unpinned later on, we'll still keep this bit set for the
* lifecycle of this mm, just for simplicity.
*/
-#define MMF_HAS_PINNED 28 /* FOLL_PIN has run, never cleared */
+#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index e650946816d0..303ee7dd0c7e 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -27,6 +27,7 @@ enum sched_tunable_scaling {
#ifdef CONFIG_NUMA_BALANCING
extern int sysctl_numa_balancing_mode;
+extern unsigned int sysctl_numa_balancing_promote_rate_limit;
#else
#define sysctl_numa_balancing_mode 0
#endif
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 81cab4b01edc..d6c48163c6de 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -127,6 +127,9 @@ static inline void put_task_struct_many(struct task_struct *t, int nr)
void put_task_struct_rcu_user(struct task_struct *task);
+/* Free all architecture-specific resources held by a thread. */
+void release_thread(struct task_struct *dead_task);
+
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
diff --git a/include/linux/scs.h b/include/linux/scs.h
index 18122d9e17ff..4ab5bdc898cf 100644
--- a/include/linux/scs.h
+++ b/include/linux/scs.h
@@ -53,6 +53,22 @@ static inline bool task_scs_end_corrupted(struct task_struct *tsk)
return sz >= SCS_SIZE - 1 || READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC;
}
+DECLARE_STATIC_KEY_FALSE(dynamic_scs_enabled);
+
+static inline bool scs_is_dynamic(void)
+{
+ if (!IS_ENABLED(CONFIG_DYNAMIC_SCS))
+ return false;
+ return static_branch_likely(&dynamic_scs_enabled);
+}
+
+static inline bool scs_is_enabled(void)
+{
+ if (!IS_ENABLED(CONFIG_DYNAMIC_SCS))
+ return true;
+ return scs_is_dynamic();
+}
+
#else /* CONFIG_SHADOW_CALL_STACK */
static inline void *scs_alloc(int node) { return NULL; }
@@ -62,6 +78,8 @@ static inline void scs_task_reset(struct task_struct *tsk) {}
static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; }
static inline void scs_release(struct task_struct *tsk) {}
static inline bool task_scs_end_corrupted(struct task_struct *tsk) { return false; }
+static inline bool scs_is_enabled(void) { return false; }
+static inline bool scs_is_dynamic(void) { return false; }
#endif /* CONFIG_SHADOW_CALL_STACK */
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index d31d76be4982..175079552f68 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -27,6 +27,7 @@ struct seccomp_filter;
*
* @mode: indicates one of the valid values above for controlled
* system calls available to a process.
+ * @filter_count: number of seccomp filters
* @filter: must always point to a valid seccomp-filter or NULL as it is
* accessed without locking during system call entry.
*
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index d657f2a42a7b..91871464b99d 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -743,9 +743,15 @@ static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED;
static inline int setup_earlycon(char *buf) { return 0; }
#endif
-static inline bool uart_console_enabled(struct uart_port *port)
+/* Variant of uart_console_registered() when the console_list_lock is held. */
+static inline bool uart_console_registered_locked(struct uart_port *port)
{
- return uart_console(port) && (port->cons->flags & CON_ENABLED);
+ return uart_console(port) && console_is_registered_locked(port->cons);
+}
+
+static inline bool uart_console_registered(struct uart_port *port)
+{
+ return uart_console(port) && console_is_registered(port->cons);
}
struct uart_port *uart_get_console(struct uart_port *ports, int nr,
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index ff0b990de83d..d500ea967dc7 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -92,17 +92,19 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
int shmem_unuse(unsigned int type);
-extern bool shmem_is_huge(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index);
-static inline bool shmem_huge_enabled(struct vm_area_struct *vma)
+extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
+ pgoff_t index, bool shmem_huge_force);
+static inline bool shmem_huge_enabled(struct vm_area_struct *vma,
+ bool shmem_huge_force)
{
- return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff);
+ return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff,
+ shmem_huge_force);
}
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end);
-/* Flag allocation requirements to shmem_getpage */
+/* Flag allocation requirements to shmem_get_folio */
enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */
@@ -111,8 +113,8 @@ enum sgp_type {
SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
};
-extern int shmem_getpage(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp);
+int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
+ enum sgp_type sgp);
static inline struct page *shmem_read_mapping_page(
struct address_space *mapping, pgoff_t index)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9fcf534f2d92..7be5bb4c94b6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -803,6 +803,7 @@ typedef unsigned char *sk_buff_data_t;
* @csum_level: indicates the number of consecutive checksums found in
* the packet minus one that have been verified as
* CHECKSUM_UNNECESSARY (max 3)
+ * @scm_io_uring: SKB holds io_uring registered files
* @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB
* @slow_gro: state present at GRO time, slower prepare step required
@@ -982,6 +983,7 @@ struct sk_buff {
#endif
__u8 slow_gro:1;
__u8 csum_not_inet:1;
+ __u8 scm_io_uring:1;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 48f4b645193b..70d6cb94e580 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -376,7 +376,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
}
struct sk_psock *sk_psock_init(struct sock *sk, int node);
-void sk_psock_stop(struct sk_psock *psock, bool wait);
+void sk_psock_stop(struct sk_psock *psock);
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6a613e65e78d..45af70315a94 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -76,6 +76,17 @@
* rcu_read_lock before reading the address, then rcu_read_unlock after
* taking the spinlock within the structure expected at that address.
*
+ * Note that it is not possible to acquire a lock within a structure
+ * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference
+ * as described above. The reason is that SLAB_TYPESAFE_BY_RCU pages
+ * are not zeroed before being given to the slab, which means that any
+ * locks must be initialized after each and every kmem_struct_alloc().
+ * Alternatively, make the ctor passed to kmem_cache_create() initialize
+ * the locks at page-allocation time, as is done in __i915_request_ctor(),
+ * sighand_ctor(), and anon_vma_ctor(). Such a ctor permits readers
+ * to safely acquire those ctor-initialized locks under rcu_read_lock()
+ * protection.
+ *
* Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
*/
/* Defer freeing slabs to RCU */
@@ -108,7 +119,7 @@
# define SLAB_ACCOUNT 0
#endif
-#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_GENERIC
#define SLAB_KASAN ((slab_flags_t __force)0x08000000U)
#else
#define SLAB_KASAN 0
@@ -121,9 +132,19 @@
*/
#define SLAB_NO_USER_FLAGS ((slab_flags_t __force)0x10000000U)
+#ifdef CONFIG_KFENCE
+#define SLAB_SKIP_KFENCE ((slab_flags_t __force)0x20000000U)
+#else
+#define SLAB_SKIP_KFENCE 0
+#endif
+
/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
+#ifndef CONFIG_SLUB_TINY
#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U)
+#else
+#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0)
+#endif
#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
/*
@@ -330,13 +351,18 @@ enum kmalloc_cache_type {
#endif
#ifndef CONFIG_MEMCG_KMEM
KMALLOC_CGROUP = KMALLOC_NORMAL,
-#else
- KMALLOC_CGROUP,
#endif
+#ifdef CONFIG_SLUB_TINY
+ KMALLOC_RECLAIM = KMALLOC_NORMAL,
+#else
KMALLOC_RECLAIM,
+#endif
#ifdef CONFIG_ZONE_DMA
KMALLOC_DMA,
#endif
+#ifdef CONFIG_MEMCG_KMEM
+ KMALLOC_CGROUP,
+#endif
NR_KMALLOC_TYPES
};
@@ -435,7 +461,18 @@ static_assert(PAGE_SHIFT <= 20);
#endif /* !CONFIG_SLOB */
void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
-void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc;
+
+/**
+ * kmem_cache_alloc - Allocate an object
+ * @cachep: The cache to allocate from.
+ * @flags: See kmalloc().
+ *
+ * Allocate an object from this cache.
+ * See kmem_cache_zalloc() for a shortcut of adding __GFP_ZERO to flags.
+ *
+ * Return: pointer to the new object or %NULL in case of error
+ */
+void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc;
void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
gfp_t gfpflags) __assume_slab_alignment __malloc;
void kmem_cache_free(struct kmem_cache *s, void *objp);
@@ -464,35 +501,12 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignm
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment
__malloc;
-#ifdef CONFIG_TRACING
void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size)
__assume_kmalloc_alignment __alloc_size(3);
void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
int node, size_t size) __assume_kmalloc_alignment
__alloc_size(4);
-#else /* CONFIG_TRACING */
-/* Save a function call when CONFIG_TRACING=n */
-static __always_inline __alloc_size(3)
-void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size)
-{
- void *ret = kmem_cache_alloc(s, flags);
-
- ret = kasan_kmalloc(s, ret, size, flags);
- return ret;
-}
-
-static __always_inline __alloc_size(4)
-void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
- int node, size_t size)
-{
- void *ret = kmem_cache_alloc_node(s, gfpflags, node);
-
- ret = kasan_kmalloc(s, ret, size, gfpflags);
- return ret;
-}
-#endif /* CONFIG_TRACING */
-
void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment
__alloc_size(1);
@@ -500,9 +514,9 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align
__alloc_size(1);
/**
- * kmalloc - allocate memory
+ * kmalloc - allocate kernel memory
* @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
+ * @flags: describe the allocation context
*
* kmalloc is the normal method of allocating memory
* for objects smaller than page size in the kernel.
@@ -529,12 +543,12 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align
* %GFP_ATOMIC
* Allocation will not sleep. May use emergency pools.
*
- * %GFP_HIGHUSER
- * Allocate memory from high memory on behalf of user.
- *
* Also it is possible to set different flags by OR'ing
* in one or more of the following additional @flags:
*
+ * %__GFP_ZERO
+ * Zero the allocated memory before returning. Also see kzalloc().
+ *
* %__GFP_HIGH
* This allocation has high priority and may use emergency pools.
*
@@ -553,42 +567,42 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align
* Try really hard to succeed the allocation but fail
* eventually.
*/
+#ifndef CONFIG_SLOB
static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
{
- if (__builtin_constant_p(size)) {
-#ifndef CONFIG_SLOB
+ if (__builtin_constant_p(size) && size) {
unsigned int index;
-#endif
+
if (size > KMALLOC_MAX_CACHE_SIZE)
return kmalloc_large(size, flags);
-#ifndef CONFIG_SLOB
- index = kmalloc_index(size);
-
- if (!index)
- return ZERO_SIZE_PTR;
+ index = kmalloc_index(size);
return kmalloc_trace(
kmalloc_caches[kmalloc_type(flags)][index],
flags, size);
-#endif
}
return __kmalloc(size, flags);
}
+#else
+static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
+{
+ if (__builtin_constant_p(size) && size > KMALLOC_MAX_CACHE_SIZE)
+ return kmalloc_large(size, flags);
+
+ return __kmalloc(size, flags);
+}
+#endif
#ifndef CONFIG_SLOB
static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node)
{
- if (__builtin_constant_p(size)) {
+ if (__builtin_constant_p(size) && size) {
unsigned int index;
if (size > KMALLOC_MAX_CACHE_SIZE)
return kmalloc_large_node(size, flags, node);
index = kmalloc_index(size);
-
- if (!index)
- return ZERO_SIZE_PTR;
-
return kmalloc_node_trace(
kmalloc_caches[kmalloc_type(flags)][index],
flags, node, size);
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index e24c9aff6fed..5834bad8ad78 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -33,7 +33,6 @@ struct kmem_cache {
size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
- struct kmem_cache *freelist_cache;
unsigned int freelist_size;
/* constructor func */
@@ -81,8 +80,10 @@ struct kmem_cache {
unsigned int *random_seq;
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
unsigned int useroffset; /* Usercopy region offset */
unsigned int usersize; /* Usercopy region size */
+#endif
struct kmem_cache_node *node[MAX_NUMNODES];
};
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index f9c68a9dac04..aa0ee1678d29 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -41,6 +41,7 @@ enum stat_item {
CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */
NR_SLUB_STAT_ITEMS };
+#ifndef CONFIG_SLUB_TINY
/*
* When changing the layout, make sure freelist and tid are still compatible
* with this_cpu_cmpxchg_double() alignment requirements.
@@ -57,6 +58,7 @@ struct kmem_cache_cpu {
unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
};
+#endif /* CONFIG_SLUB_TINY */
#ifdef CONFIG_SLUB_CPU_PARTIAL
#define slub_percpu_partial(c) ((c)->partial)
@@ -88,7 +90,9 @@ struct kmem_cache_order_objects {
* Slab cache management.
*/
struct kmem_cache {
+#ifndef CONFIG_SLUB_TINY
struct kmem_cache_cpu __percpu *cpu_slab;
+#endif
/* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
unsigned long min_partial;
@@ -136,13 +140,15 @@ struct kmem_cache {
struct kasan_cache kasan_info;
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
unsigned int useroffset; /* Usercopy region offset */
unsigned int usersize; /* Usercopy region size */
+#endif
struct kmem_cache_node *node[MAX_NUMNODES];
};
-#ifdef CONFIG_SYSFS
+#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY)
#define SLAB_SUPPORTS_SYSFS
void sysfs_slab_unlink(struct kmem_cache *);
void sysfs_slab_release(struct kmem_cache *);
diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h
index d2b02bb43768..b85f66db33e1 100644
--- a/include/linux/soc/mediatek/mtk-mmsys.h
+++ b/include/linux/soc/mediatek/mtk-mmsys.h
@@ -9,6 +9,13 @@
enum mtk_ddp_comp_id;
struct device;
+enum mtk_dpi_out_format_con {
+ MTK_DPI_RGB888_SDR_CON,
+ MTK_DPI_RGB888_DDR_CON,
+ MTK_DPI_RGB565_SDR_CON,
+ MTK_DPI_RGB565_DDR_CON
+};
+
enum mtk_ddp_comp_id {
DDP_COMPONENT_AAL0,
DDP_COMPONENT_AAL1,
diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
index bc2fb8343a94..ad1fd718169d 100644
--- a/include/linux/soc/qcom/llcc-qcom.h
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -42,7 +42,19 @@
#define LLCC_CPUHWT 36
#define LLCC_MDMCLAD2 37
#define LLCC_CAMEXP1 38
+#define LLCC_CMPTHCP 39
+#define LLCC_LCPDARE 40
#define LLCC_AENPU 45
+#define LLCC_ISLAND1 46
+#define LLCC_ISLAND2 47
+#define LLCC_ISLAND3 48
+#define LLCC_ISLAND4 49
+#define LLCC_CAMEXP2 50
+#define LLCC_CAMEXP3 51
+#define LLCC_CAMEXP4 52
+#define LLCC_DISP_WB 53
+#define LLCC_DISP_1 54
+#define LLCC_VIDVSP 64
/**
* struct llcc_slice_desc - Cache slice descriptor
diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
index 2ba044d0d5e5..8e984d75f5b6 100644
--- a/include/linux/spi/spi-mem.h
+++ b/include/linux/spi/spi-mem.h
@@ -225,7 +225,7 @@ static inline void *spi_mem_get_drvdata(struct spi_mem *mem)
/**
* struct spi_controller_mem_ops - SPI memory operations
* @adjust_op_size: shrink the data xfer of an operation to match controller's
- * limitations (can be alignment of max RX/TX size
+ * limitations (can be alignment or max RX/TX size
* limitations)
* @supports_op: check if an operation is supported by the controller
* @exec_op: execute a SPI memory operation
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 01226e4d960a..9b9d0bbf1d3c 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -47,11 +47,8 @@ int init_srcu_struct(struct srcu_struct *ssp);
#include <linux/srcutiny.h>
#elif defined(CONFIG_TREE_SRCU)
#include <linux/srcutree.h>
-#elif defined(CONFIG_SRCU)
-#error "Unknown SRCU implementation specified to kernel configuration"
#else
-/* Dummy definition for things like notifiers. Actual use gets link error. */
-struct srcu_struct { };
+#error "Unknown SRCU implementation specified to kernel configuration"
#endif
void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
@@ -64,11 +61,21 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);
-#ifdef CONFIG_SRCU
+#ifdef CONFIG_NEED_SRCU_NMI_SAFE
+int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp);
+void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp);
+#else
+static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
+{
+ return __srcu_read_lock(ssp);
+}
+static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
+{
+ __srcu_read_unlock(ssp, idx);
+}
+#endif /* CONFIG_NEED_SRCU_NMI_SAFE */
+
void srcu_init(void);
-#else /* #ifdef CONFIG_SRCU */
-static inline void srcu_init(void) { }
-#endif /* #else #ifdef CONFIG_SRCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -104,6 +111,18 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+#define SRCU_NMI_UNKNOWN 0x0
+#define SRCU_NMI_UNSAFE 0x1
+#define SRCU_NMI_SAFE 0x2
+
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU)
+void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe);
+#else
+static inline void srcu_check_nmi_safety(struct srcu_struct *ssp,
+ bool nmi_safe) { }
+#endif
+
+
/**
* srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
* @p: the pointer to fetch and protect for later dereferencing
@@ -161,17 +180,36 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
{
int retval;
+ srcu_check_nmi_safety(ssp, false);
retval = __srcu_read_lock(ssp);
rcu_lock_acquire(&(ssp)->dep_map);
return retval;
}
+/**
+ * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure.
+ * @ssp: srcu_struct in which to register the new reader.
+ *
+ * Enter an SRCU read-side critical section, but in an NMI-safe manner.
+ * See srcu_read_lock() for more information.
+ */
+static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp)
+{
+ int retval;
+
+ srcu_check_nmi_safety(ssp, true);
+ retval = __srcu_read_lock_nmisafe(ssp);
+ rcu_lock_acquire(&(ssp)->dep_map);
+ return retval;
+}
+
/* Used by tracing, cannot be traced and cannot invoke lockdep. */
static inline notrace int
srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
{
int retval;
+ srcu_check_nmi_safety(ssp, false);
retval = __srcu_read_lock(ssp);
return retval;
}
@@ -187,14 +225,32 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
__releases(ssp)
{
WARN_ON_ONCE(idx & ~0x1);
+ srcu_check_nmi_safety(ssp, false);
rcu_lock_release(&(ssp)->dep_map);
__srcu_read_unlock(ssp, idx);
}
+/**
+ * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure.
+ * @ssp: srcu_struct in which to unregister the old reader.
+ * @idx: return value from corresponding srcu_read_lock().
+ *
+ * Exit an SRCU read-side critical section, but in an NMI-safe manner.
+ */
+static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
+ __releases(ssp)
+{
+ WARN_ON_ONCE(idx & ~0x1);
+ srcu_check_nmi_safety(ssp, true);
+ rcu_lock_release(&(ssp)->dep_map);
+ __srcu_read_unlock_nmisafe(ssp, idx);
+}
+
/* Used by tracing, cannot be traced and cannot call lockdep. */
static inline notrace void
srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
{
+ srcu_check_nmi_safety(ssp, false);
__srcu_read_unlock(ssp, idx);
}
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index e3014319d1ad..c689a81752c9 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -23,8 +23,9 @@ struct srcu_struct;
*/
struct srcu_data {
/* Read-side state. */
- unsigned long srcu_lock_count[2]; /* Locks per CPU. */
- unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */
+ atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */
+ atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */
+ int srcu_nmi_safety; /* NMI-safe srcu_struct structure? */
/* Update-side state. */
spinlock_t __private lock ____cacheline_internodealigned_in_smp;
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index bc2797955de9..9ca7798d7a31 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -14,9 +14,15 @@
#include <linux/gfp.h>
typedef u32 depot_stack_handle_t;
+/*
+ * Number of bits in the handle that stack depot doesn't use. Users may store
+ * information in them.
+ */
+#define STACK_DEPOT_EXTRA_BITS 5
depot_stack_handle_t __stack_depot_save(unsigned long *entries,
unsigned int nr_entries,
+ unsigned int extra_bits,
gfp_t gfp_flags, bool can_alloc);
/*
@@ -59,6 +65,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
unsigned int stack_depot_fetch(depot_stack_handle_t handle,
unsigned long **entries);
+unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle);
+
int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
int spaces);
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 75eea5ebb179..770ef2cb5775 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -246,6 +246,7 @@ void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *);
bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
const struct sockaddr *sap);
void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt);
+void rpc_clnt_disconnect(struct rpc_clnt *clnt);
void rpc_cleanup_clids(void);
static inline int rpc_reply_expected(struct rpc_task *task)
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index baeca2f564dc..b8ca3ecaf8d7 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -209,11 +209,17 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req);
void rpc_put_task(struct rpc_task *);
void rpc_put_task_async(struct rpc_task *);
+bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status);
+void rpc_task_try_cancel(struct rpc_task *task, int error);
void rpc_signal_task(struct rpc_task *);
void rpc_exit_task(struct rpc_task *);
void rpc_exit(struct rpc_task *, int);
void rpc_release_calldata(const struct rpc_call_ops *, void *);
void rpc_killall_tasks(struct rpc_clnt *);
+unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error,
+ bool (*fnmatch)(const struct rpc_task *,
+ const void *),
+ const void *data);
void rpc_execute(struct rpc_task *);
void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
void rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 43150b9bbc5c..a18cf4b7c724 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -162,6 +162,10 @@ union swap_header {
*/
struct reclaim_state {
unsigned long reclaimed_slab;
+#ifdef CONFIG_LRU_GEN
+ /* per-thread mm walk data */
+ struct lru_gen_mm_walk *mm_walk;
+#endif
};
#ifdef __KERNEL__
@@ -351,6 +355,11 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio)
return entry;
}
+static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry)
+{
+ folio->private = (void *)entry.val;
+}
+
/* linux/mm/workingset.c */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
@@ -375,11 +384,11 @@ extern unsigned long totalreserve_pages;
/* linux/mm/swap.c */
-extern void lru_note_cost(struct lruvec *lruvec, bool file,
- unsigned int nr_pages);
-extern void lru_note_cost_folio(struct folio *);
-extern void folio_add_lru(struct folio *);
-extern void lru_cache_add(struct page *);
+void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages);
+void lru_note_cost_folio(struct folio *);
+void folio_add_lru(struct folio *);
+void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
+void lru_cache_add(struct page *);
void mark_page_accessed(struct page *);
void folio_mark_accessed(struct folio *);
@@ -481,7 +490,8 @@ static inline long get_nr_swap_pages(void)
extern void si_swapinfo(struct sysinfo *);
swp_entry_t folio_alloc_swap(struct folio *folio);
-extern void put_swap_page(struct page *page, swp_entry_t entry);
+bool folio_free_swap(struct folio *folio);
+void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
@@ -500,7 +510,6 @@ extern int __swp_swapcount(swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
-extern int try_to_free_swap(struct page *);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
@@ -566,7 +575,7 @@ static inline void swap_free(swp_entry_t swp)
{
}
-static inline void put_swap_page(struct page *page, swp_entry_t swp)
+static inline void put_swap_folio(struct folio *folio, swp_entry_t swp)
{
}
@@ -585,11 +594,6 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-static inline int try_to_free_swap(struct page *page)
-{
- return 0;
-}
-
static inline swp_entry_t folio_alloc_swap(struct folio *folio)
{
swp_entry_t entry;
@@ -597,6 +601,11 @@ static inline swp_entry_t folio_alloc_swap(struct folio *folio)
return entry;
}
+static inline bool folio_free_swap(struct folio *folio)
+{
+ return false;
+}
+
static inline int add_swap_extent(struct swap_info_struct *sis,
unsigned long start_page,
unsigned long nr_pages, sector_t start_block)
@@ -657,7 +666,7 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
cgroup_throttle_swaprate(&folio->page, gfp);
}
-#ifdef CONFIG_MEMCG_SWAP
+#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry);
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
@@ -677,7 +686,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p
}
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
-extern bool mem_cgroup_swap_full(struct page *page);
+extern bool mem_cgroup_swap_full(struct folio *folio);
#else
static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
@@ -699,7 +708,7 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
return get_nr_swap_pages();
}
-static inline bool mem_cgroup_swap_full(struct page *page)
+static inline bool mem_cgroup_swap_full(struct folio *folio)
{
return vm_swap_full();
}
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
index a12dd1c3966c..ae73a87775b3 100644
--- a/include/linux/swap_cgroup.h
+++ b/include/linux/swap_cgroup.h
@@ -4,7 +4,7 @@
#include <linux/swap.h>
-#ifdef CONFIG_MEMCG_SWAP
+#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
unsigned short old, unsigned short new);
@@ -40,6 +40,6 @@ static inline void swap_cgroup_swapoff(int type)
return;
}
-#endif /* CONFIG_MEMCG_SWAP */
+#endif
#endif /* __LINUX_SWAP_CGROUP_H */
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 54078542134c..7ed529a77c5b 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -8,6 +8,11 @@
*/
extern struct swap_info_struct *swap_info[];
extern unsigned long generic_max_swapfile_size(void);
-extern unsigned long max_swapfile_size(void);
+unsigned long arch_max_swapfile_size(void);
+
+/* Maximum swapfile size supported for the arch (not inclusive). */
+extern unsigned long swapfile_maximum_size;
+/* Whether swap migration entry supports storing A/D bits for the arch */
+extern bool swap_migration_ad_supported;
#endif /* _LINUX_SWAPFILE_H */
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index a3d435bf9f97..b07b277d6a16 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -8,6 +8,10 @@
#ifdef CONFIG_MMU
+#ifdef CONFIG_SWAP
+#include <linux/swapfile.h>
+#endif /* CONFIG_SWAP */
+
/*
* swapcache pages are stored in the swapper_space radix tree. We want to
* get good packing density in that tree, so the index should be dense in
@@ -23,6 +27,47 @@
#define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
+/*
+ * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To
+ * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries
+ * can use the extra bits to store other information besides PFN.
+ */
+#ifdef MAX_PHYSMEM_BITS
+#define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
+#else /* MAX_PHYSMEM_BITS */
+#define SWP_PFN_BITS min_t(int, \
+ sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \
+ SWP_TYPE_SHIFT)
+#endif /* MAX_PHYSMEM_BITS */
+#define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1)
+
+/**
+ * Migration swap entry specific bitfield definitions. Layout:
+ *
+ * |----------+--------------------|
+ * | swp_type | swp_offset |
+ * |----------+--------+-+-+-------|
+ * | | resv |D|A| PFN |
+ * |----------+--------+-+-+-------|
+ *
+ * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A)
+ * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D)
+ *
+ * Note: A/D bits will be stored in migration entries iff there're enough
+ * free bits in arch specific swp offset. By default we'll ignore A/D bits
+ * when migrating a page. Please refer to migration_entry_supports_ad()
+ * for more information. If there're more bits besides PFN and A/D bits,
+ * they should be reserved and always be zeros.
+ */
+#define SWP_MIG_YOUNG_BIT (SWP_PFN_BITS)
+#define SWP_MIG_DIRTY_BIT (SWP_PFN_BITS + 1)
+#define SWP_MIG_TOTAL_BITS (SWP_PFN_BITS + 2)
+
+#define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT)
+#define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT)
+
+static inline bool is_pfn_swap_entry(swp_entry_t entry);
+
/* Clear all flags but only keep swp_entry_t related information */
static inline pte_t pte_swp_clear_flags(pte_t pte)
{
@@ -64,6 +109,17 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
return entry.val & SWP_OFFSET_MASK;
}
+/*
+ * This should only be called upon a pfn swap entry to get the PFN stored
+ * in the swap entry. Please refers to is_pfn_swap_entry() for definition
+ * of pfn swap entry.
+ */
+static inline unsigned long swp_offset_pfn(swp_entry_t entry)
+{
+ VM_BUG_ON(!is_pfn_swap_entry(entry));
+ return swp_offset(entry) & SWP_PFN_MASK;
+}
+
/* check whether a pte points to a swap entry */
static inline int is_swap_pte(pte_t pte)
{
@@ -240,6 +296,52 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
return swp_entry(SWP_MIGRATION_WRITE, offset);
}
+/*
+ * Returns whether the host has large enough swap offset field to support
+ * carrying over pgtable A/D bits for page migrations. The result is
+ * pretty much arch specific.
+ */
+static inline bool migration_entry_supports_ad(void)
+{
+#ifdef CONFIG_SWAP
+ return swap_migration_ad_supported;
+#else /* CONFIG_SWAP */
+ return false;
+#endif /* CONFIG_SWAP */
+}
+
+static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
+{
+ if (migration_entry_supports_ad())
+ return swp_entry(swp_type(entry),
+ swp_offset(entry) | SWP_MIG_YOUNG);
+ return entry;
+}
+
+static inline bool is_migration_entry_young(swp_entry_t entry)
+{
+ if (migration_entry_supports_ad())
+ return swp_offset(entry) & SWP_MIG_YOUNG;
+ /* Keep the old behavior of aging page after migration */
+ return false;
+}
+
+static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
+{
+ if (migration_entry_supports_ad())
+ return swp_entry(swp_type(entry),
+ swp_offset(entry) | SWP_MIG_DIRTY);
+ return entry;
+}
+
+static inline bool is_migration_entry_dirty(swp_entry_t entry)
+{
+ if (migration_entry_supports_ad())
+ return swp_offset(entry) & SWP_MIG_DIRTY;
+ /* Keep the old behavior of clean page after migration */
+ return false;
+}
+
extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl);
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
@@ -247,8 +349,8 @@ extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
#ifdef CONFIG_HUGETLB_PAGE
extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
-#endif
-#else
+#endif /* CONFIG_HUGETLB_PAGE */
+#else /* CONFIG_MIGRATION */
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
return swp_entry(0, 0);
@@ -276,7 +378,7 @@ static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
#ifdef CONFIG_HUGETLB_PAGE
static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
-#endif
+#endif /* CONFIG_HUGETLB_PAGE */
static inline int is_writable_migration_entry(swp_entry_t entry)
{
return 0;
@@ -286,7 +388,26 @@ static inline int is_readable_migration_entry(swp_entry_t entry)
return 0;
}
-#endif
+static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
+{
+ return entry;
+}
+
+static inline bool is_migration_entry_young(swp_entry_t entry)
+{
+ return false;
+}
+
+static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
+{
+ return entry;
+}
+
+static inline bool is_migration_entry_dirty(swp_entry_t entry)
+{
+ return false;
+}
+#endif /* CONFIG_MIGRATION */
typedef unsigned long pte_marker;
@@ -369,7 +490,7 @@ static inline int pte_none_mostly(pte_t pte)
static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
{
- struct page *p = pfn_to_page(swp_offset(entry));
+ struct page *p = pfn_to_page(swp_offset_pfn(entry));
/*
* Any use of migration entries may only occur while the
@@ -387,6 +508,9 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
*/
static inline bool is_pfn_swap_entry(swp_entry_t entry)
{
+ /* Make sure the swp offset can always store the needed fields */
+ BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);
+
return is_migration_entry(entry) || is_device_private_entry(entry) ||
is_device_exclusive_entry(entry);
}
@@ -426,7 +550,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
{
return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
}
-#else
+#else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
struct page *page)
{
@@ -455,7 +579,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
{
return 0;
}
-#endif
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
#ifdef CONFIG_MEMORY_FAILURE
@@ -475,27 +599,17 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
return swp_type(entry) == SWP_HWPOISON;
}
-static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry)
-{
- return swp_offset(entry);
-}
-
static inline void num_poisoned_pages_inc(void)
{
atomic_long_inc(&num_poisoned_pages);
}
-static inline void num_poisoned_pages_dec(void)
-{
- atomic_long_dec(&num_poisoned_pages);
-}
-
static inline void num_poisoned_pages_sub(long i)
{
atomic_long_sub(i, &num_poisoned_pages);
}
-#else
+#else /* CONFIG_MEMORY_FAILURE */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
@@ -514,7 +628,7 @@ static inline void num_poisoned_pages_inc(void)
static inline void num_poisoned_pages_sub(long i)
{
}
-#endif
+#endif /* CONFIG_MEMORY_FAILURE */
static inline int non_swap_entry(swp_entry_t entry)
{
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a34b0f9a9972..33a0ee3bcb2e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -264,6 +264,7 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
#define SC_VAL64(type, name) ((type) name##_hi << 32 | name##_lo)
#ifdef CONFIG_COMPAT
+#define SYSCALL32_DEFINE0 COMPAT_SYSCALL_DEFINE0
#define SYSCALL32_DEFINE1 COMPAT_SYSCALL_DEFINE1
#define SYSCALL32_DEFINE2 COMPAT_SYSCALL_DEFINE2
#define SYSCALL32_DEFINE3 COMPAT_SYSCALL_DEFINE3
@@ -271,6 +272,7 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
#define SYSCALL32_DEFINE5 COMPAT_SYSCALL_DEFINE5
#define SYSCALL32_DEFINE6 COMPAT_SYSCALL_DEFINE6
#else
+#define SYSCALL32_DEFINE0 SYSCALL_DEFINE0
#define SYSCALL32_DEFINE1 SYSCALL_DEFINE1
#define SYSCALL32_DEFINE2 SYSCALL_DEFINE2
#define SYSCALL32_DEFINE3 SYSCALL_DEFINE3
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 6f1ec4fb7ef8..5e093602e8fc 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -100,6 +100,7 @@ struct thermal_cooling_device_ops {
struct thermal_cooling_device {
int id;
char *type;
+ unsigned long max_state;
struct device device;
struct device_node *np;
void *devdata;
@@ -308,9 +309,6 @@ void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_dev
void thermal_of_zone_unregister(struct thermal_zone_device *tz);
-int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
- struct device_node *sensor_np,
- u32 *id);
#else
static inline
struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data,
@@ -334,13 +332,6 @@ static inline void devm_thermal_of_zone_unregister(struct device *dev,
struct thermal_zone_device *tz)
{
}
-
-static inline int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
- struct device_node *sensor_np,
- u32 *id)
-{
- return -ENOENT;
-}
#endif
#ifdef CONFIG_THERMAL
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 3146f1c056c9..bb9d3f5542f8 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -45,6 +45,7 @@ struct time_namespace *copy_time_ns(unsigned long flags,
void free_time_ns(struct time_namespace *ns);
void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk);
struct vdso_data *arch_get_vdso_data(void *vvar_page);
+struct page *find_timens_vvar_page(struct vm_area_struct *vma);
static inline void put_time_ns(struct time_namespace *ns)
{
@@ -141,6 +142,11 @@ static inline void timens_on_fork(struct nsproxy *nsproxy,
return;
}
+static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ return NULL;
+}
+
static inline void timens_add_monotonic(struct timespec64 *ts) { }
static inline void timens_add_boottime(struct timespec64 *ts) { }
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 648f00105f58..9162f275819a 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -169,7 +169,6 @@ static inline int timer_pending(const struct timer_list * timer)
}
extern void add_timer_on(struct timer_list *timer, int cpu);
-extern int del_timer(struct timer_list * timer);
extern int mod_timer(struct timer_list *timer, unsigned long expires);
extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
extern int timer_reduce(struct timer_list *timer, unsigned long expires);
@@ -183,14 +182,36 @@ extern int timer_reduce(struct timer_list *timer, unsigned long expires);
extern void add_timer(struct timer_list *timer);
extern int try_to_del_timer_sync(struct timer_list *timer);
+extern int timer_delete_sync(struct timer_list *timer);
+extern int timer_delete(struct timer_list *timer);
+extern int timer_shutdown_sync(struct timer_list *timer);
+extern int timer_shutdown(struct timer_list *timer);
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
- extern int del_timer_sync(struct timer_list *timer);
-#else
-# define del_timer_sync(t) del_timer(t)
-#endif
+/**
+ * del_timer_sync - Delete a pending timer and wait for a running callback
+ * @timer: The timer to be deleted
+ *
+ * See timer_delete_sync() for detailed explanation.
+ *
+ * Do not use in new code. Use timer_delete_sync() instead.
+ */
+static inline int del_timer_sync(struct timer_list *timer)
+{
+ return timer_delete_sync(timer);
+}
-#define del_singleshot_timer_sync(t) del_timer_sync(t)
+/**
+ * del_timer - Delete a pending timer
+ * @timer: The timer to be deleted
+ *
+ * See timer_delete() for detailed explanation.
+ *
+ * Do not use in new code. Use timer_delete() instead.
+ */
+static inline int del_timer(struct timer_list *timer)
+{
+ return timer_delete(timer);
+}
extern void init_timers(void);
struct hrtimer;
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
index 93884086f392..adc80e29168e 100644
--- a/include/linux/timerqueue.h
+++ b/include/linux/timerqueue.h
@@ -35,7 +35,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
{
struct rb_node *leftmost = rb_first_cached(&head->rb_root);
- return rb_entry(leftmost, struct timerqueue_node, node);
+ return rb_entry_safe(leftmost, struct timerqueue_node, node);
}
static inline void timerqueue_init(struct timerqueue_node *node)
diff --git a/include/linux/trace.h b/include/linux/trace.h
index b5e16e438448..80ffda871749 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -26,13 +26,13 @@ struct trace_export {
int flags;
};
+struct trace_array;
+
#ifdef CONFIG_TRACING
int register_ftrace_export(struct trace_export *export);
int unregister_ftrace_export(struct trace_export *export);
-struct trace_array;
-
void trace_printk_init_buffers(void);
__printf(3, 4)
int trace_array_printk(struct trace_array *tr, unsigned long ip,
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8401dec93c15..20749bd9db71 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -92,6 +92,7 @@ struct trace_iterator {
unsigned int temp_size;
char *fmt; /* modified format holder */
unsigned int fmt_size;
+ long wait_index;
/* trace_seq for __print_flags() and __print_symbolic() etc. */
struct trace_seq tmp_seq;
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 47e5d374c7eb..afb18f198843 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -58,20 +58,28 @@
static __always_inline __must_check unsigned long
__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
{
- instrument_copy_from_user(to, from, n);
+ unsigned long res;
+
+ instrument_copy_from_user_before(to, from, n);
check_object_size(to, n, false);
- return raw_copy_from_user(to, from, n);
+ res = raw_copy_from_user(to, from, n);
+ instrument_copy_from_user_after(to, from, n, res);
+ return res;
}
static __always_inline __must_check unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
+ unsigned long res;
+
might_fault();
+ instrument_copy_from_user_before(to, from, n);
if (should_fail_usercopy())
return n;
- instrument_copy_from_user(to, from, n);
check_object_size(to, n, false);
- return raw_copy_from_user(to, from, n);
+ res = raw_copy_from_user(to, from, n);
+ instrument_copy_from_user_after(to, from, n, res);
+ return res;
}
/**
@@ -115,8 +123,9 @@ _copy_from_user(void *to, const void __user *from, unsigned long n)
unsigned long res = n;
might_fault();
if (!should_fail_usercopy() && likely(access_ok(from, n))) {
- instrument_copy_from_user(to, from, n);
+ instrument_copy_from_user_before(to, from, n);
res = raw_copy_from_user(to, from, n);
+ instrument_copy_from_user_after(to, from, n, res);
}
if (unlikely(res))
memset(to + (n - res), 0, res);
diff --git a/include/linux/user_events.h b/include/linux/user_events.h
index 736e05603463..592a3fbed98e 100644
--- a/include/linux/user_events.h
+++ b/include/linux/user_events.h
@@ -20,15 +20,6 @@
#define USER_EVENTS_SYSTEM "user_events"
#define USER_EVENTS_PREFIX "u:"
-/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */
-#define EVENT_BIT_FTRACE 0
-#define EVENT_BIT_PERF 1
-#define EVENT_BIT_OTHER 7
-
-#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE)
-#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF)
-#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER)
-
/* Create dynamic location entry within a 32-bit value */
#define DYN_LOC(offset, size) ((size) << 16 | (offset))
@@ -45,12 +36,12 @@ struct user_reg {
/* Input: Pointer to string with event name, description and flags */
__u64 name_args;
- /* Output: Byte index of the event within the status page */
- __u32 status_index;
+ /* Output: Bitwise index of the event within the status page */
+ __u32 status_bit;
/* Output: Index of the event to use when writing data */
__u32 write_index;
-};
+} __attribute__((__packed__));
#define DIAG_IOC_MAGIC '*'
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e1b8a915e9e9..9df0b9a762cc 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -146,9 +146,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
static inline bool vma_can_userfault(struct vm_area_struct *vma,
unsigned long vm_flags)
{
- if (vm_flags & VM_UFFD_MINOR)
- return is_vm_hugetlb_page(vma) || vma_is_shmem(vma);
-
+ if ((vm_flags & VM_UFFD_MINOR) &&
+ (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
+ return false;
#ifndef CONFIG_PTE_MARKER_UFFD_WP
/*
* If user requested uffd-wp but not enabled pte markers for
@@ -175,9 +175,8 @@ extern bool userfaultfd_remove(struct vm_area_struct *vma,
unsigned long start,
unsigned long end);
-extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct list_head *uf);
+extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf);
extern void userfaultfd_unmap_complete(struct mm_struct *mm,
struct list_head *uf);
@@ -258,7 +257,7 @@ static inline bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
}
-static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
+static inline int userfaultfd_unmap_prep(struct mm_struct *mm,
unsigned long start, unsigned long end,
struct list_head *uf)
{
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 2b1737c9b244..bf7613ba412b 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -10,6 +10,7 @@
#include <uapi/linux/utsname.h>
enum uts_proc {
+ UTS_PROC_ARCH,
UTS_PROC_OSTYPE,
UTS_PROC_OSRELEASE,
UTS_PROC_VERSION,
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index d282f464d2f1..6d0f5e4e82c2 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -104,6 +104,7 @@ struct vdpa_iova_range {
};
struct vdpa_dev_set_config {
+ u64 device_features;
struct {
u8 mac[ETH_ALEN];
u16 mtu;
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index e05ddc6fe6a5..fdd393f70b19 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -14,6 +14,7 @@
#include <linux/workqueue.h>
#include <linux/poll.h>
#include <uapi/linux/vfio.h>
+#include <linux/iova_bitmap.h>
struct kvm;
@@ -33,10 +34,11 @@ struct vfio_device {
struct device *dev;
const struct vfio_device_ops *ops;
/*
- * mig_ops is a static property of the vfio_device which must be set
- * prior to registering the vfio_device.
+ * mig_ops/log_ops is a static property of the vfio_device which must
+ * be set prior to registering the vfio_device.
*/
const struct vfio_migration_ops *mig_ops;
+ const struct vfio_log_ops *log_ops;
struct vfio_group *group;
struct vfio_device_set *dev_set;
struct list_head dev_set_list;
@@ -45,7 +47,9 @@ struct vfio_device {
struct kvm *kvm;
/* Members below here are private, not for driver use */
- refcount_t refcount;
+ unsigned int index;
+ struct device device; /* device.kref covers object life circle */
+ refcount_t refcount; /* user count on registered device*/
unsigned int open_count;
struct completion comp;
struct list_head group_next;
@@ -55,6 +59,8 @@ struct vfio_device {
/**
* struct vfio_device_ops - VFIO bus driver device callbacks
*
+ * @init: initialize private fields in device structure
+ * @release: Reclaim private fields in device structure
* @open_device: Called when the first file descriptor is opened for this device
* @close_device: Opposite of open_device
* @read: Perform read(2) on device file descriptor
@@ -72,6 +78,8 @@ struct vfio_device {
*/
struct vfio_device_ops {
char *name;
+ int (*init)(struct vfio_device *vdev);
+ void (*release)(struct vfio_device *vdev);
int (*open_device)(struct vfio_device *vdev);
void (*close_device)(struct vfio_device *vdev);
ssize_t (*read)(struct vfio_device *vdev, char __user *buf,
@@ -109,6 +117,28 @@ struct vfio_migration_ops {
};
/**
+ * @log_start: Optional callback to ask the device start DMA logging.
+ * @log_stop: Optional callback to ask the device stop DMA logging.
+ * @log_read_and_clear: Optional callback to ask the device read
+ * and clear the dirty DMAs in some given range.
+ *
+ * The vfio core implementation of the DEVICE_FEATURE_DMA_LOGGING_ set
+ * of features does not track logging state relative to the device,
+ * therefore the device implementation of vfio_log_ops must handle
+ * arbitrary user requests. This includes rejecting subsequent calls
+ * to log_start without an intervening log_stop, as well as graceful
+ * handling of log_stop and log_read_and_clear from invalid states.
+ */
+struct vfio_log_ops {
+ int (*log_start)(struct vfio_device *device,
+ struct rb_root_cached *ranges, u32 nnodes, u64 *page_size);
+ int (*log_stop)(struct vfio_device *device);
+ int (*log_read_and_clear)(struct vfio_device *device,
+ unsigned long iova, unsigned long length,
+ struct iova_bitmap *dirty);
+};
+
+/**
* vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl
* @flags: Arg from the device_feature op
* @argsz: Arg from the device_feature op
@@ -137,14 +167,29 @@ static inline int vfio_check_feature(u32 flags, size_t argsz, u32 supported_ops,
return 1;
}
-void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
- const struct vfio_device_ops *ops);
-void vfio_uninit_group_dev(struct vfio_device *device);
+struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
+ const struct vfio_device_ops *ops);
+#define vfio_alloc_device(dev_struct, member, dev, ops) \
+ container_of(_vfio_alloc_device(sizeof(struct dev_struct) + \
+ BUILD_BUG_ON_ZERO(offsetof( \
+ struct dev_struct, member)), \
+ dev, ops), \
+ struct dev_struct, member)
+
+int vfio_init_device(struct vfio_device *device, struct device *dev,
+ const struct vfio_device_ops *ops);
+void vfio_free_device(struct vfio_device *device);
+static inline void vfio_put_device(struct vfio_device *device)
+{
+ put_device(&device->device);
+}
+
int vfio_register_group_dev(struct vfio_device *device);
int vfio_register_emulated_iommu_dev(struct vfio_device *device);
void vfio_unregister_group_dev(struct vfio_device *device);
int vfio_assign_device_set(struct vfio_device *device, void *set_id);
+unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set);
int vfio_mig_get_next_state(struct vfio_device *device,
enum vfio_device_mig_state cur_fsm,
@@ -155,6 +200,7 @@ int vfio_mig_get_next_state(struct vfio_device *device,
* External user API
*/
struct iommu_group *vfio_file_iommu_group(struct file *file);
+bool vfio_file_is_group(struct file *file);
bool vfio_file_enforced_coherent(struct file *file);
void vfio_file_set_kvm(struct file *file, struct kvm *kvm);
bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 5579ece4347b..367fd79226a3 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -20,39 +20,10 @@
#define VFIO_PCI_CORE_H
#define VFIO_PCI_OFFSET_SHIFT 40
-
#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
-/* Special capability IDs predefined access */
-#define PCI_CAP_ID_INVALID 0xFF /* default raw access */
-#define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */
-
-/* Cap maximum number of ioeventfds per device (arbitrary) */
-#define VFIO_PCI_IOEVENTFD_MAX 1000
-
-struct vfio_pci_ioeventfd {
- struct list_head next;
- struct vfio_pci_core_device *vdev;
- struct virqfd *virqfd;
- void __iomem *addr;
- uint64_t data;
- loff_t pos;
- int bar;
- int count;
- bool test_mem;
-};
-
-struct vfio_pci_irq_ctx {
- struct eventfd_ctx *trigger;
- struct virqfd *unmask;
- struct virqfd *mask;
- char *name;
- bool masked;
- struct irq_bypass_producer producer;
-};
-
struct vfio_pci_core_device;
struct vfio_pci_region;
@@ -78,23 +49,6 @@ struct vfio_pci_region {
u32 flags;
};
-struct vfio_pci_dummy_resource {
- struct resource resource;
- int index;
- struct list_head res_next;
-};
-
-struct vfio_pci_vf_token {
- struct mutex lock;
- uuid_t uuid;
- int users;
-};
-
-struct vfio_pci_mmap_vma {
- struct vm_area_struct *vma;
- struct list_head vma_next;
-};
-
struct vfio_pci_core_device {
struct vfio_device vdev;
struct pci_dev *pdev;
@@ -124,11 +78,14 @@ struct vfio_pci_core_device {
bool needs_reset;
bool nointx;
bool needs_pm_restore;
+ bool pm_intx_masked;
+ bool pm_runtime_engaged;
struct pci_saved_state *pci_saved_state;
struct pci_saved_state *pm_save;
int ioeventfds_nr;
struct eventfd_ctx *err_trigger;
struct eventfd_ctx *req_trigger;
+ struct eventfd_ctx *pm_wake_eventfd_ctx;
struct list_head dummy_resources_list;
struct mutex ioeventfds_lock;
struct list_head ioeventfds_list;
@@ -141,100 +98,17 @@ struct vfio_pci_core_device {
struct rw_semaphore memory_lock;
};
-#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
-#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX)
-#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
-#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
-#define irq_is(vdev, type) (vdev->irq_type == type)
-
-void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
-void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
-
-int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev,
- uint32_t flags, unsigned index,
- unsigned start, unsigned count, void *data);
-
-ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev,
- char __user *buf, size_t count,
- loff_t *ppos, bool iswrite);
-
-ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
- size_t count, loff_t *ppos, bool iswrite);
-
-#ifdef CONFIG_VFIO_PCI_VGA
-ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
- size_t count, loff_t *ppos, bool iswrite);
-#else
-static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev,
- char __user *buf, size_t count,
- loff_t *ppos, bool iswrite)
-{
- return -EINVAL;
-}
-#endif
-
-long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
- uint64_t data, int count, int fd);
-
-int vfio_pci_init_perm_bits(void);
-void vfio_pci_uninit_perm_bits(void);
-
-int vfio_config_init(struct vfio_pci_core_device *vdev);
-void vfio_config_free(struct vfio_pci_core_device *vdev);
-
-int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev,
- unsigned int type, unsigned int subtype,
- const struct vfio_pci_regops *ops,
- size_t size, u32 flags, void *data);
-
-int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev,
- pci_power_t state);
-
-bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
-void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev);
-u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev);
-void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
- u16 cmd);
-
-#ifdef CONFIG_VFIO_PCI_IGD
-int vfio_pci_igd_init(struct vfio_pci_core_device *vdev);
-#else
-static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
-{
- return -ENODEV;
-}
-#endif
-
-#ifdef CONFIG_VFIO_PCI_ZDEV_KVM
-int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
- struct vfio_info_cap *caps);
-int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev);
-void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev);
-#else
-static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
- struct vfio_info_cap *caps)
-{
- return -ENODEV;
-}
-
-static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
-{
- return 0;
-}
-
-static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
-{}
-#endif
-
/* Will be exported for vfio pci drivers usage */
+int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
+ unsigned int type, unsigned int subtype,
+ const struct vfio_pci_regops *ops,
+ size_t size, u32 flags, void *data);
void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga,
bool is_disable_idle_d3);
void vfio_pci_core_close_device(struct vfio_device *core_vdev);
-void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev,
- struct pci_dev *pdev,
- const struct vfio_device_ops *vfio_pci_ops);
+int vfio_pci_core_init_dev(struct vfio_device *core_vdev);
+void vfio_pci_core_release_dev(struct vfio_device *core_vdev);
int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev);
-void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev);
void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev);
extern const struct pci_error_handlers vfio_pci_core_err_handlers;
int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
@@ -256,9 +130,4 @@ void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev);
pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
pci_channel_state_t state);
-static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
-{
- return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
-}
-
#endif /* VFIO_PCI_CORE_H */
diff --git a/include/linux/virtio_pci_legacy.h b/include/linux/virtio_pci_legacy.h
index e5d665faf00e..a8dc757d0367 100644
--- a/include/linux/virtio_pci_legacy.h
+++ b/include/linux/virtio_pci_legacy.h
@@ -32,8 +32,6 @@ void vp_legacy_set_queue_address(struct virtio_pci_legacy_device *ldev,
u16 index, u32 queue_pfn);
bool vp_legacy_get_queue_enable(struct virtio_pci_legacy_device *ldev,
u16 idx);
-void vp_legacy_set_queue_size(struct virtio_pci_legacy_device *ldev,
- u16 idx, u16 size);
u16 vp_legacy_get_queue_size(struct virtio_pci_legacy_device *ldev,
u16 idx);
int vp_legacy_probe(struct virtio_pci_legacy_device *ldev);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index f3fc36cd2276..3518dba1e02f 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -129,10 +129,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
NR_TLB_LOCAL_FLUSH_ALL,
NR_TLB_LOCAL_FLUSH_ONE,
#endif /* CONFIG_DEBUG_TLBFLUSH */
-#ifdef CONFIG_DEBUG_VM_VMACACHE
- VMACACHE_FIND_CALLS,
- VMACACHE_FIND_HITS,
-#endif
#ifdef CONFIG_SWAP
SWAP_RA,
SWAP_RA_HIT,
diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
deleted file mode 100644
index 6fce268a4588..000000000000
--- a/include/linux/vmacache.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __LINUX_VMACACHE_H
-#define __LINUX_VMACACHE_H
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-
-static inline void vmacache_flush(struct task_struct *tsk)
-{
- memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas));
-}
-
-extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
-extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
- unsigned long addr);
-
-#ifndef CONFIG_MMU
-extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
- unsigned long start,
- unsigned long end);
-#endif
-
-static inline void vmacache_invalidate(struct mm_struct *mm)
-{
- mm->vmacache_seqnum++;
-}
-
-#endif /* __LINUX_VMACACHE_H */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index bfe38869498d..19cf5b6892ce 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,12 +125,6 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif
-#ifdef CONFIG_DEBUG_VM_VMACACHE
-#define count_vm_vmacache_event(x) count_vm_event(x)
-#else
-#define count_vm_vmacache_event(x) do {} while (0)
-#endif
-
#define __count_zid_vm_events(item, zid, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
diff --git a/include/linux/wireless.h b/include/linux/wireless.h
index 2d1b54556eff..e6e34d74dda0 100644
--- a/include/linux/wireless.h
+++ b/include/linux/wireless.h
@@ -26,7 +26,15 @@ struct compat_iw_point {
struct __compat_iw_event {
__u16 len; /* Real length of this stuff */
__u16 cmd; /* Wireless IOCTL */
- compat_caddr_t pointer;
+
+ union {
+ compat_caddr_t pointer;
+
+ /* we need ptr_bytes to make memcpy() run-time destination
+ * buffer bounds checking happy, nothing special
+ */
+ DECLARE_FLEX_ARRAY(__u8, ptr_bytes);
+ };
};
#define IW_EV_COMPAT_LCP_LEN offsetof(struct __compat_iw_event, pointer)
#define IW_EV_COMPAT_POINT_OFF offsetof(struct compat_iw_point, length)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3f045f6d6c4f..06f9291b6fd5 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -17,20 +17,12 @@ struct bio;
DECLARE_PER_CPU(int, dirty_throttle_leaks);
/*
- * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
- *
- * (thresh - thresh/DIRTY_FULL_SCOPE, thresh)
- *
- * Further beyond, all dirtier tasks will enter a loop waiting (possibly long
- * time) for the dirty pages to drop, unless written enough pages.
- *
* The global dirty threshold is normally equal to the global dirty limit,
* except when the system suddenly allocates a lot of anonymous memory and
* knocks down the global dirty threshold quickly, in which case the global
* dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
*/
#define DIRTY_SCOPE 8
-#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2)
struct backing_dev_info;