diff options
Diffstat (limited to 'arch/x86')
179 files changed, 5322 insertions, 2234 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dd47e60aabf5..e487493bbd47 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -412,6 +412,19 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH +config INTEL_RDT_A + bool "Intel Resource Director Technology Allocation support" + default n + depends on X86 && CPU_SUP_INTEL + select KERNFS + help + Select to enable resource allocation which is a sub-feature of + Intel Resource Director Technology(RDT). More information about + RDT can be found in the Intel x86 Architecture Software + Developer Manual. + + Say N if unsure. + if X86_32 config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" @@ -555,18 +568,6 @@ config X86_INTEL_QUARK Say Y here if you have a Quark based system such as the Arduino compatible Intel Galileo. -config MLX_PLATFORM - tristate "Mellanox Technologies platform support" - depends on X86_64 - depends on X86_EXTENDED_PLATFORM - ---help--- - This option enables system support for the Mellanox Technologies - platform. - - Say Y here if you are building a kernel for Mellanox system. - - Otherwise, say N. - config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" depends on X86 && ACPI diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 34d9e15857c3..44163e8c3868 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -25,7 +25,7 @@ KCOV_INSTRUMENT := n targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 -KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 +KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ -O2 KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 4224ede43b4e..26240dde081e 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -87,12 +87,6 @@ int validate_cpu(void) return -1; } - if (CONFIG_X86_MINIMUM_CPU_FAMILY <= 4 && !IS_ENABLED(CONFIG_M486) && - !has_eflag(X86_EFLAGS_ID)) { - printf("This kernel requires a CPU with the CPUID instruction. Build with CONFIG_M486=y to run on this CPU.\n"); - return -1; - } - if (err_flags) { puts("This kernel requires the following features " "not present on the CPU:\n"); diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index cc3bd583dce1..9e240fcba784 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include "ctype.h" +#include "string.h" int memcmp(const void *s1, const void *s2, size_t len) { diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index 725e820602b1..113588ddb43f 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -18,4 +18,13 @@ int memcmp(const void *s1, const void *s2, size_t len); #define memset(d,c,l) __builtin_memset(d,c,l) #define memcmp __builtin_memcmp +extern int strcmp(const char *str1, const char *str2); +extern int strncmp(const char *cs, const char *ct, size_t count); +extern size_t strlen(const char *s); +extern char *strstr(const char *s1, const char *s2); +extern size_t strnlen(const char *s, size_t maxlen); +extern unsigned int atou(const char *s); +extern unsigned long long simple_strtoull(const char *cp, char **endp, + unsigned int base); + #endif /* BOOT_STRING_H */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index aa8b0672f87a..7ff1b0c86a8e 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -21,7 +21,6 @@ #include <linux/hardirq.h> #include <linux/types.h> -#include <linux/crypto.h> #include <linux/module.h> #include <linux/err.h> #include <crypto/algapi.h> @@ -29,14 +28,14 @@ #include <crypto/cryptd.h> #include <crypto/ctr.h> #include <crypto/b128ops.h> -#include <crypto/lrw.h> #include <crypto/xts.h> #include <asm/cpu_device_id.h> #include <asm/fpu/api.h> #include <asm/crypto/aes.h> -#include <crypto/ablk_helper.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> #include <linux/workqueue.h> #include <linux/spinlock.h> #ifdef CONFIG_X86_64 @@ -45,28 +44,26 @@ #define AESNI_ALIGN 16 +#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN))) #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1)) #define RFC4106_HASH_SUBKEY_SIZE 16 +#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) +#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA) +#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA) /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. */ struct aesni_rfc4106_gcm_ctx { - u8 hash_subkey[16] __attribute__ ((__aligned__(AESNI_ALIGN))); - struct crypto_aes_ctx aes_key_expanded - __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 hash_subkey[16] AESNI_ALIGN_ATTR; + struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; u8 nonce[4]; }; -struct aesni_lrw_ctx { - struct lrw_table_ctx lrw_table; - u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; -}; - struct aesni_xts_ctx { - u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; - u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; + u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; + u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; }; asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, @@ -360,96 +357,95 @@ static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) aesni_dec(ctx, dst, src); } -static int ecb_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int len) +{ + return aes_set_key_common(crypto_skcipher_tfm(tfm), + crypto_skcipher_ctx(tfm), key, len); +} + +static int ecb_encrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); return err; } -static int ecb_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); return err; } -static int cbc_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); return err; } -static int cbc_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); @@ -458,7 +454,7 @@ static int cbc_decrypt(struct blkcipher_desc *desc, #ifdef CONFIG_X86_64 static void ctr_crypt_final(struct crypto_aes_ctx *ctx, - struct blkcipher_walk *walk) + struct skcipher_walk *walk) { u8 *ctrblk = walk->iv; u8 keystream[AES_BLOCK_SIZE]; @@ -491,157 +487,53 @@ static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, } #endif -static int ctr_crypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } if (walk.nbytes) { ctr_crypt_final(ctx, &walk); - err = blkcipher_walk_done(desc, &walk, 0); + err = skcipher_walk_done(&walk, 0); } kernel_fpu_end(); return err; } -#endif - -static int ablk_ecb_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); -} - -static int ablk_cbc_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "__driver-cbc-aes-aesni"); -} - -#ifdef CONFIG_X86_64 -static int ablk_ctr_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "__driver-ctr-aes-aesni"); -} - -#endif - -#if IS_ENABLED(CONFIG_CRYPTO_PCBC) -static int ablk_pcbc_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "fpu(pcbc(__driver-aes-aesni))"); -} -#endif - -static void lrw_xts_encrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) -{ - aesni_ecb_enc(ctx, blks, blks, nbytes); -} -static void lrw_xts_decrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) -{ - aesni_ecb_dec(ctx, blks, blks, nbytes); -} - -static int lrw_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, +static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { - struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); int err; - err = aes_set_key_common(tfm, ctx->raw_aes_ctx, key, - keylen - AES_BLOCK_SIZE); + err = xts_verify_key(tfm, key, keylen); if (err) return err; - return lrw_init_table(&ctx->lrw_table, key + keylen - AES_BLOCK_SIZE); -} - -static void lrw_aesni_exit_tfm(struct crypto_tfm *tfm) -{ - struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - - lrw_free_table(&ctx->lrw_table); -} - -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), - .crypt_fn = lrw_xts_encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = lrw_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); - - return ret; -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), - .crypt_fn = lrw_xts_decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = lrw_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); - - return ret; -} - -static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = xts_check_key(tfm, key, keylen); - if (err) - return err; + keylen /= 2; /* first half of xts-key is for crypt */ - err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2); + err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx, + key, keylen); if (err) return err; /* second half of xts-key is for tweak */ - return aes_set_key_common(tfm, ctx->raw_tweak_ctx, key + keylen / 2, - keylen / 2); + return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx, + key + keylen, keylen); } @@ -650,8 +542,6 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) aesni_enc(ctx, out, in); } -#ifdef CONFIG_X86_64 - static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) { glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc)); @@ -698,83 +588,28 @@ static const struct common_glue_ctx aesni_dec_xts = { } } }; -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_encrypt(struct skcipher_request *req) { - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(aesni_xts_tweak), - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx)); + return glue_xts_req_128bit(&aesni_enc_xts, req, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); } -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_decrypt(struct skcipher_request *req) { - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(aesni_xts_tweak), - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx)); + return glue_xts_req_128bit(&aesni_dec_xts, req, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); } -#else - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), - .tweak_fn = aesni_xts_tweak, - .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), - .crypt_fn = lrw_xts_encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = xts_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); - - return ret; -} - -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), - .tweak_fn = aesni_xts_tweak, - .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), - .crypt_fn = lrw_xts_decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = xts_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); - - return ret; -} - -#endif - -#ifdef CONFIG_X86_64 static int rfc4106_init(struct crypto_aead *aead) { struct cryptd_aead *cryptd_tfm; @@ -1077,9 +912,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, .cra_u = { .cipher = { @@ -1091,14 +924,12 @@ static struct crypto_alg aesni_algs[] = { { } } }, { - .cra_name = "__aes-aesni", - .cra_driver_name = "__driver-aes-aesni", - .cra_priority = 0, + .cra_name = "__aes", + .cra_driver_name = "__aes-aesni", + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, .cra_u = { .cipher = { @@ -1109,250 +940,95 @@ static struct crypto_alg aesni_algs[] = { { .cia_decrypt = __aes_decrypt } } -}, { - .cra_name = "__ecb-aes-aesni", - .cra_driver_name = "__driver-ecb-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_set_key, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-aes-aesni", - .cra_driver_name = "__driver-cbc-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_set_key, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "ecb(aes)", - .cra_driver_name = "ecb-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_ecb_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, +} }; + +static struct skcipher_alg aesni_skciphers[] = { + { + .base = { + .cra_name = "__ecb(aes)", + .cra_driver_name = "__ecb-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, -}, { - .cra_name = "cbc(aes)", - .cra_driver_name = "cbc-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_cbc_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base = { + .cra_name = "__cbc(aes)", + .cra_driver_name = "__cbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, #ifdef CONFIG_X86_64 -}, { - .cra_name = "__ctr-aes-aesni", - .cra_driver_name = "__driver-ctr-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = aes_set_key, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, + }, { + .base = { + .cra_name = "__ctr(aes)", + .cra_driver_name = "__ctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, -}, { - .cra_name = "ctr(aes)", - .cra_driver_name = "ctr-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_ctr_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, { + .base = { + .cra_name = "__xts(aes)", + .cra_driver_name = "__xts-aes-aesni", + .cra_priority = 401, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = XTS_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_aesni_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, #endif -#if IS_ENABLED(CONFIG_CRYPTO_PCBC) -}, { - .cra_name = "pcbc(aes)", - .cra_driver_name = "pcbc-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_pcbc_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, + } +}; + +struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; + +struct { + const char *algname; + const char *drvname; + const char *basename; + struct simd_skcipher_alg *simd; +} aesni_simd_skciphers2[] = { +#if (defined(MODULE) && IS_ENABLED(CONFIG_CRYPTO_PCBC)) || \ + IS_BUILTIN(CONFIG_CRYPTO_PCBC) + { + .algname = "pcbc(aes)", + .drvname = "pcbc-aes-aesni", + .basename = "fpu(pcbc(__aes-aesni))", }, #endif -}, { - .cra_name = "__lrw-aes-aesni", - .cra_driver_name = "__driver-lrw-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct aesni_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_aesni_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = lrw_aesni_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-aes-aesni", - .cra_driver_name = "__driver-xts-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct aesni_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = 2 * AES_MIN_KEY_SIZE, - .max_keysize = 2 * AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = xts_aesni_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "lrw(aes)", - .cra_driver_name = "lrw-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(aes)", - .cra_driver_name = "xts-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = 2 * AES_MIN_KEY_SIZE, - .max_keysize = 2 * AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -} }; +}; #ifdef CONFIG_X86_64 static struct aead_alg aesni_aead_algs[] = { { @@ -1401,9 +1077,27 @@ static const struct x86_cpu_id aesni_cpu_id[] = { }; MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); +static void aesni_free_simds(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers) && + aesni_simd_skciphers[i]; i++) + simd_skcipher_free(aesni_simd_skciphers[i]); + + for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++) + if (aesni_simd_skciphers2[i].simd) + simd_skcipher_free(aesni_simd_skciphers2[i].simd); +} + static int __init aesni_init(void) { + struct simd_skcipher_alg *simd; + const char *basename; + const char *algname; + const char *drvname; int err; + int i; if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; @@ -1445,13 +1139,48 @@ static int __init aesni_init(void) if (err) goto fpu_exit; + err = crypto_register_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); + if (err) + goto unregister_algs; + err = crypto_register_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); if (err) - goto unregister_algs; + goto unregister_skciphers; + + for (i = 0; i < ARRAY_SIZE(aesni_skciphers); i++) { + algname = aesni_skciphers[i].base.cra_name + 2; + drvname = aesni_skciphers[i].base.cra_driver_name + 2; + basename = aesni_skciphers[i].base.cra_driver_name; + simd = simd_skcipher_create_compat(algname, drvname, basename); + err = PTR_ERR(simd); + if (IS_ERR(simd)) + goto unregister_simds; + + aesni_simd_skciphers[i] = simd; + } - return err; + for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++) { + algname = aesni_simd_skciphers2[i].algname; + drvname = aesni_simd_skciphers2[i].drvname; + basename = aesni_simd_skciphers2[i].basename; + simd = simd_skcipher_create_compat(algname, drvname, basename); + err = PTR_ERR(simd); + if (IS_ERR(simd)) + continue; + aesni_simd_skciphers2[i].simd = simd; + } + + return 0; + +unregister_simds: + aesni_free_simds(); + crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); +unregister_skciphers: + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); unregister_algs: crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); fpu_exit: @@ -1461,7 +1190,10 @@ fpu_exit: static void __exit aesni_exit(void) { + aesni_free_simds(); crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); crypto_fpu_exit(); diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index e7d679e2a018..406680476c52 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -11,143 +11,186 @@ * */ -#include <crypto/algapi.h> +#include <crypto/internal/skcipher.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> -#include <linux/crypto.h> #include <asm/fpu/api.h> struct crypto_fpu_ctx { - struct crypto_blkcipher *child; + struct crypto_skcipher *child; }; -static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key, +static int crypto_fpu_setkey(struct crypto_skcipher *parent, const u8 *key, unsigned int keylen) { - struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent); - struct crypto_blkcipher *child = ctx->child; + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(parent); + struct crypto_skcipher *child = ctx->child; int err; - crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) & - CRYPTO_TFM_REQ_MASK); - err = crypto_blkcipher_setkey(child, key, keylen); - crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) & - CRYPTO_TFM_RES_MASK); + crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_skcipher_setkey(child, key, keylen); + crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) & + CRYPTO_TFM_RES_MASK); return err; } -static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int crypto_fpu_encrypt(struct skcipher_request *req) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *child = ctx->child; + SKCIPHER_REQUEST_ON_STACK(subreq, child); int err; - struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); - struct crypto_blkcipher *child = ctx->child; - struct blkcipher_desc desc = { - .tfm = child, - .info = desc_in->info, - .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, - }; + + skcipher_request_set_tfm(subreq, child); + skcipher_request_set_callback(subreq, 0, NULL, NULL); + skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, + req->iv); kernel_fpu_begin(); - err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes); + err = crypto_skcipher_encrypt(subreq); kernel_fpu_end(); + + skcipher_request_zero(subreq); return err; } -static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int crypto_fpu_decrypt(struct skcipher_request *req) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *child = ctx->child; + SKCIPHER_REQUEST_ON_STACK(subreq, child); int err; - struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); - struct crypto_blkcipher *child = ctx->child; - struct blkcipher_desc desc = { - .tfm = child, - .info = desc_in->info, - .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, - }; + + skcipher_request_set_tfm(subreq, child); + skcipher_request_set_callback(subreq, 0, NULL, NULL); + skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, + req->iv); kernel_fpu_begin(); - err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes); + err = crypto_skcipher_decrypt(subreq); kernel_fpu_end(); + + skcipher_request_zero(subreq); return err; } -static int crypto_fpu_init_tfm(struct crypto_tfm *tfm) +static int crypto_fpu_init_tfm(struct crypto_skcipher *tfm) { - struct crypto_instance *inst = crypto_tfm_alg_instance(tfm); - struct crypto_spawn *spawn = crypto_instance_ctx(inst); - struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); - struct crypto_blkcipher *cipher; + struct skcipher_instance *inst = skcipher_alg_instance(tfm); + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher_spawn *spawn; + struct crypto_skcipher *cipher; - cipher = crypto_spawn_blkcipher(spawn); + spawn = skcipher_instance_ctx(inst); + cipher = crypto_spawn_skcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; + return 0; } -static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm) +static void crypto_fpu_exit_tfm(struct crypto_skcipher *tfm) +{ + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + + crypto_free_skcipher(ctx->child); +} + +static void crypto_fpu_free(struct skcipher_instance *inst) { - struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); - crypto_free_blkcipher(ctx->child); + crypto_drop_skcipher(skcipher_instance_ctx(inst)); + kfree(inst); } -static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb) +static int crypto_fpu_create(struct crypto_template *tmpl, struct rtattr **tb) { - struct crypto_instance *inst; - struct crypto_alg *alg; + struct crypto_skcipher_spawn *spawn; + struct skcipher_instance *inst; + struct crypto_attr_type *algt; + struct skcipher_alg *alg; + const char *cipher_name; int err; - err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER); + algt = crypto_get_attr_type(tb); + if (IS_ERR(algt)) + return PTR_ERR(algt); + + if ((algt->type ^ (CRYPTO_ALG_INTERNAL | CRYPTO_ALG_TYPE_SKCIPHER)) & + algt->mask) + return -EINVAL; + + if (!(algt->mask & CRYPTO_ALG_INTERNAL)) + return -EINVAL; + + cipher_name = crypto_attr_alg_name(tb[1]); + if (IS_ERR(cipher_name)) + return PTR_ERR(cipher_name); + + inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); + if (!inst) + return -ENOMEM; + + spawn = skcipher_instance_ctx(inst); + + crypto_set_skcipher_spawn(spawn, skcipher_crypto_instance(inst)); + err = crypto_grab_skcipher(spawn, cipher_name, CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL | CRYPTO_ALG_ASYNC); if (err) - return ERR_PTR(err); - - alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER, - CRYPTO_ALG_TYPE_MASK); - if (IS_ERR(alg)) - return ERR_CAST(alg); - - inst = crypto_alloc_instance("fpu", alg); - if (IS_ERR(inst)) - goto out_put_alg; - - inst->alg.cra_flags = alg->cra_flags; - inst->alg.cra_priority = alg->cra_priority; - inst->alg.cra_blocksize = alg->cra_blocksize; - inst->alg.cra_alignmask = alg->cra_alignmask; - inst->alg.cra_type = alg->cra_type; - inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize; - inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize; - inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize; - inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx); - inst->alg.cra_init = crypto_fpu_init_tfm; - inst->alg.cra_exit = crypto_fpu_exit_tfm; - inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey; - inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt; - inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt; - -out_put_alg: - crypto_mod_put(alg); - return inst; -} + goto out_free_inst; -static void crypto_fpu_free(struct crypto_instance *inst) -{ - crypto_drop_spawn(crypto_instance_ctx(inst)); + alg = crypto_skcipher_spawn_alg(spawn); + + err = crypto_inst_setname(skcipher_crypto_instance(inst), "fpu", + &alg->base); + if (err) + goto out_drop_skcipher; + + inst->alg.base.cra_flags = CRYPTO_ALG_INTERNAL; + inst->alg.base.cra_priority = alg->base.cra_priority; + inst->alg.base.cra_blocksize = alg->base.cra_blocksize; + inst->alg.base.cra_alignmask = alg->base.cra_alignmask; + + inst->alg.ivsize = crypto_skcipher_alg_ivsize(alg); + inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg); + inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(alg); + + inst->alg.base.cra_ctxsize = sizeof(struct crypto_fpu_ctx); + + inst->alg.init = crypto_fpu_init_tfm; + inst->alg.exit = crypto_fpu_exit_tfm; + + inst->alg.setkey = crypto_fpu_setkey; + inst->alg.encrypt = crypto_fpu_encrypt; + inst->alg.decrypt = crypto_fpu_decrypt; + + inst->free = crypto_fpu_free; + + err = skcipher_register_instance(tmpl, inst); + if (err) + goto out_drop_skcipher; + +out: + return err; + +out_drop_skcipher: + crypto_drop_skcipher(spawn); +out_free_inst: kfree(inst); + goto out; } static struct crypto_template crypto_fpu_tmpl = { .name = "fpu", - .alloc = crypto_fpu_alloc, - .free = crypto_fpu_free, + .create = crypto_fpu_create, .module = THIS_MODULE, }; diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 6a85598931b5..260a060d7275 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -27,10 +27,10 @@ #include <linux/module.h> #include <crypto/b128ops.h> +#include <crypto/internal/skcipher.h> #include <crypto/lrw.h> #include <crypto/xts.h> #include <asm/crypto/glue_helper.h> -#include <crypto/scatterwalk.h> static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, struct blkcipher_desc *desc, @@ -339,6 +339,41 @@ done: return nbytes; } +static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx, + void *ctx, + struct skcipher_walk *walk) +{ + const unsigned int bsize = 128 / 8; + unsigned int nbytes = walk->nbytes; + u128 *src = walk->src.virt.addr; + u128 *dst = walk->dst.virt.addr; + unsigned int num_blocks, func_bytes; + unsigned int i; + + /* Process multi-block batch */ + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.xts(ctx, dst, src, + walk->iv); + + src += num_blocks; + dst += num_blocks; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + return nbytes; +} + /* for implementations implementing faster XTS IV generator */ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, struct blkcipher_desc *desc, struct scatterlist *dst, @@ -379,6 +414,43 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, } EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); +int glue_xts_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req, + common_glue_func_t tweak_fn, void *tweak_ctx, + void *crypt_ctx) +{ + const unsigned int bsize = 128 / 8; + struct skcipher_walk walk; + bool fpu_enabled = false; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + nbytes = walk.nbytes; + if (!nbytes) + return err; + + /* set minimum length to bsize, for tweak_fn */ + fpu_enabled = glue_skwalk_fpu_begin(bsize, gctx->fpu_blocks_limit, + &walk, fpu_enabled, + nbytes < bsize ? bsize : nbytes); + + /* calculate first value of T */ + tweak_fn(tweak_ctx, walk.iv, walk.iv); + + while (nbytes) { + nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk); + + err = skcipher_walk_done(&walk, nbytes); + nbytes = walk.nbytes; + } + + glue_fpu_end(fpu_enabled); + + return err; +} +EXPORT_SYMBOL_GPL(glue_xts_req_128bit); + void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, common_glue_func_t fn) { diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c index 9e5b67127a09..acf9fdf01671 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb.c +++ b/arch/x86/crypto/sha1-mb/sha1_mb.c @@ -114,7 +114,7 @@ static inline void sha1_init_digest(uint32_t *digest) } static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], - uint32_t total_len) + uint64_t total_len) { uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1); diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h index 98a35bcc6f4a..13590ccf965c 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h +++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h @@ -125,7 +125,7 @@ struct sha1_hash_ctx { /* error flag */ int error; - uint32_t total_length; + uint64_t total_length; const void *incoming_buffer; uint32_t incoming_buffer_length; uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2]; diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c index 6f97fb33ae21..7926a226b120 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb.c +++ b/arch/x86/crypto/sha256-mb/sha256_mb.c @@ -115,7 +115,7 @@ inline void sha256_init_digest(uint32_t *digest) } inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], - uint32_t total_len) + uint64_t total_len) { uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1); diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h index edd252b73206..aabb30320af0 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h +++ b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h @@ -125,7 +125,7 @@ struct sha256_hash_ctx { /* error flag */ int error; - uint32_t total_length; + uint64_t total_length; const void *incoming_buffer; uint32_t incoming_buffer_length; uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2]; diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index d210174a52b0..9c1bb6d58141 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -117,7 +117,7 @@ inline void sha512_init_digest(uint64_t *digest) } inline uint32_t sha512_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], - uint32_t total_len) + uint64_t total_len) { uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1); diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h index 9d4b2c8208d5..e4653f5eec3f 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h +++ b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h @@ -119,7 +119,7 @@ struct sha512_hash_ctx { /* error flag */ int error; - uint32_t total_length; + uint64_t total_length; const void *incoming_buffer; uint32_t incoming_buffer_length; uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2]; diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index bdd9cc59d20f..b83c61cfd154 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -25,7 +25,7 @@ #include <asm/desc.h> #include <asm/traps.h> #include <asm/vdso.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/cpufeature.h> #define CREATE_TRACE_POINTS diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index acc0c6f36f3f..57f7ec35216e 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -255,23 +255,6 @@ ENTRY(__switch_to_asm) END(__switch_to_asm) /* - * The unwinder expects the last frame on the stack to always be at the same - * offset from the end of the page, which allows it to validate the stack. - * Calling schedule_tail() directly would break that convention because its an - * asmlinkage function so its argument has to be pushed on the stack. This - * wrapper creates a proper "end of stack" frame header before the call. - */ -ENTRY(schedule_tail_wrapper) - FRAME_BEGIN - - pushl %eax - call schedule_tail - popl %eax - - FRAME_END - ret -ENDPROC(schedule_tail_wrapper) -/* * A newly forked process directly context switches into this address. * * eax: prev task we switched from @@ -279,15 +262,24 @@ ENDPROC(schedule_tail_wrapper) * edi: kernel thread arg */ ENTRY(ret_from_fork) - call schedule_tail_wrapper + FRAME_BEGIN /* help unwinder find end of stack */ + + /* + * schedule_tail() is asmlinkage so we have to put its 'prev' argument + * on the stack. + */ + pushl %eax + call schedule_tail + popl %eax testl %ebx, %ebx jnz 1f /* kernel threads are uncommon */ 2: /* When we fork, we trace the syscall return in the child, too. */ - movl %esp, %eax + leal FRAME_OFFSET(%esp), %eax call syscall_return_slowpath + FRAME_END jmp restore_all /* kernel thread */ @@ -926,8 +918,8 @@ ftrace_graph_call: jmp ftrace_stub #endif -.globl ftrace_stub -ftrace_stub: +/* This is weak to keep gas from relaxing the jumps */ +WEAK(ftrace_stub) ret END(ftrace_caller) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5b219707c2f2..044d18ebc43c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include <asm/smap.h> #include <asm/pgtable_types.h> #include <asm/export.h> +#include <asm/frame.h> #include <linux/err.h> .code64 @@ -408,17 +409,19 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) + FRAME_BEGIN /* help unwinder find end of stack */ movq %rax, %rdi - call schedule_tail /* rdi: 'prev' task parameter */ + call schedule_tail /* rdi: 'prev' task parameter */ - testq %rbx, %rbx /* from kernel_thread? */ - jnz 1f /* kernel threads are uncommon */ + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ 2: - movq %rsp, %rdi + leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */ call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS + FRAME_END jmp restore_regs_and_iret 1: diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 02223cb4bcfd..9d4d6e138311 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -92,10 +92,10 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) return (const struct pvclock_vsyscall_time_info *)&pvclock_page; } -static notrace cycle_t vread_pvclock(int *mode) +static notrace u64 vread_pvclock(int *mode) { const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; - cycle_t ret; + u64 ret; u64 last; u32 version; @@ -142,9 +142,9 @@ static notrace cycle_t vread_pvclock(int *mode) } #endif -notrace static cycle_t vread_tsc(void) +notrace static u64 vread_tsc(void) { - cycle_t ret = (cycle_t)rdtsc_ordered(); + u64 ret = (u64)rdtsc_ordered(); u64 last = gtod->cycle_last; if (likely(ret >= last)) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index e739002427ed..10820f6cefbf 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; if (sym_offset == image->sym_vvar_page) { - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, + ret = vm_insert_pfn(vma, vmf->address, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = @@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn( vma, - (unsigned long)vmf->virtual_address, + vmf->address, __pa(pvti) >> PAGE_SHIFT); } } @@ -371,7 +371,7 @@ static int __init init_vdso(void) /* notifier priority > KVM */ return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE, - "AP_X86_VDSO_VMA_ONLINE", vgetcpu_online, NULL); + "x86/vdso/vma:online", vgetcpu_online, NULL); } subsys_initcall(init_vdso); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index b26ee32f73e8..496e60391fac 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1010,7 +1010,7 @@ static __init int amd_ibs_init(void) * all online cpus. */ cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING, - "AP_PERF_X86_AMD_IBS_STARTING", + "perf/x86/amd/ibs:starting", x86_pmu_amd_ibs_starting_cpu, x86_pmu_amd_ibs_dying_cpu); diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index 9842270ed2f2..a6eee5ac4f58 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -291,7 +291,7 @@ static int __init amd_power_pmu_init(void) cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, - "AP_PERF_X86_AMD_POWER_ONLINE", + "perf/x86/amd/power:online", power_cpu_init, power_cpu_exit); ret = perf_pmu_register(&pmu_class, "power", -1); diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 65577f081d07..a0b1bdb3ad42 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -527,16 +527,16 @@ static int __init amd_uncore_init(void) * Install callbacks. Core will call them for each online cpu. */ if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP, - "PERF_X86_AMD_UNCORE_PREP", + "perf/x86/amd/uncore:prepare", amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead)) goto fail_l2; if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, - "AP_PERF_X86_AMD_UNCORE_STARTING", + "perf/x86/amd/uncore:starting", amd_uncore_cpu_starting, NULL)) goto fail_prep; if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, - "AP_PERF_X86_AMD_UNCORE_ONLINE", + "perf/x86/amd/uncore:online", amd_uncore_cpu_online, amd_uncore_cpu_down_prepare)) goto fail_start; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f1c22584a46f..1635c0c8df23 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -505,6 +505,10 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + + /* There's no sense in having PEBS for non sampling events: */ + if (!is_sampling_event(event)) + return -EINVAL; } /* * check that PEBS LBR correction does not conflict with @@ -1820,18 +1824,18 @@ static int __init init_hw_perf_events(void) * Install callbacks. Core will call them for each online * cpu. */ - err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "PERF_X86_PREPARE", + err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", x86_pmu_prepare_cpu, x86_pmu_dead_cpu); if (err) return err; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, - "AP_PERF_X86_STARTING", x86_pmu_starting_cpu, + "perf/x86:starting", x86_pmu_starting_cpu, x86_pmu_dying_cpu); if (err) goto out; - err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "AP_PERF_X86_ONLINE", + err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", x86_pmu_online_cpu, NULL); if (err) goto out1; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index cb8522290e6a..eb1484c86bb4 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2110,6 +2110,27 @@ again: GLOBAL_STATUS_LBRS_FROZEN); if (!status) goto done; + /* + * In case multiple PEBS events are sampled at the same time, + * it is possible to have GLOBAL_STATUS bit 62 set indicating + * PEBS buffer overflow and also seeing at most 3 PEBS counters + * having their bits set in the status register. This is a sign + * that there was at least one PEBS record pending at the time + * of the PMU interrupt. PEBS counters must only be processed + * via the drain_pebs() calls and not via the regular sample + * processing loop coming after that the function, otherwise + * phony regular samples may be generated in the sampling buffer + * not marked with the EXACT tag. Another possibility is to have + * one PEBS event and at least one non-PEBS event whic hoverflows + * while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will + * not be set, yet the overflow status bit for the PEBS counter will + * be on Skylake. + * + * To avoid this problem, we systematically ignore the PEBS-enabled + * counters from the GLOBAL_STATUS mask and we always process PEBS + * events via drain_pebs(). + */ + status &= ~cpuc->pebs_enabled; /* * PEBS overflow sets bit 62 in the global status register @@ -2117,15 +2138,6 @@ again: if (__test_and_clear_bit(62, (unsigned long *)&status)) { handled++; x86_pmu.drain_pebs(regs); - /* - * There are cases where, even though, the PEBS ovfl bit is set - * in GLOBAL_OVF_STATUS, the PEBS events may also have their - * overflow bits set for their counters. We must clear them - * here because they have been processed as exact samples in - * the drain_pebs() routine. They must not be processed again - * in the for_each_bit_set() loop for regular samples below. - */ - status &= ~cpuc->pebs_enabled; status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; } @@ -3164,13 +3176,16 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { for_each_cpu(i, topology_sibling_cpumask(cpu)) { + struct cpu_hw_events *sibling; struct intel_excl_cntrs *c; - c = per_cpu(cpu_hw_events, i).excl_cntrs; + sibling = &per_cpu(cpu_hw_events, i); + c = sibling->excl_cntrs; if (c && c->core_id == core_id) { cpuc->kfree_on_online[1] = cpuc->excl_cntrs; cpuc->excl_cntrs = c; - cpuc->excl_thread_id = 1; + if (!sibling->excl_thread_id) + cpuc->excl_thread_id = 1; break; } } @@ -3975,7 +3990,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1; if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 8f82b02934fa..8c00dc09a5d2 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -7,9 +7,9 @@ #include <linux/perf_event.h> #include <linux/slab.h> #include <asm/cpu_device_id.h> +#include <asm/intel_rdt_common.h> #include "../perf_event.h" -#define MSR_IA32_PQR_ASSOC 0x0c8f #define MSR_IA32_QM_CTR 0x0c8e #define MSR_IA32_QM_EVTSEL 0x0c8d @@ -24,32 +24,13 @@ static unsigned int cqm_l3_scale; /* supposedly cacheline size */ static bool cqm_enabled, mbm_enabled; unsigned int mbm_socket_max; -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - /* * The cached intel_pqr_state is strictly per CPU and can never be * updated from a remote CPU. Both functions which modify the state * (intel_cqm_event_start and intel_cqm_event_stop) are called with * interrupts disabled, which is sufficient for the protection. */ -static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); static struct hrtimer *mbm_timers; /** * struct sample - mbm event's (local or total) data @@ -1766,9 +1747,9 @@ static int __init intel_cqm_init(void) * is enabled to avoid notifier leak. */ cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_STARTING, - "AP_PERF_X86_CQM_STARTING", + "perf/x86/cqm:starting", intel_cqm_cpu_starting, NULL); - cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "AP_PERF_X86_CQM_ONLINE", + cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "perf/x86/cqm:online", NULL, intel_cqm_cpu_exit); out: diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index da51e5a3e2ff..1076c9a77292 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -434,6 +434,7 @@ static struct pmu cstate_core_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct pmu cstate_pkg_pmu = { @@ -447,6 +448,7 @@ static struct pmu cstate_pkg_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static const struct cstate_model nhm_cstates __initconst = { @@ -594,6 +596,9 @@ static int __init cstate_probe(const struct cstate_model *cm) static inline void cstate_cleanup(void) { + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE); + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING); + if (has_cstate_core) perf_pmu_unregister(&cstate_core_pmu); @@ -606,16 +611,16 @@ static int __init cstate_init(void) int err; cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING, - "AP_PERF_X86_CSTATE_STARTING", cstate_cpu_init, - NULL); + "perf/x86/cstate:starting", cstate_cpu_init, NULL); cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE, - "AP_PERF_X86_CSTATE_ONLINE", NULL, cstate_cpu_exit); + "perf/x86/cstate:online", NULL, cstate_cpu_exit); if (has_cstate_core) { err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); if (err) { has_cstate_core = false; pr_info("Failed to register cstate core pmu\n"); + cstate_cleanup(); return err; } } @@ -629,8 +634,7 @@ static int __init cstate_init(void) return err; } } - - return err; + return 0; } static int __init cstate_pmu_init(void) @@ -655,8 +659,6 @@ module_init(cstate_pmu_init); static void __exit cstate_pmu_exit(void) { - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING); cstate_cleanup(); } module_exit(cstate_pmu_exit); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index be202390bbd3..9dfeeeca0ea8 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; /* log dropped samples number */ - if (error[bit]) + if (error[bit]) { perf_log_lost_samples(event, error[bit]); + if (perf_event_account_interrupt(event)) + x86_pmu_stop(event, 0); + } + if (counts[bit]) { __intel_pmu_pebs_event(event, iregs, base, top, bit, counts[bit]); diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 0a535cea8ff3..22ef4f72cf32 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -161,7 +161,13 @@ static u64 rapl_timer_ms; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { - return rapl_pmus->pmus[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < rapl_pmus->maxpkg ? rapl_pmus->pmus[pkgid] : NULL; } static inline u64 rapl_read_counter(struct perf_event *event) @@ -402,6 +408,8 @@ static int rapl_pmu_event_init(struct perf_event *event) /* must be done before validate_group */ pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; event->cpu = pmu->cpu; event->pmu_private = pmu; event->hw.event_base = msr; @@ -585,6 +593,20 @@ static int rapl_cpu_online(unsigned int cpu) struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); int target; + if (!pmu) { + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -ENOMEM; + + raw_spin_lock_init(&pmu->lock); + INIT_LIST_HEAD(&pmu->active_list); + pmu->pmu = &rapl_pmus->pmu; + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + + rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; + } + /* * Check if there is an online cpu in the package which collects rapl * events already. @@ -598,27 +620,6 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_cpu_prepare(unsigned int cpu) -{ - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); - - if (pmu) - return 0; - - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -ENOMEM; - - raw_spin_lock_init(&pmu->lock); - INIT_LIST_HEAD(&pmu->active_list); - pmu->pmu = &rapl_pmus->pmu; - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - pmu->cpu = -1; - rapl_hrtimer_init(pmu); - rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; - return 0; -} - static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; @@ -697,6 +698,7 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.start = rapl_pmu_event_start; rapl_pmus->pmu.stop = rapl_pmu_event_stop; rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; return 0; } @@ -802,29 +804,21 @@ static int __init rapl_pmu_init(void) /* * Install callbacks. Core will call them for each online cpu. */ - - ret = cpuhp_setup_state(CPUHP_PERF_X86_RAPL_PREP, "PERF_X86_RAPL_PREP", - rapl_cpu_prepare, NULL); - if (ret) - goto out; - ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, - "AP_PERF_X86_RAPL_ONLINE", + "perf/x86/rapl:online", rapl_cpu_online, rapl_cpu_offline); if (ret) - goto out1; + goto out; ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); if (ret) - goto out2; + goto out1; rapl_advertise(); return 0; -out2: - cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out1: - cpuhp_remove_state(CPUHP_PERF_X86_RAPL_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out: pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(); @@ -835,7 +829,6 @@ module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_RAPL_PREP); perf_pmu_unregister(&rapl_pmus->pmu); cleanup_rapl_pmus(); } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index dbaaf7dc8373..1ab45976474d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -100,7 +100,13 @@ ssize_t uncore_event_show(struct kobject *kobj, struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - return pmu->boxes[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < max_packages ? pmu->boxes[pkgid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -733,6 +739,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .start = uncore_pmu_event_start, .stop = uncore_pmu_event_stop, .read = uncore_pmu_event_read, + .module = THIS_MODULE, }; } else { pmu->pmu = *pmu->type->pmu; @@ -763,30 +770,6 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } -static void __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) -{ - struct intel_uncore_pmu *pmu = type->pmus; - struct intel_uncore_box *box; - int i, pkg; - - if (pmu) { - pkg = topology_physical_package_id(cpu); - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (box) - uncore_box_exit(box); - } - } -} - -static void uncore_exit_boxes(void *dummy) -{ - struct intel_uncore_type **types; - - for (types = uncore_msr_uncores; *types; types++) - __uncore_exit_boxes(*types++, smp_processor_id()); -} - static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int pkg; @@ -1057,86 +1040,6 @@ static void uncore_pci_exit(void) } } -static int uncore_cpu_dying(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (box && atomic_dec_return(&box->refcnt) == 0) - uncore_box_exit(box); - } - } - return 0; -} - -static int first_init; - -static int uncore_cpu_starting(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg, ncpus = 1; - - if (first_init) { - /* - * On init we get the number of online cpus in the package - * and set refcount for all of them. - */ - ncpus = cpumask_weight(topology_core_cpumask(cpu)); - } - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (!box) - continue; - /* The first cpu on a package activates the box */ - if (atomic_add_return(ncpus, &box->refcnt) == ncpus) - uncore_box_init(box); - } - } - - return 0; -} - -static int uncore_cpu_prepare(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - if (pmu->boxes[pkg]) - continue; - /* First cpu of a package allocates the box */ - box = uncore_alloc_box(type, cpu_to_node(cpu)); - if (!box) - return -ENOMEM; - box->pmu = pmu; - box->pkgid = pkg; - pmu->boxes[pkg] = box; - } - } - return 0; -} - static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, int new_cpu) { @@ -1176,12 +1079,14 @@ static void uncore_change_context(struct intel_uncore_type **uncores, static int uncore_event_cpu_offline(unsigned int cpu) { - int target; + struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, pkg, target; /* Check if exiting cpu is used for collecting uncore events */ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) - return 0; - + goto unref; /* Find a new cpu to collect uncore events */ target = cpumask_any_but(topology_core_cpumask(cpu), cpu); @@ -1193,12 +1098,82 @@ static int uncore_event_cpu_offline(unsigned int cpu) uncore_change_context(uncore_msr_uncores, cpu, target); uncore_change_context(uncore_pci_uncores, cpu, target); + +unref: + /* Clear the references */ + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (box && atomic_dec_return(&box->refcnt) == 0) + uncore_box_exit(box); + } + } return 0; } +static int allocate_boxes(struct intel_uncore_type **types, + unsigned int pkg, unsigned int cpu) +{ + struct intel_uncore_box *box, *tmp; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + LIST_HEAD(allocated); + int i; + + /* Try to allocate all required boxes */ + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + if (pmu->boxes[pkg]) + continue; + box = uncore_alloc_box(type, cpu_to_node(cpu)); + if (!box) + goto cleanup; + box->pmu = pmu; + box->pkgid = pkg; + list_add(&box->active_list, &allocated); + } + } + /* Install them in the pmus */ + list_for_each_entry_safe(box, tmp, &allocated, active_list) { + list_del_init(&box->active_list); + box->pmu->boxes[pkg] = box; + } + return 0; + +cleanup: + list_for_each_entry_safe(box, tmp, &allocated, active_list) { + list_del_init(&box->active_list); + kfree(box); + } + return -ENOMEM; +} + static int uncore_event_cpu_online(unsigned int cpu) { - int target; + struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, ret, pkg, target; + + pkg = topology_logical_package_id(cpu); + ret = allocate_boxes(types, pkg, cpu); + if (ret) + return ret; + + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (!box && atomic_inc_return(&box->refcnt) == 1) + uncore_box_init(box); + } + } /* * Check if there is an online cpu in the package @@ -1388,38 +1363,16 @@ static int __init intel_uncore_init(void) if (cret && pret) return -ENODEV; - /* - * Install callbacks. Core will call them for each online cpu. - * - * The first online cpu of each package allocates and takes - * the refcounts for all other online cpus in that package. - * If msrs are not enabled no allocation is required and - * uncore_cpu_prepare() is not called for each online cpu. - */ - if (!cret) { - ret = cpuhp_setup_state(CPUHP_PERF_X86_UNCORE_PREP, - "PERF_X86_UNCORE_PREP", - uncore_cpu_prepare, NULL); - if (ret) - goto err; - } else { - cpuhp_setup_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP, - "PERF_X86_UNCORE_PREP", - uncore_cpu_prepare, NULL); - } - first_init = 1; - cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_STARTING, - "AP_PERF_X86_UNCORE_STARTING", - uncore_cpu_starting, uncore_cpu_dying); - first_init = 0; - cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, - "AP_PERF_X86_UNCORE_ONLINE", - uncore_event_cpu_online, uncore_event_cpu_offline); + /* Install hotplug callbacks to setup the targets for each package */ + ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, + "perf/x86/intel/uncore:online", + uncore_event_cpu_online, + uncore_event_cpu_offline); + if (ret) + goto err; return 0; err: - /* Undo box->init_box() */ - on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); return ret; @@ -1428,9 +1381,7 @@ module_init(intel_uncore_init); static void __exit intel_uncore_exit(void) { - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_STARTING); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); } diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 272427700d48..dae2fedc1601 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -669,7 +669,7 @@ static struct event_constraint snbep_uncore_cbox_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), - EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), + UNCORE_EVENT_CONSTRAINT(0x1f, 0xe), UNCORE_EVENT_CONSTRAINT(0x21, 0x3), UNCORE_EVENT_CONSTRAINT(0x23, 0x3), UNCORE_EVENT_CONSTRAINT(0x31, 0x3), @@ -2686,7 +2686,7 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { void hswep_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(0); + int pkg = boot_cpu_data.logical_proc_id; if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index cb26f18d43af..7c0a711989d2 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -27,7 +27,7 @@ #include <linux/jiffies.h> #include <linux/perf_event.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/user32.h> diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index cb13c0564ea7..95c0b4ae09b0 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -20,7 +20,7 @@ #include <linux/compat.h> #include <linux/binfmts.h> #include <asm/ucontext.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/ptrace.h> diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 719cd702b0a4..47956c6a4fd8 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -42,7 +42,7 @@ #include <linux/slab.h> #include <asm/mman.h> #include <asm/types.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/atomic.h> #include <asm/vgtod.h> #include <asm/sys_ia32.h> diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..830b19dbfa31 --- /dev/null +++ b/arch/x86/include/asm/asm-prototypes.h @@ -0,0 +1,16 @@ +#include <asm/ftrace.h> +#include <linux/uaccess.h> +#include <asm/string.h> +#include <asm/page.h> +#include <asm/checksum.h> + +#include <asm-generic/asm-prototypes.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/special_insns.h> +#include <asm/preempt.h> + +#ifndef CONFIG_X86_CMPXCHG64 +extern void cmpxchg8b_emu(void); +#endif diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 68557f52b961..854022772c5b 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -139,6 +139,19 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } +static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +{ + bool negative; + asm volatile(LOCK_PREFIX "andb %2,%1\n\t" + CC_SET(s) + : CC_OUT(s) (negative), ADDR + : "ir" ((char) ~(1 << nr)) : "memory"); + return negative; +} + +// Let everybody know we have it +#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte + /* * __clear_bit_unlock - Clears a bit in memory * @nr: Bit to clear diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index c020ee75dce7..08e7efb2c140 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -8,7 +8,7 @@ */ #include <linux/compiler.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/byteorder.h> /** diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 59ac427960d4..eafee3161d1c 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -105,6 +105,7 @@ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ @@ -188,6 +189,9 @@ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ +#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ +#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ @@ -221,6 +225,7 @@ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ #define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 03bb1065c335..29e53ea7d764 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -5,8 +5,8 @@ #ifndef _CRYPTO_GLUE_HELPER_H #define _CRYPTO_GLUE_HELPER_H +#include <crypto/internal/skcipher.h> #include <linux/kernel.h> -#include <linux/crypto.h> #include <asm/fpu/api.h> #include <crypto/b128ops.h> @@ -69,6 +69,31 @@ static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, return true; } +static inline bool glue_skwalk_fpu_begin(unsigned int bsize, + int fpu_blocks_limit, + struct skcipher_walk *walk, + bool fpu_enabled, unsigned int nbytes) +{ + if (likely(fpu_blocks_limit < 0)) + return false; + + if (fpu_enabled) + return true; + + /* + * Vector-registers are only used when chunk to be processed is large + * enough, so do not enable FPU until it is necessary. + */ + if (nbytes < bsize * (unsigned int)fpu_blocks_limit) + return false; + + /* prevent sleeping if FPU is in use */ + skcipher_walk_atomise(walk); + + kernel_fpu_begin(); + return true; +} + static inline void glue_fpu_end(bool fpu_enabled) { if (fpu_enabled) @@ -139,6 +164,18 @@ extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, common_glue_func_t tweak_fn, void *tweak_ctx, void *crypt_ctx); +extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes, + common_glue_func_t tweak_fn, void *tweak_ctx, + void *crypt_ctx); + +extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req, + common_glue_func_t tweak_fn, void *tweak_ctx, + void *crypt_ctx); + extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, common_glue_func_t fn); diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 476b574de99e..ec23d8e1297c 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -1,13 +1,17 @@ #ifndef _ASM_X86_E820_H #define _ASM_X86_E820_H -#ifdef CONFIG_EFI +/* + * E820_X_MAX is the maximum size of the extended E820 table. The extended + * table may contain up to 3 extra E820 entries per possible NUMA node, so we + * make room for 3 * MAX_NUMNODES possible entries, beyond the standard 128. + * Also note that E820_X_MAX *must* be defined before we include uapi/asm/e820.h. + */ #include <linux/numa.h> #define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) -#else /* ! CONFIG_EFI */ -#define E820_X_MAX E820MAX -#endif + #include <uapi/asm/e820.h> + #ifndef __ASSEMBLY__ /* see comment in arch/x86/kernel/e820.c */ extern struct e820map *e820; diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index 1c7eefe32502..7ec59edde154 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -229,18 +229,18 @@ static struct fd_routine_l { int (*_dma_setup)(char *addr, unsigned long size, int mode, int io); } fd_routine[] = { { - request_dma, - free_dma, - get_dma_residue, - dma_mem_alloc, - hard_dma_setup + ._request_dma = request_dma, + ._free_dma = free_dma, + ._get_dma_residue = get_dma_residue, + ._dma_mem_alloc = dma_mem_alloc, + ._dma_setup = hard_dma_setup }, { - vdma_request_dma, - vdma_nop, - vdma_get_dma_residue, - vdma_mem_alloc, - vdma_dma_setup + ._request_dma = vdma_request_dma, + ._free_dma = vdma_nop, + ._get_dma_residue = vdma_get_dma_residue, + ._dma_mem_alloc = vdma_mem_alloc, + ._dma_setup = vdma_dma_setup } }; diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 34a46dc076d3..8167fdb67ae8 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -57,7 +57,7 @@ #define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ #define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C #define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h new file mode 100644 index 000000000000..95ce5c85b009 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt.h @@ -0,0 +1,224 @@ +#ifndef _ASM_X86_INTEL_RDT_H +#define _ASM_X86_INTEL_RDT_H + +#ifdef CONFIG_INTEL_RDT_A + +#include <linux/kernfs.h> +#include <linux/jump_label.h> + +#include <asm/intel_rdt_common.h> + +#define IA32_L3_QOS_CFG 0xc81 +#define IA32_L3_CBM_BASE 0xc90 +#define IA32_L2_CBM_BASE 0xd10 + +#define L3_QOS_CDP_ENABLE 0x01ULL + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + int closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +int __init rdtgroup_init(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: file name + * @mode: access mode + * @kf_ops: operations + * @seq_show: show content of the file + * @write: write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct rdt_resource - attributes of an RDT resource + * @enabled: Is this feature enabled on this machine + * @capable: Is this feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @max_cbm: Largest Cache Bit Mask allowed + * @min_cbm_bits: Minimum number of consecutive bits to be set + * in a cache bit mask + * @domains: All domains for this resource + * @num_domains: Number of domains active + * @msr_base: Base MSR address for CBMs + * @tmp_cbms: Scratch space when updating schemata + * @num_tmp_cbms: Number of CBMs in tmp_cbms + * @cache_level: Which cache level defines scope of this domain + * @cbm_idx_multi: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + */ +struct rdt_resource { + bool enabled; + bool capable; + char *name; + int num_closid; + int cbm_len; + int min_cbm_bits; + u32 max_cbm; + struct list_head domains; + int num_domains; + int msr_base; + u32 *tmp_cbms; + int num_tmp_cbms; + int cache_level; + int cbm_idx_multi; + int cbm_idx_offset; +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @cbm: array of cache bit masks (indexed by CLOSID) + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + u32 *cbm; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +int __init rdtgroup_init(void); + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->capable) + +#define for_each_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=1).EDX */ +union cpuid_0x10_1_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); + +void rdt_cbm_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +/* + * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control and we enable by mounting the + * resctrl file system. + * - Caches the per cpu CLOSid values and does the MSR write only + * when a task with a different CLOSid is scheduled in. + * + * Must be called with preemption disabled. + */ +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) { + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + int closid; + + /* + * If this task has a closid assigned, use it. + * Else use the closid assigned to this cpu. + */ + closid = current->closid; + if (closid == 0) + closid = this_cpu_read(cpu_closid); + + if (closid != state->closid) { + state->closid = closid; + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); + } + } +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT_A */ +#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h new file mode 100644 index 000000000000..b31081b89407 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_common.h @@ -0,0 +1,27 @@ +#ifndef _ASM_X86_INTEL_RDT_COMMON_H +#define _ASM_X86_INTEL_RDT_COMMON_H + +#define MSR_IA32_PQR_ASSOC 0x0c8f + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @rmid: The cached Resource Monitoring ID + * @closid: The cached Class Of Service ID + * @rmid_usecnt: The usage counter for rmid + * + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 rmid; + u32 closid; + int rmid_usecnt; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); + +#endif /* _ASM_X86_INTEL_RDT_COMMON_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bdde80731f49..a7066dc1a7e9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -191,6 +191,8 @@ enum { #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 +#define PFERR_GUEST_FINAL_BIT 32 +#define PFERR_GUEST_PAGE_BIT 33 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) @@ -198,6 +200,13 @@ enum { #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) #define PFERR_PK_MASK (1U << PFERR_PK_BIT) +#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) + +#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ + PFERR_USER_MASK | \ + PFERR_WRITE_MASK | \ + PFERR_PRESENT_MASK) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -695,6 +704,7 @@ struct kvm_apic_map { /* Hyper-V emulation context */ struct kvm_hv { + struct mutex hv_lock; u64 hv_guest_os_id; u64 hv_hypercall; u64 hv_tsc_page; @@ -758,7 +768,7 @@ struct kvm_arch { spinlock_t pvclock_gtod_sync_lock; bool use_master_clock; u64 master_kernel_ns; - cycle_t master_cycle_now; + u64 master_cycle_now; struct delayed_work kvmclock_update_work; struct delayed_work kvmclock_sync_work; @@ -1062,6 +1072,7 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); +bool pdptrs_changed(struct kvm_vcpu *vcpu); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); @@ -1124,7 +1135,8 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); +int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port); +int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_vcpu_halt(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); @@ -1203,7 +1215,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); @@ -1358,7 +1370,8 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); +int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index c2b8d24a235c..d74747b031ec 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -29,9 +29,20 @@ struct kvm_page_track_notifier_node { * @gpa: the physical address written by guest. * @new: the data was written to the address. * @bytes: the written length. + * @node: this node */ void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); + int bytes, struct kvm_page_track_notifier_node *node); + /* + * It is called when memory slot is being moved or removed + * users can drop write-protection for the pages in that memory slot + * + * @kvm: the kvm where memory slot being moved or removed + * @slot: the memory slot being moved or removed + * @node: this node + */ + void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node); }; void kvm_page_track_init(struct kvm *kvm); @@ -58,4 +69,5 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n); void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); +void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #endif diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 38711df3bcb5..2266f864b747 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -140,6 +140,7 @@ extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); +extern bool initrd_gone; #else static inline int __init microcode_init(void) { return 0; }; static inline void __init load_ucode_bsp(void) { } diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 195becc6f780..e793fc9a9b20 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -52,6 +52,21 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) +static inline u32 intel_get_microcode_revision(void) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_IA32_UCODE_REV, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + native_cpuid_eax(1); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); + + return rev; +} + #ifdef CONFIG_MICROCODE_INTEL extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 72198c64e646..f9813b6d8b80 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -31,6 +31,10 @@ typedef struct { u16 pkey_allocation_map; s16 execute_only_pkey; #endif +#ifdef CONFIG_X86_INTEL_MPX + /* address of the bounds directory */ + void __user *bd_addr; +#endif } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 7a35495275a9..0b416d4cf73b 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -59,7 +59,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs); int mpx_handle_bd_fault(void); static inline int kernel_managing_mpx_tables(struct mm_struct *mm) { - return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); + return (mm->context.bd_addr != MPX_INVALID_BOUNDS_DIR); } static inline void mpx_mm_init(struct mm_struct *mm) { @@ -67,7 +67,7 @@ static inline void mpx_mm_init(struct mm_struct *mm) * NULL is theoretically a valid place to put the bounds * directory, so point this at an invalid address. */ - mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR; } void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1cc82ece9ac1..62b775926045 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -116,8 +116,7 @@ static inline void native_pgd_clear(pgd_t *pgd) native_set_pgd(pgd, native_make_pgd(0)); } -extern void sync_global_pgds(unsigned long start, unsigned long end, - int removed); +extern void sync_global_pgds(unsigned long start, unsigned long end); /* * Conversion functions: convert a page and protection to a page entry, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6aa741fbe1df..e6cfe7ba2d65 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -104,6 +104,7 @@ struct cpuinfo_x86 { __u8 x86_phys_bits; /* CPUID returned core id bits: */ __u8 x86_coreid_bits; + __u8 cu_id; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ @@ -219,6 +220,24 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, : "memory"); } +#define native_cpuid_reg(reg) \ +static inline unsigned int native_cpuid_##reg(unsigned int op) \ +{ \ + unsigned int eax = op, ebx, ecx = 0, edx; \ + \ + native_cpuid(&eax, &ebx, &ecx, &edx); \ + \ + return reg; \ +} + +/* + * Native CPUID functions returning a single datum. + */ +native_cpuid_reg(eax) +native_cpuid_reg(ebx) +native_cpuid_reg(ecx) +native_cpuid_reg(edx) + static inline void load_cr3(pgd_t *pgdir) { write_cr3(__pa(pgdir)); @@ -602,33 +621,69 @@ static __always_inline void cpu_relax(void) rep_nop(); } -/* Stop speculative execution and prefetching of modified code. */ +/* + * This function forces the icache and prefetched instruction stream to + * catch up with reality in two very specific cases: + * + * a) Text was modified using one virtual address and is about to be executed + * from the same physical page at a different virtual address. + * + * b) Text was modified on a different CPU, may subsequently be + * executed on this CPU, and you want to make sure the new version + * gets executed. This generally means you're calling this in a IPI. + * + * If you're calling this for a different reason, you're probably doing + * it wrong. + */ static inline void sync_core(void) { - int tmp; - -#ifdef CONFIG_M486 /* - * Do a CPUID if available, otherwise do a jump. The jump - * can conveniently enough be the jump around CPUID. + * There are quite a few ways to do this. IRET-to-self is nice + * because it works on every CPU, at any CPL (so it's compatible + * with paravirtualization), and it never exits to a hypervisor. + * The only down sides are that it's a bit slow (it seems to be + * a bit more than 2x slower than the fastest options) and that + * it unmasks NMIs. The "push %cs" is needed because, in + * paravirtual environments, __KERNEL_CS may not be a valid CS + * value when we do IRET directly. + * + * In case NMI unmasking or performance ever becomes a problem, + * the next best option appears to be MOV-to-CR2 and an + * unconditional jump. That sequence also works on all CPUs, + * but it will fault at CPL3 (i.e. Xen PV and lguest). + * + * CPUID is the conventional way, but it's nasty: it doesn't + * exist on some 486-like CPUs, and it usually exits to a + * hypervisor. + * + * Like all of Linux's memory ordering operations, this is a + * compiler barrier as well. */ - asm volatile("cmpl %2,%1\n\t" - "jl 1f\n\t" - "cpuid\n" - "1:" - : "=a" (tmp) - : "rm" (boot_cpu_data.cpuid_level), "ri" (0), "0" (1) - : "ebx", "ecx", "edx", "memory"); + register void *__sp asm(_ASM_SP); + +#ifdef CONFIG_X86_32 + asm volatile ( + "pushfl\n\t" + "pushl %%cs\n\t" + "pushl $1f\n\t" + "iret\n\t" + "1:" + : "+r" (__sp) : : "memory"); #else - /* - * CPUID is a barrier to speculative execution. - * Prefetched instructions are automatically - * invalidated when modified. - */ - asm volatile("cpuid" - : "=a" (tmp) - : "0" (1) - : "ebx", "ecx", "edx", "memory"); + unsigned int tmp; + + asm volatile ( + "mov %%ss, %0\n\t" + "pushq %q0\n\t" + "pushq %%rsp\n\t" + "addq $8, (%%rsp)\n\t" + "pushfq\n\t" + "mov %%cs, %0\n\t" + "pushq %q0\n\t" + "pushq $1f\n\t" + "iretq\n\t" + "1:" + : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); #endif } diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 3ad741b84072..448cfe1b48cf 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -14,7 +14,7 @@ static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) #endif /* some helper functions for xen and kvm pv clock sources */ -cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); @@ -87,11 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) } static __always_inline -cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, - u64 tsc) +u64 __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc) { u64 delta = tsc - src->tsc_timestamp; - cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, + u64 offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); return src->system_time + offset; } diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index a3269c897ec5..2e41c50ddf47 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -58,7 +58,7 @@ get_frame_pointer(struct task_struct *task, struct pt_regs *regs) if (task == current) return __builtin_frame_address(0); - return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp; + return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 5cb436acd463..fcc5cd387fd1 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -36,7 +36,10 @@ static inline void prepare_switch_to(struct task_struct *prev, asmlinkage void ret_from_fork(void); -/* data that is pointed to by thread.sp */ +/* + * This is the structure pointed to by thread.sp for an inactive task. The + * order of the fields must match the code in __switch_to_asm(). + */ struct inactive_task_frame { #ifdef CONFIG_X86_64 unsigned long r15; @@ -48,6 +51,11 @@ struct inactive_task_frame { unsigned long di; #endif unsigned long bx; + + /* + * These two fields must be together. They form a stack frame header, + * needed by get_frame_pointer(). + */ unsigned long bp; unsigned long ret_addr; }; diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2fbc66c7885b..2422b14c50a7 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -6,7 +6,7 @@ #include <linux/tracepoint.h> -extern void trace_irq_vector_regfunc(void); +extern int trace_irq_vector_regfunc(void); extern void trace_irq_vector_unregfunc(void); DECLARE_EVENT_CLASS(x86_exceptions, diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 38a09a13a9bc..32dd6a9e343c 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -6,7 +6,7 @@ #include <linux/tracepoint.h> -extern void trace_irq_vector_regfunc(void); +extern int trace_irq_vector_regfunc(void); extern void trace_irq_vector_unregfunc(void); DECLARE_EVENT_CLASS(x86_irq_vector, diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 33b6365c22fe..f5e6f1c417df 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -29,7 +29,7 @@ static inline cycles_t get_cycles(void) return rdtsc(); } -extern struct system_counterval_t convert_art_to_tsc(cycle_t art); +extern struct system_counterval_t convert_art_to_tsc(u64 art); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); @@ -45,8 +45,17 @@ extern int tsc_clocksource_reliable; * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: */ +#ifdef CONFIG_X86_TSC +extern bool tsc_store_and_check_tsc_adjust(bool bootcpu); +extern void tsc_verify_tsc_adjust(bool resume); extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); +#else +static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; } +static inline void tsc_verify_tsc_adjust(bool resume) { } +static inline void check_tsc_sync_source(int cpu) { } +static inline void check_tsc_sync_target(void) { } +#endif extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index c5a7f3a930dd..6fa75b17aec3 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,7 +12,7 @@ struct unwind_state { struct task_struct *task; int graph_idx; #ifdef CONFIG_FRAME_POINTER - unsigned long *bp; + unsigned long *bp, *orig_sp; struct pt_regs *regs; #else unsigned long *sp; diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 3a01996db58f..022e59714562 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -17,8 +17,8 @@ struct vsyscall_gtod_data { unsigned seq; int vclock_mode; - cycle_t cycle_last; - cycle_t mask; + u64 cycle_last; + u64 mask; u32 mult; u32 shift; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a002b07a7099..2b5b2d4b924e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -25,6 +25,7 @@ #define VMX_H +#include <linux/bitops.h> #include <linux/types.h> #include <uapi/asm/vmx.h> @@ -60,6 +61,7 @@ */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_DESC 0x00000004 #define SECONDARY_EXEC_RDTSCP 0x00000008 #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 @@ -110,6 +112,36 @@ #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 +static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) +{ + return vmx_basic & GENMASK_ULL(30, 0); +} + +static inline u32 vmx_basic_vmcs_size(u64 vmx_basic) +{ + return (vmx_basic & GENMASK_ULL(44, 32)) >> 32; +} + +static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) +{ + return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; +} + +static inline int vmx_misc_cr3_count(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(24, 16)) >> 16; +} + +static inline int vmx_misc_max_msr(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(27, 25)) >> 25; +} + +static inline int vmx_misc_mseg_revid(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(63, 32)) >> 32; +} + /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, @@ -399,10 +431,11 @@ enum vmcs_field { #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2) #define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 #define VMX_VPID_EXTENT_ALL_CONTEXT 2 +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL 3 -#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 #define VMX_EPT_EXTENT_SHIFT 24 @@ -419,8 +452,10 @@ enum vmcs_field { #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT (1ull << 8) /* (40 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */ #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 6ba793178441..7ba7e90a9ad6 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -59,7 +59,7 @@ struct x86_init_irqs { /** * struct x86_init_oem - oem platform specific customizing functions - * @arch_setup: platform specific architecure setup + * @arch_setup: platform specific architecture setup * @banner: print a platform specific banner */ struct x86_init_oem { @@ -165,8 +165,25 @@ struct x86_legacy_devices { }; /** + * enum x86_legacy_i8042_state - i8042 keyboard controller state + * @X86_LEGACY_I8042_PLATFORM_ABSENT: the controller is always absent on + * given platform/subarch. + * @X86_LEGACY_I8042_FIRMWARE_ABSENT: firmware reports that the controller + * is absent. + * @X86_LEGACY_i8042_EXPECTED_PRESENT: the controller is likely to be + * present, the i8042 driver should probe for controller existence. + */ +enum x86_legacy_i8042_state { + X86_LEGACY_I8042_PLATFORM_ABSENT, + X86_LEGACY_I8042_FIRMWARE_ABSENT, + X86_LEGACY_I8042_EXPECTED_PRESENT, +}; + +/** * struct x86_legacy_features - legacy x86 features * + * @i8042: indicated if we expect the device to have i8042 controller + * present. * @rtc: this device has a CMOS real-time clock present * @reserve_bios_regions: boot code will search for the EBDA address and the * start of the 640k - 1M BIOS region. If false, the platform must @@ -175,6 +192,7 @@ struct x86_legacy_devices { * documentation for further details. */ struct x86_legacy_features { + enum x86_legacy_i8042_state i8042; int rtc; int reserve_bios_regions; struct x86_legacy_devices devices; @@ -188,15 +206,14 @@ struct x86_legacy_features { * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic * @nmi_init enable NMI on cpus - * @i8042_detect pre-detect if i8042 controller exists * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume - * @apic_post_init: adjust apic if neeeded + * @apic_post_init: adjust apic if needed * @legacy: legacy features * @set_legacy_features: override legacy features. Use of this callback * is highly discouraged. You should only need * this if your hardware platform requires further - * custom fine tuning far beyong what may be + * custom fine tuning far beyond what may be * possible in x86_early_init_platform_quirks() by * only using the current x86_hardware_subarch * semantics. @@ -210,7 +227,6 @@ struct x86_platform_ops { bool (*is_untracked_pat_range)(u64 start, u64 end); void (*nmi_init)(void); unsigned char (*get_nmi_reason)(void); - int (*i8042_detect)(void); void (*save_sched_clock_state)(void); void (*restore_sched_clock_state)(void); void (*apic_post_init)(void); diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index f5fb840b43e8..33cbd3db97b9 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -7,7 +7,7 @@ #include <linux/pfn.h> #include <linux/mm.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/page.h> #include <asm/pgtable.h> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..14458658e988 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,8 @@ #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 #define EXIT_REASON_EOI_INDUCED 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 @@ -113,6 +115,8 @@ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ + { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ { EXIT_REASON_INVEPT, "INVEPT" }, \ @@ -129,6 +133,7 @@ { EXIT_REASON_XRSTORS, "XRSTORS" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 +#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 05110c1097ae..581386c7e429 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -75,7 +75,7 @@ apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += smpboot.o -obj-$(CONFIG_SMP) += tsc_sync.o +obj-$(CONFIG_X86_TSC) += tsc_sync.o obj-$(CONFIG_SMP) += setup_percpu.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index c280df6b2aa2..ea3046e0b0cf 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -24,9 +24,6 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) struct acpi_hest_ia_corrected *cmc; struct acpi_hest_ia_error_bank *mc_bank; - if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) - return 0; - cmc = (struct acpi_hest_ia_corrected *)hest_hdr; if (!cmc->enabled) return 0; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4764fa56924d..64422f850e95 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -715,7 +715,7 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) int nid; nid = acpi_get_node(handle); - if (nid != -1) { + if (nid != NUMA_NO_NODE) { set_apicid_to_node(physid, nid); numa_set_node(cpu, nid); } @@ -930,6 +930,13 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) x86_platform.legacy.devices.pnpbios = 0; } + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && + !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) && + x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) { + pr_debug("ACPI: i8042 controller is absent\n"); + x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT; + } + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { pr_debug("ACPI: not registering RTC platform device\n"); x86_platform.legacy.rtc = 0; diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 169963f471bb..50b8ed0317a3 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 +#ifdef CONFIG_KASAN + /* + * The suspend path may have poisoned some areas deeper in the stack, + * which we now need to unpoison. + */ + movq %rsp, %rdi + call kasan_unpoison_task_stack_below +#endif + xorl %eax, %eax addq $8, %rsp FRAME_END diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5cb272a7a5a3..c5b8f760473c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -337,7 +337,11 @@ done: n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); } -static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) +/* + * "noinline" to cause control flow change and thus invalidate I$ and + * cause refetch after modification. + */ +static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; @@ -346,7 +350,6 @@ static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) local_irq_save(flags); add_nops(instr + (a->instrlen - a->padlen), a->padlen); - sync_core(); local_irq_restore(flags); DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", @@ -359,9 +362,12 @@ static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) * This implies that asymmetric systems where APs have less capabilities than * the boot processor are not handled. Tough. Make sure you disable such * features by hand. + * + * Marked "noinline" to cause control flow change and thus insn cache + * to refetch changed I$ lines. */ -void __init_or_module apply_alternatives(struct alt_instr *start, - struct alt_instr *end) +void __init_or_module noinline apply_alternatives(struct alt_instr *start, + struct alt_instr *end) { struct alt_instr *a; u8 *instr, *replacement; @@ -667,7 +673,6 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, unsigned long flags; local_irq_save(flags); memcpy(addr, opcode, len); - sync_core(); local_irq_restore(flags); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 456316f6c868..65721dc73bd8 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -234,7 +234,7 @@ static __init int apbt_late_init(void) if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT || !apb_timer_block_enabled) return 0; - return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "X86_APB_DEAD", NULL, + return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead", NULL, apbt_cpu_dead); } fs_initcall(apbt_late_init); @@ -247,7 +247,7 @@ void apbt_setup_secondary_clock(void) {} static int apbt_clocksource_register(void) { u64 start, now; - cycle_t t1; + u64 t1; /* Start the counter, use timer 2 as source, timer 0/1 for event */ dw_apb_clocksource_start(clocksource_apbt); @@ -355,7 +355,7 @@ unsigned long apbt_quick_calibrate(void) { int i, scale; u64 old, new; - cycle_t t1, t2; + u64 t1, t2; unsigned long khz = 0; u32 loop, shift; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index bb47e5eacd44..5b7e43eff139 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2160,21 +2160,6 @@ int __generic_processor_info(int apicid, int version, bool enabled) } /* - * This can happen on physical hotplug. The sanity check at boot time - * is done from native_smp_prepare_cpus() after num_possible_cpus() is - * established. - */ - if (topology_update_package_map(apicid, cpu) < 0) { - int thiscpu = max + disabled_cpus; - - pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", - thiscpu, apicid); - - disabled_cpus++; - return -ENOSPC; - } - - /* * Validate version */ if (version == 0x0) { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 945e512a112a..bd6b8c270c24 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2115,6 +2115,7 @@ static inline void __init check_timer(void) if (idx != -1 && irq_trigger(idx)) unmask_ioapic_irq(irq_get_chip_data(0)); } + irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) @@ -2136,6 +2137,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); legacy_pic->unmask(0); if (timer_irq_works()) { diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 200af5ae9662..5a35f208ed95 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -191,7 +191,7 @@ static int x2apic_cluster_probe(void) if (!x2apic_mode) return 0; - ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", + ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", x2apic_prepare_cpu, x2apic_dead_cpu); if (ret < 0) { pr_err("Failed to register X2APIC_PREPARE\n"); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 643818a7688b..45d44c173cf9 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -234,7 +234,7 @@ #include <linux/i8253.h> #include <linux/cpuidle.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/desc.h> #include <asm/olpc.h> #include <asm/paravirt.h> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 33b63670bf09..52000010c62e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,6 +32,8 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o +obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o + obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 71cae73a5076..2b4cf04239b6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -312,12 +312,19 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - node_id = ecx & 7; - /* get compute unit information */ - smp_num_siblings = ((ebx >> 8) & 3) + 1; - c->x86_max_cores /= smp_num_siblings; - c->cpu_core_id = ebx & 0xff; + node_id = ecx & 0xff; + smp_num_siblings = ((ebx >> 8) & 0xff) + 1; + + if (c->x86 == 0x15) + c->cu_id = ebx & 0xff; + + if (c->x86 >= 0x17) { + c->cpu_core_id = ebx & 0xff; + + if (smp_num_siblings > 1) + c->x86_max_cores /= smp_num_siblings; + } /* * We may have multiple LLCs if L3 caches exist, so check if we diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 729f92ba8224..ede03e849a8b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -667,13 +667,14 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_1_EDX] = edx; } + /* Thermal and Power Management Leaf: level 0x00000006 (eax) */ + if (c->cpuid_level >= 0x00000006) + c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); + /* Additional Intel-defined flags: level 0x00000007 */ if (c->cpuid_level >= 0x00000007) { cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_7_0_EBX] = ebx; - - c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); c->x86_capability[CPUID_7_ECX] = ecx; } @@ -979,29 +980,21 @@ static void x86_init_cache_qos(struct cpuinfo_x86 *c) } /* - * The physical to logical package id mapping is initialized from the - * acpi/mptables information. Make sure that CPUID actually agrees with - * that. + * Validate that ACPI/mptables have the same information about the + * effective APIC id and update the package map. */ -static void sanitize_package_id(struct cpuinfo_x86 *c) +static void validate_apic_and_package_id(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - unsigned int pkg, apicid, cpu = smp_processor_id(); + unsigned int apicid, cpu = smp_processor_id(); apicid = apic->cpu_present_to_apicid(cpu); - pkg = apicid >> boot_cpu_data.x86_coreid_bits; - if (apicid != c->initial_apicid) { - pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x CPUID: %x\n", + if (apicid != c->apicid) { + pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n", cpu, apicid, c->initial_apicid); - c->initial_apicid = apicid; - } - if (pkg != c->phys_proc_id) { - pr_err(FW_BUG "CPU%u: Using firmware package id %u instead of %u\n", - cpu, pkg, c->phys_proc_id); - c->phys_proc_id = pkg; } - c->logical_proc_id = topology_phys_to_logical_pkg(pkg); + BUG_ON(topology_update_package_map(c->phys_proc_id, cpu)); #else c->logical_proc_id = 0; #endif @@ -1022,6 +1015,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; c->x86_coreid_bits = 0; + c->cu_id = 0xff; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1132,7 +1126,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif - sanitize_package_id(c); } /* @@ -1187,6 +1180,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) enable_sep_cpu(); #endif mtrr_ap_init(); + validate_apic_and_package_id(c); } static __init int setup_noclflush(char *arg) @@ -1228,7 +1222,7 @@ static __init int setup_disablecpuid(char *arg) { int bit; - if (get_option(&arg, &bit) && bit < NCAPINTS*32) + if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) setup_clear_cpu_cap(bit); else return 0; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fcd484d2bb03..203f860d2ab3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -14,6 +14,7 @@ #include <asm/bugs.h> #include <asm/cpu.h> #include <asm/intel-family.h> +#include <asm/microcode_intel.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -78,14 +79,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { - unsigned lower_word; - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* Required by the SDM */ - sync_core(); - rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); - } + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) + c->microcode = intel_get_microcode_revision(); /* * Atom erratum AAE44/AAF40/AAG38/AAH41: diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index de6626c18e42..0282b0df004a 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -153,6 +153,7 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; + unsigned int id; unsigned long size; struct amd_northbridge *nb; }; @@ -894,6 +895,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, static void ci_leaf_init(struct cacheinfo *this_leaf, struct _cpuid4_info_regs *base) { + this_leaf->id = base->id; + this_leaf->attributes = CACHE_ID; this_leaf->level = base->eax.split.level; this_leaf->type = cache_type_map[base->eax.split.type]; this_leaf->coherency_line_size = @@ -920,6 +923,22 @@ static int __init_cache_level(unsigned int cpu) return 0; } +/* + * The max shared threads number comes from CPUID.4:EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. + */ +static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + id4_regs->id = c->apicid >> index_msb; +} + static int __populate_cache_leaves(unsigned int cpu) { unsigned int idx, ret; @@ -931,9 +950,12 @@ static int __populate_cache_leaves(unsigned int cpu) ret = cpuid4_cache_lookup_regs(idx, &id4_regs); if (ret) return ret; + get_cache_id(cpu, &id4_regs); ci_leaf_init(this_leaf++, &id4_regs); __cache_cpumap_setup(cpu, idx, &id4_regs); } + this_cpu_ci->cpu_map_populated = true; + return 0; } diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c new file mode 100644 index 000000000000..5a533fefefa0 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -0,0 +1,403 @@ +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu <fenghua.yu@intel.com> + * Tony Luck <tony.luck@intel.com> + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/cacheinfo.h> +#include <linux/cpuhotplug.h> + +#include <asm/intel-family.h> +#include <asm/intel_rdt.h> + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); + +#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) + +struct rdt_resource rdt_resources_all[] = { + { + .name = "L3", + .domains = domain_init(RDT_RESOURCE_L3), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 1, + .cbm_idx_offset = 0 + }, + { + .name = "L3DATA", + .domains = domain_init(RDT_RESOURCE_L3DATA), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 2, + .cbm_idx_offset = 0 + }, + { + .name = "L3CODE", + .domains = domain_init(RDT_RESOURCE_L3CODE), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 2, + .cbm_idx_offset = 1 + }, + { + .name = "L2", + .domains = domain_init(RDT_RESOURCE_L2), + .msr_base = IA32_L2_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 2, + .cbm_idx_multi = 1, + .cbm_idx_offset = 0 + }, +}; + +static int cbm_idx(struct rdt_resource *r, int closid) +{ + return closid * r->cbm_idx_multi + r->cbm_idx_offset; +} + +/* + * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs + * as they do not have CPUID enumeration support for Cache allocation. + * The check for Vendor/Family/Model is not enough to guarantee that + * the MSRs won't #GP fault because only the following SKUs support + * CAT: + * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz + * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz + * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz + * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz + * + * Probe by trying to write the first of the L3 cach mask registers + * and checking that the bits stick. Max CLOSids is always 4 and max cbm length + * is always 20 on hsw server parts. The minimum cache bitmask length + * allowed for HSW server is always 2 bits. Hardcode all of them. + */ +static inline bool cache_alloc_hsw_probe(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; + + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return false; + rdmsr(IA32_L3_CBM_BASE, l, h); + + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return false; + + r->num_closid = 4; + r->cbm_len = 20; + r->max_cbm = max_cbm; + r->min_cbm_bits = 2; + r->capable = true; + r->enabled = true; + + return true; + } + + return false; +} + +static void rdt_get_config(int idx, struct rdt_resource *r) +{ + union cpuid_0x10_1_eax eax; + union cpuid_0x10_1_edx edx; + u32 ebx, ecx; + + cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + r->cbm_len = eax.split.cbm_len + 1; + r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->capable = true; + r->enabled = true; +} + +static void rdt_get_cdp_l3_config(int type) +{ + struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r = &rdt_resources_all[type]; + + r->num_closid = r_l3->num_closid / 2; + r->cbm_len = r_l3->cbm_len; + r->max_cbm = r_l3->max_cbm; + r->capable = true; + /* + * By default, CDP is disabled. CDP can be enabled by mount parameter + * "cdp" during resctrl file system mount time. + */ + r->enabled = false; +} + +static inline bool get_rdt_resources(void) +{ + bool ret = false; + + if (cache_alloc_hsw_probe()) + return true; + + if (!boot_cpu_has(X86_FEATURE_RDT_A)) + return false; + + if (boot_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); + rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); + } + ret = true; + } + if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + /* CPUID 0x10.2 fields are same format at 0x10.1 */ + rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + ret = true; + } + + return ret; +} + +static int get_cache_id(int cpu, int level) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) + return ci->info_list[i].id; + } + + return -1; +} + +void rdt_cbm_update(void *arg) +{ + struct msr_param *m = (struct msr_param *)arg; + struct rdt_resource *r = m->res; + int i, cpu = smp_processor_id(); + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + goto found; + } + pr_info_once("cpu %d not found in any domain for resource %s\n", + cpu, r->name); + + return; + +found: + for (i = m->low; i < m->high; i++) { + int idx = cbm_idx(r, i); + + wrmsrl(r->msr_base + idx, d->cbm[i]); + } +} + +/* + * rdt_find_domain - Find a domain in a resource that matches input resource id + * + * Search resource r's domain list to find the resource id. If the resource + * id is found in a domain, return the domain. Otherwise, if requested by + * caller, return the first domain whose id is bigger than the input id. + * The domain list is sorted by id in ascending order. + */ +static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) +{ + struct rdt_domain *d; + struct list_head *l; + + if (id < 0) + return ERR_PTR(id); + + list_for_each(l, &r->domains) { + d = list_entry(l, struct rdt_domain, list); + /* When id is found, return its domain. */ + if (id == d->id) + return d; + /* Stop searching when finding id's position in sorted list. */ + if (id < d->id) + break; + } + + if (pos) + *pos = l; + + return NULL; +} + +/* + * domain_add_cpu - Add a cpu to a resource's domain list. + * + * If an existing domain in the resource r's domain list matches the cpu's + * resource id, add the cpu in the domain. + * + * Otherwise, a new domain is allocated and inserted into the right position + * in the domain list sorted by id in ascending order. + * + * The order in the domain list is visible to users when we print entries + * in the schemata file and schemata input is validated to have the same order + * as this list. + */ +static void domain_add_cpu(int cpu, struct rdt_resource *r) +{ + int i, id = get_cache_id(cpu, r->cache_level); + struct list_head *add_pos = NULL; + struct rdt_domain *d; + + d = rdt_find_domain(r, id, &add_pos); + if (IS_ERR(d)) { + pr_warn("Could't find cache id for cpu %d\n", cpu); + return; + } + + if (d) { + cpumask_set_cpu(cpu, &d->cpu_mask); + return; + } + + d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); + if (!d) + return; + + d->id = id; + + d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL); + if (!d->cbm) { + kfree(d); + return; + } + + for (i = 0; i < r->num_closid; i++) { + int idx = cbm_idx(r, i); + + d->cbm[i] = r->max_cbm; + wrmsrl(r->msr_base + idx, d->cbm[i]); + } + + cpumask_set_cpu(cpu, &d->cpu_mask); + list_add_tail(&d->list, add_pos); + r->num_domains++; +} + +static void domain_remove_cpu(int cpu, struct rdt_resource *r) +{ + int id = get_cache_id(cpu, r->cache_level); + struct rdt_domain *d; + + d = rdt_find_domain(r, id, NULL); + if (IS_ERR_OR_NULL(d)) { + pr_warn("Could't find cache id for cpu %d\n", cpu); + return; + } + + cpumask_clear_cpu(cpu, &d->cpu_mask); + if (cpumask_empty(&d->cpu_mask)) { + r->num_domains--; + kfree(d->cbm); + list_del(&d->list); + kfree(d); + } +} + +static void clear_closid(int cpu) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + + per_cpu(cpu_closid, cpu) = 0; + state->closid = 0; + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); +} + +static int intel_rdt_online_cpu(unsigned int cpu) +{ + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_add_cpu(cpu, r); + /* The cpu is set in default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + clear_closid(cpu); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int intel_rdt_offline_cpu(unsigned int cpu) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_remove_cpu(cpu, r); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + break; + } + clear_closid(cpu); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int __init intel_rdt_late_init(void) +{ + struct rdt_resource *r; + int state, ret; + + if (!get_rdt_resources()) + return -ENODEV; + + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/rdt/cat:online:", + intel_rdt_online_cpu, intel_rdt_offline_cpu); + if (state < 0) + return state; + + ret = rdtgroup_init(); + if (ret) { + cpuhp_remove_state(state); + return ret; + } + + for_each_capable_rdt_resource(r) + pr_info("Intel RDT %s allocation detected\n", r->name); + + return 0; +} + +late_initcall(intel_rdt_late_init); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c new file mode 100644 index 000000000000..8af04afdfcb9 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -0,0 +1,1115 @@ +/* + * User interface for Resource Alloction in Resource Director Technology(RDT) + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Fenghua Yu <fenghua.yu@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> +#include <linux/fs.h> +#include <linux/sysfs.h> +#include <linux/kernfs.h> +#include <linux/seq_file.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/task_work.h> + +#include <uapi/linux/magic.h> + +#include <asm/intel_rdt.h> +#include <asm/intel_rdt_common.h> + +DEFINE_STATIC_KEY_FALSE(rdt_enable_key); +struct kernfs_root *rdt_root; +struct rdtgroup rdtgroup_default; +LIST_HEAD(rdt_all_groups); + +/* Kernel fs node for "info" directory under root */ +static struct kernfs_node *kn_info; + +/* + * Trivial allocator for CLOSIDs. Since h/w only supports a small number, + * we can keep a bitmap of free CLOSIDs in a single integer. + * + * Using a global CLOSID across all resources has some advantages and + * some drawbacks: + * + We can simply set "current->closid" to assign a task to a resource + * group. + * + Context switch code can avoid extra memory references deciding which + * CLOSID to load into the PQR_ASSOC MSR + * - We give up some options in configuring resource groups across multi-socket + * systems. + * - Our choices on how to configure each resource become progressively more + * limited as the number of resources grows. + */ +static int closid_free_map; + +static void closid_init(void) +{ + struct rdt_resource *r; + int rdt_min_closid = 32; + + /* Compute rdt_min_closid across all resources */ + for_each_enabled_rdt_resource(r) + rdt_min_closid = min(rdt_min_closid, r->num_closid); + + closid_free_map = BIT_MASK(rdt_min_closid) - 1; + + /* CLOSID 0 is always reserved for the default group */ + closid_free_map &= ~1; +} + +int closid_alloc(void) +{ + int closid = ffs(closid_free_map); + + if (closid == 0) + return -ENOSPC; + closid--; + closid_free_map &= ~(1 << closid); + + return closid; +} + +static void closid_free(int closid) +{ + closid_free_map |= 1 << closid; +} + +/* set uid and gid of rdtgroup dirs and files to that of the creator */ +static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + +static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) +{ + struct kernfs_node *kn; + int ret; + + kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + 0, rft->kf_ops, rft, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return 0; +} + +static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, + int len) +{ + struct rftype *rft; + int ret; + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) + kernfs_remove_by_name(kn, rft->name); + return ret; +} + +static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + struct rftype *rft = of->kn->priv; + + if (rft->seq_show) + return rft->seq_show(of, m, arg); + return 0; +} + +static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rftype *rft = of->kn->priv; + + if (rft->write) + return rft->write(of, buf, nbytes, off); + + return -EINVAL; +} + +static struct kernfs_ops rdtgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = rdtgroup_file_write, + .seq_show = rdtgroup_seqfile_show, +}; + +static int rdtgroup_cpus_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) + seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* + * This is safe against intel_rdt_sched_in() called from __switch_to() + * because __switch_to() is executed with interrupts disabled. A local call + * from rdt_update_closid() is proteced against __switch_to() because + * preemption is disabled. + */ +static void rdt_update_cpu_closid(void *closid) +{ + if (closid) + this_cpu_write(cpu_closid, *(int *)closid); + /* + * We cannot unconditionally write the MSR because the current + * executing task might have its own closid selected. Just reuse + * the context switch code. + */ + intel_rdt_sched_in(); +} + +/* + * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, + * + * Per task closids must have been set up before calling this function. + * + * The per cpu closids are updated with the smp function call, when @closid + * is not NULL. If @closid is NULL then all affected percpu closids must + * have been set up before calling this function. + */ +static void +rdt_update_closid(const struct cpumask *cpu_mask, int *closid) +{ + int cpu = get_cpu(); + + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_update_cpu_closid(closid); + smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); + put_cpu(); +} + +static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + cpumask_var_t tmpmask, newmask; + struct rdtgroup *rdtgrp, *r; + int ret; + + if (!buf) + return -EINVAL; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + return -ENOMEM; + } + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto unlock; + } + + ret = cpumask_parse(buf, newmask); + if (ret) + goto unlock; + + /* check that user didn't specify any offline cpus */ + cpumask_andnot(tmpmask, newmask, cpu_online_mask); + if (cpumask_weight(tmpmask)) { + ret = -EINVAL; + goto unlock; + } + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) { + ret = -EINVAL; + goto unlock; + } + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + rdt_update_closid(tmpmask, &rdtgroup_default.closid); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu closid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); + } + rdt_update_closid(tmpmask, &rdtgrp->closid); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + +unlock: + rdtgroup_kn_unlock(of->kn); + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + + return ret ?: nbytes; +} + +struct task_move_callback { + struct callback_head work; + struct rdtgroup *rdtgrp; +}; + +static void move_myself(struct callback_head *head) +{ + struct task_move_callback *callback; + struct rdtgroup *rdtgrp; + + callback = container_of(head, struct task_move_callback, work); + rdtgrp = callback->rdtgrp; + + /* + * If resource group was deleted before this task work callback + * was invoked, then assign the task to root group and free the + * resource group. + */ + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + current->closid = 0; + kfree(rdtgrp); + } + + preempt_disable(); + /* update PQR_ASSOC MSR to make resource group go into effect */ + intel_rdt_sched_in(); + preempt_enable(); + + kfree(callback); +} + +static int __rdtgroup_move_task(struct task_struct *tsk, + struct rdtgroup *rdtgrp) +{ + struct task_move_callback *callback; + int ret; + + callback = kzalloc(sizeof(*callback), GFP_KERNEL); + if (!callback) + return -ENOMEM; + callback->work.func = move_myself; + callback->rdtgrp = rdtgrp; + + /* + * Take a refcount, so rdtgrp cannot be freed before the + * callback has been invoked. + */ + atomic_inc(&rdtgrp->waitcount); + ret = task_work_add(tsk, &callback->work, true); + if (ret) { + /* + * Task is exiting. Drop the refcount and free the callback. + * No need to check the refcount as the group cannot be + * deleted before the write function unlocks rdtgroup_mutex. + */ + atomic_dec(&rdtgrp->waitcount); + kfree(callback); + } else { + tsk->closid = rdtgrp->closid; + } + return ret; +} + +static int rdtgroup_task_write_permission(struct task_struct *task, + struct kernfs_open_file *of) +{ + const struct cred *tcred = get_task_cred(task); + const struct cred *cred = current_cred(); + int ret = 0; + + /* + * Even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EPERM; + + put_cred(tcred); + return ret; +} + +static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + struct task_struct *tsk; + int ret; + + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + } else { + tsk = current; + } + + get_task_struct(tsk); + rcu_read_unlock(); + + ret = rdtgroup_task_write_permission(tsk, of); + if (!ret) + ret = __rdtgroup_move_task(tsk, rdtgrp); + + put_task_struct(tsk); + return ret; +} + +static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + pid_t pid; + + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) + ret = rdtgroup_move_task(pid, rdtgrp, of); + else + ret = -ENOENT; + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) +{ + struct task_struct *p, *t; + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (t->closid == r->closid) + seq_printf(s, "%d\n", t->pid); + } + rcu_read_unlock(); +} + +static int rdtgroup_tasks_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + show_rdt_tasks(rdtgrp, s); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* Files in each rdtgroup */ +static struct rftype rdtgroup_base_files[] = { + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + }, +}; + +static int rdt_num_closids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_closid); + + return 0; +} + +static int rdt_cbm_mask_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->max_cbm); + + return 0; +} + +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->min_cbm_bits); + + return 0; +} + +/* rdtgroup information files for one cache resource. */ +static struct rftype res_info_files[] = { + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + }, + { + .name = "cbm_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_cbm_mask_show, + }, + { + .name = "min_cbm_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_cbm_bits_show, + }, +}; + +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) +{ + struct kernfs_node *kn_subdir; + struct rdt_resource *r; + int ret; + + /* create the directory */ + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); + if (IS_ERR(kn_info)) + return PTR_ERR(kn_info); + kernfs_get(kn_info); + + for_each_enabled_rdt_resource(r) { + kn_subdir = kernfs_create_dir(kn_info, r->name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) { + ret = PTR_ERR(kn_subdir); + goto out_destroy; + } + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + goto out_destroy; + ret = rdtgroup_add_files(kn_subdir, res_info_files, + ARRAY_SIZE(res_info_files)); + if (ret) + goto out_destroy; + kernfs_activate(kn_subdir); + } + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. + */ + kernfs_get(kn_info); + + ret = rdtgroup_kn_set_ugid(kn_info); + if (ret) + goto out_destroy; + + kernfs_activate(kn_info); + + return 0; + +out_destroy: + kernfs_remove(kn_info); + return ret; +} + +static void l3_qos_cfg_update(void *arg) +{ + bool *enable = arg; + + wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); +} + +static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) +{ + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + list_for_each_entry(d, &r->domains, list) { + /* Pick one CPU from each domain instance to update MSR */ + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + } + cpu = get_cpu(); + /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + l3_qos_cfg_update(&enable); + /* Update QOS_CFG MSR on all other cpus in cpu_mask. */ + smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +static int cdp_enable(void) +{ + struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA]; + struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE]; + struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + int ret; + + if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + return -EINVAL; + + ret = set_l3_qos_cfg(r_l3, true); + if (!ret) { + r_l3->enabled = false; + r_l3data->enabled = true; + r_l3code->enabled = true; + } + return ret; +} + +static void cdp_disable(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + + r->enabled = r->capable; + + if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + set_l3_qos_cfg(r, false); + } +} + +static int parse_rdtgroupfs_options(char *data) +{ + char *token, *o = data; + int ret = 0; + + while ((token = strsep(&o, ",")) != NULL) { + if (!*token) + return -EINVAL; + + if (!strcmp(token, "cdp")) + ret = cdp_enable(); + } + + return ret; +} + +/* + * We don't allow rdtgroup directories to be created anywhere + * except the root directory. Thus when looking for the rdtgroup + * structure for a kernfs node we are either looking at a directory, + * in which case the rdtgroup structure is pointed at by the "priv" + * field, otherwise we have a file, and need only look to the parent + * to find the rdtgroup. + */ +static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) { + /* + * All the resource directories use "kn->priv" + * to point to the "struct rdtgroup" for the + * resource. "info" and its subdirectories don't + * have rdtgroup structures, so return NULL here. + */ + if (kn == kn_info || kn->parent == kn_info) + return NULL; + else + return kn->priv; + } else { + return kn->parent->priv; + } +} + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return NULL; + + atomic_inc(&rdtgrp->waitcount); + kernfs_break_active_protection(kn); + + mutex_lock(&rdtgroup_mutex); + + /* Was this group deleted while we waited? */ + if (rdtgrp->flags & RDT_DELETED) + return NULL; + + return rdtgrp; +} + +void rdtgroup_kn_unlock(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return; + + mutex_unlock(&rdtgroup_mutex); + + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + kernfs_unbreak_active_protection(kn); + kernfs_put(kn); + kfree(rdtgrp); + } else { + kernfs_unbreak_active_protection(kn); + } +} + +static struct dentry *rdt_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data) +{ + struct dentry *dentry; + int ret; + + mutex_lock(&rdtgroup_mutex); + /* + * resctrl file system can only be mounted once. + */ + if (static_branch_unlikely(&rdt_enable_key)) { + dentry = ERR_PTR(-EBUSY); + goto out; + } + + ret = parse_rdtgroupfs_options(data); + if (ret) { + dentry = ERR_PTR(ret); + goto out_cdp; + } + + closid_init(); + + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); + if (ret) { + dentry = ERR_PTR(ret); + goto out_cdp; + } + + dentry = kernfs_mount(fs_type, flags, rdt_root, + RDTGROUP_SUPER_MAGIC, NULL); + if (IS_ERR(dentry)) + goto out_cdp; + + static_branch_enable(&rdt_enable_key); + goto out; + +out_cdp: + cdp_disable(); +out: + mutex_unlock(&rdtgroup_mutex); + + return dentry; +} + +static int reset_all_cbms(struct rdt_resource *r) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int i, cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.res = r; + msr_param.low = 0; + msr_param.high = r->num_closid; + + /* + * Disable resource control for this resource by setting all + * CBMs in all domains to the maximum mask value. Pick one CPU + * from each domain to update the MSRs below. + */ + list_for_each_entry(d, &r->domains, list) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + + for (i = 0; i < r->num_closid; i++) + d->cbm[i] = r->max_cbm; + } + cpu = get_cpu(); + /* Update CBM on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_cbm_update(&msr_param); + /* Update CBM on all other cpus in cpu_mask. */ + smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +/* + * Move tasks from one to the other group. If @from is NULL, then all tasks + * in the systems are moved unconditionally (used for teardown). + * + * If @mask is not NULL the cpus on which moved tasks are running are set + * in that mask so the update smp function call is restricted to affected + * cpus. + */ +static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, + struct cpumask *mask) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + if (!from || t->closid == from->closid) { + t->closid = to->closid; +#ifdef CONFIG_SMP + /* + * This is safe on x86 w/o barriers as the ordering + * of writing to task_cpu() and t->on_cpu is + * reverse to the reading here. The detection is + * inaccurate as tasks might move or schedule + * before the smp function call takes place. In + * such a case the function call is pointless, but + * there is no other side effect. + */ + if (mask && t->on_cpu) + cpumask_set_cpu(task_cpu(t), mask); +#endif + } + } + read_unlock(&tasklist_lock); +} + +/* + * Forcibly remove all of subdirectories under root. + */ +static void rmdir_all_sub(void) +{ + struct rdtgroup *rdtgrp, *tmp; + + /* Move all tasks to the default resource group */ + rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); + + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Remove each rdtgroup other than root */ + if (rdtgrp == &rdtgroup_default) + continue; + + /* + * Give any CPUs back to the default group. We cannot copy + * cpu_online_mask because a CPU might have executed the + * offline callback already, but is still marked online. + */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + kernfs_remove(rdtgrp->kn); + list_del(&rdtgrp->rdtgroup_list); + kfree(rdtgrp); + } + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ + get_online_cpus(); + rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + put_online_cpus(); + + kernfs_remove(kn_info); +} + +static void rdt_kill_sb(struct super_block *sb) +{ + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + + /*Put everything back to default values. */ + for_each_enabled_rdt_resource(r) + reset_all_cbms(r); + cdp_disable(); + rmdir_all_sub(); + static_branch_disable(&rdt_enable_key); + kernfs_kill_sb(sb); + mutex_unlock(&rdtgroup_mutex); +} + +static struct file_system_type rdt_fs_type = { + .name = "resctrl", + .mount = rdt_mount, + .kill_sb = rdt_kill_sb, +}; + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct rdtgroup *parent, *rdtgrp; + struct kernfs_node *kn; + int ret, closid; + + /* Only allow mkdir in the root directory */ + if (parent_kn != rdtgroup_default.kn) + return -EPERM; + + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + parent = rdtgroup_kn_lock_live(parent_kn); + if (!parent) { + ret = -ENODEV; + goto out_unlock; + } + + ret = closid_alloc(); + if (ret < 0) + goto out_unlock; + closid = ret; + + /* allocate the rdtgroup. */ + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); + if (!rdtgrp) { + ret = -ENOSPC; + goto out_closid_free; + } + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + /* kernfs creates the directory for rdtgrp */ + kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_cancel_ref; + } + rdtgrp->kn = kn; + + /* + * kernfs_remove() will drop the reference count on "kn" which + * will free it. But we still need it to stick around for the + * rdtgroup_kn_unlock(kn} call below. Take one extra reference + * here, which will be dropped inside rdtgroup_kn_unlock(). + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + ret = rdtgroup_add_files(kn, rdtgroup_base_files, + ARRAY_SIZE(rdtgroup_base_files)); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + ret = 0; + goto out_unlock; + +out_destroy: + kernfs_remove(rdtgrp->kn); +out_cancel_ref: + list_del(&rdtgrp->rdtgroup_list); + kfree(rdtgrp); +out_closid_free: + closid_free(closid); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + int ret, cpu, closid = rdtgroup_default.closid; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* Give any tasks back to the default group */ + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); + + /* Give any CPUs back to the default group */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + /* Update per cpu closid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(cpu_closid, cpu) = closid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + rdt_update_closid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + closid_free(rdtgrp->closid); + list_del(&rdtgrp->rdtgroup_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + ret = 0; +out: + rdtgroup_kn_unlock(kn); + free_cpumask_var(tmpmask); + return ret; +} + +static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) +{ + if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + seq_puts(seq, ",cdp"); + return 0; +} + +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { + .mkdir = rdtgroup_mkdir, + .rmdir = rdtgroup_rmdir, + .show_options = rdtgroup_show_options, +}; + +static int __init rdtgroup_setup_root(void) +{ + int ret; + + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + &rdtgroup_default); + if (IS_ERR(rdt_root)) + return PTR_ERR(rdt_root); + + mutex_lock(&rdtgroup_mutex); + + rdtgroup_default.closid = 0; + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); + + ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, + ARRAY_SIZE(rdtgroup_base_files)); + if (ret) { + kernfs_destroy_root(rdt_root); + goto out; + } + + rdtgroup_default.kn = rdt_root->kn; + kernfs_activate(rdtgroup_default.kn); + +out: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +/* + * rdtgroup_init - rdtgroup initialization + * + * Setup resctrl file system including set up root, create mount point, + * register rdtgroup filesystem, and initialize files under root directory. + * + * Return: 0 on success or -errno + */ +int __init rdtgroup_init(void) +{ + int ret = 0; + + ret = rdtgroup_setup_root(); + if (ret) + return ret; + + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); + if (ret) + goto cleanup_root; + + ret = register_filesystem(&rdt_fs_type); + if (ret) + goto cleanup_mountpoint; + + return 0; + +cleanup_mountpoint: + sysfs_remove_mount_point(fs_kobj, "resctrl"); +cleanup_root: + kernfs_destroy_root(rdt_root); + + return ret; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c new file mode 100644 index 000000000000..f369cb8db0d5 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c @@ -0,0 +1,245 @@ +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu <fenghua.yu@intel.com> + * Tony Luck <tony.luck@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernfs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <asm/intel_rdt.h> + +/* + * Check whether a cache bit mask is valid. The SDM says: + * Please note that all (and only) contiguous '1' combinations + * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). + * Additionally Haswell requires at least two bits set. + */ +static bool cbm_validate(unsigned long var, struct rdt_resource *r) +{ + unsigned long first_bit, zero_bit; + + if (var == 0 || var > r->max_cbm) + return false; + + first_bit = find_first_bit(&var, r->cbm_len); + zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit); + + if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len) + return false; + + if ((zero_bit - first_bit) < r->min_cbm_bits) + return false; + return true; +} + +/* + * Read one cache bit mask (hex). Check that it is valid for the current + * resource type. + */ +static int parse_cbm(char *buf, struct rdt_resource *r) +{ + unsigned long data; + int ret; + + ret = kstrtoul(buf, 16, &data); + if (ret) + return ret; + if (!cbm_validate(data, r)) + return -EINVAL; + r->tmp_cbms[r->num_tmp_cbms++] = data; + + return 0; +} + +/* + * For each domain in this resource we expect to find a series of: + * id=mask + * separated by ";". The "id" is in decimal, and must appear in the + * right order. + */ +static int parse_line(char *line, struct rdt_resource *r) +{ + char *dom = NULL, *id; + struct rdt_domain *d; + unsigned long dom_id; + + list_for_each_entry(d, &r->domains, list) { + dom = strsep(&line, ";"); + if (!dom) + return -EINVAL; + id = strsep(&dom, "="); + if (kstrtoul(id, 10, &dom_id) || dom_id != d->id) + return -EINVAL; + if (parse_cbm(dom, r)) + return -EINVAL; + } + + /* Any garbage at the end of the line? */ + if (line && line[0]) + return -EINVAL; + return 0; +} + +static int update_domains(struct rdt_resource *r, int closid) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu, idx = 0; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.low = closid; + msr_param.high = msr_param.low + 1; + msr_param.res = r; + + list_for_each_entry(d, &r->domains, list) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + d->cbm[msr_param.low] = r->tmp_cbms[idx++]; + } + cpu = get_cpu(); + /* Update CBM on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_cbm_update(&msr_param); + /* Update CBM on other cpus. */ + smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + char *tok, *resname; + int closid, ret = 0; + u32 *l3_cbms = NULL; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + closid = rdtgrp->closid; + + /* get scratch space to save all the masks while we validate input */ + for_each_enabled_rdt_resource(r) { + r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms), + GFP_KERNEL); + if (!r->tmp_cbms) { + ret = -ENOMEM; + goto out; + } + r->num_tmp_cbms = 0; + } + + while ((tok = strsep(&buf, "\n")) != NULL) { + resname = strsep(&tok, ":"); + if (!tok) { + ret = -EINVAL; + goto out; + } + for_each_enabled_rdt_resource(r) { + if (!strcmp(resname, r->name) && + closid < r->num_closid) { + ret = parse_line(tok, r); + if (ret) + goto out; + break; + } + } + if (!r->name) { + ret = -EINVAL; + goto out; + } + } + + /* Did the parser find all the masks we need? */ + for_each_enabled_rdt_resource(r) { + if (r->num_tmp_cbms != r->num_domains) { + ret = -EINVAL; + goto out; + } + } + + for_each_enabled_rdt_resource(r) { + ret = update_domains(r, closid); + if (ret) + goto out; + } + +out: + rdtgroup_kn_unlock(of->kn); + for_each_enabled_rdt_resource(r) { + kfree(r->tmp_cbms); + r->tmp_cbms = NULL; + } + return ret ?: nbytes; +} + +static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) +{ + struct rdt_domain *dom; + bool sep = false; + + seq_printf(s, "%s:", r->name); + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_puts(s, ";"); + seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]); + sep = true; + } + seq_puts(s, "\n"); +} + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + int closid, ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) { + closid = rdtgrp->closid; + for_each_enabled_rdt_resource(r) { + if (closid < r->num_closid) + show_doms(s, r, closid); + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c7efbcfbeda6..87cc9ab7a13c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -14,7 +14,7 @@ #include <linux/init.h> #include <linux/debugfs.h> #include <asm/mce.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "mce-internal.h" diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 132e1ec67da0..537c6647d84c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -516,7 +516,7 @@ int mce_available(struct cpuinfo_x86 *c) static void mce_schedule_work(void) { - if (!mce_gen_pool_empty() && keventd_up()) + if (!mce_gen_pool_empty()) schedule_work(&mce_work); } @@ -1373,20 +1373,15 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; -static void __restart_timer(struct timer_list *t, unsigned long interval) +static void __start_timer(struct timer_list *t, unsigned long interval) { unsigned long when = jiffies + interval; unsigned long flags; local_irq_save(flags); - if (timer_pending(t)) { - if (time_before(when, t->expires)) - mod_timer(t, when); - } else { - t->expires = round_jiffies(when); - add_timer_on(t, smp_processor_id()); - } + if (!timer_pending(t) || time_before(when, t->expires)) + mod_timer(t, round_jiffies(when)); local_irq_restore(flags); } @@ -1421,7 +1416,7 @@ static void mce_timer_fn(unsigned long data) done: __this_cpu_write(mce_next_interval, iv); - __restart_timer(t, iv); + __start_timer(t, iv); } /* @@ -1432,7 +1427,7 @@ void mce_timer_kick(unsigned long interval) struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned long iv = __this_cpu_read(mce_next_interval); - __restart_timer(t, interval); + __start_timer(t, interval); if (interval < iv) __this_cpu_write(mce_next_interval, interval); @@ -1779,17 +1774,15 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) } } -static void mce_start_timer(unsigned int cpu, struct timer_list *t) +static void mce_start_timer(struct timer_list *t) { unsigned long iv = check_interval * HZ; if (mca_cfg.ignore_ce || !iv) return; - per_cpu(mce_next_interval, cpu) = iv; - - t->expires = round_jiffies(jiffies + iv); - add_timer_on(t, cpu); + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); } static void __mcheck_cpu_setup_timer(void) @@ -1806,7 +1799,7 @@ static void __mcheck_cpu_init_timer(void) unsigned int cpu = smp_processor_id(); setup_pinned_timer(t, mce_timer_fn, cpu); - mce_start_timer(cpu, t); + mce_start_timer(t); } /* Handle unconfigured int18 (should never happen) */ @@ -2566,7 +2559,7 @@ static int mce_cpu_dead(unsigned int cpu) static int mce_cpu_online(unsigned int cpu) { - struct timer_list *t = &per_cpu(mce_timer, cpu); + struct timer_list *t = this_cpu_ptr(&mce_timer); int ret; mce_device_create(cpu); @@ -2577,13 +2570,13 @@ static int mce_cpu_online(unsigned int cpu) return ret; } mce_reenable_cpu(); - mce_start_timer(cpu, t); + mce_start_timer(t); return 0; } static int mce_cpu_pre_down(unsigned int cpu) { - struct timer_list *t = &per_cpu(mce_timer, cpu); + struct timer_list *t = this_cpu_ptr(&mce_timer); mce_disable_cpu(); del_timer_sync(t); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ffacfdcacb85..a5fd137417a2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1182,6 +1182,9 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) const char *name = get_name(bank, NULL); int err = 0; + if (!dev) + return -ENODEV; + if (is_shared_bank(bank)) { nb = node_to_amd_nb(amd_get_nb_id(cpu)); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 6f353bdb3a25..079e81733a58 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -116,10 +116,11 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, /* * This scans the ucode blob for the proper container as we can have multiple - * containers glued together. + * containers glued together. Returns the equivalence ID from the equivalence + * table or 0 if none found. */ -static struct container -find_proper_container(u8 *ucode, size_t size, u16 *ret_id) +static u16 +find_proper_container(u8 *ucode, size_t size, struct container *ret_cont) { struct container ret = { NULL, 0 }; u32 eax, ebx, ecx, edx; @@ -138,7 +139,7 @@ find_proper_container(u8 *ucode, size_t size, u16 *ret_id) if (header[0] != UCODE_MAGIC || header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ header[2] == 0) /* size */ - return ret; + return eq_id; eax = 0x00000001; ecx = 0; @@ -163,8 +164,9 @@ find_proper_container(u8 *ucode, size_t size, u16 *ret_id) * ucode update loop below */ left = ret.size - offset; - *ret_id = eq_id; - return ret; + + *ret_cont = ret; + return eq_id; } /* @@ -189,7 +191,7 @@ find_proper_container(u8 *ucode, size_t size, u16 *ret_id) ucode = data; } - return ret; + return eq_id; } static int __apply_microcode_amd(struct microcode_amd *mc_amd) @@ -214,17 +216,18 @@ static int __apply_microcode_amd(struct microcode_amd *mc_amd) * and on 32-bit during save_microcode_in_initrd_amd() -- we can call * load_microcode_amd() to save equivalent cpu table and microcode patches in * kernel heap memory. + * + * Returns true if container found (sets @ret_cont), false otherwise. */ -static struct container -apply_microcode_early_amd(void *ucode, size_t size, bool save_patch) +static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch, + struct container *ret_cont) { - struct container ret = { NULL, 0 }; u8 (*patch)[PATCH_MAX_SIZE]; + u32 rev, *header, *new_rev; + struct container ret; int offset, left; - u32 rev, *header; - u8 *data; u16 eq_id = 0; - u32 *new_rev; + u8 *data; #ifdef CONFIG_X86_32 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); @@ -235,11 +238,11 @@ apply_microcode_early_amd(void *ucode, size_t size, bool save_patch) #endif if (check_current_patch_level(&rev, true)) - return (struct container){ NULL, 0 }; + return false; - ret = find_proper_container(ucode, size, &eq_id); + eq_id = find_proper_container(ucode, size, &ret); if (!eq_id) - return (struct container){ NULL, 0 }; + return false; this_equiv_id = eq_id; header = (u32 *)ret.data; @@ -273,7 +276,11 @@ apply_microcode_early_amd(void *ucode, size_t size, bool save_patch) data += offset; left -= offset; } - return ret; + + if (ret_cont) + *ret_cont = ret; + + return true; } static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) @@ -294,6 +301,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) void __init load_ucode_amd_bsp(unsigned int family) { struct ucode_cpu_info *uci; + u32 eax, ebx, ecx, edx; struct cpio_data cp; const char *path; bool use_pa; @@ -315,9 +323,12 @@ void __init load_ucode_amd_bsp(unsigned int family) return; /* Get BSP's CPUID.EAX(1), needed in load_microcode_amd() */ - uci->cpu_sig.sig = cpuid_eax(1); + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + uci->cpu_sig.sig = eax; - apply_microcode_early_amd(cp.data, cp.size, true); + apply_microcode_early_amd(cp.data, cp.size, true, NULL); } #ifdef CONFIG_X86_32 @@ -349,7 +360,7 @@ void load_ucode_amd_ap(unsigned int family) * This would set amd_ucode_patch above so that the following APs can * use it directly instead of going down this path again. */ - apply_microcode_early_amd(cp.data, cp.size, true); + apply_microcode_early_amd(cp.data, cp.size, true, NULL); } #else void load_ucode_amd_ap(unsigned int family) @@ -373,8 +384,9 @@ void load_ucode_amd_ap(unsigned int family) reget: if (!get_builtin_microcode(&cp, family)) { #ifdef CONFIG_BLK_DEV_INITRD - cp = find_cpio_data(ucode_path, (void *)initrd_start, - initrd_end - initrd_start, NULL); + if (!initrd_gone) + cp = find_cpio_data(ucode_path, (void *)initrd_start, + initrd_end - initrd_start, NULL); #endif if (!(cp.data && cp.size)) { /* @@ -387,8 +399,7 @@ reget: } } - cont = apply_microcode_early_amd(cp.data, cp.size, false); - if (!(cont.data && cont.size)) { + if (!apply_microcode_early_amd(cp.data, cp.size, false, &cont)) { cont.size = -1; return; } @@ -443,7 +454,7 @@ int __init save_microcode_in_initrd_amd(unsigned int fam) return -EINVAL; } - cont = find_proper_container(cp.data, cp.size, &eq_id); + eq_id = find_proper_container(cp.data, cp.size, &cont); if (!eq_id) { cont.size = -1; return -EINVAL; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 6996413c78c3..73102d932760 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -44,7 +44,9 @@ #define DRIVER_VERSION "2.2" static struct microcode_ops *microcode_ops; -static bool dis_ucode_ldr; +static bool dis_ucode_ldr = true; + +bool initrd_gone; LIST_HEAD(microcode_cache); @@ -76,6 +78,7 @@ struct cpu_info_ctx { static bool __init check_loader_disabled_bsp(void) { static const char *__dis_opt_str = "dis_ucode_ldr"; + u32 a, b, c, d; #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); @@ -88,8 +91,23 @@ static bool __init check_loader_disabled_bsp(void) bool *res = &dis_ucode_ldr; #endif - if (cmdline_find_option_bool(cmdline, option)) - *res = true; + if (!have_cpuid_p()) + return *res; + + a = 1; + c = 0; + native_cpuid(&a, &b, &c, &d); + + /* + * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not + * completely accurate as xen pv guests don't see that CPUID bit set but + * that's good enough as they don't land on the BSP path anyway. + */ + if (c & BIT(31)) + return *res; + + if (cmdline_find_option_bool(cmdline, option) <= 0) + *res = false; return *res; } @@ -121,9 +139,6 @@ void __init load_ucode_bsp(void) if (check_loader_disabled_bsp()) return; - if (!have_cpuid_p()) - return; - vendor = x86_cpuid_vendor(); family = x86_cpuid_family(); @@ -157,9 +172,6 @@ void load_ucode_ap(void) if (check_loader_disabled_ap()) return; - if (!have_cpuid_p()) - return; - vendor = x86_cpuid_vendor(); family = x86_cpuid_family(); @@ -180,21 +192,24 @@ void load_ucode_ap(void) static int __init save_microcode_in_initrd(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + int ret = -EINVAL; switch (c->x86_vendor) { case X86_VENDOR_INTEL: if (c->x86 >= 6) - return save_microcode_in_initrd_intel(); + ret = save_microcode_in_initrd_intel(); break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(c->x86); + ret = save_microcode_in_initrd_amd(c->x86); break; default: break; } - return -EINVAL; + initrd_gone = true; + + return ret; } struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) @@ -233,15 +248,20 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) # endif /* - * Did we relocate the ramdisk? + * Fixup the start address: after reserve_initrd() runs, initrd_start + * has the virtual address of the beginning of the initrd. It also + * possibly relocates the ramdisk. In either case, initrd_start contains + * the updated address so use that instead. * - * So we possibly relocate the ramdisk *after* applying microcode on the - * BSP so we rely on use_pa (use physical addresses) - even if it is not - * absolutely correct - to determine whether we've done the ramdisk - * relocation already. + * initrd_gone is for the hotplug case where we've thrown out initrd + * already. */ - if (!use_pa && relocated_ramdisk) - start = initrd_start; + if (!use_pa) { + if (initrd_gone) + return (struct cpio_data){ NULL, 0, "" }; + if (initrd_start) + start = initrd_start; + } return find_cpio_data(path, (void *)start, size, NULL); #else /* !CONFIG_BLK_DEV_INITRD */ diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 54d50c3694d8..8325d8a09ab0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -41,7 +41,7 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; -/* Current microcode patch used in early patching */ +/* Current microcode patch used in early patching on the APs. */ struct microcode_intel *intel_ucode_patch; static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, @@ -150,7 +150,7 @@ static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) { struct ucode_patch *p; - p = kzalloc(size, GFP_KERNEL); + p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -390,15 +390,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); } - native_wrmsrl(MSR_IA32_UCODE_REV, 0); - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - csig.rev = val[1]; + csig.rev = intel_get_microcode_revision(); uci->cpu_sig = csig; uci->valid = 1; @@ -582,7 +575,7 @@ static inline void print_ucode(struct ucode_cpu_info *uci) static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { struct microcode_intel *mc; - unsigned int val[2]; + u32 rev; mc = uci->mc; if (!mc) @@ -590,21 +583,16 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) /* write microcode via MSR 0x79 */ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); - native_wrmsrl(MSR_IA32_UCODE_REV, 0); - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc->hdr.rev) + rev = intel_get_microcode_revision(); + if (rev != mc->hdr.rev) return -1; #ifdef CONFIG_X86_64 /* Flush global tlb. This is precaution. */ flush_tlb_early(); #endif - uci->cpu_sig.rev = val[1]; + uci->cpu_sig.rev = rev; if (early) print_ucode(uci); @@ -619,12 +607,6 @@ int __init save_microcode_in_initrd_intel(void) struct ucode_cpu_info uci; struct cpio_data cp; - /* - * AP loading didn't find any microcode patch, no need to save anything. - */ - if (!intel_ucode_patch || IS_ERR(intel_ucode_patch)) - return 0; - if (!load_builtin_intel_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path, false); @@ -640,7 +622,6 @@ int __init save_microcode_in_initrd_intel(void) return 0; } - /* * @res_patch, output: a pointer to the patch we found. */ @@ -784,8 +765,8 @@ static int apply_microcode_intel(int cpu) struct microcode_intel *mc; struct ucode_cpu_info *uci; struct cpuinfo_x86 *c; - unsigned int val[2]; static int prev_rev; + u32 rev; /* We should bind the task to the CPU */ if (WARN_ON(raw_smp_processor_id() != cpu)) @@ -802,33 +783,28 @@ static int apply_microcode_intel(int cpu) /* write microcode via MSR 0x79 */ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); - wrmsrl(MSR_IA32_UCODE_REV, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + rev = intel_get_microcode_revision(); - if (val[1] != mc->hdr.rev) { + if (rev != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", cpu, mc->hdr.rev); return -1; } - if (val[1] != prev_rev) { + if (rev != prev_rev) { pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", - val[1], + rev, mc->hdr.date & 0xffff, mc->hdr.date >> 24, (mc->hdr.date >> 16) & 0xff); - prev_rev = val[1]; + prev_rev = rev; } c = &cpu_data(cpu); - uci->cpu_sig.rev = val[1]; - c->microcode = val[1]; + uci->cpu_sig.rev = rev; + c->microcode = rev; return 0; } @@ -840,7 +816,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; - unsigned int curr_mc_size = 0; + unsigned int curr_mc_size = 0, new_mc_size = 0; unsigned int csig, cpf; while (leftover) { @@ -881,6 +857,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; + new_mc_size = mc_size; mc = NULL; /* trigger new vmalloc */ } @@ -906,7 +883,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, * permanent memory. So it will be loaded early when a CPU is hot added * or resumes. */ - save_mc_for_early(new_mc, curr_mc_size); + save_mc_for_early(new_mc, new_mc_size); pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 6c044543545e..65e20c97e04b 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -30,6 +30,7 @@ #include <asm/apic.h> #include <asm/timer.h> #include <asm/reboot.h> +#include <asm/nmi.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -132,9 +133,9 @@ static uint32_t __init ms_hyperv_platform(void) return 0; } -static cycle_t read_hv_clock(struct clocksource *arg) +static u64 read_hv_clock(struct clocksource *arg) { - cycle_t current_tick; + u64 current_tick; /* * Read the partition counter to get the current tick count. This count * is set to 0 when the partition is created and is incremented in @@ -157,6 +158,26 @@ static unsigned char hv_get_nmi_reason(void) return 0; } +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes + * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle + * unknown NMI on the first CPU which gets it. + */ +static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) +{ + static atomic_t nmi_cpu = ATOMIC_INIT(-1); + + if (!unknown_nmi_panic) + return NMI_DONE; + + if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1) + return NMI_HANDLED; + + return NMI_DONE; +} +#endif + static void __init ms_hyperv_init_platform(void) { /* @@ -182,6 +203,9 @@ static void __init ms_hyperv_init_platform(void) pr_info("HyperV: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); } + + register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST, + "hv_nmi_unknown"); #endif if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index d1316f9c8329..d9794060fe22 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -20,12 +20,15 @@ struct cpuid_bit { /* Please keep the leaf sorted by cpuid_bit.level for faster search. */ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, - { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 650830e39e3a..3741461c63a0 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -631,9 +631,9 @@ static int determine_backup_region(u64 start, u64 end, void *arg) int crash_load_segments(struct kimage *image) { - unsigned long src_start, src_sz, elf_sz; - void *elf_addr; int ret; + struct kexec_buf kbuf = { .image = image, .buf_min = 0, + .buf_max = ULONG_MAX, .top_down = false }; /* * Determine and load a segment for backup area. First 640K RAM @@ -647,43 +647,44 @@ int crash_load_segments(struct kimage *image) if (ret < 0) return ret; - src_start = image->arch.backup_src_start; - src_sz = image->arch.backup_src_sz; - /* Add backup segment. */ - if (src_sz) { + if (image->arch.backup_src_sz) { + kbuf.buffer = &crash_zero_bytes; + kbuf.bufsz = sizeof(crash_zero_bytes); + kbuf.memsz = image->arch.backup_src_sz; + kbuf.buf_align = PAGE_SIZE; /* * Ideally there is no source for backup segment. This is * copied in purgatory after crash. Just add a zero filled * segment for now to make sure checksum logic works fine. */ - ret = kexec_add_buffer(image, (char *)&crash_zero_bytes, - sizeof(crash_zero_bytes), src_sz, - PAGE_SIZE, 0, -1, 0, - &image->arch.backup_load_addr); + ret = kexec_add_buffer(&kbuf); if (ret) return ret; + image->arch.backup_load_addr = kbuf.mem; pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", - image->arch.backup_load_addr, src_start, src_sz); + image->arch.backup_load_addr, + image->arch.backup_src_start, kbuf.memsz); } /* Prepare elf headers and add a segment */ - ret = prepare_elf_headers(image, &elf_addr, &elf_sz); + ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); if (ret) return ret; - image->arch.elf_headers = elf_addr; - image->arch.elf_headers_sz = elf_sz; + image->arch.elf_headers = kbuf.buffer; + image->arch.elf_headers_sz = kbuf.bufsz; - ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz, - ELF_CORE_HEADER_ALIGN, 0, -1, 0, - &image->arch.elf_load_addr); + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + ret = kexec_add_buffer(&kbuf); if (ret) { vfree((void *)image->arch.elf_headers); return ret; } + image->arch.elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->arch.elf_load_addr, elf_sz, elf_sz); + image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz); return ret; } diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 11891ca7b716..538fedea9b3f 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -10,7 +10,7 @@ #include <linux/highmem.h> #include <linux/crash_dump.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static void *kdump_buf_page; diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index f6dfd9334b67..b2f7207ba86c 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -3,7 +3,7 @@ #include <linux/init_task.h> #include <linux/fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e4e97a5355ce..de7234401275 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -9,6 +9,7 @@ #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> +#include <asm/fpu/xstate.h> #include <asm/traps.h> #include <linux/hardirq.h> @@ -183,7 +184,8 @@ void fpstate_init(union fpregs_state *state) * it will #GP. Make sure it is replaced after the memset(). */ if (static_cpu_has(X86_FEATURE_XSAVES)) - state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT; + state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | + xfeatures_mask; if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 90de28841242..b467b14b03eb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -298,12 +298,13 @@ ENTRY(start_cpu) * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, * address given in m16:64. */ - call 1f # put return address on stack for unwinder -1: xorq %rbp, %rbp # clear frame pointer + pushq $.Lafter_lret # put return address on stack for unwinder + xorq %rbp, %rbp # clear frame pointer movq initial_code(%rip), %rax pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq +.Lafter_lret: ENDPROC(start_cpu) #include "verify_cpu.S" diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 274fab99169d..dc6ba5bda9fc 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -352,6 +352,7 @@ static int hpet_resume(struct clock_event_device *evt, int timer) } else { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq)); irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); disable_irq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); @@ -791,7 +792,7 @@ static union hpet_lock hpet __cacheline_aligned = { { .lock = __ARCH_SPIN_LOCK_UNLOCKED, }, }; -static cycle_t read_hpet(struct clocksource *cs) +static u64 read_hpet(struct clocksource *cs) { unsigned long flags; union hpet_lock old, new; @@ -802,7 +803,7 @@ static cycle_t read_hpet(struct clocksource *cs) * Read HPET directly if in NMI. */ if (in_nmi()) - return (cycle_t)hpet_readl(HPET_COUNTER); + return (u64)hpet_readl(HPET_COUNTER); /* * Read the current state of the lock and HPET value atomically. @@ -821,7 +822,7 @@ static cycle_t read_hpet(struct clocksource *cs) WRITE_ONCE(hpet.value, new.value); arch_spin_unlock(&hpet.lock); local_irq_restore(flags); - return (cycle_t)new.value; + return (u64)new.value; } local_irq_restore(flags); @@ -843,15 +844,15 @@ contended: new.lockval = READ_ONCE(hpet.lockval); } while ((new.value == old.value) && arch_spin_is_locked(&new.lock)); - return (cycle_t)new.value; + return (u64)new.value; } #else /* * For UP or 32-bit. */ -static cycle_t read_hpet(struct clocksource *cs) +static u64 read_hpet(struct clocksource *cs) { - return (cycle_t)hpet_readl(HPET_COUNTER); + return (u64)hpet_readl(HPET_COUNTER); } #endif @@ -867,7 +868,7 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { u64 start, now; - cycle_t t1; + u64 t1; /* Start the counter */ hpet_restart_counter(); @@ -1051,11 +1052,11 @@ static __init int hpet_late_init(void) return 0; /* This notifier should be called after workqueue is ready */ - ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "AP_X86_HPET_ONLINE", + ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online", hpet_cpuhp_online, NULL); if (ret) return ret; - ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "X86_HPET_DEAD", NULL, + ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "x86/hpet:dead", NULL, hpet_cpuhp_dead); if (ret) goto err_cpuhp; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 3407b148c240..d0a814a9d96a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -331,17 +331,17 @@ static void *bzImage64_load(struct kimage *image, char *kernel, struct setup_header *header; int setup_sects, kern16_size, ret = 0; - unsigned long setup_header_size, params_cmdline_sz, params_misc_sz; + unsigned long setup_header_size, params_cmdline_sz; struct boot_params *params; unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; unsigned long purgatory_load_addr; - unsigned long kernel_bufsz, kernel_memsz, kernel_align; - char *kernel_buf; struct bzimage64_data *ldata; struct kexec_entry64_regs regs64; void *stack; unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; + struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX, + .top_down = true }; header = (struct setup_header *)(kernel + setup_hdr_offset); setup_sects = header->setup_sects; @@ -402,11 +402,11 @@ static void *bzImage64_load(struct kimage *image, char *kernel, params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + MAX_ELFCOREHDR_STR_LEN; params_cmdline_sz = ALIGN(params_cmdline_sz, 16); - params_misc_sz = params_cmdline_sz + efi_map_sz + + kbuf.bufsz = params_cmdline_sz + efi_map_sz + sizeof(struct setup_data) + sizeof(struct efi_setup_data); - params = kzalloc(params_misc_sz, GFP_KERNEL); + params = kzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -418,37 +418,41 @@ static void *bzImage64_load(struct kimage *image, char *kernel, /* Is there a limit on setup header size? */ memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); - ret = kexec_add_buffer(image, (char *)params, params_misc_sz, - params_misc_sz, 16, MIN_BOOTPARAM_ADDR, - ULONG_MAX, 1, &bootparam_load_addr); + kbuf.buffer = params; + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = 16; + kbuf.buf_min = MIN_BOOTPARAM_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + bootparam_load_addr = kbuf.mem; pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - bootparam_load_addr, params_misc_sz, params_misc_sz); + bootparam_load_addr, kbuf.bufsz, kbuf.bufsz); /* Load kernel */ - kernel_buf = kernel + kern16_size; - kernel_bufsz = kernel_len - kern16_size; - kernel_memsz = PAGE_ALIGN(header->init_size); - kernel_align = header->kernel_alignment; - - ret = kexec_add_buffer(image, kernel_buf, - kernel_bufsz, kernel_memsz, kernel_align, - MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1, - &kernel_load_addr); + kbuf.buffer = kernel + kern16_size; + kbuf.bufsz = kernel_len - kern16_size; + kbuf.memsz = PAGE_ALIGN(header->init_size); + kbuf.buf_align = header->kernel_alignment; + kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + kernel_load_addr = kbuf.mem; pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kernel_load_addr, kernel_memsz, kernel_memsz); + kernel_load_addr, kbuf.bufsz, kbuf.memsz); /* Load initrd high */ if (initrd) { - ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len, - PAGE_SIZE, MIN_INITRD_LOAD_ADDR, - ULONG_MAX, 1, &initrd_load_addr); + kbuf.buffer = initrd; + kbuf.bufsz = kbuf.memsz = initrd_len; + kbuf.buf_align = PAGE_SIZE; + kbuf.buf_min = MIN_INITRD_LOAD_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + initrd_load_addr = kbuf.mem; pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", initrd_load_addr, initrd_len, initrd_len); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d9d8d16b69db..eb3509338ae0 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -56,7 +56,7 @@ #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3bb4c5f021f6..3d1bee9d6a72 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -33,7 +33,7 @@ #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 60b9949f1e65..2a5cafdf8808 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -32,7 +32,7 @@ static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; -static cycle_t kvm_sched_clock_offset; +static u64 kvm_sched_clock_offset; static int parse_no_kvmclock(char *arg) { @@ -79,10 +79,10 @@ static int kvm_set_wallclock(const struct timespec *now) return -1; } -static cycle_t kvm_clock_read(void) +static u64 kvm_clock_read(void) { struct pvclock_vcpu_time_info *src; - cycle_t ret; + u64 ret; int cpu; preempt_disable_notrace(); @@ -93,12 +93,12 @@ static cycle_t kvm_clock_read(void) return ret; } -static cycle_t kvm_clock_get_cycles(struct clocksource *cs) +static u64 kvm_clock_get_cycles(struct clocksource *cs) { return kvm_clock_read(); } -static cycle_t kvm_sched_clock_read(void) +static u64 kvm_sched_clock_read(void) { return kvm_clock_read() - kvm_sched_clock_offset; } diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 8c1f218926d7..307b1f4543de 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -328,7 +328,7 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { - VMCOREINFO_SYMBOL(phys_base); + VMCOREINFO_NUMBER(phys_base); VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA @@ -337,9 +337,7 @@ void arch_crash_save_vmcoreinfo(void) #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET); - VMCOREINFO_VMALLOC_START(VMALLOC_START); - VMCOREINFO_VMEMMAP_START(VMEMMAP_START); + VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index f5e3ff835cc8..ef688804f80d 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -224,7 +224,6 @@ static int __init msr_init(void) return 0; out_class: - cpuhp_remove_state(cpuhp_msr_state); class_destroy(msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index d33ef165b1f8..553acbbb4d32 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -68,7 +68,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, #endif default: -patch_default: +patch_default: __maybe_unused ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index f4fcf26c9fce..11aaf1eaa0e4 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -80,7 +80,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, #endif default: -patch_default: +patch_default: __maybe_unused ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index b47edb8f5256..410efb2c7b80 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -68,12 +68,10 @@ static struct dma_map_ops swiotlb_dma_ops = { */ int __init pci_swiotlb_detect_override(void) { - int use_swiotlb = swiotlb | swiotlb_force; - - if (swiotlb_force) + if (swiotlb_force == SWIOTLB_FORCE) swiotlb = 1; - return use_swiotlb; + return swiotlb; } IOMMU_INIT_FINISH(pci_swiotlb_detect_override, pci_xen_swiotlb_detect, diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 24a50301f150..91271122f0df 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -6,6 +6,7 @@ void __init x86_early_init_platform_quirks(void) { + x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT; x86_platform.legacy.rtc = 1; x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 1; @@ -16,10 +17,14 @@ void __init x86_early_init_platform_quirks(void) break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + break; case X86_SUBARCH_INTEL_MID: case X86_SUBARCH_CE4100: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; break; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 43c36d8a6ae2..b615a1113f58 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -23,7 +23,7 @@ #include <asm/cpu.h> #include <asm/apic.h> #include <asm/syscalls.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mwait.h> #include <asm/fpu/internal.h> #include <asm/debugreg.h> @@ -235,6 +235,7 @@ static inline void play_dead(void) void arch_cpu_idle_enter(void) { + tsc_verify_tsc_adjust(false); local_touch_nmi(); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d0d744108594..a0ac3e81518a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -53,6 +53,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> +#include <asm/intel_rdt.h> void __show_regs(struct pt_regs *regs, int all) { @@ -296,5 +297,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); + /* Load the Intel cache allocation PQR MSR. */ + intel_rdt_sched_in(); + return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a76b65e3e615..a61e141b6891 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -49,6 +49,7 @@ #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> +#include <asm/intel_rdt.h> __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); @@ -476,6 +477,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) loadsegment(ss, __KERNEL_DS); } + /* Load the Intel cache allocation PQR MSR. */ + intel_rdt_sched_in(); + return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0e63c0267f99..9cc7d5a330ef 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -24,7 +24,7 @@ #include <linux/export.h> #include <linux/context_tracking.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/fpu/internal.h> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 5b2cc889ce34..9e93fe5803b4 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -71,10 +71,10 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) return flags & valid_flags; } -cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { unsigned version; - cycle_t ret; + u64 ret; u64 last; u8 flags; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0c37d4fd01b2..99b920d0e516 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -103,7 +103,6 @@ static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; -static bool logical_packages_frozen __read_mostly; /* Maximum number of SMT threads on any online core */ int __max_smt_threads __read_mostly; @@ -273,9 +272,14 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -int topology_update_package_map(unsigned int apicid, unsigned int cpu) +/** + * topology_update_package_map - Update the physical to logical package map + * @pkg: The physical package id as retrieved via CPUID + * @cpu: The cpu for which this is updated + */ +int topology_update_package_map(unsigned int pkg, unsigned int cpu) { - unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits; + unsigned int new; /* Called from early boot ? */ if (!physical_package_map) @@ -288,16 +292,17 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - if (logical_packages_frozen) { - physical_to_logical_pkg[pkg] = -1; - pr_warn("APIC(%x) Package %u exceeds logical package max\n", - apicid, pkg); + if (logical_packages >= __max_logical_packages) { + pr_warn("Package %u of CPU %u exceeds BIOS package data %u.\n", + logical_packages, cpu, __max_logical_packages); return -ENOSPC; } new = logical_packages++; - pr_info("APIC(%x) Converting physical %u to logical package %u\n", - apicid, pkg, new); + if (new != pkg) { + pr_info("CPU %u Converting physical %u to logical package %u\n", + cpu, pkg, new); + } physical_to_logical_pkg[pkg] = new; found: @@ -318,9 +323,9 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg) } EXPORT_SYMBOL(topology_phys_to_logical_pkg); -static void __init smp_init_package_map(void) +static void __init smp_init_package_map(struct cpuinfo_x86 *c, unsigned int cpu) { - unsigned int ncpus, cpu; + unsigned int ncpus; size_t size; /* @@ -365,27 +370,9 @@ static void __init smp_init_package_map(void) size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); physical_package_map = kzalloc(size, GFP_KERNEL); - for_each_present_cpu(cpu) { - unsigned int apicid = apic->cpu_present_to_apicid(cpu); - - if (apicid == BAD_APICID || !apic->apic_id_valid(apicid)) - continue; - if (!topology_update_package_map(apicid, cpu)) - continue; - pr_warn("CPU %u APICId %x disabled\n", cpu, apicid); - per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID; - set_cpu_possible(cpu, false); - set_cpu_present(cpu, false); - } - - if (logical_packages > __max_logical_packages) { - pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n", - logical_packages, __max_logical_packages); - logical_packages_frozen = true; - __max_logical_packages = logical_packages; - } - pr_info("Max logical packages: %u\n", __max_logical_packages); + + topology_update_package_map(c->phys_proc_id, cpu); } void __init smp_store_boot_cpu_info(void) @@ -395,7 +382,7 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; - smp_init_package_map(); + smp_init_package_map(c, id); } /* @@ -446,9 +433,15 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && - per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && - c->cpu_core_id == o->cpu_core_id) - return topology_sane(c, o, "smt"); + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { + if (c->cpu_core_id == o->cpu_core_id) + return topology_sane(c, o, "smt"); + + if ((c->cu_id != 0xff) && + (o->cu_id != 0xff) && + (c->cu_id == o->cu_id)) + return topology_sane(c, o, "smt"); + } } else if (c->phys_proc_id == o->phys_proc_id && c->cpu_core_id == o->cpu_core_id) { @@ -1476,15 +1469,15 @@ __init void prefill_possible_map(void) possible = i; } + nr_cpu_ids = possible; + pr_info("Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); + reset_cpu_possible_mask(); + for (i = 0; i < possible; i++) set_cpu_possible(i, true); - for (; i < NR_CPUS; i++) - set_cpu_possible(i, false); - - nr_cpu_ids = possible; } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 8402907825b0..b868fa1b812b 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -408,7 +408,7 @@ static __init int tboot_late_init(void) tboot_create_trampoline(); atomic_set(&ap_wfs_count, 0); - cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "AP_X86_TBOOT_DYING", NULL, + cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "x86/tboot:dying", NULL, tboot_dying_cpu); #ifdef CONFIG_DEBUG_FS debugfs_create_file("tboot_log", S_IRUSR, diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 27538f183c3b..a3b875c9e6af 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -13,7 +13,7 @@ #include <linux/sort.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/asm.h> extern int rodata_test_data; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9692a5e9fdab..6c8934406dc9 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -5,7 +5,7 @@ #include <linux/regset.h> #include <linux/syscalls.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/desc.h> #include <asm/ldt.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 1c113db9ed57..15515132bf0d 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -34,7 +34,7 @@ static void switch_idt(void *arg) local_irq_restore(flags); } -void trace_irq_vector_regfunc(void) +int trace_irq_vector_regfunc(void) { mutex_lock(&irq_vector_mutex); if (!trace_irq_vector_refcount) { @@ -44,6 +44,7 @@ void trace_irq_vector_regfunc(void) } trace_irq_vector_refcount++; mutex_unlock(&irq_vector_mutex); + return 0; } void trace_irq_vector_unregfunc(void) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 46b2f41f8b05..37e7cf544e51 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -694,6 +694,7 @@ unsigned long native_calibrate_tsc(void) crystal_khz = 24000; /* 24.0 MHz */ break; case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ATOM_DENVERTON: crystal_khz = 25000; /* 25.0 MHz */ break; case INTEL_FAM6_ATOM_GOLDMONT: @@ -702,6 +703,20 @@ unsigned long native_calibrate_tsc(void) } } + /* + * TSC frequency determined by CPUID is a "hardware reported" + * frequency and is the most accurate one so far we have. This + * is considered a known frequency. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * For Atom SoCs TSC is the only reliable clocksource. + * Mark TSC reliable so no watchdog on it. + */ + if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + return crystal_khz * ebx_numerator / eax_denominator; } @@ -1043,18 +1058,20 @@ static void detect_art(void) if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; - cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, - &art_to_tsc_numerator, unused, unused+1); - - /* Don't enable ART in a VM, non-stop TSC required */ + /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || - art_to_tsc_denominator < ART_MIN_DENOMINATOR) + !boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return; - if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset)) + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, + &art_to_tsc_numerator, unused, unused+1); + + if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) return; + rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); + /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); } @@ -1064,6 +1081,11 @@ static void detect_art(void) static struct clocksource clocksource_tsc; +static void tsc_resume(struct clocksource *cs) +{ + tsc_verify_tsc_adjust(true); +} + /* * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a @@ -1080,9 +1102,9 @@ static struct clocksource clocksource_tsc; * checking the result of read_tsc() - cycle_last for being negative. * That works because CLOCKSOURCE_MASK(64) does not mask out any bit. */ -static cycle_t read_tsc(struct clocksource *cs) +static u64 read_tsc(struct clocksource *cs) { - return (cycle_t)rdtsc_ordered(); + return (u64)rdtsc_ordered(); } /* @@ -1096,6 +1118,7 @@ static struct clocksource clocksource_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, + .resume = tsc_resume, }; void mark_tsc_unstable(char *reason) @@ -1170,7 +1193,7 @@ int unsynchronized_tsc(void) /* * Convert ART to TSC given numerator/denominator found in detect_art() */ -struct system_counterval_t convert_art_to_tsc(cycle_t art) +struct system_counterval_t convert_art_to_tsc(u64 art) { u64 tmp, res, rem; @@ -1283,10 +1306,10 @@ static int __init init_tsc_clocksource(void) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; /* - * Trust the results of the earlier calibration on systems - * exporting a reliable TSC. + * When TSC frequency is known (retrieved via MSR or CPUID), we skip + * the refined calibration and directly register it as a clocksource. */ - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } @@ -1333,6 +1356,9 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ + tsc_store_and_check_tsc_adjust(true); + /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 0fe720d64fef..19afdbd7d0a7 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -100,5 +100,24 @@ unsigned long cpu_khz_from_msr(void) #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; #endif + + /* + * TSC frequency determined by MSR is always considered "known" + * because it is reported by HW. + * Another fact is that on MSR capable platforms, PIT/HPET is + * generally not available so calibration won't work at all. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * Unfortunately there is no way for hardware to tell whether the + * TSC is reliable. We were told by silicon design team that TSC + * on Atom SoCs are always "reliable". TSC is also the only + * reliable clocksource on these SoCs (HPET is either not present + * or not functional) so mark TSC reliable which removes the + * requirement for a watchdog clocksource. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + return res; } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 78083bf23ed1..728f75378475 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -14,18 +14,166 @@ * ( The serial nature of the boot logic and the CPU hotplug lock * protects against more than 2 CPUs entering this code. ) */ +#include <linux/topology.h> #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/smp.h> #include <linux/nmi.h> #include <asm/tsc.h> +struct tsc_adjust { + s64 bootval; + s64 adjusted; + unsigned long nextcheck; + bool warned; +}; + +static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); + +void tsc_verify_tsc_adjust(bool resume) +{ + struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust); + s64 curval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return; + + /* Rate limit the MSR check */ + if (!resume && time_before(jiffies, adj->nextcheck)) + return; + + adj->nextcheck = jiffies + HZ; + + rdmsrl(MSR_IA32_TSC_ADJUST, curval); + if (adj->adjusted == curval) + return; + + /* Restore the original value */ + wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted); + + if (!adj->warned || resume) { + pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n", + smp_processor_id(), adj->adjusted, curval); + adj->warned = true; + } +} + +static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, + unsigned int cpu, bool bootcpu) +{ + /* + * First online CPU in a package stores the boot value in the + * adjustment value. This value might change later via the sync + * mechanism. If that fails we still can yell about boot values not + * being consistent. + * + * On the boot cpu we just force set the ADJUST value to 0 if it's + * non zero. We don't do that on non boot cpus because physical + * hotplug should have set the ADJUST register to a value > 0 so + * the TSC is in sync with the already running cpus. + * + * But we always force positive ADJUST values. Otherwise the TSC + * deadline timer creates an interrupt storm. We also have to + * prevent values > 0x7FFFFFFF as those wreckage the timer as well. + */ + if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) || + (bootval > 0x7FFFFFFF)) { + pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, + bootval); + wrmsrl(MSR_IA32_TSC_ADJUST, 0); + bootval = 0; + } + cur->adjusted = bootval; +} + +#ifndef CONFIG_SMP +bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) +{ + struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return false; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + cur->nextcheck = jiffies + HZ; + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu); + return false; +} + +#else /* !CONFIG_SMP */ + +/* + * Store and check the TSC ADJUST MSR if available + */ +bool tsc_store_and_check_tsc_adjust(bool bootcpu) +{ + struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); + unsigned int refcpu, cpu = smp_processor_id(); + struct cpumask *mask; + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return false; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + cur->nextcheck = jiffies + HZ; + cur->warned = false; + + /* + * Check whether this CPU is the first in a package to come up. In + * this case do not check the boot value against another package + * because the new package might have been physically hotplugged, + * where TSC_ADJUST is expected to be different. When called on the + * boot CPU topology_core_cpumask() might not be available yet. + */ + mask = topology_core_cpumask(cpu); + refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids; + + if (refcpu >= nr_cpu_ids) { + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), + bootcpu); + return false; + } + + ref = per_cpu_ptr(&tsc_adjust, refcpu); + /* + * Compare the boot value and complain if it differs in the + * package. + */ + if (bootval != ref->bootval) { + pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n", + refcpu, ref->bootval, cpu, bootval); + } + /* + * The TSC_ADJUST values in a package must be the same. If the boot + * value on this newly upcoming CPU differs from the adjustment + * value of the already online CPU in this package, set it to that + * adjusted value. + */ + if (bootval != ref->adjusted) { + pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n", + refcpu, ref->adjusted, cpu, bootval); + cur->adjusted = ref->adjusted; + wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); + } + /* + * We have the TSCs forced to be in sync on this package. Skip sync + * test: + */ + return true; +} + /* * Entry/exit counters that make sure that both CPUs * run the measurement code at once: */ static atomic_t start_count; static atomic_t stop_count; +static atomic_t skip_test; +static atomic_t test_runs; /* * We use a raw spinlock in this exceptional case, because @@ -37,15 +185,16 @@ static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; static cycles_t last_tsc; static cycles_t max_warp; static int nr_warps; +static int random_warps; /* * TSC-warp measurement loop running on both CPUs. This is not called * if there is no TSC. */ -static void check_tsc_warp(unsigned int timeout) +static cycles_t check_tsc_warp(unsigned int timeout) { - cycles_t start, now, prev, end; - int i; + cycles_t start, now, prev, end, cur_max_warp = 0; + int i, cur_warps = 0; start = rdtsc_ordered(); /* @@ -85,13 +234,22 @@ static void check_tsc_warp(unsigned int timeout) if (unlikely(prev > now)) { arch_spin_lock(&sync_lock); max_warp = max(max_warp, prev - now); + cur_max_warp = max_warp; + /* + * Check whether this bounces back and forth. Only + * one CPU should observe time going backwards. + */ + if (cur_warps != nr_warps) + random_warps++; nr_warps++; + cur_warps = nr_warps; arch_spin_unlock(&sync_lock); } } WARN(!(now-start), "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", now-start, end-start); + return cur_max_warp; } /* @@ -128,23 +286,27 @@ void check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; - if (tsc_clocksource_reliable) { - if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) - pr_info( - "Skipped synchronization checks as TSC is reliable.\n"); - return; - } - /* - * Reset it - in case this is a second bootup: + * Set the maximum number of test runs to + * 1 if the CPU does not provide the TSC_ADJUST MSR + * 3 if the MSR is available, so the target can try to adjust */ - atomic_set(&stop_count, 0); - + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + atomic_set(&test_runs, 1); + else + atomic_set(&test_runs, 3); +retry: /* - * Wait for the target to arrive: + * Wait for the target to start or to skip the test: */ - while (atomic_read(&start_count) != cpus-1) + while (atomic_read(&start_count) != cpus - 1) { + if (atomic_read(&skip_test) > 0) { + atomic_set(&skip_test, 0); + return; + } cpu_relax(); + } + /* * Trigger the target to continue into the measurement too: */ @@ -155,21 +317,35 @@ void check_tsc_sync_source(int cpu) while (atomic_read(&stop_count) != cpus-1) cpu_relax(); - if (nr_warps) { + /* + * If the test was successful set the number of runs to zero and + * stop. If not, decrement the number of runs an check if we can + * retry. In case of random warps no retry is attempted. + */ + if (!nr_warps) { + atomic_set(&test_runs, 0); + + pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", + smp_processor_id(), cpu); + + } else if (atomic_dec_and_test(&test_runs) || random_warps) { + /* Force it to 0 if random warps brought us here */ + atomic_set(&test_runs, 0); + pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", smp_processor_id(), cpu); pr_warning("Measured %Ld cycles TSC warp between CPUs, " "turning off TSC clock.\n", max_warp); + if (random_warps) + pr_warning("TSC warped randomly between CPUs\n"); mark_tsc_unstable("check_tsc_sync_source failed"); - } else { - pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", - smp_processor_id(), cpu); } /* * Reset it - just in case we boot another CPU later: */ atomic_set(&start_count, 0); + random_warps = 0; nr_warps = 0; max_warp = 0; last_tsc = 0; @@ -178,6 +354,12 @@ void check_tsc_sync_source(int cpu) * Let the target continue with the bootup: */ atomic_inc(&stop_count); + + /* + * Retry, if there is a chance to do so. + */ + if (atomic_read(&test_runs) > 0) + goto retry; } /* @@ -185,12 +367,30 @@ void check_tsc_sync_source(int cpu) */ void check_tsc_sync_target(void) { + struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); + unsigned int cpu = smp_processor_id(); + cycles_t cur_max_warp, gbl_max_warp; int cpus = 2; /* Also aborts if there is no TSC. */ - if (unsynchronized_tsc() || tsc_clocksource_reliable) + if (unsynchronized_tsc()) + return; + + /* + * Store, verify and sanitize the TSC adjust register. If + * successful skip the test. + * + * The test is also skipped when the TSC is marked reliable. This + * is true for SoCs which have no fallback clocksource. On these + * SoCs the TSC is frequency synchronized, but still the TSC ADJUST + * register might have been wreckaged by the BIOS.. + */ + if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) { + atomic_inc(&skip_test); return; + } +retry: /* * Register this CPU's participation and wait for the * source CPU to start the measurement: @@ -199,7 +399,12 @@ void check_tsc_sync_target(void) while (atomic_read(&start_count) != cpus) cpu_relax(); - check_tsc_warp(loop_timeout(smp_processor_id())); + cur_max_warp = check_tsc_warp(loop_timeout(cpu)); + + /* + * Store the maximum observed warp value for a potential retry: + */ + gbl_max_warp = max_warp; /* * Ok, we are done: @@ -211,4 +416,61 @@ void check_tsc_sync_target(void) */ while (atomic_read(&stop_count) != cpus) cpu_relax(); + + /* + * Reset it for the next sync test: + */ + atomic_set(&stop_count, 0); + + /* + * Check the number of remaining test runs. If not zero, the test + * failed and a retry with adjusted TSC is possible. If zero the + * test was either successful or failed terminally. + */ + if (!atomic_read(&test_runs)) + return; + + /* + * If the warp value of this CPU is 0, then the other CPU + * observed time going backwards so this TSC was ahead and + * needs to move backwards. + */ + if (!cur_max_warp) + cur_max_warp = -gbl_max_warp; + + /* + * Add the result to the previous adjustment value. + * + * The adjustement value is slightly off by the overhead of the + * sync mechanism (observed values are ~200 TSC cycles), but this + * really depends on CPU, node distance and frequency. So + * compensating for this is hard to get right. Experiments show + * that the warp is not longer detectable when the observed warp + * value is used. In the worst case the adjustment needs to go + * through a 3rd run for fine tuning. + */ + cur->adjusted += cur_max_warp; + + /* + * TSC deadline timer stops working or creates an interrupt storm + * with adjust values < 0 and > x07ffffff. + * + * To allow adjust values > 0x7FFFFFFF we need to disable the + * deadline timer and use the local APIC timer, but that requires + * more intrusive changes and we do not have any useful information + * from Intel about the underlying HW wreckage yet. + */ + if (cur->adjusted < 0) + cur->adjusted = 0; + if (cur->adjusted > 0x7FFFFFFF) + cur->adjusted = 0x7FFFFFFF; + + pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", + cpu, cur_max_warp, cur->adjusted); + + wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted); + goto retry; + } + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index ea7b7f9a3b9e..23d15565d02a 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -6,6 +6,52 @@ #define FRAME_HEADER_SIZE (sizeof(long) * 2) +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + +static void unwind_dump(struct unwind_state *state, unsigned long *sp) +{ + static bool dumped_before = false; + bool prev_zero, zero = false; + unsigned long word; + + if (dumped_before) + return; + + dumped_before = true; + + printk_deferred("unwind stack type:%d next_sp:%p mask:%lx graph_idx:%d\n", + state->stack_info.type, state->stack_info.next_sp, + state->stack_mask, state->graph_idx); + + for (sp = state->orig_sp; sp < state->stack_info.end; sp++) { + word = READ_ONCE_NOCHECK(*sp); + + prev_zero = zero; + zero = word == 0; + + if (zero) { + if (!prev_zero) + printk_deferred("%p: %016x ...\n", sp, 0); + continue; + } + + printk_deferred("%p: %016lx (%pB)\n", sp, word, (void *)word); + } +} + unsigned long unwind_get_return_address(struct unwind_state *state) { unsigned long addr; @@ -17,18 +63,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state) if (state->regs && user_mode(state->regs)) return 0; - addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, + addr = READ_ONCE_TASK_STACK(state->task, *addr_p); + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); - if (!__kernel_text_address(addr)) { - printk_deferred_once(KERN_WARNING - "WARNING: unrecognized kernel stack return address %p at %p in %s:%d\n", - (void *)addr, addr_p, state->task->comm, - state->task->pid); - return 0; - } - - return addr; + return __kernel_text_address(addr) ? addr : 0; } EXPORT_SYMBOL_GPL(unwind_get_return_address); @@ -46,7 +85,14 @@ static bool is_last_task_frame(struct unwind_state *state) unsigned long bp = (unsigned long)state->bp; unsigned long regs = (unsigned long)task_pt_regs(state->task); - return bp == regs - FRAME_HEADER_SIZE; + /* + * We have to check for the last task frame at two different locations + * because gcc can occasionally decide to realign the stack pointer and + * change the offset of the stack frame by a word in the prologue of a + * function called by head/entry code. + */ + return bp == regs - FRAME_HEADER_SIZE || + bp == regs - FRAME_HEADER_SIZE - sizeof(long); } /* @@ -67,6 +113,7 @@ static bool update_stack_state(struct unwind_state *state, void *addr, size_t len) { struct stack_info *info = &state->stack_info; + enum stack_type orig_type = info->type; /* * If addr isn't on the current stack, switch to the next one. @@ -80,6 +127,9 @@ static bool update_stack_state(struct unwind_state *state, void *addr, &state->stack_mask)) return false; + if (!state->orig_sp || info->type != orig_type) + state->orig_sp = addr; + return true; } @@ -128,7 +178,7 @@ bool unwind_next_frame(struct unwind_state *state) if (state->regs) next_bp = (unsigned long *)state->regs->bp; else - next_bp = (unsigned long *)*state->bp; + next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp); /* is the next frame pointer an encoded pointer to pt_regs? */ regs = decode_frame_pointer(next_bp); @@ -173,16 +223,28 @@ bool unwind_next_frame(struct unwind_state *state) return true; bad_address: + /* + * When unwinding a non-current task, the task might actually be + * running on another CPU, in which case it could be modifying its + * stack while we're reading it. This is generally not a problem and + * can be ignored as long as the caller understands that unwinding + * another task will not always succeed. + */ + if (state->task != current) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", state->regs, state->task->comm, state->task->pid, next_frame); + unwind_dump(state, (unsigned long *)state->regs); } else { printk_deferred_once(KERN_WARNING "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", state->bp, state->task->comm, state->task->pid, next_frame); + unwind_dump(state, state->bp); } the_end: state->stack_info.type = STACK_TYPE_UNKNOWN; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 01f30e56f99e..ec5d7545e6dc 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -47,7 +47,7 @@ #include <linux/slab.h> #include <linux/security.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/io.h> #include <asm/tlbflush.h> #include <asm/irq.h> diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 0bd9f1287f39..11a93f005268 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -89,7 +89,6 @@ struct x86_cpuinit_ops x86_cpuinit = { }; static void default_nmi_init(void) { }; -static int default_i8042_detect(void) { return 1; }; struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu, @@ -100,7 +99,6 @@ struct x86_platform_ops x86_platform __ro_after_init = { .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, - .i8042_detect = default_i8042_detect, .save_sched_clock_state = tsc_save_sched_clock_state, .restore_sched_clock_state = tsc_restore_sched_clock_state, }; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0aefb626fa8f..e85f6bd7b9d5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,6 +16,7 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <asm/processor.h> #include <asm/user.h> #include <asm/fpu/xstate.h> #include "cpuid.h" @@ -64,6 +65,11 @@ u64 kvm_supported_xcr0(void) #define F(x) bit(X86_FEATURE_##x) +/* These are scattered features in cpufeatures.h. */ +#define KVM_CPUID_BIT_AVX512_4VNNIW 2 +#define KVM_CPUID_BIT_AVX512_4FMAPS 3 +#define KF(x) bit(KVM_CPUID_BIT_##x) + int kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -80,6 +86,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best->ecx |= F(OSXSAVE); } + best->edx &= ~F(APIC); + if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE) + best->edx |= F(APIC); + if (apic) { if (best->ecx & F(TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -363,16 +373,21 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_cpuid_7_0_ebx_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(AVX512BW) | F(AVX512VL); + F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | + F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(SHA_NI) | F(AVX512BW) | F(AVX512VL); /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; /* cpuid 7.0.ecx*/ - const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + const u32 kvm_cpuid_7_0_ecx_x86_features = + F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/; + + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -456,12 +471,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled) entry->ecx &= ~F(PKU); + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX); } else { entry->ebx = 0; entry->ecx = 0; + entry->edx = 0; } entry->eax = 0; - entry->edx = 0; break; } case 9: @@ -861,17 +878,17 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) } EXPORT_SYMBOL_GPL(kvm_cpuid); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { - u32 function, eax, ebx, ecx, edx; + u32 eax, ebx, ecx, edx; - function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + eax = kvm_register_read(vcpu, VCPU_REGS_RAX); ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); kvm_register_write(vcpu, VCPU_REGS_RAX, eax); kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); kvm_register_write(vcpu, VCPU_REGS_RDX, edx); - kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a3ce9d260d68..cedbba0f3402 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -158,9 +158,11 @@ #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ +#define AlignMask ((u64)7 << 41) #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ -#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ -#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ +#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */ +#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */ +#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ @@ -446,6 +448,26 @@ FOP_END; FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET FOP_END; +/* + * XXX: inoutclob user must know where the argument is being expanded. + * Relying on CC_HAVE_ASM_GOTO would allow us to remove _fault. + */ +#define asm_safe(insn, inoutclob...) \ +({ \ + int _fault = 0; \ + \ + asm volatile("1:" insn "\n" \ + "2:\n" \ + ".pushsection .fixup, \"ax\"\n" \ + "3: movl $1, %[_fault]\n" \ + " jmp 2b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [_fault] "+qm"(_fault) inoutclob ); \ + \ + _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \ +}) + static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) @@ -632,21 +654,26 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, * depending on whether they're AVX encoded or not. * * Also included is CMPXCHG16B which is not a vector instruction, yet it is - * subject to the same check. + * subject to the same check. FXSAVE and FXRSTOR are checked here too as their + * 512 bytes of data must be aligned to a 16 byte boundary. */ -static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) +static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size) { - if (likely(size < 16)) - return false; + u64 alignment = ctxt->d & AlignMask; - if (ctxt->d & Aligned) - return true; - else if (ctxt->d & Unaligned) - return false; - else if (ctxt->d & Avx) - return false; - else - return true; + if (likely(size < 16)) + return 1; + + switch (alignment) { + case Unaligned: + case Avx: + return 1; + case Aligned16: + return 16; + case Aligned: + default: + return size; + } } static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, @@ -704,7 +731,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, } break; } - if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) + if (la & (insn_alignment(ctxt, size) - 1)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; bad: @@ -791,6 +818,20 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); } +static int segmented_write_std(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned int size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception); +} + /* * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. @@ -1544,7 +1585,6 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, &ctxt->exception); } -/* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, enum x86_transfer_type transfer, @@ -1581,20 +1621,34 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, rpl = selector & 3; - /* NULL selector is not valid for TR, CS and SS (except for long mode) */ - if ((seg == VCPU_SREG_CS - || (seg == VCPU_SREG_SS - && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) - || seg == VCPU_SREG_TR) - && null_selector) - goto exception; - /* TR should be in GDT only */ if (seg == VCPU_SREG_TR && (selector & (1 << 2))) goto exception; - if (null_selector) /* for NULL selector skip all following checks */ + /* NULL selector is not valid for TR, CS and (except for long mode) SS */ + if (null_selector) { + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR) + goto exception; + + if (seg == VCPU_SREG_SS) { + if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl) + goto exception; + + /* + * ctxt->ops->set_segment expects the CPL to be in + * SS.DPL, so fake an expand-up 32-bit data segment. + */ + seg_desc.type = 3; + seg_desc.p = 1; + seg_desc.s = 1; + seg_desc.dpl = cpl; + seg_desc.d = 1; + seg_desc.g = 1; + } + + /* Skip all following checks */ goto load; + } ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) @@ -1710,6 +1764,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); + + /* + * None of MOV, POP and LSS can load a NULL selector in CPL=3, but + * they can load it at CPL<3 (Intel's manual says only LSS can, + * but it's wrong). + * + * However, the Intel manual says that putting IST=1/DPL=3 in + * an interrupt gate will result in SS=3 (the AMD manual instead + * says it doesn't), so allow SS=3 in __load_segment_descriptor + * and only forbid it here. + */ + if (seg == VCPU_SREG_SS && selector == 3 && + ctxt->mode == X86EMUL_MODE_PROT64) + return emulate_exception(ctxt, GP_VECTOR, 0, true); + return __load_segment_descriptor(ctxt, selector, seg, cpl, X86_TRANSFER_NONE, NULL); } @@ -3658,8 +3727,8 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, } /* Disable writeback. */ ctxt->dst.type = OP_NONE; - return segmented_write(ctxt, ctxt->dst.addr.mem, - &desc_ptr, 2 + ctxt->op_bytes); + return segmented_write_std(ctxt, ctxt->dst.addr.mem, + &desc_ptr, 2 + ctxt->op_bytes); } static int em_sgdt(struct x86_emulate_ctxt *ctxt) @@ -3842,6 +3911,131 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int check_fxsr(struct x86_emulate_ctxt *ctxt) +{ + u32 eax = 1, ebx, ecx = 0, edx; + + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + if (!(edx & FFL(FXSR))) + return emulate_ud(ctxt); + + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + /* + * Don't emulate a case that should never be hit, instead of working + * around a lack of fxsave64/fxrstor64 on old compilers. + */ + if (ctxt->mode >= X86EMUL_MODE_PROT64) + return X86EMUL_UNHANDLEABLE; + + return X86EMUL_CONTINUE; +} + +/* + * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, + * 1) 16 bit mode + * 2) 32 bit mode + * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs + * preserve whole 32 bit values, though, so (1) and (2) are the same wrt. + * save and restore + * 3) 64-bit mode with REX.W prefix + * - like (2), but XMM 8-15 are being saved and restored + * 4) 64-bit mode without REX.W prefix + * - like (3), but FIP and FDP are 64 bit + * + * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the + * desired result. (4) is not emulated. + * + * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS + * and FPU DS) should match. + */ +static int em_fxsave(struct x86_emulate_ctxt *ctxt) +{ + struct fxregs_state fx_state; + size_t size; + int rc; + + rc = check_fxsr(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + ctxt->ops->get_fpu(ctxt); + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + + ctxt->ops->put_fpu(ctxt); + + if (rc != X86EMUL_CONTINUE) + return rc; + + if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) + size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); + else + size = offsetof(struct fxregs_state, xmm_space[0]); + + return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); +} + +static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, + struct fxregs_state *new) +{ + int rc = X86EMUL_CONTINUE; + struct fxregs_state old; + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); + if (rc != X86EMUL_CONTINUE) + return rc; + + /* + * 64 bit host will restore XMM 8-15, which is not correct on non-64 + * bit guests. Load the current values in order to preserve 64 bit + * XMMs after fxrstor. + */ +#ifdef CONFIG_X86_64 + /* XXX: accessing XMM 8-15 very awkwardly */ + memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); +#endif + + /* + * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but + * does save and restore MXCSR. + */ + if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) + memcpy(new->xmm_space, old.xmm_space, 8 * 16); + + return rc; +} + +static int em_fxrstor(struct x86_emulate_ctxt *ctxt) +{ + struct fxregs_state fx_state; + int rc; + + rc = check_fxsr(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); + if (rc != X86EMUL_CONTINUE) + return rc; + + if (fx_state.mxcsr >> 16) + return emulate_gp(ctxt, 0); + + ctxt->ops->get_fpu(ctxt); + + if (ctxt->mode < X86EMUL_MODE_PROT64) + rc = fxrstor_fixup(ctxt, &fx_state); + + if (rc == X86EMUL_CONTINUE) + rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); + + ctxt->ops->put_fpu(ctxt); + + return rc; +} + static bool valid_cr(int nr) { switch (nr) { @@ -4194,7 +4388,9 @@ static const struct gprefix pfx_0f_ae_7 = { }; static const struct group_dual group15 = { { - N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), + I(ModRM | Aligned16, em_fxsave), + I(ModRM | Aligned16, em_fxrstor), + N, N, N, N, N, GP(0, &pfx_0f_ae_7), }, { N, N, N, N, N, N, N, N, } }; @@ -5066,21 +5262,13 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { - bool fault = false; + int rc; ctxt->ops->get_fpu(ctxt); - asm volatile("1: fwait \n\t" - "2: \n\t" - ".pushsection .fixup,\"ax\" \n\t" - "3: \n\t" - "movb $1, %[fault] \n\t" - "jmp 2b \n\t" - ".popsection \n\t" - _ASM_EXTABLE(1b, 3b) - : [fault]"+qm"(fault)); + rc = asm_safe("fwait"); ctxt->ops->put_fpu(ctxt); - if (unlikely(fault)) + if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); return X86EMUL_CONTINUE; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 42b1c83741c8..1572c35b4f1a 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -291,7 +291,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) return ret; } -int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) +static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { struct kvm_vcpu *vcpu = synic_to_vcpu(synic); struct kvm_lapic_irq irq; @@ -852,6 +852,10 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) return; + mutex_lock(&kvm->arch.hyperv.hv_lock); + if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + goto out_unlock; + gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* * Because the TSC parameters only vary when there is a @@ -859,7 +863,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) - return; + goto out_unlock; /* * While we're computing and writing the parameters, force the @@ -868,15 +872,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) - return; + goto out_unlock; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) - return; + goto out_unlock; /* Ensure sequence is zero before writing the rest of the struct. */ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) - return; + goto out_unlock; /* * Now switch to the TSC page mechanism by writing the sequence. @@ -891,6 +895,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, hv->tsc_ref.tsc_sequence = tsc_seq; kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); +out_unlock: + mutex_unlock(&kvm->arch.hyperv.hv_lock); } static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, @@ -1142,9 +1148,9 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); r = kvm_hv_set_msr_pw(vcpu, msr, data, host); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else return kvm_hv_set_msr(vcpu, msr, data, host); @@ -1155,9 +1161,9 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); r = kvm_hv_get_msr_pw(vcpu, msr, pdata); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else return kvm_hv_get_msr(vcpu, msr, pdata); @@ -1165,7 +1171,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) bool kvm_hv_hypercall_enabled(struct kvm *kvm) { - return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; + return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE; } static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 16a7134eedac..a78b445ce411 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -212,7 +212,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) */ smp_mb(); if (atomic_dec_if_positive(&ps->pending) > 0) - kthread_queue_work(&pit->worker, &pit->expired); + kthread_queue_work(pit->worker, &pit->expired); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -272,7 +272,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) if (atomic_read(&ps->reinject)) atomic_inc(&ps->pending); - kthread_queue_work(&pt->worker, &pt->expired); + kthread_queue_work(pt->worker, &pt->expired); if (ps->is_periodic) { hrtimer_add_expires_ns(&ps->timer, ps->period); @@ -667,10 +667,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_nr = pid_vnr(pid); put_pid(pid); - kthread_init_worker(&pit->worker); - pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, - "kvm-pit/%d", pid_nr); - if (IS_ERR(pit->worker_task)) + pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr); + if (IS_ERR(pit->worker)) goto fail_kthread; kthread_init_work(&pit->expired, pit_do_work); @@ -713,7 +711,7 @@ fail_register_speaker: fail_register_pit: mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); - kthread_stop(pit->worker_task); + kthread_destroy_worker(pit->worker); fail_kthread: kvm_free_irq_source_id(kvm, pit->irq_source_id); fail_request: @@ -730,8 +728,7 @@ void kvm_free_pit(struct kvm *kvm) kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); - kthread_flush_work(&pit->expired); - kthread_stop(pit->worker_task); + kthread_destroy_worker(pit->worker); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 2f5af0798326..600bee9dcbbd 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -44,8 +44,7 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; - struct kthread_worker worker; - struct task_struct *worker_task; + struct kthread_worker *worker; struct kthread_work expired; }; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6f69340f9fa3..2f6ef5121a4c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -342,9 +342,11 @@ void __kvm_apic_update_irr(u32 *pir, void *regs) u32 i, pir_val; for (i = 0; i <= 7; i++) { - pir_val = xchg(&pir[i], 0); - if (pir_val) + pir_val = READ_ONCE(pir[i]); + if (pir_val) { + pir_val = xchg(&pir[i], 0); *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; + } } } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); @@ -1090,7 +1092,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) static u32 apic_get_tmcct(struct kvm_lapic *apic) { - ktime_t remaining; + ktime_t remaining, now; s64 ns; u32 tmcct; @@ -1101,9 +1103,10 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) apic->lapic_timer.period == 0) return 0; - remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); + now = ktime_get(); + remaining = ktime_sub(apic->lapic_timer.target_expiration, now); if (ktime_to_ns(remaining) < 0) - remaining = ktime_set(0, 0); + remaining = 0; ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); tmcct = div64_u64(ns, @@ -1332,7 +1335,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_save(flags); - now = apic->lapic_timer.timer.base->get_time(); + now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; @@ -1347,6 +1350,79 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_restore(flags); } +static void start_sw_period(struct kvm_lapic *apic) +{ + if (!apic->lapic_timer.period) + return; + + if (apic_lvtt_oneshot(apic) && + ktime_after(ktime_get(), + apic->lapic_timer.target_expiration)) { + apic_timer_expired(apic); + return; + } + + hrtimer_start(&apic->lapic_timer.timer, + apic->lapic_timer.target_expiration, + HRTIMER_MODE_ABS_PINNED); +} + +static bool set_target_expiration(struct kvm_lapic *apic) +{ + ktime_t now; + u64 tscl = rdtsc(); + + now = ktime_get(); + apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + + if (!apic->lapic_timer.period) + return false; + + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } + } + + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + PRIx64 ", " + "timer initial count 0x%x, period %lldns, " + "expire @ 0x%016" PRIx64 ".\n", __func__, + APIC_BUS_CYCLE_NS, ktime_to_ns(now), + kvm_lapic_get_reg(apic, APIC_TMICT), + apic->lapic_timer.period, + ktime_to_ns(ktime_add_ns(now, + apic->lapic_timer.period))); + + apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period); + + return true; +} + +static void advance_periodic_target_expiration(struct kvm_lapic *apic) +{ + apic->lapic_timer.tscdeadline += + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + apic->lapic_timer.target_expiration = + ktime_add_ns(apic->lapic_timer.target_expiration, + apic->lapic_timer.period); +} + bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) @@ -1356,52 +1432,59 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); -static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +static void cancel_hv_timer(struct kvm_lapic *apic) { kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } -void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(!apic->lapic_timer.hv_timer_in_use); - WARN_ON(swait_active(&vcpu->wq)); - cancel_hv_tscdeadline(apic); - apic_timer_expired(apic); -} -EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); - -static bool start_hv_tscdeadline(struct kvm_lapic *apic) +static bool start_hv_timer(struct kvm_lapic *apic) { u64 tscdeadline = apic->lapic_timer.tscdeadline; - if (atomic_read(&apic->lapic_timer.pending) || + if ((atomic_read(&apic->lapic_timer.pending) && + !apic_lvtt_period(apic)) || kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_tscdeadline(apic); + cancel_hv_timer(apic); } else { apic->lapic_timer.hv_timer_in_use = true; hrtimer_cancel(&apic->lapic_timer.timer); /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) - cancel_hv_tscdeadline(apic); + if (atomic_read(&apic->lapic_timer.pending) && + !apic_lvtt_period(apic)) + cancel_hv_timer(apic); } trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, apic->lapic_timer.hv_timer_in_use); return apic->lapic_timer.hv_timer_in_use; } +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + WARN_ON(swait_active(&vcpu->wq)); + cancel_hv_timer(apic); + apic_timer_expired(apic); + + if (apic_lvtt_period(apic) && apic->lapic_timer.period) { + advance_periodic_target_expiration(apic); + if (!start_hv_timer(apic)) + start_sw_period(apic); + } +} +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); + void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(apic->lapic_timer.hv_timer_in_use); - if (apic_lvtt_tscdeadline(apic)) - start_hv_tscdeadline(apic); + start_hv_timer(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1413,62 +1496,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) if (!apic->lapic_timer.hv_timer_in_use) return; - cancel_hv_tscdeadline(apic); + cancel_hv_timer(apic); if (atomic_read(&apic->lapic_timer.pending)) return; - start_sw_tscdeadline(apic); + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + start_sw_period(apic); + else if (apic_lvtt_tscdeadline(apic)) + start_sw_tscdeadline(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now; - atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - /* lapic timer in oneshot or periodic mode */ - now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; - - if (!apic->lapic_timer.period) - return; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (apic_lvtt_period(apic)) { - s64 min_period = min_timer_period_us * 1000LL; - - if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " - "lapic timer period limited to %lld ns\n", - apic->vcpu->vcpu_id, - apic->lapic_timer.period, min_period); - apic->lapic_timer.period = min_period; - } - } - - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS_PINNED); - - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" - PRIx64 ", " - "timer initial count 0x%x, period %lldns, " - "expire @ 0x%016" PRIx64 ".\n", __func__, - APIC_BUS_CYCLE_NS, ktime_to_ns(now), - kvm_lapic_get_reg(apic, APIC_TMICT), - apic->lapic_timer.period, - ktime_to_ns(ktime_add_ns(now, - apic->lapic_timer.period))); + if (set_target_expiration(apic) && + !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) + start_sw_period(apic); } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) + if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) start_sw_tscdeadline(apic); } } @@ -1701,13 +1750,22 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) * LAPIC interface *---------------------------------------------------------------------- */ +u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!lapic_in_kernel(vcpu)) + return 0; + + return apic->lapic_timer.tscdeadline; +} u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) + if (!lapic_in_kernel(vcpu) || + !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; @@ -1748,14 +1806,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) { + if (!apic) value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; - return; - } vcpu->arch.apic_base = value; + if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) + kvm_update_cpuid(vcpu); + + if (!apic) + return; + /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { @@ -1909,6 +1970,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) apic_timer_expired(apic); if (lapic_is_periodic(apic)) { + advance_periodic_target_expiration(apic); hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); return HRTIMER_RESTART; } else @@ -1993,6 +2055,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) kvm_apic_local_deliver(apic, APIC_LVTT); if (apic_lvtt_tscdeadline(apic)) apic->lapic_timer.tscdeadline = 0; + if (apic_lvtt_oneshot(apic)) { + apic->lapic_timer.tscdeadline = 0; + apic->lapic_timer.target_expiration = 0; + } atomic_set(&apic->lapic_timer.pending, 0); } } @@ -2360,3 +2426,9 @@ void kvm_lapic_init(void) jump_label_rate_limit(&apic_hw_disabled, HZ); jump_label_rate_limit(&apic_sw_disabled, HZ); } + +void kvm_lapic_exit(void) +{ + static_key_deferred_flush(&apic_hw_disabled); + static_key_deferred_flush(&apic_sw_disabled); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f60d01c29d51..ff8039d61672 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -15,6 +15,7 @@ struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ + ktime_t target_expiration; u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; @@ -85,6 +86,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); @@ -108,6 +110,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); void kvm_lapic_init(void); +void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9c7e986b4e4..7012de4a1fed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1660,17 +1660,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) { - /* - * We are holding the kvm->mmu_lock, and we are blowing up - * shadow PTEs. MMU notifier consumers need to be kept at bay. - * This is correct as long as we don't decouple the mmu_lock - * protected regions (like invalidate_range_start|end does). - */ - kvm->mmu_notifier_seq++; + if (!shadow_accessed_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); - } return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } @@ -4405,7 +4397,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) } static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -4508,7 +4501,7 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_RETRY; @@ -4527,12 +4520,28 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, return r; } - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); + r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), + false); if (r < 0) return r; if (!r) return 1; + /* + * Before emulating the instruction, check if the error code + * was due to a RO violation while translating the guest page. + * This can occur when using nested virtualization with nested + * paging in both guests. If true, we simply unprotect the page + * and resume the guest. + * + * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used + * in PFERR_NEXT_GUEST_PAGE) + */ + if (error_code == PFERR_NESTED_GUEST_PAGE) { + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); + return 1; + } + if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; emulate: @@ -4617,11 +4626,19 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } +static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node) +{ + kvm_mmu_invalidate_zap_all_pages(kvm); +} + void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; node->track_write = kvm_mmu_pte_write; + node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); } @@ -4958,7 +4975,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) * zap all shadow pages. */ if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { - printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } } diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index b431539c3714..4a1c13eaa518 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -106,6 +106,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) kvm_flush_remote_tlbs(kvm); } +EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); /* * remove the guest page from the tracking pool which stops the interception @@ -135,6 +136,7 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, */ kvm_mmu_gfn_allow_lpage(slot, gfn); } +EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. @@ -181,6 +183,7 @@ kvm_page_track_register_notifier(struct kvm *kvm, hlist_add_head_rcu(&n->node, &head->track_notifier_list); spin_unlock(&kvm->mmu_lock); } +EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); /* * stop receiving the event interception. It is the opposed operation of @@ -199,6 +202,7 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); } +EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); /* * Notify the node that write access is intercepted and write emulation is @@ -222,6 +226,31 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) if (n->track_write) - n->track_write(vcpu, gpa, new, bytes); + n->track_write(vcpu, gpa, new, bytes, n); + srcu_read_unlock(&head->track_srcu, idx); +} + +/* + * Notify the node that memory slot is being removed or moved so that it can + * drop write-protection for the pages in the memory slot. + * + * The node should figure out it has any write-protected pages in this slot + * by itself. + */ +void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_page_track_notifier_head *head; + struct kvm_page_track_notifier_node *n; + int idx; + + head = &kvm->arch.track_notifier_head; + + if (hlist_empty(&head->track_notifier_list)) + return; + + idx = srcu_read_lock(&head->track_srcu); + hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + if (n->track_flush_slot) + n->track_flush_slot(kvm, slot, n); srcu_read_unlock(&head->track_srcu, idx); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8ca1eca5038d..08a4d3ab3455 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2074,7 +2074,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) static int pf_interception(struct vcpu_svm *svm) { u64 fault_address = svm->vmcb->control.exit_info_2; - u32 error_code; + u64 error_code; int r = 1; switch (svm->apf_reason) { @@ -2270,7 +2270,7 @@ static int io_interception(struct vcpu_svm *svm) ++svm->vcpu.stat.io_exits; string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; - if (string || in) + if (string) return emulate_instruction(vcpu, 0) == EMULATE_DONE; port = io_info >> 16; @@ -2278,7 +2278,8 @@ static int io_interception(struct vcpu_svm *svm) svm->next_rip = svm->vmcb->control.exit_info_2; skip_emulated_instruction(&svm->vcpu); - return kvm_fast_pio_out(vcpu, size, port); + return in ? kvm_fast_pio_in(vcpu, size, port) + : kvm_fast_pio_out(vcpu, size, port); } static int nmi_interception(struct vcpu_svm *svm) @@ -3150,8 +3151,7 @@ static int skinit_interception(struct vcpu_svm *svm) static int wbinvd_interception(struct vcpu_svm *svm) { - kvm_emulate_wbinvd(&svm->vcpu); - return 1; + return kvm_emulate_wbinvd(&svm->vcpu); } static int xsetbv_interception(struct vcpu_svm *svm) @@ -3238,8 +3238,7 @@ static int task_switch_interception(struct vcpu_svm *svm) static int cpuid_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - kvm_emulate_cpuid(&svm->vcpu); - return 1; + return kvm_emulate_cpuid(&svm->vcpu); } static int iret_interception(struct vcpu_svm *svm) @@ -3275,9 +3274,7 @@ static int rdpmc_interception(struct vcpu_svm *svm) return emulate_on_interception(svm); err = kvm_rdpmc(&svm->vcpu); - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; + return kvm_complete_insn_gp(&svm->vcpu, err); } static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, @@ -3374,9 +3371,7 @@ static int cr_interception(struct vcpu_svm *svm) } kvm_register_write(&svm->vcpu, reg, val); } - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; + return kvm_complete_insn_gp(&svm->vcpu, err); } static int dr_interception(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3980da515fd0..a236decb81e4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -133,6 +133,16 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 /* + * Hyper-V requires all of these, so mark them as supported even though + * they are just treated the same as all-context. + */ +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + +/* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. Also indicate if ple enabled. @@ -446,23 +456,31 @@ struct nested_vmx { u16 vpid02; u16 last_vpid; + /* + * We only store the "true" versions of the VMX capability MSRs. We + * generate the "non-true" versions by setting the must-be-1 bits + * according to the SDM. + */ u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; - u32 nested_vmx_true_procbased_ctls_low; u32 nested_vmx_secondary_ctls_low; u32 nested_vmx_secondary_ctls_high; u32 nested_vmx_pinbased_ctls_low; u32 nested_vmx_pinbased_ctls_high; u32 nested_vmx_exit_ctls_low; u32 nested_vmx_exit_ctls_high; - u32 nested_vmx_true_exit_ctls_low; u32 nested_vmx_entry_ctls_low; u32 nested_vmx_entry_ctls_high; - u32 nested_vmx_true_entry_ctls_low; u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; u32 nested_vmx_vpid_caps; + u64 nested_vmx_basic; + u64 nested_vmx_cr0_fixed0; + u64 nested_vmx_cr0_fixed1; + u64 nested_vmx_cr4_fixed0; + u64 nested_vmx_cr4_fixed1; + u64 nested_vmx_vmcs_enum; }; #define POSTED_INTR_ON 0 @@ -520,6 +538,12 @@ static inline void pi_set_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + static inline int pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, @@ -920,16 +944,32 @@ static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); -static unsigned long *vmx_io_bitmap_a; -static unsigned long *vmx_io_bitmap_b; -static unsigned long *vmx_msr_bitmap_legacy; -static unsigned long *vmx_msr_bitmap_longmode; -static unsigned long *vmx_msr_bitmap_legacy_x2apic; -static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; -static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; -static unsigned long *vmx_vmread_bitmap; -static unsigned long *vmx_vmwrite_bitmap; +enum { + VMX_IO_BITMAP_A, + VMX_IO_BITMAP_B, + VMX_MSR_BITMAP_LEGACY, + VMX_MSR_BITMAP_LONGMODE, + VMX_MSR_BITMAP_LEGACY_X2APIC_APICV, + VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV, + VMX_MSR_BITMAP_LEGACY_X2APIC, + VMX_MSR_BITMAP_LONGMODE_X2APIC, + VMX_VMREAD_BITMAP, + VMX_VMWRITE_BITMAP, + VMX_BITMAP_NR +}; + +static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; + +#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A]) +#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B]) +#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY]) +#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE]) +#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV]) +#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV]) +#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC]) +#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC]) +#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) +#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -1343,10 +1383,10 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; } -static inline bool is_exception(u32 intr_info) +static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); + == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); } static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, @@ -2523,14 +2563,14 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv; else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv; } else { if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive; + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive; + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; } } else { if (is_long_mode(vcpu)) @@ -2706,9 +2746,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. */ - vmx->nested.nested_vmx_true_exit_ctls_low = - vmx->nested.nested_vmx_exit_ctls_low & - ~VM_EXIT_SAVE_DEBUG_CONTROLS; + vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2727,9 +2765,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ - vmx->nested.nested_vmx_true_entry_ctls_low = - vmx->nested.nested_vmx_entry_ctls_low & - ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2762,8 +2798,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. */ - vmx->nested.nested_vmx_true_procbased_ctls_low = - vmx->nested.nested_vmx_procbased_ctls_low & + vmx->nested.nested_vmx_procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ @@ -2774,6 +2809,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -2805,8 +2841,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) */ if (enable_vpid) vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | - VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | - VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + VMX_VPID_EXTENT_SUPPORTED_MASK; else vmx->nested.nested_vmx_vpid_caps = 0; @@ -2823,14 +2858,52 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; vmx->nested.nested_vmx_misc_high = 0; + + /* + * This MSR reports some information about VMX support. We + * should return information about the VMX we emulate for the + * guest, and the VMCS structure we give it - not about the + * VMX support of the underlying hardware. + */ + vmx->nested.nested_vmx_basic = + VMCS12_REVISION | + VMX_BASIC_TRUE_CTLS | + ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | + (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + + if (cpu_has_vmx_basic_inout()) + vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT; + + /* + * These MSRs specify bits which the guest must keep fixed on + * while L1 is in VMXON mode (in L1's root mode, or running an L2). + * We picked the standard core2 setting. + */ +#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) +#define VMXON_CR4_ALWAYSON X86_CR4_VMXE + vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON; + vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON; + + /* These MSRs specify bits which the guest must keep fixed off. */ + rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1); + rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1); + + /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + vmx->nested.nested_vmx_vmcs_enum = 0x2e; +} + +/* + * if fixed0[i] == 1: val[i] must be 1 + * if fixed1[i] == 0: val[i] must be 0 + */ +static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) +{ + return ((val & fixed1) | fixed0) == val; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) { - /* - * Bits 0 in high must be 0, and bits 1 in low must be 1. - */ - return ((control & high) | low) == control; + return fixed_bits_valid(control, low, high); } static inline u64 vmx_control_msr(u32 low, u32 high) @@ -2838,87 +2911,285 @@ static inline u64 vmx_control_msr(u32 low, u32 high) return low | ((u64)high << 32); } -/* Returns 0 on success, non-0 otherwise. */ -static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) +{ + superset &= mask; + subset &= mask; + + return (superset | subset) == superset; +} + +static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved = + /* feature (except bit 48; see below) */ + BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | + /* reserved */ + BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + u64 vmx_basic = vmx->nested.nested_vmx_basic; + + if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + return -EINVAL; + + /* + * KVM does not emulate a version of VMX that constrains physical + * addresses of VMX structures (e.g. VMCS) to 32-bits. + */ + if (data & BIT_ULL(48)) + return -EINVAL; + + if (vmx_basic_vmcs_revision_id(vmx_basic) != + vmx_basic_vmcs_revision_id(data)) + return -EINVAL; + + if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) + return -EINVAL; + + vmx->nested.nested_vmx_basic = data; + return 0; +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 supported; + u32 *lowp, *highp; + + switch (msr_index) { + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + lowp = &vmx->nested.nested_vmx_pinbased_ctls_low; + highp = &vmx->nested.nested_vmx_pinbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + lowp = &vmx->nested.nested_vmx_procbased_ctls_low; + highp = &vmx->nested.nested_vmx_procbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + lowp = &vmx->nested.nested_vmx_exit_ctls_low; + highp = &vmx->nested.nested_vmx_exit_ctls_high; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + lowp = &vmx->nested.nested_vmx_entry_ctls_low; + highp = &vmx->nested.nested_vmx_entry_ctls_high; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + lowp = &vmx->nested.nested_vmx_secondary_ctls_low; + highp = &vmx->nested.nested_vmx_secondary_ctls_high; + break; + default: + BUG(); + } + + supported = vmx_control_msr(*lowp, *highp); + + /* Check must-be-1 bits are still 1. */ + if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) + return -EINVAL; + + /* Check must-be-0 bits are still 0. */ + if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) + return -EINVAL; + + *lowp = data; + *highp = data >> 32; + return 0; +} + +static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved_bits = + /* feature */ + BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | + BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | + /* reserved */ + GENMASK_ULL(13, 9) | BIT_ULL(31); + u64 vmx_misc; + + vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); + + if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + return -EINVAL; + + if ((vmx->nested.nested_vmx_pinbased_ctls_high & + PIN_BASED_VMX_PREEMPTION_TIMER) && + vmx_misc_preemption_timer_rate(data) != + vmx_misc_preemption_timer_rate(vmx_misc)) + return -EINVAL; + + if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) + return -EINVAL; + + if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) + return -EINVAL; + + if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) + return -EINVAL; + + vmx->nested.nested_vmx_misc_low = data; + vmx->nested.nested_vmx_misc_high = data >> 32; + return 0; +} + +static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) +{ + u64 vmx_ept_vpid_cap; + + vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps, + vmx->nested.nested_vmx_vpid_caps); + + /* Every bit is either reserved or a feature bit. */ + if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) + return -EINVAL; + + vmx->nested.nested_vmx_ept_caps = data; + vmx->nested.nested_vmx_vpid_caps = data >> 32; + return 0; +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 *msr; + + switch (msr_index) { + case MSR_IA32_VMX_CR0_FIXED0: + msr = &vmx->nested.nested_vmx_cr0_fixed0; + break; + case MSR_IA32_VMX_CR4_FIXED0: + msr = &vmx->nested.nested_vmx_cr4_fixed0; + break; + default: + BUG(); + } + + /* + * 1 bits (which indicates bits which "must-be-1" during VMX operation) + * must be 1 in the restored value. + */ + if (!is_bitwise_subset(data, *msr, -1ULL)) + return -EINVAL; + + *msr = data; + return 0; +} + +/* + * Called when userspace is restoring VMX MSRs. + * + * Returns 0 on success, non-0 otherwise. + */ +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); switch (msr_index) { case MSR_IA32_VMX_BASIC: + return vmx_restore_vmx_basic(vmx, data); + case MSR_IA32_VMX_PINBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + /* + * The "non-true" VMX capability MSRs are generated from the + * "true" MSRs, so we do not support restoring them directly. + * + * If userspace wants to emulate VMX_BASIC[55]=0, userspace + * should restore the "true" MSRs with the must-be-1 bits + * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND + * DEFAULT SETTINGS". + */ + return -EINVAL; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS2: + return vmx_restore_control_msr(vmx, msr_index, data); + case MSR_IA32_VMX_MISC: + return vmx_restore_vmx_misc(vmx, data); + case MSR_IA32_VMX_CR0_FIXED0: + case MSR_IA32_VMX_CR4_FIXED0: + return vmx_restore_fixed0_msr(vmx, msr_index, data); + case MSR_IA32_VMX_CR0_FIXED1: + case MSR_IA32_VMX_CR4_FIXED1: + /* + * These MSRs are generated based on the vCPU's CPUID, so we + * do not support restoring them directly. + */ + return -EINVAL; + case MSR_IA32_VMX_EPT_VPID_CAP: + return vmx_restore_vmx_ept_vpid_cap(vmx, data); + case MSR_IA32_VMX_VMCS_ENUM: + vmx->nested.nested_vmx_vmcs_enum = data; + return 0; + default: /* - * This MSR reports some information about VMX support. We - * should return information about the VMX we emulate for the - * guest, and the VMCS structure we give it - not about the - * VMX support of the underlying hardware. + * The rest of the VMX capability MSRs do not support restore. */ - *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); - if (cpu_has_vmx_basic_inout()) - *pdata |= VMX_BASIC_INOUT; + return -EINVAL; + } +} + +/* Returns 0 on success, non-0 otherwise. */ +static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + switch (msr_index) { + case MSR_IA32_VMX_BASIC: + *pdata = vmx->nested.nested_vmx_basic; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) + *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_procbased_ctls_low, - vmx->nested.nested_vmx_procbased_ctls_high); - break; case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) + *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_exit_ctls_low, - vmx->nested.nested_vmx_exit_ctls_high); - break; case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_exit_ctls_low, vmx->nested.nested_vmx_exit_ctls_high); + if (msr_index == MSR_IA32_VMX_EXIT_CTLS) + *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_entry_ctls_low, - vmx->nested.nested_vmx_entry_ctls_high); - break; case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high); + if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) + *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_MISC: *pdata = vmx_control_msr( vmx->nested.nested_vmx_misc_low, vmx->nested.nested_vmx_misc_high); break; - /* - * These MSRs specify bits which the guest must keep fixed (on or off) - * while L1 is in VMXON mode (in L1's root mode, or running an L2). - * We picked the standard core2 setting. - */ -#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) -#define VMXON_CR4_ALWAYSON X86_CR4_VMXE case MSR_IA32_VMX_CR0_FIXED0: - *pdata = VMXON_CR0_ALWAYSON; + *pdata = vmx->nested.nested_vmx_cr0_fixed0; break; case MSR_IA32_VMX_CR0_FIXED1: - *pdata = -1ULL; + *pdata = vmx->nested.nested_vmx_cr0_fixed1; break; case MSR_IA32_VMX_CR4_FIXED0: - *pdata = VMXON_CR4_ALWAYSON; + *pdata = vmx->nested.nested_vmx_cr4_fixed0; break; case MSR_IA32_VMX_CR4_FIXED1: - *pdata = -1ULL; + *pdata = vmx->nested.nested_vmx_cr4_fixed1; break; case MSR_IA32_VMX_VMCS_ENUM: - *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + *pdata = vmx->nested.nested_vmx_vmcs_enum; break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr( @@ -3101,7 +3372,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_leave_nested(vcpu); break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - return 1; /* they are read-only */ + if (!msr_info->host_initiated) + return 1; /* they are read-only */ + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; @@ -3863,6 +4138,40 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_dirty); } +static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +/* No difference in the restrictions on guest and host CR4 in VMX operation. */ +#define nested_guest_cr4_valid nested_cr4_valid +#define nested_host_cr4_valid nested_cr4_valid + static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, @@ -3991,8 +4300,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!nested_vmx_allowed(vcpu)) return 1; } - if (to_vmx(vcpu)->nested.vmxon && - ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) + + if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) return 1; vcpu->arch.cr4 = cr4; @@ -4569,41 +4878,6 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, } } -static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return; - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __set_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __set_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __set_bit(msr, msr_bitmap + 0x400 / f); - - if (type & MSR_TYPE_W) - /* write-high */ - __set_bit(msr, msr_bitmap + 0xc00 / f); - - } -} - /* * If a msr is allowed by L0, we should check whether it is allowed by L1. * The corresponding bit will be cleared unless both of L0 and L1 allow it. @@ -4659,48 +4933,18 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } -static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) -{ - if (apicv_active) { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); - } else { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - } -} - -static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) +static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active) { if (apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, + msr, type); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, + msr, type); } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - } -} - -static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active) -{ - if (apicv_active) { __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_W); + msr, type); __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_W); - } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_W); + msr, type); } } @@ -4822,9 +5066,15 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!pi_test_and_clear_on(&vmx->pi_desc)) + if (!pi_test_on(&vmx->pi_desc)) return; + pi_clear_on(&vmx->pi_desc); + /* + * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. + */ + smp_mb__after_atomic(); kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); } @@ -5472,7 +5722,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info)) return handle_machine_check(vcpu); - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) + if (is_nmi(intr_info)) return 1; /* already handled by vmx_vcpu_run() */ if (is_no_device(intr_info)) { @@ -5583,7 +5833,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu) static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - int size, in, string; + int size, in, string, ret; unsigned port; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -5597,9 +5847,14 @@ static int handle_io(struct kvm_vcpu *vcpu) port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; - skip_emulated_instruction(vcpu); - return kvm_fast_pio_out(vcpu, size, port); + ret = kvm_skip_emulated_instruction(vcpu); + + /* + * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_fast_pio_out(vcpu, size, port) && ret; } static void @@ -5613,18 +5868,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long always_on = VMXON_CR0_ALWAYSON; - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & - SECONDARY_EXEC_UNRESTRICTED_GUEST && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) - always_on &= ~(X86_CR0_PE | X86_CR0_PG); - return (val & always_on) == always_on; -} - /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { @@ -5643,7 +5886,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_cr0_valid(vcpu, val)) + if (!nested_guest_cr0_valid(vcpu, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5652,8 +5895,9 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) return 0; } else { if (to_vmx(vcpu)->nested.vmxon && - ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) + !nested_host_cr0_valid(vcpu, val)) return 1; + return kvm_set_cr0(vcpu, val); } } @@ -5697,6 +5941,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) int cr; int reg; int err; + int ret; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; @@ -5708,25 +5953,27 @@ static int handle_cr(struct kvm_vcpu *vcpu) switch (cr) { case 0: err = handle_set_cr0(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 3: err = kvm_set_cr3(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 4: err = handle_set_cr4(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); - kvm_complete_insn_gp(vcpu, err); + ret = kvm_complete_insn_gp(vcpu, err); if (lapic_in_kernel(vcpu)) - return 1; + return ret; if (cr8_prev <= cr8) - return 1; + return ret; + /* + * TODO: we might be squashing a + * KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } @@ -5735,23 +5982,20 @@ static int handle_cr(struct kvm_vcpu *vcpu) case 2: /* clts */ handle_clts(vcpu); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - skip_emulated_instruction(vcpu); vmx_fpu_activate(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); case 1: /*mov from cr*/ switch (cr) { case 3: val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); case 8: val = kvm_get_cr8(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; case 3: /* lmsw */ @@ -5759,8 +6003,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); kvm_lmsw(vcpu, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); default: break; } @@ -5831,8 +6074,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) return 1; - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) @@ -5864,8 +6106,7 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) static int handle_cpuid(struct kvm_vcpu *vcpu) { - kvm_emulate_cpuid(vcpu); - return 1; + return kvm_emulate_cpuid(vcpu); } static int handle_rdmsr(struct kvm_vcpu *vcpu) @@ -5886,8 +6127,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_wrmsr(struct kvm_vcpu *vcpu) @@ -5907,8 +6147,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) } trace_kvm_msr_write(ecx, data); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) @@ -5952,8 +6191,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); kvm_mmu_invlpg(vcpu, exit_qualification); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_rdpmc(struct kvm_vcpu *vcpu) @@ -5961,15 +6199,12 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu) int err; err = kvm_rdpmc(vcpu); - kvm_complete_insn_gp(vcpu, err); - - return 1; + return kvm_complete_insn_gp(vcpu, err); } static int handle_wbinvd(struct kvm_vcpu *vcpu) { - kvm_emulate_wbinvd(vcpu); - return 1; + return kvm_emulate_wbinvd(vcpu); } static int handle_xsetbv(struct kvm_vcpu *vcpu) @@ -5978,20 +6213,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); if (kvm_set_xcr(vcpu, index, new_bv) == 0) - skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); return 1; } static int handle_xsaves(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); + kvm_skip_emulated_instruction(vcpu); WARN(1, "this should never happen\n"); return 1; } static int handle_xrstors(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); + kvm_skip_emulated_instruction(vcpu); WARN(1, "this should never happen\n"); return 1; } @@ -6012,8 +6247,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && (offset == APIC_EOI)) { kvm_lapic_set_eoi(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } } return emulate_instruction(vcpu, 0) == EMULATE_DONE; @@ -6161,9 +6395,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { - skip_emulated_instruction(vcpu); trace_kvm_fast_mmio(gpa); - return 1; + return kvm_skip_emulated_instruction(vcpu); } ret = handle_mmio_page_fault(vcpu, gpa, true); @@ -6348,50 +6581,13 @@ static __init int hardware_setup(void) for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_a) - return r; + for (i = 0; i < VMX_BITMAP_NR; i++) { + vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_bitmap[i]) + goto out; + } vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) - goto out; - - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) - goto out1; - - vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic) - goto out2; - - vmx_msr_bitmap_legacy_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) - goto out3; - - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) - goto out4; - - vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic) - goto out5; - - vmx_msr_bitmap_longmode_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) - goto out6; - - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmread_bitmap) - goto out7; - - vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmwrite_bitmap) - goto out8; - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -6409,7 +6605,7 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out9; + goto out; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -6472,39 +6668,34 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); - memcpy(vmx_msr_bitmap_legacy_x2apic, + memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, + memcpy(vmx_msr_bitmap_longmode_x2apic_apicv, vmx_msr_bitmap_longmode, PAGE_SIZE); - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + for (msr = 0x800; msr <= 0x8ff; msr++) { + if (msr == 0x839 /* TMCCT */) + continue; + vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true); + } + /* - * enable_apicv && kvm_vcpu_apicv_active() + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. */ - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr, true); + vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true); + vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839, true); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808, true); /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b, true); + vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true); /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f, true); - - /* - * (enable_apicv && !kvm_vcpu_apicv_active()) || - * !enable_apicv - */ - /* TPR */ - vmx_disable_intercept_msr_read_x2apic(0x808, false); - vmx_disable_intercept_msr_write_x2apic(0x808, false); + vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); if (enable_ept) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, @@ -6551,42 +6742,19 @@ static __init int hardware_setup(void) return alloc_kvm_area(); -out9: - free_page((unsigned long)vmx_vmwrite_bitmap); -out8: - free_page((unsigned long)vmx_vmread_bitmap); -out7: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); -out6: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out5: - free_page((unsigned long)vmx_msr_bitmap_longmode); -out4: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); -out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); -out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); -out1: - free_page((unsigned long)vmx_io_bitmap_b); out: - free_page((unsigned long)vmx_io_bitmap_a); + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); return r; } static __exit void hardware_unsetup(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - free_page((unsigned long)vmx_vmwrite_bitmap); - free_page((unsigned long)vmx_vmread_bitmap); + int i; + + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); free_kvm_area(); } @@ -6600,16 +6768,13 @@ static int handle_pause(struct kvm_vcpu *vcpu) if (ple_gap) grow_ple_window(vcpu); - skip_emulated_instruction(vcpu); kvm_vcpu_on_spin(vcpu); - - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_nop(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_mwait(struct kvm_vcpu *vcpu) @@ -6916,8 +7081,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, */ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } page = nested_get_page(vcpu, vmptr); @@ -6925,8 +7089,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, *(u32 *)kmap(page) != VMCS12_REVISION) { nested_vmx_failInvalid(vcpu); kunmap(page); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } kunmap(page); vmx->nested.vmxon_ptr = vmptr; @@ -6935,30 +7098,26 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmptr == vmx->nested.vmxon_ptr) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; case EXIT_REASON_VMPTRLD: if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmptr == vmx->nested.vmxon_ptr) { nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; + VMXERR_VMPTRLD_VMXON_POINTER); + return kvm_skip_emulated_instruction(vcpu); } break; default: @@ -7014,8 +7173,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) @@ -7055,9 +7213,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmxon = true; - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); out_shadow_vmcs: kfree(vmx->nested.cached_vmcs12); @@ -7176,9 +7333,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; free_nested(to_vmx(vcpu)); - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMCLEAR instruction */ @@ -7217,9 +7373,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) nested_free_vmcs02(vmx, vmptr); - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); @@ -7417,7 +7572,6 @@ static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->nested.current_vmptr == -1ull) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); return 0; } return 1; @@ -7431,17 +7585,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu) u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gva_t gva = 0; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; + if (!nested_vmx_check_vmcs12(vcpu)) + return kvm_skip_emulated_instruction(vcpu); + /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ if (vmcs12_read_any(vcpu, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* * Now copy part of this value to register or memory, as requested. @@ -7461,8 +7616,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } @@ -7481,10 +7635,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) u64 field_value = 0; struct x86_exception e; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; + if (!nested_vmx_check_vmcs12(vcpu)) + return kvm_skip_emulated_instruction(vcpu); + if (vmx_instruction_info & (1u << 10)) field_value = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 3) & 0xf)); @@ -7504,19 +7660,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (vmcs_field_readonly(field)) { nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmcs12_write_any(vcpu, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMPTRLD instruction */ @@ -7537,8 +7690,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) page = nested_get_page(vcpu, vmptr); if (page == NULL) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); if (new_vmcs12->revision_id != VMCS12_REVISION) { @@ -7546,8 +7698,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) nested_release_page_clean(page); nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } nested_release_vmcs12(vmx); @@ -7571,8 +7722,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMPTRST instruction */ @@ -7597,8 +7747,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the INVEPT instruction */ @@ -7636,8 +7785,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* According to the Intel VMX instruction reference, the memory @@ -7668,8 +7816,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) break; } - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_invvpid(struct kvm_vcpu *vcpu) @@ -7694,13 +7841,13 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + types = (vmx->nested.nested_vmx_vpid_caps & + VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* according to the intel vmx instruction reference, the memory @@ -7716,23 +7863,26 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) } switch (type) { + case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: case VMX_VPID_EXTENT_SINGLE_CONTEXT: - /* - * Old versions of KVM use the single-context version so we - * have to support it; just treat it the same as all-context. - */ + case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: + if (!vpid) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } + break; case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); - nested_vmx_succeed(vcpu); break; default: - /* Trap individual address invalidation invvpid calls */ - BUG_ON(1); - break; + WARN_ON_ONCE(1); + return kvm_skip_emulated_instruction(vcpu); } - skip_emulated_instruction(vcpu); - return 1; + __vmx_flush_tlb(vcpu, vmx->nested.vpid02); + nested_vmx_succeed(vcpu); + + return kvm_skip_emulated_instruction(vcpu); } static int handle_pml_full(struct kvm_vcpu *vcpu) @@ -8014,7 +8164,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: - if (!is_exception(intr_info)) + if (is_nmi(intr_info)) return false; else if (is_page_fault(intr_info)) return enable_ept; @@ -8071,6 +8221,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); case EXIT_REASON_IO_INSTRUCTION: return nested_vmx_exit_handled_io(vcpu, vmcs12); + case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); @@ -8607,8 +8759,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) { + if (is_nmi(exit_intr_info)) { kvm_before_handle_nmi(&vmx->vcpu); asm("int $2"); kvm_after_handle_nmi(&vmx->vcpu); @@ -8620,11 +8771,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); register void *__sp asm(_ASM_SP); - /* - * If external interrupt exists, IF bit is set in rflags/eflags on the - * interrupt stack frame, and interrupt will be enabled on a return - * from interrupt handler. - */ if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { unsigned int vector; @@ -8809,7 +8955,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } -void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; @@ -9279,6 +9425,50 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) (new_ctl & ~mask) | (cur_ctl & mask)); } +/* + * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits + * (indicating "allowed-1") if they are supported in the guest's CPUID. + */ +static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *entry; + + vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff; + vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE; + +#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ + if (entry && (entry->_reg & (_cpuid_mask))) \ + vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \ +} while (0) + + entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); + + entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); + /* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */ + cr4_fixed1_update(bit(11), ecx, bit(2)); + +#undef cr4_fixed1_update +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -9320,6 +9510,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) else to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + + if (nested_vmx_allowed(vcpu)) + nested_vmx_cr_fixed1_bits_update(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9774,6 +9967,49 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) return 0; } +static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + unsigned long invalid_mask; + + invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + return (val & invalid_mask) == 0; +} + +/* + * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are + * emulating VM entry into a guest with EPT enabled. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, + unsigned long *entry_failure_code) +{ + if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { + if (!nested_cr3_valid(vcpu, cr3)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + + /* + * If PAE paging and EPT are both on, CR3 is not used by the CPU and + * must not be dereferenced. + */ + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && + !nested_ept) { + if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return 1; + } + } + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + } + + kvm_mmu_reset_context(vcpu); + return 0; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -9782,11 +10018,15 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) * needs. In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. */ -static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + unsigned long *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; + bool nested_ept_enabled = false; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -9951,6 +10191,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_intr_status); } + nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -9964,6 +10205,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_set_constant_host_state(vmx); /* + * Set the MSR load/store lists to match L0's settings. + */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + + /* * HOST_RSP is normally set correctly in vmx_vcpu_run() just before * entry, but only if the current (host) sp changed from the value * we wrote last (vmx->host_rsp). This cache is no longer relevant @@ -10069,15 +10319,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) nested_ept_init_mmu_context(vcpu); } - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) - vcpu->arch.efer = vmcs12->guest_ia32_efer; - else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) - vcpu->arch.efer |= (EFER_LMA | EFER_LME); - else - vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); - /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ - vmx_set_efer(vcpu, vcpu->arch.efer); - /* * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified * TS bit (for lazy fpu) and bits which we consider mandatory enabled. @@ -10092,8 +10333,20 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_set_cr4(vcpu, vmcs12->guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - /* shadow page tables on either EPT or shadow page tables */ - kvm_set_cr3(vcpu, vmcs12->guest_cr3); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) + vcpu->arch.efer = vmcs12->guest_ia32_efer; + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + vcpu->arch.efer |= (EFER_LMA | EFER_LME); + else + vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); + /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ + vmx_set_efer(vcpu, vcpu->arch.efer); + + /* Shadow page tables on either EPT or shadow page tables. */ + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled, + entry_failure_code)) + return 1; + kvm_mmu_reset_context(vcpu); if (!enable_ept) @@ -10111,6 +10364,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); + return 0; } /* @@ -10125,12 +10379,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) struct loaded_vmcs *vmcs02; bool ia32e; u32 msr_entry_idx; + unsigned long exit_qualification; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; - skip_emulated_instruction(vcpu); + if (!nested_vmx_check_vmcs12(vcpu)) + goto out; + vmcs12 = get_vmcs12(vcpu); if (enable_shadow_vmcs) @@ -10150,37 +10406,37 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) nested_vmx_failValid(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - return 1; + goto out; } if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, vmx->nested.nested_vmx_secondary_ctls_low, @@ -10189,25 +10445,26 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_low, vmx->nested.nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } - if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || - ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || + !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || + !nested_cr3_valid(vcpu, vmcs12->host_cr3)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); - return 1; + goto out; } - if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || - ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { + if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || + !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); return 1; @@ -10266,6 +10523,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (!vmcs02) return -ENOMEM; + /* + * After this point, the trap flag no longer triggers a singlestep trap + * on the vm entry instructions. Don't call + * kvm_skip_emulated_instruction. + */ + skip_emulated_instruction(vcpu); enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) @@ -10280,7 +10543,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - prepare_vmcs02(vcpu, vmcs12); + if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, exit_qualification); + return 1; + } msr_entry_idx = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, @@ -10307,6 +10576,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the success flag) when L2 exits (see nested_vmx_vmexit()). */ return 1; + +out: + return kvm_skip_emulated_instruction(vcpu); } /* @@ -10612,6 +10884,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct kvm_segment seg; + unsigned long entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -10649,8 +10922,12 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, nested_ept_uninit_mmu_context(vcpu); - kvm_set_cr3(vcpu, vmcs12->host_cr3); - kvm_mmu_reset_context(vcpu); + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. + */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; @@ -10751,6 +11028,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 vm_inst_error = 0; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); @@ -10763,6 +11041,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + if (unlikely(vmx->fail)) + vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); + vmx_load_vmcs01(vcpu); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) @@ -10791,6 +11072,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); /* Update any VMCS fields that might have changed while L2 ran */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (vmx->hv_deadline_tsc == -1) vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, @@ -10839,7 +11122,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ if (unlikely(vmx->fail)) { vmx->fail = 0; - nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); + nested_vmx_failValid(vcpu, vm_inst_error); } else nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f5f465fdb6b..e52c9088660f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -434,12 +434,14 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) +int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) kvm_inject_gp(vcpu, 0); else - kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + return 1; } EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); @@ -573,7 +575,7 @@ out: } EXPORT_SYMBOL_GPL(load_pdptrs); -static bool pdptrs_changed(struct kvm_vcpu *vcpu) +bool pdptrs_changed(struct kvm_vcpu *vcpu) { u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; bool changed = true; @@ -599,6 +601,7 @@ out: return changed; } +EXPORT_SYMBOL_GPL(pdptrs_changed); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { @@ -1128,8 +1131,8 @@ struct pvclock_gtod_data { struct { /* extract of a clocksource struct */ int vclock_mode; - cycle_t cycle_last; - cycle_t mask; + u64 cycle_last; + u64 mask; u32 mult; u32 shift; } clock; @@ -1569,9 +1572,9 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) #ifdef CONFIG_X86_64 -static cycle_t read_tsc(void) +static u64 read_tsc(void) { - cycle_t ret = (cycle_t)rdtsc_ordered(); + u64 ret = (u64)rdtsc_ordered(); u64 last = pvclock_gtod_data.clock.cycle_last; if (likely(ret >= last)) @@ -1589,7 +1592,7 @@ static cycle_t read_tsc(void) return last; } -static inline u64 vgettsc(cycle_t *cycle_now) +static inline u64 vgettsc(u64 *cycle_now) { long v; struct pvclock_gtod_data *gtod = &pvclock_gtod_data; @@ -1600,7 +1603,7 @@ static inline u64 vgettsc(cycle_t *cycle_now) return v * gtod->clock.mult; } -static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) +static int do_monotonic_boot(s64 *t, u64 *cycle_now) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -1621,7 +1624,7 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) } /* returns true if host is using tsc clocksource */ -static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) +static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) { /* checked again under seqlock below */ if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) @@ -2178,7 +2181,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { - u64 gpa_offset; struct kvm_arch *ka = &vcpu->kvm->arch; kvmclock_reset(vcpu); @@ -2200,8 +2202,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - gpa_offset = data & ~(PAGE_MASK | 1); - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) @@ -2296,7 +2296,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", + vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } else { @@ -2508,7 +2508,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); + vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", + msr_info->index); return 1; } else { vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); @@ -2812,7 +2813,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (kvm_lapic_hv_timer_in_use(vcpu) && kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_get_lapic_target_expiration_tsc(vcpu))) kvm_lapic_switch_to_sw_timer(vcpu); /* * On a host with synchronized TSC, there is no need to update @@ -2843,7 +2844,24 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + int idx; + /* + * Disable page faults because we're in atomic context here. + * kvm_write_guest_offset_cached() would call might_fault() + * that relies on pagefault_disable() to tell if there's a + * bug. NOTE: the write to guest memory may not go through if + * during postcopy live migration or if there's heavy guest + * paging. + */ + pagefault_disable(); + /* + * kvm_memslots() will be called by + * kvm_write_guest_offset_cached() so take the srcu lock. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + pagefault_enable(); kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); vcpu->arch.last_host_tsc = rdtsc(); @@ -3052,6 +3070,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, memset(&events->reserved, 0, sizeof(events->reserved)); } +static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags); + static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { @@ -3088,10 +3108,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { + u32 hflags = vcpu->arch.hflags; if (events->smi.smm) - vcpu->arch.hflags |= HF_SMM_MASK; + hflags |= HF_SMM_MASK; else - vcpu->arch.hflags &= ~HF_SMM_MASK; + hflags &= ~HF_SMM_MASK; + kvm_set_hflags(vcpu, hflags); + vcpu->arch.smi_pending = events->smi.pending; if (events->smi.smm_inside_nmi) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; @@ -3159,6 +3182,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) memcpy(dest, xsave, XSAVE_HDR_OFFSET); /* Set XSTATE_BV */ + xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; /* @@ -3319,6 +3343,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, switch (cap->cap) { case KVM_CAP_HYPERV_SYNIC: + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; return kvm_hv_activate_synic(vcpu); default: return -EINVAL; @@ -4832,7 +4858,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); } -int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) +static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) { if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; @@ -4852,8 +4878,8 @@ int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return kvm_emulate_wbinvd_noskip(vcpu); + kvm_emulate_wbinvd_noskip(vcpu); + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); @@ -5451,7 +5477,6 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag kvm_run->exit_reason = KVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; } else { - vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; /* * "Certain debug exceptions may clear bit 0-3. The * remaining contents of the DR6 register are never @@ -5464,6 +5489,17 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag } } +int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + int r = EMULATE_DONE; + + kvm_x86_ops->skip_emulated_instruction(vcpu); + kvm_vcpu_check_singlestep(vcpu, rflags, &r); + return r == EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); + static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) { if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && @@ -5649,6 +5685,49 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); +static int complete_fast_pio_in(struct kvm_vcpu *vcpu) +{ + unsigned long val; + + /* We should only ever be called with arch.pio.count equal to 1 */ + BUG_ON(vcpu->arch.pio.count != 1); + + /* For size less than 4 we merge, else we zero extend */ + val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) + : 0; + + /* + * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform + * the copy and tracing + */ + emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, + vcpu->arch.pio.port, &val, 1); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + + return 1; +} + +int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port) +{ + unsigned long val; + int ret; + + /* For size less than 4 we merge, else we zero extend */ + val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0; + + ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, + &val, 1); + if (ret) { + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + return ret; + } + + vcpu->arch.complete_userspace_io = complete_fast_pio_in; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_fast_pio_in); + static int kvmclock_cpu_down_prep(unsigned int cpu) { __this_cpu_write(cpu_tsc_khz, 0); @@ -5784,7 +5863,7 @@ static void kvm_timer_init(void) } pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); - cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "AP_X86_KVM_CLK_ONLINE", + cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", kvmclock_cpu_online, kvmclock_cpu_down_prep); } @@ -5969,6 +6048,7 @@ out: void kvm_arch_exit(void) { + kvm_lapic_exit(); perf_unregister_guest_info_callbacks(&kvm_guest_cbs); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) @@ -5998,8 +6078,12 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_halt); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return kvm_vcpu_halt(vcpu); + int ret = kvm_skip_emulated_instruction(vcpu); + /* + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_vcpu_halt(vcpu) && ret; } EXPORT_SYMBOL_GPL(kvm_emulate_halt); @@ -6030,9 +6114,9 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit, r = 1; + int op_64_bit, r; - kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_skip_emulated_instruction(vcpu); if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); @@ -6088,7 +6172,8 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); + return emulator_write_emulated(ctxt, rip, instruction, 3, + &ctxt->exception); } static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) @@ -7823,6 +7908,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); + mutex_init(&kvm->arch.hyperv.hv_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); @@ -8175,7 +8261,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_page_track_flush_slot(kvm, slot); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4ca0d78adcf0..d3289d7e78fa 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -916,7 +916,7 @@ static unsigned long lguest_tsc_khz(void) * If we can't use the TSC, the kernel falls back to our lower-priority * "lguest_clock", where we read the time value given to us by the Host. */ -static cycle_t lguest_clock_read(struct clocksource *cs) +static u64 lguest_clock_read(struct clocksource *cs) { unsigned long sec, nsec; diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 0b281217c890..1f65ff6540f0 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -11,7 +11,7 @@ #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/interrupt.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmx.h> #include <asm/asm.h> diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index 9e6545f269e5..2ccc424a57d9 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -19,7 +19,7 @@ #include <linux/signal.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_emu.h" #include "fpu_system.h" diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index e945fedf1de2..0203baefb5c0 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -27,7 +27,7 @@ #include <linux/signal.h> #include <linux/regset.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/traps.h> #include <asm/user.h> #include <asm/fpu/internal.h> diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index 8db26591d91a..b8ef9f9d2ffc 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -19,7 +19,7 @@ #include <linux/stddef.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/vm86.h> #include "fpu_system.h" diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index 95228ff042c0..1643054766eb 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -18,7 +18,7 @@ | other processes using the emulator while swapping is in progress. | +---------------------------------------------------------------------------*/ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_system.h" #include "exception.h" diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index d597fe7423c9..2c98965a60ba 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -19,7 +19,7 @@ #include "fpu_emu.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_system.h" #include "exception.h" diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index ea9c49adaa1f..8aa6bea1cd6c 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -15,6 +15,7 @@ #include <linux/debugfs.h> #include <linux/mm.h> #include <linux/init.h> +#include <linux/sched.h> #include <linux/seq_file.h> #include <asm/pgtable.h> @@ -406,6 +407,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, } else note_page(m, &st, __pgprot(0), 1); + cond_resched(); start++; } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index fcd06f7526de..61a7e9ea9aa1 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,5 +1,5 @@ #include <linux/extable.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/traps.h> #include <asm/kdebug.h> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 17c55a536fdd..e3254ca0eec4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -413,7 +413,7 @@ out: void vmalloc_sync_all(void) { - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); } /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cf8059016ec8..928d657de829 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -34,7 +34,7 @@ #include <asm/asm.h> #include <asm/bios_ebda.h> #include <asm/processor.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/dma.h> #include <asm/fixmap.h> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 14b9dd71d9e8..af85b686a7b0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -36,7 +36,7 @@ #include <asm/processor.h> #include <asm/bios_ebda.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/dma.h> @@ -89,10 +89,10 @@ static int __init nonx32_setup(char *str) __setup("noexec32=", nonx32_setup); /* - * When memory was added/removed make sure all the processes MM have + * When memory was added make sure all the processes MM have * suitable PGD entries in the local PGD level page. */ -void sync_global_pgds(unsigned long start, unsigned long end, int removed) +void sync_global_pgds(unsigned long start, unsigned long end) { unsigned long address; @@ -100,12 +100,7 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed) const pgd_t *pgd_ref = pgd_offset_k(address); struct page *page; - /* - * When it is called after memory hot remove, pgd_none() - * returns true. In this case (removed == 1), we must clear - * the PGD entries in the local PGD level page. - */ - if (pgd_none(*pgd_ref) && !removed) + if (pgd_none(*pgd_ref)) continue; spin_lock(&pgd_lock); @@ -122,13 +117,8 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed) BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - if (removed) { - if (pgd_none(*pgd_ref) && !pgd_none(*pgd)) - pgd_clear(pgd); - } else { - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - } + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); spin_unlock(pgt_lock); } @@ -596,7 +586,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, } if (pgd_changed) - sync_global_pgds(vaddr_start, vaddr_end - 1, 0); + sync_global_pgds(vaddr_start, vaddr_end - 1); __flush_tlb_all(); @@ -1239,7 +1229,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) } else err = vmemmap_populate_basepages(start, end, node); if (!err) - sync_global_pgds(start, end - 1, 0); + sync_global_pgds(start, end - 1); return err; } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index e4f800999b32..af59f808742f 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -293,7 +293,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) * We were not able to extract an address from the instruction, * probably because there was something invalid in it. */ - if (info->si_addr == (void *)-1) { + if (info->si_addr == (void __user *)-1) { err = -EINVAL; goto err_out; } @@ -350,12 +350,12 @@ int mpx_enable_management(void) * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is * expected to be relatively expensive. Storing the bounds * directory here means that we do not have to do xsave in the - * unmap path; we can just use mm->bd_addr instead. + * unmap path; we can just use mm->context.bd_addr instead. */ bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); - mm->bd_addr = bd_base; - if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) + mm->context.bd_addr = bd_base; + if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) ret = -ENXIO; up_write(&mm->mmap_sem); @@ -370,7 +370,7 @@ int mpx_disable_management(void) return -ENXIO; down_write(&mm->mmap_sem); - mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR; up_write(&mm->mmap_sem); return 0; } @@ -947,7 +947,7 @@ static int try_unmap_single_bt(struct mm_struct *mm, end = bta_end_vaddr; } - bde_vaddr = mm->bd_addr + mpx_get_bd_entry_offset(mm, start); + bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start); ret = get_bt_addr(mm, bde_vaddr, &bt_addr); /* * No bounds table there, so nothing to unmap. diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 3f35b48d1d9d..12dcad7297a5 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -19,7 +19,7 @@ #include "numa_internal.h" -int __initdata numa_off; +int numa_off; nodemask_t numa_nodes_parsed __initdata; struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e3353c97d086..5a287e523eab 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -20,7 +20,7 @@ #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/setup.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/pat.h> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e76d1af60f7a..bb660e53cbd6 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1172,6 +1172,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)image; prog->jited = 1; + } else { + prog = orig_prog; } out_addrs: diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 3cd69832d7f4..3961103e9176 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -114,6 +114,16 @@ static const struct dmi_system_id pci_crs_quirks[] __initconst = { DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"), }, }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=42606 */ + { + .callback = set_nouse_crs, + .ident = "Supermicro X8DTH", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), + DMI_MATCH(DMI_PRODUCT_NAME, "X8DTH-i/6/iF/6F"), + DMI_MATCH(DMI_BIOS_VERSION, "2.0a"), + }, + }, /* https://bugzilla.kernel.org/show_bug.cgi?id=15362 */ { diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index bedfab98077a..e1fb269c87af 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -264,8 +264,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 0; error: - dev_err(&dev->dev, - "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n", + type == PCI_CAP_ID_MSI ? "" : "-X", irq); return irq; } diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 3c3c19ea94df..184842ef332e 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -8,7 +8,6 @@ obj-y += iris/ obj-y += intel/ obj-y += intel-mid/ obj-y += intel-quark/ -obj-y += mellanox/ obj-y += olpc/ obj-y += scx200/ obj-y += sfi/ diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index b27bccd4390f..ce4b06733c09 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -23,11 +23,6 @@ #include <asm/io_apic.h> #include <asm/emergency-restart.h> -static int ce4100_i8042_detect(void) -{ - return 0; -} - /* * The CE4100 platform has an internal 8051 Microcontroller which is * responsible for signaling to the external Power Management Unit the @@ -89,7 +84,7 @@ static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value) } static void ce4100_serial_fixup(int port, struct uart_port *up, - unsigned short *capabilites) + u32 *capabilites) { #ifdef CONFIG_EARLY_PRINTK /* @@ -145,7 +140,6 @@ static void sdv_pci_init(void) void __init x86_ce4100_early_setup(void) { x86_init.oem.arch_setup = sdv_arch_setup; - x86_platform.i8042_detect = ce4100_i8042_detect; x86_init.resources.probe_roms = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; x86_init.mpparse.find_smp_config = x86_init_noop; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 936a488d6cf6..274dfc481849 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -210,6 +210,70 @@ int __init efi_memblock_x86_reserve_range(void) return 0; } +#define OVERFLOW_ADDR_SHIFT (64 - EFI_PAGE_SHIFT) +#define OVERFLOW_ADDR_MASK (U64_MAX << OVERFLOW_ADDR_SHIFT) +#define U64_HIGH_BIT (~(U64_MAX >> 1)) + +static bool __init efi_memmap_entry_valid(const efi_memory_desc_t *md, int i) +{ + u64 end = (md->num_pages << EFI_PAGE_SHIFT) + md->phys_addr - 1; + u64 end_hi = 0; + char buf[64]; + + if (md->num_pages == 0) { + end = 0; + } else if (md->num_pages > EFI_PAGES_MAX || + EFI_PAGES_MAX - md->num_pages < + (md->phys_addr >> EFI_PAGE_SHIFT)) { + end_hi = (md->num_pages & OVERFLOW_ADDR_MASK) + >> OVERFLOW_ADDR_SHIFT; + + if ((md->phys_addr & U64_HIGH_BIT) && !(end & U64_HIGH_BIT)) + end_hi += 1; + } else { + return true; + } + + pr_warn_once(FW_BUG "Invalid EFI memory map entries:\n"); + + if (end_hi) { + pr_warn("mem%02u: %s range=[0x%016llx-0x%llx%016llx] (invalid)\n", + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, end_hi, end); + } else { + pr_warn("mem%02u: %s range=[0x%016llx-0x%016llx] (invalid)\n", + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, end); + } + return false; +} + +static void __init efi_clean_memmap(void) +{ + efi_memory_desc_t *out = efi.memmap.map; + const efi_memory_desc_t *in = out; + const efi_memory_desc_t *end = efi.memmap.map_end; + int i, n_removal; + + for (i = n_removal = 0; in < end; i++) { + if (efi_memmap_entry_valid(in, i)) { + if (out != in) + memcpy(out, in, efi.memmap.desc_size); + out = (void *)out + efi.memmap.desc_size; + } else { + n_removal++; + } + in = (void *)in + efi.memmap.desc_size; + } + + if (n_removal > 0) { + u64 size = efi.memmap.nr_map - n_removal; + + pr_warn("Removing %d invalid memory map entries.\n", n_removal); + efi_memmap_install(efi.memmap.phys_map, size); + } +} + void __init efi_print_memmap(void) { efi_memory_desc_t *md; @@ -472,6 +536,8 @@ void __init efi_init(void) } } + efi_clean_memmap(); + if (efi_enabled(EFI_DBG)) efi_print_memmap(); } diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 319148bd4b05..2f25a363068c 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -269,6 +269,22 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) efi_scratch.use_pgd = true; /* + * Certain firmware versions are way too sentimential and still believe + * they are exclusive and unquestionable owners of the first physical page, + * even though they explicitly mark it as EFI_CONVENTIONAL_MEMORY + * (but then write-access it later during SetVirtualAddressMap()). + * + * Create a 1:1 mapping for this page, to avoid triple faults during early + * boot with such firmware. We are free to hand this page to the BIOS, + * as trim_bios_range() will reserve the first page and isolate it away + * from memory allocators anyway. + */ + if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, _PAGE_RW)) { + pr_err("Failed to create 1:1 mapping for the first page!\n"); + return 1; + } + + /* * When making calls to the firmware everything needs to be 1:1 * mapped and addressable with 32-bit pointers. Map the kernel * text and allocate a new stack because we can't rely on the diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 10aca63a50d7..30031d5293c4 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -214,7 +214,7 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) new_size = efi.memmap.desc_size * num_entries; - new_phys = memblock_alloc(new_size, 0); + new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { pr_err("Could not allocate boot services memmap\n"); return; @@ -355,7 +355,7 @@ void __init efi_free_boot_services(void) } new_size = efi.memmap.desc_size * num_entries; - new_phys = memblock_alloc(new_size, 0); + new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { pr_err("Failed to allocate new EFI memmap\n"); return; diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index dd6cfa4ad3ac..90e4f2a6625b 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -15,11 +15,11 @@ obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o obj-$(subst m,y,$(CONFIG_GPIO_INTEL_PMIC)) += platform_pmic_gpio.o obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o # SPI Devices -obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_spidev.o +obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o # I2C Devices obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o -obj-$(subst m,y,$(CONFIG_INPUT_MPU3050)) += platform_mpu3050.o +obj-$(subst m,y,$(CONFIG_MPU3050_I2C)) += platform_mpu3050.o obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o # I2C GPIO Expanders diff --git a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c index 30c601b399ee..27186ad654c9 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c @@ -11,6 +11,7 @@ * of the License. */ +#include <linux/err.h> #include <linux/init.h> #include <linux/sfi.h> #include <linux/spi/pxa2xx_spi.h> @@ -34,6 +35,9 @@ static void __init *spidev_platform_data(void *info) { struct spi_board_info *spi_info = info; + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return ERR_PTR(-ENODEV); + spi_info->mode = SPI_MODE_0; spi_info->controller_data = &spidev_spi_chip; diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 7850128f0026..12a272582cdc 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -161,12 +161,6 @@ out: regulator_has_full_constraints(); } -/* MID systems don't have i8042 controller */ -static int intel_mid_i8042_detect(void) -{ - return 0; -} - /* * Moorestown does not have external NMI source nor port 0x61 to report * NMI status. The possible NMI sources are from pmu as a result of NMI @@ -197,7 +191,6 @@ void __init x86_intel_mid_early_setup(void) x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; x86_platform.calibrate_tsc = intel_mid_calibrate_tsc; - x86_platform.i8042_detect = intel_mid_i8042_detect; x86_init.timers.wallclock_init = intel_mid_rtc_init; x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index 1eb47b6298c2..e793fe509971 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -49,8 +49,13 @@ static unsigned long __init mfld_calibrate_tsc(void) fast_calibrate = ratio * fsb; pr_debug("read penwell tsc %lu khz\n", fast_calibrate); lapic_timer_frequency = fsb * 1000 / HZ; - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + + /* + * TSC on Intel Atom SoCs is reliable and of known frequency. + * See tsc_msr.c for details. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); return fast_calibrate; } diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c index 59253db41bbc..e0607c77a1bd 100644 --- a/arch/x86/platform/intel-mid/mrfld.c +++ b/arch/x86/platform/intel-mid/mrfld.c @@ -78,8 +78,12 @@ static unsigned long __init tangier_calibrate_tsc(void) pr_debug("Setting lapic_timer_frequency = %d\n", lapic_timer_frequency); - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + /* + * TSC on Intel Atom SoCs is reliable and of known frequency. + * See tsc_msr.c for details. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); return fast_calibrate; } diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index f5bad40936ac..b8f562049cad 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -25,7 +25,8 @@ * @fmt: format string. * ... variadic argument list. */ -static void __init imr_self_test_result(int res, const char *fmt, ...) +static __printf(2, 3) +void __init imr_self_test_result(int res, const char *fmt, ...) { va_list vlist; diff --git a/arch/x86/platform/mellanox/Makefile b/arch/x86/platform/mellanox/Makefile deleted file mode 100644 index f43c93188a1d..000000000000 --- a/arch/x86/platform/mellanox/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_MLX_PLATFORM) += mlx-platform.o diff --git a/arch/x86/platform/mellanox/mlx-platform.c b/arch/x86/platform/mellanox/mlx-platform.c deleted file mode 100644 index 7dcfcca97399..000000000000 --- a/arch/x86/platform/mellanox/mlx-platform.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * arch/x86/platform/mellanox/mlx-platform.c - * Copyright (c) 2016 Mellanox Technologies. All rights reserved. - * Copyright (c) 2016 Vadim Pasternak <vadimp@mellanox.com> - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include <linux/device.h> -#include <linux/dmi.h> -#include <linux/i2c.h> -#include <linux/i2c-mux.h> -#include <linux/module.h> -#include <linux/platform_device.h> -#include <linux/platform_data/i2c-mux-reg.h> - -#define MLX_PLAT_DEVICE_NAME "mlxplat" - -/* LPC bus IO offsets */ -#define MLXPLAT_CPLD_LPC_I2C_BASE_ADRR 0x2000 -#define MLXPLAT_CPLD_LPC_REG_BASE_ADRR 0x2500 -#define MLXPLAT_CPLD_LPC_IO_RANGE 0x100 -#define MLXPLAT_CPLD_LPC_I2C_CH1_OFF 0xdb -#define MLXPLAT_CPLD_LPC_I2C_CH2_OFF 0xda -#define MLXPLAT_CPLD_LPC_PIO_OFFSET 0x10000UL -#define MLXPLAT_CPLD_LPC_REG1 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \ - MLXPLAT_CPLD_LPC_I2C_CH1_OFF) | \ - MLXPLAT_CPLD_LPC_PIO_OFFSET) -#define MLXPLAT_CPLD_LPC_REG2 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \ - MLXPLAT_CPLD_LPC_I2C_CH2_OFF) | \ - MLXPLAT_CPLD_LPC_PIO_OFFSET) - -/* Start channel numbers */ -#define MLXPLAT_CPLD_CH1 2 -#define MLXPLAT_CPLD_CH2 10 - -/* Number of LPC attached MUX platform devices */ -#define MLXPLAT_CPLD_LPC_MUX_DEVS 2 - -/* mlxplat_priv - platform private data - * @pdev_i2c - i2c controller platform device - * @pdev_mux - array of mux platform devices - */ -struct mlxplat_priv { - struct platform_device *pdev_i2c; - struct platform_device *pdev_mux[MLXPLAT_CPLD_LPC_MUX_DEVS]; -}; - -/* Regions for LPC I2C controller and LPC base register space */ -static const struct resource mlxplat_lpc_resources[] = { - [0] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_I2C_BASE_ADRR, - MLXPLAT_CPLD_LPC_IO_RANGE, - "mlxplat_cpld_lpc_i2c_ctrl", IORESOURCE_IO), - [1] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_REG_BASE_ADRR, - MLXPLAT_CPLD_LPC_IO_RANGE, - "mlxplat_cpld_lpc_regs", - IORESOURCE_IO), -}; - -/* Platform default channels */ -static const int mlxplat_default_channels[][8] = { - { - MLXPLAT_CPLD_CH1, MLXPLAT_CPLD_CH1 + 1, MLXPLAT_CPLD_CH1 + 2, - MLXPLAT_CPLD_CH1 + 3, MLXPLAT_CPLD_CH1 + 4, MLXPLAT_CPLD_CH1 + - 5, MLXPLAT_CPLD_CH1 + 6, MLXPLAT_CPLD_CH1 + 7 - }, - { - MLXPLAT_CPLD_CH2, MLXPLAT_CPLD_CH2 + 1, MLXPLAT_CPLD_CH2 + 2, - MLXPLAT_CPLD_CH2 + 3, MLXPLAT_CPLD_CH2 + 4, MLXPLAT_CPLD_CH2 + - 5, MLXPLAT_CPLD_CH2 + 6, MLXPLAT_CPLD_CH2 + 7 - }, -}; - -/* Platform channels for MSN21xx system family */ -static const int mlxplat_msn21xx_channels[] = { 1, 2, 3, 4, 5, 6, 7, 8 }; - -/* Platform mux data */ -static struct i2c_mux_reg_platform_data mlxplat_mux_data[] = { - { - .parent = 1, - .base_nr = MLXPLAT_CPLD_CH1, - .write_only = 1, - .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG1, - .reg_size = 1, - .idle_in_use = 1, - }, - { - .parent = 1, - .base_nr = MLXPLAT_CPLD_CH2, - .write_only = 1, - .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG2, - .reg_size = 1, - .idle_in_use = 1, - }, - -}; - -static struct platform_device *mlxplat_dev; - -static int __init mlxplat_dmi_default_matched(const struct dmi_system_id *dmi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { - mlxplat_mux_data[i].values = mlxplat_default_channels[i]; - mlxplat_mux_data[i].n_values = - ARRAY_SIZE(mlxplat_default_channels[i]); - } - - return 1; -}; - -static int __init mlxplat_dmi_msn21xx_matched(const struct dmi_system_id *dmi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { - mlxplat_mux_data[i].values = mlxplat_msn21xx_channels; - mlxplat_mux_data[i].n_values = - ARRAY_SIZE(mlxplat_msn21xx_channels); - } - - return 1; -}; - -static struct dmi_system_id mlxplat_dmi_table[] __initdata = { - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSN24"), - }, - }, - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSN27"), - }, - }, - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSB"), - }, - }, - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSX"), - }, - }, - { - .callback = mlxplat_dmi_msn21xx_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSN21"), - }, - }, - { } -}; - -static int __init mlxplat_init(void) -{ - struct mlxplat_priv *priv; - int i, err; - - if (!dmi_check_system(mlxplat_dmi_table)) - return -ENODEV; - - mlxplat_dev = platform_device_register_simple(MLX_PLAT_DEVICE_NAME, -1, - mlxplat_lpc_resources, - ARRAY_SIZE(mlxplat_lpc_resources)); - - if (IS_ERR(mlxplat_dev)) - return PTR_ERR(mlxplat_dev); - - priv = devm_kzalloc(&mlxplat_dev->dev, sizeof(struct mlxplat_priv), - GFP_KERNEL); - if (!priv) { - err = -ENOMEM; - goto fail_alloc; - } - platform_set_drvdata(mlxplat_dev, priv); - - priv->pdev_i2c = platform_device_register_simple("i2c_mlxcpld", -1, - NULL, 0); - if (IS_ERR(priv->pdev_i2c)) { - err = PTR_ERR(priv->pdev_i2c); - goto fail_alloc; - }; - - for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { - priv->pdev_mux[i] = platform_device_register_resndata( - &mlxplat_dev->dev, - "i2c-mux-reg", i, NULL, - 0, &mlxplat_mux_data[i], - sizeof(mlxplat_mux_data[i])); - if (IS_ERR(priv->pdev_mux[i])) { - err = PTR_ERR(priv->pdev_mux[i]); - goto fail_platform_mux_register; - } - } - - return 0; - -fail_platform_mux_register: - for (i--; i > 0 ; i--) - platform_device_unregister(priv->pdev_mux[i]); - platform_device_unregister(priv->pdev_i2c); -fail_alloc: - platform_device_unregister(mlxplat_dev); - - return err; -} -module_init(mlxplat_init); - -static void __exit mlxplat_exit(void) -{ - struct mlxplat_priv *priv = platform_get_drvdata(mlxplat_dev); - int i; - - for (i = ARRAY_SIZE(mlxplat_mux_data) - 1; i >= 0 ; i--) - platform_device_unregister(priv->pdev_mux[i]); - - platform_device_unregister(priv->pdev_i2c); - platform_device_unregister(mlxplat_dev); -} -module_exit(mlxplat_exit); - -MODULE_AUTHOR("Vadim Pasternak (vadimp@mellanox.com)"); -MODULE_DESCRIPTION("Mellanox platform driver"); -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSN24*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSN27*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSB*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSX*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSN21*:"); diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index b333fc45f9ec..2ee7632d4916 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -30,7 +30,7 @@ #define RTC_NAME "sgi_rtc" -static cycle_t uv_read_rtc(struct clocksource *cs); +static u64 uv_read_rtc(struct clocksource *cs); static int uv_rtc_next_event(unsigned long, struct clock_event_device *); static int uv_rtc_shutdown(struct clock_event_device *evt); @@ -38,7 +38,7 @@ static struct clocksource clocksource_uv = { .name = RTC_NAME, .rating = 299, .read = uv_read_rtc, - .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, + .mask = (u64)UVH_RTC_REAL_TIME_CLOCK_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -296,7 +296,7 @@ static int uv_rtc_unset_timer(int cpu, int force) * cachelines of it's own page. This allows faster simultaneous reads * from a given socket. */ -static cycle_t uv_read_rtc(struct clocksource *cs) +static u64 uv_read_rtc(struct clocksource *cs) { unsigned long offset; @@ -305,7 +305,7 @@ static cycle_t uv_read_rtc(struct clocksource *cs) else offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; - return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); + return (u64)uv_read_local_mmr(UVH_RTC | offset); } /* diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 53cace2ec0e2..66ade16c7693 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -252,6 +252,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); + tsc_verify_tsc_adjust(true); x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); perf_restore_debug_store(); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 9634557a5444..ded2e8272382 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -11,6 +11,10 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/suspend.h> +#include <linux/scatterlist.h> +#include <linux/kdebug.h> + +#include <crypto/hash.h> #include <asm/init.h> #include <asm/proto.h> @@ -177,14 +181,86 @@ int pfn_is_nosave(unsigned long pfn) return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); } +#define MD5_DIGEST_SIZE 16 + struct restore_data_record { unsigned long jump_address; unsigned long jump_address_phys; unsigned long cr3; unsigned long magic; + u8 e820_digest[MD5_DIGEST_SIZE]; }; -#define RESTORE_MAGIC 0x123456789ABCDEF0UL +#define RESTORE_MAGIC 0x23456789ABCDEF01UL + +#if IS_BUILTIN(CONFIG_CRYPTO_MD5) +/** + * get_e820_md5 - calculate md5 according to given e820 map + * + * @map: the e820 map to be calculated + * @buf: the md5 result to be stored to + */ +static int get_e820_md5(struct e820map *map, void *buf) +{ + struct scatterlist sg; + struct crypto_ahash *tfm; + int size; + int ret = 0; + + tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return -ENOMEM; + + { + AHASH_REQUEST_ON_STACK(req, tfm); + size = offsetof(struct e820map, map) + + sizeof(struct e820entry) * map->nr_map; + ahash_request_set_tfm(req, tfm); + sg_init_one(&sg, (u8 *)map, size); + ahash_request_set_callback(req, 0, NULL, NULL); + ahash_request_set_crypt(req, &sg, buf, size); + + if (crypto_ahash_digest(req)) + ret = -EINVAL; + ahash_request_zero(req); + } + crypto_free_ahash(tfm); + + return ret; +} + +static void hibernation_e820_save(void *buf) +{ + get_e820_md5(e820_saved, buf); +} + +static bool hibernation_e820_mismatch(void *buf) +{ + int ret; + u8 result[MD5_DIGEST_SIZE]; + + memset(result, 0, MD5_DIGEST_SIZE); + /* If there is no digest in suspend kernel, let it go. */ + if (!memcmp(result, buf, MD5_DIGEST_SIZE)) + return false; + + ret = get_e820_md5(e820_saved, result); + if (ret) + return true; + + return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; +} +#else +static void hibernation_e820_save(void *buf) +{ +} + +static bool hibernation_e820_mismatch(void *buf) +{ + /* If md5 is not builtin for restore kernel, let it go. */ + return false; +} +#endif /** * arch_hibernation_header_save - populate the architecture specific part @@ -201,6 +277,9 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) rdr->jump_address_phys = __pa_symbol(&restore_registers); rdr->cr3 = restore_cr3; rdr->magic = RESTORE_MAGIC; + + hibernation_e820_save(rdr->e820_digest); + return 0; } @@ -216,5 +295,16 @@ int arch_hibernation_header_restore(void *addr) restore_jump_address = rdr->jump_address; jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; - return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; + + if (rdr->magic != RESTORE_MAGIC) { + pr_crit("Unrecognized hibernate image header format!\n"); + return -EINVAL; + } + + if (hibernation_e820_mismatch(rdr->e820_digest)) { + pr_crit("Hibernate inconsistent memory map detected!\n"); + return -ENODEV; + } + + return 0; } diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 0c2fae8d929d..73eb7fd4aec4 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -992,11 +992,12 @@ static void emit_relocs(int as_text, int use_real_mode) die("Segment relocations found but --realmode not specified\n"); /* Order the relocations for more efficient processing */ - sort_relocs(&relocs16); sort_relocs(&relocs32); #if ELF_BITS == 64 sort_relocs(&relocs32neg); sort_relocs(&relocs64); +#else + sort_relocs(&relocs16); #endif /* Print the relocations */ diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index 60a5a5a85505..2497bac56066 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -5,7 +5,7 @@ #include <linux/mm.h> #include <linux/sched.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ptrace-abi.h> #include <skas.h> diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index e30202b1716e..a5c9910d234f 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -10,7 +10,7 @@ #include <linux/errno.h> #define __FRAME_OFFSETS #include <asm/ptrace.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ptrace-abi.h> /* diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 49e503697022..727ed442e0a5 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -9,7 +9,7 @@ #include <linux/ptrace.h> #include <linux/kernel.h> #include <asm/unistd.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ucontext.h> #include <frame_kern.h> #include <skas.h> diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index 48e38584d5c1..5bd949da7a4a 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -6,7 +6,7 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/syscalls.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ptrace-abi.h> #include <os.h> #include <skas.h> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ced7027b3fbc..51ef95232725 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1529,11 +1529,11 @@ static int xen_cpuhp_setup(void) int rc; rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, - "XEN_HVM_GUEST_PREPARE", + "x86/xen/hvm_guest:prepare", xen_cpu_up_prepare, xen_cpu_dead); if (rc >= 0) { rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "XEN_HVM_GUEST_ONLINE", + "x86/xen/hvm_guest:online", xen_cpu_up_online, NULL); if (rc < 0) cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 37129db76d33..276da636dd39 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -71,7 +71,7 @@ #include <asm/cache.h> #include <asm/setup.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/xen/page.h> #include <asm/xen/hypercall.h> diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 0e98e5d241d0..a0b36a9d5df1 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -19,7 +19,6 @@ int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { - .mapping_error = xen_swiotlb_dma_mapping_error, .alloc = xen_swiotlb_alloc_coherent, .free = xen_swiotlb_free_coherent, .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, @@ -49,7 +48,7 @@ int __init pci_xen_swiotlb_detect(void) * activate this IOMMU. If running as PV privileged, activate it * irregardless. */ - if ((xen_initial_domain() || swiotlb || swiotlb_force)) + if (xen_initial_domain() || swiotlb || swiotlb_force == SWIOTLB_FORCE) xen_swiotlb = 1; /* If we are running under Xen, we MUST disable the native SWIOTLB. diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f8960fca0827..f3f7b41116f7 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -41,7 +41,7 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; unsigned long xen_released_pages; /* E820 map used during setting up memory. */ -static struct e820entry xen_e820_map[E820MAX] __initdata; +static struct e820entry xen_e820_map[E820_X_MAX] __initdata; static u32 xen_e820_map_entries __initdata; /* @@ -713,10 +713,9 @@ static void __init xen_reserve_xen_mfnlist(void) size = PFN_PHYS(xen_start_info->nr_p2m_frames); } - if (!xen_is_e820_reserved(start, size)) { - memblock_reserve(start, size); + memblock_reserve(start, size); + if (!xen_is_e820_reserved(start, size)) return; - } #ifdef CONFIG_X86_32 /* @@ -727,6 +726,7 @@ static void __init xen_reserve_xen_mfnlist(void) BUG(); #else xen_relocate_p2m(); + memblock_free(start, size); #endif } @@ -750,7 +750,7 @@ char * __init xen_memory_setup(void) max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); - memmap.nr_entries = E820MAX; + memmap.nr_entries = ARRAY_SIZE(xen_e820_map); set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? @@ -923,7 +923,7 @@ char * __init xen_auto_xlated_memory_setup(void) int i; int rc; - memmap.nr_entries = E820MAX; + memmap.nr_entries = ARRAY_SIZE(xen_e820_map); set_xen_guest_handle(memmap.buffer, xen_e820_map); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 9fa27ceeecfd..311acad7dad2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -87,12 +87,6 @@ static void cpu_bringup(void) cpu_data(cpu).x86_max_cores = 1; set_cpu_sibling_map(cpu); - /* - * identify_cpu() may have set logical_pkg_id to -1 due - * to incorrect phys_proc_id. Let's re-comupte it. - */ - topology_update_package_map(apic->cpu_present_to_apicid(cpu), cpu); - xen_setup_cpu_clockevents(); notify_cpu_starting(cpu); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 33d8f6a7829d..1e69956d7852 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -39,10 +39,10 @@ static unsigned long xen_tsc_khz(void) return pvclock_tsc_khz(info); } -cycle_t xen_clocksource_read(void) +u64 xen_clocksource_read(void) { struct pvclock_vcpu_time_info *src; - cycle_t ret; + u64 ret; preempt_disable_notrace(); src = &__this_cpu_read(xen_vcpu)->time; @@ -51,7 +51,7 @@ cycle_t xen_clocksource_read(void) return ret; } -static cycle_t xen_clocksource_get_cycles(struct clocksource *cs) +static u64 xen_clocksource_get_cycles(struct clocksource *cs) { return xen_clocksource_read(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3cbce3b085e7..ac0a2b0f9e62 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -67,7 +67,7 @@ void xen_init_irq_ops(void); void xen_setup_timer(int cpu); void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); -cycle_t xen_clocksource_read(void); +u64 xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); void __init xen_init_time_ops(void); void __init xen_hvm_init_time_ops(void); |