diff options
71 files changed, 2899 insertions, 128 deletions
diff --git a/Documentation/kmemcheck.txt b/Documentation/kmemcheck.txt new file mode 100644 index 00000000000..363044609da --- /dev/null +++ b/Documentation/kmemcheck.txt @@ -0,0 +1,773 @@ +GETTING STARTED WITH KMEMCHECK +============================== + +Vegard Nossum <vegardno@ifi.uio.no> + + +Contents +======== +0. Introduction +1. Downloading +2. Configuring and compiling +3. How to use +3.1. Booting +3.2. Run-time enable/disable +3.3. Debugging +3.4. Annotating false positives +4. Reporting errors +5. Technical description + + +0. Introduction +=============== + +kmemcheck is a debugging feature for the Linux Kernel. More specifically, it +is a dynamic checker that detects and warns about some uses of uninitialized +memory. + +Userspace programmers might be familiar with Valgrind's memcheck. The main +difference between memcheck and kmemcheck is that memcheck works for userspace +programs only, and kmemcheck works for the kernel only. The implementations +are of course vastly different. Because of this, kmemcheck is not as accurate +as memcheck, but it turns out to be good enough in practice to discover real +programmer errors that the compiler is not able to find through static +analysis. + +Enabling kmemcheck on a kernel will probably slow it down to the extent that +the machine will not be usable for normal workloads such as e.g. an +interactive desktop. kmemcheck will also cause the kernel to use about twice +as much memory as normal. For this reason, kmemcheck is strictly a debugging +feature. + + +1. Downloading +============== + +kmemcheck can only be downloaded using git. If you want to write patches +against the current code, you should use the kmemcheck development branch of +the tip tree. It is also possible to use the linux-next tree, which also +includes the latest version of kmemcheck. + +Assuming that you've already cloned the linux-2.6.git repository, all you +have to do is add the -tip tree as a remote, like this: + + $ git remote add tip git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git + +To actually download the tree, fetch the remote: + + $ git fetch tip + +And to check out a new local branch with the kmemcheck code: + + $ git checkout -b kmemcheck tip/kmemcheck + +General instructions for the -tip tree can be found here: +http://people.redhat.com/mingo/tip.git/readme.txt + + +2. Configuring and compiling +============================ + +kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of +configuration variables must have specific settings in order for the kmemcheck +menu to even appear in "menuconfig". These are: + + o CONFIG_CC_OPTIMIZE_FOR_SIZE=n + + This option is located under "General setup" / "Optimize for size". + + Without this, gcc will use certain optimizations that usually lead to + false positive warnings from kmemcheck. An example of this is a 16-bit + field in a struct, where gcc may load 32 bits, then discard the upper + 16 bits. kmemcheck sees only the 32-bit load, and may trigger a + warning for the upper 16 bits (if they're uninitialized). + + o CONFIG_SLAB=y or CONFIG_SLUB=y + + This option is located under "General setup" / "Choose SLAB + allocator". + + o CONFIG_FUNCTION_TRACER=n + + This option is located under "Kernel hacking" / "Tracers" / "Kernel + Function Tracer" + + When function tracing is compiled in, gcc emits a call to another + function at the beginning of every function. This means that when the + page fault handler is called, the ftrace framework will be called + before kmemcheck has had a chance to handle the fault. If ftrace then + modifies memory that was tracked by kmemcheck, the result is an + endless recursive page fault. + + o CONFIG_DEBUG_PAGEALLOC=n + + This option is located under "Kernel hacking" / "Debug page memory + allocations". + +In addition, I highly recommend turning on CONFIG_DEBUG_INFO=y. This is also +located under "Kernel hacking". With this, you will be able to get line number +information from the kmemcheck warnings, which is extremely valuable in +debugging a problem. This option is not mandatory, however, because it slows +down the compilation process and produces a much bigger kernel image. + +Now the kmemcheck menu should be visible (under "Kernel hacking" / "kmemcheck: +trap use of uninitialized memory"). Here follows a description of the +kmemcheck configuration variables: + + o CONFIG_KMEMCHECK + + This must be enabled in order to use kmemcheck at all... + + o CONFIG_KMEMCHECK_[DISABLED | ENABLED | ONESHOT]_BY_DEFAULT + + This option controls the status of kmemcheck at boot-time. "Enabled" + will enable kmemcheck right from the start, "disabled" will boot the + kernel as normal (but with the kmemcheck code compiled in, so it can + be enabled at run-time after the kernel has booted), and "one-shot" is + a special mode which will turn kmemcheck off automatically after + detecting the first use of uninitialized memory. + + If you are using kmemcheck to actively debug a problem, then you + probably want to choose "enabled" here. + + The one-shot mode is mostly useful in automated test setups because it + can prevent floods of warnings and increase the chances of the machine + surviving in case something is really wrong. In other cases, the one- + shot mode could actually be counter-productive because it would turn + itself off at the very first error -- in the case of a false positive + too -- and this would come in the way of debugging the specific + problem you were interested in. + + If you would like to use your kernel as normal, but with a chance to + enable kmemcheck in case of some problem, it might be a good idea to + choose "disabled" here. When kmemcheck is disabled, most of the run- + time overhead is not incurred, and the kernel will be almost as fast + as normal. + + o CONFIG_KMEMCHECK_QUEUE_SIZE + + Select the maximum number of error reports to store in an internal + (fixed-size) buffer. Since errors can occur virtually anywhere and in + any context, we need a temporary storage area which is guaranteed not + to generate any other page faults when accessed. The queue will be + emptied as soon as a tasklet may be scheduled. If the queue is full, + new error reports will be lost. + + The default value of 64 is probably fine. If some code produces more + than 64 errors within an irqs-off section, then the code is likely to + produce many, many more, too, and these additional reports seldom give + any more information (the first report is usually the most valuable + anyway). + + This number might have to be adjusted if you are not using serial + console or similar to capture the kernel log. If you are using the + "dmesg" command to save the log, then getting a lot of kmemcheck + warnings might overflow the kernel log itself, and the earlier reports + will get lost in that way instead. Try setting this to 10 or so on + such a setup. + + o CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT + + Select the number of shadow bytes to save along with each entry of the + error-report queue. These bytes indicate what parts of an allocation + are initialized, uninitialized, etc. and will be displayed when an + error is detected to help the debugging of a particular problem. + + The number entered here is actually the logarithm of the number of + bytes that will be saved. So if you pick for example 5 here, kmemcheck + will save 2^5 = 32 bytes. + + The default value should be fine for debugging most problems. It also + fits nicely within 80 columns. + + o CONFIG_KMEMCHECK_PARTIAL_OK + + This option (when enabled) works around certain GCC optimizations that + produce 32-bit reads from 16-bit variables where the upper 16 bits are + thrown away afterwards. + + The default value (enabled) is recommended. This may of course hide + some real errors, but disabling it would probably produce a lot of + false positives. + + o CONFIG_KMEMCHECK_BITOPS_OK + + This option silences warnings that would be generated for bit-field + accesses where not all the bits are initialized at the same time. This + may also hide some real bugs. + + This option is probably obsolete, or it should be replaced with + the kmemcheck-/bitfield-annotations for the code in question. The + default value is therefore fine. + +Now compile the kernel as usual. + + +3. How to use +============= + +3.1. Booting +============ + +First some information about the command-line options. There is only one +option specific to kmemcheck, and this is called "kmemcheck". It can be used +to override the default mode as chosen by the CONFIG_KMEMCHECK_*_BY_DEFAULT +option. Its possible settings are: + + o kmemcheck=0 (disabled) + o kmemcheck=1 (enabled) + o kmemcheck=2 (one-shot mode) + +If SLUB debugging has been enabled in the kernel, it may take precedence over +kmemcheck in such a way that the slab caches which are under SLUB debugging +will not be tracked by kmemcheck. In order to ensure that this doesn't happen +(even though it shouldn't by default), use SLUB's boot option "slub_debug", +like this: slub_debug=- + +In fact, this option may also be used for fine-grained control over SLUB vs. +kmemcheck. For example, if the command line includes "kmemcheck=1 +slub_debug=,dentry", then SLUB debugging will be used only for the "dentry" +slab cache, and with kmemcheck tracking all the other caches. This is advanced +usage, however, and is not generally recommended. + + +3.2. Run-time enable/disable +============================ + +When the kernel has booted, it is possible to enable or disable kmemcheck at +run-time. WARNING: This feature is still experimental and may cause false +positive warnings to appear. Therefore, try not to use this. If you find that +it doesn't work properly (e.g. you see an unreasonable amount of warnings), I +will be happy to take bug reports. + +Use the file /proc/sys/kernel/kmemcheck for this purpose, e.g.: + + $ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck + +The numbers are the same as for the kmemcheck= command-line option. + + +3.3. Debugging +============== + +A typical report will look something like this: + +WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024) +80000000000000000000000000000000000000000088ffff0000000000000000 + i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u + ^ + +Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A +RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190 +RSP: 0018:ffff88003cdf7d98 EFLAGS: 00210002 +RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009 +RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84 +RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000 +R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e +R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8 +FS: 0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000 +CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 +CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400 + [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170 + [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390 + [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0 + [<ffffffff8100c7b5>] int_signal+0x12/0x17 + [<ffffffffffffffff>] 0xffffffffffffffff + +The single most valuable information in this report is the RIP (or EIP on 32- +bit) value. This will help us pinpoint exactly which instruction that caused +the warning. + +If your kernel was compiled with CONFIG_DEBUG_INFO=y, then all we have to do +is give this address to the addr2line program, like this: + + $ addr2line -e vmlinux -i ffffffff8104ede8 + arch/x86/include/asm/string_64.h:12 + include/asm-generic/siginfo.h:287 + kernel/signal.c:380 + kernel/signal.c:410 + +The "-e vmlinux" tells addr2line which file to look in. IMPORTANT: This must +be the vmlinux of the kernel that produced the warning in the first place! If +not, the line number information will almost certainly be wrong. + +The "-i" tells addr2line to also print the line numbers of inlined functions. +In this case, the flag was very important, because otherwise, it would only +have printed the first line, which is just a call to memcpy(), which could be +called from a thousand places in the kernel, and is therefore not very useful. +These inlined functions would not show up in the stack trace above, simply +because the kernel doesn't load the extra debugging information. This +technique can of course be used with ordinary kernel oopses as well. + +In this case, it's the caller of memcpy() that is interesting, and it can be +found in include/asm-generic/siginfo.h, line 287: + +281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) +282 { +283 if (from->si_code < 0) +284 memcpy(to, from, sizeof(*to)); +285 else +286 /* _sigchld is currently the largest know union member */ +287 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld)); +288 } + +Since this was a read (kmemcheck usually warns about reads only, though it can +warn about writes to unallocated or freed memory as well), it was probably the +"from" argument which contained some uninitialized bytes. Following the chain +of calls, we move upwards to see where "from" was allocated or initialized, +kernel/signal.c, line 380: + +359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) +360 { +... +367 list_for_each_entry(q, &list->list, list) { +368 if (q->info.si_signo == sig) { +369 if (first) +370 goto still_pending; +371 first = q; +... +377 if (first) { +378 still_pending: +379 list_del_init(&first->list); +380 copy_siginfo(info, &first->info); +381 __sigqueue_free(first); +... +392 } +393 } + +Here, it is &first->info that is being passed on to copy_siginfo(). The +variable "first" was found on a list -- passed in as the second argument to +collect_signal(). We continue our journey through the stack, to figure out +where the item on "list" was allocated or initialized. We move to line 410: + +395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, +396 siginfo_t *info) +397 { +... +410 collect_signal(sig, pending, info); +... +414 } + +Now we need to follow the "pending" pointer, since that is being passed on to +collect_signal() as "list". At this point, we've run out of lines from the +"addr2line" output. Not to worry, we just paste the next addresses from the +kmemcheck stack dump, i.e.: + + [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170 + [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390 + [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0 + [<ffffffff8100c7b5>] int_signal+0x12/0x17 + + $ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \ + ffffffff8100b87d ffffffff8100c7b5 + kernel/signal.c:446 + kernel/signal.c:1806 + arch/x86/kernel/signal.c:805 + arch/x86/kernel/signal.c:871 + arch/x86/kernel/entry_64.S:694 + +Remember that since these addresses were found on the stack and not as the +RIP value, they actually point to the _next_ instruction (they are return +addresses). This becomes obvious when we look at the code for line 446: + +422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) +423 { +... +431 signr = __dequeue_signal(&tsk->signal->shared_pending, +432 mask, info); +433 /* +434 * itimer signal ? +435 * +436 * itimers are process shared and we restart periodic +437 * itimers in the signal delivery path to prevent DoS +438 * attacks in the high resolution timer case. This is +439 * compliant with the old way of self restarting +440 * itimers, as the SIGALRM is a legacy signal and only +441 * queued once. Changing the restart behaviour to +442 * restart the timer in the signal dequeue path is +443 * reducing the timer noise on heavy loaded !highres +444 * systems too. +445 */ +446 if (unlikely(signr == SIGALRM)) { +... +489 } + +So instead of looking at 446, we should be looking at 431, which is the line +that executes just before 446. Here we see that what we are looking for is +&tsk->signal->shared_pending. + +Our next task is now to figure out which function that puts items on this +"shared_pending" list. A crude, but efficient tool, is git grep: + + $ git grep -n 'shared_pending' kernel/ + ... + kernel/signal.c:828: pending = group ? &t->signal->shared_pending : &t->pending; + kernel/signal.c:1339: pending = group ? &t->signal->shared_pending : &t->pending; + ... + +There were more results, but none of them were related to list operations, +and these were the only assignments. We inspect the line numbers more closely +and find that this is indeed where items are being added to the list: + +816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t, +817 int group) +818 { +... +828 pending = group ? &t->signal->shared_pending : &t->pending; +... +851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && +852 (is_si_special(info) || +853 info->si_code >= 0))); +854 if (q) { +855 list_add_tail(&q->list, &pending->list); +... +890 } + +and: + +1309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) +1310 { +.... +1339 pending = group ? &t->signal->shared_pending : &t->pending; +1340 list_add_tail(&q->list, &pending->list); +.... +1347 } + +In the first case, the list element we are looking for, "q", is being returned +from the function __sigqueue_alloc(), which looks like an allocation function. +Let's take a look at it: + +187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, +188 int override_rlimit) +189 { +190 struct sigqueue *q = NULL; +191 struct user_struct *user; +192 +193 /* +194 * We won't get problems with the target's UID changing under us +195 * because changing it requires RCU be used, and if t != current, the +196 * caller must be holding the RCU readlock (by way of a spinlock) and +197 * we use RCU protection here +198 */ +199 user = get_uid(__task_cred(t)->user); +200 atomic_inc(&user->sigpending); +201 if (override_rlimit || +202 atomic_read(&user->sigpending) <= +203 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) +204 q = kmem_cache_alloc(sigqueue_cachep, flags); +205 if (unlikely(q == NULL)) { +206 atomic_dec(&user->sigpending); +207 free_uid(user); +208 } else { +209 INIT_LIST_HEAD(&q->list); +210 q->flags = 0; +211 q->user = user; +212 } +213 +214 return q; +215 } + +We see that this function initializes q->list, q->flags, and q->user. It seems +that now is the time to look at the definition of "struct sigqueue", e.g.: + +14 struct sigqueue { +15 struct list_head list; +16 int flags; +17 siginfo_t info; +18 struct user_struct *user; +19 }; + +And, you might remember, it was a memcpy() on &first->info that caused the +warning, so this makes perfect sense. It also seems reasonable to assume that +it is the caller of __sigqueue_alloc() that has the responsibility of filling +out (initializing) this member. + +But just which fields of the struct were uninitialized? Let's look at +kmemcheck's report again: + +WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024) +80000000000000000000000000000000000000000088ffff0000000000000000 + i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u + ^ + +These first two lines are the memory dump of the memory object itself, and the +shadow bytemap, respectively. The memory object itself is in this case +&first->info. Just beware that the start of this dump is NOT the start of the +object itself! The position of the caret (^) corresponds with the address of +the read (ffff88003e4a2024). + +The shadow bytemap dump legend is as follows: + + i - initialized + u - uninitialized + a - unallocated (memory has been allocated by the slab layer, but has not + yet been handed off to anybody) + f - freed (memory has been allocated by the slab layer, but has been freed + by the previous owner) + +In order to figure out where (relative to the start of the object) the +uninitialized memory was located, we have to look at the disassembly. For +that, we'll need the RIP address again: + +RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190 + + $ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8: + ffffffff8104edc8: mov %r8,0x8(%r8) + ffffffff8104edcc: test %r10d,%r10d + ffffffff8104edcf: js ffffffff8104ee88 <__dequeue_signal+0x168> + ffffffff8104edd5: mov %rax,%rdx + ffffffff8104edd8: mov $0xc,%ecx + ffffffff8104eddd: mov %r13,%rdi + ffffffff8104ede0: mov $0x30,%eax + ffffffff8104ede5: mov %rdx,%rsi + ffffffff8104ede8: rep movsl %ds:(%rsi),%es:(%rdi) + ffffffff8104edea: test $0x2,%al + ffffffff8104edec: je ffffffff8104edf0 <__dequeue_signal+0xd0> + ffffffff8104edee: movsw %ds:(%rsi),%es:(%rdi) + ffffffff8104edf0: test $0x1,%al + ffffffff8104edf2: je ffffffff8104edf5 <__dequeue_signal+0xd5> + ffffffff8104edf4: movsb %ds:(%rsi),%es:(%rdi) + ffffffff8104edf5: mov %r8,%rdi + ffffffff8104edf8: callq ffffffff8104de60 <__sigqueue_free> + +As expected, it's the "rep movsl" instruction from the memcpy() that causes +the warning. We know about REP MOVSL that it uses the register RCX to count +the number of remaining iterations. By taking a look at the register dump +again (from the kmemcheck report), we can figure out how many bytes were left +to copy: + +RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009 + +By looking at the disassembly, we also see that %ecx is being loaded with the +value $0xc just before (ffffffff8104edd8), so we are very lucky. Keep in mind +that this is the number of iterations, not bytes. And since this is a "long" +operation, we need to multiply by 4 to get the number of bytes. So this means +that the uninitialized value was encountered at 4 * (0xc - 0x9) = 12 bytes +from the start of the object. + +We can now try to figure out which field of the "struct siginfo" that was not +initialized. This is the beginning of the struct: + +40 typedef struct siginfo { +41 int si_signo; +42 int si_errno; +43 int si_code; +44 +45 union { +.. +92 } _sifields; +93 } siginfo_t; + +On 64-bit, the int is 4 bytes long, so it must the the union member that has +not been initialized. We can verify this using gdb: + + $ gdb vmlinux + ... + (gdb) p &((struct siginfo *) 0)->_sifields + $1 = (union {...} *) 0x10 + +Actually, it seems that the union member is located at offset 0x10 -- which +means that gcc has inserted 4 bytes of padding between the members si_code +and _sifields. We can now get a fuller picture of the memory dump: + + _----------------------------=> si_code + / _--------------------=> (padding) + | / _------------=> _sifields(._kill._pid) + | | / _----=> _sifields(._kill._uid) + | | | / +-------|-------|-------|-------| +80000000000000000000000000000000000000000088ffff0000000000000000 + i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u + +This allows us to realize another important fact: si_code contains the value +0x80. Remember that x86 is little endian, so the first 4 bytes "80000000" are +really the number 0x00000080. With a bit of research, we find that this is +actually the constant SI_KERNEL defined in include/asm-generic/siginfo.h: + +144 #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */ + +This macro is used in exactly one place in the x86 kernel: In send_signal() +in kernel/signal.c: + +816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t, +817 int group) +818 { +... +828 pending = group ? &t->signal->shared_pending : &t->pending; +... +851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && +852 (is_si_special(info) || +853 info->si_code >= 0))); +854 if (q) { +855 list_add_tail(&q->list, &pending->list); +856 switch ((unsigned long) info) { +... +865 case (unsigned long) SEND_SIG_PRIV: +866 q->info.si_signo = sig; +867 q->info.si_errno = 0; +868 q->info.si_code = SI_KERNEL; +869 q->info.si_pid = 0; +870 q->info.si_uid = 0; +871 break; +... +890 } + +Not only does this match with the .si_code member, it also matches the place +we found earlier when looking for where siginfo_t objects are enqueued on the +"shared_pending" list. + +So to sum up: It seems that it is the padding introduced by the compiler +between two struct fields that is uninitialized, and this gets reported when +we do a memcpy() on the struct. This means that we have identified a false +positive warning. + +Normally, kmemcheck will not report uninitialized accesses in memcpy() calls +when both the source and destination addresses are tracked. (Instead, we copy +the shadow bytemap as well). In this case, the destination address clearly +was not tracked. We can dig a little deeper into the stack trace from above: + + arch/x86/kernel/signal.c:805 + arch/x86/kernel/signal.c:871 + arch/x86/kernel/entry_64.S:694 + +And we clearly see that the destination siginfo object is located on the +stack: + +782 static void do_signal(struct pt_regs *regs) +783 { +784 struct k_sigaction ka; +785 siginfo_t info; +... +804 signr = get_signal_to_deliver(&info, &ka, regs, NULL); +... +854 } + +And this &info is what eventually gets passed to copy_siginfo() as the +destination argument. + +Now, even though we didn't find an actual error here, the example is still a +good one, because it shows how one would go about to find out what the report +was all about. + + +3.4. Annotating false positives +=============================== + +There are a few different ways to make annotations in the source code that +will keep kmemcheck from checking and reporting certain allocations. Here +they are: + + o __GFP_NOTRACK_FALSE_POSITIVE + + This flag can be passed to kmalloc() or kmem_cache_alloc() (therefore + also to other functions that end up calling one of these) to indicate + that the allocation should not be tracked because it would lead to + a false positive report. This is a "big hammer" way of silencing + kmemcheck; after all, even if the false positive pertains to + particular field in a struct, for example, we will now lose the + ability to find (real) errors in other parts of the same struct. + + Example: + + /* No warnings will ever trigger on accessing any part of x */ + x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE); + + o kmemcheck_bitfield_begin(name)/kmemcheck_bitfield_end(name) and + kmemcheck_annotate_bitfield(ptr, name) + + The first two of these three macros can be used inside struct + definitions to signal, respectively, the beginning and end of a + bitfield. Additionally, this will assign the bitfield a name, which + is given as an argument to the macros. + + Having used these markers, one can later use + kmemcheck_annotate_bitfield() at the point of allocation, to indicate + which parts of the allocation is part of a bitfield. + + Example: + + struct foo { + int x; + + kmemcheck_bitfield_begin(flags); + int flag_a:1; + int flag_b:1; + kmemcheck_bitfield_end(flags); + + int y; + }; + + struct foo *x = kmalloc(sizeof *x); + + /* No warnings will trigger on accessing the bitfield of x */ + kmemcheck_annotate_bitfield(x, flags); + + Note that kmemcheck_annotate_bitfield() can be used even before the + return value of kmalloc() is checked -- in other words, passing NULL + as the first argument is legal (and will do nothing). + + +4. Reporting errors +=================== + +As we have seen, kmemcheck will produce false positive reports. Therefore, it +is not very wise to blindly post kmemcheck warnings to mailing lists and +maintainers. Instead, I encourage maintainers and developers to find errors +in their own code. If you get a warning, you can try to work around it, try +to figure out if it's a real error or not, or simply ignore it. Most +developers know their own code and will quickly and efficiently determine the +root cause of a kmemcheck report. This is therefore also the most efficient +way to work with kmemcheck. + +That said, we (the kmemcheck maintainers) will always be on the lookout for +false positives that we can annotate and silence. So whatever you find, +please drop us a note privately! Kernel configs and steps to reproduce (if +available) are of course a great help too. + +Happy hacking! + + +5. Technical description +======================== + +kmemcheck works by marking memory pages non-present. This means that whenever +somebody attempts to access the page, a page fault is generated. The page +fault handler notices that the page was in fact only hidden, and so it calls +on the kmemcheck code to make further investigations. + +When the investigations are completed, kmemcheck "shows" the page by marking +it present (as it would be under normal circumstances). This way, the +interrupted code can continue as usual. + +But after the instruction has been executed, we should hide the page again, so +that we can catch the next access too! Now kmemcheck makes use of a debugging +feature of the processor, namely single-stepping. When the processor has +finished the one instruction that generated the memory access, a debug +exception is raised. From here, we simply hide the page again and continue +execution, this time with the single-stepping feature turned off. + +kmemcheck requires some assistance from the memory allocator in order to work. +The memory allocator needs to + + 1. Tell kmemcheck about newly allocated pages and pages that are about to + be freed. This allows kmemcheck to set up and tear down the shadow memory + for the pages in question. The shadow memory stores the status of each + byte in the allocation proper, e.g. whether it is initialized or + uninitialized. + + 2. Tell kmemcheck which parts of memory should be marked uninitialized. + There are actually a few more states, such as "not yet allocated" and + "recently freed". + +If a slab cache is set up using the SLAB_NOTRACK flag, it will never return +memory that can take page faults because of kmemcheck. + +If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still +request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags. +This does not prevent the page faults from occurring, however, but marks the +object in question as being initialized so that no warnings will ever be +produced for this object. + +Currently, the SLAB and SLUB allocators are supported by kmemcheck. diff --git a/MAINTAINERS b/MAINTAINERS index 685784cc023..af8ef6527f2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3406,6 +3406,14 @@ F: drivers/serial/kgdboc.c F: include/linux/kgdb.h F: kernel/kgdb.c +KMEMCHECK +P: Vegard Nossum +M: vegardno@ifi.uio.no +P Pekka Enberg +M: penberg@cs.helsinki.fi +L: linux-kernel@vger.kernel.org +S: Maintained + KMEMLEAK P: Catalin Marinas M: catalin.marinas@arm.com diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 356d2ec8e2f..cf42fc30541 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -46,6 +46,7 @@ config X86 select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_ARCH_KMEMCHECK config OUTPUT_FORMAT string diff --git a/arch/x86/Makefile b/arch/x86/Makefile index edbd0ca6206..1b68659c41b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -81,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR endif endif +# Don't unroll struct assignments with kmemcheck enabled +ifeq ($(CONFIG_KMEMCHECK),y) + KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) +endif + # Stackpointer is addressed different for 32 bit and 64 bit x86 sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index f82fdc412c6..b93405b228b 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -6,6 +6,7 @@ * Documentation/DMA-API.txt for documentation. */ +#include <linux/kmemcheck.h> #include <linux/scatterlist.h> #include <linux/dma-debug.h> #include <linux/dma-attrs.h> @@ -60,6 +61,7 @@ dma_map_single(struct device *hwdev, void *ptr, size_t size, dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); + kmemcheck_mark_initialized(ptr, size); addr = ops->map_page(hwdev, virt_to_page(ptr), (unsigned long)ptr & ~PAGE_MASK, size, dir, NULL); @@ -87,8 +89,12 @@ dma_map_sg(struct device *hwdev, struct scatterlist *sg, { struct dma_map_ops *ops = get_dma_ops(hwdev); int ents; + struct scatterlist *s; + int i; BUG_ON(!valid_dma_direction(dir)); + for_each_sg(sg, s, nents, i) + kmemcheck_mark_initialized(sg_virt(s), s->length); ents = ops->map_sg(hwdev, sg, nents, dir, NULL); debug_dma_map_sg(hwdev, sg, nents, ents, dir); @@ -200,6 +206,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); + kmemcheck_mark_initialized(page_address(page) + offset, size); addr = ops->map_page(dev, page, offset, size, dir, NULL); debug_dma_map_page(dev, page, offset, size, dir, addr, false); diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h new file mode 100644 index 00000000000..ed01518f297 --- /dev/null +++ b/arch/x86/include/asm/kmemcheck.h @@ -0,0 +1,42 @@ +#ifndef ASM_X86_KMEMCHECK_H +#define ASM_X86_KMEMCHECK_H + +#include <linux/types.h> +#include <asm/ptrace.h> + +#ifdef CONFIG_KMEMCHECK +bool kmemcheck_active(struct pt_regs *regs); + +void kmemcheck_show(struct pt_regs *regs); +void kmemcheck_hide(struct pt_regs *regs); + +bool kmemcheck_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code); +bool kmemcheck_trap(struct pt_regs *regs); +#else +static inline bool kmemcheck_active(struct pt_regs *regs) +{ + return false; +} + +static inline void kmemcheck_show(struct pt_regs *regs) +{ +} + +static inline void kmemcheck_hide(struct pt_regs *regs) +{ +} + +static inline bool kmemcheck_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code) +{ + return false; +} + +static inline bool kmemcheck_trap(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_KMEMCHECK */ + +#endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 18ef7ebf263..3cc06e3fceb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -317,6 +317,11 @@ static inline int pte_present(pte_t a) return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } +static inline int pte_hidden(pte_t pte) +{ + return pte_flags(pte) & _PAGE_HIDDEN; +} + static inline int pmd_present(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_PRESENT; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 4d258ad76a0..54cb697f490 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -18,7 +18,7 @@ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_UNUSED1 9 /* available for programmer */ #define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ -#define _PAGE_BIT_UNUSED3 11 +#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 @@ -41,13 +41,18 @@ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) -#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) #define __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_KMEMCHECK +#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#else +#define _PAGE_HIDDEN (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 0e0e3ba827f..c86f452256d 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) * No 3D Now! */ +#ifndef CONFIG_KMEMCHECK #define memcpy(t, f, n) \ (__builtin_constant_p((n)) \ ? __constant_memcpy((t), (f), (n)) \ : __memcpy((t), (f), (n))) +#else +/* + * kmemcheck becomes very happy if we use the REP instructions unconditionally, + * because it means that we know both memory operands in advance. + */ +#define memcpy(t, f, n) __memcpy((t), (f), (n)) +#endif #endif diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 2afe164bf1e..19e2c468fc2 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -27,6 +27,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t function. */ #define __HAVE_ARCH_MEMCPY 1 +#ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 extern void *memcpy(void *to, const void *from, size_t len); #else @@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const void *from, size_t len); __ret; \ }) #endif +#else +/* + * kmemcheck becomes very happy if we use the REP instructions unconditionally, + * because it means that we know both memory operands in advance. + */ +#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) +#endif #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 602c769fc98..b0783520988 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -154,9 +154,9 @@ struct thread_info { /* thread information allocation */ #ifdef CONFIG_DEBUG_STACK_USAGE -#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) +#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) #else -#define THREAD_FLAGS GFP_KERNEL +#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK) #endif #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index 11b3bb86e17..7fcf6f3dbcc 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -1,5 +1,10 @@ +#ifdef CONFIG_KMEMCHECK +/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ +# include <asm-generic/xor.h> +#else #ifdef CONFIG_X86_32 # include "xor_32.h" #else # include "xor_64.h" #endif +#endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index daed39ba261..3260ab04499 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model < 15) clear_cpu_cap(c, X86_FEATURE_PAT); + +#ifdef CONFIG_KMEMCHECK + /* + * P4s have a "fast strings" feature which causes single- + * stepping REP instructions to only generate a #DB on + * cache-line boundaries. + * + * Ingo Molnar reported a Pentium D (model 6) and a Xeon + * (model 2) with the same problem. + */ + if (c->x86 == 15) { + u64 misc_enable; + + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + + if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { + printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); + + misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; + wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + } + } +#endif } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3bb2be1649b..994dd6a4a2a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -63,7 +63,7 @@ void arch_task_cache_init(void) task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), - SLAB_PANIC, NULL); + SLAB_PANIC | SLAB_NOTRACK, NULL); } /* diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4aaf7e48394..c3eb207181f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace); +void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) +{ + dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1e1e27b7d43..5f935f0d586 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -45,6 +45,7 @@ #include <linux/edac.h> #endif +#include <asm/kmemcheck.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> @@ -534,6 +535,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) get_debugreg(condition, 6); + /* Catch kmemcheck conditions first of all! */ + if (condition & DR_STEP && kmemcheck_trap(regs)) + return; + /* * The processor cleared BTF, so don't mark that we need it set. */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fdd30d08ab5..eefdeee8a87 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o +obj-$(CONFIG_KMEMCHECK) += kmemcheck/ + obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c6acc632637..baa0e86adfb 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -14,6 +14,7 @@ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ +#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ /* * Page fault error code bits: @@ -956,6 +957,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) /* Get the faulting address: */ address = read_cr2(); + /* + * Detect and handle instructions that would cause a page fault for + * both a tracked kernel page and a userspace page. + */ + if (kmemcheck_active(regs)) + kmemcheck_hide(regs); + if (unlikely(kmmio_fault(regs, address))) return; @@ -973,9 +981,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * protection error (error_code & 9) == 0. */ if (unlikely(fault_in_kernel_space(address))) { - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - vmalloc_fault(address) >= 0) - return; + if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + + if (kmemcheck_fault(regs, address, error_code)) + return; + } /* Can handle a stale RO->RW TLB: */ if (spurious_fault(error_code, address)) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 34c1bfb64f1..f53b57e4086 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -213,7 +213,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (!after_bootmem) init_gbpages(); -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9ff3c0816d1..3cd7711bb94 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -111,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) pte_t *page_table = NULL; if (after_bootmem) { -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif if (!page_table) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 52bb9519bb8..9c543290a81 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -104,7 +104,7 @@ static __ref void *spp_getpage(void) void *ptr; if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); + ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); else ptr = alloc_bootmem_pages(PAGE_SIZE); @@ -281,7 +281,7 @@ static __ref void *alloc_low_page(unsigned long *phys) void *adr; if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC); + adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); *phys = __pa(adr); return adr; diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile new file mode 100644 index 00000000000..520b3bce409 --- /dev/null +++ b/arch/x86/mm/kmemcheck/Makefile @@ -0,0 +1 @@ +obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c new file mode 100644 index 00000000000..4901d0dafda --- /dev/null +++ b/arch/x86/mm/kmemcheck/error.c @@ -0,0 +1,228 @@ +#include <linux/interrupt.h> +#include <linux/kdebug.h> +#include <linux/kmemcheck.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/stacktrace.h> +#include <linux/string.h> + +#include "error.h" +#include "shadow.h" + +enum kmemcheck_error_type { + KMEMCHECK_ERROR_INVALID_ACCESS, + KMEMCHECK_ERROR_BUG, +}; + +#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) + +struct kmemcheck_error { + enum kmemcheck_error_type type; + + union { + /* KMEMCHECK_ERROR_INVALID_ACCESS */ + struct { + /* Kind of access that caused the error */ + enum kmemcheck_shadow state; + /* Address and size of the erroneous read */ + unsigned long address; + unsigned int size; + }; + }; + + struct pt_regs regs; + struct stack_trace trace; + unsigned long trace_entries[32]; + + /* We compress it to a char. */ + unsigned char shadow_copy[SHADOW_COPY_SIZE]; + unsigned char memory_copy[SHADOW_COPY_SIZE]; +}; + +/* + * Create a ring queue of errors to output. We can't call printk() directly + * from the kmemcheck traps, since this may call the console drivers and + * result in a recursive fault. + */ +static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; +static unsigned int error_count; +static unsigned int error_rd; +static unsigned int error_wr; +static unsigned int error_missed_count; + +static struct kmemcheck_error *error_next_wr(void) +{ + struct kmemcheck_error *e; + + if (error_count == ARRAY_SIZE(error_fifo)) { + ++error_missed_count; + return NULL; + } + + e = &error_fifo[error_wr]; + if (++error_wr == ARRAY_SIZE(error_fifo)) + error_wr = 0; + ++error_count; + return e; +} + +static struct kmemcheck_error *error_next_rd(void) +{ + struct kmemcheck_error *e; + + if (error_count == 0) + return NULL; + + e = &error_fifo[error_rd]; + if (++error_rd == ARRAY_SIZE(error_fifo)) + error_rd = 0; + --error_count; + return e; +} + +void kmemcheck_error_recall(void) +{ + static const char *desc[] = { + [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated", + [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized", + [KMEMCHECK_SHADOW_INITIALIZED] = "initialized", + [KMEMCHECK_SHADOW_FREED] = "freed", + }; + + static const char short_desc[] = { + [KMEMCHECK_SHADOW_UNALLOCATED] = 'a', + [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u', + [KMEMCHECK_SHADOW_INITIALIZED] = 'i', + [KMEMCHECK_SHADOW_FREED] = 'f', + }; + + struct kmemcheck_error *e; + unsigned int i; + + e = error_next_rd(); + if (!e) + return; + + switch (e->type) { + case KMEMCHECK_ERROR_INVALID_ACCESS: + printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " + "from %s memory (%p)\n", + 8 * e->size, e->state < ARRAY_SIZE(desc) ? + desc[e->state] : "(invalid shadow state)", + (void *) e->address); + + printk(KERN_INFO); + for (i = 0; i < SHADOW_COPY_SIZE; ++i) + printk("%02x", e->memory_copy[i]); + printk("\n"); + + printk(KERN_INFO); + for (i = 0; i < SHADOW_COPY_SIZE; ++i) { + if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) + printk(" %c", short_desc[e->shadow_copy[i]]); + else + printk(" ?"); + } + printk("\n"); + printk(KERN_INFO "%*c\n", 2 + 2 + * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); + break; + case KMEMCHECK_ERROR_BUG: + printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); + break; + } + + __show_regs(&e->regs, 1); + print_stack_trace(&e->trace, 0); +} + +static void do_wakeup(unsigned long data) +{ + while (error_count > 0) + kmemcheck_error_recall(); + + if (error_missed_count > 0) { + printk(KERN_WARNING "kmemcheck: Lost %d error reports because " + "the queue was too small\n", error_missed_count); + error_missed_count = 0; + } +} + +static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); + +/* + * Save the context of an error report. + */ +void kmemcheck_error_save(enum kmemcheck_shadow state, + unsigned long address, unsigned int size, struct pt_regs *regs) +{ + static unsigned long prev_ip; + + struct kmemcheck_error *e; + void *shadow_copy; + void *memory_copy; + + /* Don't report several adjacent errors from the same EIP. */ + if (regs->ip == prev_ip) + return; + prev_ip = regs->ip; + + e = error_next_wr(); + if (!e) + return; + + e->type = KMEMCHECK_ERROR_INVALID_ACCESS; + + e->state = state; + e->address = address; + e->size = size; + + /* Save regs */ + memcpy(&e->regs, regs, sizeof(*regs)); + + /* Save stack trace */ + e->trace.nr_entries = 0; + e->trace.entries = e->trace_entries; + e->trace.max_entries = ARRAY_SIZE(e->trace_entries); + e->trace.skip = 0; + save_stack_trace_bp(&e->trace, regs->bp); + + /* Round address down to nearest 16 bytes */ + shadow_copy = kmemcheck_shadow_lookup(address + & ~(SHADOW_COPY_SIZE - 1)); + BUG_ON(!shadow_copy); + + memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); + + kmemcheck_show_addr(address); + memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); + memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); + kmemcheck_hide_addr(address); + + tasklet_hi_schedule_first(&kmemcheck_tasklet); +} + +/* + * Save the context of a kmemcheck bug. + */ +void kmemcheck_error_save_bug(struct pt_regs *regs) +{ + struct kmemcheck_error *e; + + e = error_next_wr(); + if (!e) + return; + + e->type = KMEMCHECK_ERROR_BUG; + + memcpy(&e->regs, regs, sizeof(*regs)); + + e->trace.nr_entries = 0; + e->trace.entries = e->trace_entries; + e->trace.max_entries = ARRAY_SIZE(e->trace_entries); + e->trace.skip = 1; + save_stack_trace(&e->trace); + + tasklet_hi_schedule_first(&kmemcheck_tasklet); +} diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h new file mode 100644 index 00000000000..0efc2e8d0a2 --- /dev/null +++ b/arch/x86/mm/kmemcheck/error.h @@ -0,0 +1,15 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H +#define ARCH__X86__MM__KMEMCHECK__ERROR_H + +#include <linux/ptrace.h> + +#include "shadow.h" + +void kmemcheck_error_save(enum kmemcheck_shadow state, + unsigned long address, unsigned int size, struct pt_regs *regs); + +void kmemcheck_error_save_bug(struct pt_regs *regs); + +void kmemcheck_error_recall(void); + +#endif diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c new file mode 100644 index 00000000000..2c55ed09865 --- /dev/null +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -0,0 +1,640 @@ +/** + * kmemcheck - a heavyweight memory checker for the linux kernel + * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no> + * (With a lot of help from Ingo Molnar and Pekka Enberg.) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2) as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kallsyms.h> +#include <linux/kernel.h> +#include <linux/kmemcheck.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/page-flags.h> +#include <linux/percpu.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/types.h> + +#include <asm/cacheflush.h> +#include <asm/kmemcheck.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> + +#include "error.h" +#include "opcode.h" +#include "pte.h" +#include "selftest.h" +#include "shadow.h" + + +#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT +# define KMEMCHECK_ENABLED 0 +#endif + +#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT +# define KMEMCHECK_ENABLED 1 +#endif + +#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT +# define KMEMCHECK_ENABLED 2 +#endif + +int kmemcheck_enabled = KMEMCHECK_ENABLED; + +int __init kmemcheck_init(void) +{ +#ifdef CONFIG_SMP + /* + * Limit SMP to use a single CPU. We rely on the fact that this code + * runs before SMP is set up. + */ + if (setup_max_cpus > 1) { + printk(KERN_INFO + "kmemcheck: Limiting number of CPUs to 1.\n"); + setup_max_cpus = 1; + } +#endif + + if (!kmemcheck_selftest()) { + printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n"); + kmemcheck_enabled = 0; + return -EINVAL; + } + + printk(KERN_INFO "kmemcheck: Initialized\n"); + return 0; +} + +early_initcall(kmemcheck_init); + +/* + * We need to parse the kmemcheck= option before any memory is allocated. + */ +static int __init param_kmemcheck(char *str) +{ + if (!str) + return -EINVAL; + + sscanf(str, "%d", &kmemcheck_enabled); + return 0; +} + +early_param("kmemcheck", param_kmemcheck); + +int kmemcheck_show_addr(unsigned long address) +{ + pte_t *pte; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return 0; + + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + __flush_tlb_one(address); + return 1; +} + +int kmemcheck_hide_addr(unsigned long address) +{ + pte_t *pte; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return 0; + + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + __flush_tlb_one(address); + return 1; +} + +struct kmemcheck_context { + bool busy; + int balance; + + /* + * There can be at most two memory operands to an instruction, but + * each address can cross a page boundary -- so we may need up to + * four addresses that must be hidden/revealed for each fault. + */ + unsigned long addr[4]; + unsigned long n_addrs; + unsigned long flags; + + /* Data size of the instruction that caused a fault. */ + unsigned int size; +}; + +static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); + +bool kmemcheck_active(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + return data->balance > 0; +} + +/* Save an address that needs to be shown/hidden */ +static void kmemcheck_save_addr(unsigned long addr) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); + data->addr[data->n_addrs++] = addr; +} + +static unsigned int kmemcheck_show_all(void) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + unsigned int i; + unsigned int n; + + n = 0; + for (i = 0; i < data->n_addrs; ++i) + n += kmemcheck_show_addr(data->addr[i]); + + return n; +} + +static unsigned int kmemcheck_hide_all(void) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + unsigned int i; + unsigned int n; + + n = 0; + for (i = 0; i < data->n_addrs; ++i) + n += kmemcheck_hide_addr(data->addr[i]); + + return n; +} + +/* + * Called from the #PF handler. + */ +void kmemcheck_show(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + BUG_ON(!irqs_disabled()); + + if (unlikely(data->balance != 0)) { + kmemcheck_show_all(); + kmemcheck_error_save_bug(regs); + data->balance = 0; + return; + } + + /* + * None of the addresses actually belonged to kmemcheck. Note that + * this is not an error. + */ + if (kmemcheck_show_all() == 0) + return; + + ++data->balance; + + /* + * The IF needs to be cleared as well, so that the faulting + * instruction can run "uninterrupted". Otherwise, we might take + * an interrupt and start executing that before we've had a chance + * to hide the page again. + * + * NOTE: In the rare case of multiple faults, we must not override + * the original flags: + */ + if (!(regs->flags & X86_EFLAGS_TF)) + data->flags = regs->flags; + + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; +} + +/* + * Called from the #DB handler. + */ +void kmemcheck_hide(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + int n; + + BUG_ON(!irqs_disabled()); + + if (data->balance == 0) + return; + + if (unlikely(data->balance != 1)) { + kmemcheck_show_all(); + kmemcheck_error_save_bug(regs); + data->n_addrs = 0; + data->balance = 0; + + if (!(data->flags & X86_EFLAGS_TF)) + regs->flags &= ~X86_EFLAGS_TF; + if (data->flags & X86_EFLAGS_IF) + regs->flags |= X86_EFLAGS_IF; + return; + } + + if (kmemcheck_enabled) + n = kmemcheck_hide_all(); + else + n = kmemcheck_show_all(); + + if (n == 0) + return; + + --data->balance; + + data->n_addrs = 0; + + if (!(data->flags & X86_EFLAGS_TF)) + regs->flags &= ~X86_EFLAGS_TF; + if (data->flags & X86_EFLAGS_IF) + regs->flags |= X86_EFLAGS_IF; +} + +void kmemcheck_show_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) { + unsigned long address; + pte_t *pte; + unsigned int level; + + address = (unsigned long) page_address(&p[i]); + pte = lookup_address(address, &level); + BUG_ON(!pte); + BUG_ON(level != PG_LEVEL_4K); + + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); + __flush_tlb_one(address); + } +} + +bool kmemcheck_page_is_tracked(struct page *p) +{ + /* This will also check the "hidden" flag of the PTE. */ + return kmemcheck_pte_lookup((unsigned long) page_address(p)); +} + +void kmemcheck_hide_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) { + unsigned long address; + pte_t *pte; + unsigned int level; + + address = (unsigned long) page_address(&p[i]); + pte = lookup_address(address, &level); + BUG_ON(!pte); + BUG_ON(level != PG_LEVEL_4K); + + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); + __flush_tlb_one(address); + } +} + +/* Access may NOT cross page boundary */ +static void kmemcheck_read_strict(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + void *shadow; + enum kmemcheck_shadow status; + + shadow = kmemcheck_shadow_lookup(addr); + if (!shadow) + return; + + kmemcheck_save_addr(addr); + status = kmemcheck_shadow_test(shadow, size); + if (status == KMEMCHECK_SHADOW_INITIALIZED) + return; + + if (kmemcheck_enabled) + kmemcheck_error_save(status, addr, size, regs); + + if (kmemcheck_enabled == 2) + kmemcheck_enabled = 0; + + /* Don't warn about it again. */ + kmemcheck_shadow_set(shadow, size); +} + +/* Access may cross page boundary */ +static void kmemcheck_read(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + unsigned long page = addr & PAGE_MASK; + unsigned long next_addr = addr + size - 1; + unsigned long next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + kmemcheck_read_strict(regs, addr, size); + return; + } + + /* + * What we do is basically to split the access across the + * two pages and handle each part separately. Yes, this means + * that we may now see reads that are 3 + 5 bytes, for + * example (and if both are uninitialized, there will be two + * reports), but it makes the code a lot simpler. + */ + kmemcheck_read_strict(regs, addr, next_page - addr); + kmemcheck_read_strict(regs, next_page, next_addr - next_page); +} + +static void kmemcheck_write_strict(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + void *shadow; + + shadow = kmemcheck_shadow_lookup(addr); + if (!shadow) + return; + + kmemcheck_save_addr(addr); + kmemcheck_shadow_set(shadow, size); +} + +static void kmemcheck_write(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + unsigned long page = addr & PAGE_MASK; + unsigned long next_addr = addr + size - 1; + unsigned long next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + kmemcheck_write_strict(regs, addr, size); + return; + } + + /* See comment in kmemcheck_read(). */ + kmemcheck_write_strict(regs, addr, next_page - addr); + kmemcheck_write_strict(regs, next_page, next_addr - next_page); +} + +/* + * Copying is hard. We have two addresses, each of which may be split across + * a page (and each page will have different shadow addresses). + */ +static void kmemcheck_copy(struct pt_regs *regs, + unsigned long src_addr, unsigned long dst_addr, unsigned int size) +{ + uint8_t shadow[8]; + enum kmemcheck_shadow status; + + unsigned long page; + unsigned long next_addr; + unsigned long next_page; + + uint8_t *x; + unsigned int i; + unsigned int n; + + BUG_ON(size > sizeof(shadow)); + + page = src_addr & PAGE_MASK; + next_addr = src_addr + size - 1; + next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + /* Same page */ + x = kmemcheck_shadow_lookup(src_addr); + if (x) { + kmemcheck_save_addr(src_addr); + for (i = 0; i < size; ++i) + shadow[i] = x[i]; + } else { + for (i = 0; i < size; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } else { + n = next_page - src_addr; + BUG_ON(n > sizeof(shadow)); + + /* First page */ + x = kmemcheck_shadow_lookup(src_addr); + if (x) { + kmemcheck_save_addr(src_addr); + for (i = 0; i < n; ++i) + shadow[i] = x[i]; + } else { + /* Not tracked */ + for (i = 0; i < n; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + + /* Second page */ + x = kmemcheck_shadow_lookup(next_page); + if (x) { + kmemcheck_save_addr(next_page); + for (i = n; i < size; ++i) + shadow[i] = x[i - n]; + } else { + /* Not tracked */ + for (i = n; i < size; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + + page = dst_addr & PAGE_MASK; + next_addr = dst_addr + size - 1; + next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + /* Same page */ + x = kmemcheck_shadow_lookup(dst_addr); + if (x) { + kmemcheck_save_addr(dst_addr); + for (i = 0; i < size; ++i) { + x[i] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + } else { + n = next_page - dst_addr; + BUG_ON(n > sizeof(shadow)); + + /* First page */ + x = kmemcheck_shadow_lookup(dst_addr); + if (x) { + kmemcheck_save_addr(dst_addr); + for (i = 0; i < n; ++i) { + x[i] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + + /* Second page */ + x = kmemcheck_shadow_lookup(next_page); + if (x) { + kmemcheck_save_addr(next_page); + for (i = n; i < size; ++i) { + x[i - n] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + } + + status = kmemcheck_shadow_test(shadow, size); + if (status == KMEMCHECK_SHADOW_INITIALIZED) + return; + + if (kmemcheck_enabled) + kmemcheck_error_save(status, src_addr, size, regs); + + if (kmemcheck_enabled == 2) + kmemcheck_enabled = 0; +} + +enum kmemcheck_method { + KMEMCHECK_READ, + KMEMCHECK_WRITE, +}; + +static void kmemcheck_access(struct pt_regs *regs, + unsigned long fallback_address, enum kmemcheck_method fallback_method) +{ + const uint8_t *insn; + const uint8_t *insn_primary; + unsigned int size; + + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + /* Recursive fault -- ouch. */ + if (data->busy) { + kmemcheck_show_addr(fallback_address); + kmemcheck_error_save_bug(regs); + return; + } + + data->busy = true; + + insn = (const uint8_t *) regs->ip; + insn_primary = kmemcheck_opcode_get_primary(insn); + + kmemcheck_opcode_decode(insn, &size); + + switch (insn_primary[0]) { +#ifdef CONFIG_KMEMCHECK_BITOPS_OK + /* AND, OR, XOR */ + /* + * Unfortunately, these instructions have to be excluded from + * our regular checking since they access only some (and not + * all) bits. This clears out "bogus" bitfield-access warnings. + */ + case 0x80: + case 0x81: + case 0x82: + case 0x83: + switch ((insn_primary[1] >> 3) & 7) { + /* OR */ + case 1: + /* AND */ + case 4: + /* XOR */ + case 6: + kmemcheck_write(regs, fallback_address, size); + goto out; + + /* ADD */ + case 0: + /* ADC */ + case 2: + /* SBB */ + case 3: + /* SUB */ + case 5: + /* CMP */ + case 7: + break; + } + break; +#endif + + /* MOVS, MOVSB, MOVSW, MOVSD */ + case 0xa4: + case 0xa5: + /* + * These instructions are special because they take two + * addresses, but we only get one page fault. + */ + kmemcheck_copy(regs, regs->si, regs->di, size); + goto out; + + /* CMPS, CMPSB, CMPSW, CMPSD */ + case 0xa6: + case 0xa7: + kmemcheck_read(regs, regs->si, size); + kmemcheck_read(regs, regs->di, size); + goto out; + } + + /* + * If the opcode isn't special in any way, we use the data from the + * page fault handler to determine the address and type of memory + * access. + */ + switch (fallback_method) { + case KMEMCHECK_READ: + kmemcheck_read(regs, fallback_address, size); + goto out; + case KMEMCHECK_WRITE: + kmemcheck_write(regs, fallback_address, size); + goto out; + } + +out: + data->busy = false; +} + +bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + pte_t *pte; + + /* + * XXX: Is it safe to assume that memory accesses from virtual 86 + * mode or non-kernel code segments will _never_ access kernel + * memory (e.g. tracked pages)? For now, we need this to avoid + * invoking kmemcheck for PnP BIOS calls. + */ + if (regs->flags & X86_VM_MASK) + return false; + if (regs->cs != __KERNEL_CS) + return false; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return false; + + if (error_code & 2) + kmemcheck_access(regs, address, KMEMCHECK_WRITE); + else + kmemcheck_access(regs, address, KMEMCHECK_READ); + + kmemcheck_show(regs); + return true; +} + +bool kmemcheck_trap(struct pt_regs *regs) +{ + if (!kmemcheck_active(regs)) + return false; + + /* We're done. */ + kmemcheck_hide(regs); + return true; +} diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c new file mode 100644 index 00000000000..63c19e27aa6 --- /dev/null +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -0,0 +1,106 @@ +#include <linux/types.h> + +#include "opcode.h" + +static bool opcode_is_prefix(uint8_t b) +{ + return + /* Group 1 */ + b == 0xf0 || b == 0xf2 || b == 0xf3 + /* Group 2 */ + || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 + || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e + /* Group 3 */ + || b == 0x66 + /* Group 4 */ + || b == 0x67; +} + +#ifdef CONFIG_X86_64 +static bool opcode_is_rex_prefix(uint8_t b) +{ + return (b & 0xf0) == 0x40; +} +#else +static bool opcode_is_rex_prefix(uint8_t b) +{ + return false; +} +#endif + +#define REX_W (1 << 3) + +/* + * This is a VERY crude opcode decoder. We only need to find the size of the + * load/store that caused our #PF and this should work for all the opcodes + * that we care about. Moreover, the ones who invented this instruction set + * should be shot. + */ +void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) +{ + /* Default operand size */ + int operand_size_override = 4; + + /* prefixes */ + for (; opcode_is_prefix(*op); ++op) { + if (*op == 0x66) + operand_size_override = 2; + } + + /* REX prefix */ + if (opcode_is_rex_prefix(*op)) { + uint8_t rex = *op; + + ++op; + if (rex & REX_W) { + switch (*op) { + case 0x63: + *size = 4; + return; + case 0x0f: + ++op; + + switch (*op) { + case 0xb6: + case 0xbe: + *size = 1; + return; + case 0xb7: + case 0xbf: + *size = 2; + return; + } + + break; + } + + *size = 8; + return; + } + } + + /* escape opcode */ + if (*op == 0x0f) { + ++op; + + /* + * This is move with zero-extend and sign-extend, respectively; + * we don't have to think about 0xb6/0xbe, because this is + * already handled in the conditional below. + */ + if (*op == 0xb7 || *op == 0xbf) + operand_size_override = 2; + } + + *size = (*op & 1) ? operand_size_override : 1; +} + +const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) +{ + /* skip prefixes */ + while (opcode_is_prefix(*op)) + ++op; + if (opcode_is_rex_prefix(*op)) + ++op; + return op; +} diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h new file mode 100644 index 00000000000..6956aad66b5 --- /dev/null +++ b/arch/x86/mm/kmemcheck/opcode.h @@ -0,0 +1,9 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H +#define ARCH__X86__MM__KMEMCHECK__OPCODE_H + +#include <linux/types.h> + +void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); +const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); + +#endif diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c new file mode 100644 index 00000000000..4ead26eeaf9 --- /dev/null +++ b/arch/x86/mm/kmemcheck/pte.c @@ -0,0 +1,22 @@ +#include <linux/mm.h> + +#include <asm/pgtable.h> + +#include "pte.h" + +pte_t *kmemcheck_pte_lookup(unsigned long address) +{ + pte_t *pte; + unsigned int level; + + pte = lookup_address(address, &level); + if (!pte) + return NULL; + if (level != PG_LEVEL_4K) + return NULL; + if (!pte_hidden(*pte)) + return NULL; + + return pte; +} + diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h new file mode 100644 index 00000000000..9f596645649 --- /dev/null +++ b/arch/x86/mm/kmemcheck/pte.h @@ -0,0 +1,10 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H +#define ARCH__X86__MM__KMEMCHECK__PTE_H + +#include <linux/mm.h> + +#include <asm/pgtable.h> + +pte_t *kmemcheck_pte_lookup(unsigned long address); + +#endif diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c new file mode 100644 index 00000000000..036efbea8b2 --- /dev/null +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -0,0 +1,69 @@ +#include <linux/kernel.h> + +#include "opcode.h" +#include "selftest.h" + +struct selftest_opcode { + unsigned int expected_size; + const uint8_t *insn; + const char *desc; +}; + +static const struct selftest_opcode selftest_opcodes[] = { + /* REP MOVS */ + {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"}, + {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"}, + + /* MOVZX / MOVZXD */ + {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"}, + {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"}, + + /* MOVSX / MOVSXD */ + {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"}, + {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"}, + +#ifdef CONFIG_X86_64 + /* MOVZX / MOVZXD */ + {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"}, + {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"}, + + /* MOVSX / MOVSXD */ + {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"}, + {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"}, + {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"}, +#endif +}; + +static bool selftest_opcode_one(const struct selftest_opcode *op) +{ + unsigned size; + + kmemcheck_opcode_decode(op->insn, &size); + + if (size == op->expected_size) + return true; + + printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n", + op->desc, op->expected_size, size); + return false; +} + +static bool selftest_opcodes_all(void) +{ + bool pass = true; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i) + pass = pass && selftest_opcode_one(&selftest_opcodes[i]); + + return pass; +} + +bool kmemcheck_selftest(void) +{ + bool pass = true; + + pass = pass && selftest_opcodes_all(); + + return pass; +} diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h new file mode 100644 index 00000000000..8fed4fe11f9 --- /dev/null +++ b/arch/x86/mm/kmemcheck/selftest.h @@ -0,0 +1,6 @@ +#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H +#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H + +bool kmemcheck_selftest(void); + +#endif diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c new file mode 100644 index 00000000000..e773b6bd007 --- /dev/null +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -0,0 +1,162 @@ +#include <linux/kmemcheck.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/module.h> + +#include <asm/page.h> +#include <asm/pgtable.h> + +#include "pte.h" +#include "shadow.h" + +/* + * Return the shadow address for the given address. Returns NULL if the + * address is not tracked. + * + * We need to be extremely careful not to follow any invalid pointers, + * because this function can be called for *any* possible address. + */ +void *kmemcheck_shadow_lookup(unsigned long address) +{ + pte_t *pte; + struct page *page; + + if (!virt_addr_valid(address)) + return NULL; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return NULL; + + page = virt_to_page(address); + if (!page->shadow) + return NULL; + return page->shadow + (address & (PAGE_SIZE - 1)); +} + +static void mark_shadow(void *address, unsigned int n, + enum kmemcheck_shadow status) +{ + unsigned long addr = (unsigned long) address; + unsigned long last_addr = addr + n - 1; + unsigned long page = addr & PAGE_MASK; + unsigned long last_page = last_addr & PAGE_MASK; + unsigned int first_n; + void *shadow; + + /* If the memory range crosses a page boundary, stop there. */ + if (page == last_page) + first_n = n; + else + first_n = page + PAGE_SIZE - addr; + + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, first_n); + + addr += first_n; + n -= first_n; + + /* Do full-page memset()s. */ + while (n >= PAGE_SIZE) { + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, PAGE_SIZE); + + addr += PAGE_SIZE; + n -= PAGE_SIZE; + } + + /* Do the remaining page, if any. */ + if (n > 0) { + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, n); + } +} + +void kmemcheck_mark_unallocated(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); +} + +void kmemcheck_mark_uninitialized(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); +} + +/* + * Fill the shadow memory of the given address such that the memory at that + * address is marked as being initialized. + */ +void kmemcheck_mark_initialized(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); +} +EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); + +void kmemcheck_mark_freed(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); +} + +void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); +} + +void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); +} + +void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); +} + +enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) +{ + uint8_t *x; + unsigned int i; + + x = shadow; + +#ifdef CONFIG_KMEMCHECK_PARTIAL_OK + /* + * Make sure _some_ bytes are initialized. Gcc frequently generates + * code to access neighboring bytes. + */ + for (i = 0; i < size; ++i) { + if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) + return x[i]; + } +#else + /* All bytes must be initialized. */ + for (i = 0; i < size; ++i) { + if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) + return x[i]; + } +#endif + + return x[0]; +} + +void kmemcheck_shadow_set(void *shadow, unsigned int size) +{ + uint8_t *x; + unsigned int i; + + x = shadow; + for (i = 0; i < size; ++i) + x[i] = KMEMCHECK_SHADOW_INITIALIZED; +} diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h new file mode 100644 index 00000000000..af46d9ab9d8 --- /dev/null +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -0,0 +1,16 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H +#define ARCH__X86__MM__KMEMCHECK__SHADOW_H + +enum kmemcheck_shadow { + KMEMCHECK_SHADOW_UNALLOCATED, + KMEMCHECK_SHADOW_UNINITIALIZED, + KMEMCHECK_SHADOW_INITIALIZED, + KMEMCHECK_SHADOW_FREED, +}; + +void *kmemcheck_shadow_lookup(unsigned long address); + +enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); +void kmemcheck_shadow_set(void *shadow, unsigned int size); + +#endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6ce9518fe2a..3cfe9ced8a4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -470,7 +470,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) if (!debug_pagealloc) spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL, 0); + base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); if (!debug_pagealloc) spin_lock(&cpa_lock); if (!base) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7aa03a5389f..8e43bdd4545 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -4,9 +4,11 @@ #include <asm/tlb.h> #include <asm/fixmap.h> +#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO + pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + return (pte_t *)__get_free_page(PGALLOC_GFP); } pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) @@ -14,9 +16,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) struct page *pte; #ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); #else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(PGALLOC_GFP, 0); #endif if (pte) pgtable_page_ctor(pte); @@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[]) bool failed = false; for(i = 0; i < PREALLOCATED_PMDS; i++) { - pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); if (pmd == NULL) failed = true; pmds[i] = pmd; @@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *pmds[PREALLOCATED_PMDS]; unsigned long flags; - pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); if (pgd == NULL) goto out; diff --git a/crypto/xor.c b/crypto/xor.c index 996b6ee57d9..fc5b836f343 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -101,7 +101,12 @@ calibrate_xor_blocks(void) void *b1, *b2; struct xor_block_template *f, *fastest; - b1 = (void *) __get_free_pages(GFP_KERNEL, 2); + /* + * Note: Since the memory is not actually used for _anything_ but to + * test the XOR speed, we don't really want kmemcheck to warn about + * reading uninitialized bytes here. + */ + b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2); if (!b1) { printk(KERN_WARNING "xor: Yikes! No memory available.\n"); return -ENOMEM; diff --git a/drivers/ieee1394/csr1212.c b/drivers/ieee1394/csr1212.c index a6dfeb0b337..e76cac64c53 100644 --- a/drivers/ieee1394/csr1212.c +++ b/drivers/ieee1394/csr1212.c @@ -35,6 +35,7 @@ #include <linux/errno.h> #include <linux/kernel.h> +#include <linux/kmemcheck.h> #include <linux/string.h> #include <asm/bug.h> #include <asm/byteorder.h> @@ -387,6 +388,7 @@ csr1212_new_descriptor_leaf(u8 dtype, u32 specifier_id, if (!kv) return NULL; + kmemcheck_annotate_variable(kv->value.leaf.data[0]); CSR1212_DESCRIPTOR_LEAF_SET_TYPE(kv, dtype); CSR1212_DESCRIPTOR_LEAF_SET_SPECIFIER_ID(kv, specifier_id); diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c index a6d55bebe61..5122b5a8aa2 100644 --- a/drivers/ieee1394/nodemgr.c +++ b/drivers/ieee1394/nodemgr.c @@ -10,6 +10,7 @@ #include <linux/bitmap.h> #include <linux/kernel.h> +#include <linux/kmemcheck.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/delay.h> @@ -39,7 +40,10 @@ struct nodemgr_csr_info { struct hpsb_host *host; nodeid_t nodeid; unsigned int generation; + + kmemcheck_bitfield_begin(flags); unsigned int speed_unverified:1; + kmemcheck_bitfield_end(flags); }; @@ -1293,6 +1297,7 @@ static void nodemgr_node_scan_one(struct hpsb_host *host, u8 *speed; ci = kmalloc(sizeof(*ci), GFP_KERNEL); + kmemcheck_annotate_bitfield(ci, flags); if (!ci) return; diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c index 0207dd59090..b5346b4db91 100644 --- a/drivers/misc/c2port/core.c +++ b/drivers/misc/c2port/core.c @@ -15,6 +15,7 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/kernel.h> +#include <linux/kmemcheck.h> #include <linux/ctype.h> #include <linux/delay.h> #include <linux/idr.h> @@ -891,6 +892,7 @@ struct c2port_device *c2port_device_register(char *name, return ERR_PTR(-EINVAL); c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL); + kmemcheck_annotate_bitfield(c2dev, flags); if (unlikely(!c2dev)) return ERR_PTR(-ENOMEM); diff --git a/include/linux/c2port.h b/include/linux/c2port.h index 7b5a2388ba6..2a5cd867c36 100644 --- a/include/linux/c2port.h +++ b/include/linux/c2port.h @@ -10,6 +10,7 @@ */ #include <linux/device.h> +#include <linux/kmemcheck.h> #define C2PORT_NAME_LEN 32 @@ -20,8 +21,10 @@ /* Main struct */ struct c2port_ops; struct c2port_device { + kmemcheck_bitfield_begin(flags); unsigned int access:1; unsigned int flash_access:1; + kmemcheck_bitfield_end(flags); int id; char name[C2PORT_NAME_LEN]; diff --git a/include/linux/fs.h b/include/linux/fs.h index ede84fa7da5..6d12174fbe1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1919,8 +1919,9 @@ extern void __init vfs_caches_init(unsigned long); extern struct kmem_cache *names_cachep; -#define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) -#define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) +#define __getname_gfp(gfp) kmem_cache_alloc(names_cachep, (gfp)) +#define __getname() __getname_gfp(GFP_KERNEL) +#define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) #ifndef CONFIG_AUDITSYSCALL #define putname(name) __putname(name) #else diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3760e7c5de0..80e14b8c2e7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -52,7 +52,19 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ #define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ -#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ +#ifdef CONFIG_KMEMCHECK +#define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */ +#else +#define __GFP_NOTRACK ((__force gfp_t)0) +#endif + +/* + * This may seem redundant, but it's a way of annotating false positives vs. + * allocations that simply cannot be supported (e.g. page tables). + */ +#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) + +#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c41e812e9d5..2721f07e935 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -472,6 +472,20 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t) __tasklet_hi_schedule(t); } +extern void __tasklet_hi_schedule_first(struct tasklet_struct *t); + +/* + * This version avoids touching any other tasklets. Needed for kmemcheck + * in order not to take any page faults while enqueueing this tasklet; + * consider VERY carefully whether you really need this or + * tasklet_hi_schedule()... + */ +static inline void tasklet_hi_schedule_first(struct tasklet_struct *t) +{ + if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + __tasklet_hi_schedule_first(t); +} + static inline void tasklet_disable_nosync(struct tasklet_struct *t) { diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h new file mode 100644 index 00000000000..47b39b7c7e8 --- /dev/null +++ b/include/linux/kmemcheck.h @@ -0,0 +1,153 @@ +#ifndef LINUX_KMEMCHECK_H +#define LINUX_KMEMCHECK_H + +#include <linux/mm_types.h> +#include <linux/types.h> + +#ifdef CONFIG_KMEMCHECK +extern int kmemcheck_enabled; + +/* The slab-related functions. */ +void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node); +void kmemcheck_free_shadow(struct page *page, int order); +void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size); +void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size); + +void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order, + gfp_t gfpflags); + +void kmemcheck_show_pages(struct page *p, unsigned int n); +void kmemcheck_hide_pages(struct page *p, unsigned int n); + +bool kmemcheck_page_is_tracked(struct page *p); + +void kmemcheck_mark_unallocated(void *address, unsigned int n); +void kmemcheck_mark_uninitialized(void *address, unsigned int n); +void kmemcheck_mark_initialized(void *address, unsigned int n); +void kmemcheck_mark_freed(void *address, unsigned int n); + +void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n); +void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n); +void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n); + +int kmemcheck_show_addr(unsigned long address); +int kmemcheck_hide_addr(unsigned long address); + +#else +#define kmemcheck_enabled 0 + +static inline void +kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) +{ +} + +static inline void +kmemcheck_free_shadow(struct page *page, int order) +{ +} + +static inline void +kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size) +{ +} + +static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object, + size_t size) +{ +} + +static inline void kmemcheck_pagealloc_alloc(struct page *p, + unsigned int order, gfp_t gfpflags) +{ +} + +static inline bool kmemcheck_page_is_tracked(struct page *p) +{ + return false; +} + +static inline void kmemcheck_mark_unallocated(void *address, unsigned int n) +{ +} + +static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n) +{ +} + +static inline void kmemcheck_mark_initialized(void *address, unsigned int n) +{ +} + +static inline void kmemcheck_mark_freed(void *address, unsigned int n) +{ +} + +static inline void kmemcheck_mark_unallocated_pages(struct page *p, + unsigned int n) +{ +} + +static inline void kmemcheck_mark_uninitialized_pages(struct page *p, + unsigned int n) +{ +} + +static inline void kmemcheck_mark_initialized_pages(struct page *p, + unsigned int n) +{ +} + +#endif /* CONFIG_KMEMCHECK */ + +/* + * Bitfield annotations + * + * How to use: If you have a struct using bitfields, for example + * + * struct a { + * int x:8, y:8; + * }; + * + * then this should be rewritten as + * + * struct a { + * kmemcheck_bitfield_begin(flags); + * int x:8, y:8; + * kmemcheck_bitfield_end(flags); + * }; + * + * Now the "flags_begin" and "flags_end" members may be used to refer to the + * beginning and end, respectively, of the bitfield (and things like + * &x.flags_begin is allowed). As soon as the struct is allocated, the bit- + * fields should be annotated: + * + * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL); + * kmemcheck_annotate_bitfield(a, flags); + * + * Note: We provide the same definitions for both kmemcheck and non- + * kmemcheck kernels. This makes it harder to introduce accidental errors. It + * is also allowed to pass NULL pointers to kmemcheck_annotate_bitfield(). + */ +#define kmemcheck_bitfield_begin(name) \ + int name##_begin[0]; + +#define kmemcheck_bitfield_end(name) \ + int name##_end[0]; + +#define kmemcheck_annotate_bitfield(ptr, name) \ + do if (ptr) { \ + int _n = (long) &((ptr)->name##_end) \ + - (long) &((ptr)->name##_begin); \ + BUILD_BUG_ON(_n < 0); \ + \ + kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ + } while (0) + +#define kmemcheck_annotate_variable(var) \ + do { \ + kmemcheck_mark_initialized(&(var), sizeof(var)); \ + } while (0) \ + +#endif /* LINUX_KMEMCHECK_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e80e26ecf2..0042090a4d7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -98,6 +98,14 @@ struct page { #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS unsigned long debug_flags; /* Use atomic bitops on this */ #endif + +#ifdef CONFIG_KMEMCHECK + /* + * kmemcheck wants to track the status of each byte in a page; this + * is a pointer to such a status block. NULL if not tracked. + */ + void *shadow; +#endif }; /* diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 8670f1575fe..29f8599e6be 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -1,6 +1,7 @@ #ifndef _LINUX_RING_BUFFER_H #define _LINUX_RING_BUFFER_H +#include <linux/kmemcheck.h> #include <linux/mm.h> #include <linux/seq_file.h> @@ -11,7 +12,10 @@ struct ring_buffer_iter; * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { + kmemcheck_bitfield_begin(bitfield); u32 type_len:5, time_delta:27; + kmemcheck_bitfield_end(bitfield); + u32 array[]; }; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fa51293f270..63ef24bc01d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -15,6 +15,7 @@ #define _LINUX_SKBUFF_H #include <linux/kernel.h> +#include <linux/kmemcheck.h> #include <linux/compiler.h> #include <linux/time.h> #include <linux/cache.h> @@ -343,6 +344,7 @@ struct sk_buff { }; }; __u32 priority; + kmemcheck_bitfield_begin(flags1); __u8 local_df:1, cloned:1, ip_summed:2, @@ -353,6 +355,7 @@ struct sk_buff { ipvs_property:1, peeked:1, nf_trace:1; + kmemcheck_bitfield_end(flags1); __be16 protocol; void (*destructor)(struct sk_buff *skb); @@ -372,12 +375,16 @@ struct sk_buff { __u16 tc_verd; /* traffic control verdict */ #endif #endif + + kmemcheck_bitfield_begin(flags2); #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) __u8 do_not_encrypt:1; #endif + kmemcheck_bitfield_end(flags2); + /* 0/13/14 bit hole */ #ifdef CONFIG_NET_DMA diff --git a/include/linux/slab.h b/include/linux/slab.h index 219b8fb4651..2da8372519f 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -64,6 +64,13 @@ #define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ +/* Don't track use of uninitialized memory */ +#ifdef CONFIG_KMEMCHECK +# define SLAB_NOTRACK 0x01000000UL +#else +# define SLAB_NOTRACK 0x00000000UL +#endif + /* The following flags affect the page allocator grouping pages by mobility */ #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 713f841ecaa..850d057500d 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -16,6 +16,87 @@ #include <linux/compiler.h> #include <linux/kmemtrace.h> +/* + * struct kmem_cache + * + * manages a cache. + */ + +struct kmem_cache { +/* 1) per-cpu data, touched during every alloc/free */ + struct array_cache *array[NR_CPUS]; +/* 2) Cache tunables. Protected by cache_chain_mutex */ + unsigned int batchcount; + unsigned int limit; + unsigned int shared; + + unsigned int buffer_size; + u32 reciprocal_buffer_size; +/* 3) touched by every alloc & free from the backend */ + + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + +/* 4) cache_grow/shrink */ + /* order of pgs per slab (2^n) */ + unsigned int gfporder; + + /* force GFP flags, e.g. GFP_DMA */ + gfp_t gfpflags; + + size_t colour; /* cache colouring range */ + unsigned int colour_off; /* colour offset */ + struct kmem_cache *slabp_cache; + unsigned int slab_size; + unsigned int dflags; /* dynamic flags */ + + /* constructor func */ + void (*ctor)(void *obj); + +/* 5) cache creation/removal */ + const char *name; + struct list_head next; + +/* 6) statistics */ +#ifdef CONFIG_DEBUG_SLAB + unsigned long num_active; + unsigned long num_allocations; + unsigned long high_mark; + unsigned long grown; + unsigned long reaped; + unsigned long errors; + unsigned long max_freeable; + unsigned long node_allocs; + unsigned long node_frees; + unsigned long node_overflow; + atomic_t allochit; + atomic_t allocmiss; + atomic_t freehit; + atomic_t freemiss; + + /* + * If debugging is enabled, then the allocator can add additional + * fields and/or padding to every object. buffer_size contains the total + * object size including these internal fields, the following two + * variables contain the offset to the user object and its size. + */ + int obj_offset; + int obj_size; +#endif /* CONFIG_DEBUG_SLAB */ + + /* + * We put nodelists[] at the end of kmem_cache, because we want to size + * this array to nr_node_ids slots instead of MAX_NUMNODES + * (see kmem_cache_init()) + * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache + * is statically defined, so we reserve the max number of nodes. + */ + struct kmem_list3 *nodelists[MAX_NUMNODES]; + /* + * Do not add fields after nodelists[] + */ +}; + /* Size description struct for general caches. */ struct cache_sizes { size_t cs_size; diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 1a8cecc4f38..51efbef38fb 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -4,6 +4,8 @@ struct task_struct; #ifdef CONFIG_STACKTRACE +struct task_struct; + struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; @@ -11,6 +13,7 @@ struct stack_trace { }; extern void save_stack_trace(struct stack_trace *trace); +extern void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp); extern void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace); diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 20a6957af87..47004f35cc7 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -17,6 +17,7 @@ #define _INET_SOCK_H +#include <linux/kmemcheck.h> #include <linux/string.h> #include <linux/types.h> #include <linux/jhash.h> @@ -66,14 +67,16 @@ struct inet_request_sock { __be32 loc_addr; __be32 rmt_addr; __be16 rmt_port; - u16 snd_wscale : 4, - rcv_wscale : 4, + kmemcheck_bitfield_begin(flags); + u16 snd_wscale : 4, + rcv_wscale : 4, tstamp_ok : 1, sack_ok : 1, wscale_ok : 1, ecn_ok : 1, acked : 1, no_srccheck: 1; + kmemcheck_bitfield_end(flags); struct ip_options *opt; }; @@ -199,9 +202,12 @@ static inline int inet_sk_ehashfn(const struct sock *sk) static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops) { struct request_sock *req = reqsk_alloc(ops); + struct inet_request_sock *ireq = inet_rsk(req); - if (req != NULL) - inet_rsk(req)->opt = NULL; + if (req != NULL) { + kmemcheck_annotate_bitfield(ireq, flags); + ireq->opt = NULL; + } return req; } diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 4b8ece22b8e..b63b80fac56 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -16,6 +16,7 @@ #define _INET_TIMEWAIT_SOCK_ +#include <linux/kmemcheck.h> #include <linux/list.h> #include <linux/module.h> #include <linux/timer.h> @@ -127,10 +128,12 @@ struct inet_timewait_sock { __be32 tw_rcv_saddr; __be16 tw_dport; __u16 tw_num; + kmemcheck_bitfield_begin(flags); /* And these are ours. */ __u8 tw_ipv6only:1, tw_transparent:1; - /* 15 bits hole, try to pack */ + /* 14 bits hole, try to pack */ + kmemcheck_bitfield_end(flags); __u16 tw_ipv6_offset; unsigned long tw_ttd; struct inet_bind_bucket *tw_tb; diff --git a/include/net/sock.h b/include/net/sock.h index 010e14a93c9..95bd3fd75f9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -218,9 +218,11 @@ struct sock { #define sk_hash __sk_common.skc_hash #define sk_prot __sk_common.skc_prot #define sk_net __sk_common.skc_net + kmemcheck_bitfield_begin(flags); unsigned char sk_shutdown : 2, sk_no_check : 2, sk_userlocks : 4; + kmemcheck_bitfield_end(flags); unsigned char sk_protocol; unsigned short sk_type; int sk_rcvbuf; diff --git a/init/do_mounts.c b/init/do_mounts.c index dd7ee5f203f..093f6591550 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -231,7 +231,8 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data) void __init mount_block_root(char *name, int flags) { - char *fs_names = __getname(); + char *fs_names = __getname_gfp(GFP_KERNEL + | __GFP_NOTRACK_FALSE_POSITIVE); char *p; #ifdef CONFIG_BLOCK char b[BDEVNAME_SIZE]; diff --git a/init/main.c b/init/main.c index f6204f712e7..7becd8b5c5b 100644 --- a/init/main.c +++ b/init/main.c @@ -65,6 +65,7 @@ #include <linux/idr.h> #include <linux/ftrace.h> #include <linux/async.h> +#include <linux/kmemcheck.h> #include <linux/kmemtrace.h> #include <trace/boot.h> diff --git a/kernel/fork.c b/kernel/fork.c index 4430eb1376f..be022c200da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -178,7 +178,7 @@ void __init fork_init(unsigned long mempages) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); + ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); #endif /* do the arch specific task caches init */ @@ -1470,20 +1470,20 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, - sighand_ctor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| + SLAB_NOTRACK, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); } diff --git a/kernel/signal.c b/kernel/signal.c index 809a228019a..d81f4952eeb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -832,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, { struct sigpending *pending; struct sigqueue *q; + int override_rlimit; trace_sched_signal_send(sig, t); @@ -863,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, make sure at least one signal gets delivered and don't pass on the info struct. */ - q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - (is_si_special(info) || - info->si_code >= 0))); + if (sig < SIGRTMIN) + override_rlimit = (is_si_special(info) || info->si_code >= 0); + else + override_rlimit = 0; + + q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, + override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { diff --git a/kernel/softirq.c b/kernel/softirq.c index 258885a543d..b41fb710e11 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -382,6 +382,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) EXPORT_SYMBOL(__tasklet_hi_schedule); +void __tasklet_hi_schedule_first(struct tasklet_struct *t) +{ + BUG_ON(!irqs_disabled()); + + t->next = __get_cpu_var(tasklet_hi_vec).head; + __get_cpu_var(tasklet_hi_vec).head = t; + __raise_softirq_irqoff(HI_SOFTIRQ); +} + +EXPORT_SYMBOL(__tasklet_hi_schedule_first); + static void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0e51a35a448..f5c76b6cd61 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -27,6 +27,7 @@ #include <linux/security.h> #include <linux/ctype.h> #include <linux/utsname.h> +#include <linux/kmemcheck.h> #include <linux/smp_lock.h> #include <linux/fs.h> #include <linux/init.h> @@ -967,6 +968,17 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_KMEMCHECK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "kmemcheck", + .data = &kmemcheck_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2e642b2b725..dc4dc70171c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -10,6 +10,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/hardirq.h> +#include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> @@ -1270,6 +1271,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, if (tail < BUF_PAGE_SIZE) { /* Mark the rest of the page with padding */ event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); rb_event_set_padding(event); } @@ -1327,6 +1329,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return NULL; event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(event, type, length); /* The passed in type is zero for DATA */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 116a35051be..6b0c2d8a212 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -300,7 +300,7 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT config DEBUG_SLAB bool "Debug slab memory allocations" - depends on DEBUG_KERNEL && SLAB + depends on DEBUG_KERNEL && SLAB && !KMEMCHECK help Say Y here to have the kernel do limited verification on memory allocation as well as poisoning memory on free to catch use of freed @@ -312,7 +312,7 @@ config DEBUG_SLAB_LEAK config SLUB_DEBUG_ON bool "SLUB debugging on by default" - depends on SLUB && SLUB_DEBUG + depends on SLUB && SLUB_DEBUG && !KMEMCHECK default n help Boot with debugging on by default. SLUB boots by default with @@ -996,3 +996,5 @@ config DMA_API_DEBUG source "samples/Kconfig" source "lib/Kconfig.kgdb" + +source "lib/Kconfig.kmemcheck" diff --git a/lib/Kconfig.kmemcheck b/lib/Kconfig.kmemcheck new file mode 100644 index 00000000000..603c81b6654 --- /dev/null +++ b/lib/Kconfig.kmemcheck @@ -0,0 +1,91 @@ +config HAVE_ARCH_KMEMCHECK + bool + +menuconfig KMEMCHECK + bool "kmemcheck: trap use of uninitialized memory" + depends on DEBUG_KERNEL + depends on !X86_USE_3DNOW + depends on SLUB || SLAB + depends on !CC_OPTIMIZE_FOR_SIZE + depends on !FUNCTION_TRACER + select FRAME_POINTER + select STACKTRACE + default n + help + This option enables tracing of dynamically allocated kernel memory + to see if memory is used before it has been given an initial value. + Be aware that this requires half of your memory for bookkeeping and + will insert extra code at *every* read and write to tracked memory + thus slow down the kernel code (but user code is unaffected). + + The kernel may be started with kmemcheck=0 or kmemcheck=1 to disable + or enable kmemcheck at boot-time. If the kernel is started with + kmemcheck=0, the large memory and CPU overhead is not incurred. + +choice + prompt "kmemcheck: default mode at boot" + depends on KMEMCHECK + default KMEMCHECK_ONESHOT_BY_DEFAULT + help + This option controls the default behaviour of kmemcheck when the + kernel boots and no kmemcheck= parameter is given. + +config KMEMCHECK_DISABLED_BY_DEFAULT + bool "disabled" + depends on KMEMCHECK + +config KMEMCHECK_ENABLED_BY_DEFAULT + bool "enabled" + depends on KMEMCHECK + +config KMEMCHECK_ONESHOT_BY_DEFAULT + bool "one-shot" + depends on KMEMCHECK + help + In one-shot mode, only the first error detected is reported before + kmemcheck is disabled. + +endchoice + +config KMEMCHECK_QUEUE_SIZE + int "kmemcheck: error queue size" + depends on KMEMCHECK + default 64 + help + Select the maximum number of errors to store in the queue. Since + errors can occur virtually anywhere and in any context, we need a + temporary storage area which is guarantueed not to generate any + other faults. The queue will be emptied as soon as a tasklet may + be scheduled. If the queue is full, new error reports will be + lost. + +config KMEMCHECK_SHADOW_COPY_SHIFT + int "kmemcheck: shadow copy size (5 => 32 bytes, 6 => 64 bytes)" + depends on KMEMCHECK + range 2 8 + default 5 + help + Select the number of shadow bytes to save along with each entry of + the queue. These bytes indicate what parts of an allocation are + initialized, uninitialized, etc. and will be displayed when an + error is detected to help the debugging of a particular problem. + +config KMEMCHECK_PARTIAL_OK + bool "kmemcheck: allow partially uninitialized memory" + depends on KMEMCHECK + default y + help + This option works around certain GCC optimizations that produce + 32-bit reads from 16-bit variables where the upper 16 bits are + thrown away afterwards. This may of course also hide some real + bugs. + +config KMEMCHECK_BITOPS_OK + bool "kmemcheck: allow bit-field manipulation" + depends on KMEMCHECK + default n + help + This option silences warnings that would be generated for bit-field + accesses where not all the bits are initialized at the same time. + This may also hide some real bugs. + diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index bb01e298f26..aa99fd1f710 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC depends on !HIBERNATION || !PPC && !SPARC + depends on !KMEMCHECK ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types diff --git a/mm/Makefile b/mm/Makefile index e89acb090b4..c379ce08354 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o +obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c new file mode 100644 index 00000000000..fd814fd6131 --- /dev/null +++ b/mm/kmemcheck.c @@ -0,0 +1,122 @@ +#include <linux/gfp.h> +#include <linux/mm_types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/kmemcheck.h> + +void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) +{ + struct page *shadow; + int pages; + int i; + + pages = 1 << order; + + /* + * With kmemcheck enabled, we need to allocate a memory area for the + * shadow bits as well. + */ + shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); + if (!shadow) { + if (printk_ratelimit()) + printk(KERN_ERR "kmemcheck: failed to allocate " + "shadow bitmap\n"); + return; + } + + for(i = 0; i < pages; ++i) + page[i].shadow = page_address(&shadow[i]); + + /* + * Mark it as non-present for the MMU so that our accesses to + * this memory will trigger a page fault and let us analyze + * the memory accesses. + */ + kmemcheck_hide_pages(page, pages); +} + +void kmemcheck_free_shadow(struct page *page, int order) +{ + struct page *shadow; + int pages; + int i; + + if (!kmemcheck_page_is_tracked(page)) + return; + + pages = 1 << order; + + kmemcheck_show_pages(page, pages); + + shadow = virt_to_page(page[0].shadow); + + for(i = 0; i < pages; ++i) + page[i].shadow = NULL; + + __free_pages(shadow, order); +} + +void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size) +{ + /* + * Has already been memset(), which initializes the shadow for us + * as well. + */ + if (gfpflags & __GFP_ZERO) + return; + + /* No need to initialize the shadow of a non-tracked slab. */ + if (s->flags & SLAB_NOTRACK) + return; + + if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { + /* + * Allow notracked objects to be allocated from + * tracked caches. Note however that these objects + * will still get page faults on access, they just + * won't ever be flagged as uninitialized. If page + * faults are not acceptable, the slab cache itself + * should be marked NOTRACK. + */ + kmemcheck_mark_initialized(object, size); + } else if (!s->ctor) { + /* + * New objects should be marked uninitialized before + * they're returned to the called. + */ + kmemcheck_mark_uninitialized(object, size); + } +} + +void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) +{ + /* TODO: RCU freeing is unsupported for now; hide false positives. */ + if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) + kmemcheck_mark_freed(object, size); +} + +void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, + gfp_t gfpflags) +{ + int pages; + + if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) + return; + + pages = 1 << order; + + /* + * NOTE: We choose to track GFP_ZERO pages too; in fact, they + * can become uninitialized by copying uninitialized memory + * into them. + */ + + /* XXX: Can use zone->node for node? */ + kmemcheck_alloc_shadow(page, order, gfpflags, -1); + + if (gfpflags & __GFP_ZERO) + kmemcheck_mark_initialized_pages(page, pages); + else + kmemcheck_mark_uninitialized_pages(page, pages); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 17d5f539a9a..0727896a88a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -23,6 +23,7 @@ #include <linux/bootmem.h> #include <linux/compiler.h> #include <linux/kernel.h> +#include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/suspend.h> #include <linux/pagevec.h> @@ -546,6 +547,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) int i; int bad = 0; + kmemcheck_free_shadow(page, order); + for (i = 0 ; i < (1 << order) ; ++i) bad += free_pages_check(page + i); if (bad) @@ -994,6 +997,8 @@ static void free_hot_cold_page(struct page *page, int cold) struct per_cpu_pages *pcp; unsigned long flags; + kmemcheck_free_shadow(page, 0); + if (PageAnon(page)) page->mapping = NULL; if (free_pages_check(page)) @@ -1047,6 +1052,16 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON(PageCompound(page)); VM_BUG_ON(!page_count(page)); + +#ifdef CONFIG_KMEMCHECK + /* + * Split shadow pages too, because free(page[0]) would + * otherwise free the whole shadow. + */ + if (kmemcheck_page_is_tracked(page)) + split_page(virt_to_page(page[0].shadow), order); +#endif + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } @@ -1667,7 +1682,10 @@ nopage: dump_stack(); show_mem(); } + return page; got_pg: + if (kmemcheck_enabled) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); return page; } EXPORT_SYMBOL(__alloc_pages_internal); diff --git a/mm/slab.c b/mm/slab.c index 18e3164de09..af3376d0a83 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -114,6 +114,7 @@ #include <linux/rtmutex.h> #include <linux/reciprocal_div.h> #include <linux/debugobjects.h> +#include <linux/kmemcheck.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -179,13 +180,13 @@ SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) + SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) + SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) #endif /* @@ -380,87 +381,6 @@ static void kmem_list3_init(struct kmem_list3 *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -/* - * struct kmem_cache - * - * manages a cache. - */ - -struct kmem_cache { -/* 1) per-cpu data, touched during every alloc/free */ - struct array_cache *array[NR_CPUS]; -/* 2) Cache tunables. Protected by cache_chain_mutex */ - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - - unsigned int buffer_size; - u32 reciprocal_buffer_size; -/* 3) touched by every alloc & free from the backend */ - - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - -/* 4) cache_grow/shrink */ - /* order of pgs per slab (2^n) */ - unsigned int gfporder; - - /* force GFP flags, e.g. GFP_DMA */ - gfp_t gfpflags; - - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - struct kmem_cache *slabp_cache; - unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ - - /* constructor func */ - void (*ctor)(void *obj); - -/* 5) cache creation/removal */ - const char *name; - struct list_head next; - -/* 6) statistics */ -#if STATS - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - unsigned long node_overflow; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; -#endif -#if DEBUG - /* - * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. buffer_size contains the total - * object size including these internal fields, the following two - * variables contain the offset to the user object and its size. - */ - int obj_offset; - int obj_size; -#endif - /* - * We put nodelists[] at the end of kmem_cache, because we want to size - * this array to nr_node_ids slots instead of MAX_NUMNODES - * (see kmem_cache_init()) - * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache - * is statically defined, so we reserve the max number of nodes. - */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; - /* - * Do not add fields after nodelists[] - */ -}; - #define CFLGS_OFF_SLAB (0x80000000UL) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) @@ -1707,7 +1627,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) return NULL; @@ -1720,6 +1640,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) NR_SLAB_UNRECLAIMABLE, nr_pages); for (i = 0; i < nr_pages; i++) __SetPageSlab(page + i); + + if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { + kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); + + if (cachep->ctor) + kmemcheck_mark_uninitialized_pages(page, nr_pages); + else + kmemcheck_mark_unallocated_pages(page, nr_pages); + } + return page_address(page); } @@ -1732,6 +1662,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; + kmemcheck_free_shadow(page, cachep->gfporder); + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), NR_SLAB_RECLAIMABLE, nr_freed); @@ -3407,6 +3339,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, flags); + if (likely(ptr)) + kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); + if (unlikely((flags & __GFP_ZERO) && ptr)) memset(ptr, 0, obj_size(cachep)); @@ -3467,6 +3402,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) flags); prefetchw(objp); + if (likely(objp)) + kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); + if (unlikely((flags & __GFP_ZERO) && objp)) memset(objp, 0, obj_size(cachep)); @@ -3583,6 +3521,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + kmemcheck_slab_free(cachep, objp, obj_size(cachep)); + /* * Skip calling cache_free_alien() when the platform is not numa. * This will avoid cache misses that happen while accessing slabp (which diff --git a/mm/slub.c b/mm/slub.c index 30354bfeb43..15960a09abb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -18,6 +18,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmemtrace.h> +#include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/kmemleak.h> @@ -147,7 +148,7 @@ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA) + SLAB_CACHE_DMA | SLAB_NOTRACK) #ifndef ARCH_KMALLOC_MINALIGN #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) @@ -1071,6 +1072,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, { int order = oo_order(oo); + flags |= __GFP_NOTRACK; + if (node == -1) return alloc_pages(flags, order); else @@ -1098,6 +1101,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); } + + if (kmemcheck_enabled + && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) + { + int pages = 1 << oo_order(oo); + + kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + + /* + * Objects from caches that have a constructor don't get + * cleared when they're allocated, so we need to do it here. + */ + if (s->ctor) + kmemcheck_mark_uninitialized_pages(page, pages); + else + kmemcheck_mark_unallocated_pages(page, pages); + } + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? @@ -1171,6 +1192,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlubDebug(page); } + kmemcheck_free_shadow(page, compound_order(page)); + mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, @@ -1626,7 +1649,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, if (unlikely((gfpflags & __GFP_ZERO) && object)) memset(object, 0, objsize); + kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); + return object; } @@ -1759,6 +1784,7 @@ static __always_inline void slab_free(struct kmem_cache *s, kmemleak_free_recursive(x, s->flags); local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); + kmemcheck_slab_free(s, object, c->objsize); debug_check_no_locks_freed(object, c->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(object, c->objsize); @@ -2633,7 +2659,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) if (!s || !text || !kmem_cache_open(s, flags, text, realsize, ARCH_KMALLOC_MINALIGN, - SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { + SLAB_CACHE_DMA|SLAB_NOTRACK|__SYSFS_ADD_DEFERRED, + NULL)) { kfree(s); kfree(text); goto unlock_out; @@ -2727,9 +2754,10 @@ EXPORT_SYMBOL(__kmalloc); static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { - struct page *page = alloc_pages_node(node, flags | __GFP_COMP, - get_order(size)); + struct page *page; + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_pages_node(node, flags, get_order(size)); if (page) return page_address(page); else @@ -4412,6 +4440,8 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'a'; if (s->flags & SLAB_DEBUG_FREE) *p++ = 'F'; + if (!(s->flags & SLAB_NOTRACK)) + *p++ = 't'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1a94a303737..5c93435b034 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -39,6 +39,7 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/kmemcheck.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> @@ -201,6 +202,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; + kmemcheck_annotate_bitfield(skb, flags1); + kmemcheck_annotate_bitfield(skb, flags2); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); atomic_set(&shinfo->dataref, 1); @@ -217,6 +220,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, struct sk_buff *child = skb + 1; atomic_t *fclone_ref = (atomic_t *) (child + 1); + kmemcheck_annotate_bitfield(child, flags1); + kmemcheck_annotate_bitfield(child, flags2); skb->fclone = SKB_FCLONE_ORIG; atomic_set(fclone_ref, 1); @@ -635,6 +640,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); if (!n) return NULL; + + kmemcheck_annotate_bitfield(n, flags1); + kmemcheck_annotate_bitfield(n, flags2); n->fclone = SKB_FCLONE_UNAVAILABLE; } diff --git a/net/core/sock.c b/net/core/sock.c index 06e26b77ad9..b0ba569bc97 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -945,6 +945,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { + kmemcheck_annotate_bitfield(sk, flags); + if (security_sk_alloc(sk, family, priority)) goto out_free; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 68a8d892c71..61283f92882 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -9,6 +9,7 @@ */ #include <linux/kernel.h> +#include <linux/kmemcheck.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> @@ -120,6 +121,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat if (tw != NULL) { const struct inet_sock *inet = inet_sk(sk); + kmemcheck_annotate_bitfield(tw, flags); + /* Give us an identity. */ tw->tw_daddr = inet->daddr; tw->tw_rcv_saddr = inet->rcv_saddr; |