diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-02-09 13:51:16 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-02-09 13:51:20 +1100 |
commit | bd0044e8d5ddf308e4ac09734192418767700b82 (patch) | |
tree | 50fa4fea46466b3acd8e1bcca3ac69a9d323c6f7 | |
parent | 31fab8ad1ce0b55ddf9a356b374e3b556dc02a2b (diff) | |
parent | 5bf728f0221887fe3ddea3143c812404b85003d1 (diff) |
Merge remote-tracking branch 'tip/auto-latest'
293 files changed, 5135 insertions, 2792 deletions
diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt index 8a112dc304c3..34c3a1b50b9a 100644 --- a/Documentation/locking/ww-mutex-design.txt +++ b/Documentation/locking/ww-mutex-design.txt @@ -309,11 +309,15 @@ Design: normal mutex locks, which are far more common. As such there is only a small increase in code size if wait/wound mutexes are not used. + We maintain the following invariants for the wait list: + (1) Waiters with an acquire context are sorted by stamp order; waiters + without an acquire context are interspersed in FIFO order. + (2) Among waiters with contexts, only the first one can have other locks + acquired already (ctx->acquired > 0). Note that this waiter may come + after other waiters without contexts in the list. + In general, not much contention is expected. The locks are typically used to - serialize access to resources for devices. The only way to make wakeups - smarter would be at the cost of adding a field to struct mutex_waiter. This - would add overhead to all cases where normal mutexes are used, and - ww_mutexes are generally less performance sensitive. + serialize access to resources for devices. Lockdep: Special care has been taken to warn for as many cases of api abuse diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index 8e37b0ba2c9d..cbc1b46cbf70 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -408,6 +408,11 @@ CONTENTS * the new scheduling related syscalls that manipulate it, i.e., sched_setattr() and sched_getattr() are implemented. + For debugging purposes, the leftover runtime and absolute deadline of a + SCHED_DEADLINE task can be retrieved through /proc/<pid>/sched (entries + dl.runtime and dl.deadline, both values in ns). A programmatic way to + retrieve these values from production code is under discussion. + 4.3 Default behavior --------------------- @@ -476,6 +481,7 @@ CONTENTS Still missing: + - programmatic way to retrieve current runtime and absolute deadline - refinements to deadline inheritance, especially regarding the possibility of retaining bandwidth isolation among non-interacting tasks. This is being studied from both theoretical and practical points of view, and diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index a03f0d944fe6..d8fce3e78457 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -158,11 +158,11 @@ as its prone to starvation without deadline scheduling. Consider two sibling groups A and B; both have 50% bandwidth, but A's period is twice the length of B's. -* group A: period=100000us, runtime=10000us - - this runs for 0.01s once every 0.1s +* group A: period=100000us, runtime=50000us + - this runs for 0.05s once every 0.1s -* group B: period= 50000us, runtime=10000us - - this runs for 0.01s twice every 0.1s (or once every 0.05 sec). +* group B: period= 50000us, runtime=25000us + - this runs for 0.025s twice every 0.1s (or once every 0.05 sec). This means that currently a while (1) loop in A will run for the full period of B and can starve B's tasks (assuming they are of lower priority) for a whole diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt index d918d268cd72..51cf6fa5591f 100644 --- a/Documentation/x86/intel_rdt_ui.txt +++ b/Documentation/x86/intel_rdt_ui.txt @@ -212,3 +212,117 @@ Finally we move core 4-7 over to the new group and make sure that the kernel and the tasks running there get 50% of the cache. # echo C0 > p0/cpus + +4) Locking between applications + +Certain operations on the resctrl filesystem, composed of read/writes +to/from multiple files, must be atomic. + +As an example, the allocation of an exclusive reservation of L3 cache +involves: + + 1. Read the cbmmasks from each directory + 2. Find a contiguous set of bits in the global CBM bitmask that is clear + in any of the directory cbmmasks + 3. Create a new directory + 4. Set the bits found in step 2 to the new directory "schemata" file + +If two applications attempt to allocate space concurrently then they can +end up allocating the same bits so the reservations are shared instead of +exclusive. + +To coordinate atomic operations on the resctrlfs and to avoid the problem +above, the following locking procedure is recommended: + +Locking is based on flock, which is available in libc and also as a shell +script command + +Write lock: + + A) Take flock(LOCK_EX) on /sys/fs/resctrl + B) Read/write the directory structure. + C) funlock + +Read lock: + + A) Take flock(LOCK_SH) on /sys/fs/resctrl + B) If success read the directory structure. + C) funlock + +Example with bash: + +# Atomically read directory structure +$ flock -s /sys/fs/resctrl/ find /sys/fs/resctrl + +# Read directory contents and create new subdirectory + +$ cat create-dir.sh +find /sys/fs/resctrl/ > output.txt +mask = function-of(output.txt) +mkdir /sys/fs/resctrl/newres/ +echo mask > /sys/fs/resctrl/newres/schemata + +$ flock /sys/fs/resctrl/ ./create-dir.sh + +Example with C: + +/* + * Example code do take advisory locks + * before accessing resctrl filesystem + */ +#include <sys/file.h> +#include <stdlib.h> + +void resctrl_take_shared_lock(int fd) +{ + int ret; + + /* take shared lock on resctrl filesystem */ + ret = flock(fd, LOCK_SH); + if (ret) { + perror("flock"); + exit(-1); + } +} + +void resctrl_take_exclusive_lock(int fd) +{ + int ret; + + /* release lock on resctrl filesystem */ + ret = flock(fd, LOCK_EX); + if (ret) { + perror("flock"); + exit(-1); + } +} + +void resctrl_release_lock(int fd) +{ + int ret; + + /* take shared lock on resctrl filesystem */ + ret = flock(fd, LOCK_UN); + if (ret) { + perror("flock"); + exit(-1); + } +} + +void main(void) +{ + int fd, ret; + + fd = open("/sys/fs/resctrl", O_DIRECTORY); + if (fd == -1) { + perror("open"); + exit(-1); + } + resctrl_take_shared_lock(fd); + /* code to read directory contents */ + resctrl_release_lock(fd); + + resctrl_take_exclusive_lock(fd); + /* code to read and write directory contents */ + resctrl_release_lock(fd); +} diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index c7026429816b..8742d741d19a 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -27,6 +27,12 @@ struct thread_info { mm_segment_t addr_limit; /* user-level address space limit */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + __u64 utime; + __u64 stime; + __u64 gtime; + __u64 hardirq_time; + __u64 softirq_time; + __u64 idle_time; __u64 ac_stamp; __u64 ac_leave; __u64 ac_stime; diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 7ec7acc844c2..c483ece3eb84 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -619,6 +619,8 @@ setup_arch (char **cmdline_p) check_sal_cache_flush(); #endif paging_init(); + + clear_sched_clock_stable(); } /* diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 71775b95d6cc..d040f12ea9f9 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -61,16 +61,41 @@ static struct clocksource *itc_clocksource; extern cputime_t cycle_to_cputime(u64 cyc); -void vtime_account_user(struct task_struct *tsk) +void vtime_flush(struct task_struct *tsk) { - cputime_t delta_utime; struct thread_info *ti = task_thread_info(tsk); + cputime_t delta; - if (ti->ac_utime) { - delta_utime = cycle_to_cputime(ti->ac_utime); - account_user_time(tsk, delta_utime); - ti->ac_utime = 0; + if (ti->utime) + account_user_time(tsk, cycle_to_cputime(ti->utime)); + + if (ti->gtime) + account_guest_time(tsk, cycle_to_cputime(ti->gtime)); + + if (ti->idle_time) + account_idle_time(cycle_to_cputime(ti->idle_time)); + + if (ti->stime) { + delta = cycle_to_cputime(ti->stime); + account_system_index_time(tsk, delta, CPUTIME_SYSTEM); + } + + if (ti->hardirq_time) { + delta = cycle_to_cputime(ti->hardirq_time); + account_system_index_time(tsk, delta, CPUTIME_IRQ); + } + + if (ti->softirq_time) { + delta = cycle_to_cputime(ti->softirq_time); + account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ); } + + ti->utime = 0; + ti->gtime = 0; + ti->idle_time = 0; + ti->stime = 0; + ti->hardirq_time = 0; + ti->softirq_time = 0; } /* @@ -83,7 +108,7 @@ void arch_vtime_task_switch(struct task_struct *prev) struct thread_info *pi = task_thread_info(prev); struct thread_info *ni = task_thread_info(current); - pi->ac_stamp = ni->ac_stamp; + ni->ac_stamp = pi->ac_stamp; ni->ac_stime = ni->ac_utime = 0; } @@ -91,18 +116,15 @@ void arch_vtime_task_switch(struct task_struct *prev) * Account time for a transition between system, hard irq or soft irq state. * Note that this function is called with interrupts enabled. */ -static cputime_t vtime_delta(struct task_struct *tsk) +static __u64 vtime_delta(struct task_struct *tsk) { struct thread_info *ti = task_thread_info(tsk); - cputime_t delta_stime; - __u64 now; + __u64 now, delta_stime; WARN_ON_ONCE(!irqs_disabled()); now = ia64_get_itc(); - - delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); - ti->ac_stime = 0; + delta_stime = now - ti->ac_stamp; ti->ac_stamp = now; return delta_stime; @@ -110,15 +132,25 @@ static cputime_t vtime_delta(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - cputime_t delta = vtime_delta(tsk); - - account_system_time(tsk, 0, delta); + struct thread_info *ti = task_thread_info(tsk); + __u64 stime = vtime_delta(tsk); + + if ((tsk->flags & PF_VCPU) && !irq_count()) + ti->gtime += stime; + else if (hardirq_count()) + ti->hardirq_time += stime; + else if (in_serving_softirq()) + ti->softirq_time += stime; + else + ti->stime += stime; } EXPORT_SYMBOL_GPL(vtime_account_system); void vtime_account_idle(struct task_struct *tsk) { - account_idle_time(vtime_delta(tsk)); + struct thread_info *ti = task_thread_info(tsk); + + ti->idle_time += vtime_delta(tsk); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 2e66a887788e..068ed3607bac 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -36,6 +36,7 @@ #undef PCI_DEBUG #include <linux/proc_fs.h> #include <linux/export.h> +#include <linux/sched.h> #include <asm/processor.h> #include <asm/sections.h> @@ -176,6 +177,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; /* we use do_take_over_console() later ! */ #endif + clear_sched_clock_stable(); } /* diff --git a/arch/powerpc/include/asm/accounting.h b/arch/powerpc/include/asm/accounting.h index c133246df467..3abcf98ed2e0 100644 --- a/arch/powerpc/include/asm/accounting.h +++ b/arch/powerpc/include/asm/accounting.h @@ -12,9 +12,17 @@ /* Stuff for accurate time accounting */ struct cpu_accounting_data { - unsigned long user_time; /* accumulated usermode TB ticks */ - unsigned long system_time; /* accumulated system TB ticks */ - unsigned long user_time_scaled; /* accumulated usermode SPURR ticks */ + /* Accumulated cputime values to flush on ticks*/ + unsigned long utime; + unsigned long stime; + unsigned long utime_scaled; + unsigned long stime_scaled; + unsigned long gtime; + unsigned long hardirq_time; + unsigned long softirq_time; + unsigned long steal_time; + unsigned long idle_time; + /* Internal counters */ unsigned long starttime; /* TB value snapshot */ unsigned long starttime_user; /* TB value on exit to usermode */ unsigned long startspurr; /* SPURR value snapshot */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6a6792bb39fb..708c3e592eeb 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -187,7 +187,6 @@ struct paca_struct { /* Stuff for accurate time accounting */ struct cpu_accounting_data accounting; - u64 stolen_time; /* TB ticks taken by hypervisor */ u64 dtl_ridx; /* read index in dispatch log */ struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index c0a2e077d34d..1e8e556996e7 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -249,9 +249,9 @@ int main(void) DEFINE(ACCOUNT_STARTTIME_USER, offsetof(struct paca_struct, accounting.starttime_user)); DEFINE(ACCOUNT_USER_TIME, - offsetof(struct paca_struct, accounting.user_time)); + offsetof(struct paca_struct, accounting.utime)); DEFINE(ACCOUNT_SYSTEM_TIME, - offsetof(struct paca_struct, accounting.system_time)); + offsetof(struct paca_struct, accounting.stime)); DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save)); DEFINE(PACA_NAPSTATELOST, offsetof(struct paca_struct, nap_state_lost)); DEFINE(PACA_SPRG_VDSO, offsetof(struct paca_struct, sprg_vdso)); @@ -262,9 +262,9 @@ int main(void) DEFINE(ACCOUNT_STARTTIME_USER, offsetof(struct thread_info, accounting.starttime_user)); DEFINE(ACCOUNT_USER_TIME, - offsetof(struct thread_info, accounting.user_time)); + offsetof(struct thread_info, accounting.utime)); DEFINE(ACCOUNT_SYSTEM_TIME, - offsetof(struct thread_info, accounting.system_time)); + offsetof(struct thread_info, accounting.stime)); #endif #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index bc2e08d415fa..02e97305d22b 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -271,25 +271,19 @@ void accumulate_stolen_time(void) sst = scan_dispatch_log(acct->starttime_user); ust = scan_dispatch_log(acct->starttime); - acct->system_time -= sst; - acct->user_time -= ust; - local_paca->stolen_time += ust + sst; + acct->stime -= sst; + acct->utime -= ust; + acct->steal_time += ust + sst; local_paca->soft_enabled = save_soft_enabled; } static inline u64 calculate_stolen_time(u64 stop_tb) { - u64 stolen = 0; - - if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) { - stolen = scan_dispatch_log(stop_tb); - get_paca()->accounting.system_time -= stolen; - } + if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) + return scan_dispatch_log(stop_tb); - stolen += get_paca()->stolen_time; - get_paca()->stolen_time = 0; - return stolen; + return 0; } #else /* CONFIG_PPC_SPLPAR */ @@ -305,28 +299,27 @@ static inline u64 calculate_stolen_time(u64 stop_tb) * or soft irq state. */ static unsigned long vtime_delta(struct task_struct *tsk, - unsigned long *sys_scaled, - unsigned long *stolen) + unsigned long *stime_scaled, + unsigned long *steal_time) { unsigned long now, nowscaled, deltascaled; - unsigned long udelta, delta, user_scaled; + unsigned long stime; + unsigned long utime, utime_scaled; struct cpu_accounting_data *acct = get_accounting(tsk); WARN_ON_ONCE(!irqs_disabled()); now = mftb(); nowscaled = read_spurr(now); - acct->system_time += now - acct->starttime; + stime = now - acct->starttime; acct->starttime = now; deltascaled = nowscaled - acct->startspurr; acct->startspurr = nowscaled; - *stolen = calculate_stolen_time(now); + *steal_time = calculate_stolen_time(now); - delta = acct->system_time; - acct->system_time = 0; - udelta = acct->user_time - acct->utime_sspurr; - acct->utime_sspurr = acct->user_time; + utime = acct->utime - acct->utime_sspurr; + acct->utime_sspurr = acct->utime; /* * Because we don't read the SPURR on every kernel entry/exit, @@ -338,62 +331,104 @@ static unsigned long vtime_delta(struct task_struct *tsk, * the user ticks get saved up in paca->user_time_scaled to be * used by account_process_tick. */ - *sys_scaled = delta; - user_scaled = udelta; - if (deltascaled != delta + udelta) { - if (udelta) { - *sys_scaled = deltascaled * delta / (delta + udelta); - user_scaled = deltascaled - *sys_scaled; + *stime_scaled = stime; + utime_scaled = utime; + if (deltascaled != stime + utime) { + if (utime) { + *stime_scaled = deltascaled * stime / (stime + utime); + utime_scaled = deltascaled - *stime_scaled; } else { - *sys_scaled = deltascaled; + *stime_scaled = deltascaled; } } - acct->user_time_scaled += user_scaled; + acct->utime_scaled += utime_scaled; - return delta; + return stime; } void vtime_account_system(struct task_struct *tsk) { - unsigned long delta, sys_scaled, stolen; + unsigned long stime, stime_scaled, steal_time; + struct cpu_accounting_data *acct = get_accounting(tsk); + + stime = vtime_delta(tsk, &stime_scaled, &steal_time); + + stime -= min(stime, steal_time); + acct->steal_time += steal_time; - delta = vtime_delta(tsk, &sys_scaled, &stolen); - account_system_time(tsk, 0, delta); - tsk->stimescaled += sys_scaled; - if (stolen) - account_steal_time(stolen); + if ((tsk->flags & PF_VCPU) && !irq_count()) { + acct->gtime += stime; + acct->utime_scaled += stime_scaled; + } else { + if (hardirq_count()) + acct->hardirq_time += stime; + else if (in_serving_softirq()) + acct->softirq_time += stime; + else + acct->stime += stime; + + acct->stime_scaled += stime_scaled; + } } EXPORT_SYMBOL_GPL(vtime_account_system); void vtime_account_idle(struct task_struct *tsk) { - unsigned long delta, sys_scaled, stolen; + unsigned long stime, stime_scaled, steal_time; + struct cpu_accounting_data *acct = get_accounting(tsk); - delta = vtime_delta(tsk, &sys_scaled, &stolen); - account_idle_time(delta + stolen); + stime = vtime_delta(tsk, &stime_scaled, &steal_time); + acct->idle_time += stime + steal_time; } /* - * Transfer the user time accumulated in the paca - * by the exception entry and exit code to the generic - * process user time records. + * Account the whole cputime accumulated in the paca * Must be called with interrupts disabled. * Assumes that vtime_account_system/idle() has been called * recently (i.e. since the last entry from usermode) so that * get_paca()->user_time_scaled is up to date. */ -void vtime_account_user(struct task_struct *tsk) +void vtime_flush(struct task_struct *tsk) { - cputime_t utime, utimescaled; struct cpu_accounting_data *acct = get_accounting(tsk); - utime = acct->user_time; - utimescaled = acct->user_time_scaled; - acct->user_time = 0; - acct->user_time_scaled = 0; + if (acct->utime) + account_user_time(tsk, acct->utime); + + if (acct->utime_scaled) + tsk->utimescaled += acct->utime_scaled; + + if (acct->gtime) + account_guest_time(tsk, acct->gtime); + + if (acct->steal_time) + account_steal_time(acct->steal_time); + + if (acct->idle_time) + account_idle_time(acct->idle_time); + + if (acct->stime) + account_system_index_time(tsk, acct->stime, CPUTIME_SYSTEM); + + if (acct->stime_scaled) + tsk->stimescaled += acct->stime_scaled; + + if (acct->hardirq_time) + account_system_index_time(tsk, acct->hardirq_time, CPUTIME_IRQ); + + if (acct->softirq_time) + account_system_index_time(tsk, acct->softirq_time, CPUTIME_SOFTIRQ); + + acct->utime = 0; + acct->utime_scaled = 0; acct->utime_sspurr = 0; - account_user_time(tsk, utime); - tsk->utimescaled += utimescaled; + acct->gtime = 0; + acct->steal_time = 0; + acct->idle_time = 0; + acct->stime = 0; + acct->stime_scaled = 0; + acct->hardirq_time = 0; + acct->softirq_time = 0; } #ifdef CONFIG_PPC32 @@ -407,8 +442,7 @@ void arch_vtime_task_switch(struct task_struct *prev) struct cpu_accounting_data *acct = get_accounting(current); acct->starttime = get_accounting(prev)->starttime; - acct->system_time = 0; - acct->user_time = 0; + acct->startspurr = get_accounting(prev)->startspurr; } #endif /* CONFIG_PPC32 */ diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index a44b049b9cf6..1be0499f5397 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2287,14 +2287,14 @@ static void dump_one_paca(int cpu) DUMP(p, subcore_sibling_mask, "x"); #endif - DUMP(p, accounting.user_time, "llx"); - DUMP(p, accounting.system_time, "llx"); - DUMP(p, accounting.user_time_scaled, "llx"); + DUMP(p, accounting.utime, "llx"); + DUMP(p, accounting.stime, "llx"); + DUMP(p, accounting.utime_scaled, "llx"); DUMP(p, accounting.starttime, "llx"); DUMP(p, accounting.starttime_user, "llx"); DUMP(p, accounting.startspurr, "llx"); DUMP(p, accounting.utime_sspurr, "llx"); - DUMP(p, stolen_time, "llx"); + DUMP(p, accounting.steal_time, "llx"); #undef DUMP catch_memory_errors = 0; diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 9bfad2ad6312..61261e0e95c0 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -85,53 +85,56 @@ struct lowcore { __u64 mcck_enter_timer; /* 0x02c0 */ __u64 exit_timer; /* 0x02c8 */ __u64 user_timer; /* 0x02d0 */ - __u64 system_timer; /* 0x02d8 */ - __u64 steal_timer; /* 0x02e0 */ - __u64 last_update_timer; /* 0x02e8 */ - __u64 last_update_clock; /* 0x02f0 */ - __u64 int_clock; /* 0x02f8 */ - __u64 mcck_clock; /* 0x0300 */ - __u64 clock_comparator; /* 0x0308 */ + __u64 guest_timer; /* 0x02d8 */ + __u64 system_timer; /* 0x02e0 */ + __u64 hardirq_timer; /* 0x02e8 */ + __u64 softirq_timer; /* 0x02f0 */ + __u64 steal_timer; /* 0x02f8 */ + __u64 last_update_timer; /* 0x0300 */ + __u64 last_update_clock; /* 0x0308 */ + __u64 int_clock; /* 0x0310 */ + __u64 mcck_clock; /* 0x0318 */ + __u64 clock_comparator; /* 0x0320 */ /* Current process. */ - __u64 current_task; /* 0x0310 */ - __u8 pad_0x318[0x320-0x318]; /* 0x0318 */ - __u64 kernel_stack; /* 0x0320 */ + __u64 current_task; /* 0x0328 */ + __u8 pad_0x318[0x320-0x318]; /* 0x0330 */ + __u64 kernel_stack; /* 0x0338 */ /* Interrupt, panic and restart stack. */ - __u64 async_stack; /* 0x0328 */ - __u64 panic_stack; /* 0x0330 */ - __u64 restart_stack; /* 0x0338 */ + __u64 async_stack; /* 0x0340 */ + __u64 panic_stack; /* 0x0348 */ + __u64 restart_stack; /* 0x0350 */ /* Restart function and parameter. */ - __u64 restart_fn; /* 0x0340 */ - __u64 restart_data; /* 0x0348 */ - __u64 restart_source; /* 0x0350 */ + __u64 restart_fn; /* 0x0358 */ + __u64 restart_data; /* 0x0360 */ + __u64 restart_source; /* 0x0368 */ /* Address space pointer. */ - __u64 kernel_asce; /* 0x0358 */ - __u64 user_asce; /* 0x0360 */ + __u64 kernel_asce; /* 0x0370 */ + __u64 user_asce; /* 0x0378 */ /* * The lpp and current_pid fields form a * 64-bit value that is set as program * parameter with the LPP instruction. */ - __u32 lpp; /* 0x0368 */ - __u32 current_pid; /* 0x036c */ + __u32 lpp; /* 0x0380 */ + __u32 current_pid; /* 0x0384 */ /* SMP info area */ - __u32 cpu_nr; /* 0x0370 */ - __u32 softirq_pending; /* 0x0374 */ - __u64 percpu_offset; /* 0x0378 */ - __u64 vdso_per_cpu_data; /* 0x0380 */ - __u64 machine_flags; /* 0x0388 */ - __u32 preempt_count; /* 0x0390 */ - __u8 pad_0x0394[0x0398-0x0394]; /* 0x0394 */ - __u64 gmap; /* 0x0398 */ - __u32 spinlock_lockval; /* 0x03a0 */ - __u32 fpu_flags; /* 0x03a4 */ - __u8 pad_0x03a8[0x0400-0x03a8]; /* 0x03a8 */ + __u32 cpu_nr; /* 0x0388 */ + __u32 softirq_pending; /* 0x038c */ + __u64 percpu_offset; /* 0x0390 */ + __u64 vdso_per_cpu_data; /* 0x0398 */ + __u64 machine_flags; /* 0x03a0 */ + __u32 preempt_count; /* 0x03a8 */ + __u8 pad_0x03ac[0x03b0-0x03ac]; /* 0x03ac */ + __u64 gmap; /* 0x03b0 */ + __u32 spinlock_lockval; /* 0x03b8 */ + __u32 fpu_flags; /* 0x03bc */ + __u8 pad_0x03c0[0x0400-0x03c0]; /* 0x03c0 */ /* Per cpu primary space access list */ __u32 paste[16]; /* 0x0400 */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 6bca916a5ba0..977a5b6501b8 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -111,7 +111,10 @@ struct thread_struct { unsigned int acrs[NUM_ACRS]; unsigned long ksp; /* kernel stack pointer */ unsigned long user_timer; /* task cputime in user space */ + unsigned long guest_timer; /* task cputime in kvm guest */ unsigned long system_timer; /* task cputime in kernel space */ + unsigned long hardirq_timer; /* task cputime in hardirq context */ + unsigned long softirq_timer; /* task cputime in softirq context */ unsigned long sys_call_table; /* system call table address */ mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 1b5c5ee9fc1b..0a9e5d67547d 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -90,14 +90,41 @@ static void update_mt_scaling(void) __this_cpu_write(mt_scaling_jiffies, jiffies_64); } +static inline u64 update_tsk_timer(unsigned long *tsk_vtime, u64 new) +{ + u64 delta; + + delta = new - *tsk_vtime; + *tsk_vtime = new; + return delta; +} + + +static inline u64 scale_vtime(u64 vtime) +{ + u64 mult = __this_cpu_read(mt_scaling_mult); + u64 div = __this_cpu_read(mt_scaling_div); + + if (smp_cpu_mtid) + return vtime * mult / div; + return vtime; +} + +static void account_system_index_scaled(struct task_struct *p, + cputime_t cputime, cputime_t scaled, + enum cpu_usage_stat index) +{ + p->stimescaled += scaled; + account_system_index_time(p, cputime, index); +} + /* * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ static int do_account_vtime(struct task_struct *tsk) { - u64 timer, clock, user, system, steal; - u64 user_scaled, system_scaled; + u64 timer, clock, user, guest, system, hardirq, softirq, steal; timer = S390_lowcore.last_update_timer; clock = S390_lowcore.last_update_clock; @@ -110,36 +137,53 @@ static int do_account_vtime(struct task_struct *tsk) #endif : "=m" (S390_lowcore.last_update_timer), "=m" (S390_lowcore.last_update_clock)); - S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; - S390_lowcore.steal_timer += S390_lowcore.last_update_clock - clock; + clock = S390_lowcore.last_update_clock - clock; + timer -= S390_lowcore.last_update_timer; + + if (hardirq_count()) + S390_lowcore.hardirq_timer += timer; + else + S390_lowcore.system_timer += timer; /* Update MT utilization calculation */ if (smp_cpu_mtid && time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) update_mt_scaling(); - user = S390_lowcore.user_timer - tsk->thread.user_timer; - S390_lowcore.steal_timer -= user; - tsk->thread.user_timer = S390_lowcore.user_timer; - - system = S390_lowcore.system_timer - tsk->thread.system_timer; - S390_lowcore.steal_timer -= system; - tsk->thread.system_timer = S390_lowcore.system_timer; - - user_scaled = user; - system_scaled = system; - /* Do MT utilization scaling */ - if (smp_cpu_mtid) { - u64 mult = __this_cpu_read(mt_scaling_mult); - u64 div = __this_cpu_read(mt_scaling_div); + /* Calculate cputime delta */ + user = update_tsk_timer(&tsk->thread.user_timer, + READ_ONCE(S390_lowcore.user_timer)); + guest = update_tsk_timer(&tsk->thread.guest_timer, + READ_ONCE(S390_lowcore.guest_timer)); + system = update_tsk_timer(&tsk->thread.system_timer, + READ_ONCE(S390_lowcore.system_timer)); + hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, + READ_ONCE(S390_lowcore.hardirq_timer)); + softirq = update_tsk_timer(&tsk->thread.softirq_timer, + READ_ONCE(S390_lowcore.softirq_timer)); + S390_lowcore.steal_timer += + clock - user - guest - system - hardirq - softirq; + + /* Push account value */ + if (user) { + account_user_time(tsk, user); + tsk->utimescaled += scale_vtime(user); + } - user_scaled = (user_scaled * mult) / div; - system_scaled = (system_scaled * mult) / div; + if (guest) { + account_guest_time(tsk, guest); + tsk->utimescaled += scale_vtime(guest); } - account_user_time(tsk, user); - tsk->utimescaled += user_scaled; - account_system_time(tsk, 0, system); - tsk->stimescaled += system_scaled; + + if (system) + account_system_index_scaled(tsk, system, scale_vtime(system), + CPUTIME_SYSTEM); + if (hardirq) + account_system_index_scaled(tsk, hardirq, scale_vtime(hardirq), + CPUTIME_IRQ); + if (softirq) + account_system_index_scaled(tsk, softirq, scale_vtime(softirq), + CPUTIME_SOFTIRQ); steal = S390_lowcore.steal_timer; if ((s64) steal > 0) { @@ -147,16 +191,22 @@ static int do_account_vtime(struct task_struct *tsk) account_steal_time(steal); } - return virt_timer_forward(user + system); + return virt_timer_forward(user + guest + system + hardirq + softirq); } void vtime_task_switch(struct task_struct *prev) { do_account_vtime(prev); prev->thread.user_timer = S390_lowcore.user_timer; + prev->thread.guest_timer = S390_lowcore.guest_timer; prev->thread.system_timer = S390_lowcore.system_timer; + prev->thread.hardirq_timer = S390_lowcore.hardirq_timer; + prev->thread.softirq_timer = S390_lowcore.softirq_timer; S390_lowcore.user_timer = current->thread.user_timer; + S390_lowcore.guest_timer = current->thread.guest_timer; S390_lowcore.system_timer = current->thread.system_timer; + S390_lowcore.hardirq_timer = current->thread.hardirq_timer; + S390_lowcore.softirq_timer = current->thread.softirq_timer; } /* @@ -164,7 +214,7 @@ void vtime_task_switch(struct task_struct *prev) * accounting system time in order to correctly compute * the stolen time accounting. */ -void vtime_account_user(struct task_struct *tsk) +void vtime_flush(struct task_struct *tsk) { if (do_account_vtime(tsk)) virt_timer_expire(); @@ -176,32 +226,22 @@ void vtime_account_user(struct task_struct *tsk) */ void vtime_account_irq_enter(struct task_struct *tsk) { - u64 timer, system, system_scaled; + u64 timer; timer = S390_lowcore.last_update_timer; S390_lowcore.last_update_timer = get_vtimer(); - S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; - - /* Update MT utilization calculation */ - if (smp_cpu_mtid && - time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) - update_mt_scaling(); - - system = S390_lowcore.system_timer - tsk->thread.system_timer; - S390_lowcore.steal_timer -= system; - tsk->thread.system_timer = S390_lowcore.system_timer; - system_scaled = system; - /* Do MT utilization scaling */ - if (smp_cpu_mtid) { - u64 mult = __this_cpu_read(mt_scaling_mult); - u64 div = __this_cpu_read(mt_scaling_div); - - system_scaled = (system_scaled * mult) / div; - } - account_system_time(tsk, 0, system); - tsk->stimescaled += system_scaled; - - virt_timer_forward(system); + timer -= S390_lowcore.last_update_timer; + + if ((tsk->flags & PF_VCPU) && (irq_count() == 0)) + S390_lowcore.guest_timer += timer; + else if (hardirq_count()) + S390_lowcore.hardirq_timer += timer; + else if (in_serving_softirq()) + S390_lowcore.softirq_timer += timer; + else + S390_lowcore.system_timer += timer; + + virt_timer_forward(timer); } EXPORT_SYMBOL_GPL(vtime_account_irq_enter); diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild index 2d1f5638974c..20f2ba6d79be 100644 --- a/arch/tile/include/asm/Kbuild +++ b/arch/tile/include/asm/Kbuild @@ -5,7 +5,6 @@ generic-y += bug.h generic-y += bugs.h generic-y += clkdev.h generic-y += cputime.h -generic-y += div64.h generic-y += emergency-restart.h generic-y += errno.h generic-y += exec.h diff --git a/arch/tile/include/asm/div64.h b/arch/tile/include/asm/div64.h new file mode 100644 index 000000000000..bf6161966dfa --- /dev/null +++ b/arch/tile/include/asm/div64.h @@ -0,0 +1,14 @@ +#ifndef _ASM_TILE_DIV64_H +#define _ASM_TILE_DIV64_H + +#ifdef __tilegx__ +static inline u64 mul_u32_u32(u32 a, u32 b) +{ + return __insn_mul_lu_lu(a, b); +} +#define mul_u32_u32 mul_u32_u32 +#endif + +#include <asm-generic/div64.h> + +#endif /* _ASM_TILE_DIV64_H */ diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index 05523f14d7b2..57f03050c850 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -76,7 +76,7 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, add_sigio_fd(random_fd); add_wait_queue(&host_read_wait, &wait); - set_task_state(current, TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); schedule(); remove_wait_queue(&host_read_wait, &wait); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 43f6efae73cf..ddf6d788470e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1071,7 +1071,7 @@ config X86_MCE_THRESHOLD def_bool y config X86_MCE_INJECT - depends on X86_MCE + depends on X86_MCE && X86_LOCAL_APIC tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index e5612f3e3b57..9b42b6d1e902 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -333,6 +333,7 @@ size_t strnlen(const char *s, size_t maxlen); unsigned int atou(const char *s); unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); size_t strlen(const char *s); +char *strchr(const char *s, int c); /* tty.c */ void puts(const char *); diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index a66854d99ee1..8b7c9e75edcb 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -11,6 +11,7 @@ */ #include "misc.h" #include "error.h" +#include "../boot.h" #include <generated/compile.h> #include <linux/module.h> @@ -52,15 +53,22 @@ static unsigned long get_boot_seed(void) #include "../../lib/kaslr.c" struct mem_vector { - unsigned long start; - unsigned long size; + unsigned long long start; + unsigned long long size; }; +/* Only supporting at most 4 unusable memmap regions with kaslr */ +#define MAX_MEMMAP_REGIONS 4 + +static bool memmap_too_large; + enum mem_avoid_index { MEM_AVOID_ZO_RANGE = 0, MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, MEM_AVOID_BOOTPARAMS, + MEM_AVOID_MEMMAP_BEGIN, + MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1, MEM_AVOID_MAX, }; @@ -77,6 +85,123 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) return true; } +/** + * _memparse - Parse a string with mem suffixes into a number + * @ptr: Where parse begins + * @retptr: (output) Optional pointer to next char after parse completes + * + * Parses a string into a number. The number stored at @ptr is + * potentially suffixed with K, M, G, T, P, E. + */ +static unsigned long long _memparse(const char *ptr, char **retptr) +{ + char *endptr; /* Local pointer to end of parsed string */ + + unsigned long long ret = simple_strtoull(ptr, &endptr, 0); + + switch (*endptr) { + case 'E': + case 'e': + ret <<= 10; + case 'P': + case 'p': + ret <<= 10; + case 'T': + case 't': + ret <<= 10; + case 'G': + case 'g': + ret <<= 10; + case 'M': + case 'm': + ret <<= 10; + case 'K': + case 'k': + ret <<= 10; + endptr++; + default: + break; + } + + if (retptr) + *retptr = endptr; + + return ret; +} + +static int +parse_memmap(char *p, unsigned long long *start, unsigned long long *size) +{ + char *oldp; + + if (!p) + return -EINVAL; + + /* We don't care about this option here */ + if (!strncmp(p, "exactmap", 8)) + return -EINVAL; + + oldp = p; + *size = _memparse(p, &p); + if (p == oldp) + return -EINVAL; + + switch (*p) { + case '@': + /* Skip this region, usable */ + *start = 0; + *size = 0; + return 0; + case '#': + case '$': + case '!': + *start = _memparse(p + 1, &p); + return 0; + } + + return -EINVAL; +} + +static void mem_avoid_memmap(void) +{ + char arg[128]; + int rc; + int i; + char *str; + + /* See if we have any memmap areas */ + rc = cmdline_find_option("memmap", arg, sizeof(arg)); + if (rc <= 0) + return; + + i = 0; + str = arg; + while (str && (i < MAX_MEMMAP_REGIONS)) { + int rc; + unsigned long long start, size; + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + + rc = parse_memmap(str, &start, &size); + if (rc < 0) + break; + str = k; + /* A usable region that should not be skipped */ + if (size == 0) + continue; + + mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start; + mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size; + i++; + } + + /* More than 4 memmaps, fail kaslr */ + if ((i >= MAX_MEMMAP_REGIONS) && str) + memmap_too_large = true; +} + /* * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). * The mem_avoid array is used to store the ranges that need to be avoided @@ -197,6 +322,9 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* We don't need to set a mapping for setup_data. */ + /* Mark the memmap regions we need to avoid */ + mem_avoid_memmap(); + #ifdef CONFIG_X86_VERBOSE_BOOTUP /* Make sure video RAM can be used. */ add_identity_map(0, PMD_SIZE); @@ -379,6 +507,12 @@ static unsigned long find_random_phys_addr(unsigned long minimum, int i; unsigned long addr; + /* Check if we had too many memmaps. */ + if (memmap_too_large) { + debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n"); + return 0; + } + /* Make sure minimum is aligned. */ minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); @@ -456,7 +590,7 @@ void choose_random_location(unsigned long input, /* Walk e820 and find a random address. */ random_addr = find_random_phys_addr(min_addr, output_size); if (!random_addr) { - warn("KASLR disabled: could not find suitable E820 region!"); + warn("Physical KASLR disabled: no suitable memory region!"); } else { /* Update the new physical address location. */ if (*output != random_addr) { diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 9e240fcba784..5457b02fc050 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -156,3 +156,16 @@ char *strstr(const char *s1, const char *s2) } return NULL; } + +/** + * strchr - Find the first occurrence of the character c in the string s. + * @s: the string to be searched + * @c: the character to search for + */ +char *strchr(const char *s, int c) +{ + while (*s != (char)c) + if (*s++ == '\0') + return NULL; + return (char *)s; +} diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 1d392c39fe56..b8ccdb5c9244 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,11 +1,4 @@ -obj-y += core.o - -obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o -obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += amd/power.o -obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o -ifdef CONFIG_AMD_IOMMU -obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o -endif - -obj-$(CONFIG_CPU_SUP_INTEL) += msr.o +obj-y += core.o +obj-y += amd/ +obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile new file mode 100644 index 000000000000..b1da46f396e0 --- /dev/null +++ b/arch/x86/events/amd/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o +obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o +obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o +ifdef CONFIG_AMD_IOMMU +obj-$(CONFIG_CPU_SUP_AMD) += iommu.o +endif + diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index a0b1bdb3ad42..4d1f7f2d9aff 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -22,13 +22,17 @@ #define NUM_COUNTERS_NB 4 #define NUM_COUNTERS_L2 4 -#define MAX_COUNTERS NUM_COUNTERS_NB +#define NUM_COUNTERS_L3 6 +#define MAX_COUNTERS 6 #define RDPMC_BASE_NB 6 -#define RDPMC_BASE_L2 10 +#define RDPMC_BASE_LLC 10 #define COUNTER_SHIFT 16 +static int num_counters_llc; +static int num_counters_nb; + static HLIST_HEAD(uncore_unused_list); struct amd_uncore { @@ -45,30 +49,30 @@ struct amd_uncore { }; static struct amd_uncore * __percpu *amd_uncore_nb; -static struct amd_uncore * __percpu *amd_uncore_l2; +static struct amd_uncore * __percpu *amd_uncore_llc; static struct pmu amd_nb_pmu; -static struct pmu amd_l2_pmu; +static struct pmu amd_llc_pmu; static cpumask_t amd_nb_active_mask; -static cpumask_t amd_l2_active_mask; +static cpumask_t amd_llc_active_mask; static bool is_nb_event(struct perf_event *event) { return event->pmu->type == amd_nb_pmu.type; } -static bool is_l2_event(struct perf_event *event) +static bool is_llc_event(struct perf_event *event) { - return event->pmu->type == amd_l2_pmu.type; + return event->pmu->type == amd_llc_pmu.type; } static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) { if (is_nb_event(event) && amd_uncore_nb) return *per_cpu_ptr(amd_uncore_nb, event->cpu); - else if (is_l2_event(event) && amd_uncore_l2) - return *per_cpu_ptr(amd_uncore_l2, event->cpu); + else if (is_llc_event(event) && amd_uncore_llc) + return *per_cpu_ptr(amd_uncore_llc, event->cpu); return NULL; } @@ -183,16 +187,16 @@ static int amd_uncore_event_init(struct perf_event *event) return -ENOENT; /* - * NB and L2 counters (MSRs) are shared across all cores that share the - * same NB / L2 cache. Interrupts can be directed to a single target - * core, however, event counts generated by processes running on other - * cores cannot be masked out. So we do not support sampling and - * per-thread events. + * NB and Last level cache counters (MSRs) are shared across all cores + * that share the same NB / Last level cache. Interrupts can be directed + * to a single target core, however, event counts generated by processes + * running on other cores cannot be masked out. So we do not support + * sampling and per-thread events. */ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) return -EINVAL; - /* NB and L2 counters do not have usr/os/guest/host bits */ + /* NB and Last level cache counters do not have usr/os/guest/host bits */ if (event->attr.exclude_user || event->attr.exclude_kernel || event->attr.exclude_host || event->attr.exclude_guest) return -EINVAL; @@ -226,8 +230,8 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, if (pmu->type == amd_nb_pmu.type) active_mask = &amd_nb_active_mask; - else if (pmu->type == amd_l2_pmu.type) - active_mask = &amd_l2_active_mask; + else if (pmu->type == amd_llc_pmu.type) + active_mask = &amd_llc_active_mask; else return 0; @@ -244,30 +248,47 @@ static struct attribute_group amd_uncore_attr_group = { .attrs = amd_uncore_attrs, }; -PMU_FORMAT_ATTR(event, "config:0-7,32-35"); -PMU_FORMAT_ATTR(umask, "config:8-15"); - -static struct attribute *amd_uncore_format_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - NULL, -}; - -static struct attribute_group amd_uncore_format_group = { - .name = "format", - .attrs = amd_uncore_format_attr, +/* + * Similar to PMU_FORMAT_ATTR but allowing for format_attr to be assigned based + * on family + */ +#define AMD_FORMAT_ATTR(_dev, _name, _format) \ +static ssize_t \ +_dev##_show##_name(struct device *dev, \ + struct device_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct device_attribute format_attr_##_dev##_name = __ATTR_RO(_dev); + +/* Used for each uncore counter type */ +#define AMD_ATTRIBUTE(_name) \ +static struct attribute *amd_uncore_format_attr_##_name[] = { \ + &format_attr_event_##_name.attr, \ + &format_attr_umask.attr, \ + NULL, \ +}; \ +static struct attribute_group amd_uncore_format_group_##_name = { \ + .name = "format", \ + .attrs = amd_uncore_format_attr_##_name, \ +}; \ +static const struct attribute_group *amd_uncore_attr_groups_##_name[] = { \ + &amd_uncore_attr_group, \ + &amd_uncore_format_group_##_name, \ + NULL, \ }; -static const struct attribute_group *amd_uncore_attr_groups[] = { - &amd_uncore_attr_group, - &amd_uncore_format_group, - NULL, -}; +AMD_FORMAT_ATTR(event, , "config:0-7,32-35"); +AMD_FORMAT_ATTR(umask, , "config:8-15"); +AMD_FORMAT_ATTR(event, _df, "config:0-7,32-35,59-60"); +AMD_FORMAT_ATTR(event, _l3, "config:0-7"); +AMD_ATTRIBUTE(df); +AMD_ATTRIBUTE(l3); static struct pmu amd_nb_pmu = { .task_ctx_nr = perf_invalid_context, - .attr_groups = amd_uncore_attr_groups, - .name = "amd_nb", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -276,10 +297,8 @@ static struct pmu amd_nb_pmu = { .read = amd_uncore_read, }; -static struct pmu amd_l2_pmu = { +static struct pmu amd_llc_pmu = { .task_ctx_nr = perf_invalid_context, - .attr_groups = amd_uncore_attr_groups, - .name = "amd_l2", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -296,14 +315,14 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) static int amd_uncore_cpu_up_prepare(unsigned int cpu) { - struct amd_uncore *uncore_nb = NULL, *uncore_l2; + struct amd_uncore *uncore_nb = NULL, *uncore_llc; if (amd_uncore_nb) { uncore_nb = amd_uncore_alloc(cpu); if (!uncore_nb) goto fail; uncore_nb->cpu = cpu; - uncore_nb->num_counters = NUM_COUNTERS_NB; + uncore_nb->num_counters = num_counters_nb; uncore_nb->rdpmc_base = RDPMC_BASE_NB; uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; uncore_nb->active_mask = &amd_nb_active_mask; @@ -312,18 +331,18 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } - if (amd_uncore_l2) { - uncore_l2 = amd_uncore_alloc(cpu); - if (!uncore_l2) + if (amd_uncore_llc) { + uncore_llc = amd_uncore_alloc(cpu); + if (!uncore_llc) goto fail; - uncore_l2->cpu = cpu; - uncore_l2->num_counters = NUM_COUNTERS_L2; - uncore_l2->rdpmc_base = RDPMC_BASE_L2; - uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; - uncore_l2->active_mask = &amd_l2_active_mask; - uncore_l2->pmu = &amd_l2_pmu; - uncore_l2->id = -1; - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; + uncore_llc->cpu = cpu; + uncore_llc->num_counters = num_counters_llc; + uncore_llc->rdpmc_base = RDPMC_BASE_LLC; + uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore_llc->active_mask = &amd_llc_active_mask; + uncore_llc->pmu = &amd_llc_pmu; + uncore_llc->id = -1; + *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc; } return 0; @@ -376,17 +395,17 @@ static int amd_uncore_cpu_starting(unsigned int cpu) *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; } - if (amd_uncore_l2) { + if (amd_uncore_llc) { unsigned int apicid = cpu_data(cpu).apicid; unsigned int nshared; - uncore = *per_cpu_ptr(amd_uncore_l2, cpu); + uncore = *per_cpu_ptr(amd_uncore_llc, cpu); cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); nshared = ((eax >> 14) & 0xfff) + 1; uncore->id = apicid - (apicid % nshared); - uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); + *per_cpu_ptr(amd_uncore_llc, cpu) = uncore; } return 0; @@ -419,8 +438,8 @@ static int amd_uncore_cpu_online(unsigned int cpu) if (amd_uncore_nb) uncore_online(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_online(cpu, amd_uncore_l2); + if (amd_uncore_llc) + uncore_online(cpu, amd_uncore_llc); return 0; } @@ -456,8 +475,8 @@ static int amd_uncore_cpu_down_prepare(unsigned int cpu) if (amd_uncore_nb) uncore_down_prepare(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_down_prepare(cpu, amd_uncore_l2); + if (amd_uncore_llc) + uncore_down_prepare(cpu, amd_uncore_llc); return 0; } @@ -479,8 +498,8 @@ static int amd_uncore_cpu_dead(unsigned int cpu) if (amd_uncore_nb) uncore_dead(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_dead(cpu, amd_uncore_l2); + if (amd_uncore_llc) + uncore_dead(cpu, amd_uncore_llc); return 0; } @@ -492,6 +511,47 @@ static int __init amd_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) goto fail_nodev; + switch(boot_cpu_data.x86) { + case 23: + /* Family 17h: */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L3; + /* + * For Family17h, the NorthBridge counters are + * re-purposed as Data Fabric counters. Also, support is + * added for L3 counters. The pmus are exported based on + * family as either L2 or L3 and NB or DF. + */ + amd_nb_pmu.name = "amd_df"; + amd_llc_pmu.name = "amd_l3"; + format_attr_event_df.show = &event_show_df; + format_attr_event_l3.show = &event_show_l3; + break; + case 22: + /* Family 16h - may change: */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + amd_nb_pmu.name = "amd_nb"; + amd_llc_pmu.name = "amd_l2"; + format_attr_event_df = format_attr_event; + format_attr_event_l3 = format_attr_event; + break; + default: + /* + * All prior families have the same number of + * NorthBridge and Last Level Cache counters + */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + amd_nb_pmu.name = "amd_nb"; + amd_llc_pmu.name = "amd_l2"; + format_attr_event_df = format_attr_event; + format_attr_event_l3 = format_attr_event; + break; + } + amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; + amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3; + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) goto fail_nodev; @@ -510,16 +570,16 @@ static int __init amd_uncore_init(void) } if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { - amd_uncore_l2 = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_l2) { + amd_uncore_llc = alloc_percpu(struct amd_uncore *); + if (!amd_uncore_llc) { ret = -ENOMEM; - goto fail_l2; + goto fail_llc; } - ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + ret = perf_pmu_register(&amd_llc_pmu, amd_llc_pmu.name, -1); if (ret) - goto fail_l2; + goto fail_llc; - pr_info("perf: AMD L2I counters detected\n"); + pr_info("perf: AMD LLC counters detected\n"); ret = 0; } @@ -529,7 +589,7 @@ static int __init amd_uncore_init(void) if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP, "perf/x86/amd/uncore:prepare", amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead)) - goto fail_l2; + goto fail_llc; if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, "perf/x86/amd/uncore:starting", @@ -546,11 +606,11 @@ fail_start: cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); fail_prep: cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); -fail_l2: +fail_llc: if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) perf_pmu_unregister(&amd_nb_pmu); - if (amd_uncore_l2) - free_percpu(amd_uncore_l2); + if (amd_uncore_llc) + free_percpu(amd_uncore_llc); fail_nb: if (amd_uncore_nb) free_percpu(amd_uncore_nb); diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 0c5fbc68e82d..eff8e36aaf72 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -195,7 +195,7 @@ static inline void native_apic_msr_write(u32 reg, u32 v) static inline void native_apic_msr_eoi_write(u32 reg, u32 v) { - wrmsr_notrace(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); + __wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); } static inline u32 native_apic_msr_read(u32 reg) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index eafee3161d1c..d45ab4b53396 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -100,7 +100,7 @@ #define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ @@ -288,6 +288,7 @@ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ @@ -320,5 +321,4 @@ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ - #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index ced283ac79df..af95c47d5c9e 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -59,6 +59,17 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) } #define div_u64_rem div_u64_rem +static inline u64 mul_u32_u32(u32 a, u32 b) +{ + u32 high, low; + + asm ("mull %[b]" : "=a" (low), "=d" (high) + : [a] "a" (a), [b] "rm" (b) ); + + return low | ((u64)high) << 32; +} +#define mul_u32_u32 mul_u32_u32 + #else # include <asm-generic/div64.h> #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index ec23d8e1297c..67313f3a9874 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -30,8 +30,6 @@ extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype); extern void update_e820(void); extern void e820_setup_gap(void); -extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, - unsigned long start_addr, unsigned long long end_addr); struct setup_data; extern void parse_e820_ext(u64 phys_addr, u32 data_len); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d4a684997497..255645f60ca2 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -87,6 +87,16 @@ extern void fpstate_init_soft(struct swregs_state *soft); #else static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif + +static inline void fpstate_init_xstate(struct xregs_state *xsave) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask; +} + static inline void fpstate_init_fxstate(struct fxregs_state *fx) { fx->cwd = 0x37f; diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 49da9f497b90..fe04491130ae 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -27,7 +27,6 @@ extern void intel_mid_pwr_power_off(void); extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev); extern int get_gpio_by_name(const char *name); -extern void intel_scu_device_register(struct platform_device *pdev); extern int __init sfi_parse_mrtc(struct sfi_table_header *table); extern int __init sfi_parse_mtmr(struct sfi_table_header *table); extern int sfi_mrtc_num; @@ -42,10 +41,8 @@ struct devs_id { char name[SFI_NAME_LEN + 1]; u8 type; u8 delay; + u8 msic; void *(*get_platform_data)(void *info); - /* Custom handler for devices */ - void (*device_handler)(struct sfi_device_table_entry *pentry, - struct devs_id *dev); }; #define sfi_device(i) \ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d34bd370074b..7afb0e2f07f4 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -164,6 +164,17 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) #define virt_to_bus virt_to_phys #define bus_to_virt phys_to_virt +/* + * The default ioremap() behavior is non-cached; if you need something + * else, you probably want one of the following. + */ +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); +#define ioremap_uc ioremap_uc + +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); + /** * ioremap - map bus memory into CPU space * @offset: bus address of the memory @@ -178,17 +189,6 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * If the area you are trying to map is a PCI BAR you should have a * look at pci_iomap(). */ -extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); -extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); -#define ioremap_uc ioremap_uc - -extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); -extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, - unsigned long prot_val); - -/* - * The default ioremap() behavior is non-cached: - */ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) { return ioremap_nocache(offset, size); @@ -207,18 +207,42 @@ extern void set_iounmap_nonlazy(void); */ #define xlate_dev_kmem_ptr(p) p +/** + * memset_io Set a range of I/O memory to a constant value + * @addr: The beginning of the I/O-memory range to set + * @val: The value to set the memory to + * @count: The number of bytes to set + * + * Set a range of I/O memory to a given value. + */ static inline void memset_io(volatile void __iomem *addr, unsigned char val, size_t count) { memset((void __force *)addr, val, count); } +/** + * memcpy_fromio Copy a block of data from I/O memory + * @dst: The (RAM) destination for the copy + * @src: The (I/O memory) source for the data + * @count: The number of bytes to copy + * + * Copy a block of data from I/O memory. + */ static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) { memcpy(dst, (const void __force *)src, count); } +/** + * memcpy_toio Copy a block of data into I/O memory + * @dst: The (I/O memory) destination for the copy + * @src: The (RAM) source for the data + * @count: The number of bytes to copy + * + * Copy a block of data to I/O memory. + */ static inline void memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) { diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5132f2a6c0a2..e63873683d4a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -97,10 +97,6 @@ #define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ -/* Software defined banks */ -#define MCE_EXTENDED_BANK 128 -#define MCE_THERMAL_BANK (MCE_EXTENDED_BANK + 0) - #define MCE_LOG_LEN 32 #define MCE_LOG_SIGNATURE "MACHINECHECK" @@ -193,6 +189,15 @@ extern struct mce_vendor_flags mce_flags; extern struct mca_config mca_cfg; extern struct mca_msr_regs msr_ops; + +enum mce_notifier_prios { + MCE_PRIO_SRAO = INT_MAX, + MCE_PRIO_EXTLOG = INT_MAX - 1, + MCE_PRIO_NFIT = INT_MAX - 2, + MCE_PRIO_EDAC = INT_MAX - 3, + MCE_PRIO_LOWEST = 0, +}; + extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -306,8 +311,6 @@ extern void (*deferred_error_int_vector)(void); void intel_init_thermal(struct cpuinfo_x86 *c); -void mce_log_therm_throt_event(__u64 status); - /* Interrupt Handler for core thermal thresholds */ extern int (*platform_thermal_notify)(__u64 msr_val); @@ -362,12 +365,13 @@ struct smca_hwid { unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ u32 hwid_mcatype; /* (hwid,mcatype) tuple */ u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */ + u8 count; /* Number of instances. */ }; struct smca_bank { struct smca_hwid *hwid; - /* Instance ID */ - u32 id; + u32 id; /* Value of MCA_IPID[InstanceId]. */ + u8 sysfs_id; /* Value used for sysfs name. */ }; extern struct smca_bank smca_banks[MAX_NR_BANKS]; diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 2266f864b747..daadeeea00b1 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -7,18 +7,17 @@ #define native_rdmsr(msr, val1, val2) \ do { \ - u64 __val = native_read_msr((msr)); \ + u64 __val = __rdmsr((msr)); \ (void)((val1) = (u32)__val); \ (void)((val2) = (u32)(__val >> 32)); \ } while (0) #define native_wrmsr(msr, low, high) \ - native_write_msr(msr, low, high) + __wrmsr(msr, low, high) #define native_wrmsrl(msr, val) \ - native_write_msr((msr), \ - (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)) + __wrmsr((msr), (u32)((u64)(val)), \ + (u32)((u64)(val) >> 32)) struct ucode_patch { struct list_head plist; diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 3e3e20be829a..3d57009e168b 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -54,6 +54,4 @@ static inline int __init save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } void reload_ucode_amd(void) {} #endif - -extern bool check_current_patch_level(u32 *rev, bool early); #endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index db0b90c3b03e..898dba2e2e2c 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -80,7 +80,14 @@ static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {} static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} #endif -static inline unsigned long long native_read_msr(unsigned int msr) +/* + * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR + * accessors and should not have any tracing or other functionality piggybacking + * on them - those are *purely* for accessing MSRs and nothing more. So don't even + * think of extending them - you will be slapped with a stinking trout or a frozen + * shark will reach you, wherever you are! You've been warned. + */ +static inline unsigned long long notrace __rdmsr(unsigned int msr) { DECLARE_ARGS(val, low, high); @@ -88,11 +95,30 @@ static inline unsigned long long native_read_msr(unsigned int msr) "2:\n" _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) : EAX_EDX_RET(val, low, high) : "c" (msr)); - if (msr_tracepoint_active(__tracepoint_read_msr)) - do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0); + return EAX_EDX_VAL(val, low, high); } +static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high) +{ + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + : : "c" (msr), "a"(low), "d" (high) : "memory"); +} + +static inline unsigned long long native_read_msr(unsigned int msr) +{ + unsigned long long val; + + val = __rdmsr(msr); + + if (msr_tracepoint_active(__tracepoint_read_msr)) + do_trace_read_msr(msr, val, 0); + + return val; +} + static inline unsigned long long native_read_msr_safe(unsigned int msr, int *err) { @@ -116,29 +142,14 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, /* Can be uninlined because referenced by paravirt */ static inline void notrace -__native_write_msr_notrace(unsigned int msr, u32 low, u32 high) -{ - asm volatile("1: wrmsr\n" - "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) - : : "c" (msr), "a"(low), "d" (high) : "memory"); -} - -/* Can be uninlined because referenced by paravirt */ -static inline void notrace native_write_msr(unsigned int msr, u32 low, u32 high) { - __native_write_msr_notrace(msr, low, high); + __wrmsr(msr, low, high); + if (msr_tracepoint_active(__tracepoint_write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } -static inline void -wrmsr_notrace(unsigned int msr, u32 low, u32 high) -{ - __native_write_msr_notrace(msr, low, high); -} - /* Can be uninlined because referenced by paravirt */ static inline int notrace native_write_msr_safe(unsigned int msr, u32 low, u32 high) diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index b6c0b404898a..fbc73360aea0 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -27,6 +27,7 @@ struct vm_area_struct; extern pgd_t swapper_pg_dir[1024]; extern pgd_t initial_page_table[1024]; +extern pmd_t initial_pg_pmd[]; static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } @@ -75,4 +76,35 @@ do { \ #define kern_addr_valid(kaddr) (0) #endif +/* + * This is how much memory in addition to the memory covered up to + * and including _end we need mapped initially. + * We need: + * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) + * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) + * + * Modulo rounding, each megabyte assigned here requires a kilobyte of + * memory, which is currently unreclaimed. + * + * This should be a multiple of a page. + * + * KERNEL_IMAGE_SIZE should be greater than pa(_end) + * and small than max_low_pfn, otherwise will waste some page table entries + */ +#if PTRS_PER_PMD > 1 +#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) +#else +#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) +#endif + +/* + * Number of possible pages in the lowmem region. + * + * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a + * gas warning about overflowing shift count when gas has been compiled + * with only a host target support using a 32-bit type for internal + * representation. + */ +#define LOWMEM_PAGES ((((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)) + #endif /* _ASM_X86_PGTABLE_32_H */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 921bea7a2708..6d391909e864 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -23,9 +23,6 @@ /* How long a lock should spin before we consider blocking */ #define SPIN_THRESHOLD (1 << 15) -extern struct static_key paravirt_ticketlocks_enabled; -static __always_inline bool static_key_false(struct static_key *key); - #include <asm/qspinlock.h> /* diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 5b7e43eff139..04eb4c1c0a90 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -529,18 +529,19 @@ static void lapic_timer_broadcast(const struct cpumask *mask) * The local apic timer can be used for any function which is CPU local. */ static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | - CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP - | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_state_shutdown = lapic_timer_shutdown, - .set_state_periodic = lapic_timer_set_periodic, - .set_state_oneshot = lapic_timer_set_oneshot, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP + | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_state_shutdown = lapic_timer_shutdown, + .set_state_periodic = lapic_timer_set_periodic, + .set_state_oneshot = lapic_timer_set_oneshot, + .set_state_oneshot_stopped = lapic_timer_shutdown, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); @@ -1864,14 +1865,14 @@ static void __smp_spurious_interrupt(u8 vector) "should never happen.\n", vector, smp_processor_id()); } -__visible void smp_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { entering_irq(); __smp_spurious_interrupt(~regs->orig_ax); exiting_irq(); } -__visible void smp_trace_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) { u8 vector = ~regs->orig_ax; @@ -1922,14 +1923,14 @@ static void __smp_error_interrupt(struct pt_regs *regs) } -__visible void smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { entering_irq(); __smp_error_interrupt(regs); exiting_irq(); } -__visible void smp_trace_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) { entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); @@ -2028,8 +2029,8 @@ void disconnect_bsp_APIC(int virt_wire_setup) /* * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated * contiguously, it equals to current allocated max logical CPU ID plus 1. - * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of - * nr_logical_cpuids is nr_cpu_ids. + * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range, + * so the maximum of nr_logical_cpuids is nr_cpu_ids. * * NOTE: Reserve 0 for BSP. */ @@ -2094,7 +2095,7 @@ int __generic_processor_info(int apicid, int version, bool enabled) * Since fixing handling of boot_cpu_physical_apicid requires * another discussion and tests on each platform, we leave it * for now and here we use read_apic_id() directly in this - * function, generic_processor_info(). + * function, __generic_processor_info(). */ if (disabled_cpu_apicid != BAD_APICID && disabled_cpu_apicid != read_apic_id() && diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 52f352b063fd..347bb9f65737 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1107,12 +1107,12 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) ioapic = mp_find_ioapic(gsi); if (ioapic < 0) - return -1; + return -ENODEV; pin = mp_find_ioapic_pin(ioapic, gsi); idx = find_irq_entry(ioapic, pin, mp_INT); if ((flags & IOAPIC_MAP_CHECK) && idx < 0) - return -1; + return -ENODEV; return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 5d30c5e42bb1..f3557a1eb562 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -559,7 +559,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) __send_cleanup_vector(data); } -asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) +asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 35690a168cf7..97ea712fc72f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -56,6 +56,7 @@ static struct { unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */ unsigned int pnode_mask; unsigned int gpa_shift; + unsigned int gnode_shift; } uv_cpuid; int uv_min_hub_revision_id; @@ -133,6 +134,7 @@ static int __init early_get_pnodeid(void) break; case UV4_HUB_PART_NUMBER: uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; + uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ break; } @@ -1074,8 +1076,10 @@ void __init uv_init_hub_info(struct uv_hub_info_s *hub_info) (1UL << uv_cpuid.gpa_shift) - 1; node_id.v = uv_read_local_mmr(UVH_NODE_ID); + uv_cpuid.gnode_shift = max_t(unsigned int, + uv_cpuid.gnode_shift, mn.n_val); hub_info->gnode_extra = - (node_id.s.node_id & ~((1 << mn.n_val) - 1)) >> 1; + (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; hub_info->gnode_upper = ((unsigned long)hub_info->gnode_extra << mn.m_val); @@ -1172,19 +1176,25 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) index, _min_socket, _max_socket, _min_pnode, _max_pnode); } -static void __init decode_uv_systab(void) +static int __init decode_uv_systab(void) { struct uv_systab *st; int i; + if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE) + return 0; /* No extended UVsystab required */ + st = uv_systab; - if ((!st || st->revision < UV_SYSTAB_VERSION_UV4) && !is_uv4_hub()) - return; - if (st->revision != UV_SYSTAB_VERSION_UV4_LATEST) { - pr_crit( + if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) { + int rev = st ? st->revision : 0; + + pr_err( "UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", - st->revision, UV_SYSTAB_VERSION_UV4_LATEST); - BUG(); + rev, UV_SYSTAB_VERSION_UV4_LATEST); + pr_err( + "UV: Cannot support UV operations, switching to generic PC\n"); + uv_system_type = UV_NONE; + return -EINVAL; } for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { @@ -1205,6 +1215,7 @@ static void __init decode_uv_systab(void) break; } } + return 0; } /* @@ -1373,7 +1384,8 @@ void __init uv_system_init(void) map_low_mmrs(); uv_bios_init(); /* get uv_systab for decoding */ - decode_uv_systab(); + if (decode_uv_systab() < 0) + return; /* UVsystab problem, abort UV init */ build_socket_tables(); build_uv_gr_table(); uv_init_hub_info(&hub_info); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1d3167269a67..80e657e89eed 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -541,8 +541,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 1661d8ec9280..2c234a6d94c4 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,5 +1,5 @@ -#include <linux/bitops.h> -#include <linux/kernel.h> + +#include <linux/sched.h> #include <asm/cpufeature.h> #include <asm/e820.h> @@ -104,6 +104,8 @@ static void early_init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif + + clear_sched_clock_stable(); } static void init_centaur(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9bab7a8a4293..f74e84ea8557 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -83,6 +83,7 @@ static void default_init(struct cpuinfo_x86 *c) strcpy(c->x86_model_id, "386"); } #endif + clear_sched_clock_stable(); } static const struct cpu_dev default_cpu = { @@ -655,6 +656,16 @@ void cpu_detect(struct cpuinfo_x86 *c) } } +static void apply_forced_caps(struct cpuinfo_x86 *c) +{ + int i; + + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -748,6 +759,13 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); init_scattered_cpuid_features(c); + + /* + * Clear/Set all flags overridden by options, after probe. + * This needs to happen each time we re-probe, which may happen + * several times during CPU initialization. + */ + apply_forced_caps(c); } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -801,14 +819,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) memset(&c->x86_capability, 0, sizeof c->x86_capability); c->extended_cpuid_level = 0; - if (!have_cpuid_p()) - identify_cpu_without_cpuid(c); - /* cyrix could have cpuid enabled via c_identify()*/ if (have_cpuid_p()) { cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); + setup_force_cpu_cap(X86_FEATURE_CPUID); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -818,6 +834,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); + } else { + identify_cpu_without_cpuid(c); + setup_clear_cpu_cap(X86_FEATURE_CPUID); } setup_force_cpu_cap(X86_FEATURE_ALWAYS); @@ -1034,10 +1053,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_identify(c); /* Clear/Set all flags overridden by options, after probe */ - for (i = 0; i < NCAPINTS; i++) { - c->x86_capability[i] &= ~cpu_caps_cleared[i]; - c->x86_capability[i] |= cpu_caps_set[i]; - } + apply_forced_caps(c); #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); @@ -1055,6 +1071,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) */ if (this_cpu->c_init) this_cpu->c_init(c); + else + clear_sched_clock_stable(); /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); @@ -1096,10 +1114,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) * Clear/Set all flags overridden by options, need do it * before following smp all cpus cap AND. */ - for (i = 0; i < NCAPINTS; i++) { - c->x86_capability[i] &= ~cpu_caps_cleared[i]; - c->x86_capability[i] |= cpu_caps_set[i]; - } + apply_forced_caps(c); /* * On SMP, boot_cpu_data holds the common feature set between diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index bd9dcd6b712d..47416f959a48 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -9,6 +9,7 @@ #include <asm/pci-direct.h> #include <asm/tsc.h> #include <asm/cpufeature.h> +#include <linux/sched.h> #include "cpu.h" @@ -183,6 +184,7 @@ static void early_init_cyrix(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); break; } + clear_sched_clock_stable(); } static void init_cyrix(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 203f860d2ab3..026c728d6ba7 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -119,8 +119,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 83f1a98d37db..2eee85379689 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -52,8 +52,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; - if (severity >= GHES_SEV_PANIC) + + if (severity >= GHES_SEV_PANIC) { m.status |= MCI_STATUS_PCC; + m.tsc = rdtsc(); + } m.addr = mem_err->physical_addr; mce_log(&m); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 93d824ec3120..1e5a50c11d3c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -72,7 +72,7 @@ struct llist_node *mce_gen_pool_prepare_records(void) return new_head.first; } -void mce_gen_pool_process(void) +void mce_gen_pool_process(struct work_struct *__unused) { struct llist_node *head; struct mce_evt_llist *node, *tmp; diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 517619ea6498..99165b206df3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -152,7 +152,6 @@ static void raise_mce(struct mce *m) if (context == MCJ_CTX_RANDOM) return; -#ifdef CONFIG_X86_LOCAL_APIC if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; @@ -192,9 +191,7 @@ static void raise_mce(struct mce *m) raise_local(); put_cpu(); put_online_cpus(); - } else -#endif - { + } else { preempt_disable(); raise_local(); preempt_enable(); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index cd74a3f00aea..903043e6a62b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -31,7 +31,7 @@ struct mce_evt_llist { struct mce mce; }; -void mce_gen_pool_process(void); +void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 537c6647d84c..8e9725c607ea 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -128,7 +128,6 @@ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - m->tsc = rdtsc(); /* We hope get_seconds stays lockless */ m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; @@ -217,9 +216,7 @@ void mce_register_decode_chain(struct notifier_block *nb) { atomic_inc(&num_notifiers); - /* Ensure SRAO notifier has the highest priority in the decode chain. */ - if (nb != &mce_srao_nb && nb->priority == INT_MAX) - nb->priority -= 1; + WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC); atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); } @@ -583,7 +580,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, } static struct notifier_block mce_srao_nb = { .notifier_call = srao_decode_notifier, - .priority = INT_MAX, + .priority = MCE_PRIO_SRAO, }; static int mce_default_notifier(struct notifier_block *nb, unsigned long val, @@ -609,7 +606,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, static struct notifier_block mce_default_nb = { .notifier_call = mce_default_notifier, /* lowest prio, we want it to run last. */ - .priority = 0, + .priority = MCE_PRIO_LOWEST, }; /* @@ -710,14 +707,8 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_gather_info(&m, NULL); - /* - * m.tsc was set in mce_setup(). Clear it if not requested. - * - * FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid - * that dance. - */ - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; + if (flags & MCP_TIMESTAMP) + m.tsc = rdtsc(); for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) @@ -1156,6 +1147,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) goto out; mce_gather_info(&m, regs); + m.tsc = rdtsc(); final = this_cpu_ptr(&mces_seen); *final = m; @@ -1322,41 +1314,6 @@ int memory_failure(unsigned long pfn, int vector, int flags) #endif /* - * Action optional processing happens here (picking up - * from the list of faulting pages that do_machine_check() - * placed into the genpool). - */ -static void mce_process_work(struct work_struct *dummy) -{ - mce_gen_pool_process(); -} - -#ifdef CONFIG_X86_MCE_INTEL -/*** - * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occurred. - * @status: Event status information - * - * This function should be called by the thermal interrupt after the - * event has been processed and the decision was made to log the event - * further. - * - * The status parameter will be saved to the 'status' field of 'struct mce' - * and historically has been the register value of the - * MSR_IA32_THERMAL_STATUS (Intel) msr. - */ -void mce_log_therm_throt_event(__u64 status) -{ - struct mce m; - - mce_setup(&m); - m.bank = MCE_THERMAL_BANK; - m.status = status; - mce_log(&m); -} -#endif /* CONFIG_X86_MCE_INTEL */ - -/* * Periodic polling timer for "silent" machine check errors. If the * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). @@ -2189,7 +2146,7 @@ int __init mcheck_init(void) mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); - INIT_WORK(&mce_work, mce_process_work); + INIT_WORK(&mce_work, mce_gen_pool_process); init_irq_work(&mce_irq_work, mce_irq_work_cb); return 0; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a5fd137417a2..524cc5780a77 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -192,6 +192,7 @@ static void get_smca_bank_info(unsigned int bank) smca_banks[bank].hwid = s_hwid; smca_banks[bank].id = instance_id; + smca_banks[bank].sysfs_id = s_hwid->count++; break; } } @@ -777,7 +778,8 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) mce_setup(&m); m.status = status; - m.bank = bank; + m.bank = bank; + m.tsc = rdtsc(); if (threshold_err) m.misc = misc; @@ -814,14 +816,14 @@ static inline void __smp_deferred_error_interrupt(void) deferred_error_int_vector(); } -asmlinkage __visible void smp_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) { entering_irq(); __smp_deferred_error_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) { entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); @@ -1064,9 +1066,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return NULL; } + if (smca_banks[bank].hwid->count == 1) + return smca_get_name(bank_type); + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "%s_%x", smca_get_name(bank_type), - smca_banks[bank].id); + smca_banks[bank].sysfs_id); return buf_mcatype; } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 465aca8be009..d7cc190ae457 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -6,7 +6,7 @@ * * Maintains a counter in /sys that keeps track of the number of thermal * events, such that the user knows how bad the thermal problem might be - * (since the logging to syslog and mcelog is rate limited). + * (since the logging to syslog is rate limited). * * Author: Dmitriy Zavin (dmitriyz@google.com) * @@ -141,13 +141,8 @@ static struct attribute_group thermal_attr_group = { * IRQ has been acknowledged. * * It will take care of rate limiting and printing messages to the syslog. - * - * Returns: 0 : Event should NOT be further logged, i.e. still in - * "timeout" from previous log message. - * 1 : Event should be logged further, and a message has been - * printed to the syslog. */ -static int therm_throt_process(bool new_event, int event, int level) +static void therm_throt_process(bool new_event, int event, int level) { struct _thermal_state *state; unsigned int this_cpu = smp_processor_id(); @@ -162,16 +157,16 @@ static int therm_throt_process(bool new_event, int event, int level) else if (event == POWER_LIMIT_EVENT) state = &pstate->core_power_limit; else - return 0; + return; } else if (level == PACKAGE_LEVEL) { if (event == THERMAL_THROTTLING_EVENT) state = &pstate->package_throttle; else if (event == POWER_LIMIT_EVENT) state = &pstate->package_power_limit; else - return 0; + return; } else - return 0; + return; old_event = state->new_event; state->new_event = new_event; @@ -181,7 +176,7 @@ static int therm_throt_process(bool new_event, int event, int level) if (time_before64(now, state->next_check) && state->count != state->last_count) - return 0; + return; state->next_check = now + CHECK_INTERVAL; state->last_count = state->count; @@ -193,16 +188,14 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - return 1; + return; } if (old_event) { if (event == THERMAL_THROTTLING_EVENT) pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); - return 1; + return; } - - return 0; } static int thresh_event_valid(int level, int event) @@ -365,10 +358,9 @@ static void intel_thermal_interrupt(void) /* Check for violation of core thermal thresholds*/ notify_thresholds(msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - CORE_LEVEL) != 0) - mce_log_therm_throt_event(msr_val); + therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + CORE_LEVEL); if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, @@ -404,14 +396,16 @@ static inline void __smp_thermal_interrupt(void) smp_thermal_vector(); } -asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); __smp_thermal_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 9beb092d68a5..bb0e75eed10a 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -23,14 +23,14 @@ static inline void __smp_threshold_interrupt(void) mce_threshold_vector(); } -asmlinkage __visible void smp_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) { entering_irq(); __smp_threshold_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 079e81733a58..73082365ed1c 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -42,16 +42,19 @@ static struct equiv_cpu_entry *equiv_cpu_table; /* * This points to the current valid container of microcode patches which we will - * save from the initrd/builtin before jettisoning its contents. + * save from the initrd/builtin before jettisoning its contents. @mc is the + * microcode patch we found to match. */ -struct container { - u8 *data; - size_t size; -} cont; +struct cont_desc { + struct microcode_amd *mc; + u32 cpuid_1_eax; + u32 psize; + u8 *data; + size_t size; +}; static u32 ucode_new_rev; static u8 amd_ucode_patch[PATCH_MAX_SIZE]; -static u16 this_equiv_id; /* * Microcode patch container file is prepended to the initrd in cpio @@ -60,57 +63,13 @@ static u16 this_equiv_id; static const char ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; -static size_t compute_container_size(u8 *data, u32 total_size) +static u16 find_equiv_id(struct equiv_cpu_entry *equiv_table, u32 sig) { - size_t size = 0; - u32 *header = (u32 *)data; - - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return size; - - size = header[2] + CONTAINER_HDR_SZ; - total_size -= size; - data += size; - - while (total_size) { - u16 patch_size; - - header = (u32 *)data; - - if (header[0] != UCODE_UCODE_TYPE) - break; - - /* - * Sanity-check patch size. - */ - patch_size = header[1]; - if (patch_size > PATCH_MAX_SIZE) - break; - - size += patch_size + SECTION_HDR_SIZE; - data += patch_size + SECTION_HDR_SIZE; - total_size -= patch_size + SECTION_HDR_SIZE; + for (; equiv_table && equiv_table->installed_cpu; equiv_table++) { + if (sig == equiv_table->installed_cpu) + return equiv_table->equiv_cpu; } - return size; -} - -static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, - unsigned int sig) -{ - int i = 0; - - if (!equiv_cpu_table) - return 0; - - while (equiv_cpu_table[i].installed_cpu != 0) { - if (sig == equiv_cpu_table[i].installed_cpu) - return equiv_cpu_table[i].equiv_cpu; - - i++; - } return 0; } @@ -118,91 +77,109 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, * This scans the ucode blob for the proper container as we can have multiple * containers glued together. Returns the equivalence ID from the equivalence * table or 0 if none found. + * Returns the amount of bytes consumed while scanning. @desc contains all the + * data we're going to use in later stages of the application. */ -static u16 -find_proper_container(u8 *ucode, size_t size, struct container *ret_cont) +static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc) { - struct container ret = { NULL, 0 }; - u32 eax, ebx, ecx, edx; struct equiv_cpu_entry *eq; - int offset, left; - u16 eq_id = 0; - u32 *header; - u8 *data; + ssize_t orig_size = size; + u32 *hdr = (u32 *)ucode; + u16 eq_id; + u8 *buf; - data = ucode; - left = size; - header = (u32 *)data; + /* Am I looking at an equivalence table header? */ + if (hdr[0] != UCODE_MAGIC || + hdr[1] != UCODE_EQUIV_CPU_TABLE_TYPE || + hdr[2] == 0) + return CONTAINER_HDR_SZ; + buf = ucode; - /* find equiv cpu table */ - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return eq_id; + eq = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ); - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); + /* Find the equivalence ID of our CPU in this table: */ + eq_id = find_equiv_id(eq, desc->cpuid_1_eax); - while (left > 0) { - eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); + buf += hdr[2] + CONTAINER_HDR_SZ; + size -= hdr[2] + CONTAINER_HDR_SZ; + + /* + * Scan through the rest of the container to find where it ends. We do + * some basic sanity-checking too. + */ + while (size > 0) { + struct microcode_amd *mc; + u32 patch_size; - ret.data = data; + hdr = (u32 *)buf; - /* Advance past the container header */ - offset = header[2] + CONTAINER_HDR_SZ; - data += offset; - left -= offset; + if (hdr[0] != UCODE_UCODE_TYPE) + break; - eq_id = find_equiv_id(eq, eax); - if (eq_id) { - ret.size = compute_container_size(ret.data, left + offset); + /* Sanity-check patch size. */ + patch_size = hdr[1]; + if (patch_size > PATCH_MAX_SIZE) + break; - /* - * truncate how much we need to iterate over in the - * ucode update loop below - */ - left = ret.size - offset; + /* Skip patch section header: */ + buf += SECTION_HDR_SIZE; + size -= SECTION_HDR_SIZE; - *ret_cont = ret; - return eq_id; + mc = (struct microcode_amd *)buf; + if (eq_id == mc->hdr.processor_rev_id) { + desc->psize = patch_size; + desc->mc = mc; } - /* - * support multiple container files appended together. if this - * one does not have a matching equivalent cpu entry, we fast - * forward to the next container file. - */ - while (left > 0) { - header = (u32 *)data; - - if (header[0] == UCODE_MAGIC && - header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) - break; - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } + buf += patch_size; + size -= patch_size; + } - /* mark where the next microcode container file starts */ - offset = data - (u8 *)ucode; - ucode = data; + /* + * If we have found a patch (desc->mc), it means we're looking at the + * container which has a patch for this CPU so return 0 to mean, @ucode + * already points to the proper container. Otherwise, we return the size + * we scanned so that we can advance to the next container in the + * buffer. + */ + if (desc->mc) { + desc->data = ucode; + desc->size = orig_size - size; + + return 0; } - return eq_id; + return orig_size - size; } -static int __apply_microcode_amd(struct microcode_amd *mc_amd) +/* + * Scan the ucode blob for the proper container as we can have multiple + * containers glued together. + */ +static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc) +{ + ssize_t rem = size; + + while (rem >= 0) { + ssize_t s = parse_container(ucode, rem, desc); + if (!s) + return; + + ucode += s; + rem -= s; + } +} + +static int __apply_microcode_amd(struct microcode_amd *mc) { u32 rev, dummy; - native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code); /* verify patch application was successful */ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev != mc_amd->hdr.patch_id) + if (rev != mc->hdr.patch_id) return -1; return 0; @@ -217,17 +194,16 @@ static int __apply_microcode_amd(struct microcode_amd *mc_amd) * load_microcode_amd() to save equivalent cpu table and microcode patches in * kernel heap memory. * - * Returns true if container found (sets @ret_cont), false otherwise. + * Returns true if container found (sets @desc), false otherwise. */ -static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch, - struct container *ret_cont) +static bool +apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) { + struct cont_desc desc = { 0 }; u8 (*patch)[PATCH_MAX_SIZE]; - u32 rev, *header, *new_rev; - struct container ret; - int offset, left; - u16 eq_id = 0; - u8 *data; + struct microcode_amd *mc; + u32 rev, dummy, *new_rev; + bool ret = false; #ifdef CONFIG_X86_32 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); @@ -237,50 +213,27 @@ static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch, patch = &amd_ucode_patch; #endif - if (check_current_patch_level(&rev, true)) - return false; - - eq_id = find_proper_container(ucode, size, &ret); - if (!eq_id) - return false; - - this_equiv_id = eq_id; - header = (u32 *)ret.data; - - /* We're pointing to an equiv table, skip over it. */ - data = ret.data + header[2] + CONTAINER_HDR_SZ; - left = ret.size - (header[2] + CONTAINER_HDR_SZ); - - while (left > 0) { - struct microcode_amd *mc; - - header = (u32 *)data; - if (header[0] != UCODE_UCODE_TYPE || /* type */ - header[1] == 0) /* size */ - break; + desc.cpuid_1_eax = cpuid_1_eax; - mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); + scan_containers(ucode, size, &desc); - if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + mc = desc.mc; + if (!mc) + return ret; - if (!__apply_microcode_amd(mc)) { - rev = mc->hdr.patch_id; - *new_rev = rev; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev >= mc->hdr.patch_id) + return ret; - if (save_patch) - memcpy(patch, mc, min_t(u32, header[1], PATCH_MAX_SIZE)); - } - } + if (!__apply_microcode_amd(mc)) { + *new_rev = mc->hdr.patch_id; + ret = true; - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; + if (save_patch) + memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE)); } - if (ret_cont) - *ret_cont = ret; - - return true; + return ret; } static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) @@ -298,10 +251,9 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) #endif } -void __init load_ucode_amd_bsp(unsigned int family) +void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) { struct ucode_cpu_info *uci; - u32 eax, ebx, ecx, edx; struct cpio_data cp; const char *path; bool use_pa; @@ -316,184 +268,95 @@ void __init load_ucode_amd_bsp(unsigned int family) use_pa = false; } - if (!get_builtin_microcode(&cp, family)) + if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)) && !initrd_gone) cp = find_microcode_in_initrd(path, use_pa); - if (!(cp.data && cp.size)) - return; - - /* Get BSP's CPUID.EAX(1), needed in load_microcode_amd() */ - eax = 1; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - uci->cpu_sig.sig = eax; + /* Needed in load_microcode_amd() */ + uci->cpu_sig.sig = cpuid_1_eax; - apply_microcode_early_amd(cp.data, cp.size, true, NULL); + *ret = cp; } -#ifdef CONFIG_X86_32 -/* - * On 32-bit, since AP's early load occurs before paging is turned on, we - * cannot traverse cpu_equiv_table and microcode_cache in kernel heap memory. - * So during cold boot, AP will apply_ucode_in_initrd() just like the BSP. - * In save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, - * which is used upon resume from suspend. - */ -void load_ucode_amd_ap(unsigned int family) +void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) { - struct microcode_amd *mc; - struct cpio_data cp; - - mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); - if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { - __apply_microcode_amd(mc); - return; - } - - if (!get_builtin_microcode(&cp, family)) - cp = find_microcode_in_initrd((const char *)__pa_nodebug(ucode_path), true); + struct cpio_data cp = { }; + __load_ucode_amd(cpuid_1_eax, &cp); if (!(cp.data && cp.size)) return; - /* - * This would set amd_ucode_patch above so that the following APs can - * use it directly instead of going down this path again. - */ - apply_microcode_early_amd(cp.data, cp.size, true, NULL); + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true); } -#else -void load_ucode_amd_ap(unsigned int family) + +void load_ucode_amd_ap(unsigned int cpuid_1_eax) { - struct equiv_cpu_entry *eq; struct microcode_amd *mc; - u32 rev, eax; - u16 eq_id; - - /* 64-bit runs with paging enabled, thus early==false. */ - if (check_current_patch_level(&rev, false)) - return; - - /* First AP hasn't cached it yet, go through the blob. */ - if (!cont.data) { - struct cpio_data cp = { NULL, 0, "" }; + struct cpio_data cp; + u32 *new_rev, rev, dummy; - if (cont.size == -1) - return; + if (IS_ENABLED(CONFIG_X86_32)) { + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + } else { + mc = (struct microcode_amd *)amd_ucode_patch; + new_rev = &ucode_new_rev; + } -reget: - if (!get_builtin_microcode(&cp, family)) { -#ifdef CONFIG_BLK_DEV_INITRD - if (!initrd_gone) - cp = find_cpio_data(ucode_path, (void *)initrd_start, - initrd_end - initrd_start, NULL); -#endif - if (!(cp.data && cp.size)) { - /* - * Mark it so that other APs do not scan again - * for no real reason and slow down boot - * needlessly. - */ - cont.size = -1; - return; - } - } + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (!apply_microcode_early_amd(cp.data, cp.size, false, &cont)) { - cont.size = -1; + /* Check whether we have saved a new patch already: */ + if (*new_rev && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + *new_rev = mc->hdr.patch_id; return; } } - eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(cont.data + CONTAINER_HDR_SZ); - - eq_id = find_equiv_id(eq, eax); - if (!eq_id) + __load_ucode_amd(cpuid_1_eax, &cp); + if (!(cp.data && cp.size)) return; - if (eq_id == this_equiv_id) { - mc = (struct microcode_amd *)amd_ucode_patch; - - if (mc && rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) - ucode_new_rev = mc->hdr.patch_id; - } - - } else { - - /* - * AP has a different equivalence ID than BSP, looks like - * mixed-steppings silicon so go through the ucode blob anew. - */ - goto reget; - } + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false); } -#endif /* CONFIG_X86_32 */ static enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); -int __init save_microcode_in_initrd_amd(unsigned int fam) +int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) { + struct cont_desc desc = { 0 }; enum ucode_state ret; - int retval = 0; - u16 eq_id; - - if (!cont.data) { - if (IS_ENABLED(CONFIG_X86_32) && (cont.size != -1)) { - struct cpio_data cp = { NULL, 0, "" }; - -#ifdef CONFIG_BLK_DEV_INITRD - cp = find_cpio_data(ucode_path, (void *)initrd_start, - initrd_end - initrd_start, NULL); -#endif + struct cpio_data cp; - if (!(cp.data && cp.size)) { - cont.size = -1; - return -EINVAL; - } + cp = find_microcode_in_initrd(ucode_path, false); + if (!(cp.data && cp.size)) + return -EINVAL; - eq_id = find_proper_container(cp.data, cp.size, &cont); - if (!eq_id) { - cont.size = -1; - return -EINVAL; - } + desc.cpuid_1_eax = cpuid_1_eax; - } else - return -EINVAL; - } + scan_containers(cp.data, cp.size, &desc); + if (!desc.mc) + return -EINVAL; - ret = load_microcode_amd(smp_processor_id(), fam, cont.data, cont.size); + ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax), + desc.data, desc.size); if (ret != UCODE_OK) - retval = -EINVAL; - - /* - * This will be freed any msec now, stash patches for the current - * family and switch to patch cache for cpu hotplug, etc later. - */ - cont.data = NULL; - cont.size = 0; + return -EINVAL; - return retval; + return 0; } void reload_ucode_amd(void) { struct microcode_amd *mc; - u32 rev; - - /* - * early==false because this is a syscore ->resume path and by - * that time paging is long enabled. - */ - if (check_current_patch_level(&rev, false)) - return; + u32 rev, dummy; mc = (struct microcode_amd *)amd_ucode_patch; if (!mc) return; + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; @@ -631,60 +494,13 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, return patch_size; } -/* - * Those patch levels cannot be updated to newer ones and thus should be final. - */ -static u32 final_levels[] = { - 0x01000098, - 0x0100009f, - 0x010000af, - 0, /* T-101 terminator */ -}; - -/* - * Check the current patch level on this CPU. - * - * @rev: Use it to return the patch level. It is set to 0 in the case of - * error. - * - * Returns: - * - true: if update should stop - * - false: otherwise - */ -bool check_current_patch_level(u32 *rev, bool early) -{ - u32 lvl, dummy, i; - bool ret = false; - u32 *levels; - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); - - if (IS_ENABLED(CONFIG_X86_32) && early) - levels = (u32 *)__pa_nodebug(&final_levels); - else - levels = final_levels; - - for (i = 0; levels[i]; i++) { - if (lvl == levels[i]) { - lvl = 0; - ret = true; - break; - } - } - - if (rev) - *rev = lvl; - - return ret; -} - static int apply_microcode_amd(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_amd *mc_amd; struct ucode_cpu_info *uci; struct ucode_patch *p; - u32 rev; + u32 rev, dummy; BUG_ON(raw_smp_processor_id() != cpu); @@ -697,8 +513,7 @@ static int apply_microcode_amd(int cpu) mc_amd = p->data; uci->mc = p->data; - if (check_current_patch_level(&rev, false)) - return -1; + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); /* need to apply patch? */ if (rev >= mc_amd->hdr.patch_id) { diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 73102d932760..e51eeaed8016 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -66,19 +66,50 @@ static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -/* - * Operations that are run on a target cpu: - */ - struct cpu_info_ctx { struct cpu_signature *cpu_sig; int err; }; +/* + * Those patch levels cannot be updated to newer ones and thus should be final. + */ +static u32 final_levels[] = { + 0x01000098, + 0x0100009f, + 0x010000af, + 0, /* T-101 terminator */ +}; + +/* + * Check the current patch level on this CPU. + * + * Returns: + * - true: if update should stop + * - false: otherwise + */ +static bool amd_check_current_patch_level(void) +{ + u32 lvl, dummy, i; + u32 *levels; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); + + if (IS_ENABLED(CONFIG_X86_32)) + levels = (u32 *)__pa_nodebug(&final_levels); + else + levels = final_levels; + + for (i = 0; levels[i]; i++) { + if (lvl == levels[i]) + return true; + } + return false; +} + static bool __init check_loader_disabled_bsp(void) { static const char *__dis_opt_str = "dis_ucode_ldr"; - u32 a, b, c, d; #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); @@ -94,18 +125,19 @@ static bool __init check_loader_disabled_bsp(void) if (!have_cpuid_p()) return *res; - a = 1; - c = 0; - native_cpuid(&a, &b, &c, &d); - /* * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not * completely accurate as xen pv guests don't see that CPUID bit set but * that's good enough as they don't land on the BSP path anyway. */ - if (c & BIT(31)) + if (native_cpuid_ecx(1) & BIT(31)) return *res; + if (x86_cpuid_vendor() == X86_VENDOR_AMD) { + if (amd_check_current_patch_level()) + return *res; + } + if (cmdline_find_option_bool(cmdline, option) <= 0) *res = false; @@ -133,23 +165,21 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) void __init load_ucode_bsp(void) { - int vendor; - unsigned int family; + unsigned int cpuid_1_eax; if (check_loader_disabled_bsp()) return; - vendor = x86_cpuid_vendor(); - family = x86_cpuid_family(); + cpuid_1_eax = native_cpuid_eax(1); - switch (vendor) { + switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: - if (family >= 6) + if (x86_family(cpuid_1_eax) >= 6) load_ucode_intel_bsp(); break; case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_bsp(family); + if (x86_family(cpuid_1_eax) >= 0x10) + load_ucode_amd_bsp(cpuid_1_eax); break; default: break; @@ -167,22 +197,21 @@ static bool check_loader_disabled_ap(void) void load_ucode_ap(void) { - int vendor, family; + unsigned int cpuid_1_eax; if (check_loader_disabled_ap()) return; - vendor = x86_cpuid_vendor(); - family = x86_cpuid_family(); + cpuid_1_eax = native_cpuid_eax(1); - switch (vendor) { + switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: - if (family >= 6) + if (x86_family(cpuid_1_eax) >= 6) load_ucode_intel_ap(); break; case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_ap(family); + if (x86_family(cpuid_1_eax) >= 0x10) + load_ucode_amd_ap(cpuid_1_eax); break; default: break; @@ -201,7 +230,7 @@ static int __init save_microcode_in_initrd(void) break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - ret = save_microcode_in_initrd_amd(c->x86); + ret = save_microcode_in_initrd_amd(cpuid_eax(1)); break; default: break; diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 34178564be2a..c1ea5b999839 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,4 +1,5 @@ #include <linux/kernel.h> +#include <linux/sched.h> #include <linux/mm.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -14,6 +15,8 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) if (xlvl >= 0x80860001) c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); } + + clear_sched_clock_stable(); } static void init_transmeta(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 90e8dde3ec26..b2bbad6ebe4d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -580,24 +580,19 @@ static void __init update_e820_saved(void) } #define MAX_GAP_END 0x100000000ull /* - * Search for a gap in the e820 memory space from start_addr to end_addr. + * Search for a gap in the e820 memory space from 0 to MAX_GAP_END. */ -__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, - unsigned long start_addr, unsigned long long end_addr) +static int __init e820_search_gap(unsigned long *gapstart, + unsigned long *gapsize) { - unsigned long long last; + unsigned long long last = MAX_GAP_END; int i = e820->nr_map; int found = 0; - last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END; - while (--i >= 0) { unsigned long long start = e820->map[i].addr; unsigned long long end = start + e820->map[i].size; - if (end < start_addr) - continue; - /* * Since "last" is at most 4GB, we know we'll * fit in 32 bits if this condition is true @@ -628,18 +623,19 @@ __init void e820_setup_gap(void) unsigned long gapstart, gapsize; int found; - gapstart = 0x10000000; gapsize = 0x400000; - found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END); + found = e820_search_gap(&gapstart, &gapsize); -#ifdef CONFIG_X86_64 if (!found) { +#ifdef CONFIG_X86_64 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; printk(KERN_ERR "e820: cannot find a gap in the 32bit address range\n" "e820: PCI devices with unassigned 32bit BARs may break!\n"); - } +#else + gapstart = 0x10000000; #endif + } /* * e820_reserve_resources_late protect stolen RAM already diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index de7234401275..e1114f070c2d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -9,7 +9,6 @@ #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> -#include <asm/fpu/xstate.h> #include <asm/traps.h> #include <linux/hardirq.h> @@ -179,14 +178,8 @@ void fpstate_init(union fpregs_state *state) memset(state, 0, fpu_kernel_xstate_size); - /* - * XRSTORS requires that this bit is set in xcomp_bv, or - * it will #GP. Make sure it is replaced after the memset(). - */ if (static_cpu_has(X86_FEATURE_XSAVES)) - state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask; - + fpstate_init_xstate(&state->xsave); if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 60dece392b3a..19bdd1bf8160 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -48,13 +48,7 @@ void fpu__init_cpu(void) fpu__init_cpu_xstate(); } -/* - * The earliest FPU detection code. - * - * Set the X86_FEATURE_FPU CPU-capability bit based on - * trying to execute an actual sequence of FPU instructions: - */ -static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +static bool fpu__probe_without_cpuid(void) { unsigned long cr0; u16 fsw, fcw; @@ -65,18 +59,25 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) cr0 &= ~(X86_CR0_TS | X86_CR0_EM); write_cr0(cr0); - if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw)); + + pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw); - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); + return fsw == 0 && (fcw & 0x103f) == 0x003f; +} + +static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +{ + if (!boot_cpu_has(X86_FEATURE_CPUID) && + !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { + if (fpu__probe_without_cpuid()) + setup_force_cpu_cap(X86_FEATURE_FPU); else - clear_cpu_cap(c, X86_FEATURE_FPU); + setup_clear_cpu_cap(X86_FEATURE_FPU); } #ifndef CONFIG_MATH_EMULATION - if (!boot_cpu_has(X86_FEATURE_FPU)) { + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) { pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); for (;;) asm volatile("hlt"); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1d7770447b3e..c24ac1efb12d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -78,6 +78,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_PKU); setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); + setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* @@ -705,8 +706,14 @@ void __init fpu__init_system_xstate(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; + if (!boot_cpu_has(X86_FEATURE_FPU)) { + pr_info("x86/fpu: No FPU detected\n"); + return; + } + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { - pr_info("x86/fpu: Legacy x87 FPU detected.\n"); + pr_info("x86/fpu: x87 FPU will use %s\n", + boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return; } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index f16c55bfc090..e5fb436a6548 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -49,3 +49,65 @@ asmlinkage __visible void __init i386_start_kernel(void) start_kernel(); } + +/* + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond __brk_base. The variable + * _brk_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) + * and PAGE_OFFSET for up to _end. + * + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. Note the upper half of each PMD or PTE are + * always zero at this stage. + */ +void __init mk_early_pgtbl_32(void) +{ +#ifdef __pa +#undef __pa +#endif +#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) + pte_t pte, *ptep; + int i; + unsigned long *ptr; + /* Enough space to fit pagetables for the low memory linear map */ + const unsigned long limit = __pa(_end) + + (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT); +#ifdef CONFIG_X86_PAE + pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd); +#define SET_PL2(pl2, val) { (pl2).pmd = (val); } +#else + pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table); +#define SET_PL2(pl2, val) { (pl2).pgd = (val); } +#endif + + ptep = (pte_t *)__pa(__brk_base); + pte.pte = PTE_IDENT_ATTR; + + while ((pte.pte & PTE_PFN_MASK) < limit) { + + SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR); + *pl2p = pl2; +#ifndef CONFIG_X86_PAE + /* Kernel PDE entry */ + *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2; +#endif + for (i = 0; i < PTRS_PER_PTE; i++) { + *ptep = pte; + pte.pte += PAGE_SIZE; + ptep++; + } + + pl2p++; + } + + ptr = (unsigned long *)__pa(&max_pfn_mapped); + /* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */ + *ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; + + ptr = (unsigned long *)__pa(&_brk_end); + *ptr = (unsigned long)ptep + PAGE_OFFSET; +} + diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 4e8577d03372..1f85ee8f9439 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -24,6 +24,7 @@ #include <asm/nops.h> #include <asm/bootparam.h> #include <asm/export.h> +#include <asm/pgtable_32.h> /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -41,44 +42,10 @@ #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id -/* - * This is how much memory in addition to the memory covered up to - * and including _end we need mapped initially. - * We need: - * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) - * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) - * - * Modulo rounding, each megabyte assigned here requires a kilobyte of - * memory, which is currently unreclaimed. - * - * This should be a multiple of a page. - * - * KERNEL_IMAGE_SIZE should be greater than pa(_end) - * and small than max_low_pfn, otherwise will waste some page table entries - */ - -#if PTRS_PER_PMD > 1 -#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) -#else -#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) -#endif #define SIZEOF_PTREGS 17*4 /* - * Number of possible pages in the lowmem region. - * - * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a - * gas warning about overflowing shift count when gas has been compiled - * with only a host target support using a 32-bit type for internal - * representation. - */ -LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT) - -/* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT - -/* * Worst-case size of the kernel mapping we need to make: * a relocatable kernel can live anywhere in lowmem, so we need to be able * to map all of lowmem. @@ -160,90 +127,15 @@ ENTRY(startup_32) call load_ucode_bsp #endif -/* - * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond __brk_base. The variable - * _brk_end is set up to point to the first "safe" location. - * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end. - */ -#ifdef CONFIG_X86_PAE - - /* - * In PAE mode initial_page_table is statically defined to contain - * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 - * entries). The identity mapping is handled by pointing two PGD entries - * to the first kernel PMD. - * - * Note the upper half of each PMD or PTE are always zero at this stage. - */ - -#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ - - xorl %ebx,%ebx /* %ebx is kept at zero */ - - movl $pa(__brk_base), %edi - movl $pa(initial_pg_pmd), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ - movl %ecx,(%edx) /* Store PMD entry */ - /* Upper half already zero */ - addl $8,%edx - movl $512,%ecx -11: - stosl - xchgl %eax,%ebx - stosl - xchgl %eax,%ebx - addl $0x1000,%eax - loop 11b - - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b -1: - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) + /* Create early pagetables. */ + call mk_early_pgtbl_32 /* Do early initialization of the fixmap area */ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax +#ifdef CONFIG_X86_PAE +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) -#else /* Not PAE */ - -page_pde_offset = (__PAGE_OFFSET >> 20); - - movl $pa(__brk_base), %edi - movl $pa(initial_page_table), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ - movl %ecx,(%edx) /* Store identity PDE entry */ - movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ - addl $4,%edx - movl $1024, %ecx -11: - stosl - addl $0x1000,%eax - loop 11b - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax +#else movl %eax,pa(initial_page_table+0xffc) #endif @@ -666,6 +558,7 @@ ENTRY(setup_once_ref) __PAGE_ALIGNED_BSS .align PAGE_SIZE #ifdef CONFIG_X86_PAE +.globl initial_pg_pmd initial_pg_pmd: .fill 1024*KPMDS,4,0 #else diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7c6e9ffe4424..4d8183b5f113 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -264,7 +264,7 @@ void __smp_x86_platform_ipi(void) x86_platform_ipi_callback(); } -__visible void smp_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -315,7 +315,7 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) } #endif -__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 3512ba607361..275487872be2 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -9,6 +9,7 @@ #include <linux/hardirq.h> #include <asm/apic.h> #include <asm/trace/irq_vectors.h> +#include <linux/interrupt.h> static inline void __smp_irq_work_interrupt(void) { @@ -16,14 +17,14 @@ static inline void __smp_irq_work_interrupt(void) irq_work_run(); } -__visible void smp_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } -__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index cb9c1ed1d391..f73f475d0573 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -132,10 +132,8 @@ int sched_set_itmt_support(void) sysctl_sched_itmt_enabled = 1; - if (sysctl_sched_itmt_enabled) { - x86_topology_update = true; - rebuild_sched_domains(); - } + x86_topology_update = true; + rebuild_sched_domains(); mutex_unlock(&itmt_update_mutex); diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index fc25f698d792..c37bd0f39c70 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -32,8 +32,7 @@ static void bug_at(unsigned char *ip, int line) * Something went wrong. Crash the box, as something could be * corrupting the kernel. */ - pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n", - ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line); + pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line); BUG(); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 36bc66416021..099fcba4981d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -620,18 +620,4 @@ void __init kvm_spinlock_init(void) } } -static __init int kvm_spinlock_init_jump(void) -{ - if (!kvm_para_available()) - return 0; - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) - return 0; - - static_key_slow_inc(¶virt_ticketlocks_enabled); - printk(KERN_INFO "KVM setup paravirtual spinlock\n"); - - return 0; -} -early_initcall(kvm_spinlock_init_jump); - #endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..542710b99f52 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -107,12 +107,12 @@ static inline void kvm_sched_clock_init(bool stable) { if (!stable) { pv_time_ops.sched_clock = kvm_clock_read; + clear_sched_clock_stable(); return; } kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - set_sched_clock_stable(); printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", kvm_sched_clock_offset); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 6d4bf812af45..6259327f3454 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -42,6 +42,3 @@ struct pv_lock_ops pv_lock_ops = { #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); - -struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE; -EXPORT_SYMBOL(paravirt_ticketlocks_enabled); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4cfba947d774..eb69b14dbfc8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -575,7 +575,9 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { /* - * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX + * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, + * as old kexec-tools loads bzImage below that, unless + * "crashkernel=size[KMG],high" is specified. */ crash_base = memblock_find_in_range(CRASH_ALIGN, high ? CRASH_ADDR_HIGH_MAX diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 68f8cc222f25..d3c66a15bbde 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -259,7 +259,7 @@ static inline void __smp_reschedule_interrupt(void) scheduler_ipi(); } -__visible void smp_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); __smp_reschedule_interrupt(); @@ -268,7 +268,7 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs) */ } -__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* * Need to call irq_enter() before calling the trace point. @@ -292,14 +292,15 @@ static inline void __smp_call_function_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); @@ -314,14 +315,16 @@ static inline void __smp_call_function_single_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e41af597aed8..cb60bbe093b2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1107,6 +1107,16 @@ static u64 read_tsc(struct clocksource *cs) return (u64)rdtsc_ordered(); } +static void tsc_cs_mark_unstable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + tsc_unstable = 1; + clear_sched_clock_stable(); + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to clocksource watchdog\n"); +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ @@ -1119,6 +1129,7 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, }; void mark_tsc_unstable(char *reason) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 073d1f1a620b..a8e91ae89fb3 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -156,13 +156,13 @@ EXPORT_SYMBOL(__delay); inline void __const_udelay(unsigned long xloops) { + unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy; int d0; xloops *= 4; asm("mull %%edx" :"=d" (xloops), "=&a" (d0) - :"1" (xloops), "0" - (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4))); + :"1" (xloops), "0" (lpj * (HZ / 4))); __delay(++xloops); } diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index 90e4f2a6625b..a7dbec4dce27 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -5,14 +5,12 @@ obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o # WiFi obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o # IPC Devices -obj-y += platform_ipc.o obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o obj-$(subst m,y,$(CONFIG_SND_MFLD_MACHINE)) += platform_msic_audio.o obj-$(subst m,y,$(CONFIG_GPIO_MSIC)) += platform_msic_gpio.o obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_ocd.o obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_battery.o obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o -obj-$(subst m,y,$(CONFIG_GPIO_INTEL_PMIC)) += platform_pmic_gpio.o obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o # SPI Devices obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o @@ -28,4 +26,5 @@ obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o # MISC Devices obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o +obj-$(subst m,y,$(CONFIG_RTC_DRV_CMOS)) += platform_mrfld_rtc.o obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_mrfld_wdt.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c index 52534ec29765..74283875c7e8 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c @@ -32,6 +32,9 @@ static struct gpio_keys_button gpio_button[] = { {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20}, {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20}, {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20}, + {KEY_MUTE, -1, 1, "mute_enable", EV_KEY, 0, 20}, + {KEY_VOLUMEUP, -1, 1, "volume_up", EV_KEY, 0, 20}, + {KEY_VOLUMEDOWN, -1, 1, "volume_down", EV_KEY, 0, 20}, {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20}, {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20}, {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20}, diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.c b/arch/x86/platform/intel-mid/device_libs/platform_ipc.c deleted file mode 100644 index a84b73d6c4a0..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_ipc.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * platform_ipc.c: IPC platform library file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/sfi.h> -#include <linux/gpio.h> -#include <asm/intel-mid.h> -#include "platform_ipc.h" - -void __init ipc_device_handler(struct sfi_device_table_entry *pentry, - struct devs_id *dev) -{ - struct platform_device *pdev; - void *pdata = NULL; - static struct resource res __initdata = { - .name = "IRQ", - .flags = IORESOURCE_IRQ, - }; - - pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n", - pentry->name, pentry->irq); - - /* - * We need to call platform init of IPC devices to fill misc_pdata - * structure. It will be used in msic_init for initialization. - */ - if (dev != NULL) - pdata = dev->get_platform_data(pentry); - - /* - * On Medfield the platform device creation is handled by the MSIC - * MFD driver so we don't need to do it here. - */ - if (intel_mid_has_msic()) - return; - - pdev = platform_device_alloc(pentry->name, 0); - if (pdev == NULL) { - pr_err("out of memory for SFI platform device '%s'.\n", - pentry->name); - return; - } - res.start = pentry->irq; - platform_device_add_resources(pdev, &res, 1); - - pdev->dev.platform_data = pdata; - intel_scu_device_register(pdev); -} - -static const struct devs_id pmic_audio_dev_id __initconst = { - .name = "pmic_audio", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .device_handler = &ipc_device_handler, -}; - -sfi_device(pmic_audio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h deleted file mode 100644 index 79bb09d4f718..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * platform_ipc.h: IPC platform library header file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#ifndef _PLATFORM_IPC_H_ -#define _PLATFORM_IPC_H_ - -void __init -ipc_device_handler(struct sfi_device_table_entry *pentry, struct devs_id *dev); - -#endif diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c new file mode 100644 index 000000000000..3135416df037 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c @@ -0,0 +1,48 @@ +/* + * Intel Merrifield legacy RTC initialization file + * + * (C) Copyright 2017 Intel Corporation + * + * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/init.h> + +#include <asm/hw_irq.h> +#include <asm/intel-mid.h> +#include <asm/io_apic.h> +#include <asm/time.h> +#include <asm/x86_init.h> + +static int __init mrfld_legacy_rtc_alloc_irq(void) +{ + struct irq_alloc_info info; + int ret; + + if (!x86_platform.legacy.rtc) + return -ENODEV; + + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 0); + ret = mp_map_gsi_to_irq(RTC_IRQ, IOAPIC_MAP_ALLOC, &info); + if (ret < 0) { + pr_info("Failed to allocate RTC interrupt. Disabling RTC\n"); + x86_platform.legacy.rtc = 0; + return ret; + } + + return 0; +} + +static int __init mrfld_legacy_rtc_init(void) +{ + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return -ENODEV; + + return mrfld_legacy_rtc_alloc_irq(); +} +arch_initcall(mrfld_legacy_rtc_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 3f1f1c77d090..86edd1e941eb 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -28,9 +28,9 @@ static struct platform_device wdt_dev = { static int tangier_probe(struct platform_device *pdev) { - int gsi; struct irq_alloc_info info; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; + int gsi, irq; if (!pdata) return -EINVAL; @@ -38,10 +38,10 @@ static int tangier_probe(struct platform_device *pdev) /* IOAPIC builds identity mapping between GSI and IRQ on MID */ gsi = pdata->irq; ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); - if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { - dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", - gsi); - return -EINVAL; + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + if (irq < 0) { + dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi); + return irq; } return 0; @@ -82,4 +82,4 @@ static int __init register_mid_wdt(void) return 0; } -rootfs_initcall(register_mid_wdt); +arch_initcall(register_mid_wdt); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c index cb3490ecb341..d4dc744dd5a5 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c @@ -20,7 +20,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void *msic_audio_platform_data(void *info) { @@ -40,8 +39,8 @@ static const struct devs_id msic_audio_dev_id __initconst = { .name = "msic_audio", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_audio_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_audio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c index 4f72193939a6..5c3e9919633f 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c @@ -19,7 +19,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_battery_platform_data(void *info) { @@ -30,8 +29,8 @@ static const struct devs_id msic_battery_dev_id __initconst = { .name = "msic_battery", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_battery_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_battery_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c index 70de5b531ba0..9fdb88d460d7 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c @@ -20,7 +20,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_gpio_platform_data(void *info) { @@ -41,8 +40,8 @@ static const struct devs_id msic_gpio_dev_id __initconst = { .name = "msic_gpio", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_gpio_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_gpio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c index 3d7c2011b6cf..7ae37cdbf256 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c @@ -20,7 +20,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_ocd_platform_data(void *info) { @@ -42,8 +41,8 @@ static const struct devs_id msic_ocd_dev_id __initconst = { .name = "msic_ocd", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_ocd_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_ocd_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c index 038f618fbc52..96809b98cf69 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c @@ -18,7 +18,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_power_btn_platform_data(void *info) { @@ -29,8 +28,8 @@ static const struct devs_id msic_power_btn_dev_id __initconst = { .name = "msic_power_btn", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_power_btn_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_power_btn_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c index 114a5755b1e4..3e4167d246cd 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c @@ -19,7 +19,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_thermal_platform_data(void *info) { @@ -30,8 +29,8 @@ static const struct devs_id msic_thermal_dev_id __initconst = { .name = "msic_thermal", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_thermal_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_thermal_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c deleted file mode 100644 index e30cb62e3300..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * platform_pmic_gpio.c: PMIC GPIO platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/scatterlist.h> -#include <linux/gpio.h> -#include <linux/init.h> -#include <linux/sfi.h> -#include <linux/intel_pmic_gpio.h> -#include <asm/intel-mid.h> - -#include "platform_ipc.h" - -static void __init *pmic_gpio_platform_data(void *info) -{ - static struct intel_pmic_gpio_platform_data pmic_gpio_pdata; - int gpio_base = get_gpio_by_name("pmic_gpio_base"); - - if (gpio_base < 0) - gpio_base = 64; - pmic_gpio_pdata.gpio_base = gpio_base; - pmic_gpio_pdata.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; - pmic_gpio_pdata.gpiointr = 0xffffeff8; - - return &pmic_gpio_pdata; -} - -static const struct devs_id pmic_gpio_spi_dev_id __initconst = { - .name = "pmic_gpio", - .type = SFI_DEV_TYPE_SPI, - .delay = 1, - .get_platform_data = &pmic_gpio_platform_data, -}; - -static const struct devs_id pmic_gpio_ipc_dev_id __initconst = { - .name = "pmic_gpio", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .get_platform_data = &pmic_gpio_platform_data, - .device_handler = &ipc_device_handler -}; - -sfi_device(pmic_gpio_spi_dev_id); -sfi_device(pmic_gpio_ipc_dev_id); diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c index e0607c77a1bd..ae7bdeb0e507 100644 --- a/arch/x86/platform/intel-mid/mrfld.c +++ b/arch/x86/platform/intel-mid/mrfld.c @@ -91,6 +91,7 @@ static unsigned long __init tangier_calibrate_tsc(void) static void __init tangier_arch_setup(void) { x86_platform.calibrate_tsc = tangier_calibrate_tsc; + x86_platform.legacy.rtc = 1; } /* tangier arch ops */ diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 051d264fce2e..19b43e3a9f0f 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -15,7 +15,6 @@ #include <linux/interrupt.h> #include <linux/scatterlist.h> #include <linux/sfi.h> -#include <linux/intel_pmic_gpio.h> #include <linux/spi/spi.h> #include <linux/i2c.h> #include <linux/skbuff.h> @@ -226,7 +225,7 @@ int get_gpio_by_name(const char *name) return -EINVAL; } -void __init intel_scu_device_register(struct platform_device *pdev) +static void __init intel_scu_ipc_device_register(struct platform_device *pdev) { if (ipc_next_dev == MAX_IPCDEVS) pr_err("too many SCU IPC devices"); @@ -335,10 +334,22 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry, pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n", pentry->name, pentry->irq); + + /* + * We need to call platform init of IPC devices to fill misc_pdata + * structure. It will be used in msic_init for initialization. + */ pdata = intel_mid_sfi_get_pdata(dev, pentry); if (IS_ERR(pdata)) return; + /* + * On Medfield the platform device creation is handled by the MSIC + * MFD driver so we don't need to do it here. + */ + if (dev->msic && intel_mid_has_msic()) + return; + pdev = platform_device_alloc(pentry->name, 0); if (pdev == NULL) { pr_err("out of memory for SFI platform device '%s'.\n", @@ -348,7 +359,10 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry, install_irq_resource(pdev, pentry->irq); pdev->dev.platform_data = pdata; - platform_device_add(pdev); + if (dev->delay) + intel_scu_ipc_device_register(pdev); + else + platform_device_add(pdev); } static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry, @@ -503,27 +517,23 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) if (!dev) continue; - if (dev->device_handler) { - dev->device_handler(pentry, dev); - } else { - switch (pentry->type) { - case SFI_DEV_TYPE_IPC: - sfi_handle_ipc_dev(pentry, dev); - break; - case SFI_DEV_TYPE_SPI: - sfi_handle_spi_dev(pentry, dev); - break; - case SFI_DEV_TYPE_I2C: - sfi_handle_i2c_dev(pentry, dev); - break; - case SFI_DEV_TYPE_SD: - sfi_handle_sd_dev(pentry, dev); - break; - case SFI_DEV_TYPE_UART: - case SFI_DEV_TYPE_HSI: - default: - break; - } + switch (pentry->type) { + case SFI_DEV_TYPE_IPC: + sfi_handle_ipc_dev(pentry, dev); + break; + case SFI_DEV_TYPE_SPI: + sfi_handle_spi_dev(pentry, dev); + break; + case SFI_DEV_TYPE_I2C: + sfi_handle_i2c_dev(pentry, dev); + break; + case SFI_DEV_TYPE_SD: + sfi_handle_sd_dev(pentry, dev); + break; + case SFI_DEV_TYPE_UART: + case SFI_DEV_TYPE_HSI: + default: + break; } } return 0; diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index d957d5f21a86..0bc60a308730 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,6 +1,6 @@ config MCE_AMD_INJ tristate "Simple MCE injection interface for AMD processors" - depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB + depends on RAS && X86_MCE && DEBUG_FS && AMD_NB default n help This is a simple debugfs interface to inject MCEs and test different diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index e8a9ea7d7a21..25a7c4302ce7 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -141,25 +141,6 @@ void __init xen_init_spinlocks(void) pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); } -/* - * While the jump_label init code needs to happend _after_ the jump labels are - * enabled and before SMP is started. Hence we use pre-SMP initcall level - * init. We cannot do it in xen_init_spinlocks as that is done before - * jump labels are activated. - */ -static __init int xen_init_spinlocks_jump(void) -{ - if (!xen_pvspin) - return 0; - - if (!xen_domain()) - return 0; - - static_key_slow_inc(¶virt_ticketlocks_enabled); - return 0; -} -early_initcall(xen_init_spinlocks_jump); - static __init int xen_parse_nopvspin(char *arg) { xen_pvspin = false; diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index b3842ffc19ba..a15270a806fc 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -212,6 +212,7 @@ static bool __init extlog_get_l1addr(void) } static struct notifier_block extlog_mce_dec = { .notifier_call = extlog_print, + .priority = MCE_PRIO_EXTLOG, }; static int __init extlog_init(void) diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index e5ce81c38eed..3ba1c3472cf9 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -90,6 +90,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, static struct notifier_block nfit_mce_dec = { .notifier_call = nfit_handle_mce, + .priority = MCE_PRIO_NFIT, }; void nfit_mce_register(void) diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index be6a599bc0c1..0fc7c4da7756 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -206,7 +206,7 @@ platform_msi_alloc_priv_data(struct device *dev, unsigned int nvec, { struct platform_msi_priv_data *datap; /* - * Limit the number of interrupts to 256 per device. Should we + * Limit the number of interrupts to 2048 per device. Should we * need to bump this up, DEV_ID_SHIFT should be adjusted * accordingly (which would impact the max number of MSI * capable devices). diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index ab62b81c2ca7..dece26f119d4 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1070,7 +1070,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned .done = 0, .flags = flags, .error = 0, - .kref = { ATOMIC_INIT(2) }, + .kref = KREF_INIT(2), }; if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 82934eaa76ae..2f1772d5ee51 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2945,7 +2945,6 @@ void drbd_delete_device(struct drbd_device *device) struct drbd_resource *resource = device->resource; struct drbd_connection *connection; struct drbd_peer_device *peer_device; - int refs = 3; /* move to free_peer_device() */ for_each_peer_device(peer_device, device) @@ -2953,13 +2952,15 @@ void drbd_delete_device(struct drbd_device *device) drbd_debugfs_device_cleanup(device); for_each_connection(connection, resource) { idr_remove(&connection->peer_devices, device->vnr); - refs++; + kref_put(&device->kref, drbd_destroy_device); } idr_remove(&resource->devices, device->vnr); + kref_put(&device->kref, drbd_destroy_device); idr_remove(&drbd_devices, device_to_minor(device)); + kref_put(&device->kref, drbd_destroy_device); del_gendisk(device->vdisk); synchronize_rcu(); - kref_sub(&device->kref, refs, drbd_destroy_device); + kref_put(&device->kref, drbd_destroy_device); } static int __init drbd_init(void) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index cb6bdb75d52d..652114ae1a8a 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -421,7 +421,6 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, struct drbd_peer_device *peer_device = first_peer_device(device); unsigned s = req->rq_state; int c_put = 0; - int k_put = 0; if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP)) set |= RQ_COMPLETION_SUSP; @@ -437,6 +436,8 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, /* intent: get references */ + kref_get(&req->kref); + if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING)) atomic_inc(&req->completion_ref); @@ -473,15 +474,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) { D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING); - /* local completion may still come in later, - * we need to keep the req object around. */ - kref_get(&req->kref); ++c_put; } if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) { if (req->rq_state & RQ_LOCAL_ABORTED) - ++k_put; + kref_put(&req->kref, drbd_req_destroy); else ++c_put; list_del_init(&req->req_pending_local); @@ -503,7 +501,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, if (s & RQ_NET_SENT) atomic_sub(req->i.size >> 9, &device->ap_in_flight); if (s & RQ_EXP_BARR_ACK) - ++k_put; + kref_put(&req->kref, drbd_req_destroy); req->net_done_jif = jiffies; /* in ahead/behind mode, or just in case, @@ -516,25 +514,16 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, /* potentially complete and destroy */ - if (k_put || c_put) { - /* Completion does it's own kref_put. If we are going to - * kref_sub below, we need req to be still around then. */ - int at_least = k_put + !!c_put; - int refcount = atomic_read(&req->kref.refcount); - if (refcount < at_least) - drbd_err(device, - "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n", - s, req->rq_state, refcount, at_least); - } - /* If we made progress, retry conflicting peer requests, if any. */ if (req->i.waiting) wake_up(&device->misc_wait); - if (c_put) - k_put += drbd_req_put_completion_ref(req, m, c_put); - if (k_put) - kref_sub(&req->kref, k_put, drbd_req_destroy); + if (c_put) { + if (drbd_req_put_completion_ref(req, m, c_put)) + kref_put(&req->kref, drbd_req_destroy); + } else { + kref_put(&req->kref, drbd_req_destroy); + } } static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 588721f30a22..362cecc77130 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1535,7 +1535,7 @@ static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) static void rbd_obj_request_get(struct rbd_obj_request *obj_request) { dout("%s: obj %p (was %d)\n", __func__, obj_request, - atomic_read(&obj_request->kref.refcount)); + kref_read(&obj_request->kref)); kref_get(&obj_request->kref); } @@ -1544,14 +1544,14 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request) { rbd_assert(obj_request != NULL); dout("%s: obj %p (was %d)\n", __func__, obj_request, - atomic_read(&obj_request->kref.refcount)); + kref_read(&obj_request->kref)); kref_put(&obj_request->kref, rbd_obj_request_destroy); } static void rbd_img_request_get(struct rbd_img_request *img_request) { dout("%s: img %p (was %d)\n", __func__, img_request, - atomic_read(&img_request->kref.refcount)); + kref_read(&img_request->kref)); kref_get(&img_request->kref); } @@ -1562,7 +1562,7 @@ static void rbd_img_request_put(struct rbd_img_request *img_request) { rbd_assert(img_request != NULL); dout("%s: img %p (was %d)\n", __func__, img_request, - atomic_read(&img_request->kref.refcount)); + kref_read(&img_request->kref)); if (img_request_child_test(img_request)) kref_put(&img_request->kref, rbd_parent_request_destroy); else diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index a363170e45b1..024b473524c0 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -814,7 +814,7 @@ static void virtblk_remove(struct virtio_device *vdev) /* Stop all the virtqueues. */ vdev->config->reset(vdev); - refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount); + refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref); put_disk(vblk->disk); vdev->config->del_vqs(vdev); kfree(vblk->vqs); diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index 69b5adead0ad..75ad847593b7 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -1835,6 +1835,7 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val, static struct notifier_block i7_mce_dec = { .notifier_call = i7core_mce_check_error, + .priority = MCE_PRIO_EDAC, }; struct memdev_dmi_entry { diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 34208f38c5b1..0d9bc25543d8 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -942,7 +942,8 @@ static const char *decode_error_status(struct mce *m) return "Corrected error, no action required."; } -int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) +static int +amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; struct cpuinfo_x86 *c = &cpu_data(m->extcpu); @@ -1006,6 +1007,9 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) } else pr_cont("\n"); + if (m->tsc) + pr_emerg(HW_ERR "TSC: %llu\n", m->tsc); + if (!fam_ops) goto err_code; @@ -1047,10 +1051,10 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) return NOTIFY_STOP; } -EXPORT_SYMBOL_GPL(amd_decode_mce); static struct notifier_block amd_mce_dec_nb = { .notifier_call = amd_decode_mce, + .priority = MCE_PRIO_EDAC, }; static int __init mce_amd_init(void) diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index c2359a1ea6b3..0b6a68673e0e 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -79,6 +79,5 @@ struct amd_decoder_ops { void amd_report_gart_errors(bool); void amd_register_ecc_decoder(void (*f)(int, struct mce *)); void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)); -int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data); #endif /* _EDAC_MCE_AMD_H */ diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 54ae6dc45ab2..c585a014dd3d 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3136,7 +3136,8 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, } static struct notifier_block sbridge_mce_dec = { - .notifier_call = sbridge_mce_check_error, + .notifier_call = sbridge_mce_check_error, + .priority = MCE_PRIO_EDAC, }; /**************************************************************************** diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c index 79ef675e4d6f..1159dba4671f 100644 --- a/drivers/edac/skx_edac.c +++ b/drivers/edac/skx_edac.c @@ -1007,7 +1007,8 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val, } static struct notifier_block skx_mce_dec = { - .notifier_call = skx_mce_check_error, + .notifier_call = skx_mce_check_error, + .priority = MCE_PRIO_EDAC, }; static void skx_remove(void) diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c index 5cf38a474845..f7ba32bfe39b 100644 --- a/drivers/gpu/drm/drm_gem_cma_helper.c +++ b/drivers/gpu/drm/drm_gem_cma_helper.c @@ -447,7 +447,7 @@ void drm_gem_cma_describe(struct drm_gem_cma_object *cma_obj, off = drm_vma_node_start(&obj->vma_node); seq_printf(m, "%2d (%2d) %08llx %pad %p %zu", - obj->name, obj->refcount.refcount.counter, + obj->name, kref_read(&obj->refcount), off, &cma_obj->paddr, cma_obj->vaddr, obj->size); seq_printf(m, "\n"); diff --git a/drivers/gpu/drm/drm_info.c b/drivers/gpu/drm/drm_info.c index ffb2ab389d1d..6b68e9088436 100644 --- a/drivers/gpu/drm/drm_info.c +++ b/drivers/gpu/drm/drm_info.c @@ -118,7 +118,7 @@ static int drm_gem_one_name_info(int id, void *ptr, void *data) seq_printf(m, "%6d %8zd %7d %8d\n", obj->name, obj->size, obj->handle_count, - atomic_read(&obj->refcount.refcount)); + kref_read(&obj->refcount)); return 0; } diff --git a/drivers/gpu/drm/drm_mode_object.c b/drivers/gpu/drm/drm_mode_object.c index 14543ff08c51..220a6c1f4ab9 100644 --- a/drivers/gpu/drm/drm_mode_object.c +++ b/drivers/gpu/drm/drm_mode_object.c @@ -160,7 +160,7 @@ EXPORT_SYMBOL(drm_mode_object_find); void drm_mode_object_unreference(struct drm_mode_object *obj) { if (obj->free_cb) { - DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, atomic_read(&obj->refcount.refcount)); + DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, kref_read(&obj->refcount)); kref_put(&obj->refcount, obj->free_cb); } } @@ -177,7 +177,7 @@ EXPORT_SYMBOL(drm_mode_object_unreference); void drm_mode_object_reference(struct drm_mode_object *obj) { if (obj->free_cb) { - DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, atomic_read(&obj->refcount.refcount)); + DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, kref_read(&obj->refcount)); kref_get(&obj->refcount); } } diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 114dddbd297b..aa6e35ddc87f 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -486,7 +486,7 @@ static void etnaviv_gem_describe(struct drm_gem_object *obj, struct seq_file *m) seq_printf(m, "%08x: %c %2d (%2d) %08lx %p %zd\n", etnaviv_obj->flags, is_active(etnaviv_obj) ? 'A' : 'I', - obj->name, obj->refcount.refcount.counter, + obj->name, kref_read(&obj->refcount), off, etnaviv_obj->vaddr, obj->size); rcu_read_lock(); diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h index 290eaa7fc9eb..bf90b07163d1 100644 --- a/drivers/gpu/drm/i915/i915_gem_object.h +++ b/drivers/gpu/drm/i915/i915_gem_object.h @@ -256,7 +256,7 @@ extern void drm_gem_object_unreference_unlocked(struct drm_gem_object *); static inline bool i915_gem_object_is_dead(const struct drm_i915_gem_object *obj) { - return atomic_read(&obj->base.refcount.refcount) == 0; + return kref_read(&obj->base.refcount) == 0; } static inline bool diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index c3b43f4d4f1f..e140b05af134 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -641,7 +641,7 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m) seq_printf(m, "%08x: %c %2d (%2d) %08llx %p\t", msm_obj->flags, is_active(msm_obj) ? 'A' : 'I', - obj->name, obj->refcount.refcount.counter, + obj->name, kref_read(&obj->refcount), off, msm_obj->vaddr); for (id = 0; id < priv->num_aspaces; id++) diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c index a6126c93f215..88ee60d1b907 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.c +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c @@ -527,7 +527,7 @@ static bool nouveau_fence_no_signaling(struct dma_fence *f) * caller should have a reference on the fence, * else fence could get freed here */ - WARN_ON(atomic_read(&fence->base.refcount.refcount) <= 1); + WARN_ON(kref_read(&fence->base.refcount) <= 1); /* * This needs uevents to work correctly, but dma_fence_add_callback relies on diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index 4a90c690f09e..74a9968df421 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -1033,7 +1033,7 @@ void omap_gem_describe(struct drm_gem_object *obj, struct seq_file *m) off = drm_vma_node_start(&obj->vma_node); seq_printf(m, "%08x: %2d (%2d) %08llx %pad (%2d) %p %4d", - omap_obj->flags, obj->name, obj->refcount.refcount.counter, + omap_obj->flags, obj->name, kref_read(&obj->refcount), off, &omap_obj->paddr, omap_obj->paddr_cnt, omap_obj->vaddr, omap_obj->roll); diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 4562e53c8244..17478f38dea3 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -140,8 +140,8 @@ static void ttm_bo_release_list(struct kref *list_kref) struct ttm_bo_device *bdev = bo->bdev; size_t acc_size = bo->acc_size; - BUG_ON(atomic_read(&bo->list_kref.refcount)); - BUG_ON(atomic_read(&bo->kref.refcount)); + BUG_ON(kref_read(&bo->list_kref)); + BUG_ON(kref_read(&bo->kref)); BUG_ON(atomic_read(&bo->cpu_writers)); BUG_ON(bo->mem.mm_node != NULL); BUG_ON(!list_empty(&bo->lru)); @@ -184,58 +184,41 @@ void ttm_bo_add_to_lru(struct ttm_buffer_object *bo) } EXPORT_SYMBOL(ttm_bo_add_to_lru); -int ttm_bo_del_from_lru(struct ttm_buffer_object *bo) +static void ttm_bo_ref_bug(struct kref *list_kref) { - int put_count = 0; + BUG(); +} +void ttm_bo_del_from_lru(struct ttm_buffer_object *bo) +{ if (!list_empty(&bo->swap)) { list_del_init(&bo->swap); - ++put_count; + kref_put(&bo->list_kref, ttm_bo_ref_bug); } if (!list_empty(&bo->lru)) { list_del_init(&bo->lru); - ++put_count; + kref_put(&bo->list_kref, ttm_bo_ref_bug); } /* * TODO: Add a driver hook to delete from * driver-specific LRU's here. */ - - return put_count; -} - -static void ttm_bo_ref_bug(struct kref *list_kref) -{ - BUG(); -} - -void ttm_bo_list_ref_sub(struct ttm_buffer_object *bo, int count, - bool never_free) -{ - kref_sub(&bo->list_kref, count, - (never_free) ? ttm_bo_ref_bug : ttm_bo_release_list); } void ttm_bo_del_sub_from_lru(struct ttm_buffer_object *bo) { - int put_count; - spin_lock(&bo->glob->lru_lock); - put_count = ttm_bo_del_from_lru(bo); + ttm_bo_del_from_lru(bo); spin_unlock(&bo->glob->lru_lock); - ttm_bo_list_ref_sub(bo, put_count, true); } EXPORT_SYMBOL(ttm_bo_del_sub_from_lru); void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo) { - int put_count = 0; - lockdep_assert_held(&bo->resv->lock.base); - put_count = ttm_bo_del_from_lru(bo); - ttm_bo_list_ref_sub(bo, put_count, true); + ttm_bo_del_from_lru(bo); ttm_bo_add_to_lru(bo); } EXPORT_SYMBOL(ttm_bo_move_to_lru_tail); @@ -435,7 +418,6 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo) { struct ttm_bo_device *bdev = bo->bdev; struct ttm_bo_global *glob = bo->glob; - int put_count; int ret; spin_lock(&glob->lru_lock); @@ -443,13 +425,10 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo) if (!ret) { if (!ttm_bo_wait(bo, false, true)) { - put_count = ttm_bo_del_from_lru(bo); - + ttm_bo_del_from_lru(bo); spin_unlock(&glob->lru_lock); ttm_bo_cleanup_memtype_use(bo); - ttm_bo_list_ref_sub(bo, put_count, true); - return; } else ttm_bo_flush_all_fences(bo); @@ -492,7 +471,6 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo, bool no_wait_gpu) { struct ttm_bo_global *glob = bo->glob; - int put_count; int ret; ret = ttm_bo_wait(bo, false, true); @@ -542,15 +520,13 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo, return ret; } - put_count = ttm_bo_del_from_lru(bo); + ttm_bo_del_from_lru(bo); list_del_init(&bo->ddestroy); - ++put_count; + kref_put(&bo->list_kref, ttm_bo_ref_bug); spin_unlock(&glob->lru_lock); ttm_bo_cleanup_memtype_use(bo); - ttm_bo_list_ref_sub(bo, put_count, true); - return 0; } @@ -728,7 +704,7 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev, struct ttm_bo_global *glob = bdev->glob; struct ttm_mem_type_manager *man = &bdev->man[mem_type]; struct ttm_buffer_object *bo; - int ret = -EBUSY, put_count; + int ret = -EBUSY; unsigned i; spin_lock(&glob->lru_lock); @@ -766,13 +742,11 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev, return ret; } - put_count = ttm_bo_del_from_lru(bo); + ttm_bo_del_from_lru(bo); spin_unlock(&glob->lru_lock); BUG_ON(ret != 0); - ttm_bo_list_ref_sub(bo, put_count, true); - ret = ttm_bo_evict(bo, interruptible, no_wait_gpu); ttm_bo_unreserve(bo); @@ -1667,7 +1641,6 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink) container_of(shrink, struct ttm_bo_global, shrink); struct ttm_buffer_object *bo; int ret = -EBUSY; - int put_count; unsigned i; spin_lock(&glob->lru_lock); @@ -1694,11 +1667,9 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink) return ret; } - put_count = ttm_bo_del_from_lru(bo); + ttm_bo_del_from_lru(bo); spin_unlock(&glob->lru_lock); - ttm_bo_list_ref_sub(bo, put_count, true); - /** * Move to system cached */ diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c index d35bc491e8de..5e1bcabffef5 100644 --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c @@ -48,9 +48,7 @@ static void ttm_eu_del_from_lru_locked(struct list_head *list) list_for_each_entry(entry, list, head) { struct ttm_buffer_object *bo = entry->bo; - unsigned put_count = ttm_bo_del_from_lru(bo); - - ttm_bo_list_ref_sub(bo, put_count, true); + ttm_bo_del_from_lru(bo); } } diff --git a/drivers/gpu/drm/ttm/ttm_object.c b/drivers/gpu/drm/ttm/ttm_object.c index 4f5fa8d65fe9..fdb451e3ec01 100644 --- a/drivers/gpu/drm/ttm/ttm_object.c +++ b/drivers/gpu/drm/ttm/ttm_object.c @@ -304,7 +304,7 @@ bool ttm_ref_object_exists(struct ttm_object_file *tfile, * Verify that the ref->obj pointer was actually valid! */ rmb(); - if (unlikely(atomic_read(&ref->kref.refcount) == 0)) + if (unlikely(kref_read(&ref->kref) == 0)) goto out_false; rcu_read_unlock(); diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h index b9efadfffb4f..e66e75921797 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.h +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h @@ -55,14 +55,14 @@ #define put_ep(ep) { \ PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \ - ep, atomic_read(&((ep)->kref.refcount))); \ - WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \ + ep, kref_read(&((ep)->kref))); \ + WARN_ON(kref_read(&((ep)->kref)) < 1); \ kref_put(&((ep)->kref), __free_ep); \ } #define get_ep(ep) { \ PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \ - ep, atomic_read(&((ep)->kref.refcount))); \ + ep, kref_read(&((ep)->kref))); \ kref_get(&((ep)->kref)); \ } diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index d939980a708f..a9194db7f9b8 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -961,7 +961,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, case IWCH_QP_STATE_RTS: switch (attrs->next_state) { case IWCH_QP_STATE_CLOSING: - BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); + BUG_ON(kref_read(&qhp->ep->com.kref) < 2); qhp->attr.state = IWCH_QP_STATE_CLOSING; if (!internal) { abort=0; diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 8cd4d054a87e..d19662f635b1 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -672,14 +672,14 @@ enum c4iw_mmid_state { #define c4iw_put_ep(ep) { \ PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \ - ep, atomic_read(&((ep)->kref.refcount))); \ - WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \ + ep, kref_read(&((ep)->kref))); \ + WARN_ON(kref_read(&((ep)->kref)) < 1); \ kref_put(&((ep)->kref), _c4iw_free_ep); \ } #define c4iw_get_ep(ep) { \ PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \ - ep, atomic_read(&((ep)->kref.refcount))); \ + ep, kref_read(&((ep)->kref))); \ kref_get(&((ep)->kref)); \ } void _c4iw_free_ep(struct kref *kref); diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 04c1c382dedb..d4fd2f5c8326 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1580,7 +1580,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, case C4IW_QP_STATE_RTS: switch (attrs->next_state) { case C4IW_QP_STATE_CLOSING: - BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); + BUG_ON(kref_read(&qhp->ep->com.kref) < 2); t4_set_wq_in_error(&qhp->wq); set_state(qhp, C4IW_QP_STATE_CLOSING); ep = qhp->ep; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c index 80ef3f8998c8..04443242e258 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -80,7 +80,7 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr, left = PAGE_SIZE; mutex_lock(&us_ibdev->usdev_lock); - if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) { + if (kref_read(&us_ibdev->vf_cnt) > 0) { char *busname; /* @@ -99,7 +99,7 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr, PCI_FUNC(us_ibdev->pdev->devfn), netdev_name(us_ibdev->netdev), us_ibdev->ufdev->mac, - atomic_read(&us_ibdev->vf_cnt.refcount)); + kref_read(&us_ibdev->vf_cnt)); UPDATE_PTR_LEFT(n, ptr, left); for (res_type = USNIC_VNIC_RES_TYPE_EOL; @@ -147,7 +147,7 @@ usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr, us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); return scnprintf(buf, PAGE_SIZE, "%u\n", - atomic_read(&us_ibdev->vf_cnt.refcount)); + kref_read(&us_ibdev->vf_cnt)); } static ssize_t diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 0ba274ff7be6..3284730d3c09 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -291,11 +291,11 @@ int usnic_ib_query_device(struct ib_device *ibdev, qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); props->max_qp = qp_per_vf * - atomic_read(&us_ibdev->vf_cnt.refcount); + kref_read(&us_ibdev->vf_cnt); props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] * - atomic_read(&us_ibdev->vf_cnt.refcount); + kref_read(&us_ibdev->vf_cnt); props->max_pd = USNIC_UIOM_MAX_PD_CNT; props->max_mr = USNIC_UIOM_MAX_MR_CNT; props->local_ca_ack_delay = 0; diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 84d2f0e4c754..d36d427a9efb 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -794,7 +794,7 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) DECLARE_WAITQUEUE(wait, current); add_wait_queue(&c->free_buffer_wait, &wait); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); dm_bufio_unlock(c); io_schedule(); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8a9f742d8ed7..1cb2ca9dfae3 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1210,14 +1210,14 @@ continue_locked: spin_unlock_irq(&cc->write_thread_wait.lock); if (unlikely(kthread_should_stop())) { - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); remove_wait_queue(&cc->write_thread_wait, &wait); break; } schedule(); - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); spin_lock_irq(&cc->write_thread_wait.lock); __remove_wait_queue(&cc->write_thread_wait, &wait); goto continue_locked; diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index a6dde7cab458..758d90cc2733 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -120,7 +120,7 @@ static int __check_holder(struct block_lock *lock) static void __wait(struct waiter *w) { for (;;) { - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!w->task) break; @@ -128,7 +128,7 @@ static void __wait(struct waiter *w) schedule(); } - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); } static void __wake_waiter(struct waiter *w) diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c index 7f1b282d7d96..cb290b8ca0c8 100644 --- a/drivers/misc/genwqe/card_dev.c +++ b/drivers/misc/genwqe/card_dev.c @@ -1396,7 +1396,7 @@ int genwqe_device_remove(struct genwqe_dev *cd) * application which will decrease this reference from * 1/unused to 0/illegal and not from 2/used 1/empty. */ - rc = atomic_read(&cd->cdev_genwqe.kobj.kref.refcount); + rc = kref_read(&cd->cdev_genwqe.kobj.kref); if (rc != 1) { dev_err(&pci_dev->dev, "[%s] err: cdev_genwqe...refcount=%d\n", __func__, rc); diff --git a/drivers/misc/mei/debugfs.c b/drivers/misc/mei/debugfs.c index c6217a4993ad..a617aa5a3ad8 100644 --- a/drivers/misc/mei/debugfs.c +++ b/drivers/misc/mei/debugfs.c @@ -67,7 +67,7 @@ static ssize_t mei_dbgfs_read_meclients(struct file *fp, char __user *ubuf, me_cl->props.max_number_of_connections, me_cl->props.max_msg_length, me_cl->props.single_recv_buf, - atomic_read(&me_cl->refcnt.refcount)); + kref_read(&me_cl->refcnt)); mei_me_cl_put(me_cl); } diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index 56efaf72d08e..d2961ef39a3a 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -155,7 +155,7 @@ static void pnv_php_detach_device_nodes(struct device_node *parent) pnv_php_detach_device_nodes(dn); of_node_put(dn); - refcount = atomic_read(&dn->kobj.kref.refcount); + refcount = kref_read(&dn->kobj.kref); if (refcount != 1) pr_warn("Invalid refcount %d on <%s>\n", refcount, of_node_full_name(dn)); diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index 429d34c348b9..e42909524dee 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(pci_create_slot); void pci_destroy_slot(struct pci_slot *slot) { dev_dbg(&slot->bus->dev, "dev %02x, dec refcount to %d\n", - slot->number, atomic_read(&slot->kobj.kref.refcount) - 1); + slot->number, kref_read(&slot->kobj.kref) - 1); mutex_lock(&pci_slot_mutex); kobject_put(&slot->kobj); diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 2c95f77895e2..a9e9c91cf4d4 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -816,13 +816,6 @@ config INTEL_SCU_IPC_UTIL low level access for debug work and updating the firmware. Say N unless you will be doing this on an Intel MID platform. -config GPIO_INTEL_PMIC - bool "Intel PMIC GPIO support" - depends on INTEL_SCU_IPC && GPIOLIB - ---help--- - Say Y here to support GPIO via the SCU IPC interface - on Intel MID platforms. - config INTEL_MID_POWER_BUTTON tristate "power button driver for Intel MID platforms" depends on INTEL_SCU_IPC && INPUT diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index 49ee7ef283bb..cf9fc3e930c7 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -50,7 +50,6 @@ obj-$(CONFIG_INTEL_SCU_IPC) += intel_scu_ipc.o obj-$(CONFIG_INTEL_SCU_IPC_UTIL) += intel_scu_ipcutil.o obj-$(CONFIG_INTEL_MFLD_THERMAL) += intel_mid_thermal.o obj-$(CONFIG_INTEL_IPS) += intel_ips.o -obj-$(CONFIG_GPIO_INTEL_PMIC) += intel_pmic_gpio.o obj-$(CONFIG_XO1_RFKILL) += xo1-rfkill.o obj-$(CONFIG_XO15_EBOOK) += xo15-ebook.o obj-$(CONFIG_IBM_RTL) += ibm_rtl.o diff --git a/drivers/platform/x86/intel_pmic_gpio.c b/drivers/platform/x86/intel_pmic_gpio.c deleted file mode 100644 index 91ae58510d92..000000000000 --- a/drivers/platform/x86/intel_pmic_gpio.c +++ /dev/null @@ -1,326 +0,0 @@ -/* Moorestown PMIC GPIO (access through IPC) driver - * Copyright (c) 2008 - 2009, Intel Corporation. - * - * Author: Alek Du <alek.du@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* Supports: - * Moorestown platform PMIC chip - */ - -#define pr_fmt(fmt) "%s: " fmt, __func__ - -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/delay.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/ioport.h> -#include <linux/init.h> -#include <linux/io.h> -#include <linux/gpio/driver.h> -#include <asm/intel_scu_ipc.h> -#include <linux/device.h> -#include <linux/intel_pmic_gpio.h> -#include <linux/platform_device.h> - -#define DRIVER_NAME "pmic_gpio" - -/* register offset that IPC driver should use - * 8 GPIO + 8 GPOSW (6 controllable) + 8GPO - */ -enum pmic_gpio_register { - GPIO0 = 0xE0, - GPIO7 = 0xE7, - GPIOINT = 0xE8, - GPOSWCTL0 = 0xEC, - GPOSWCTL5 = 0xF1, - GPO = 0xF4, -}; - -/* bits definition for GPIO & GPOSW */ -#define GPIO_DRV 0x01 -#define GPIO_DIR 0x02 -#define GPIO_DIN 0x04 -#define GPIO_DOU 0x08 -#define GPIO_INTCTL 0x30 -#define GPIO_DBC 0xc0 - -#define GPOSW_DRV 0x01 -#define GPOSW_DOU 0x08 -#define GPOSW_RDRV 0x30 - -#define GPIO_UPDATE_TYPE 0x80000000 - -#define NUM_GPIO 24 - -struct pmic_gpio { - struct mutex buslock; - struct gpio_chip chip; - void *gpiointr; - int irq; - unsigned irq_base; - unsigned int update_type; - u32 trigger_type; -}; - -static void pmic_program_irqtype(int gpio, int type) -{ - if (type & IRQ_TYPE_EDGE_RISING) - intel_scu_ipc_update_register(GPIO0 + gpio, 0x20, 0x20); - else - intel_scu_ipc_update_register(GPIO0 + gpio, 0x00, 0x20); - - if (type & IRQ_TYPE_EDGE_FALLING) - intel_scu_ipc_update_register(GPIO0 + gpio, 0x10, 0x10); - else - intel_scu_ipc_update_register(GPIO0 + gpio, 0x00, 0x10); -}; - -static int pmic_gpio_direction_input(struct gpio_chip *chip, unsigned offset) -{ - if (offset >= 8) { - pr_err("only pin 0-7 support input\n"); - return -1;/* we only have 8 GPIO can use as input */ - } - return intel_scu_ipc_update_register(GPIO0 + offset, - GPIO_DIR, GPIO_DIR); -} - -static int pmic_gpio_direction_output(struct gpio_chip *chip, - unsigned offset, int value) -{ - int rc = 0; - - if (offset < 8)/* it is GPIO */ - rc = intel_scu_ipc_update_register(GPIO0 + offset, - GPIO_DRV | (value ? GPIO_DOU : 0), - GPIO_DRV | GPIO_DOU | GPIO_DIR); - else if (offset < 16)/* it is GPOSW */ - rc = intel_scu_ipc_update_register(GPOSWCTL0 + offset - 8, - GPOSW_DRV | (value ? GPOSW_DOU : 0), - GPOSW_DRV | GPOSW_DOU | GPOSW_RDRV); - else if (offset > 15 && offset < 24)/* it is GPO */ - rc = intel_scu_ipc_update_register(GPO, - value ? 1 << (offset - 16) : 0, - 1 << (offset - 16)); - else { - pr_err("invalid PMIC GPIO pin %d!\n", offset); - WARN_ON(1); - } - - return rc; -} - -static int pmic_gpio_get(struct gpio_chip *chip, unsigned offset) -{ - u8 r; - int ret; - - /* we only have 8 GPIO pins we can use as input */ - if (offset >= 8) - return -EOPNOTSUPP; - ret = intel_scu_ipc_ioread8(GPIO0 + offset, &r); - if (ret < 0) - return ret; - return r & GPIO_DIN; -} - -static void pmic_gpio_set(struct gpio_chip *chip, unsigned offset, int value) -{ - if (offset < 8)/* it is GPIO */ - intel_scu_ipc_update_register(GPIO0 + offset, - GPIO_DRV | (value ? GPIO_DOU : 0), - GPIO_DRV | GPIO_DOU); - else if (offset < 16)/* it is GPOSW */ - intel_scu_ipc_update_register(GPOSWCTL0 + offset - 8, - GPOSW_DRV | (value ? GPOSW_DOU : 0), - GPOSW_DRV | GPOSW_DOU | GPOSW_RDRV); - else if (offset > 15 && offset < 24) /* it is GPO */ - intel_scu_ipc_update_register(GPO, - value ? 1 << (offset - 16) : 0, - 1 << (offset - 16)); -} - -/* - * This is called from genirq with pg->buslock locked and - * irq_desc->lock held. We can not access the scu bus here, so we - * store the change and update in the bus_sync_unlock() function below - */ -static int pmic_irq_type(struct irq_data *data, unsigned type) -{ - struct pmic_gpio *pg = irq_data_get_irq_chip_data(data); - u32 gpio = data->irq - pg->irq_base; - - if (gpio >= pg->chip.ngpio) - return -EINVAL; - - pg->trigger_type = type; - pg->update_type = gpio | GPIO_UPDATE_TYPE; - return 0; -} - -static int pmic_gpio_to_irq(struct gpio_chip *chip, unsigned offset) -{ - struct pmic_gpio *pg = gpiochip_get_data(chip); - - return pg->irq_base + offset; -} - -static void pmic_bus_lock(struct irq_data *data) -{ - struct pmic_gpio *pg = irq_data_get_irq_chip_data(data); - - mutex_lock(&pg->buslock); -} - -static void pmic_bus_sync_unlock(struct irq_data *data) -{ - struct pmic_gpio *pg = irq_data_get_irq_chip_data(data); - - if (pg->update_type) { - unsigned int gpio = pg->update_type & ~GPIO_UPDATE_TYPE; - - pmic_program_irqtype(gpio, pg->trigger_type); - pg->update_type = 0; - } - mutex_unlock(&pg->buslock); -} - -/* the gpiointr register is read-clear, so just do nothing. */ -static void pmic_irq_unmask(struct irq_data *data) { } - -static void pmic_irq_mask(struct irq_data *data) { } - -static struct irq_chip pmic_irqchip = { - .name = "PMIC-GPIO", - .irq_mask = pmic_irq_mask, - .irq_unmask = pmic_irq_unmask, - .irq_set_type = pmic_irq_type, - .irq_bus_lock = pmic_bus_lock, - .irq_bus_sync_unlock = pmic_bus_sync_unlock, -}; - -static irqreturn_t pmic_irq_handler(int irq, void *data) -{ - struct pmic_gpio *pg = data; - u8 intsts = *((u8 *)pg->gpiointr + 4); - int gpio; - irqreturn_t ret = IRQ_NONE; - - for (gpio = 0; gpio < 8; gpio++) { - if (intsts & (1 << gpio)) { - pr_debug("pmic pin %d triggered\n", gpio); - generic_handle_irq(pg->irq_base + gpio); - ret = IRQ_HANDLED; - } - } - return ret; -} - -static int platform_pmic_gpio_probe(struct platform_device *pdev) -{ - struct device *dev = &pdev->dev; - int irq = platform_get_irq(pdev, 0); - struct intel_pmic_gpio_platform_data *pdata = dev->platform_data; - - struct pmic_gpio *pg; - int retval; - int i; - - if (irq < 0) { - dev_dbg(dev, "no IRQ line\n"); - return -EINVAL; - } - - if (!pdata || !pdata->gpio_base || !pdata->irq_base) { - dev_dbg(dev, "incorrect or missing platform data\n"); - return -EINVAL; - } - - pg = kzalloc(sizeof(*pg), GFP_KERNEL); - if (!pg) - return -ENOMEM; - - dev_set_drvdata(dev, pg); - - pg->irq = irq; - /* setting up SRAM mapping for GPIOINT register */ - pg->gpiointr = ioremap_nocache(pdata->gpiointr, 8); - if (!pg->gpiointr) { - pr_err("Can not map GPIOINT\n"); - retval = -EINVAL; - goto err2; - } - pg->irq_base = pdata->irq_base; - pg->chip.label = "intel_pmic"; - pg->chip.direction_input = pmic_gpio_direction_input; - pg->chip.direction_output = pmic_gpio_direction_output; - pg->chip.get = pmic_gpio_get; - pg->chip.set = pmic_gpio_set; - pg->chip.to_irq = pmic_gpio_to_irq; - pg->chip.base = pdata->gpio_base; - pg->chip.ngpio = NUM_GPIO; - pg->chip.can_sleep = 1; - pg->chip.parent = dev; - - mutex_init(&pg->buslock); - - pg->chip.parent = dev; - retval = gpiochip_add_data(&pg->chip, pg); - if (retval) { - pr_err("Can not add pmic gpio chip\n"); - goto err; - } - - retval = request_irq(pg->irq, pmic_irq_handler, 0, "pmic", pg); - if (retval) { - pr_warn("Interrupt request failed\n"); - goto fail_request_irq; - } - - for (i = 0; i < 8; i++) { - irq_set_chip_and_handler_name(i + pg->irq_base, - &pmic_irqchip, - handle_simple_irq, - "demux"); - irq_set_chip_data(i + pg->irq_base, pg); - } - return 0; - -fail_request_irq: - gpiochip_remove(&pg->chip); -err: - iounmap(pg->gpiointr); -err2: - kfree(pg); - return retval; -} - -/* at the same time, register a platform driver - * this supports the sfi 0.81 fw */ -static struct platform_driver platform_pmic_gpio_driver = { - .driver = { - .name = DRIVER_NAME, - }, - .probe = platform_pmic_gpio_probe, -}; - -static int __init platform_pmic_gpio_init(void) -{ - return platform_driver_register(&platform_pmic_gpio_driver); -} -subsys_initcall(platform_pmic_gpio_init); diff --git a/drivers/scsi/bnx2fc/bnx2fc_io.c b/drivers/scsi/bnx2fc/bnx2fc_io.c index f501095f91ac..898461b146cc 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_io.c +++ b/drivers/scsi/bnx2fc/bnx2fc_io.c @@ -74,7 +74,7 @@ static void bnx2fc_cmd_timeout(struct work_struct *work) &io_req->req_flags)) { /* Handle internally generated ABTS timeout */ BNX2FC_IO_DBG(io_req, "ABTS timed out refcnt = %d\n", - io_req->refcount.refcount.counter); + kref_read(&io_req->refcount)); if (!(test_and_set_bit(BNX2FC_FLAG_ABTS_DONE, &io_req->req_flags))) { /* @@ -1141,7 +1141,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd) return SUCCESS; } BNX2FC_IO_DBG(io_req, "eh_abort - refcnt = %d\n", - io_req->refcount.refcount.counter); + kref_read(&io_req->refcount)); /* Hold IO request across abort processing */ kref_get(&io_req->refcount); @@ -1299,7 +1299,7 @@ void bnx2fc_process_cleanup_compl(struct bnx2fc_cmd *io_req, { BNX2FC_IO_DBG(io_req, "Entered process_cleanup_compl " "refcnt = %d, cmd_type = %d\n", - io_req->refcount.refcount.counter, io_req->cmd_type); + kref_read(&io_req->refcount), io_req->cmd_type); bnx2fc_scsi_done(io_req, DID_ERROR); kref_put(&io_req->refcount, bnx2fc_cmd_release); if (io_req->wait_for_comp) @@ -1318,7 +1318,7 @@ void bnx2fc_process_abts_compl(struct bnx2fc_cmd *io_req, BNX2FC_IO_DBG(io_req, "Entered process_abts_compl xid = 0x%x" "refcnt = %d, cmd_type = %d\n", io_req->xid, - io_req->refcount.refcount.counter, io_req->cmd_type); + kref_read(&io_req->refcount), io_req->cmd_type); if (test_and_set_bit(BNX2FC_FLAG_ABTS_DONE, &io_req->req_flags)) { diff --git a/drivers/scsi/cxgbi/libcxgbi.h b/drivers/scsi/cxgbi/libcxgbi.h index 95ba99044c3e..18e0ea83d361 100644 --- a/drivers/scsi/cxgbi/libcxgbi.h +++ b/drivers/scsi/cxgbi/libcxgbi.h @@ -301,7 +301,7 @@ static inline void __cxgbi_sock_put(const char *fn, struct cxgbi_sock *csk) { log_debug(1 << CXGBI_DBG_SOCK, "%s, put csk 0x%p, ref %u-1.\n", - fn, csk, atomic_read(&csk->refcnt.refcount)); + fn, csk, kref_read(&csk->refcnt)); kref_put(&csk->refcnt, cxgbi_sock_free); } #define cxgbi_sock_put(csk) __cxgbi_sock_put(__func__, csk) @@ -310,7 +310,7 @@ static inline void __cxgbi_sock_get(const char *fn, struct cxgbi_sock *csk) { log_debug(1 << CXGBI_DBG_SOCK, "%s, get csk 0x%p, ref %u+1.\n", - fn, csk, atomic_read(&csk->refcnt.refcount)); + fn, csk, kref_read(&csk->refcnt)); kref_get(&csk->refcnt); } #define cxgbi_sock_get(csk) __cxgbi_sock_get(__func__, csk) diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index a63542bac153..caa7a7b0ec53 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -607,7 +607,7 @@ lpfc_debugfs_nodelist_data(struct lpfc_vport *vport, char *buf, int size) len += snprintf(buf+len, size-len, "usgmap:%x ", ndlp->nlp_usg_map); len += snprintf(buf+len, size-len, "refcnt:%x", - atomic_read(&ndlp->kref.refcount)); + kref_read(&ndlp->kref)); len += snprintf(buf+len, size-len, "\n"); } spin_unlock_irq(shost->host_lock); diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 7b6bd8ed0d0b..63bef4566548 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -3690,7 +3690,7 @@ lpfc_mbx_cmpl_dflt_rpi(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb) lpfc_printf_vlog(ndlp->vport, KERN_INFO, LOG_NODE, "0006 rpi%x DID:%x flg:%x %d map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag, - atomic_read(&ndlp->kref.refcount), + kref_read(&ndlp->kref), ndlp->nlp_usg_map, ndlp); if (NLP_CHK_NODE_ACT(ndlp)) { lpfc_nlp_put(ndlp); diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index ed223937798a..82047070cdc9 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -3440,7 +3440,7 @@ lpfc_mbx_cmpl_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb) lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI, "0002 rpi:%x DID:%x flg:%x %d map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag, - atomic_read(&ndlp->kref.refcount), + kref_read(&ndlp->kref), ndlp->nlp_usg_map, ndlp); if (ndlp->nlp_flag & NLP_REG_LOGIN_SEND) ndlp->nlp_flag &= ~NLP_REG_LOGIN_SEND; @@ -3861,7 +3861,7 @@ out: lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI, "0003 rpi:%x DID:%x flg:%x %d map%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag, - atomic_read(&ndlp->kref.refcount), + kref_read(&ndlp->kref), ndlp->nlp_usg_map, ndlp); if (vport->port_state < LPFC_VPORT_READY) { @@ -4238,7 +4238,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, "0277 lpfc_enable_node: ndlp:x%p " "usgmap:x%x refcnt:%d\n", (void *)ndlp, ndlp->nlp_usg_map, - atomic_read(&ndlp->kref.refcount)); + kref_read(&ndlp->kref)); return NULL; } /* The ndlp should not already be in active mode */ @@ -4248,7 +4248,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, "0278 lpfc_enable_node: ndlp:x%p " "usgmap:x%x refcnt:%d\n", (void *)ndlp, ndlp->nlp_usg_map, - atomic_read(&ndlp->kref.refcount)); + kref_read(&ndlp->kref)); return NULL; } @@ -4272,7 +4272,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, "0008 rpi:%x DID:%x flg:%x refcnt:%d " "map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag, - atomic_read(&ndlp->kref.refcount), + kref_read(&ndlp->kref), ndlp->nlp_usg_map, ndlp); } @@ -4546,7 +4546,7 @@ lpfc_unreg_rpi(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) (bf_get(lpfc_sli_intf_if_type, &phba->sli4_hba.sli_intf) == LPFC_SLI_INTF_IF_TYPE_2) && - (atomic_read(&ndlp->kref.refcount) > 0)) { + (kref_read(&ndlp->kref) > 0)) { mbox->context1 = lpfc_nlp_get(ndlp); mbox->mbox_cmpl = lpfc_sli4_unreg_rpi_cmpl_clr; @@ -4695,14 +4695,14 @@ lpfc_cleanup_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) "0280 lpfc_cleanup_node: ndlp:x%p " "usgmap:x%x refcnt:%d\n", (void *)ndlp, ndlp->nlp_usg_map, - atomic_read(&ndlp->kref.refcount)); + kref_read(&ndlp->kref)); lpfc_dequeue_node(vport, ndlp); } else { lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE, "0281 lpfc_cleanup_node: ndlp:x%p " "usgmap:x%x refcnt:%d\n", (void *)ndlp, ndlp->nlp_usg_map, - atomic_read(&ndlp->kref.refcount)); + kref_read(&ndlp->kref)); lpfc_disable_node(vport, ndlp); } @@ -4791,7 +4791,7 @@ lpfc_nlp_remove(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE, "0005 rpi:%x DID:%x flg:%x %d map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag, - atomic_read(&ndlp->kref.refcount), + kref_read(&ndlp->kref), ndlp->nlp_usg_map, ndlp); if ((mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL)) != NULL) { @@ -5557,7 +5557,7 @@ lpfc_mbx_cmpl_fdmi_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb) lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI, "0004 rpi:%x DID:%x flg:%x %d map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag, - atomic_read(&ndlp->kref.refcount), + kref_read(&ndlp->kref), ndlp->nlp_usg_map, ndlp); /* * Start issuing Fabric-Device Management Interface (FDMI) command to @@ -5728,7 +5728,7 @@ lpfc_nlp_init(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, "0007 rpi:%x DID:%x flg:%x refcnt:%d " "map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag, - atomic_read(&ndlp->kref.refcount), + kref_read(&ndlp->kref), ndlp->nlp_usg_map, ndlp); ndlp->active_rrqs_xri_bitmap = @@ -5767,7 +5767,7 @@ lpfc_nlp_release(struct kref *kref) "0279 lpfc_nlp_release: ndlp:x%p did %x " "usgmap:x%x refcnt:%d rpi:%x\n", (void *)ndlp, ndlp->nlp_DID, ndlp->nlp_usg_map, - atomic_read(&ndlp->kref.refcount), ndlp->nlp_rpi); + kref_read(&ndlp->kref), ndlp->nlp_rpi); /* remove ndlp from action. */ lpfc_nlp_remove(ndlp->vport, ndlp); @@ -5804,7 +5804,7 @@ lpfc_nlp_get(struct lpfc_nodelist *ndlp) lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE, "node get: did:x%x flg:x%x refcnt:x%x", ndlp->nlp_DID, ndlp->nlp_flag, - atomic_read(&ndlp->kref.refcount)); + kref_read(&ndlp->kref)); /* The check of ndlp usage to prevent incrementing the * ndlp reference count that is in the process of being * released. @@ -5817,7 +5817,7 @@ lpfc_nlp_get(struct lpfc_nodelist *ndlp) "0276 lpfc_nlp_get: ndlp:x%p " "usgmap:x%x refcnt:%d\n", (void *)ndlp, ndlp->nlp_usg_map, - atomic_read(&ndlp->kref.refcount)); + kref_read(&ndlp->kref)); return NULL; } else kref_get(&ndlp->kref); @@ -5844,7 +5844,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp) lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE, "node put: did:x%x flg:x%x refcnt:x%x", ndlp->nlp_DID, ndlp->nlp_flag, - atomic_read(&ndlp->kref.refcount)); + kref_read(&ndlp->kref)); phba = ndlp->phba; spin_lock_irqsave(&phba->ndlp_lock, flags); /* Check the ndlp memory free acknowledge flag to avoid the @@ -5857,7 +5857,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp) "0274 lpfc_nlp_put: ndlp:x%p " "usgmap:x%x refcnt:%d\n", (void *)ndlp, ndlp->nlp_usg_map, - atomic_read(&ndlp->kref.refcount)); + kref_read(&ndlp->kref)); return 1; } /* Check the ndlp inactivate log flag to avoid the possible @@ -5870,7 +5870,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp) "0275 lpfc_nlp_put: ndlp:x%p " "usgmap:x%x refcnt:%d\n", (void *)ndlp, ndlp->nlp_usg_map, - atomic_read(&ndlp->kref.refcount)); + kref_read(&ndlp->kref)); return 1; } /* For last put, mark the ndlp usage flags to make sure no @@ -5878,7 +5878,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp) * in between the process when the final kref_put has been * invoked on this ndlp. */ - if (atomic_read(&ndlp->kref.refcount) == 1) { + if (kref_read(&ndlp->kref) == 1) { /* Indicate ndlp is put to inactive state. */ NLP_SET_IACT_REQ(ndlp); /* Acknowledge ndlp memory free has been seen. */ @@ -5906,8 +5906,8 @@ lpfc_nlp_not_used(struct lpfc_nodelist *ndlp) lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE, "node not used: did:x%x flg:x%x refcnt:x%x", ndlp->nlp_DID, ndlp->nlp_flag, - atomic_read(&ndlp->kref.refcount)); - if (atomic_read(&ndlp->kref.refcount) == 1) + kref_read(&ndlp->kref)); + if (kref_read(&ndlp->kref) == 1) if (lpfc_nlp_put(ndlp)) return 1; return 0; diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 4776fd85514f..64717c171b15 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -2660,8 +2660,7 @@ lpfc_cleanup(struct lpfc_vport *vport) "usgmap:x%x refcnt:%d\n", ndlp->nlp_DID, (void *)ndlp, ndlp->nlp_usg_map, - atomic_read( - &ndlp->kref.refcount)); + kref_read(&ndlp->kref)); } break; } diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c index d925910be761..3084983c1287 100644 --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c @@ -371,7 +371,7 @@ static int tcm_qla2xxx_write_pending(struct se_cmd *se_cmd) */ pr_debug("write_pending aborted cmd[%p] refcount %d " "transport_state %x, t_state %x, se_cmd_flags %x\n", - cmd,cmd->se_cmd.cmd_kref.refcount.counter, + cmd, kref_read(&cmd->se_cmd.cmd_kref), cmd->se_cmd.transport_state, cmd->se_cmd.t_state, cmd->se_cmd.se_cmd_flags); @@ -584,7 +584,7 @@ static int tcm_qla2xxx_queue_data_in(struct se_cmd *se_cmd) */ pr_debug("queue_data_in aborted cmd[%p] refcount %d " "transport_state %x, t_state %x, se_cmd_flags %x\n", - cmd,cmd->se_cmd.cmd_kref.refcount.counter, + cmd, kref_read(&cmd->se_cmd.cmd_kref), cmd->se_cmd.transport_state, cmd->se_cmd.t_state, cmd->se_cmd.se_cmd_flags); diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index b653451843c8..937c2d5d7ec3 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -1300,7 +1300,7 @@ static int ion_debug_heap_show(struct seq_file *s, void *unused) seq_printf(s, "%16s %16u %16zu %d %d\n", buffer->task_comm, buffer->pid, buffer->size, buffer->kmap_cnt, - atomic_read(&buffer->ref.refcount)); + kref_read(&buffer->ref)); total_orphaned_size += buffer->size; } } diff --git a/drivers/staging/comedi/comedi_buf.c b/drivers/staging/comedi/comedi_buf.c index c7d7682b1412..1e1df89b5018 100644 --- a/drivers/staging/comedi/comedi_buf.c +++ b/drivers/staging/comedi/comedi_buf.c @@ -188,7 +188,7 @@ bool comedi_buf_is_mmapped(struct comedi_subdevice *s) { struct comedi_buf_map *bm = s->async->buf_map; - return bm && (atomic_read(&bm->refcount.refcount) > 1); + return bm && (kref_read(&bm->refcount) > 1); } int comedi_buf_alloc(struct comedi_device *dev, struct comedi_subdevice *s, diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c index 39a72e3f0c18..7035356e56b3 100644 --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c @@ -107,7 +107,7 @@ void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata) libcfs_debug_dumplog(); if (libcfs_panic_on_lbug) panic("LBUG"); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); while (1) schedule(); } diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index d761025144f9..e18051185846 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -788,7 +788,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( * __core_scsi3_add_registration() */ dest_lun = rcu_dereference_check(deve_tmp->se_lun, - atomic_read(&deve_tmp->pr_kref.refcount) != 0); + kref_read(&deve_tmp->pr_kref) != 0); pr_reg_atp = __core_scsi3_do_alloc_registration(dev, nacl_tmp, dest_lun, deve_tmp, @@ -1463,7 +1463,7 @@ static int core_scsi3_lunacl_depend_item(struct se_dev_entry *se_deve) * For nacl->dynamic_node_acl=1 */ lun_acl = rcu_dereference_check(se_deve->se_lun_acl, - atomic_read(&se_deve->pr_kref.refcount) != 0); + kref_read(&se_deve->pr_kref) != 0); if (!lun_acl) return 0; @@ -1478,7 +1478,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) * For nacl->dynamic_node_acl=1 */ lun_acl = rcu_dereference_check(se_deve->se_lun_acl, - atomic_read(&se_deve->pr_kref.refcount) != 0); + kref_read(&se_deve->pr_kref) != 0); if (!lun_acl) { kref_put(&se_deve->pr_kref, target_pr_kref_release); return; @@ -1759,7 +1759,7 @@ core_scsi3_decode_spec_i_port( * 2nd loop which will never fail. */ dest_lun = rcu_dereference_check(dest_se_deve->se_lun, - atomic_read(&dest_se_deve->pr_kref.refcount) != 0); + kref_read(&dest_se_deve->pr_kref) != 0); dest_pr_reg = __core_scsi3_alloc_registration(cmd->se_dev, dest_node_acl, dest_lun, dest_se_deve, @@ -3466,7 +3466,7 @@ after_iport_check: iport_ptr); if (!dest_pr_reg) { struct se_lun *dest_lun = rcu_dereference_check(dest_se_deve->se_lun, - atomic_read(&dest_se_deve->pr_kref.refcount) != 0); + kref_read(&dest_se_deve->pr_kref) != 0); spin_unlock(&dev->dev_reservation_lock); if (core_scsi3_alloc_registration(cmd->se_dev, dest_node_acl, diff --git a/drivers/target/tcm_fc/tfc_sess.c b/drivers/target/tcm_fc/tfc_sess.c index fd5c3de79470..c91979c1463d 100644 --- a/drivers/target/tcm_fc/tfc_sess.c +++ b/drivers/target/tcm_fc/tfc_sess.c @@ -454,7 +454,7 @@ static void ft_sess_free(struct kref *kref) void ft_sess_put(struct ft_sess *sess) { - int sess_held = atomic_read(&sess->kref.refcount); + int sess_held = kref_read(&sess->kref); BUG_ON(!sess_held); kref_put(&sess->kref, ft_sess_free); diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c index 1bf8ed13f827..9229de43e19d 100644 --- a/drivers/tty/tty_ldsem.c +++ b/drivers/tty/tty_ldsem.c @@ -200,7 +200,6 @@ static struct ld_semaphore __sched * down_read_failed(struct ld_semaphore *sem, long count, long timeout) { struct ldsem_waiter waiter; - struct task_struct *tsk = current; long adjust = -LDSEM_ACTIVE_BIAS + LDSEM_WAIT_BIAS; /* set up my own style of waitqueue */ @@ -221,8 +220,8 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout) list_add_tail(&waiter.list, &sem->read_wait); sem->wait_readers++; - waiter.task = tsk; - get_task_struct(tsk); + waiter.task = current; + get_task_struct(current); /* if there are no active locks, wake the new lock owner(s) */ if ((count & LDSEM_ACTIVE_MASK) == 0) @@ -232,7 +231,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout) /* wait to be given the lock */ for (;;) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!waiter.task) break; @@ -241,7 +240,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout) timeout = schedule_timeout(timeout); } - __set_task_state(tsk, TASK_RUNNING); + __set_current_state(TASK_RUNNING); if (!timeout) { /* lock timed out but check if this task was just @@ -268,7 +267,6 @@ static struct ld_semaphore __sched * down_write_failed(struct ld_semaphore *sem, long count, long timeout) { struct ldsem_waiter waiter; - struct task_struct *tsk = current; long adjust = -LDSEM_ACTIVE_BIAS; int locked = 0; @@ -289,16 +287,16 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout) list_add_tail(&waiter.list, &sem->write_wait); - waiter.task = tsk; + waiter.task = current; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); for (;;) { if (!timeout) break; raw_spin_unlock_irq(&sem->wait_lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->wait_lock); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); locked = writer_trylock(sem); if (locked) break; @@ -309,7 +307,7 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout) list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - __set_task_state(tsk, TASK_RUNNING); + __set_current_state(TASK_RUNNING); /* lock wait may have timed out */ if (!locked) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index fd80c1b9c823..e6a17455adac 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -3698,7 +3698,7 @@ static void ffs_closed(struct ffs_data *ffs) goto done; if (opts->no_configfs || !opts->func_inst.group.cg_item.ci_parent - || !atomic_read(&opts->func_inst.group.cg_item.ci_kref.refcount)) + || !kref_read(&opts->func_inst.group.cg_item.ci_kref)) goto done; ci = opts->func_inst.group.cg_item.ci_parent->ci_parent; diff --git a/drivers/usb/mon/mon_main.c b/drivers/usb/mon/mon_main.c index 33ff49c4cea4..46847340b819 100644 --- a/drivers/usb/mon/mon_main.c +++ b/drivers/usb/mon/mon_main.c @@ -409,7 +409,7 @@ static void __exit mon_exit(void) printk(KERN_ERR TAG ": Outstanding opens (%d) on usb%d, leaking...\n", mbus->nreaders, mbus->u_bus->busnum); - atomic_set(&mbus->ref.refcount, 2); /* Force leak */ + kref_get(&mbus->ref); /* Force leak */ } mon_dissolve(mbus, mbus->u_bus); diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c index 5e6a2c0a1f0b..1f7d5e46cdda 100644 --- a/fs/exofs/sys.c +++ b/fs/exofs/sys.c @@ -122,7 +122,7 @@ void exofs_sysfs_dbg_print(void) list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { printk(KERN_INFO "%s: name %s ref %d\n", __func__, kobject_name(k_name), - (int)atomic_read(&k_name->kref.refcount)); + (int)kref_read(&k_name->kref)); } #endif } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 91307940c8ac..052f8d3c41cb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -256,7 +256,7 @@ struct fuse_io_priv { #define FUSE_IO_PRIV_SYNC(f) \ { \ - .refcnt = { ATOMIC_INIT(1) }, \ + .refcnt = KREF_INIT(1), \ .async = 0, \ .file = f, \ } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8c514367ba5a..b6b194ec1b4f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -393,7 +393,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior jbd2_journal_flush? */ if (journal->j_flags & JBD2_FLUSHED) { jbd_debug(3, "super block updated\n"); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); /* * We hold j_checkpoint_mutex so tail cannot change under us. * We don't need any special data guarantees for writing sb diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 85d1483939b2..a1a359bfcc9c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -944,7 +944,7 @@ out: */ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) { - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); if (tid_gt(tid, journal->j_tail_sequence)) __jbd2_update_log_tail(journal, tid, block); mutex_unlock(&journal->j_checkpoint_mutex); @@ -1304,7 +1304,7 @@ static int journal_reset(journal_t *journal) journal->j_flags |= JBD2_FLUSHED; } else { /* Lock here to make assertions happy... */ - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); /* * Update log tail information. We use REQ_FUA since new * transaction will start reusing journal space and so we @@ -1691,7 +1691,7 @@ int jbd2_journal_destroy(journal_t *journal) spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); err = jbd2_log_do_checkpoint(journal); mutex_unlock(&journal->j_checkpoint_mutex); /* @@ -1713,7 +1713,7 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); journal->j_tail_sequence = @@ -1955,7 +1955,7 @@ int jbd2_journal_flush(journal_t *journal) spin_lock(&journal->j_list_lock); while (!err && journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); err = jbd2_log_do_checkpoint(journal); mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); @@ -1965,7 +1965,7 @@ int jbd2_journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); if (!err) { err = jbd2_cleanup_journal_tail(journal); if (err < 0) { diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 27d1242c8383..564c504d6efd 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -349,7 +349,7 @@ static void sc_show_sock_container(struct seq_file *seq, " func key: 0x%08x\n" " func type: %u\n", sc, - atomic_read(&sc->sc_kref.refcount), + kref_read(&sc->sc_kref), &saddr, inet ? ntohs(sport) : 0, &daddr, inet ? ntohs(dport) : 0, sc->sc_node->nd_name, diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d4b5c81f0445..ec000575e863 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -97,7 +97,7 @@ typeof(sc) __sc = (sc); \ mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ "pg_off %zu] " fmt, __sc, \ - atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ + kref_read(&__sc->sc_kref), __sc->sc_sock, \ __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ ##args); \ } while (0) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index e7b760deefae..9b984cae4c4e 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -81,7 +81,7 @@ static void __dlm_print_lock(struct dlm_lock *lock) lock->ml.type, lock->ml.convert_type, lock->ml.node, dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - atomic_read(&lock->lock_refs.refcount), + kref_read(&lock->lock_refs), (list_empty(&lock->ast_list) ? 'y' : 'n'), (lock->ast_pending ? 'y' : 'n'), (list_empty(&lock->bast_list) ? 'y' : 'n'), @@ -106,7 +106,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) printk("lockres: %s, owner=%u, state=%u\n", buf, res->owner, res->state); printk(" last used: %lu, refcnt: %u, on purge list: %s\n", - res->last_used, atomic_read(&res->refs.refcount), + res->last_used, kref_read(&res->refs), list_empty(&res->purge) ? "no" : "yes"); printk(" on dirty list: %s, on reco list: %s, " "migrating pending: %s\n", @@ -298,7 +298,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) mle_type, mle->master, mle->new_master, !list_empty(&mle->hb_events), !!mle->inuse, - atomic_read(&mle->mle_refs.refcount)); + kref_read(&mle->mle_refs)); out += snprintf(buf + out, len - out, "Maybe="); out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES, @@ -494,7 +494,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) lock->ast_pending, lock->bast_pending, lock->convert_pending, lock->lock_pending, lock->cancel_pending, lock->unlock_pending, - atomic_read(&lock->lock_refs.refcount)); + kref_read(&lock->lock_refs)); spin_unlock(&lock->spinlock); return out; @@ -521,7 +521,7 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) !list_empty(&res->recovering), res->inflight_locks, res->migration_pending, atomic_read(&res->asts_reserved), - atomic_read(&res->refs.refcount)); + kref_read(&res->refs)); /* refmap */ out += snprintf(buf + out, len - out, "RMAP:"); @@ -777,7 +777,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) /* Purge Count: xxx Refs: xxx */ out += snprintf(buf + out, len - out, "Purge Count: %d Refs: %d\n", dlm->purge_count, - atomic_read(&dlm->dlm_refs.refcount)); + kref_read(&dlm->dlm_refs)); /* Dead Node: xxx */ out += snprintf(buf + out, len - out, diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 733e4e79c8e2..32fd261ae13d 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -2072,7 +2072,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks); mlog(0, "context init: refcount %u\n", - atomic_read(&dlm->dlm_refs.refcount)); + kref_read(&dlm->dlm_refs)); leave: if (ret < 0 && dlm) { diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a464c8088170..7025d8c27999 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -233,7 +233,7 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle) assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); - if (!atomic_read(&mle->mle_refs.refcount)) { + if (!kref_read(&mle->mle_refs)) { /* this may or may not crash, but who cares. * it's a BUG. */ mlog(ML_ERROR, "bad mle: %p\n", mle); @@ -1124,9 +1124,9 @@ recheck: unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); /* - if (atomic_read(&mle->mle_refs.refcount) < 2) + if (kref_read(&mle->mle_refs) < 2) mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, - atomic_read(&mle->mle_refs.refcount), + kref_read(&mle->mle_refs), res->lockname.len, res->lockname.name); */ atomic_set(&mle->woken, 0); @@ -1979,7 +1979,7 @@ ok: * on this mle. */ spin_lock(&dlm->master_lock); - rr = atomic_read(&mle->mle_refs.refcount); + rr = kref_read(&mle->mle_refs); if (mle->inuse > 0) { if (extra_ref && rr < 3) err = 1; diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 1082b2c3014b..63d701cd1e2e 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -251,7 +251,7 @@ leave: mlog(0, "lock %u:%llu should be gone now! refs=%d\n", dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - atomic_read(&lock->lock_refs.refcount)-1); + kref_read(&lock->lock_refs)-1); dlm_lock_put(lock); } if (actions & DLM_UNLOCK_CALL_AST) diff --git a/fs/proc/base.c b/fs/proc/base.c index c147f10fcaa5..3d773eb9e144 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2179,7 +2179,7 @@ static const struct file_operations proc_map_files_operations = { .llseek = generic_file_llseek, }; -#ifdef CONFIG_CHECKPOINT_RESTORE +#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { struct pid *pid; struct task_struct *task; @@ -2941,7 +2941,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif -#ifdef CONFIG_CHECKPOINT_RESTORE +#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) REG("timers", S_IRUGO, proc_timers_operations), #endif REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h index 5be122e3d326..6c6a2141f271 100644 --- a/include/asm-generic/rwsem.h +++ b/include/asm-generic/rwsem.h @@ -33,7 +33,7 @@ */ static inline void __down_read(struct rw_semaphore *sem) { - if (unlikely(atomic_long_inc_return_acquire((atomic_long_t *)&sem->count) <= 0)) + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) rwsem_down_read_failed(sem); } @@ -58,7 +58,7 @@ static inline void __down_write(struct rw_semaphore *sem) long tmp; tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_long_t *)&sem->count); + &sem->count); if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) rwsem_down_write_failed(sem); } @@ -68,7 +68,7 @@ static inline int __down_write_killable(struct rw_semaphore *sem) long tmp; tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_long_t *)&sem->count); + &sem->count); if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) if (IS_ERR(rwsem_down_write_failed_killable(sem))) return -EINTR; @@ -91,7 +91,7 @@ static inline void __up_read(struct rw_semaphore *sem) { long tmp; - tmp = atomic_long_dec_return_release((atomic_long_t *)&sem->count); + tmp = atomic_long_dec_return_release(&sem->count); if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) rwsem_wake(sem); } @@ -102,7 +102,7 @@ static inline void __up_read(struct rw_semaphore *sem) static inline void __up_write(struct rw_semaphore *sem) { if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_long_t *)&sem->count) < 0)) + &sem->count) < 0)) rwsem_wake(sem); } @@ -120,8 +120,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * read-locked region is ok to be re-ordered into the * write side. As such, rely on RELEASE semantics. */ - tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, - (atomic_long_t *)&sem->count); + tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); if (tmp < 0) rwsem_downgrade_wake(sem); } diff --git a/include/drm/drm_framebuffer.h b/include/drm/drm_framebuffer.h index 04c77eee9c20..dd1e3e99dcff 100644 --- a/include/drm/drm_framebuffer.h +++ b/include/drm/drm_framebuffer.h @@ -233,7 +233,7 @@ static inline void drm_framebuffer_unreference(struct drm_framebuffer *fb) */ static inline uint32_t drm_framebuffer_read_refcount(struct drm_framebuffer *fb) { - return atomic_read(&fb->base.refcount.refcount); + return kref_read(&fb->base.refcount); } /** diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index c356df40ac49..8f619f499e55 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -334,19 +334,6 @@ extern int ttm_bo_validate(struct ttm_buffer_object *bo, */ extern void ttm_bo_unref(struct ttm_buffer_object **bo); - -/** - * ttm_bo_list_ref_sub - * - * @bo: The buffer object. - * @count: The number of references with which to decrease @bo::list_kref; - * @never_free: The refcount should not reach zero with this operation. - * - * Release @count lru list references to this buffer object. - */ -extern void ttm_bo_list_ref_sub(struct ttm_buffer_object *bo, int count, - bool never_free); - /** * ttm_bo_add_to_lru * @@ -369,7 +356,7 @@ extern void ttm_bo_add_to_lru(struct ttm_buffer_object *bo); * and is usually called just immediately after the bo has been reserved to * avoid recursive reservation from lru lists. */ -extern int ttm_bo_del_from_lru(struct ttm_buffer_object *bo); +extern void ttm_bo_del_from_lru(struct ttm_buffer_object *bo); /** * ttm_bo_move_to_lru_tail diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 7e75fa053473..dfb260f7878d 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -871,7 +871,7 @@ static inline int ttm_bo_reserve(struct ttm_buffer_object *bo, { int ret; - WARN_ON(!atomic_read(&bo->kref.refcount)); + WARN_ON(!kref_read(&bo->kref)); ret = __ttm_bo_reserve(bo, interruptible, no_wait, ticket); if (likely(ret == 0)) @@ -896,7 +896,7 @@ static inline int ttm_bo_reserve_slowpath(struct ttm_buffer_object *bo, { int ret = 0; - WARN_ON(!atomic_read(&bo->kref.refcount)); + WARN_ON(!kref_read(&bo->kref)); if (interruptible) ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock, diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index e315d04a2fd9..cfc75848a35d 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -62,6 +62,8 @@ struct module; * @archdata: arch-specific data * @suspend: suspend function for the clocksource, if necessary * @resume: resume function for the clocksource, if necessary + * @mark_unstable: Optional function to inform the clocksource driver that + * the watchdog marked the clocksource unstable * @owner: module reference, must be set by clocksource in modules * * Note: This struct is not used in hotpathes of the timekeeping code @@ -93,6 +95,7 @@ struct clocksource { unsigned long flags; void (*suspend)(struct clocksource *cs); void (*resume)(struct clocksource *cs); + void (*mark_unstable)(struct clocksource *cs); /* private: */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG diff --git a/include/linux/delay.h b/include/linux/delay.h index a6ecb34cf547..2ecb3c46b20a 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -5,6 +5,17 @@ * Copyright (C) 1993 Linus Torvalds * * Delay routines, using a pre-computed "loops_per_jiffy" value. + * + * Please note that ndelay(), udelay() and mdelay() may return early for + * several reasons: + * 1. computed loops_per_jiffy too low (due to the time taken to + * execute the timer interrupt.) + * 2. cache behaviour affecting the time it takes to execute the + * loop function. + * 3. CPU clock rate changes. + * + * Please see this thread: + * http://lists.openwall.net/linux-kernel/2011/01/09/56 */ #include <linux/kernel.h> diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 325f649d77ff..3a85d61f7614 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -42,6 +42,27 @@ extern struct fs_struct init_fs; #define INIT_PREV_CPUTIME(x) #endif +#ifdef CONFIG_POSIX_TIMERS +#define INIT_POSIX_TIMERS(s) \ + .posix_timers = LIST_HEAD_INIT(s.posix_timers), +#define INIT_CPU_TIMERS(s) \ + .cpu_timers = { \ + LIST_HEAD_INIT(s.cpu_timers[0]), \ + LIST_HEAD_INIT(s.cpu_timers[1]), \ + LIST_HEAD_INIT(s.cpu_timers[2]), \ + }, +#define INIT_CPUTIMER(s) \ + .cputimer = { \ + .cputime_atomic = INIT_CPUTIME_ATOMIC, \ + .running = false, \ + .checking_timer = false, \ + }, +#else +#define INIT_POSIX_TIMERS(s) +#define INIT_CPU_TIMERS(s) +#define INIT_CPUTIMER(s) +#endif + #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ .thread_head = LIST_HEAD_INIT(init_task.thread_node), \ @@ -49,14 +70,10 @@ extern struct fs_struct init_fs; .shared_pending = { \ .list = LIST_HEAD_INIT(sig.shared_pending.list), \ .signal = {{0}}}, \ - .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ - .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ + INIT_POSIX_TIMERS(sig) \ + INIT_CPU_TIMERS(sig) \ .rlim = INIT_RLIMITS, \ - .cputimer = { \ - .cputime_atomic = INIT_CPUTIME_ATOMIC, \ - .running = false, \ - .checking_timer = false, \ - }, \ + INIT_CPUTIMER(sig) \ INIT_PREV_CPUTIME(sig) \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ @@ -247,7 +264,7 @@ extern struct task_group root_task_group; .blocked = {{0}}, \ .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ .journal_info = NULL, \ - .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + INIT_CPU_TIMERS(tsk) \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ .pids = { \ @@ -274,13 +291,6 @@ extern struct task_group root_task_group; } -#define INIT_CPU_TIMERS(cpu_timers) \ -{ \ - LIST_HEAD_INIT(cpu_timers[0]), \ - LIST_HEAD_INIT(cpu_timers[1]), \ - LIST_HEAD_INIT(cpu_timers[2]), \ -} - /* Attach to the init_task data structure for proper alignment */ #define __init_task_data __attribute__((__section__(".data..init_task"))) diff --git a/include/linux/intel_pmic_gpio.h b/include/linux/intel_pmic_gpio.h deleted file mode 100644 index 920109a29191..000000000000 --- a/include/linux/intel_pmic_gpio.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef LINUX_INTEL_PMIC_H -#define LINUX_INTEL_PMIC_H - -struct intel_pmic_gpio_platform_data { - /* the first IRQ of the chip */ - unsigned irq_base; - /* number assigned to the first GPIO */ - unsigned gpio_base; - /* sram address for gpiointr register, the langwell chip will map - * the PMIC spi GPIO expander's GPIOINTR register in sram. - */ - unsigned gpiointr; -}; - -#endif diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index a0547c571800..b63d6b7b0db0 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -402,6 +402,6 @@ extern bool ____wrong_branch_error(void); #define static_branch_enable(x) static_key_enable(&(x)->key) #define static_branch_disable(x) static_key_disable(&(x)->key) -#endif /* _LINUX_JUMP_LABEL_H */ - #endif /* __ASSEMBLY__ */ + +#endif /* _LINUX_JUMP_LABEL_H */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 00f776816aa3..c3e38ded2d73 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -79,14 +79,17 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) } extern void account_user_time(struct task_struct *, cputime_t); +extern void account_guest_time(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); +extern void account_system_index_time(struct task_struct *, cputime_t, + enum cpu_usage_stat); extern void account_steal_time(cputime_t); extern void account_idle_time(cputime_t); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline void account_process_tick(struct task_struct *tsk, int user) { - vtime_account_user(tsk); + vtime_flush(tsk); } #else extern void account_process_tick(struct task_struct *, int user); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8f6849084248..16ddfb8b304a 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -278,9 +278,13 @@ struct kprobe_insn_cache { int nr_garbage; }; +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c); extern void __free_insn_slot(struct kprobe_insn_cache *c, kprobe_opcode_t *slot, int dirty); +/* sleep-less address checking routine */ +extern bool __is_insn_slot_addr(struct kprobe_insn_cache *c, + unsigned long addr); #define DEFINE_INSN_CACHE_OPS(__name) \ extern struct kprobe_insn_cache kprobe_##__name##_slots; \ @@ -294,6 +298,18 @@ static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\ { \ __free_insn_slot(&kprobe_##__name##_slots, slot, dirty); \ } \ + \ +static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ +{ \ + return __is_insn_slot_addr(&kprobe_##__name##_slots, addr); \ +} +#else /* __ARCH_WANT_KPROBES_INSN_SLOT */ +#define DEFINE_INSN_CACHE_OPS(__name) \ +static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ +{ \ + return 0; \ +} +#endif DEFINE_INSN_CACHE_OPS(insn); @@ -330,7 +346,6 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif - #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, @@ -481,6 +496,19 @@ static inline int enable_jprobe(struct jprobe *jp) return enable_kprobe(&jp->kp); } +#ifndef CONFIG_KPROBES +static inline bool is_kprobe_insn_slot(unsigned long addr) +{ + return false; +} +#endif +#ifndef CONFIG_OPTPROBES +static inline bool is_kprobe_optinsn_slot(unsigned long addr) +{ + return false; +} +#endif + #ifdef CONFIG_KPROBES /* * Blacklist ganerating macro. Specify functions which is not probed diff --git a/include/linux/kref.h b/include/linux/kref.h index 62f0a84ae94e..308263af63ad 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -19,11 +19,14 @@ #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/mutex.h> +#include <linux/spinlock.h> struct kref { atomic_t refcount; }; +#define KREF_INIT(n) { .refcount = ATOMIC_INIT(n), } + /** * kref_init - initialize object. * @kref: object in question. @@ -33,6 +36,11 @@ static inline void kref_init(struct kref *kref) atomic_set(&kref->refcount, 1); } +static inline int kref_read(const struct kref *kref) +{ + return atomic_read(&kref->refcount); +} + /** * kref_get - increment refcount for object. * @kref: object. @@ -47,9 +55,8 @@ static inline void kref_get(struct kref *kref) } /** - * kref_sub - subtract a number of refcounts for object. + * kref_put - decrement refcount for object. * @kref: object. - * @count: Number of recounts to subtract. * @release: pointer to the function that will clean up the object when the * last reference to the object is released. * This pointer is required, and it is not acceptable to pass kfree @@ -58,57 +65,43 @@ static inline void kref_get(struct kref *kref) * maintainer, and anyone else who happens to notice it. You have * been warned. * - * Subtract @count from the refcount, and if 0, call release(). + * Decrement the refcount, and if 0, call release(). * Return 1 if the object was removed, otherwise return 0. Beware, if this * function returns 0, you still can not count on the kref from remaining in * memory. Only use the return value if you want to see if the kref is now * gone, not present. */ -static inline int kref_sub(struct kref *kref, unsigned int count, - void (*release)(struct kref *kref)) +static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) { WARN_ON(release == NULL); - if (atomic_sub_and_test((int) count, &kref->refcount)) { + if (atomic_dec_and_test(&kref->refcount)) { release(kref); return 1; } return 0; } -/** - * kref_put - decrement refcount for object. - * @kref: object. - * @release: pointer to the function that will clean up the object when the - * last reference to the object is released. - * This pointer is required, and it is not acceptable to pass kfree - * in as this function. If the caller does pass kfree to this - * function, you will be publicly mocked mercilessly by the kref - * maintainer, and anyone else who happens to notice it. You have - * been warned. - * - * Decrement the refcount, and if 0, call release(). - * Return 1 if the object was removed, otherwise return 0. Beware, if this - * function returns 0, you still can not count on the kref from remaining in - * memory. Only use the return value if you want to see if the kref is now - * gone, not present. - */ -static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) -{ - return kref_sub(kref, 1, release); -} - static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *lock) { WARN_ON(release == NULL); - if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) { - mutex_lock(lock); - if (unlikely(!atomic_dec_and_test(&kref->refcount))) { - mutex_unlock(lock); - return 0; - } + + if (atomic_dec_and_mutex_lock(&kref->refcount, lock)) { + release(kref); + return 1; + } + return 0; +} + +static inline int kref_put_lock(struct kref *kref, + void (*release)(struct kref *kref), + spinlock_t *lock) +{ + WARN_ON(release == NULL); + + if (atomic_dec_and_lock(&kref->refcount, lock)) { release(kref); return 1; } diff --git a/include/linux/math64.h b/include/linux/math64.h index 6e8b5b270ffe..80690c96c734 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -133,6 +133,16 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) return ret; } +#ifndef mul_u32_u32 +/* + * Many a GCC version messes this up and generates a 64x64 mult :-( + */ +static inline u64 mul_u32_u32(u32 a, u32 b) +{ + return (u64)a * b; +} +#endif + #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr @@ -160,9 +170,9 @@ static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) al = a; ah = a >> 32; - ret = ((u64)al * mul) >> shift; + ret = mul_u32_u32(al, mul) >> shift; if (ah) - ret += ((u64)ah * mul) << (32 - shift); + ret += mul_u32_u32(ah, mul) << (32 - shift); return ret; } @@ -186,10 +196,10 @@ static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift) a0.ll = a; b0.ll = b; - rl.ll = (u64)a0.l.low * b0.l.low; - rm.ll = (u64)a0.l.low * b0.l.high; - rn.ll = (u64)a0.l.high * b0.l.low; - rh.ll = (u64)a0.l.high * b0.l.high; + rl.ll = mul_u32_u32(a0.l.low, b0.l.low); + rm.ll = mul_u32_u32(a0.l.low, b0.l.high); + rn.ll = mul_u32_u32(a0.l.high, b0.l.low); + rh.ll = mul_u32_u32(a0.l.high, b0.l.high); /* * Each of these lines computes a 64-bit intermediate result into "c", @@ -229,8 +239,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) } u, rl, rh; u.ll = a; - rl.ll = (u64)u.l.low * mul; - rh.ll = (u64)u.l.high * mul + rl.l.high; + rl.ll = mul_u32_u32(u.l.low, mul); + rh.ll = mul_u32_u32(u.l.high, mul) + rl.l.high; /* Bits 32-63 of the result will be in rh.l.low. */ rl.l.high = do_div(rh.ll, divisor); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index b97870f2debd..1127fe31645d 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -20,6 +20,8 @@ #include <linux/osq_lock.h> #include <linux/debug_locks.h> +struct ww_acquire_ctx; + /* * Simple, straightforward mutexes with strict semantics: * @@ -65,7 +67,7 @@ struct mutex { static inline struct task_struct *__mutex_owner(struct mutex *lock) { - return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x03); + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07); } /* @@ -75,6 +77,7 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock) struct mutex_waiter { struct list_head list; struct task_struct *task; + struct ww_acquire_ctx *ww_ctx; #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif @@ -156,10 +159,12 @@ extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); extern int __must_check mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass); +extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass); #define mutex_lock(lock) mutex_lock_nested(lock, 0) #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) #define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0) +#define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0) #define mutex_lock_nest_lock(lock, nest_lock) \ do { \ @@ -171,11 +176,13 @@ do { \ extern void mutex_lock(struct mutex *lock); extern int __must_check mutex_lock_interruptible(struct mutex *lock); extern int __must_check mutex_lock_killable(struct mutex *lock); +extern void mutex_lock_io(struct mutex *lock); # define mutex_lock_nested(lock, subclass) mutex_lock(lock) # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) +# define mutex_lock_io_nested(lock, subclass) mutex_lock(lock) #endif /* diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 5b2e6159b744..93664f022ecf 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -4,15 +4,15 @@ #include <linux/atomic.h> #include <linux/rwsem.h> #include <linux/percpu.h> -#include <linux/wait.h> +#include <linux/rcuwait.h> #include <linux/rcu_sync.h> #include <linux/lockdep.h> struct percpu_rw_semaphore { struct rcu_sync rss; unsigned int __percpu *read_count; - struct rw_semaphore rw_sem; - wait_queue_head_t writer; + struct rw_semaphore rw_sem; /* slowpath */ + struct rcuwait writer; /* blocked writer */ int readers_block; }; @@ -22,7 +22,7 @@ static struct percpu_rw_semaphore name = { \ .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \ .read_count = &__percpu_rwsem_rc_##name, \ .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ - .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \ + .writer = __RCUWAIT_INITIALIZER(name.writer), \ } extern int __percpu_down_read(struct percpu_rw_semaphore *, int); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 78ed8105e64d..5c58e93c130c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -785,9 +785,9 @@ struct perf_cpu_context { ktime_t hrtimer_interval; unsigned int hrtimer_active; - struct pmu *unique_pmu; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; + struct list_head cgrp_cpuctx_entry; #endif struct list_head sched_cb_entry; diff --git a/include/linux/poison.h b/include/linux/poison.h index 51334edec506..a39540326417 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -80,6 +80,7 @@ /********** kernel/mutexes **********/ #define MUTEX_DEBUG_INIT 0x11 #define MUTEX_DEBUG_FREE 0x22 +#define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA) /********** lib/flex_array.c **********/ #define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */ diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h new file mode 100644 index 000000000000..0e93d56c7ab2 --- /dev/null +++ b/include/linux/rcuwait.h @@ -0,0 +1,63 @@ +#ifndef _LINUX_RCUWAIT_H_ +#define _LINUX_RCUWAIT_H_ + +#include <linux/rcupdate.h> + +/* + * rcuwait provides a way of blocking and waking up a single + * task in an rcu-safe manner; where it is forbidden to use + * after exit_notify(). task_struct is not properly rcu protected, + * unless dealing with rcu-aware lists, ie: find_task_by_*(). + * + * Alternatively we have task_rcu_dereference(), but the return + * semantics have different implications which would break the + * wakeup side. The only time @task is non-nil is when a user is + * blocked (or checking if it needs to) on a condition, and reset + * as soon as we know that the condition has succeeded and are + * awoken. + */ +struct rcuwait { + struct task_struct *task; +}; + +#define __RCUWAIT_INITIALIZER(name) \ + { .task = NULL, } + +static inline void rcuwait_init(struct rcuwait *w) +{ + w->task = NULL; +} + +extern void rcuwait_wake_up(struct rcuwait *w); + +/* + * The caller is responsible for locking around rcuwait_wait_event(), + * such that writes to @task are properly serialized. + */ +#define rcuwait_wait_event(w, condition) \ +({ \ + /* \ + * Complain if we are called after do_exit()/exit_notify(), \ + * as we cannot rely on the rcu critical region for the \ + * wakeup side. \ + */ \ + WARN_ON(current->exit_state); \ + \ + rcu_assign_pointer((w)->task, current); \ + for (;;) { \ + /* \ + * Implicit barrier (A) pairs with (B) in \ + * rcuwait_trywake(). \ + */ \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + \ + schedule(); \ + } \ + \ + WRITE_ONCE((w)->task, NULL); \ + __set_current_state(TASK_RUNNING); \ +}) + +#endif /* _LINUX_RCUWAIT_H_ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index ad3ec9ec61f7..48954b0ae2ab 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -227,7 +227,7 @@ extern void proc_sched_set_task(struct task_struct *p); extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; -/* Convenience macros for the sake of set_task_state */ +/* Convenience macros for the sake of set_current_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) @@ -254,17 +254,6 @@ extern char ___assert_task_state[1 - 2*!!( #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -#define __set_task_state(tsk, state_value) \ - do { \ - (tsk)->task_state_change = _THIS_IP_; \ - (tsk)->state = (state_value); \ - } while (0) -#define set_task_state(tsk, state_value) \ - do { \ - (tsk)->task_state_change = _THIS_IP_; \ - smp_store_mb((tsk)->state, (state_value)); \ - } while (0) - #define __set_current_state(state_value) \ do { \ current->task_state_change = _THIS_IP_; \ @@ -277,20 +266,6 @@ extern char ___assert_task_state[1 - 2*!!( } while (0) #else - -/* - * @tsk had better be current, or you get to keep the pieces. - * - * The only reason is that computing current can be more expensive than - * using a pointer that's already available. - * - * Therefore, see set_current_state(). - */ -#define __set_task_state(tsk, state_value) \ - do { (tsk)->state = (state_value); } while (0) -#define set_task_state(tsk, state_value) \ - smp_store_mb((tsk)->state, (state_value)) - /* * set_current_state() includes a barrier so that the write of current->state * is correctly serialised wrt the caller's subsequent test of whether to @@ -461,12 +436,10 @@ extern signed long schedule_timeout_idle(signed long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); +extern int __must_check io_schedule_prepare(void); +extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); - -static inline void io_schedule(void) -{ - io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); -} +extern void io_schedule(void); void __noreturn do_task_dead(void); @@ -734,13 +707,14 @@ struct signal_struct { unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; +#ifdef CONFIG_POSIX_TIMERS + /* POSIX.1b Interval Timers */ int posix_timer_id; struct list_head posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; - struct pid *leader_pid; ktime_t it_real_incr; /* @@ -759,12 +733,16 @@ struct signal_struct { /* Earliest-expiration cache. */ struct task_cputime cputime_expires; + struct list_head cpu_timers[3]; + +#endif + + struct pid *leader_pid; + #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif - struct list_head cpu_timers[3]; - struct pid *tty_old_pgrp; /* boolean value for session group leader */ @@ -1046,6 +1024,12 @@ struct wake_q_head { #define DEFINE_WAKE_Q(name) \ struct wake_q_head name = { WAKE_Q_TAIL, &name.first } +static inline void wake_q_init(struct wake_q_head *head) +{ + head->first = WAKE_Q_TAIL; + head->lastp = &head->first; +} + extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); extern void wake_up_q(struct wake_q_head *head); @@ -1691,8 +1675,10 @@ struct task_struct { /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; +#ifdef CONFIG_POSIX_TIMERS struct task_cputime cputime_expires; struct list_head cpu_timers[3]; +#endif /* process credentials */ const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */ @@ -2515,6 +2501,10 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline void sched_clock_init_late(void) +{ +} + static inline void sched_clock_tick(void) { } @@ -2537,6 +2527,7 @@ static inline u64 local_clock(void) return sched_clock(); } #else +extern void sched_clock_init_late(void); /* * Architectures can set this to 1 if they have specified * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, @@ -2544,7 +2535,6 @@ static inline u64 local_clock(void) * is reliable after all: */ extern int sched_clock_stable(void); -extern void set_sched_clock_stable(void); extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 47dd0cebd204..59248dcc6ef3 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -180,8 +180,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) -# define raw_spin_lock_bh_nested(lock, subclass) \ - _raw_spin_lock_bh_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ @@ -197,7 +195,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) -# define raw_spin_lock_bh_nested(lock, subclass) _raw_spin_lock_bh(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) @@ -317,11 +314,6 @@ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ } while (0) -#define spin_lock_bh_nested(lock, subclass) \ -do { \ - raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\ -} while (0) - #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 5344268e6e62..42dfab89e740 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -22,8 +22,6 @@ int in_lock_functions(unsigned long addr); void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) __acquires(lock); -void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) - __acquires(lock); void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map) __acquires(lock); diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index d3afef9d8dbe..d0d188861ad6 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -57,7 +57,6 @@ #define _raw_spin_lock(lock) __LOCK(lock) #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) -#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock) #define _raw_read_lock(lock) __LOCK(lock) #define _raw_write_lock(lock) __LOCK(lock) #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 9dcf2c8fe1c5..a5d335a27d0e 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -198,7 +198,7 @@ static inline struct cache_head *cache_get(struct cache_head *h) static inline void cache_put(struct cache_head *h, struct cache_detail *cd) { - if (atomic_read(&h->ref.refcount) <= 2 && + if (kref_read(&h->ref) <= 2 && h->expiry_time < cd->nextcheck) cd->nextcheck = h->expiry_time; kref_put(&h->ref, cd->cache_put); diff --git a/include/linux/vtime.h b/include/linux/vtime.h index aa9bfea8804a..0681fe25abeb 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -58,27 +58,28 @@ static inline void vtime_task_switch(struct task_struct *prev) extern void vtime_account_system(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); -extern void vtime_account_user(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ static inline void vtime_task_switch(struct task_struct *prev) { } static inline void vtime_account_system(struct task_struct *tsk) { } -static inline void vtime_account_user(struct task_struct *tsk) { } #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void arch_vtime_task_switch(struct task_struct *tsk); +extern void vtime_account_user(struct task_struct *tsk); extern void vtime_user_enter(struct task_struct *tsk); static inline void vtime_user_exit(struct task_struct *tsk) { vtime_account_user(tsk); } + extern void vtime_guest_enter(struct task_struct *tsk); extern void vtime_guest_exit(struct task_struct *tsk); extern void vtime_init_idle(struct task_struct *tsk, int cpu); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ +static inline void vtime_account_user(struct task_struct *tsk) { } static inline void vtime_user_enter(struct task_struct *tsk) { } static inline void vtime_user_exit(struct task_struct *tsk) { } static inline void vtime_guest_enter(struct task_struct *tsk) { } @@ -93,9 +94,11 @@ static inline void vtime_account_irq_exit(struct task_struct *tsk) /* On hard|softirq exit we always account to hard|softirq cputime */ vtime_account_system(tsk); } +extern void vtime_flush(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static inline void vtime_account_irq_enter(struct task_struct *tsk) { } static inline void vtime_account_irq_exit(struct task_struct *tsk) { } +static inline void vtime_flush(struct task_struct *tsk) { } #endif diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 7b0066814fa0..5dd9a7682227 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -51,10 +51,10 @@ struct ww_mutex { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class) \ - , .ww_class = &ww_class +# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class) \ + , .ww_class = class #else -# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class) +# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class) #endif #define __WW_CLASS_INITIALIZER(ww_class) \ @@ -63,7 +63,7 @@ struct ww_mutex { , .mutex_name = #ww_class "_mutex" } #define __WW_MUTEX_INITIALIZER(lockname, class) \ - { .base = { \__MUTEX_INITIALIZER(lockname) } \ + { .base = __MUTEX_INITIALIZER(lockname.base) \ __WW_CLASS_MUTEX_INITIALIZER(lockname, class) } #define DEFINE_WW_CLASS(classname) \ @@ -186,11 +186,6 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) #endif } -extern int __must_check __ww_mutex_lock(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx); -extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx); - /** * ww_mutex_lock - acquire the w/w mutex * @lock: the mutex to be acquired @@ -220,14 +215,7 @@ extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock, * * A mutex acquired with this function must be released with ww_mutex_unlock. */ -static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - if (ctx) - return __ww_mutex_lock(lock, ctx); - - mutex_lock(&lock->base); - return 0; -} +extern int /* __must_check */ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx); /** * ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible @@ -259,14 +247,8 @@ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ct * * A mutex acquired with this function must be released with ww_mutex_unlock. */ -static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) -{ - if (ctx) - return __ww_mutex_lock_interruptible(lock, ctx); - else - return mutex_lock_interruptible(&lock->base); -} +extern int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx); /** * ww_mutex_lock_slow - slowpath acquiring of the w/w mutex diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 554671c81f4a..90708f68cc02 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -987,7 +987,7 @@ static inline void hci_conn_drop(struct hci_conn *conn) static inline void hci_dev_put(struct hci_dev *d) { BT_DBG("%s orig refcnt %d", d->name, - atomic_read(&d->dev.kobj.kref.refcount)); + kref_read(&d->dev.kobj.kref)); put_device(&d->dev); } @@ -995,7 +995,7 @@ static inline void hci_dev_put(struct hci_dev *d) static inline struct hci_dev *hci_dev_hold(struct hci_dev *d) { BT_DBG("%s orig refcnt %d", d->name, - atomic_read(&d->dev.kobj.kref.refcount)); + kref_read(&d->dev.kobj.kref)); get_device(&d->dev); return d; diff --git a/init/main.c b/init/main.c index 86f5cf0a92a2..8e75d6041adb 100644 --- a/init/main.c +++ b/init/main.c @@ -625,7 +625,6 @@ asmlinkage __visible void __init start_kernel(void) numa_policy_init(); if (late_time_init) late_time_init(); - sched_clock_init(); calibrate_delay(); pidmap_init(); anon_vma_init(); diff --git a/init/version.c b/init/version.c index fe41a63efed6..5606341e9efd 100644 --- a/init/version.c +++ b/init/version.c @@ -23,9 +23,7 @@ int version_string(LINUX_VERSION_CODE); #endif struct uts_namespace init_uts_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, + .kref = KREF_INIT(2), .name = { .sysname = UTS_SYSNAME, .nodename = UTS_NODENAME, diff --git a/kernel/events/core.c b/kernel/events/core.c index e5aaa806702d..88676ff98c0f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -355,6 +355,8 @@ enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_TIME = 0x4, + /* see ctx_resched() for details */ + EVENT_CPU = 0x8, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -678,6 +680,8 @@ perf_cgroup_set_timestamp(struct task_struct *task, info->timestamp = ctx->timestamp; } +static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); + #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ @@ -690,61 +694,46 @@ perf_cgroup_set_timestamp(struct task_struct *task, static void perf_cgroup_switch(struct task_struct *task, int mode) { struct perf_cpu_context *cpuctx; - struct pmu *pmu; + struct list_head *list; unsigned long flags; /* - * disable interrupts to avoid geting nr_cgroup - * changes via __perf_event_disable(). Also - * avoids preemption. + * Disable interrupts and preemption to avoid this CPU's + * cgrp_cpuctx_entry to change under us. */ local_irq_save(flags); - /* - * we reschedule only in the presence of cgroup - * constrained events. - */ + list = this_cpu_ptr(&cgrp_cpuctx_list); + list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - continue; /* ensure we process each cpuctx once */ + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(cpuctx->ctx.pmu); - /* - * perf_cgroup_events says at least one - * context on this CPU has cgroup events. - * - * ctx->nr_cgroups reports the number of cgroup - * events for a context. - */ - if (cpuctx->ctx.nr_cgroups > 0) { - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* - * set cgrp before ctxsw in to allow - * event_filter_match() to not have to pass - * task around - * we pass the cpuctx->ctx to perf_cgroup_from_task() - * because cgorup events are only per-cpu - */ - cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); + /* + * set cgrp before ctxsw in to allow + * event_filter_match() to not have to pass + * task around + * we pass the cpuctx->ctx to perf_cgroup_from_task() + * because cgorup events are only per-cpu + */ + cpuctx->cgrp = perf_cgroup_from_task(task, + &cpuctx->ctx); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } local_irq_restore(flags); @@ -889,6 +878,7 @@ list_update_cgroup_event(struct perf_event *event, struct perf_event_context *ctx, bool add) { struct perf_cpu_context *cpuctx; + struct list_head *cpuctx_entry; if (!is_cgroup_event(event)) return; @@ -902,15 +892,16 @@ list_update_cgroup_event(struct perf_event *event, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); - - /* - * cpuctx->cgrp is NULL until a cgroup event is sched in or - * ctx->nr_cgroup == 0 . - */ - if (add && perf_cgroup_from_task(current, ctx) == event->cgrp) - cpuctx->cgrp = event->cgrp; - else if (!add) + cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; + /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ + if (add) { + list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); + if (perf_cgroup_from_task(current, ctx) == event->cgrp) + cpuctx->cgrp = event->cgrp; + } else { + list_del(cpuctx_entry); cpuctx->cgrp = NULL; + } } #else /* !CONFIG_CGROUP_PERF */ @@ -1453,6 +1444,20 @@ static void update_group_times(struct perf_event *leader) update_event_times(event); } +static enum event_type_t get_event_type(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + enum event_type_t event_type; + + lockdep_assert_held(&ctx->lock); + + event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; + if (!ctx->task) + event_type |= EVENT_CPU; + + return event_type; +} + static struct list_head * ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) { @@ -2226,7 +2231,8 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task); static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) + struct perf_event_context *ctx, + enum event_type_t event_type) { if (!cpuctx->task_ctx) return; @@ -2234,7 +2240,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, EVENT_ALL); + ctx_sched_out(ctx, cpuctx, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, @@ -2249,13 +2255,51 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } +/* + * We want to maintain the following priority of scheduling: + * - CPU pinned (EVENT_CPU | EVENT_PINNED) + * - task pinned (EVENT_PINNED) + * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) + * - task flexible (EVENT_FLEXIBLE). + * + * In order to avoid unscheduling and scheduling back in everything every + * time an event is added, only do it for the groups of equal priority and + * below. + * + * This can be called after a batch operation on task events, in which case + * event_type is a bit mask of the types of events involved. For CPU events, + * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. + */ static void ctx_resched(struct perf_cpu_context *cpuctx, - struct perf_event_context *task_ctx) + struct perf_event_context *task_ctx, + enum event_type_t event_type) { + enum event_type_t ctx_event_type = event_type & EVENT_ALL; + bool cpu_event = !!(event_type & EVENT_CPU); + + /* + * If pinned groups are involved, flexible groups also need to be + * scheduled out. + */ + if (event_type & EVENT_PINNED) + event_type |= EVENT_FLEXIBLE; + perf_pmu_disable(cpuctx->ctx.pmu); if (task_ctx) - task_ctx_sched_out(cpuctx, task_ctx); - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + task_ctx_sched_out(cpuctx, task_ctx, event_type); + + /* + * Decide which cpu ctx groups to schedule out based on the types + * of events that caused rescheduling: + * - EVENT_CPU: schedule out corresponding groups; + * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; + * - otherwise, do nothing more. + */ + if (cpu_event) + cpu_ctx_sched_out(cpuctx, ctx_event_type); + else if (ctx_event_type & EVENT_PINNED) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + perf_event_sched_in(cpuctx, task_ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); } @@ -2302,7 +2346,7 @@ static int __perf_install_in_context(void *info) if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); - ctx_resched(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } @@ -2469,7 +2513,7 @@ static void __perf_event_enable(struct perf_event *event, if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); - ctx_resched(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, get_event_type(event)); } /* @@ -2896,7 +2940,7 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - task_ctx_sched_out(cpuctx, ctx); + task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); raw_spin_unlock(&ctx->lock); } } @@ -2943,7 +2987,7 @@ static void perf_pmu_sched_task(struct task_struct *prev, return; list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ + pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) continue; @@ -3133,8 +3177,12 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. + * + * However, if task's ctx is not carrying any pinned + * events, no need to flip the cpuctx's events around. */ - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (!list_empty(&ctx->pinned_groups)) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); @@ -3449,6 +3497,7 @@ static int event_enable_on_exec(struct perf_event *event, static void perf_event_enable_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; + enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; @@ -3462,15 +3511,17 @@ static void perf_event_enable_on_exec(int ctxn) cpuctx = __get_cpu_context(ctx); perf_ctx_lock(cpuctx, ctx); ctx_sched_out(ctx, cpuctx, EVENT_TIME); - list_for_each_entry(event, &ctx->event_list, event_entry) + list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); + event_type |= get_event_type(event); + } /* * Unclone and reschedule this context if we enabled any event. */ if (enabled) { clone_ctx = unclone_ctx(ctx); - ctx_resched(cpuctx, ctx); + ctx_resched(cpuctx, ctx, event_type); } perf_ctx_unlock(cpuctx, ctx); @@ -8647,37 +8698,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) return NULL; } -static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - - if (cpuctx->unique_pmu == old_pmu) - cpuctx->unique_pmu = pmu; - } -} - static void free_pmu_context(struct pmu *pmu) { - struct pmu *i; - mutex_lock(&pmus_lock); - /* - * Like a real lame refcount. - */ - list_for_each_entry(i, &pmus, entry) { - if (i->pmu_cpu_context == pmu->pmu_cpu_context) { - update_pmu_context(i, pmu); - goto out; - } - } - free_percpu(pmu->pmu_cpu_context); -out: mutex_unlock(&pmus_lock); } @@ -8881,8 +8905,6 @@ skip_type: cpuctx->ctx.pmu = pmu; __perf_mux_hrtimer_init(cpuctx, cpu); - - cpuctx->unique_pmu = pmu; } got_cpu_context: @@ -9000,6 +9022,14 @@ static struct pmu *perf_init_event(struct perf_event *event) idx = srcu_read_lock(&pmus_srcu); + /* Try parent's PMU first: */ + if (event->parent && event->parent->pmu) { + pmu = event->parent->pmu; + ret = perf_try_init_event(pmu, event); + if (!ret) + goto unlock; + } + rcu_read_lock(); pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); @@ -10260,7 +10290,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation @@ -10709,6 +10739,9 @@ static void __init perf_event_init_all_cpus(void) INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); +#ifdef CONFIG_CGROUP_PERF + INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); +#endif INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } diff --git a/kernel/exit.c b/kernel/exit.c index 60f245190571..2b06674202f4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,6 +54,7 @@ #include <linux/shm.h> #include <linux/kcov.h> #include <linux/random.h> +#include <linux/rcuwait.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -281,6 +282,35 @@ retry: return task; } +void rcuwait_wake_up(struct rcuwait *w) +{ + struct task_struct *task; + + rcu_read_lock(); + + /* + * Order condition vs @task, such that everything prior to the load + * of @task is visible. This is the condition as to why the user called + * rcuwait_trywake() in the first place. Pairs with set_current_state() + * barrier (A) in rcuwait_wait_event(). + * + * WAIT WAKE + * [S] tsk = current [S] cond = true + * MB (A) MB (B) + * [L] cond [L] tsk + */ + smp_rmb(); /* (B) */ + + /* + * Avoid using task_rcu_dereference() magic as long as we are careful, + * see comment in rcuwait_wait_event() regarding ->exit_state. + */ + task = rcu_dereference(w->task); + if (task) + wake_up_process(task); + rcu_read_unlock(); +} + struct task_struct *try_get_task_struct(struct task_struct **ptask) { struct task_struct *task; @@ -467,12 +497,12 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct *tsk) +static void exit_mm(void) { - struct mm_struct *mm = tsk->mm; + struct mm_struct *mm = current->mm; struct core_state *core_state; - mm_release(tsk, mm); + mm_release(current, mm); if (!mm) return; sync_mm_rss(mm); @@ -490,7 +520,7 @@ static void exit_mm(struct task_struct *tsk) up_read(&mm->mmap_sem); - self.task = tsk; + self.task = current; self.next = xchg(&core_state->dumper.next, &self); /* * Implies mb(), the result of xchg() must be visible @@ -500,22 +530,22 @@ static void exit_mm(struct task_struct *tsk) complete(&core_state->startup); for (;;) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; freezable_schedule(); } - __set_task_state(tsk, TASK_RUNNING); + __set_current_state(TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); - BUG_ON(mm != tsk->active_mm); + BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ - task_lock(tsk); - tsk->mm = NULL; + task_lock(current); + current->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); - task_unlock(tsk); + task_unlock(current); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) @@ -822,7 +852,7 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); - exit_mm(tsk); + exit_mm(); if (group_dead) acct_process(); diff --git a/kernel/extable.c b/kernel/extable.c index e3beec4a2339..e1359474baa5 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -20,6 +20,7 @@ #include <linux/module.h> #include <linux/mutex.h> #include <linux/init.h> +#include <linux/kprobes.h> #include <asm/sections.h> #include <linux/uaccess.h> @@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr) return 1; if (is_ftrace_trampoline(addr)) return 1; + if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) + return 1; /* * There might be init symbols in saved stacktraces. * Give those symbols a chance to be printed in @@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr) return 1; if (is_module_text_address(addr)) return 1; - return is_ftrace_trampoline(addr); + if (is_ftrace_trampoline(addr)) + return 1; + if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) + return 1; + return 0; } /* diff --git a/kernel/fork.c b/kernel/fork.c index 11c5c8ab827c..2de63d1d654c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -432,11 +432,13 @@ void __init fork_init(void) int i; #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN -#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES +#define ARCH_MIN_TASKALIGN 0 #endif + int align = min_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); + /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", - arch_task_struct_size, ARCH_MIN_TASKALIGN, + arch_task_struct_size, align, SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif @@ -1304,6 +1306,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) } } +#ifdef CONFIG_POSIX_TIMERS /* * Initialize POSIX timer handling for a thread group. */ @@ -1322,6 +1325,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); } +#else +static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } +#endif static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { @@ -1346,11 +1352,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); - INIT_LIST_HEAD(&sig->posix_timers); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); #ifdef CONFIG_POSIX_TIMERS + INIT_LIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; #endif @@ -1425,6 +1431,7 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } +#ifdef CONFIG_POSIX_TIMERS /* * Initialize POSIX timer handling for a single task. */ @@ -1437,6 +1444,9 @@ static void posix_cpu_timers_init(struct task_struct *tsk) INIT_LIST_HEAD(&tsk->cpu_timers[1]); INIT_LIST_HEAD(&tsk->cpu_timers[2]); } +#else +static inline void posix_cpu_timers_init(struct task_struct *tsk) { } +#endif static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 43460104f119..ebb4dadca66b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -149,9 +149,11 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) struct kprobe_insn_page *kip; kprobe_opcode_t *slot = NULL; + /* Since the slot array is not protected by rcu, we need a mutex */ mutex_lock(&c->mutex); retry: - list_for_each_entry(kip, &c->pages, list) { + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { if (kip->nused < slots_per_page(c)) { int i; for (i = 0; i < slots_per_page(c); i++) { @@ -159,6 +161,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->slot_used[i] = SLOT_USED; kip->nused++; slot = kip->insns + (i * c->insn_size); + rcu_read_unlock(); goto out; } } @@ -167,6 +170,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) WARN_ON(1); } } + rcu_read_unlock(); /* If there are any garbage slots, collect it and try again. */ if (c->nr_garbage && collect_garbage_slots(c) == 0) @@ -193,7 +197,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->nused = 1; kip->ngarbage = 0; kip->cache = c; - list_add(&kip->list, &c->pages); + list_add_rcu(&kip->list, &c->pages); slot = kip->insns; out: mutex_unlock(&c->mutex); @@ -213,7 +217,8 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { - list_del(&kip->list); + list_del_rcu(&kip->list); + synchronize_rcu(); kip->cache->free(kip->insns); kfree(kip); } @@ -235,8 +240,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c) continue; kip->ngarbage = 0; /* we will collect all garbages */ for (i = 0; i < slots_per_page(c); i++) { - if (kip->slot_used[i] == SLOT_DIRTY && - collect_one_slot(kip, i)) + if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i)) break; } } @@ -248,29 +252,60 @@ void __free_insn_slot(struct kprobe_insn_cache *c, kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; + long idx; mutex_lock(&c->mutex); - list_for_each_entry(kip, &c->pages, list) { - long idx = ((long)slot - (long)kip->insns) / - (c->insn_size * sizeof(kprobe_opcode_t)); - if (idx >= 0 && idx < slots_per_page(c)) { - WARN_ON(kip->slot_used[idx] != SLOT_USED); - if (dirty) { - kip->slot_used[idx] = SLOT_DIRTY; - kip->ngarbage++; - if (++c->nr_garbage > slots_per_page(c)) - collect_garbage_slots(c); - } else - collect_one_slot(kip, idx); + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + idx = ((long)slot - (long)kip->insns) / + (c->insn_size * sizeof(kprobe_opcode_t)); + if (idx >= 0 && idx < slots_per_page(c)) goto out; - } } - /* Could not free this slot. */ + /* Could not find this slot. */ WARN_ON(1); + kip = NULL; out: + rcu_read_unlock(); + /* Mark and sweep: this may sleep */ + if (kip) { + /* Check double free */ + WARN_ON(kip->slot_used[idx] != SLOT_USED); + if (dirty) { + kip->slot_used[idx] = SLOT_DIRTY; + kip->ngarbage++; + if (++c->nr_garbage > slots_per_page(c)) + collect_garbage_slots(c); + } else { + collect_one_slot(kip, idx); + } + } mutex_unlock(&c->mutex); } +/* + * Check given address is on the page of kprobe instruction slots. + * This will be used for checking whether the address on a stack + * is on a text area or not. + */ +bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr) +{ + struct kprobe_insn_page *kip; + bool ret = false; + + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if (addr >= (unsigned long)kip->insns && + addr < (unsigned long)kip->insns + PAGE_SIZE) { + ret = true; + break; + } + } + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6f88e352cd4f..760158d9d98d 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -28,3 +28,4 @@ obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o +obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f8c5af52a131..9bffedd82884 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -372,6 +372,78 @@ static struct lock_torture_ops mutex_lock_ops = { .name = "mutex_lock" }; +#include <linux/ww_mutex.h> +static DEFINE_WW_CLASS(torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class); + +static int torture_ww_mutex_lock(void) +__acquires(torture_ww_mutex_0) +__acquires(torture_ww_mutex_1) +__acquires(torture_ww_mutex_2) +{ + LIST_HEAD(list); + struct reorder_lock { + struct list_head link; + struct ww_mutex *lock; + } locks[3], *ll, *ln; + struct ww_acquire_ctx ctx; + + locks[0].lock = &torture_ww_mutex_0; + list_add(&locks[0].link, &list); + + locks[1].lock = &torture_ww_mutex_1; + list_add(&locks[1].link, &list); + + locks[2].lock = &torture_ww_mutex_2; + list_add(&locks[2].link, &list); + + ww_acquire_init(&ctx, &torture_ww_class); + + list_for_each_entry(ll, &list, link) { + int err; + + err = ww_mutex_lock(ll->lock, &ctx); + if (!err) + continue; + + ln = ll; + list_for_each_entry_continue_reverse(ln, &list, link) + ww_mutex_unlock(ln->lock); + + if (err != -EDEADLK) + return err; + + ww_mutex_lock_slow(ll->lock, &ctx); + list_move(&ll->link, &list); + } + + ww_acquire_fini(&ctx); + return 0; +} + +static void torture_ww_mutex_unlock(void) +__releases(torture_ww_mutex_0) +__releases(torture_ww_mutex_1) +__releases(torture_ww_mutex_2) +{ + ww_mutex_unlock(&torture_ww_mutex_0); + ww_mutex_unlock(&torture_ww_mutex_1); + ww_mutex_unlock(&torture_ww_mutex_2); +} + +static struct lock_torture_ops ww_mutex_lock_ops = { + .writelock = torture_ww_mutex_lock, + .write_delay = torture_mutex_delay, + .task_boost = torture_boost_dummy, + .writeunlock = torture_ww_mutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "ww_mutex_lock" +}; + #ifdef CONFIG_RT_MUTEXES static DEFINE_RT_MUTEX(torture_rtmutex); @@ -793,6 +865,7 @@ static int __init lock_torture_init(void) &spin_lock_ops, &spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, + &ww_mutex_lock_ops, #ifdef CONFIG_RT_MUTEXES &rtmutex_lock_ops, #endif diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index a459faa48987..4174417d5309 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -26,20 +26,3 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); - -#define spin_lock_mutex(lock, flags) \ - do { \ - struct mutex *l = container_of(lock, struct mutex, wait_lock); \ - \ - DEBUG_LOCKS_WARN_ON(in_interrupt()); \ - local_irq_save(flags); \ - arch_spin_lock(&(lock)->rlock.raw_lock);\ - DEBUG_LOCKS_WARN_ON(l->magic != l); \ - } while (0) - -#define spin_unlock_mutex(lock, flags) \ - do { \ - arch_spin_unlock(&(lock)->rlock.raw_lock); \ - local_irq_restore(flags); \ - preempt_check_resched(); \ - } while (0) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 9b349619f431..ad2d9e22697b 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -50,16 +50,17 @@ EXPORT_SYMBOL(__mutex_init); /* * @owner: contains: 'struct task_struct *' to the current lock owner, * NULL means not owned. Since task_struct pointers are aligned at - * ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low - * bits to store extra state. + * at least L1_CACHE_BYTES, we have low bits to store extra state. * * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. * Bit1 indicates unlock needs to hand the lock to the top-waiter + * Bit2 indicates handoff has been done and we're waiting for pickup. */ #define MUTEX_FLAG_WAITERS 0x01 #define MUTEX_FLAG_HANDOFF 0x02 +#define MUTEX_FLAG_PICKUP 0x04 -#define MUTEX_FLAGS 0x03 +#define MUTEX_FLAGS 0x07 static inline struct task_struct *__owner_task(unsigned long owner) { @@ -72,38 +73,29 @@ static inline unsigned long __owner_flags(unsigned long owner) } /* - * Actual trylock that will work on any unlocked state. - * - * When setting the owner field, we must preserve the low flag bits. - * - * Be careful with @handoff, only set that in a wait-loop (where you set - * HANDOFF) to avoid recursive lock attempts. + * Trylock variant that retuns the owning task on failure. */ -static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) +static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) { unsigned long owner, curr = (unsigned long)current; owner = atomic_long_read(&lock->owner); for (;;) { /* must loop, can race against a flag */ unsigned long old, flags = __owner_flags(owner); + unsigned long task = owner & ~MUTEX_FLAGS; - if (__owner_task(owner)) { - if (handoff && unlikely(__owner_task(owner) == current)) { - /* - * Provide ACQUIRE semantics for the lock-handoff. - * - * We cannot easily use load-acquire here, since - * the actual load is a failed cmpxchg, which - * doesn't imply any barriers. - * - * Also, this is a fairly unlikely scenario, and - * this contains the cost. - */ - smp_mb(); /* ACQUIRE */ - return true; - } + if (task) { + if (likely(task != curr)) + break; - return false; + if (likely(!(flags & MUTEX_FLAG_PICKUP))) + break; + + flags &= ~MUTEX_FLAG_PICKUP; + } else { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP); +#endif } /* @@ -111,15 +103,24 @@ static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) * past the point where we acquire it. This would be possible * if we (accidentally) set the bit on an unlocked mutex. */ - if (handoff) - flags &= ~MUTEX_FLAG_HANDOFF; + flags &= ~MUTEX_FLAG_HANDOFF; old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); if (old == owner) - return true; + return NULL; owner = old; } + + return __owner_task(owner); +} + +/* + * Actual trylock that will work on any unlocked state. + */ +static inline bool __mutex_trylock(struct mutex *lock) +{ + return !__mutex_trylock_or_owner(lock); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -171,9 +172,9 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait /* * Give up ownership to a specific task, when @task = NULL, this is equivalent - * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE - * semantics like a regular unlock, the __mutex_trylock() provides matching - * ACQUIRE semantics for the handoff. + * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves + * WAITERS. Provides RELEASE semantics like a regular unlock, the + * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff. */ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) { @@ -184,10 +185,13 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); + DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); #endif new = (owner & MUTEX_FLAG_WAITERS); new |= (unsigned long)task; + if (task) + new |= MUTEX_FLAG_PICKUP; old = atomic_long_cmpxchg_release(&lock->owner, owner, new); if (old == owner) @@ -237,8 +241,8 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, - struct ww_acquire_ctx *ww_ctx) +static __always_inline void +ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) { #ifdef CONFIG_DEBUG_MUTEXES /* @@ -277,17 +281,50 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, ww_ctx->acquired++; } +static inline bool __sched +__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) +{ + return a->stamp - b->stamp <= LONG_MAX && + (a->stamp != b->stamp || a > b); +} + +/* + * Wake up any waiters that may have to back off when the lock is held by the + * given context. + * + * Due to the invariants on the wait list, this can only affect the first + * waiter with a context. + * + * The current task must not be on the wait list. + */ +static void __sched +__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + struct mutex_waiter *cur; + + lockdep_assert_held(&lock->wait_lock); + + list_for_each_entry(cur, &lock->wait_list, list) { + if (!cur->ww_ctx) + continue; + + if (cur->ww_ctx->acquired > 0 && + __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } + + break; + } +} + /* * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. */ static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) +ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - unsigned long flags; - struct mutex_waiter *cur; - ww_mutex_lock_acquired(lock, ctx); lock->ctx = ctx; @@ -311,46 +348,79 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, * Uh oh, we raced in fastpath, wake up everyone in this case, * so they can see the new lock->ctx. */ - spin_lock_mutex(&lock->base.wait_lock, flags); - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } - spin_unlock_mutex(&lock->base.wait_lock, flags); + spin_lock(&lock->base.wait_lock); + __ww_mutex_wakeup_for_backoff(&lock->base, ctx); + spin_unlock(&lock->base.wait_lock); } /* - * After acquiring lock in the slowpath set ctx and wake up any - * waiters so they can recheck. + * After acquiring lock in the slowpath set ctx. + * + * Unlike for the fast path, the caller ensures that waiters are woken up where + * necessary. * * Callers must hold the mutex wait_lock. */ static __always_inline void -ww_mutex_set_context_slowpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) +ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - struct mutex_waiter *cur; - ww_mutex_lock_acquired(lock, ctx); lock->ctx = ctx; +} + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + +static inline +bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + struct mutex_waiter *waiter) +{ + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); /* - * Give any possible sleeping processes the chance to wake up, - * so they can recheck if they have to back off. + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + * + * Check this in every inner iteration because we may + * be racing against another thread's ww_mutex_lock. */ - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } + if (ww_ctx->acquired > 0 && READ_ONCE(ww->ctx)) + return false; + + /* + * If we aren't on the wait list yet, cancel the spin + * if there are waiters. We want to avoid stealing the + * lock from a waiter with an earlier stamp, since the + * other thread may already own a lock that we also + * need. + */ + if (!waiter && (atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS)) + return false; + + /* + * Similarly, stop spinning if we are no longer the + * first waiter. + */ + if (waiter && !__mutex_waiter_is_first(lock, waiter)) + return false; + + return true; } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. + * Look out! "owner" is an entirely speculative pointer access and not + * reliable. + * + * "noinline" so that this function shows up on perf profiles. */ static noinline -bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, + struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter) { bool ret = true; @@ -373,6 +443,11 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) break; } + if (ww_ctx && !ww_mutex_spin_on_owner(lock, ww_ctx, waiter)) { + ret = false; + break; + } + cpu_relax(); } rcu_read_unlock(); @@ -431,12 +506,10 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) * with the spinner at the head of the OSQ, if present, until the owner is * changed to itself. */ -static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) +static __always_inline bool +mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, struct mutex_waiter *waiter) { - struct task_struct *task = current; - if (!waiter) { /* * The purpose of the mutex_can_spin_on_owner() function is @@ -460,40 +533,17 @@ static bool mutex_optimistic_spin(struct mutex *lock, for (;;) { struct task_struct *owner; - if (use_ww_ctx && ww_ctx->acquired > 0) { - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - /* - * If ww->ctx is set the contents are undefined, only - * by acquiring wait_lock there is a guarantee that - * they are not invalid when reading. - * - * As such, when deadlock detection needs to be - * performed the optimistic spinning cannot be done. - */ - if (READ_ONCE(ww->ctx)) - goto fail_unlock; - } + /* Try to acquire the mutex... */ + owner = __mutex_trylock_or_owner(lock); + if (!owner) + break; /* - * If there's an owner, wait for it to either + * There's an owner, wait for it to either * release the lock or go to sleep. */ - owner = __mutex_owner(lock); - if (owner) { - if (waiter && owner == task) { - smp_mb(); /* ACQUIRE */ - break; - } - - if (!mutex_spin_on_owner(lock, owner)) - goto fail_unlock; - } - - /* Try to acquire the mutex if it is unlocked. */ - if (__mutex_trylock(lock, waiter)) - break; + if (!mutex_spin_on_owner(lock, owner, ww_ctx, waiter)) + goto fail_unlock; /* * The cpu_relax() call is a compiler barrier which forces @@ -532,9 +582,9 @@ fail: return false; } #else -static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) +static __always_inline bool +mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, struct mutex_waiter *waiter) { return false; } @@ -594,23 +644,88 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) EXPORT_SYMBOL(ww_mutex_unlock); static inline int __sched -__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); + struct mutex_waiter *cur; + + if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) + goto deadlock; + + /* + * If there is a waiter in front of us that has a context, then its + * stamp is earlier than ours and we must back off. + */ + cur = waiter; + list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { + if (cur->ww_ctx) + goto deadlock; + } + + return 0; - if (!hold_ctx) +deadlock: +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(ctx->contending_lock); + ctx->contending_lock = ww; +#endif + return -EDEADLK; +} + +static inline int __sched +__ww_mutex_add_waiter(struct mutex_waiter *waiter, + struct mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + struct mutex_waiter *cur; + struct list_head *pos; + + if (!ww_ctx) { + list_add_tail(&waiter->list, &lock->wait_list); return 0; + } - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && - (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { + /* + * Add the waiter before the first waiter with a higher stamp. + * Waiters without a context are skipped to avoid starving + * them. + */ + pos = &lock->wait_list; + list_for_each_entry_reverse(cur, &lock->wait_list, list) { + if (!cur->ww_ctx) + continue; + + if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { + /* Back off immediately if necessary. */ + if (ww_ctx->acquired > 0) { #ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(ctx->contending_lock); - ctx->contending_lock = ww; + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; #endif - return -EDEADLK; + return -EDEADLK; + } + + break; + } + + pos = &cur->list; + + /* + * Wake up the waiter so that it gets a chance to back + * off. + */ + if (cur->ww_ctx->acquired > 0) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } } + list_add_tail(&waiter->list, pos); return 0; } @@ -622,15 +737,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { - struct task_struct *task = current; struct mutex_waiter waiter; - unsigned long flags; bool first = false; struct ww_mutex *ww; int ret; - if (use_ww_ctx) { - ww = container_of(lock, struct ww_mutex, base); + might_sleep(); + + ww = container_of(lock, struct ww_mutex, base); + if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; } @@ -638,36 +753,54 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (__mutex_trylock(lock, false) || - mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { + if (__mutex_trylock(lock) || + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) + if (use_ww_ctx && ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); preempt_enable(); return 0; } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); /* * After waiting to acquire the wait_lock, try again. */ - if (__mutex_trylock(lock, false)) + if (__mutex_trylock(lock)) { + if (use_ww_ctx && ww_ctx) + __ww_mutex_wakeup_for_backoff(lock, ww_ctx); + goto skip_wait; + } debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task); + debug_mutex_add_waiter(lock, &waiter, current); + + lock_contended(&lock->dep_map, ip); - /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); - waiter.task = task; + if (!use_ww_ctx) { + /* add waiting tasks to the end of the waitqueue (FIFO): */ + list_add_tail(&waiter.list, &lock->wait_list); + +#ifdef CONFIG_DEBUG_MUTEXES + waiter.ww_ctx = MUTEX_POISON_WW_CTX; +#endif + } else { + /* Add in stamp order, waking up waiters that must back off. */ + ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); + if (ret) + goto err_early_backoff; + + waiter.ww_ctx = ww_ctx; + } + + waiter.task = current; if (__mutex_waiter_is_first(lock, &waiter)) __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); - lock_contended(&lock->dep_map, ip); - - set_task_state(task, state); + set_current_state(state); for (;;) { /* * Once we hold wait_lock, we're serialized against @@ -675,7 +808,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * before testing the error conditions to make sure we pick up * the handoff. */ - if (__mutex_trylock(lock, first)) + if (__mutex_trylock(lock)) goto acquired; /* @@ -683,42 +816,47 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. */ - if (unlikely(signal_pending_state(state, task))) { + if (unlikely(signal_pending_state(state, current))) { ret = -EINTR; goto err; } - if (use_ww_ctx && ww_ctx->acquired > 0) { - ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); + if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { + ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx); if (ret) goto err; } - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); schedule_preempt_disabled(); - if (!first && __mutex_waiter_is_first(lock, &waiter)) { - first = true; - __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); + /* + * ww_mutex needs to always recheck its position since its waiter + * list is not FIFO ordered. + */ + if ((use_ww_ctx && ww_ctx) || !first) { + first = __mutex_waiter_is_first(lock, &waiter); + if (first) + __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); } - set_task_state(task, state); + set_current_state(state); /* * Here we order against unlock; we must either see it change * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) || - __mutex_trylock(lock, first)) + if (__mutex_trylock(lock) || + (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter))) break; - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); acquired: - __set_task_state(task, TASK_RUNNING); + __set_current_state(TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, task); + mutex_remove_waiter(lock, &waiter, current); if (likely(list_empty(&lock->wait_list))) __mutex_clear_flag(lock, MUTEX_FLAGS); @@ -728,30 +866,44 @@ skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) + if (use_ww_ctx && ww_ctx) ww_mutex_set_context_slowpath(ww, ww_ctx); - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); preempt_enable(); return 0; err: - __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, task); - spin_unlock_mutex(&lock->wait_lock, flags); + __set_current_state(TASK_RUNNING); + mutex_remove_waiter(lock, &waiter, current); +err_early_backoff: + spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); preempt_enable(); return ret; } +static int __sched +__mutex_lock(struct mutex *lock, long state, unsigned int subclass, + struct lockdep_map *nest_lock, unsigned long ip) +{ + return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false); +} + +static int __sched +__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass, + struct lockdep_map *nest_lock, unsigned long ip, + struct ww_acquire_ctx *ww_ctx) +{ + return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true); +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -759,30 +911,38 @@ EXPORT_SYMBOL_GPL(mutex_lock_nested); void __sched _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - 0, nest, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, - subclass, NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); +void __sched +mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) +{ + int token; + + might_sleep(); + + token = io_schedule_prepare(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + subclass, NULL, _RET_IP_, NULL, 0); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io_nested); + static inline int ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { @@ -810,35 +970,37 @@ ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) } int __sched -__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { int ret; might_sleep(); - ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx, 1); - if (!ret && ctx->acquired > 1) + ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx); + if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); return ret; } -EXPORT_SYMBOL_GPL(__ww_mutex_lock); +EXPORT_SYMBOL_GPL(ww_mutex_lock); int __sched -__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { int ret; might_sleep(); - ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx, 1); + ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx); - if (!ret && ctx->acquired > 1) + if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); return ret; } -EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); #endif @@ -848,8 +1010,8 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { struct task_struct *next = NULL; - unsigned long owner, flags; DEFINE_WAKE_Q(wake_q); + unsigned long owner; mutex_release(&lock->dep_map, 1, ip); @@ -866,6 +1028,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); + DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); #endif if (owner & MUTEX_FLAG_HANDOFF) @@ -883,7 +1046,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne owner = old; } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ @@ -900,7 +1063,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); wake_up_q(&wake_q); } @@ -950,40 +1113,47 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); +void __sched mutex_lock_io(struct mutex *lock) +{ + int token; + + token = io_schedule_prepare(); + mutex_lock(lock); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io); + static noinline void __sched __mutex_lock_slowpath(struct mutex *lock) { - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) { - return __mutex_lock_common(lock, TASK_KILLABLE, 0, - NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) { - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx, 1); + return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL, + _RET_IP_, ctx); } static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx, 1); + return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL, + _RET_IP_, ctx); } #endif @@ -1004,7 +1174,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock, false); + bool locked = __mutex_trylock(lock); if (locked) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); @@ -1015,32 +1185,34 @@ EXPORT_SYMBOL(mutex_trylock); #ifndef CONFIG_DEBUG_LOCK_ALLOC int __sched -__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { might_sleep(); if (__mutex_trylock_fast(&lock->base)) { - ww_mutex_set_context_fastpath(lock, ctx); + if (ctx) + ww_mutex_set_context_fastpath(lock, ctx); return 0; } return __ww_mutex_lock_slowpath(lock, ctx); } -EXPORT_SYMBOL(__ww_mutex_lock); +EXPORT_SYMBOL(ww_mutex_lock); int __sched -__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { might_sleep(); if (__mutex_trylock_fast(&lock->base)) { - ww_mutex_set_context_fastpath(lock, ctx); + if (ctx) + ww_mutex_set_context_fastpath(lock, ctx); return 0; } return __ww_mutex_lock_interruptible_slowpath(lock, ctx); } -EXPORT_SYMBOL(__ww_mutex_lock_interruptible); +EXPORT_SYMBOL(ww_mutex_lock_interruptible); #endif diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 4410a4af42a3..6ebc1902f779 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -9,10 +9,6 @@ * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: */ -#define spin_lock_mutex(lock, flags) \ - do { spin_lock(lock); (void)(flags); } while (0) -#define spin_unlock_mutex(lock, flags) \ - do { spin_unlock(lock); (void)(flags); } while (0) #define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index ce182599cf2e..883cf1b92d90 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,7 +1,6 @@ #include <linux/atomic.h> #include <linux/rwsem.h> #include <linux/percpu.h> -#include <linux/wait.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/rcupdate.h> @@ -18,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); __init_rwsem(&sem->rw_sem, name, rwsem_key); - init_waitqueue_head(&sem->writer); + rcuwait_init(&sem->writer); sem->readers_block = 0; return 0; } @@ -103,7 +102,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem) __this_cpu_dec(*sem->read_count); /* Prod writer to recheck readers_active */ - wake_up(&sem->writer); + rcuwait_wake_up(&sem->writer); } EXPORT_SYMBOL_GPL(__percpu_up_read); @@ -160,7 +159,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem) */ /* Wait for all now active readers to complete. */ - wait_event(sem->writer, readers_active_check(sem)); + rcuwait_wait_event(&sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e3b5520005db..e6b2f7ad3e51 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -263,7 +263,7 @@ pv_wait_early(struct pv_node *prev, int loop) if ((loop & PV_PREV_CHECK_MASK) != 0) return false; - return READ_ONCE(prev->state) != vcpu_running; + return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu); } /* diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2f443ed2320a..d340be3a488f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1179,7 +1179,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, * TASK_INTERRUPTIBLE checks for signals and * timeout. Ignored otherwise. */ - if (unlikely(state == TASK_INTERRUPTIBLE)) { + if (likely(state == TASK_INTERRUPTIBLE)) { /* Signal pending? */ if (signal_pending(current)) ret = -EINTR; diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 1591f6b3539f..5eacab880f67 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -128,7 +128,6 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem) void __sched __down_read(struct rw_semaphore *sem) { struct rwsem_waiter waiter; - struct task_struct *tsk; unsigned long flags; raw_spin_lock_irqsave(&sem->wait_lock, flags); @@ -140,13 +139,12 @@ void __sched __down_read(struct rw_semaphore *sem) goto out; } - tsk = current; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); /* set up my own style of waitqueue */ - waiter.task = tsk; + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); + get_task_struct(current); list_add_tail(&waiter.list, &sem->wait_list); @@ -158,10 +156,10 @@ void __sched __down_read(struct rw_semaphore *sem) if (!waiter.task) break; schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } - __set_task_state(tsk, TASK_RUNNING); + __set_current_state(TASK_RUNNING); out: ; } @@ -194,15 +192,13 @@ int __down_read_trylock(struct rw_semaphore *sem) int __sched __down_write_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; - struct task_struct *tsk; unsigned long flags; int ret = 0; raw_spin_lock_irqsave(&sem->wait_lock, flags); /* set up my own style of waitqueue */ - tsk = current; - waiter.task = tsk; + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; list_add_tail(&waiter.list, &sem->wait_list); @@ -220,7 +216,7 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) ret = -EINTR; goto out; } - set_task_state(tsk, state); + set_current_state(state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 631506004f9e..2ad8d8dc3bb1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -224,10 +224,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; - struct task_struct *tsk = current; DEFINE_WAKE_Q(wake_q); - waiter.task = tsk; + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; raw_spin_lock_irq(&sem->wait_lock); @@ -254,13 +253,13 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* wait to be given the lock */ while (true) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!waiter.task) break; schedule(); } - __set_task_state(tsk, TASK_RUNNING); + __set_current_state(TASK_RUNNING); return sem; } EXPORT_SYMBOL(rwsem_down_read_failed); @@ -503,8 +502,6 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) { - DEFINE_WAKE_Q(wake_q); - __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock @@ -514,6 +511,11 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * for attempting rwsem_try_write_lock(). */ wake_up_q(&wake_q); + + /* + * Reinitialize wake_q after use. + */ + wake_q_init(&wake_q); } } else diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index b8120abe594b..9512e37637dc 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -204,19 +204,18 @@ struct semaphore_waiter { static inline int __sched __down_common(struct semaphore *sem, long state, long timeout) { - struct task_struct *task = current; struct semaphore_waiter waiter; list_add_tail(&waiter.list, &sem->wait_list); - waiter.task = task; + waiter.task = current; waiter.up = false; for (;;) { - if (signal_pending_state(state, task)) + if (signal_pending_state(state, current)) goto interrupted; if (unlikely(timeout <= 0)) goto timed_out; - __set_task_state(task, state); + __set_current_state(state); raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index db3ccb1dd614..4b082b5cac9e 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -363,14 +363,6 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_raw_spin_lock_nested); -void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) -{ - __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -} -EXPORT_SYMBOL(_raw_spin_lock_bh_nested); - unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c new file mode 100644 index 000000000000..da6c9a34f62f --- /dev/null +++ b/kernel/locking/test-ww_mutex.c @@ -0,0 +1,646 @@ +/* + * Module-based API test facility for ww_mutexes + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + */ + +#include <linux/kernel.h> + +#include <linux/completion.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/ww_mutex.h> + +static DEFINE_WW_CLASS(ww_class); +struct workqueue_struct *wq; + +struct test_mutex { + struct work_struct work; + struct ww_mutex mutex; + struct completion ready, go, done; + unsigned int flags; +}; + +#define TEST_MTX_SPIN BIT(0) +#define TEST_MTX_TRY BIT(1) +#define TEST_MTX_CTX BIT(2) +#define __TEST_MTX_LAST BIT(3) + +static void test_mutex_work(struct work_struct *work) +{ + struct test_mutex *mtx = container_of(work, typeof(*mtx), work); + + complete(&mtx->ready); + wait_for_completion(&mtx->go); + + if (mtx->flags & TEST_MTX_TRY) { + while (!ww_mutex_trylock(&mtx->mutex)) + cpu_relax(); + } else { + ww_mutex_lock(&mtx->mutex, NULL); + } + complete(&mtx->done); + ww_mutex_unlock(&mtx->mutex); +} + +static int __test_mutex(unsigned int flags) +{ +#define TIMEOUT (HZ / 16) + struct test_mutex mtx; + struct ww_acquire_ctx ctx; + int ret; + + ww_mutex_init(&mtx.mutex, &ww_class); + ww_acquire_init(&ctx, &ww_class); + + INIT_WORK_ONSTACK(&mtx.work, test_mutex_work); + init_completion(&mtx.ready); + init_completion(&mtx.go); + init_completion(&mtx.done); + mtx.flags = flags; + + schedule_work(&mtx.work); + + wait_for_completion(&mtx.ready); + ww_mutex_lock(&mtx.mutex, (flags & TEST_MTX_CTX) ? &ctx : NULL); + complete(&mtx.go); + if (flags & TEST_MTX_SPIN) { + unsigned long timeout = jiffies + TIMEOUT; + + ret = 0; + do { + if (completion_done(&mtx.done)) { + ret = -EINVAL; + break; + } + cpu_relax(); + } while (time_before(jiffies, timeout)); + } else { + ret = wait_for_completion_timeout(&mtx.done, TIMEOUT); + } + ww_mutex_unlock(&mtx.mutex); + ww_acquire_fini(&ctx); + + if (ret) { + pr_err("%s(flags=%x): mutual exclusion failure\n", + __func__, flags); + ret = -EINVAL; + } + + flush_work(&mtx.work); + destroy_work_on_stack(&mtx.work); + return ret; +#undef TIMEOUT +} + +static int test_mutex(void) +{ + int ret; + int i; + + for (i = 0; i < __TEST_MTX_LAST; i++) { + ret = __test_mutex(i); + if (ret) + return ret; + } + + return 0; +} + +static int test_aa(void) +{ + struct ww_mutex mutex; + struct ww_acquire_ctx ctx; + int ret; + + ww_mutex_init(&mutex, &ww_class); + ww_acquire_init(&ctx, &ww_class); + + ww_mutex_lock(&mutex, &ctx); + + if (ww_mutex_trylock(&mutex)) { + pr_err("%s: trylocked itself!\n", __func__); + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + ret = ww_mutex_lock(&mutex, &ctx); + if (ret != -EALREADY) { + pr_err("%s: missed deadlock for recursing, ret=%d\n", + __func__, ret); + if (!ret) + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + ret = 0; +out: + ww_mutex_unlock(&mutex); + ww_acquire_fini(&ctx); + return ret; +} + +struct test_abba { + struct work_struct work; + struct ww_mutex a_mutex; + struct ww_mutex b_mutex; + struct completion a_ready; + struct completion b_ready; + bool resolve; + int result; +}; + +static void test_abba_work(struct work_struct *work) +{ + struct test_abba *abba = container_of(work, typeof(*abba), work); + struct ww_acquire_ctx ctx; + int err; + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&abba->b_mutex, &ctx); + + complete(&abba->b_ready); + wait_for_completion(&abba->a_ready); + + err = ww_mutex_lock(&abba->a_mutex, &ctx); + if (abba->resolve && err == -EDEADLK) { + ww_mutex_unlock(&abba->b_mutex); + ww_mutex_lock_slow(&abba->a_mutex, &ctx); + err = ww_mutex_lock(&abba->b_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(&abba->a_mutex); + ww_mutex_unlock(&abba->b_mutex); + ww_acquire_fini(&ctx); + + abba->result = err; +} + +static int test_abba(bool resolve) +{ + struct test_abba abba; + struct ww_acquire_ctx ctx; + int err, ret; + + ww_mutex_init(&abba.a_mutex, &ww_class); + ww_mutex_init(&abba.b_mutex, &ww_class); + INIT_WORK_ONSTACK(&abba.work, test_abba_work); + init_completion(&abba.a_ready); + init_completion(&abba.b_ready); + abba.resolve = resolve; + + schedule_work(&abba.work); + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&abba.a_mutex, &ctx); + + complete(&abba.a_ready); + wait_for_completion(&abba.b_ready); + + err = ww_mutex_lock(&abba.b_mutex, &ctx); + if (resolve && err == -EDEADLK) { + ww_mutex_unlock(&abba.a_mutex); + ww_mutex_lock_slow(&abba.b_mutex, &ctx); + err = ww_mutex_lock(&abba.a_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(&abba.b_mutex); + ww_mutex_unlock(&abba.a_mutex); + ww_acquire_fini(&ctx); + + flush_work(&abba.work); + destroy_work_on_stack(&abba.work); + + ret = 0; + if (resolve) { + if (err || abba.result) { + pr_err("%s: failed to resolve ABBA deadlock, A err=%d, B err=%d\n", + __func__, err, abba.result); + ret = -EINVAL; + } + } else { + if (err != -EDEADLK && abba.result != -EDEADLK) { + pr_err("%s: missed ABBA deadlock, A err=%d, B err=%d\n", + __func__, err, abba.result); + ret = -EINVAL; + } + } + return ret; +} + +struct test_cycle { + struct work_struct work; + struct ww_mutex a_mutex; + struct ww_mutex *b_mutex; + struct completion *a_signal; + struct completion b_signal; + int result; +}; + +static void test_cycle_work(struct work_struct *work) +{ + struct test_cycle *cycle = container_of(work, typeof(*cycle), work); + struct ww_acquire_ctx ctx; + int err; + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&cycle->a_mutex, &ctx); + + complete(cycle->a_signal); + wait_for_completion(&cycle->b_signal); + + err = ww_mutex_lock(cycle->b_mutex, &ctx); + if (err == -EDEADLK) { + ww_mutex_unlock(&cycle->a_mutex); + ww_mutex_lock_slow(cycle->b_mutex, &ctx); + err = ww_mutex_lock(&cycle->a_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(cycle->b_mutex); + ww_mutex_unlock(&cycle->a_mutex); + ww_acquire_fini(&ctx); + + cycle->result = err; +} + +static int __test_cycle(unsigned int nthreads) +{ + struct test_cycle *cycles; + unsigned int n, last = nthreads - 1; + int ret; + + cycles = kmalloc_array(nthreads, sizeof(*cycles), GFP_KERNEL); + if (!cycles) + return -ENOMEM; + + for (n = 0; n < nthreads; n++) { + struct test_cycle *cycle = &cycles[n]; + + ww_mutex_init(&cycle->a_mutex, &ww_class); + if (n == last) + cycle->b_mutex = &cycles[0].a_mutex; + else + cycle->b_mutex = &cycles[n + 1].a_mutex; + + if (n == 0) + cycle->a_signal = &cycles[last].b_signal; + else + cycle->a_signal = &cycles[n - 1].b_signal; + init_completion(&cycle->b_signal); + + INIT_WORK(&cycle->work, test_cycle_work); + cycle->result = 0; + } + + for (n = 0; n < nthreads; n++) + queue_work(wq, &cycles[n].work); + + flush_workqueue(wq); + + ret = 0; + for (n = 0; n < nthreads; n++) { + struct test_cycle *cycle = &cycles[n]; + + if (!cycle->result) + continue; + + pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n", + n, nthreads, cycle->result); + ret = -EINVAL; + break; + } + + for (n = 0; n < nthreads; n++) + ww_mutex_destroy(&cycles[n].a_mutex); + kfree(cycles); + return ret; +} + +static int test_cycle(unsigned int ncpus) +{ + unsigned int n; + int ret; + + for (n = 2; n <= ncpus + 1; n++) { + ret = __test_cycle(n); + if (ret) + return ret; + } + + return 0; +} + +struct stress { + struct work_struct work; + struct ww_mutex *locks; + int nlocks; + int nloops; +}; + +static int *get_random_order(int count) +{ + int *order; + int n, r, tmp; + + order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY); + if (!order) + return order; + + for (n = 0; n < count; n++) + order[n] = n; + + for (n = count - 1; n > 1; n--) { + r = get_random_int() % (n + 1); + if (r != n) { + tmp = order[n]; + order[n] = order[r]; + order[r] = tmp; + } + } + + return order; +} + +static void dummy_load(struct stress *stress) +{ + usleep_range(1000, 2000); +} + +static void stress_inorder_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + const int nlocks = stress->nlocks; + struct ww_mutex *locks = stress->locks; + struct ww_acquire_ctx ctx; + int *order; + + order = get_random_order(nlocks); + if (!order) + return; + + ww_acquire_init(&ctx, &ww_class); + + do { + int contended = -1; + int n, err; + +retry: + err = 0; + for (n = 0; n < nlocks; n++) { + if (n == contended) + continue; + + err = ww_mutex_lock(&locks[order[n]], &ctx); + if (err < 0) + break; + } + if (!err) + dummy_load(stress); + + if (contended > n) + ww_mutex_unlock(&locks[order[contended]]); + contended = n; + while (n--) + ww_mutex_unlock(&locks[order[n]]); + + if (err == -EDEADLK) { + ww_mutex_lock_slow(&locks[order[contended]], &ctx); + goto retry; + } + + if (err) { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + } while (--stress->nloops); + + ww_acquire_fini(&ctx); + + kfree(order); + kfree(stress); +} + +struct reorder_lock { + struct list_head link; + struct ww_mutex *lock; +}; + +static void stress_reorder_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + LIST_HEAD(locks); + struct ww_acquire_ctx ctx; + struct reorder_lock *ll, *ln; + int *order; + int n, err; + + order = get_random_order(stress->nlocks); + if (!order) + return; + + for (n = 0; n < stress->nlocks; n++) { + ll = kmalloc(sizeof(*ll), GFP_KERNEL); + if (!ll) + goto out; + + ll->lock = &stress->locks[order[n]]; + list_add(&ll->link, &locks); + } + kfree(order); + order = NULL; + + ww_acquire_init(&ctx, &ww_class); + + do { + list_for_each_entry(ll, &locks, link) { + err = ww_mutex_lock(ll->lock, &ctx); + if (!err) + continue; + + ln = ll; + list_for_each_entry_continue_reverse(ln, &locks, link) + ww_mutex_unlock(ln->lock); + + if (err != -EDEADLK) { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + + ww_mutex_lock_slow(ll->lock, &ctx); + list_move(&ll->link, &locks); /* restarts iteration */ + } + + dummy_load(stress); + list_for_each_entry(ll, &locks, link) + ww_mutex_unlock(ll->lock); + } while (--stress->nloops); + + ww_acquire_fini(&ctx); + +out: + list_for_each_entry_safe(ll, ln, &locks, link) + kfree(ll); + kfree(order); + kfree(stress); +} + +static void stress_one_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + const int nlocks = stress->nlocks; + struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks); + int err; + + do { + err = ww_mutex_lock(lock, NULL); + if (!err) { + dummy_load(stress); + ww_mutex_unlock(lock); + } else { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + } while (--stress->nloops); + + kfree(stress); +} + +#define STRESS_INORDER BIT(0) +#define STRESS_REORDER BIT(1) +#define STRESS_ONE BIT(2) +#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) + +static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) +{ + struct ww_mutex *locks; + int n; + + locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL); + if (!locks) + return -ENOMEM; + + for (n = 0; n < nlocks; n++) + ww_mutex_init(&locks[n], &ww_class); + + for (n = 0; nthreads; n++) { + struct stress *stress; + void (*fn)(struct work_struct *work); + + fn = NULL; + switch (n & 3) { + case 0: + if (flags & STRESS_INORDER) + fn = stress_inorder_work; + break; + case 1: + if (flags & STRESS_REORDER) + fn = stress_reorder_work; + break; + case 2: + if (flags & STRESS_ONE) + fn = stress_one_work; + break; + } + + if (!fn) + continue; + + stress = kmalloc(sizeof(*stress), GFP_KERNEL); + if (!stress) + break; + + INIT_WORK(&stress->work, fn); + stress->locks = locks; + stress->nlocks = nlocks; + stress->nloops = nloops; + + queue_work(wq, &stress->work); + nthreads--; + } + + flush_workqueue(wq); + + for (n = 0; n < nlocks; n++) + ww_mutex_destroy(&locks[n]); + kfree(locks); + + return 0; +} + +static int __init test_ww_mutex_init(void) +{ + int ncpus = num_online_cpus(); + int ret; + + wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); + if (!wq) + return -ENOMEM; + + ret = test_mutex(); + if (ret) + return ret; + + ret = test_aa(); + if (ret) + return ret; + + ret = test_abba(false); + if (ret) + return ret; + + ret = test_abba(true); + if (ret) + return ret; + + ret = test_cycle(ncpus); + if (ret) + return ret; + + ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER); + if (ret) + return ret; + + ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER); + if (ret) + return ret; + + ret = stress(4096, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); + if (ret) + return ret; + + return 0; +} + +static void __exit test_ww_mutex_exit(void) +{ + destroy_workqueue(wq); +} + +module_init(test_ww_mutex_init); +module_exit(test_ww_mutex_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/kernel/pid.c b/kernel/pid.c index f66162f2359b..0291804151b5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -68,9 +68,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns, * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, + .kref = KREF_INIT(2), .pidmap = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e85a725e5c34..ad64efe41722 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -77,41 +77,88 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; +void sched_clock_init(void) +{ + sched_clock_running = 1; +} + #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static struct static_key __sched_clock_stable = STATIC_KEY_INIT; -static int __sched_clock_stable_early; +/* + * We must start with !__sched_clock_stable because the unstable -> stable + * transition is accurate, while the stable -> unstable transition is not. + * + * Similarly we start with __sched_clock_stable_early, thereby assuming we + * will become stable, such that there's only a single 1 -> 0 transition. + */ +static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); +static int __sched_clock_stable_early = 1; -int sched_clock_stable(void) +/* + * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset + */ +static __read_mostly u64 raw_offset; +static __read_mostly u64 gtod_offset; + +struct sched_clock_data { + u64 tick_raw; + u64 tick_gtod; + u64 clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) { - return static_key_false(&__sched_clock_stable); + return this_cpu_ptr(&sched_clock_data); } -static void __set_sched_clock_stable(void) +static inline struct sched_clock_data *cpu_sdc(int cpu) { - if (!sched_clock_stable()) - static_key_slow_inc(&__sched_clock_stable); + return &per_cpu(sched_clock_data, cpu); +} - tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); +int sched_clock_stable(void) +{ + return static_branch_likely(&__sched_clock_stable); } -void set_sched_clock_stable(void) +static void __set_sched_clock_stable(void) { - __sched_clock_stable_early = 1; + struct sched_clock_data *scd = this_scd(); - smp_mb(); /* matches sched_clock_init() */ + /* + * Attempt to make the (initial) unstable->stable transition continuous. + */ + raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw); - if (!sched_clock_running) - return; + printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", + scd->tick_gtod, gtod_offset, + scd->tick_raw, raw_offset); - __set_sched_clock_stable(); + static_branch_enable(&__sched_clock_stable); + tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } static void __clear_sched_clock_stable(struct work_struct *work) { - /* XXX worry about clock continuity */ - if (sched_clock_stable()) - static_key_slow_dec(&__sched_clock_stable); + struct sched_clock_data *scd = this_scd(); + + /* + * Attempt to make the stable->unstable transition continuous. + * + * Trouble is, this is typically called from the TSC watchdog + * timer, which is late per definition. This means the tick + * values can already be screwy. + * + * Still do what we can. + */ + gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod); + + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", + scd->tick_gtod, gtod_offset, + scd->tick_raw, raw_offset); + static_branch_disable(&__sched_clock_stable); tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); } @@ -121,47 +168,15 @@ void clear_sched_clock_stable(void) { __sched_clock_stable_early = 0; - smp_mb(); /* matches sched_clock_init() */ - - if (!sched_clock_running) - return; + smp_mb(); /* matches sched_clock_init_late() */ - schedule_work(&sched_clock_work); + if (sched_clock_running == 2) + schedule_work(&sched_clock_work); } -struct sched_clock_data { - u64 tick_raw; - u64 tick_gtod; - u64 clock; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); - -static inline struct sched_clock_data *this_scd(void) +void sched_clock_init_late(void) { - return this_cpu_ptr(&sched_clock_data); -} - -static inline struct sched_clock_data *cpu_sdc(int cpu) -{ - return &per_cpu(sched_clock_data, cpu); -} - -void sched_clock_init(void) -{ - u64 ktime_now = ktime_to_ns(ktime_get()); - int cpu; - - for_each_possible_cpu(cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->tick_raw = 0; - scd->tick_gtod = ktime_now; - scd->clock = ktime_now; - } - - sched_clock_running = 1; - + sched_clock_running = 2; /* * Ensure that it is impossible to not do a static_key update. * @@ -173,8 +188,6 @@ void sched_clock_init(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); - else - __clear_sched_clock_stable(NULL); } /* @@ -216,7 +229,7 @@ again: * scd->tick_gtod + TICK_NSEC); */ - clock = scd->tick_gtod + delta; + clock = scd->tick_gtod + gtod_offset + delta; min_clock = wrap_max(scd->tick_gtod, old_clock); max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); @@ -302,7 +315,7 @@ u64 sched_clock_cpu(int cpu) u64 clock; if (sched_clock_stable()) - return sched_clock(); + return sched_clock() + raw_offset; if (unlikely(!sched_clock_running)) return 0ull; @@ -323,23 +336,22 @@ EXPORT_SYMBOL_GPL(sched_clock_cpu); void sched_clock_tick(void) { struct sched_clock_data *scd; - u64 now, now_gtod; - - if (sched_clock_stable()) - return; - - if (unlikely(!sched_clock_running)) - return; WARN_ON_ONCE(!irqs_disabled()); + /* + * Update these values even if sched_clock_stable(), because it can + * become unstable at any point in time at which point we need some + * values to fall back on. + * + * XXX arguably we can skip this if we expose tsc_clocksource_reliable + */ scd = this_scd(); - now_gtod = ktime_to_ns(ktime_get()); - now = sched_clock(); + scd->tick_raw = sched_clock(); + scd->tick_gtod = ktime_get_ns(); - scd->tick_raw = now; - scd->tick_gtod = now_gtod; - sched_clock_local(scd); + if (!sched_clock_stable() && likely(sched_clock_running)) + sched_clock_local(scd); } /* @@ -366,11 +378,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - u64 sched_clock_cpu(int cpu) { if (unlikely(!sched_clock_running)) @@ -378,6 +385,7 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 8d0f35debf35..f063a25d4449 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -31,7 +31,8 @@ void complete(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); - x->done++; + if (x->done != UINT_MAX) + x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); spin_unlock_irqrestore(&x->wait.lock, flags); } @@ -51,7 +52,7 @@ void complete_all(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; + x->done = UINT_MAX; __wake_up_locked(&x->wait, TASK_NORMAL, 0); spin_unlock_irqrestore(&x->wait.lock, flags); } @@ -79,7 +80,8 @@ do_wait_for_common(struct completion *x, if (!x->done) return timeout; } - x->done--; + if (x->done != UINT_MAX) + x->done--; return timeout ?: 1; } @@ -280,7 +282,7 @@ bool try_wait_for_completion(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - else + else if (x->done != UINT_MAX) x->done--; spin_unlock_irqrestore(&x->wait.lock, flags); return ret; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c56fb57f2991..d01f9d047397 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -102,9 +102,12 @@ void update_rq_clock(struct rq *rq) lockdep_assert_held(&rq->lock); - if (rq->clock_skip_update & RQCF_ACT_SKIP) + if (rq->clock_update_flags & RQCF_ACT_SKIP) return; +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags |= RQCF_UPDATED; +#endif delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; if (delta < 0) return; @@ -185,7 +188,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) rq = task_rq(p); raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, rf); return rq; } raw_spin_unlock(&rq->lock); @@ -225,7 +228,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * pair with the WMB to ensure we must then also see migrating. */ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, rf); return rq; } raw_spin_unlock(&rq->lock); @@ -1195,9 +1198,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); rq = move_queued_task(rq, p, dest_cpu); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } out: task_rq_unlock(rq, p, &rf); @@ -1690,7 +1693,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl * Mark the task runnable and perform wakeup-preemption. */ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - struct pin_cookie cookie) + struct rq_flags *rf) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -1702,9 +1705,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); p->sched_class->task_woken(rq, p); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); } if (rq->idle_stamp) { @@ -1723,7 +1726,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, - struct pin_cookie cookie) + struct rq_flags *rf) { int en_flags = ENQUEUE_WAKEUP; @@ -1738,7 +1741,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, #endif ttwu_activate(rq, p, en_flags); - ttwu_do_wakeup(rq, p, wake_flags, cookie); + ttwu_do_wakeup(rq, p, wake_flags, rf); } /* @@ -1757,7 +1760,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); + ttwu_do_wakeup(rq, p, wake_flags, &rf); ret = 1; } __task_rq_unlock(rq, &rf); @@ -1770,15 +1773,15 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); - struct pin_cookie cookie; struct task_struct *p; unsigned long flags; + struct rq_flags rf; if (!llist) return; raw_spin_lock_irqsave(&rq->lock, flags); - cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, &rf); while (llist) { int wake_flags = 0; @@ -1789,10 +1792,10 @@ void sched_ttwu_pending(void) if (p->sched_remote_wakeup) wake_flags = WF_MIGRATED; - ttwu_do_activate(rq, p, wake_flags, cookie); + ttwu_do_activate(rq, p, wake_flags, &rf); } - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, &rf); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1881,7 +1884,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu) static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); - struct pin_cookie cookie; + struct rq_flags rf; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { @@ -1892,9 +1895,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) #endif raw_spin_lock(&rq->lock); - cookie = lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, wake_flags, cookie); - lockdep_unpin_lock(&rq->lock, cookie); + rq_pin_lock(rq, &rf); + ttwu_do_activate(rq, p, wake_flags, &rf); + rq_unpin_lock(rq, &rf); raw_spin_unlock(&rq->lock); } @@ -2086,11 +2089,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&task_rq(p)->nr_iowait); + } + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + +#else /* CONFIG_SMP */ + + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&task_rq(p)->nr_iowait); + } + #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2111,7 +2127,7 @@ out: * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) +static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) { struct rq *rq = task_rq(p); @@ -2128,11 +2144,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); } if (!(p->state & TASK_NORMAL)) @@ -2140,10 +2156,15 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie trace_sched_waking(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&rq->nr_iowait); + } ttwu_activate(rq, p, ENQUEUE_WAKEUP); + } - ttwu_do_wakeup(rq, p, 0, cookie); + ttwu_do_wakeup(rq, p, 0, rf); ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); @@ -2578,6 +2599,7 @@ void wake_up_new_task(struct task_struct *p) __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); post_init_entity_util_avg(&p->se); activate_task(rq, p, 0); @@ -2590,9 +2612,9 @@ void wake_up_new_task(struct task_struct *p) * Nothing relies on rq->lock after this, so its fine to * drop it. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); p->sched_class->task_woken(rq, p); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } #endif task_rq_unlock(rq, p, &rf); @@ -2861,7 +2883,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next, struct pin_cookie cookie) + struct task_struct *next, struct rq_flags *rf) { struct mm_struct *mm, *oldmm; @@ -2887,13 +2909,16 @@ context_switch(struct rq *rq, struct task_struct *prev, prev->active_mm = NULL; rq->prev_mm = oldmm; } + + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); + /* * Since the runqueue lock will be released by the next * task (which is an invalid locking op but in the case * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@ -2949,6 +2974,36 @@ unsigned long long nr_context_switches(void) return sum; } +/* + * IO-wait accounting, and how its mostly bollocks (on SMP). + * + * The idea behind IO-wait account is to account the idle time that we could + * have spend running if it were not for IO. That is, if we were to improve the + * storage performance, we'd have a proportional reduction in IO-wait time. + * + * This all works nicely on UP, where, when a task blocks on IO, we account + * idle time as IO-wait, because if the storage were faster, it could've been + * running and we'd not be idle. + * + * This has been extended to SMP, by doing the same for each CPU. This however + * is broken. + * + * Imagine for instance the case where two tasks block on one CPU, only the one + * CPU will have IO-wait accounted, while the other has regular idle. Even + * though, if the storage were faster, both could've ran at the same time, + * utilising both CPUs. + * + * This means, that when looking globally, the current IO-wait accounting on + * SMP is a lower bound, by reason of under accounting. + * + * Worse, since the numbers are provided per CPU, they are sometimes + * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly + * associated with any one particular CPU, it can wake to another CPU than it + * blocked on. This means the per CPU IO-wait number is meaningless. + * + * Task CPU affinities can make all that even more 'interesting'. + */ + unsigned long nr_iowait(void) { unsigned long i, sum = 0; @@ -2959,6 +3014,13 @@ unsigned long nr_iowait(void) return sum; } +/* + * Consumers of these two interfaces, like for example the cpufreq menu + * governor are using nonsensical data. Boosting frequency for a CPU that has + * IO-wait which might not even end up running the task when it does become + * runnable. + */ + unsigned long nr_iowait_cpu(int cpu) { struct rq *this = cpu_rq(cpu); @@ -3257,31 +3319,30 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - const struct sched_class *class = &fair_sched_class; + const struct sched_class *class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(prev->sched_class == class && - rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev, cookie); + if (likely(rq->nr_running == rq->cfs.h_nr_running)) { + p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto again; /* assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev, cookie); + p = idle_sched_class.pick_next_task(rq, prev, rf); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev, cookie); + p = class->pick_next_task(rq, prev, rf); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@ -3335,7 +3396,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; - struct pin_cookie cookie; + struct rq_flags rf; struct rq *rq; int cpu; @@ -3358,9 +3419,9 @@ static void __sched notrace __schedule(bool preempt) */ smp_mb__before_spinlock(); raw_spin_lock(&rq->lock); - cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, &rf); - rq->clock_skip_update <<= 1; /* promote REQ to ACT */ + rq->clock_update_flags <<= 1; /* promote REQ to ACT */ switch_count = &prev->nivcsw; if (!preempt && prev->state) { @@ -3370,6 +3431,11 @@ static void __sched notrace __schedule(bool preempt) deactivate_task(rq, prev, DEQUEUE_SLEEP); prev->on_rq = 0; + if (prev->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + /* * If a worker went to sleep, notify and ask workqueue * whether it wants to wake up a task to maintain @@ -3380,7 +3446,7 @@ static void __sched notrace __schedule(bool preempt) to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup, cookie); + try_to_wake_up_local(to_wakeup, &rf); } } switch_count = &prev->nvcsw; @@ -3389,10 +3455,9 @@ static void __sched notrace __schedule(bool preempt) if (task_on_rq_queued(prev)) update_rq_clock(rq); - next = pick_next_task(rq, prev, cookie); + next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); - rq->clock_skip_update = 0; if (likely(prev != next)) { rq->nr_switches++; @@ -3400,9 +3465,10 @@ static void __sched notrace __schedule(bool preempt) ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ + rq = context_switch(rq, prev, next, &rf); /* unlocks the rq */ } else { - lockdep_unpin_lock(&rq->lock, cookie); + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); + rq_unpin_lock(rq, &rf); raw_spin_unlock_irq(&rq->lock); } @@ -3651,6 +3717,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); /* * Idle task boosting is a nono in general. There is one @@ -3747,6 +3814,8 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -4179,6 +4248,7 @@ recheck: * runqueue lock must be held. */ rq = task_rq_lock(p, &rf); + update_rq_clock(rq); /* * Changing the policy of the stop threads its a very bad idea @@ -5057,31 +5127,48 @@ out_irq: } EXPORT_SYMBOL_GPL(yield_to); +int io_schedule_prepare(void) +{ + int old_iowait = current->in_iowait; + + current->in_iowait = 1; + blk_schedule_flush_plug(current); + + return old_iowait; +} + +void io_schedule_finish(int token) +{ + current->in_iowait = token; +} + /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. */ long __sched io_schedule_timeout(long timeout) { - int old_iowait = current->in_iowait; - struct rq *rq; + int token; long ret; - current->in_iowait = 1; - blk_schedule_flush_plug(current); - - delayacct_blkio_start(); - rq = raw_rq(); - atomic_inc(&rq->nr_iowait); + token = io_schedule_prepare(); ret = schedule_timeout(timeout); - current->in_iowait = old_iowait; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); + io_schedule_finish(token); return ret; } EXPORT_SYMBOL(io_schedule_timeout); +void io_schedule(void) +{ + int token; + + token = io_schedule_prepare(); + schedule(); + io_schedule_finish(token); +} +EXPORT_SYMBOL(io_schedule); + /** * sys_sched_get_priority_max - return maximum RT priority. * @policy: scheduling class. @@ -5521,7 +5608,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct pin_cookie cookie; + struct rq_flags rf, old_rf; int dest_cpu; /* @@ -5553,8 +5640,8 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task assumes pinned rq->lock. */ - cookie = lockdep_pin_lock(&rq->lock); - next = pick_next_task(rq, &fake_task, cookie); + rq_pin_lock(rq, &rf); + next = pick_next_task(rq, &fake_task, &rf); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5567,7 +5654,7 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, &rf); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); @@ -5582,6 +5669,13 @@ static void migrate_tasks(struct rq *dead_rq) continue; } + /* + * __migrate_task() may return with a different + * rq->lock held and a new cookie in 'rf', but we need + * to preserve rf::clock_update_flags for 'dead_rq'. + */ + old_rf = rf; + /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); @@ -5590,6 +5684,7 @@ static void migrate_tasks(struct rq *dead_rq) raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); + rf = old_rf; } raw_spin_unlock(&next->pi_lock); } @@ -5881,12 +5976,14 @@ static int init_rootdomain(struct root_domain *rd) init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) - goto free_dlo_mask; + goto free_rto_mask; if (cpupri_init(&rd->cpupri) != 0) - goto free_rto_mask; + goto free_cpudl; return 0; +free_cpudl: + cpudl_cleanup(&rd->cpudl); free_rto_mask: free_cpumask_var(rd->rto_mask); free_dlo_mask: @@ -7487,6 +7584,7 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_init_smt(); + sched_clock_init_late(); sched_smp_initialized = true; } @@ -7502,6 +7600,7 @@ early_initcall(migration_init); void __init sched_init_smp(void) { sched_init_granularity(); + sched_clock_init_late(); } #endif /* CONFIG_SMP */ @@ -7545,6 +7644,8 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; + sched_clock_init(); + for (i = 0; i < WAIT_TABLE_SIZE; i++) init_waitqueue_head(bit_wait_table + i); @@ -7983,6 +8084,7 @@ void sched_move_task(struct task_struct *tsk) struct rq *rq; rq = task_rq_lock(tsk, &rf); + update_rq_clock(rq); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); @@ -8431,6 +8533,7 @@ static void cpu_cgroup_fork(struct task_struct *task) rq = task_rq_lock(task, &rf); + update_rq_clock(rq); sched_change_group(task, TASK_SET_GROUP); task_rq_unlock(rq, task, &rf); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7700a9cba335..f7c14cc71d06 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -151,7 +151,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update */ -static void account_guest_time(struct task_struct *p, cputime_t cputime) +void account_guest_time(struct task_struct *p, cputime_t cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; @@ -176,8 +176,8 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) * @cputime: the cpu time spent in kernel space since the last update * @index: pointer to cpustat field that has to be updated */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, int index) +void account_system_index_time(struct task_struct *p, + cputime_t cputime, enum cpu_usage_stat index) { /* Add system time to process. */ p->stime += cputime; @@ -213,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else index = CPUTIME_SYSTEM; - __account_system_time(p, cputime, index); + account_system_index_time(p, cputime, index); } /* @@ -400,7 +400,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime, CPUTIME_SOFTIRQ); + account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime); } else if (p == rq->idle) { @@ -408,7 +408,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, } else if (p->flags & PF_VCPU) { /* System time or guest time */ account_guest_time(p, cputime); } else { - __account_system_time(p, cputime, CPUTIME_SYSTEM); + account_system_index_time(p, cputime, CPUTIME_SYSTEM); } } @@ -437,9 +437,7 @@ void vtime_common_task_switch(struct task_struct *prev) else vtime_account_system(prev); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - vtime_account_user(prev); -#endif + vtime_flush(prev); arch_vtime_task_switch(prev); } #endif diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 70ef2b1901e4..27737f34757d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -663,9 +663,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Nothing relies on rq->lock after this, so its safe to drop * rq->lock. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); push_dl_task(rq); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } #endif @@ -1118,7 +1118,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, } struct task_struct * -pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -1133,9 +1133,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); pull_dl_task(rq); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); /* * pull_dl_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to @@ -1729,12 +1729,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); -#else +#endif if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); -#endif } } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index fa178b62ea79..109adc0e9cb9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -953,6 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #endif P(policy); P(prio); + if (p->policy == SCHED_DEADLINE) { + P(dl.runtime); + P(dl.deadline); + } #undef PN_SCHEDSTAT #undef PN #undef __PN diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6559d197e08a..274c747a01ce 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2657,6 +2657,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) if (tg_weight) shares /= tg_weight; + /* + * MIN_SHARES has to be unscaled here to support per-CPU partitioning + * of a group with small tg->shares value. It is a floor value which is + * assigned as a minimum load.weight to the sched_entity representing + * the group on a CPU. + * + * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024 + * on an 8-core system with 8 tasks each runnable on one CPU shares has + * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In + * case no task is runnable on a CPU MIN_SHARES=2 should be returned + * instead of 0. + */ if (shares < MIN_SHARES) shares = MIN_SHARES; if (shares > tg->shares) @@ -2689,16 +2701,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) + return; + + if (throttled_hierarchy(cfs_rq)) return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2707,8 +2723,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3424,7 +3441,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq); +static int idle_balance(struct rq *this_rq, struct rq_flags *rf); #else /* CONFIG_SMP */ @@ -3453,7 +3470,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq) +static inline int idle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -3582,10 +3599,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its weight to reflect the new share of + * its group cfs_rq + * - Add its new weight to cfs_rq->load.weight + */ update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); @@ -3657,6 +3682,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Substract its load from the cfs_rq->runnable_avg. + * - Substract its previous weight from cfs_rq->load.weight. + * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); @@ -3681,7 +3715,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -3864,7 +3898,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. */ update_load_avg(curr, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -4761,7 +4795,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -4820,7 +4854,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -6213,7 +6247,7 @@ preempt: } static struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -6320,15 +6354,8 @@ simple: return p; idle: - /* - * This is OK, because current is on_cpu, which avoids it being picked - * for load-balance and preemption/IRQs are still disabled avoiding - * further scheduler activity on it and we're being very careful to - * re-start the picking loop. - */ - lockdep_unpin_lock(&rq->lock, cookie); - new_tasks = idle_balance(rq); - lockdep_repin_lock(&rq->lock, cookie); + new_tasks = idle_balance(rq, rf); + /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -8077,6 +8104,7 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + update_rq_clock(busiest); /* * cur_ld_moved - load moved in current iteration @@ -8297,7 +8325,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static int idle_balance(struct rq *this_rq) +static int idle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -8311,6 +8339,14 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. + */ + rq_unpin_lock(this_rq, rf); + if (this_rq->avg_idle < sysctl_sched_migration_cost || !this_rq->rd->overload) { rcu_read_lock(); @@ -8388,6 +8424,8 @@ out: if (pulled_task) this_rq->idle_stamp = 0; + rq_repin_lock(this_rq, rf); + return pulled_task; } @@ -8443,6 +8481,7 @@ static int active_load_balance_cpu_stop(void *data) }; schedstat_inc(sd->alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { @@ -9264,6 +9303,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); @@ -9356,8 +9396,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Possible calls to update_curr() need rq clock */ update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 5405d3feb112..0c00172db63e 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl } static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { put_prev_task(rq, prev); update_idle_core(rq); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2516b8df6dbb..8bd30db5577b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1523,7 +1523,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } static struct task_struct * -pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; @@ -1535,9 +1535,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie coo * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); pull_rt_task(rq); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a dl or stop task can slip in, in which case we need @@ -2198,10 +2198,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) queue_push_tasks(rq); -#else +#endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio) resched_curr(rq); -#endif /* CONFIG_SMP */ } } @@ -2246,6 +2245,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) } } +#ifdef CONFIG_POSIX_TIMERS static void watchdog(struct rq *rq, struct task_struct *p) { unsigned long soft, hard; @@ -2267,6 +2267,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) p->cputime_expires.sched_exp = p->se.sum_exec_runtime; } } +#else +static inline void watchdog(struct rq *rq, struct task_struct *p) { } +#endif static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7b34c7826ca5..6eeae7ebd99b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -644,7 +644,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - unsigned int clock_skip_update; + unsigned int clock_update_flags; u64 clock; u64 clock_task; @@ -768,28 +768,110 @@ static inline u64 __rq_clock_broken(struct rq *rq) return READ_ONCE(rq->clock); } +/* + * rq::clock_update_flags bits + * + * %RQCF_REQ_SKIP - will request skipping of clock update on the next + * call to __schedule(). This is an optimisation to avoid + * neighbouring rq clock updates. + * + * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is + * in effect and calls to update_rq_clock() are being ignored. + * + * %RQCF_UPDATED - is a debug flag that indicates whether a call has been + * made to update_rq_clock() since the last time rq::lock was pinned. + * + * If inside of __schedule(), clock_update_flags will have been + * shifted left (a left shift is a cheap operation for the fast path + * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, + * + * if (rq-clock_update_flags >= RQCF_UPDATED) + * + * to check if %RQCF_UPADTED is set. It'll never be shifted more than + * one position though, because the next rq_unpin_lock() will shift it + * back. + */ +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 + +static inline void assert_clock_updated(struct rq *rq) +{ + /* + * The only reason for not seeing a clock update since the + * last rq_pin_lock() is if we're currently skipping updates. + */ + SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); +} + static inline u64 rq_clock(struct rq *rq) { lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + return rq->clock_task; } -#define RQCF_REQ_SKIP 0x01 -#define RQCF_ACT_SKIP 0x02 - static inline void rq_clock_skip_update(struct rq *rq, bool skip) { lockdep_assert_held(&rq->lock); if (skip) - rq->clock_skip_update |= RQCF_REQ_SKIP; + rq->clock_update_flags |= RQCF_REQ_SKIP; else - rq->clock_skip_update &= ~RQCF_REQ_SKIP; + rq->clock_update_flags &= ~RQCF_REQ_SKIP; +} + +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +#ifdef CONFIG_SCHED_DEBUG + /* + * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the + * current pin context is stashed here in case it needs to be + * restored in rq_repin_lock(). + */ + unsigned int clock_update_flags; +#endif +}; + +static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) +{ + rf->cookie = lockdep_pin_lock(&rq->lock); + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; +#endif +} + +static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SCHED_DEBUG + if (rq->clock_update_flags > RQCF_ACT_SKIP) + rf->clock_update_flags = RQCF_UPDATED; +#endif + + lockdep_unpin_lock(&rq->lock, rf->cookie); +} + +static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) +{ + lockdep_repin_lock(&rq->lock, rf->cookie); + +#ifdef CONFIG_SCHED_DEBUG + /* + * Restore the value we stashed in @rf for this pin context. + */ + rq->clock_update_flags |= rf->clock_update_flags; +#endif } #ifdef CONFIG_NUMA @@ -1245,7 +1327,7 @@ struct sched_class { */ struct task_struct * (*pick_next_task) (struct rq *rq, struct task_struct *prev, - struct pin_cookie cookie); + struct rq_flags *rf); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP @@ -1501,11 +1583,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -struct rq_flags { - unsigned long flags; - struct pin_cookie cookie; -}; - struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) @@ -1515,7 +1592,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { - lockdep_unpin_lock(&rq->lock, rf->cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); } @@ -1524,7 +1601,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - lockdep_unpin_lock(&rq->lock, rf->cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 34659a853505..c69a9870ab79 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -172,18 +172,19 @@ sched_info_switch(struct rq *rq, */ /** - * cputimer_running - return true if cputimer is running + * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running * * @tsk: Pointer to target task. */ -static inline bool cputimer_running(struct task_struct *tsk) - +#ifdef CONFIG_POSIX_TIMERS +static inline +struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; /* Check if cputimer isn't running. This is accessed without locking. */ if (!READ_ONCE(cputimer->running)) - return false; + return NULL; /* * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime @@ -200,10 +201,17 @@ static inline bool cputimer_running(struct task_struct *tsk) * clock delta is behind the expiring timer value. */ if (unlikely(!tsk->sighand)) - return false; + return NULL; - return true; + return cputimer; +} +#else +static inline +struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) +{ + return NULL; } +#endif /** * account_group_user_time - Maintain utime for a thread group. @@ -218,9 +226,9 @@ static inline bool cputimer_running(struct task_struct *tsk) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); - if (!cputimer_running(tsk)) + if (!cputimer) return; atomic64_add(cputime, &cputimer->cputime_atomic.utime); @@ -239,9 +247,9 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); - if (!cputimer_running(tsk)) + if (!cputimer) return; atomic64_add(cputime, &cputimer->cputime_atomic.stime); @@ -260,9 +268,9 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); - if (!cputimer_running(tsk)) + if (!cputimer) return; atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 604297a08b3a..9f69fb630853 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) } static struct task_struct * -pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *stop = rq->stop; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 665985b0a89a..93621ae718d3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -141,6 +141,10 @@ static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; + + if (cs->mark_unstable) + cs->mark_unstable(cs); + if (finished_booting) schedule_work(&watchdog_work); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index db087d7e106d..95b258dd75db 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1275,27 +1275,8 @@ error: /* even if we error out, we forwarded the time, so call update */ } EXPORT_SYMBOL(timekeeping_inject_offset); - -/** - * timekeeping_get_tai_offset - Returns current TAI offset from UTC - * - */ -s32 timekeeping_get_tai_offset(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; - s32 ret; - - do { - seq = read_seqcount_begin(&tk_core.seq); - ret = tk->tai_offset; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - return ret; -} - /** - * __timekeeping_set_tai_offset - Lock free worker function + * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic * */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) @@ -1305,24 +1286,6 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) } /** - * timekeeping_set_tai_offset - Sets the current TAI offset from UTC - * - */ -void timekeeping_set_tai_offset(s32 tai_offset) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - __timekeeping_set_tai_offset(tk, tai_offset); - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - clock_was_set(); -} - -/** * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 704f595ce83f..d0914676d4c5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -11,8 +11,6 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern int timekeeping_inject_offset(struct timespec *ts); -extern s32 timekeeping_get_tai_offset(void); -extern void timekeeping_set_tai_offset(s32 tai_offset); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c98e977dc157..d3f67f7a6260 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1183,6 +1183,18 @@ config LOCK_TORTURE_TEST Say M if you want these torture tests to build as a module. Say N if you are unsure. +config WW_MUTEX_SELFTEST + tristate "Wait/wound mutex selftests" + help + This option provides a kernel module that runs tests on the + on the struct ww_mutex locking API. + + It is recommended to enable DEBUG_WW_MUTEX_SLOWPATH in conjunction + with this test harness. + + Say M if you want these self tests to build as a module. + Say N if you are unsure. + endmenu # lock debugging config TRACE_IRQFLAGS diff --git a/lib/timerqueue.c b/lib/timerqueue.c index adc6ee0a5126..4a720ed4fdaf 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -80,8 +80,7 @@ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) if (head->next == node) { struct rb_node *rbn = rb_next(&node->node); - head->next = rbn ? - rb_entry(rbn, struct timerqueue_node, node) : NULL; + head->next = rb_entry_safe(rbn, struct timerqueue_node, node); } rb_erase(&node->node, &head->head); RB_CLEAR_NODE(&node->node); diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 1904a93f47d5..d491529332f4 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -920,7 +920,7 @@ static void chan_close_cb(struct l2cap_chan *chan) BT_DBG("dev %p removing %speer %p", dev, last ? "last " : "1 ", peer); BT_DBG("chan %p orig refcnt %d", chan, - atomic_read(&chan->kref.refcount)); + kref_read(&chan->kref)); l2cap_chan_put(chan); break; diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 5f123c3320a7..f0095fd79818 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -810,7 +810,7 @@ static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked) /* AMP Manager functions */ struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr) { - BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount)); + BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref)); kref_get(&mgr->kref); @@ -833,7 +833,7 @@ static void amp_mgr_destroy(struct kref *kref) int amp_mgr_put(struct amp_mgr *mgr) { - BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount)); + BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref)); return kref_put(&mgr->kref, &_mgr_destroy); } diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index e32f34189007..02a4ccc04e1e 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -24,7 +24,7 @@ void amp_ctrl_get(struct amp_ctrl *ctrl) { BT_DBG("ctrl %p orig refcnt %d", ctrl, - atomic_read(&ctrl->kref.refcount)); + kref_read(&ctrl->kref)); kref_get(&ctrl->kref); } @@ -42,7 +42,7 @@ static void amp_ctrl_destroy(struct kref *kref) int amp_ctrl_put(struct amp_ctrl *ctrl) { BT_DBG("ctrl %p orig refcnt %d", ctrl, - atomic_read(&ctrl->kref.refcount)); + kref_read(&ctrl->kref)); return kref_put(&ctrl->kref, &_ctrl_destroy); } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ce0b5dd01953..fc7f321a3823 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -481,14 +481,14 @@ static void l2cap_chan_destroy(struct kref *kref) void l2cap_chan_hold(struct l2cap_chan *c) { - BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount)); + BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref)); kref_get(&c->kref); } void l2cap_chan_put(struct l2cap_chan *c) { - BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount)); + BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref)); kref_put(&c->kref, l2cap_chan_destroy); } diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 6f3b5754cc7e..38dcf1eb427d 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3439,7 +3439,7 @@ static void ceph_msg_release(struct kref *kref) struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, - atomic_read(&msg->kref.refcount)); + kref_read(&msg->kref)); kref_get(&msg->kref); return msg; } @@ -3448,7 +3448,7 @@ EXPORT_SYMBOL(ceph_msg_get); void ceph_msg_put(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, - atomic_read(&msg->kref.refcount)); + kref_read(&msg->kref)); kref_put(&msg->kref, ceph_msg_release); } EXPORT_SYMBOL(ceph_msg_put); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 842f049abb86..f3378ba1a828 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -438,7 +438,7 @@ static void ceph_osdc_release_request(struct kref *kref) void ceph_osdc_get_request(struct ceph_osd_request *req) { dout("%s %p (was %d)\n", __func__, req, - atomic_read(&req->r_kref.refcount)); + kref_read(&req->r_kref)); kref_get(&req->r_kref); } EXPORT_SYMBOL(ceph_osdc_get_request); @@ -447,7 +447,7 @@ void ceph_osdc_put_request(struct ceph_osd_request *req) { if (req) { dout("%s %p (was %d)\n", __func__, req, - atomic_read(&req->r_kref.refcount)); + kref_read(&req->r_kref)); kref_put(&req->r_kref, ceph_osdc_release_request); } } @@ -487,11 +487,11 @@ static void request_reinit(struct ceph_osd_request *req) struct ceph_msg *reply_msg = req->r_reply; dout("%s req %p\n", __func__, req); - WARN_ON(atomic_read(&req->r_kref.refcount) != 1); + WARN_ON(kref_read(&req->r_kref) != 1); request_release_checks(req); - WARN_ON(atomic_read(&request_msg->kref.refcount) != 1); - WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1); + WARN_ON(kref_read(&request_msg->kref) != 1); + WARN_ON(kref_read(&reply_msg->kref) != 1); target_destroy(&req->r_t); request_init(req); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 502b9fe9817b..948d837ca9e4 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1358,7 +1358,7 @@ static int c_show(struct seq_file *m, void *p) ifdebug(CACHE) seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", convert_to_wallclock(cp->expiry_time), - atomic_read(&cp->ref.refcount), cp->flags); + kref_read(&cp->ref), cp->flags); cache_get(cp); if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 9c9db55a0c1e..7bfe1fb42add 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) svc_xprt_get(xprt); dprintk("svc: transport %p dequeued, inuse=%d\n", - xprt, atomic_read(&xprt->xpt_ref.refcount)); + xprt, kref_read(&xprt->xpt_ref)); } spin_unlock_bh(&pool->sp_lock); out: @@ -822,7 +822,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", rqstp, rqstp->rq_pool->sp_id, xprt, - atomic_read(&xprt->xpt_ref.refcount)); + kref_read(&xprt->xpt_ref)); rqstp->rq_deferred = svc_deferred_dequeue(xprt); if (rqstp->rq_deferred) len = svc_deferred_recv(rqstp); @@ -980,7 +980,7 @@ static void svc_age_temp_xprts(unsigned long closure) * through, close it. */ if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) continue; - if (atomic_read(&xprt->xpt_ref.refcount) > 1 || + if (kref_read(&xprt->xpt_ref) > 1 || test_bit(XPT_BUSY, &xprt->xpt_flags)) continue; list_del_init(le); diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index e112da8005b5..bb8db3cb8032 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -126,13 +126,18 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister); static struct hlist_head auth_domain_table[DN_HASHMAX]; static DEFINE_SPINLOCK(auth_domain_lock); +static void auth_domain_release(struct kref *kref) +{ + struct auth_domain *dom = container_of(kref, struct auth_domain, ref); + + hlist_del(&dom->hash); + dom->flavour->domain_release(dom); + spin_unlock(&auth_domain_lock); +} + void auth_domain_put(struct auth_domain *dom) { - if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) { - hlist_del(&dom->hash); - dom->flavour->domain_release(dom); - spin_unlock(&auth_domain_lock); - } + kref_put_lock(&dom->ref, auth_domain_release, &auth_domain_lock); } EXPORT_SYMBOL_GPL(auth_domain_put); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index ab2fd5377f94..96c9287f123f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1200,9 +1200,9 @@ static void __svc_rdma_free(struct work_struct *work) ib_drain_qp(rdma->sc_qp); /* We should only be called from kref_put */ - if (atomic_read(&xprt->xpt_ref.refcount) != 0) + if (kref_read(&xprt->xpt_ref) != 0) pr_err("svcrdma: sc_xprt still in use? (%d)\n", - atomic_read(&xprt->xpt_ref.refcount)); + kref_read(&xprt->xpt_ref)); /* * Destroy queued, but not processed read completions. Note diff --git a/security/apparmor/include/lib.h b/security/apparmor/include/lib.h index fa281c272970..65ff492a9807 100644 --- a/security/apparmor/include/lib.h +++ b/security/apparmor/include/lib.h @@ -76,12 +76,6 @@ static inline void *kvzalloc(size_t size) return __aa_kvmalloc(size, __GFP_ZERO); } -/* returns 0 if kref not incremented */ -static inline int kref_get_not0(struct kref *kref) -{ - return atomic_inc_not_zero(&kref->refcount); -} - /** * aa_strneq - compare null terminated @str to a non null terminated substring * @str: a null terminated string diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 93b1b1f440d3..67bc96afe541 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -240,7 +240,7 @@ static inline struct aa_profile *aa_get_profile(struct aa_profile *p) */ static inline struct aa_profile *aa_get_profile_not0(struct aa_profile *p) { - if (p && kref_get_not0(&p->count)) + if (p && kref_get_unless_zero(&p->count)) return p; return NULL; @@ -260,7 +260,7 @@ static inline struct aa_profile *aa_get_profile_rcu(struct aa_profile __rcu **p) rcu_read_lock(); do { c = rcu_dereference(*p); - } while (c && !kref_get_not0(&c->count)); + } while (c && !kref_get_unless_zero(&c->count)); rcu_read_unlock(); return c; diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index cddd5d06e1cb..3603556fa0d9 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -280,6 +280,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..14458658e988 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,8 @@ #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 #define EXIT_REASON_EOI_INDUCED 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 @@ -113,6 +115,8 @@ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ + { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ { EXIT_REASON_INVEPT, "INVEPT" }, \ @@ -129,6 +133,7 @@ { EXIT_REASON_XRSTORS, "XRSTORS" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 +#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 84e6b35da4bd..e6cd62b1264b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -28,6 +28,7 @@ #include <fcntl.h> #include <errno.h> #include <asm/unistd.h> +#include <linux/err.h> #include <linux/kernel.h> #include <linux/bpf.h> #include <linux/list.h> @@ -779,7 +780,7 @@ static int bpf_program__collect_reloc(struct bpf_program *prog, size_t nr_maps, GElf_Shdr *shdr, Elf_Data *data, Elf_Data *symbols, - int maps_shndx) + int maps_shndx, struct bpf_map *maps) { int i, nrels; @@ -829,7 +830,15 @@ bpf_program__collect_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__RELOC; } - map_idx = sym.st_value / sizeof(struct bpf_map_def); + /* TODO: 'maps' is sorted. We can use bsearch to make it faster. */ + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + if (maps[map_idx].offset == sym.st_value) { + pr_debug("relocation: find map %zd (%s) for insn %u\n", + map_idx, maps[map_idx].name, insn_idx); + break; + } + } + if (map_idx >= nr_maps) { pr_warning("bpf relocation: map_idx %d large than %d\n", (int)map_idx, (int)nr_maps - 1); @@ -953,7 +962,8 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) err = bpf_program__collect_reloc(prog, nr_maps, shdr, data, obj->efile.symbols, - obj->efile.maps_shndx); + obj->efile.maps_shndx, + obj->maps); if (err) return err; } @@ -1419,37 +1429,33 @@ static void bpf_program__set_type(struct bpf_program *prog, prog->type = type; } -int bpf_program__set_tracepoint(struct bpf_program *prog) -{ - if (!prog) - return -EINVAL; - bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT); - return 0; -} - -int bpf_program__set_kprobe(struct bpf_program *prog) -{ - if (!prog) - return -EINVAL; - bpf_program__set_type(prog, BPF_PROG_TYPE_KPROBE); - return 0; -} - static bool bpf_program__is_type(struct bpf_program *prog, enum bpf_prog_type type) { return prog ? (prog->type == type) : false; } -bool bpf_program__is_tracepoint(struct bpf_program *prog) -{ - return bpf_program__is_type(prog, BPF_PROG_TYPE_TRACEPOINT); -} - -bool bpf_program__is_kprobe(struct bpf_program *prog) -{ - return bpf_program__is_type(prog, BPF_PROG_TYPE_KPROBE); -} +#define BPF_PROG_TYPE_FNS(NAME, TYPE) \ +int bpf_program__set_##NAME(struct bpf_program *prog) \ +{ \ + if (!prog) \ + return -EINVAL; \ + bpf_program__set_type(prog, TYPE); \ + return 0; \ +} \ + \ +bool bpf_program__is_##NAME(struct bpf_program *prog) \ +{ \ + return bpf_program__is_type(prog, TYPE); \ +} \ + +BPF_PROG_TYPE_FNS(socket_filter, BPF_PROG_TYPE_SOCKET_FILTER); +BPF_PROG_TYPE_FNS(kprobe, BPF_PROG_TYPE_KPROBE); +BPF_PROG_TYPE_FNS(sched_cls, BPF_PROG_TYPE_SCHED_CLS); +BPF_PROG_TYPE_FNS(sched_act, BPF_PROG_TYPE_SCHED_ACT); +BPF_PROG_TYPE_FNS(tracepoint, BPF_PROG_TYPE_TRACEPOINT); +BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP); +BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT); int bpf_map__fd(struct bpf_map *map) { @@ -1537,3 +1543,10 @@ bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset) } return ERR_PTR(-ENOENT); } + +long libbpf_get_error(const void *ptr) +{ + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + return 0; +} diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index a5a8b86a06fe..4014d1ba5e3d 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -22,8 +22,8 @@ #define __BPF_LIBBPF_H #include <stdio.h> +#include <stdint.h> #include <stdbool.h> -#include <linux/err.h> #include <sys/types.h> // for size_t enum libbpf_errno { @@ -174,11 +174,21 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n); /* * Adjust type of bpf program. Default is kprobe. */ +int bpf_program__set_socket_filter(struct bpf_program *prog); int bpf_program__set_tracepoint(struct bpf_program *prog); int bpf_program__set_kprobe(struct bpf_program *prog); +int bpf_program__set_sched_cls(struct bpf_program *prog); +int bpf_program__set_sched_act(struct bpf_program *prog); +int bpf_program__set_xdp(struct bpf_program *prog); +int bpf_program__set_perf_event(struct bpf_program *prog); +bool bpf_program__is_socket_filter(struct bpf_program *prog); bool bpf_program__is_tracepoint(struct bpf_program *prog); bool bpf_program__is_kprobe(struct bpf_program *prog); +bool bpf_program__is_sched_cls(struct bpf_program *prog); +bool bpf_program__is_sched_act(struct bpf_program *prog); +bool bpf_program__is_xdp(struct bpf_program *prog); +bool bpf_program__is_perf_event(struct bpf_program *prog); /* * We don't need __attribute__((packed)) now since it is @@ -224,4 +234,6 @@ int bpf_map__set_priv(struct bpf_map *map, void *priv, bpf_map_clear_priv_t clear_priv); void *bpf_map__priv(struct bpf_map *map); +long libbpf_get_error(const void *ptr); + #endif diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h index 11c3be3bcce7..f054ca1b899d 100644 --- a/tools/lib/subcmd/parse-options.h +++ b/tools/lib/subcmd/parse-options.h @@ -1,6 +1,7 @@ #ifndef __SUBCMD_PARSE_OPTIONS_H #define __SUBCMD_PARSE_OPTIONS_H +#include <linux/kernel.h> #include <stdbool.h> #include <stdint.h> @@ -132,32 +133,32 @@ struct option { #define OPT_UINTEGER(s, l, v, h) { .type = OPTION_UINTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h) } #define OPT_LONG(s, l, v, h) { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, long *), .help = (h) } #define OPT_U64(s, l, v, h) { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = check_vtype(v, u64 *), .help = (h) } -#define OPT_STRING(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h) } +#define OPT_STRING(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), .argh = (a), .help = (h) } #define OPT_STRING_OPTARG(s, l, v, a, h, d) \ { .type = OPTION_STRING, .short_name = (s), .long_name = (l), \ - .value = check_vtype(v, const char **), (a), .help = (h), \ + .value = check_vtype(v, const char **), .argh =(a), .help = (h), \ .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) } #define OPT_STRING_OPTARG_SET(s, l, v, os, a, h, d) \ { .type = OPTION_STRING, .short_name = (s), .long_name = (l), \ - .value = check_vtype(v, const char **), (a), .help = (h), \ + .value = check_vtype(v, const char **), .argh = (a), .help = (h), \ .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d), \ .set = check_vtype(os, bool *)} -#define OPT_STRING_NOEMPTY(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY} +#define OPT_STRING_NOEMPTY(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), .argh = (a), .help = (h), .flags = PARSE_OPT_NOEMPTY} #define OPT_DATE(s, l, v, h) \ { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb } #define OPT_CALLBACK(s, l, v, a, h, f) \ - { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f) } + { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = (a), .help = (h), .callback = (f) } #define OPT_CALLBACK_NOOPT(s, l, v, a, h, f) \ - { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .flags = PARSE_OPT_NOARG } + { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = (a), .help = (h), .callback = (f), .flags = PARSE_OPT_NOARG } #define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d) \ - { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d, .flags = PARSE_OPT_LASTARG_DEFAULT } + { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = (a), .help = (h), .callback = (f), .defval = (intptr_t)d, .flags = PARSE_OPT_LASTARG_DEFAULT } #define OPT_CALLBACK_DEFAULT_NOOPT(s, l, v, a, h, f, d) \ { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l),\ - .value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d,\ + .value = (v), .arg = (a), .help = (h), .callback = (f), .defval = (intptr_t)d,\ .flags = PARSE_OPT_LASTARG_DEFAULT | PARSE_OPT_NOARG} #define OPT_CALLBACK_OPTARG(s, l, v, d, a, h, f) \ { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), \ - .value = (v), (a), .help = (h), .callback = (f), \ + .value = (v), .argh = (a), .help = (h), .callback = (f), \ .flags = PARSE_OPT_OPTARG, .data = (d) } /* parse_options() will filter out the processed options and leave the diff --git a/tools/perf/Build b/tools/perf/Build index b12d5d1666e3..9b79f8d7db50 100644 --- a/tools/perf/Build +++ b/tools/perf/Build @@ -3,10 +3,12 @@ perf-y += builtin-annotate.o perf-y += builtin-config.o perf-y += builtin-diff.o perf-y += builtin-evlist.o +perf-y += builtin-ftrace.o perf-y += builtin-help.o perf-y += builtin-sched.o perf-y += builtin-buildid-list.o perf-y += builtin-buildid-cache.o +perf-y += builtin-kallsyms.o perf-y += builtin-list.o perf-y += builtin-record.o perf-y += builtin-report.o @@ -39,8 +41,7 @@ CFLAGS_builtin-help.o += $(paths) CFLAGS_builtin-timechart.o += $(paths) CFLAGS_perf.o += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))" \ -DPERF_EXEC_PATH="BUILD_STR($(perfexecdir_SQ))" \ - -DPREFIX="BUILD_STR($(prefix_SQ))" \ - -include $(OUTPUT)PERF-VERSION-FILE + -DPREFIX="BUILD_STR($(prefix_SQ))" CFLAGS_builtin-trace.o += -DSTRACE_GROUPS_DIR="BUILD_STR($(STRACE_GROUPS_DIR_SQ))" CFLAGS_builtin-report.o += -DTIPDIR="BUILD_STR($(tipdir_SQ))" CFLAGS_builtin-report.o += -DDOCDIR="BUILD_STR($(srcdir_SQ)/Documentation)" diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt index 3f06730c7f47..2da07e51e119 100644 --- a/tools/perf/Documentation/perf-c2c.txt +++ b/tools/perf/Documentation/perf-c2c.txt @@ -248,7 +248,7 @@ output fields set for caheline offsets output: Code address, Code symbol, Shared Object, Source line dso - coalesced by shared object -By default the coalescing is setup with 'pid,tid,iaddr'. +By default the coalescing is setup with 'pid,iaddr'. STDIO OUTPUT ------------ diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt new file mode 100644 index 000000000000..2d96de6132a9 --- /dev/null +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -0,0 +1,36 @@ +perf-ftrace(1) +============= + +NAME +---- +perf-ftrace - simple wrapper for kernel's ftrace functionality + + +SYNOPSIS +-------- +[verse] +'perf ftrace' <command> + +DESCRIPTION +----------- +The 'perf ftrace' command is a simple wrapper of kernel's ftrace +functionality. It only supports single thread tracing currently and +just reads trace_pipe in text and then write it to stdout. + +The following options apply to perf ftrace. + +OPTIONS +------- + +-t:: +--tracer=:: + Tracer to use: function_graph or function. + +-v:: +--verbose=:: + Verbosity level. + + +SEE ALSO +-------- +linkperf:perf-record[1], linkperf:perf-trace[1] diff --git a/tools/perf/Documentation/perf-kallsyms.txt b/tools/perf/Documentation/perf-kallsyms.txt new file mode 100644 index 000000000000..954ea9e21236 --- /dev/null +++ b/tools/perf/Documentation/perf-kallsyms.txt @@ -0,0 +1,24 @@ +perf-kallsyms(1) +============== + +NAME +---- +perf-kallsyms - Searches running kernel for symbols + +SYNOPSIS +-------- +[verse] +'perf kallsyms <options> symbol_name[,symbol_name...]' + +DESCRIPTION +----------- +This command searches the running kernel kallsyms file for the given symbol(s) +and prints information about it, including the DSO, the kallsyms begin/end +addresses and the addresses in the ELF kallsyms symbol table (for symbols in +modules). + +OPTIONS +------- +-v:: +--verbose=:: + Increase verbosity level, showing details about symbol table loading, etc. diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 5054d9147f0f..27256bc68eda 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -421,9 +421,19 @@ Configure all used events to run in user space. --timestamp-filename Append timestamp to output file name. ---switch-output:: +--switch-output[=mode]:: Generate multiple perf.data files, timestamp prefixed, switching to a new one -when receiving a SIGUSR2. +based on 'mode' value: + "signal" - when receiving a SIGUSR2 (default value) or + <size> - when reaching the size threshold, size is expected to + be a number with appended unit character - B/K/M/G + <time> - when reaching the time threshold, size is expected to + be a number with appended unit character - s/m/h/d + + Note: the precision of the size threshold hugely depends + on your configuration - the number and size of your ring + buffers (-m). It is generally more precise for higher sizes + (like >5M), for lower values expect different sizes. A possible use case is to, given an external event, slice the perf.data file that gets then processed, possibly via a perf script, to decide if that diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 76173969ab80..d33deddb0146 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -143,6 +143,8 @@ OPTIONS for 'perf sched timehist' stop time is not given (i.e, time string is 'x.y,') then analysis goes to end of file. +--state:: + Show task state when it switched out. SEE ALSO -------- diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 5dc5c6a09ac4..4ed5f239ba7d 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -36,7 +36,7 @@ There are several variants of perf script: 'perf script report <script> [args]' to run and display the results of <script>. <script> is the name displayed in the output of 'perf - trace --list' i.e. the actual script name minus any language + script --list' i.e. the actual script name minus any language extension. The perf.data output from a previous run of 'perf script record <script>' is used and should be present for this command to succeed. [args] refers to the (mainly optional) args expected by @@ -76,7 +76,7 @@ OPTIONS Any command you can specify in a shell. -D:: ---dump-raw-script=:: +--dump-raw-trace=:: Display verbose dump of the trace data. -L:: diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 781b019751a4..afd728672b6f 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -35,7 +35,10 @@ OPTIONS -e:: --expr:: - List of syscalls to show, currently only syscall names. +--event:: + List of syscalls and other perf events (tracepoints, HW cache events, + etc) to show. + See 'perf list' for a complete list of events. Prefixing with ! shows all syscalls but the ones specified. You may need to escape it. @@ -135,9 +138,6 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. --kernel-syscall-graph:: Show the kernel callchains on the syscall exit path. ---event:: - Trace other events, see 'perf list' for a complete list. - --max-stack:: Set the stack depth limit when parsing the callchain, anything beyond the specified depth will be ignored. Note that at this point diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 76c84f0eec52..03cf947755b9 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -291,8 +291,10 @@ else endif endif ifneq ($(feature-dwarf), 1) - msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev); - NO_DWARF := 1 + ifndef NO_DWARF + msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev); + NO_DWARF := 1 + endif else ifneq ($(feature-dwarf_getlocations), 1) msg := $(warning Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.157); diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 8bb16aa9d661..4da19b6ba94a 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -661,6 +661,7 @@ ifndef NO_PERF_READ_VDSOX32 endif ifndef NO_JVMTI $(call QUIET_INSTALL, $(LIBJVMTI)) \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(libdir_SQ)'; \ $(INSTALL) $(OUTPUT)$(LIBJVMTI) '$(DESTDIR_SQ)$(libdir_SQ)'; endif $(call QUIET_INSTALL, libexec) \ diff --git a/tools/perf/arch/arm64/include/dwarf-regs-table.h b/tools/perf/arch/arm64/include/dwarf-regs-table.h index 26759363f921..36e375f5a211 100644 --- a/tools/perf/arch/arm64/include/dwarf-regs-table.h +++ b/tools/perf/arch/arm64/include/dwarf-regs-table.h @@ -2,12 +2,12 @@ /* This is included in perf/util/dwarf-regs.c */ static const char * const aarch64_regstr_tbl[] = { - "%r0", "%r1", "%r2", "%r3", "%r4", - "%r5", "%r6", "%r7", "%r8", "%r9", - "%r10", "%r11", "%r12", "%r13", "%r14", - "%r15", "%r16", "%r17", "%r18", "%r19", - "%r20", "%r21", "%r22", "%r23", "%r24", - "%r25", "%r26", "%r27", "%r28", "%r29", + "%x0", "%x1", "%x2", "%x3", "%x4", + "%x5", "%x6", "%x7", "%x8", "%x9", + "%x10", "%x11", "%x12", "%x13", "%x14", + "%x15", "%x16", "%x17", "%x18", "%x19", + "%x20", "%x21", "%x22", "%x23", "%x24", + "%x25", "%x26", "%x27", "%x28", "%x29", "%lr", "%sp", }; #endif diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index f8ca7a4ebabc..e2b21723bbf8 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -58,7 +58,7 @@ struct c2c_hist_entry { struct hist_entry he; }; -static char const *coalesce_default = "pid,tid,iaddr"; +static char const *coalesce_default = "pid,iaddr"; struct perf_c2c { struct perf_tool tool; @@ -2476,6 +2476,7 @@ static int build_cl_output(char *cl_sort, bool no_source) "mean_rmt," "mean_lcl," "mean_load," + "tot_recs," "cpucnt,", add_sym ? "symbol," : "", add_dso ? "dso," : "", diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c new file mode 100644 index 000000000000..d05658d2b8f1 --- /dev/null +++ b/tools/perf/builtin-ftrace.c @@ -0,0 +1,243 @@ +/* + * builtin-ftrace.c + * + * Copyright (c) 2013 LG Electronics, Namhyung Kim <namhyung@kernel.org> + * + * Released under the GPL v2. + */ + +#include "builtin.h" +#include "perf.h" + +#include <unistd.h> +#include <signal.h> + +#include "debug.h" +#include <subcmd/parse-options.h> +#include "evlist.h" +#include "target.h" +#include "thread_map.h" + + +#define DEFAULT_TRACER "function_graph" + +struct perf_ftrace { + struct perf_evlist *evlist; + struct target target; + const char *tracer; +}; + +static bool done; + +static void sig_handler(int sig __maybe_unused) +{ + done = true; +} + +/* + * perf_evlist__prepare_workload will send a SIGUSR1 if the fork fails, since + * we asked by setting its exec_error to the function below, + * ftrace__workload_exec_failed_signal. + * + * XXX We need to handle this more appropriately, emitting an error, etc. + */ +static void ftrace__workload_exec_failed_signal(int signo __maybe_unused, + siginfo_t *info __maybe_unused, + void *ucontext __maybe_unused) +{ + /* workload_exec_errno = info->si_value.sival_int; */ + done = true; +} + +static int write_tracing_file(const char *name, const char *val) +{ + char *file; + int fd, ret = -1; + ssize_t size = strlen(val); + + file = get_tracing_file(name); + if (!file) { + pr_debug("cannot get tracing file: %s\n", name); + return -1; + } + + fd = open(file, O_WRONLY); + if (fd < 0) { + pr_debug("cannot open tracing file: %s\n", name); + goto out; + } + + if (write(fd, val, size) == size) + ret = 0; + else + pr_debug("write '%s' to tracing/%s failed\n", val, name); + + close(fd); +out: + put_tracing_file(file); + return ret; +} + +static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused) +{ + if (write_tracing_file("tracing_on", "0") < 0) + return -1; + + if (write_tracing_file("current_tracer", "nop") < 0) + return -1; + + if (write_tracing_file("set_ftrace_pid", " ") < 0) + return -1; + + return 0; +} + +static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) +{ + char *trace_file; + int trace_fd; + char *trace_pid; + char buf[4096]; + struct pollfd pollfd = { + .events = POLLIN, + }; + + if (geteuid() != 0) { + pr_err("ftrace only works for root!\n"); + return -1; + } + + if (argc < 1) + return -1; + + signal(SIGINT, sig_handler); + signal(SIGUSR1, sig_handler); + signal(SIGCHLD, sig_handler); + + reset_tracing_files(ftrace); + + /* reset ftrace buffer */ + if (write_tracing_file("trace", "0") < 0) + goto out; + + if (perf_evlist__prepare_workload(ftrace->evlist, &ftrace->target, + argv, false, ftrace__workload_exec_failed_signal) < 0) + goto out; + + if (write_tracing_file("current_tracer", ftrace->tracer) < 0) { + pr_err("failed to set current_tracer to %s\n", ftrace->tracer); + goto out; + } + + if (asprintf(&trace_pid, "%d", thread_map__pid(ftrace->evlist->threads, 0)) < 0) { + pr_err("failed to allocate pid string\n"); + goto out; + } + + if (write_tracing_file("set_ftrace_pid", trace_pid) < 0) { + pr_err("failed to set pid: %s\n", trace_pid); + goto out_free_pid; + } + + trace_file = get_tracing_file("trace_pipe"); + if (!trace_file) { + pr_err("failed to open trace_pipe\n"); + goto out_free_pid; + } + + trace_fd = open(trace_file, O_RDONLY); + + put_tracing_file(trace_file); + + if (trace_fd < 0) { + pr_err("failed to open trace_pipe\n"); + goto out_free_pid; + } + + fcntl(trace_fd, F_SETFL, O_NONBLOCK); + pollfd.fd = trace_fd; + + if (write_tracing_file("tracing_on", "1") < 0) { + pr_err("can't enable tracing\n"); + goto out_close_fd; + } + + perf_evlist__start_workload(ftrace->evlist); + + while (!done) { + if (poll(&pollfd, 1, -1) < 0) + break; + + if (pollfd.revents & POLLIN) { + int n = read(trace_fd, buf, sizeof(buf)); + if (n < 0) + break; + if (fwrite(buf, n, 1, stdout) != 1) + break; + } + } + + write_tracing_file("tracing_on", "0"); + + /* read remaining buffer contents */ + while (true) { + int n = read(trace_fd, buf, sizeof(buf)); + if (n <= 0) + break; + if (fwrite(buf, n, 1, stdout) != 1) + break; + } + +out_close_fd: + close(trace_fd); +out_free_pid: + free(trace_pid); +out: + reset_tracing_files(ftrace); + + return done ? 0 : -1; +} + +int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused) +{ + int ret; + struct perf_ftrace ftrace = { + .tracer = "function_graph", + .target = { .uid = UINT_MAX, }, + }; + const char * const ftrace_usage[] = { + "perf ftrace [<options>] <command>", + "perf ftrace [<options>] -- <command> [<options>]", + NULL + }; + const struct option ftrace_options[] = { + OPT_STRING('t', "tracer", &ftrace.tracer, "tracer", + "tracer to use: function_graph(default) or function"), + OPT_INCR('v', "verbose", &verbose, + "be more verbose"), + OPT_END() + }; + + argc = parse_options(argc, argv, ftrace_options, ftrace_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (!argc) + usage_with_options(ftrace_usage, ftrace_options); + + ftrace.evlist = perf_evlist__new(); + if (ftrace.evlist == NULL) + return -ENOMEM; + + ret = perf_evlist__create_maps(ftrace.evlist, &ftrace.target); + if (ret < 0) + goto out_delete_evlist; + + if (ftrace.tracer == NULL) + ftrace.tracer = DEFAULT_TRACER; + + ret = __cmd_ftrace(&ftrace, argc, argv); + +out_delete_evlist: + perf_evlist__delete(ftrace.evlist); + + return ret; +} diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index 3bdb2c78a21b..93da24a638be 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -434,7 +434,7 @@ int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused) const char * const builtin_help_subcommands[] = { "buildid-cache", "buildid-list", "diff", "evlist", "help", "list", "record", "report", "bench", "stat", "timechart", "top", "annotate", - "script", "sched", "kmem", "lock", "kvm", "test", "inject", "mem", "data", + "script", "sched", "kallsyms", "kmem", "lock", "kvm", "test", "inject", "mem", "data", #ifdef HAVE_LIBELF_SUPPORT "probe", #endif diff --git a/tools/perf/builtin-kallsyms.c b/tools/perf/builtin-kallsyms.c new file mode 100644 index 000000000000..224bfc454b4a --- /dev/null +++ b/tools/perf/builtin-kallsyms.c @@ -0,0 +1,67 @@ +/* + * builtin-kallsyms.c + * + * Builtin command: Look for a symbol in the running kernel and its modules + * + * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> + * + * Released under the GPL v2. (and only v2, not any later version) + */ +#include "builtin.h" +#include <linux/compiler.h> +#include <subcmd/parse-options.h> +#include "debug.h" +#include "machine.h" +#include "symbol.h" + +static int __cmd_kallsyms(int argc, const char **argv) +{ + int i; + struct machine *machine = machine__new_kallsyms(); + + if (machine == NULL) { + pr_err("Couldn't read /proc/kallsyms\n"); + return -1; + } + + for (i = 0; i < argc; ++i) { + struct map *map; + struct symbol *symbol = machine__find_kernel_function_by_name(machine, argv[i], &map); + + if (symbol == NULL) { + printf("%s: not found\n", argv[i]); + continue; + } + + printf("%s: %s %s %#" PRIx64 "-%#" PRIx64 " (%#" PRIx64 "-%#" PRIx64")\n", + symbol->name, map->dso->short_name, map->dso->long_name, + map->unmap_ip(map, symbol->start), map->unmap_ip(map, symbol->end), + symbol->start, symbol->end); + } + + machine__delete(machine); + return 0; +} + +int cmd_kallsyms(int argc, const char **argv, const char *prefix __maybe_unused) +{ + const struct option options[] = { + OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), + OPT_END() + }; + const char * const kallsyms_usage[] = { + "perf kallsyms [<options>] symbol_name", + NULL + }; + + argc = parse_options(argc, argv, options, kallsyms_usage, 0); + if (argc < 1) + usage_with_options(kallsyms_usage, options); + + symbol_conf.sort_by_name = true; + symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL); + if (symbol__init(NULL) < 0) + return -1; + + return __cmd_kallsyms(argc, argv); +} diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 4ec10e9427d9..33a9eaaf9db4 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -46,6 +46,15 @@ #include <asm/bug.h> #include <linux/time64.h> +struct switch_output { + bool enabled; + bool signal; + unsigned long size; + unsigned long time; + const char *str; + bool set; +}; + struct record { struct perf_tool tool; struct record_opts opts; @@ -62,10 +71,33 @@ struct record { bool no_buildid_cache_set; bool buildid_all; bool timestamp_filename; - bool switch_output; + struct switch_output switch_output; unsigned long long samples; }; +static volatile int auxtrace_record__snapshot_started; +static DEFINE_TRIGGER(auxtrace_snapshot_trigger); +static DEFINE_TRIGGER(switch_output_trigger); + +static bool switch_output_signal(struct record *rec) +{ + return rec->switch_output.signal && + trigger_is_ready(&switch_output_trigger); +} + +static bool switch_output_size(struct record *rec) +{ + return rec->switch_output.size && + trigger_is_ready(&switch_output_trigger) && + (rec->bytes_written >= rec->switch_output.size); +} + +static bool switch_output_time(struct record *rec) +{ + return rec->switch_output.time && + trigger_is_ready(&switch_output_trigger); +} + static int record__write(struct record *rec, void *bf, size_t size) { if (perf_data_file__write(rec->session->file, bf, size) < 0) { @@ -74,6 +106,10 @@ static int record__write(struct record *rec, void *bf, size_t size) } rec->bytes_written += size; + + if (switch_output_size(rec)) + trigger_hit(&switch_output_trigger); + return 0; } @@ -193,10 +229,6 @@ static volatile int done; static volatile int signr = -1; static volatile int child_finished; -static volatile int auxtrace_record__snapshot_started; -static DEFINE_TRIGGER(auxtrace_snapshot_trigger); -static DEFINE_TRIGGER(switch_output_trigger); - static void sig_handler(int sig) { if (sig == SIGCHLD) @@ -712,6 +744,7 @@ static void workload_exec_failed_signal(int signo __maybe_unused, } static void snapshot_sig_handler(int sig); +static void alarm_sig_handler(int sig); int __weak perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused, @@ -842,11 +875,11 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) signal(SIGTERM, sig_handler); signal(SIGSEGV, sigsegv_handler); - if (rec->opts.auxtrace_snapshot_mode || rec->switch_output) { + if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) { signal(SIGUSR2, snapshot_sig_handler); if (rec->opts.auxtrace_snapshot_mode) trigger_on(&auxtrace_snapshot_trigger); - if (rec->switch_output) + if (rec->switch_output.enabled) trigger_on(&switch_output_trigger); } else { signal(SIGUSR2, SIG_IGN); @@ -1043,6 +1076,10 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) err = fd; goto out_child; } + + /* re-arm the alarm */ + if (rec->switch_output.time) + alarm(rec->switch_output.time); } if (hits == rec->samples) { @@ -1352,6 +1389,78 @@ out_free: return ret; } +static void switch_output_size_warn(struct record *rec) +{ + u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages); + struct switch_output *s = &rec->switch_output; + + wakeup_size /= 2; + + if (s->size < wakeup_size) { + char buf[100]; + + unit_number__scnprintf(buf, sizeof(buf), wakeup_size); + pr_warning("WARNING: switch-output data size lower than " + "wakeup kernel buffer size (%s) " + "expect bigger perf.data sizes\n", buf); + } +} + +static int switch_output_setup(struct record *rec) +{ + struct switch_output *s = &rec->switch_output; + static struct parse_tag tags_size[] = { + { .tag = 'B', .mult = 1 }, + { .tag = 'K', .mult = 1 << 10 }, + { .tag = 'M', .mult = 1 << 20 }, + { .tag = 'G', .mult = 1 << 30 }, + { .tag = 0 }, + }; + static struct parse_tag tags_time[] = { + { .tag = 's', .mult = 1 }, + { .tag = 'm', .mult = 60 }, + { .tag = 'h', .mult = 60*60 }, + { .tag = 'd', .mult = 60*60*24 }, + { .tag = 0 }, + }; + unsigned long val; + + if (!s->set) + return 0; + + if (!strcmp(s->str, "signal")) { + s->signal = true; + pr_debug("switch-output with SIGUSR2 signal\n"); + goto enabled; + } + + val = parse_tag_value(s->str, tags_size); + if (val != (unsigned long) -1) { + s->size = val; + pr_debug("switch-output with %s size threshold\n", s->str); + goto enabled; + } + + val = parse_tag_value(s->str, tags_time); + if (val != (unsigned long) -1) { + s->time = val; + pr_debug("switch-output with %s time threshold (%lu seconds)\n", + s->str, s->time); + goto enabled; + } + + return -1; + +enabled: + rec->timestamp_filename = true; + s->enabled = true; + + if (s->size && !rec->opts.no_buffering) + switch_output_size_warn(rec); + + return 0; +} + static const char * const __record_usage[] = { "perf record [<options>] [<command>]", "perf record [<options>] -- <command> [<options>]", @@ -1519,8 +1628,10 @@ static struct option __record_options[] = { "Record build-id of all DSOs regardless of hits"), OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename, "append timestamp to output filename"), - OPT_BOOLEAN(0, "switch-output", &record.switch_output, - "Switch output when receive SIGUSR2"), + OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str, + &record.switch_output.set, "signal,size,time", + "Switch output when receive SIGUSR2 or cross size,time threshold", + "signal"), OPT_BOOLEAN(0, "dry-run", &dry_run, "Parse options then exit"), OPT_END() @@ -1578,8 +1689,15 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) return -EINVAL; } - if (rec->switch_output) - rec->timestamp_filename = true; + if (switch_output_setup(rec)) { + parse_options_usage(record_usage, record_options, "switch-output", 0); + return -EINVAL; + } + + if (rec->switch_output.time) { + signal(SIGALRM, alarm_sig_handler); + alarm(rec->switch_output.time); + } if (!rec->itr) { rec->itr = auxtrace_record__init(rec->evlist, &err); @@ -1629,7 +1747,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) if (rec->no_buildid_cache || rec->no_buildid) { disable_buildid_cache(); - } else if (rec->switch_output) { + } else if (rec->switch_output.enabled) { /* * In 'perf record --switch-output', disable buildid * generation by default to reduce data file switching @@ -1721,6 +1839,8 @@ out: static void snapshot_sig_handler(int sig __maybe_unused) { + struct record *rec = &record; + if (trigger_is_ready(&auxtrace_snapshot_trigger)) { trigger_hit(&auxtrace_snapshot_trigger); auxtrace_record__snapshot_started = 1; @@ -1728,6 +1848,14 @@ static void snapshot_sig_handler(int sig __maybe_unused) trigger_error(&auxtrace_snapshot_trigger); } - if (trigger_is_ready(&switch_output_trigger)) + if (switch_output_signal(rec)) + trigger_hit(&switch_output_trigger); +} + +static void alarm_sig_handler(int sig __maybe_unused) +{ + struct record *rec = &record; + + if (switch_output_time(rec)) trigger_hit(&switch_output_trigger); } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 5b134b0d1ff3..daceb3202200 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -77,6 +77,22 @@ struct sched_atom { #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP" +/* task state bitmask, copied from include/linux/sched.h */ +#define TASK_RUNNING 0 +#define TASK_INTERRUPTIBLE 1 +#define TASK_UNINTERRUPTIBLE 2 +#define __TASK_STOPPED 4 +#define __TASK_TRACED 8 +/* in tsk->exit_state */ +#define EXIT_DEAD 16 +#define EXIT_ZOMBIE 32 +#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) +/* in tsk->state again */ +#define TASK_DEAD 64 +#define TASK_WAKEKILL 128 +#define TASK_WAKING 256 +#define TASK_PARKED 512 + enum thread_state { THREAD_SLEEPING = 0, THREAD_WAIT_CPU, @@ -206,6 +222,7 @@ struct perf_sched { bool show_cpu_visual; bool show_wakeups; bool show_migrations; + bool show_state; u64 skipped_samples; const char *time_str; struct perf_time_interval ptime; @@ -216,13 +233,20 @@ struct perf_sched { struct thread_runtime { u64 last_time; /* time of previous sched in/out event */ u64 dt_run; /* run time */ - u64 dt_wait; /* time between CPU access (off cpu) */ + u64 dt_sleep; /* time between CPU access by sleep (off cpu) */ + u64 dt_iowait; /* time between CPU access by iowait (off cpu) */ + u64 dt_preempt; /* time between CPU access by preempt (off cpu) */ u64 dt_delay; /* time between wakeup and sched-in */ u64 ready_to_run; /* time of wakeup */ struct stats run_stats; u64 total_run_time; + u64 total_sleep_time; + u64 total_iowait_time; + u64 total_preempt_time; + u64 total_delay_time; + int last_state; u64 migrations; }; @@ -1821,6 +1845,9 @@ static void timehist_header(struct perf_sched *sched) printf(" %-*s %9s %9s %9s", comm_width, "task name", "wait time", "sch delay", "run time"); + if (sched->show_state) + printf(" %s", "state"); + printf("\n"); /* @@ -1831,9 +1858,14 @@ static void timehist_header(struct perf_sched *sched) if (sched->show_cpu_visual) printf(" %*s ", ncpus, ""); - printf(" %-*s %9s %9s %9s\n", comm_width, + printf(" %-*s %9s %9s %9s", comm_width, "[tid/pid]", "(msec)", "(msec)", "(msec)"); + if (sched->show_state) + printf(" %5s", ""); + + printf("\n"); + /* * separator */ @@ -1846,18 +1878,34 @@ static void timehist_header(struct perf_sched *sched) graph_dotted_line, graph_dotted_line, graph_dotted_line, graph_dotted_line); + if (sched->show_state) + printf(" %.5s", graph_dotted_line); + printf("\n"); } +static char task_state_char(struct thread *thread, int state) +{ + static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + unsigned bit = state ? ffs(state) : 0; + + /* 'I' for idle */ + if (thread->tid == 0) + return 'I'; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + static void timehist_print_sample(struct perf_sched *sched, struct perf_sample *sample, struct addr_location *al, struct thread *thread, - u64 t) + u64 t, int state) { struct thread_runtime *tr = thread__priv(thread); u32 max_cpus = sched->max_cpu + 1; char tstr[64]; + u64 wait_time; timestamp__scnprintf_usec(t, tstr, sizeof(tstr)); printf("%15s [%04d] ", tstr, sample->cpu); @@ -1880,10 +1928,15 @@ static void timehist_print_sample(struct perf_sched *sched, printf(" %-*s ", comm_width, timehist_get_commstr(thread)); - print_sched_time(tr->dt_wait, 6); + wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt; + print_sched_time(wait_time, 6); + print_sched_time(tr->dt_delay, 6); print_sched_time(tr->dt_run, 6); + if (sched->show_state) + printf(" %5c ", task_state_char(thread, state)); + if (sched->show_wakeups) printf(" %-*s", comm_width, ""); @@ -1930,8 +1983,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, u64 t, u64 tprev) { r->dt_delay = 0; - r->dt_wait = 0; + r->dt_sleep = 0; + r->dt_iowait = 0; + r->dt_preempt = 0; r->dt_run = 0; + if (tprev) { r->dt_run = t - tprev; if (r->ready_to_run) { @@ -1943,12 +1999,25 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, if (r->last_time > tprev) pr_debug("time travel: last sched out time for task > previous sched_switch event\n"); - else if (r->last_time) - r->dt_wait = tprev - r->last_time; + else if (r->last_time) { + u64 dt_wait = tprev - r->last_time; + + if (r->last_state == TASK_RUNNING) + r->dt_preempt = dt_wait; + else if (r->last_state == TASK_UNINTERRUPTIBLE) + r->dt_iowait = dt_wait; + else + r->dt_sleep = dt_wait; + } } update_stats(&r->run_stats, r->dt_run); - r->total_run_time += r->dt_run; + + r->total_run_time += r->dt_run; + r->total_delay_time += r->dt_delay; + r->total_sleep_time += r->dt_sleep; + r->total_iowait_time += r->dt_iowait; + r->total_preempt_time += r->dt_preempt; } static bool is_idle_sample(struct perf_sample *sample, @@ -2373,6 +2442,8 @@ static int timehist_sched_change_event(struct perf_tool *tool, struct thread_runtime *tr = NULL; u64 tprev, t = sample->time; int rc = 0; + int state = perf_evsel__intval(evsel, sample, "prev_state"); + if (machine__resolve(machine, &al, sample) < 0) { pr_err("problem processing %d event. skipping it\n", @@ -2447,8 +2518,10 @@ static int timehist_sched_change_event(struct perf_tool *tool, * time. we only care total run time and run stat. */ last_tr->dt_run = 0; - last_tr->dt_wait = 0; last_tr->dt_delay = 0; + last_tr->dt_sleep = 0; + last_tr->dt_iowait = 0; + last_tr->dt_preempt = 0; if (itr->cursor.nr) callchain_append(&itr->callchain, &itr->cursor, t - tprev); @@ -2458,7 +2531,7 @@ static int timehist_sched_change_event(struct perf_tool *tool, } if (!sched->summary_only) - timehist_print_sample(sched, sample, &al, thread, t); + timehist_print_sample(sched, sample, &al, thread, t, state); out: if (sched->hist_time.start == 0 && t >= ptime->start) @@ -2470,6 +2543,9 @@ out: /* time of this sched_switch event becomes last time task seen */ tr->last_time = sample->time; + /* last state is used to determine where to account wait time */ + tr->last_state = state; + /* sched out event for task so reset ready to run time */ tr->ready_to_run = 0; } @@ -2526,7 +2602,26 @@ static void print_thread_runtime(struct thread *t, printf("\n"); } +static void print_thread_waittime(struct thread *t, + struct thread_runtime *r) +{ + printf("%*s %5d %9" PRIu64 " ", + comm_width, timehist_get_commstr(t), t->ppid, + (u64) r->run_stats.n); + + print_sched_time(r->total_run_time, 8); + print_sched_time(r->total_sleep_time, 6); + printf(" "); + print_sched_time(r->total_iowait_time, 6); + printf(" "); + print_sched_time(r->total_preempt_time, 6); + printf(" "); + print_sched_time(r->total_delay_time, 6); + printf("\n"); +} + struct total_run_stats { + struct perf_sched *sched; u64 sched_count; u64 task_count; u64 total_run_time; @@ -2545,7 +2640,11 @@ static int __show_thread_runtime(struct thread *t, void *priv) stats->task_count++; stats->sched_count += r->run_stats.n; stats->total_run_time += r->total_run_time; - print_thread_runtime(t, r); + + if (stats->sched->show_state) + print_thread_waittime(t, r); + else + print_thread_runtime(t, r); } return 0; @@ -2633,18 +2732,24 @@ static void timehist_print_summary(struct perf_sched *sched, u64 hist_time = sched->hist_time.end - sched->hist_time.start; memset(&totals, 0, sizeof(totals)); + totals.sched = sched; if (sched->idle_hist) { printf("\nIdle-time summary\n"); printf("%*s parent sched-out ", comm_width, "comm"); printf(" idle-time min-idle avg-idle max-idle stddev migrations\n"); + } else if (sched->show_state) { + printf("\nWait-time summary\n"); + printf("%*s parent sched-in ", comm_width, "comm"); + printf(" run-time sleep iowait preempt delay\n"); } else { printf("\nRuntime summary\n"); printf("%*s parent sched-in ", comm_width, "comm"); printf(" run-time min-run avg-run max-run stddev migrations\n"); } printf("%*s (count) ", comm_width, ""); - printf(" (msec) (msec) (msec) (msec) %%\n"); + printf(" (msec) (msec) (msec) (msec) %s\n", + sched->show_state ? "(msec)" : "%"); printf("%.117s\n", graph_dotted_line); machine__for_each_thread(m, show_thread_runtime, &totals); @@ -3240,6 +3345,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"), OPT_STRING(0, "time", &sched.time_str, "str", "Time span for analysis (start,stop)"), + OPT_BOOLEAN(0, "state", &sched.show_state, "Show task state when sched-out"), OPT_PARENT(sched_options) }; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 2f3ff69fc4e7..c0783b4f7b6c 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2180,7 +2180,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) "Show the mmap events"), OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events, "Show context switch events (if recorded)"), - OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"), + OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"), OPT_BOOLEAN(0, "ns", &nanosecs, "Use 9 decimal places when displaying time"), OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts", @@ -2212,6 +2212,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) PARSE_OPT_STOP_AT_NON_OPTION); file.path = input_name; + file.force = symbol_conf.force; if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) { rec_script_path = get_script_path(argv[1], RECORD_SUFFIX); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 206bf72b77fc..40ef9b293d1b 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -40,6 +40,7 @@ #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */ #include <stdlib.h> +#include <string.h> #include <linux/err.h> #include <linux/filter.h> #include <linux/audit.h> @@ -2699,6 +2700,91 @@ static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler) evsel->handler = handler; } +/* + * XXX: Hackish, just splitting the combined -e+--event (syscalls + * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use + * existing facilities unchanged (trace->ev_qualifier + parse_options()). + * + * It'd be better to introduce a parse_options() variant that would return a + * list with the terms it didn't match to an event... + */ +static int trace__parse_events_option(const struct option *opt, const char *str, + int unset __maybe_unused) +{ + struct trace *trace = (struct trace *)opt->value; + const char *s = str; + char *sep = NULL, *lists[2] = { NULL, NULL, }; + int len = strlen(str), err = -1, list; + char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); + char group_name[PATH_MAX]; + + if (strace_groups_dir == NULL) + return -1; + + if (*s == '!') { + ++s; + trace->not_ev_qualifier = true; + } + + while (1) { + if ((sep = strchr(s, ',')) != NULL) + *sep = '\0'; + + list = 0; + if (syscalltbl__id(trace->sctbl, s) >= 0) { + list = 1; + } else { + path__join(group_name, sizeof(group_name), strace_groups_dir, s); + if (access(group_name, R_OK) == 0) + list = 1; + } + + if (lists[list]) { + sprintf(lists[list] + strlen(lists[list]), ",%s", s); + } else { + lists[list] = malloc(len); + if (lists[list] == NULL) + goto out; + strcpy(lists[list], s); + } + + if (!sep) + break; + + *sep = ','; + s = sep + 1; + } + + if (lists[1] != NULL) { + struct strlist_config slist_config = { + .dirname = strace_groups_dir, + }; + + trace->ev_qualifier = strlist__new(lists[1], &slist_config); + if (trace->ev_qualifier == NULL) { + fputs("Not enough memory to parse event qualifier", trace->output); + goto out; + } + + if (trace__validate_ev_qualifier(trace)) + goto out; + } + + err = 0; + + if (lists[0]) { + struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event", + "event selector. use 'perf list' to list available events", + parse_events_option); + err = parse_events_option(&o, lists[0], 0); + } +out: + if (sep) + *sep = ','; + + return err; +} + int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) { const char *trace_usage[] = { @@ -2730,15 +2816,15 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) .max_stack = UINT_MAX, }; const char *output_name = NULL; - const char *ev_qualifier_str = NULL; const struct option trace_options[] = { - OPT_CALLBACK(0, "event", &trace.evlist, "event", - "event selector. use 'perf list' to list available events", - parse_events_option), + OPT_CALLBACK('e', "event", &trace, "event", + "event/syscall selector. use 'perf list' to list available events", + trace__parse_events_option), OPT_BOOLEAN(0, "comm", &trace.show_comm, "show the thread COMM next to its id"), OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"), - OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"), + OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace", + trace__parse_events_option), OPT_STRING('o', "output", &output_name, "file", "output file name"), OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"), OPT_STRING('p', "pid", &trace.opts.target.pid, "pid", @@ -2863,7 +2949,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) return -1; } - if (!trace.trace_syscalls && ev_qualifier_str) { + if (!trace.trace_syscalls && trace.ev_qualifier) { pr_err("The -e option can't be used with --no-syscalls.\n"); goto out; } @@ -2878,28 +2964,6 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) trace.open_id = syscalltbl__id(trace.sctbl, "open"); - if (ev_qualifier_str != NULL) { - const char *s = ev_qualifier_str; - struct strlist_config slist_config = { - .dirname = system_path(STRACE_GROUPS_DIR), - }; - - trace.not_ev_qualifier = *s == '!'; - if (trace.not_ev_qualifier) - ++s; - trace.ev_qualifier = strlist__new(s, &slist_config); - if (trace.ev_qualifier == NULL) { - fputs("Not enough memory to parse event qualifier", - trace.output); - err = -ENOMEM; - goto out_close; - } - - err = trace__validate_ev_qualifier(&trace); - if (err) - goto out_close; - } - err = target__validate(&trace.opts.target); if (err) { target__strerror(&trace.opts.target, err, bf, sizeof(bf)); diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h index 0bcf68e98ccc..036e1e35b1a8 100644 --- a/tools/perf/builtin.h +++ b/tools/perf/builtin.h @@ -23,6 +23,7 @@ int cmd_diff(int argc, const char **argv, const char *prefix); int cmd_evlist(int argc, const char **argv, const char *prefix); int cmd_help(int argc, const char **argv, const char *prefix); int cmd_sched(int argc, const char **argv, const char *prefix); +int cmd_kallsyms(int argc, const char **argv, const char *prefix); int cmd_list(int argc, const char **argv, const char *prefix); int cmd_record(int argc, const char **argv, const char *prefix); int cmd_report(int argc, const char **argv, const char *prefix); @@ -40,6 +41,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix); int cmd_inject(int argc, const char **argv, const char *prefix); int cmd_mem(int argc, const char **argv, const char *prefix); int cmd_data(int argc, const char **argv, const char *prefix); +int cmd_ftrace(int argc, const char **argv, const char *prefix); int find_scripts(char **scripts_array, char **scripts_path_array); #endif diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt index ab5cbaa170d0..ac3efd396a72 100644 --- a/tools/perf/command-list.txt +++ b/tools/perf/command-list.txt @@ -11,7 +11,9 @@ perf-data mainporcelain common perf-diff mainporcelain common perf-config mainporcelain common perf-evlist mainporcelain common +perf-ftrace mainporcelain common perf-inject mainporcelain common +perf-kallsyms mainporcelain common perf-kmem mainporcelain common perf-kvm mainporcelain common perf-list mainporcelain common diff --git a/tools/perf/perf.c b/tools/perf/perf.c index aa23b3347d6b..34bcf90f8871 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -29,7 +29,6 @@ const char perf_usage_string[] = const char perf_more_info_string[] = "See 'perf help COMMAND' for more information on a specific command."; -int use_browser = -1; static int use_pager = -1; const char *input_name; @@ -47,6 +46,7 @@ static struct cmd_struct commands[] = { { "diff", cmd_diff, 0 }, { "evlist", cmd_evlist, 0 }, { "help", cmd_help, 0 }, + { "kallsyms", cmd_kallsyms, 0 }, { "list", cmd_list, 0 }, { "record", cmd_record, 0 }, { "report", cmd_report, 0 }, @@ -71,6 +71,7 @@ static struct cmd_struct commands[] = { { "inject", cmd_inject, 0 }, { "mem", cmd_mem, 0 }, { "data", cmd_data, 0 }, + { "ftrace", cmd_ftrace, 0 }, }; struct pager_config { @@ -329,8 +330,6 @@ static int handle_alias(int *argcp, const char ***argv) return ret; } -const char perf_version_string[] = PERF_VERSION; - #define RUN_SETUP (1<<0) #define USE_PAGER (1<<1) diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index 6676c2dd6dcb..1cb3d9b540e9 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -44,6 +44,7 @@ perf-y += is_printable_array.o perf-y += bitmap.o perf-y += perf-hooks.o perf-y += clang.o +perf-y += unit_number__scnprintf.o $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build $(call rule_mkdir) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index a77dcc0d24e3..37e326bfd2dc 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -247,6 +247,10 @@ static struct test generic_tests[] = { } }, { + .desc = "unit_number__scnprintf", + .func = test__unit_number__scnprint, + }, + { .func = NULL, }, }; diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c index 02a33ebcd992..d357dab72e68 100644 --- a/tools/perf/tests/llvm.c +++ b/tools/perf/tests/llvm.c @@ -13,7 +13,7 @@ static int test__bpf_parsing(void *obj_buf, size_t obj_buf_sz) struct bpf_object *obj; obj = bpf_object__open_buffer(obj_buf, obj_buf_sz, NULL); - if (IS_ERR(obj)) + if (libbpf_get_error(obj)) return TEST_FAIL; bpf_object__close(obj); return TEST_OK; diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index a512f0c8ff5b..1fa9b9d83aa5 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -96,6 +96,7 @@ int test__perf_hooks(int subtest); int test__clang(int subtest); const char *test__clang_subtest_get_desc(int subtest); int test__clang_subtest_get_nr(void); +int test__unit_number__scnprint(int subtest); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/tests/unit_number__scnprintf.c b/tools/perf/tests/unit_number__scnprintf.c new file mode 100644 index 000000000000..623c2aa53c4a --- /dev/null +++ b/tools/perf/tests/unit_number__scnprintf.c @@ -0,0 +1,37 @@ +#include <linux/compiler.h> +#include <linux/types.h> +#include "tests.h" +#include "util.h" +#include "debug.h" + +int test__unit_number__scnprint(int subtest __maybe_unused) +{ + struct { + u64 n; + const char *str; + } test[] = { + { 1, "1B" }, + { 10*1024, "10K" }, + { 20*1024*1024, "20M" }, + { 30*1024*1024*1024ULL, "30G" }, + { 0, "0B" }, + { 0, NULL }, + }; + unsigned i = 0; + + while (test[i].str) { + char buf[100]; + + unit_number__scnprintf(buf, sizeof(buf), test[i].n); + + pr_debug("n %" PRIu64 ", str '%s', buf '%s'\n", + test[i].n, test[i].str, buf); + + if (strcmp(test[i].str, buf)) + return TEST_FAIL; + + i++; + } + + return TEST_OK; +} diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 641b40234a9d..fc4fb669ceee 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -501,8 +501,8 @@ static int hierarchy_set_folding(struct hist_browser *hb, struct hist_entry *he, return n; } -static void hist_entry__set_folding(struct hist_entry *he, - struct hist_browser *hb, bool unfold) +static void __hist_entry__set_folding(struct hist_entry *he, + struct hist_browser *hb, bool unfold) { hist_entry__init_have_children(he); he->unfolded = unfold ? he->has_children : false; @@ -520,12 +520,34 @@ static void hist_entry__set_folding(struct hist_entry *he, he->nr_rows = 0; } +static void hist_entry__set_folding(struct hist_entry *he, + struct hist_browser *browser, bool unfold) +{ + double percent; + + percent = hist_entry__get_percent_limit(he); + if (he->filtered || percent < browser->min_pcnt) + return; + + __hist_entry__set_folding(he, browser, unfold); + + if (!he->depth || unfold) + browser->nr_hierarchy_entries++; + if (he->leaf) + browser->nr_callchain_rows += he->nr_rows; + else if (unfold && !hist_entry__has_hierarchy_children(he, browser->min_pcnt)) { + browser->nr_hierarchy_entries++; + he->has_no_entry = true; + he->nr_rows = 1; + } else + he->has_no_entry = false; +} + static void __hist_browser__set_folding(struct hist_browser *browser, bool unfold) { struct rb_node *nd; struct hist_entry *he; - double percent; nd = rb_first(&browser->hists->entries); while (nd) { @@ -535,21 +557,6 @@ __hist_browser__set_folding(struct hist_browser *browser, bool unfold) nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD); hist_entry__set_folding(he, browser, unfold); - - percent = hist_entry__get_percent_limit(he); - if (he->filtered || percent < browser->min_pcnt) - continue; - - if (!he->depth || unfold) - browser->nr_hierarchy_entries++; - if (he->leaf) - browser->nr_callchain_rows += he->nr_rows; - else if (unfold && !hist_entry__has_hierarchy_children(he, browser->min_pcnt)) { - browser->nr_hierarchy_entries++; - he->has_no_entry = true; - he->nr_rows = 1; - } else - he->has_no_entry = false; } } @@ -564,6 +571,15 @@ static void hist_browser__set_folding(struct hist_browser *browser, bool unfold) ui_browser__reset_index(&browser->b); } +static void hist_browser__set_folding_selected(struct hist_browser *browser, bool unfold) +{ + if (!browser->he_selection) + return; + + hist_entry__set_folding(browser->he_selection, browser, unfold); + browser->b.nr_entries = hist_browser__nr_entries(browser); +} + static void ui_browser__warn_lost_events(struct ui_browser *browser) { ui_browser__warning(browser, 4, @@ -637,10 +653,18 @@ int hist_browser__run(struct hist_browser *browser, const char *help) /* Collapse the whole world. */ hist_browser__set_folding(browser, false); break; + case 'c': + /* Collapse the selected entry. */ + hist_browser__set_folding_selected(browser, false); + break; case 'E': /* Expand the whole world. */ hist_browser__set_folding(browser, true); break; + case 'e': + /* Expand the selected entry. */ + hist_browser__set_folding_selected(browser, true); + break; case 'H': browser->show_headers = !browser->show_headers; hist_browser__update_rows(browser); diff --git a/tools/perf/ui/setup.c b/tools/perf/ui/setup.c index 1f6b0994f4f4..50d13e58210f 100644 --- a/tools/perf/ui/setup.c +++ b/tools/perf/ui/setup.c @@ -7,6 +7,7 @@ pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER; void *perf_gtk_handle; +int use_browser = -1; #ifdef HAVE_GTK2_SUPPORT static int setup_gtk_browser(void) diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 3840e3a87057..5da376bc1afc 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -162,6 +162,7 @@ CFLAGS_rbtree.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ET CFLAGS_libstring.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_hweight.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_parse-events.o += -Wno-redundant-decls +CFLAGS_header.o += -include $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c FORCE $(call rule_mkdir) diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index d2c6cdd9d42b..28d41e709128 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -9,6 +9,13 @@ #include "debug.h" #include "vdso.h" +static const char * const debuglink_paths[] = { + "%.0s%s", + "%s/%s", + "%s/.debug/%s", + "/usr/lib/debug%s/%s" +}; + char dso__symtab_origin(const struct dso *dso) { static const char origin[] = { @@ -44,24 +51,43 @@ int dso__read_binary_type_filename(const struct dso *dso, size_t len; switch (type) { - case DSO_BINARY_TYPE__DEBUGLINK: { - char *debuglink; + case DSO_BINARY_TYPE__DEBUGLINK: + { + const char *last_slash; + char dso_dir[PATH_MAX]; + char symfile[PATH_MAX]; + unsigned int i; len = __symbol__join_symfs(filename, size, dso->long_name); - debuglink = filename + len; - while (debuglink != filename && *debuglink != '/') - debuglink--; - if (*debuglink == '/') - debuglink++; + last_slash = filename + len; + while (last_slash != filename && *last_slash != '/') + last_slash--; - ret = -1; - if (!is_regular_file(filename)) + strncpy(dso_dir, filename, last_slash - filename); + dso_dir[last_slash-filename] = '\0'; + + if (!is_regular_file(filename)) { + ret = -1; + break; + } + + ret = filename__read_debuglink(filename, symfile, PATH_MAX); + if (ret) break; - ret = filename__read_debuglink(filename, debuglink, - size - (debuglink - filename)); + /* Check predefined locations where debug file might reside */ + ret = -1; + for (i = 0; i < ARRAY_SIZE(debuglink_paths); i++) { + snprintf(filename, size, + debuglink_paths[i], dso_dir, symfile); + if (is_regular_file(filename)) { + ret = 0; + break; + } } + break; + } case DSO_BINARY_TYPE__BUILD_ID_CACHE: if (dso__build_id_filename(dso, filename, size) == NULL) ret = -1; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index d92e02006fb8..b601f2814a30 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1184,7 +1184,7 @@ unsigned long perf_event_mlock_kb_in_pages(void) return pages; } -static size_t perf_evlist__mmap_size(unsigned long pages) +size_t perf_evlist__mmap_size(unsigned long pages) { if (pages == UINT_MAX) pages = perf_event_mlock_kb_in_pages(); @@ -1224,12 +1224,16 @@ static long parse_pages_arg(const char *str, unsigned long min, if (pages == 0 && min == 0) { /* leave number of pages at 0 */ } else if (!is_power_of_2(pages)) { + char buf[100]; + /* round pages up to next power of 2 */ pages = roundup_pow_of_two(pages); if (!pages) return -EINVAL; - pr_info("rounding mmap pages size to %lu bytes (%lu pages)\n", - pages * page_size, pages); + + unit_number__scnprintf(buf, sizeof(buf), pages * page_size); + pr_info("rounding mmap pages size to %s (%lu pages)\n", + buf, pages); } if (pages > max) @@ -1797,7 +1801,7 @@ int perf_evlist__start_workload(struct perf_evlist *evlist) */ ret = write(evlist->workload.cork_fd, &bf, 1); if (ret < 0) - perror("enable to write to pipe"); + perror("unable to write to pipe"); close(evlist->workload.cork_fd); return ret; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 4fd034f22d2f..389b9ccdf8c7 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -218,6 +218,8 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages, bool overwrite); void perf_evlist__munmap(struct perf_evlist *evlist); +size_t perf_evlist__mmap_size(unsigned long pages); + void perf_evlist__disable(struct perf_evlist *evlist); void perf_evlist__enable(struct perf_evlist *evlist); void perf_evlist__toggle_enable(struct perf_evlist *evlist); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d89c9c7ef4e5..c567d9f0aa92 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -41,6 +41,8 @@ static const u64 __perf_magic2_sw = 0x50455246494c4532ULL; #define PERF_MAGIC __perf_magic2 +const char perf_version_string[] = PERF_VERSION; + struct perf_file_attr { struct perf_event_attr attr; struct perf_file_section ids; @@ -2801,8 +2803,10 @@ static int perf_evsel__prepare_tracepoint_event(struct perf_evsel *evsel, } event = pevent_find_event(pevent, evsel->attr.config); - if (event == NULL) + if (event == NULL) { + pr_debug("cannot find event format for %d\n", (int)evsel->attr.config); return -1; + } if (!evsel->name) { snprintf(bf, sizeof(bf), "%s:%s", event->system, event->name); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 9b33bef54581..747a034d1ff3 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -87,6 +87,25 @@ out_delete: return NULL; } +struct machine *machine__new_kallsyms(void) +{ + struct machine *machine = machine__new_host(); + /* + * FIXME: + * 1) MAP__FUNCTION will go away when we stop loading separate maps for + * functions and data objects. + * 2) We should switch to machine__load_kallsyms(), i.e. not explicitely + * ask for not using the kcore parsing code, once this one is fixed + * to create a map per module. + */ + if (machine && __machine__load_kallsyms(machine, "/proc/kallsyms", MAP__FUNCTION, true) <= 0) { + machine__delete(machine); + machine = NULL; + } + + return machine; +} + static void dsos__purge(struct dsos *dsos) { struct dso *pos, *n; diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 354de6e56109..a28305029711 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -129,6 +129,7 @@ char *machine__mmap_name(struct machine *machine, char *bf, size_t size); void machines__set_comm_exec(struct machines *machines, bool comm_exec); struct machine *machine__new_host(void); +struct machine *machine__new_kallsyms(void); int machine__init(struct machine *machine, const char *root_dir, pid_t pid); void machine__exit(struct machine *machine); void machine__delete_threads(struct machine *machine); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index dc6ccaa4e927..78b16100567d 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -94,32 +94,10 @@ static int pmu_format(const char *name, struct list_head *format) return 0; } -static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *name) +static int convert_scale(const char *scale, char **end, double *sval) { - struct stat st; - ssize_t sret; - char scale[128]; - int fd, ret = -1; - char path[PATH_MAX]; char *lc; - - snprintf(path, PATH_MAX, "%s/%s.scale", dir, name); - - fd = open(path, O_RDONLY); - if (fd == -1) - return -1; - - if (fstat(fd, &st) < 0) - goto error; - - sret = read(fd, scale, sizeof(scale)-1); - if (sret < 0) - goto error; - - if (scale[sret - 1] == '\n') - scale[sret - 1] = '\0'; - else - scale[sret] = '\0'; + int ret = 0; /* * save current locale @@ -134,7 +112,7 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char * lc = strdup(lc); if (!lc) { ret = -ENOMEM; - goto error; + goto out; } /* @@ -144,14 +122,42 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char * */ setlocale(LC_NUMERIC, "C"); - alias->scale = strtod(scale, NULL); + *sval = strtod(scale, end); +out: /* restore locale */ setlocale(LC_NUMERIC, lc); - free(lc); + return ret; +} + +static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *name) +{ + struct stat st; + ssize_t sret; + char scale[128]; + int fd, ret = -1; + char path[PATH_MAX]; + + snprintf(path, PATH_MAX, "%s/%s.scale", dir, name); + + fd = open(path, O_RDONLY); + if (fd == -1) + return -1; + + if (fstat(fd, &st) < 0) + goto error; + + sret = read(fd, scale, sizeof(scale)-1); + if (sret < 0) + goto error; + + if (scale[sret - 1] == '\n') + scale[sret - 1] = '\0'; + else + scale[sret] = '\0'; - ret = 0; + ret = convert_scale(scale, NULL, &alias->scale); error: close(fd); return ret; diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 6a6f44dd594b..2c1bca240597 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -3023,20 +3023,17 @@ static int try_to_find_absolute_address(struct perf_probe_event *pev, tev->nargs = pev->nargs; tev->args = zalloc(sizeof(struct probe_trace_arg) * tev->nargs); - if (!tev->args) { - err = -ENOMEM; + if (!tev->args) goto errout; - } + for (i = 0; i < tev->nargs; i++) copy_to_probe_trace_arg(&tev->args[i], &pev->args[i]); return 1; errout: - if (*tevs) { - clear_probe_trace_events(*tevs, 1); - *tevs = NULL; - } + clear_probe_trace_events(*tevs, 1); + *tevs = NULL; return err; } diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index e74adfbd6a2e..ecabc26ad1ee 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -351,8 +351,10 @@ static void perl_process_tracepoint(struct perf_sample *sample, if (evsel->attr.type != PERF_TYPE_TRACEPOINT) return; - if (!event) - die("ug! no event found for type %" PRIu64, (u64)evsel->attr.config); + if (!event) { + pr_debug("ug! no event found for type %" PRIu64, (u64)evsel->attr.config); + return; + } pid = raw_field_value(event, "common_pid", data); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index f268201048a0..349c68144e55 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1191,7 +1191,7 @@ static int u64 sample_type = evsel->attr.sample_type; u64 read_format = evsel->attr.read_format; - /* Standard sample delievery. */ + /* Standard sample delivery. */ if (!(sample_type & PERF_SAMPLE_READ)) return tool->sample(tool, event, sample, evsel, machine); diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index d995743cb673..ceb0e2720223 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -42,7 +42,7 @@ #include "evsel.h" #include "debug.h" -#define VERSION "0.5" +#define VERSION "0.6" static int output_fd; @@ -379,6 +379,34 @@ out: return err; } +static int record_saved_cmdline(void) +{ + unsigned int size; + char *path; + struct stat st; + int ret, err = 0; + + path = get_tracing_file("saved_cmdlines"); + if (!path) { + pr_debug("can't get tracing/saved_cmdline"); + return -ENOMEM; + } + + ret = stat(path, &st); + if (ret < 0) { + /* not found */ + size = 0; + if (write(output_fd, &size, 8) != 8) + err = -EIO; + goto out; + } + err = record_file(path, 8); + +out: + put_tracing_file(path); + return err; +} + static void put_tracepoints_path(struct tracepoint_path *tps) { @@ -539,6 +567,9 @@ struct tracing_data *tracing_data_get(struct list_head *pattrs, if (err) goto out; err = record_ftrace_printk(); + if (err) + goto out; + err = record_saved_cmdline(); out: /* diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 33b52eaa39db..de0078e21408 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -160,6 +160,23 @@ void parse_ftrace_printk(struct pevent *pevent, } } +void parse_saved_cmdline(struct pevent *pevent, + char *file, unsigned int size __maybe_unused) +{ + char *comm; + char *line; + char *next = NULL; + int pid; + + line = strtok_r(file, "\n", &next); + while (line) { + sscanf(line, "%d %ms", &pid, &comm); + pevent_register_comm(pevent, comm, pid); + free(comm); + line = strtok_r(NULL, "\n", &next); + } +} + int parse_ftrace_file(struct pevent *pevent, char *buf, unsigned long size) { return pevent_parse_event(pevent, buf, size, "ftrace"); diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c index b67a0ccf5ab9..27420159bf69 100644 --- a/tools/perf/util/trace-event-read.c +++ b/tools/perf/util/trace-event-read.c @@ -260,39 +260,53 @@ static int read_header_files(struct pevent *pevent) static int read_ftrace_file(struct pevent *pevent, unsigned long long size) { + int ret; char *buf; buf = malloc(size); - if (buf == NULL) + if (buf == NULL) { + pr_debug("memory allocation failure\n"); return -1; + } - if (do_read(buf, size) < 0) { - free(buf); - return -1; + ret = do_read(buf, size); + if (ret < 0) { + pr_debug("error reading ftrace file.\n"); + goto out; } - parse_ftrace_file(pevent, buf, size); + ret = parse_ftrace_file(pevent, buf, size); + if (ret < 0) + pr_debug("error parsing ftrace file.\n"); +out: free(buf); - return 0; + return ret; } static int read_event_file(struct pevent *pevent, char *sys, unsigned long long size) { + int ret; char *buf; buf = malloc(size); - if (buf == NULL) + if (buf == NULL) { + pr_debug("memory allocation failure\n"); return -1; + } - if (do_read(buf, size) < 0) { + ret = do_read(buf, size); + if (ret < 0) { free(buf); - return -1; + goto out; } - parse_event_file(pevent, buf, size, sys); + ret = parse_event_file(pevent, buf, size, sys); + if (ret < 0) + pr_debug("error parsing event file.\n"); +out: free(buf); - return 0; + return ret; } static int read_ftrace_files(struct pevent *pevent) @@ -341,6 +355,36 @@ static int read_event_files(struct pevent *pevent) return 0; } +static int read_saved_cmdline(struct pevent *pevent) +{ + unsigned long long size; + char *buf; + int ret; + + /* it can have 0 size */ + size = read8(pevent); + if (!size) + return 0; + + buf = malloc(size + 1); + if (buf == NULL) { + pr_debug("memory allocation failure\n"); + return -1; + } + + ret = do_read(buf, size); + if (ret < 0) { + pr_debug("error reading saved cmdlines\n"); + goto out; + } + + parse_saved_cmdline(pevent, buf, size); + ret = 0; +out: + free(buf); + return ret; +} + ssize_t trace_report(int fd, struct trace_event *tevent, bool __repipe) { char buf[BUFSIZ]; @@ -379,10 +423,11 @@ ssize_t trace_report(int fd, struct trace_event *tevent, bool __repipe) return -1; if (show_version) printf("version = %s\n", version); - free(version); - if (do_read(buf, 1) < 0) + if (do_read(buf, 1) < 0) { + free(version); return -1; + } file_bigendian = buf[0]; host_bigendian = bigendian(); @@ -423,6 +468,11 @@ ssize_t trace_report(int fd, struct trace_event *tevent, bool __repipe) err = read_ftrace_printk(pevent); if (err) goto out; + if (atof(version) >= 0.6) { + err = read_saved_cmdline(pevent); + if (err) + goto out; + } size = trace_data_size; repipe = false; @@ -438,5 +488,6 @@ ssize_t trace_report(int fd, struct trace_event *tevent, bool __repipe) out: if (pevent) trace_event__cleanup(tevent); + free(version); return size; } diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index b0af9c81bb0d..1fbc044f9eb0 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -42,6 +42,7 @@ raw_field_value(struct event_format *event, const char *name, void *data); void parse_proc_kallsyms(struct pevent *pevent, char *file, unsigned int size); void parse_ftrace_printk(struct pevent *pevent, char *file, unsigned int size); +void parse_saved_cmdline(struct pevent *pevent, char *file, unsigned int size); ssize_t trace_report(int fd, struct trace_event *tevent, bool repipe); diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index 6fec84dff3f7..bfb9b7987692 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -35,6 +35,7 @@ #include "util.h" #include "debug.h" #include "asm/bug.h" +#include "dso.h" extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, @@ -297,15 +298,58 @@ static int read_unwind_spec_debug_frame(struct dso *dso, int fd; u64 ofs = dso->data.debug_frame_offset; + /* debug_frame can reside in: + * - dso + * - debug pointed by symsrc_filename + * - gnu_debuglink, which doesn't necessary + * has to be pointed by symsrc_filename + */ if (ofs == 0) { fd = dso__data_get_fd(dso, machine); - if (fd < 0) - return -EINVAL; + if (fd >= 0) { + ofs = elf_section_offset(fd, ".debug_frame"); + dso__data_put_fd(dso); + } + + if (ofs <= 0) { + fd = open(dso->symsrc_filename, O_RDONLY); + if (fd >= 0) { + ofs = elf_section_offset(fd, ".debug_frame"); + close(fd); + } + } + + if (ofs <= 0) { + char *debuglink = malloc(PATH_MAX); + int ret = 0; + + ret = dso__read_binary_type_filename( + dso, DSO_BINARY_TYPE__DEBUGLINK, + machine->root_dir, debuglink, PATH_MAX); + if (!ret) { + fd = open(debuglink, O_RDONLY); + if (fd >= 0) { + ofs = elf_section_offset(fd, + ".debug_frame"); + close(fd); + } + } + if (ofs > 0) { + if (dso->symsrc_filename != NULL) { + pr_warning( + "%s: overwrite symsrc(%s,%s)\n", + __func__, + dso->symsrc_filename, + debuglink); + free(dso->symsrc_filename); + } + dso->symsrc_filename = debuglink; + } else { + free(debuglink); + } + } - /* Check the .debug_frame section for unwinding info */ - ofs = elf_section_offset(fd, ".debug_frame"); dso->data.debug_frame_offset = ofs; - dso__data_put_fd(dso); } *offset = ofs; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 9ddd98827d12..bf29aed16bd6 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -789,3 +789,16 @@ int is_printable_array(char *p, unsigned int len) } return 1; } + +int unit_number__scnprintf(char *buf, size_t size, u64 n) +{ + char unit[4] = "BKMG"; + int i = 0; + + while (((n / 1024) > 1) && (i < 3)) { + n /= 1024; + i++; + } + + return scnprintf(buf, size, "%" PRIu64 "%c", n, unit[i]); +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 1d639e38aa82..6e8be174ec0b 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -363,4 +363,5 @@ int is_printable_array(char *p, unsigned int len); int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz); +int unit_number__scnprintf(char *buf, size_t size, u64 n); #endif /* GIT_COMPAT_UTIL_H */ diff --git a/tools/testing/selftests/locking/ww_mutex.sh b/tools/testing/selftests/locking/ww_mutex.sh new file mode 100644 index 000000000000..6905da965f3b --- /dev/null +++ b/tools/testing/selftests/locking/ww_mutex.sh @@ -0,0 +1,10 @@ +#!/bin/sh +# Runs API tests for struct ww_mutex (Wait/Wound mutexes) + +if /sbin/modprobe -q test-ww_mutex; then + /sbin/modprobe -q -r test-ww_mutex + echo "locking/ww_mutex: ok" +else + echo "locking/ww_mutex: [FAIL]" + exit 1 +fi diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFLIST b/tools/testing/selftests/rcutorture/configs/lock/CFLIST index b9611c523723..41bae5824339 100644 --- a/tools/testing/selftests/rcutorture/configs/lock/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/lock/CFLIST @@ -4,3 +4,4 @@ LOCK03 LOCK04 LOCK05 LOCK06 +LOCK07 diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK07 b/tools/testing/selftests/rcutorture/configs/lock/LOCK07 new file mode 100644 index 000000000000..1d1da1477fc3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK07 @@ -0,0 +1,6 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot new file mode 100644 index 000000000000..97dadd1a9e45 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot @@ -0,0 +1 @@ +locktorture.torture_type=ww_mutex_lock diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 8c1cb423cfe6..25d4067c11e4 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -10,7 +10,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_sysc TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer -TARGETS_C_64BIT_ONLY := fsgsbase +TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY) diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c new file mode 100644 index 000000000000..d85ec5b3671c --- /dev/null +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -0,0 +1,195 @@ +/* + * sigreturn.c - tests that x86 avoids Intel SYSRET pitfalls + * Copyright (c) 2014-2016 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#define _GNU_SOURCE + +#include <stdlib.h> +#include <unistd.h> +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#include <sys/signal.h> +#include <sys/ucontext.h> +#include <sys/syscall.h> +#include <err.h> +#include <stddef.h> +#include <stdbool.h> +#include <setjmp.h> +#include <sys/user.h> +#include <sys/mman.h> +#include <assert.h> + + +asm ( + ".pushsection \".text\", \"ax\"\n\t" + ".balign 4096\n\t" + "test_page: .globl test_page\n\t" + ".fill 4094,1,0xcc\n\t" + "test_syscall_insn:\n\t" + "syscall\n\t" + ".ifne . - test_page - 4096\n\t" + ".error \"test page is not one page long\"\n\t" + ".endif\n\t" + ".popsection" + ); + +extern const char test_page[]; +static void const *current_test_page_addr = test_page; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +/* State used by our signal handlers. */ +static gregset_t initial_regs; + +static volatile unsigned long rip; + +static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + if (rip != ctx->uc_mcontext.gregs[REG_RIP]) { + printf("[FAIL]\tRequested RIP=0x%lx but got RIP=0x%lx\n", + rip, (unsigned long)ctx->uc_mcontext.gregs[REG_RIP]); + fflush(stdout); + _exit(1); + } + + memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t)); + + printf("[OK]\tGot SIGSEGV at RIP=0x%lx\n", rip); +} + +static void sigusr1(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); + + /* Set IP and CX to match so that SYSRET can happen. */ + ctx->uc_mcontext.gregs[REG_RIP] = rip; + ctx->uc_mcontext.gregs[REG_RCX] = rip; + + /* R11 and EFLAGS should already match. */ + assert(ctx->uc_mcontext.gregs[REG_EFL] == + ctx->uc_mcontext.gregs[REG_R11]); + + sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND); + + return; +} + +static void test_sigreturn_to(unsigned long ip) +{ + rip = ip; + printf("[RUN]\tsigreturn to 0x%lx\n", ip); + raise(SIGUSR1); +} + +static jmp_buf jmpbuf; + +static void sigsegv_for_fallthrough(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + if (rip != ctx->uc_mcontext.gregs[REG_RIP]) { + printf("[FAIL]\tExpected SIGSEGV at 0x%lx but got RIP=0x%lx\n", + rip, (unsigned long)ctx->uc_mcontext.gregs[REG_RIP]); + fflush(stdout); + _exit(1); + } + + siglongjmp(jmpbuf, 1); +} + +static void test_syscall_fallthrough_to(unsigned long ip) +{ + void *new_address = (void *)(ip - 4096); + void *ret; + + printf("[RUN]\tTrying a SYSCALL that falls through to 0x%lx\n", ip); + + ret = mremap((void *)current_test_page_addr, 4096, 4096, + MREMAP_MAYMOVE | MREMAP_FIXED, new_address); + if (ret == MAP_FAILED) { + if (ip <= (1UL << 47) - PAGE_SIZE) { + err(1, "mremap to %p", new_address); + } else { + printf("[OK]\tmremap to %p failed\n", new_address); + return; + } + } + + if (ret != new_address) + errx(1, "mremap malfunctioned: asked for %p but got %p\n", + new_address, ret); + + current_test_page_addr = new_address; + rip = ip; + + if (sigsetjmp(jmpbuf, 1) == 0) { + asm volatile ("call *%[syscall_insn]" :: "a" (SYS_getpid), + [syscall_insn] "rm" (ip - 2)); + errx(1, "[FAIL]\tSyscall trampoline returned"); + } + + printf("[OK]\tWe survived\n"); +} + +int main() +{ + /* + * When the kernel returns from a slow-path syscall, it will + * detect whether SYSRET is appropriate. If it incorrectly + * thinks that SYSRET is appropriate when RIP is noncanonical, + * it'll crash on Intel CPUs. + */ + sethandler(SIGUSR1, sigusr1, 0); + for (int i = 47; i < 64; i++) + test_sigreturn_to(1UL<<i); + + clearhandler(SIGUSR1); + + sethandler(SIGSEGV, sigsegv_for_fallthrough, 0); + + /* One extra test to check that we didn't screw up the mremap logic. */ + test_syscall_fallthrough_to((1UL << 47) - 2*PAGE_SIZE); + + /* These are the interesting cases. */ + for (int i = 47; i < 64; i++) { + test_syscall_fallthrough_to((1UL<<i) - PAGE_SIZE); + test_syscall_fallthrough_to(1UL<<i); + } + + return 0; +} |